//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115"Controls which SLP graphs should be vectorized.");
119cl::desc(
"Run the SLP vectorization passes"));
123cl::desc(
"Enable vectorization for wider vector utilization"));
127cl::desc(
"Only vectorize if you gain more than this " 132cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on " 133"heuristics and makes vectorization decision via cost modeling."));
137cl::desc(
"Attempt to vectorize horizontal reductions"));
142"Attempt to vectorize horizontal reductions feeding into a store"));
146cl::desc(
"Attempt to vectorize for this register size in bits"));
150cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
152/// Limits the size of scheduling regions in a block. 153/// It avoid long compile times for _very_ large blocks where vector 154/// instructions are spread over a wide range. 155/// This limit is way higher than needed by real-world functions. 158cl::desc(
"Limit the size of the SLP scheduling region per block"));
162cl::desc(
"Attempt to vectorize for this register size in bits"));
166cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
170cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
172// The maximum depth that the look-ahead score heuristic will explore. 173// The higher this value, the higher the compilation time overhead. 176cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
178// The maximum depth that the look-ahead score heuristic will explore 179// when it probing among candidates for vectorization tree roots. 180// The higher this value, the higher the compilation time overhead but unlike 181// similar limit for operands ordering this is less frequently used, hence 182// impact of higher value is less noticeable. 185cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
189cl::desc(
"The minimum number of loads, which should be considered strided, " 190"if the stride is > 1 or is runtime value"));
194cl::desc(
"The maximum stride, considered to be profitable."));
198cl::desc(
"Display the SLP trees with Graphviz"));
202cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.

/// Maximum allowed number of operands in the PHI nodes.

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();

/// Returns the type of the given value/instruction \p V. If it is a store,
/// returns the type of its value operand; for a Cmp - the type of the compare
/// operands; and for insertelement - the type of the inserted operand.
/// Otherwise, just the type of the value is returned.
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
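// For example (illustrative): for `store i32 %x, ptr %p` this helper returns
// i32, for `icmp ult i64 %a, %b` it returns i64, and for
// `insertelement <4 x float> %v, float %f, i32 0` it returns float.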
/// \returns the number of elements for Ty.
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();

/// \returns the vector type of ScalarTy based on vectorization factor.

/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms type, which splits by \p TTI into whole vector types during
/// legalization.
                                               Type *Ty, unsigned Sz) {
  // Find the number of elements which forms full vectors.
  if (NumParts == 0 || NumParts >= Sz)

/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, which forms type, which splits by \p TTI into whole vector types during
/// legalization.
  // Find the number of elements which forms full vectors.
  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;

  // The ShuffleBuilder implementation uses shufflevector to splat an
  // "element". But the element has a different meaning for SLP (scalar) and
  // REVEC (vector). We need to expand Mask into masks which shufflevector
  // can use.
  for (unsigned I : seq<unsigned>(Mask.size()))
                          I * VecTyNumElements, VecTyNumElements)))
                   : Mask[I] * VecTyNumElements + J;

/// \returns the number of groups of shufflevector
/// A group has the following features
/// 1. All of the values in a group are shufflevectors.
/// 2. The mask of all shufflevectors is isExtractSubvectorMask.
/// 3. The mask of all shufflevectors uses all of the elements of the source.
/// e.g., it is 1 group (%0)
///   %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
///        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
///   %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
///        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// it is 2 groups (%3 and %4)
///   %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
///   %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// it is 0 group
///   %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
///         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
///         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
      auto *SV = cast<ShuffleVectorInst>(V);
      // From the same source.
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
    if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");

/// \returns a shufflevector mask which is used to vectorize shufflevectors
/// e.g.,
///   %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
///   %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// the result is
///   <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned AccumulateLength = 0;
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
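// For example, `i32 7` or `float 1.0` counts as a constant here, while a
// GlobalValue such as `@g` or a constant `getelementptr` expression does not.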
/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
  if (isa<ExtractElementInst>(I))
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");

/// Returns power-of-2 number of elements in a single register (part), given
/// the total number of elements \p Size and number of registers (parts) \p
/// NumParts.

/// Returns correct remaining number of elements, considering total amount \p
/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
/// and current register (part) \p Part.
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
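// For example, with Size = 10 scalars split into parts of PartNumElems = 4,
// parts 0, 1 and 2 hold 4, 4 and 2 elements respectively.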
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
  auto *It = find_if(VL, IsaPred<Instruction>);
    if (isa<PoisonValue>(V))
    auto *II = dyn_cast<Instruction>(V);
    if (BB != II->getParent())

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
  // Constant expressions and globals can't be vectorized like normal
  // integer/FP constants.

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
  Value *FirstNonUndef = nullptr;
    if (isa<UndefValue>(V))
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
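// For example, {%x, undef, %x, %x} is treated as a splat of %x, while
// {undef, undef} is not, since there is no non-undef value to splat.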
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
              // Commutative, if icmp eq/ne sub, 0
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
              // Commutative, if abs(sub nsw, true) or abs(sub, false).
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
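// For example (illustrative), a non-commutative `sub` is still treated as
// commutative here when its only use is a zero-compare:
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0
// since icmp eq (sub %a, %b), 0 gives the same result as
// icmp eq (sub %b, %a), 0.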
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();

/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using Offset as base offset for index.
/// \returns std::nullopt if the index is not an immediate.
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();

/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
  FirstArg,    ///< The mask is expected to be for permutation of 1-2 vectors,
               ///< check for the mask elements for the first argument (mask
               ///< indices are in range [0:VF)).
  SecondArg,   ///< The mask is expected to be for permutation of 2 vectors,
               ///< check for the mask elements for the second argument (mask
               ///< indices are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused
               ///< since they're already marked as used in the mask.

/// Prepares a use bitset for the given mask either for the first argument or
/// the second one.
  if (MaskArg == UseMask::UndefsAsMask)
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  auto *C = dyn_cast<Constant>(V);
  if (!UseMask.empty()) {
    while (auto *II = dyn_cast<InsertElementInst>(Base)) {
      if (isa<T>(II->getOperand(1)))
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    // TODO: Add analysis for shuffles here too.
      Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
        (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
///                                                         i32 6>
/// %2 = mul <4 x i8> %1, %1
///
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
    auto *EE = dyn_cast<ExtractElementInst>(V);
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
    if (isa<UndefValue>(EI->getIndexOperand()))
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    // Undefined behavior if Idx is negative or >= Size.
    unsigned IntIdx = Idx->getValue().getZExtValue();
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    // If the extract index is not the same as the operation number, it is a
    // permutation.
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have permutation of 2 vectors.

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI->getZExtValue();
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
/// Main data required for vectorization of instructions.
class InstructionsState {
  /// The main/alternate instruction. MainOp is also VL0.
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;

  /// Checks if the current state is valid, i.e. has non-null MainOp
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};

} // end anonymous namespace

/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// Example of unsupported opcode is SDIV that can potentially cause UB if the
/// "shuffled out" lane would result in division by zero.

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||

/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
all_of(VL, IsaPred<Instruction, PoisonValue>))
913return InstructionsState::invalid();
915auto *It =
find_if(VL, IsaPred<Instruction>);
917return InstructionsState::invalid();
920unsigned InstCnt = std::count_if(It, VL.
end(), IsaPred<Instruction>);
921if ((VL.
size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.
size() / 2) ||
922 (VL.
size() == 2 && InstCnt < 2))
923return InstructionsState::invalid();
925bool IsCastOp = isa<CastInst>(MainOp);
926bool IsBinOp = isa<BinaryOperator>(MainOp);
927bool IsCmpOp = isa<CmpInst>(MainOp);
932unsigned AltOpcode = Opcode;
934bool SwappedPredsCompatible = IsCmpOp && [&]() {
936 UniquePreds.
insert(BasePred);
937 UniqueNonSwappedPreds.
insert(BasePred);
939auto *
I = dyn_cast<CmpInst>(V);
945 UniqueNonSwappedPreds.
insert(CurrentPred);
946if (!UniquePreds.
contains(CurrentPred) &&
947 !UniquePreds.
contains(SwappedCurrentPred))
948 UniquePreds.
insert(CurrentPred);
950// Total number of predicates > 2, but if consider swapped predicates 951// compatible only 2, consider swappable predicates as compatible opcodes, 953return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
955// Check for one alternate opcode from another BinaryOperator. 956// TODO - generalize to support all operators (types, calls etc.). 959if (
auto *
CallBase = dyn_cast<CallInst>(MainOp)) {
963return InstructionsState::invalid();
965bool AnyPoison = InstCnt != VL.
size();
968auto *
I = dyn_cast<Instruction>(V);
972// Cannot combine poison and divisions. 973// TODO: do some smart analysis of the CallInsts to exclude divide-like 974// intrinsics/functions only. 975if (AnyPoison && (
I->isIntDivRem() ||
I->isFPDivRem() || isa<CallInst>(
I)))
976return InstructionsState::invalid();
977unsigned InstOpcode =
I->getOpcode();
978if (IsBinOp && isa<BinaryOperator>(
I)) {
979if (InstOpcode == Opcode || InstOpcode == AltOpcode)
983 AltOpcode = InstOpcode;
987 }
elseif (IsCastOp && isa<CastInst>(
I)) {
990Value *Op1 =
I->getOperand(0);
993if (InstOpcode == Opcode || InstOpcode == AltOpcode)
995if (Opcode == AltOpcode) {
998"Cast isn't safe for alternation, logic needs to be updated!");
999 AltOpcode = InstOpcode;
1004 }
elseif (
auto *Inst = dyn_cast<CmpInst>(
I); Inst && IsCmpOp) {
1005auto *BaseInst = cast<CmpInst>(MainOp);
1006Type *Ty0 = BaseInst->getOperand(0)->getType();
1007Type *Ty1 = Inst->getOperand(0)->getType();
1009assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
1010assert(InstOpcode == AltOpcode &&
1011"Alternate instructions are only supported by BinaryOperator " 1013// Check for compatible operands. If the corresponding operands are not 1014// compatible - need to perform alternate vectorization. 1019if ((VL.
size() == 2 || SwappedPredsCompatible) &&
1020 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1025auto *AltInst = cast<CmpInst>(AltOp);
1026if (MainOp != AltOp) {
1029 }
elseif (BasePred != CurrentPred) {
1032"CmpInst isn't safe for alternation, logic needs to be updated!");
1037if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1038 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1041 }
elseif (InstOpcode == Opcode) {
1042assert(InstOpcode == AltOpcode &&
1043"Alternate instructions are only supported by BinaryOperator and " 1045if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
1046if (Gep->getNumOperands() != 2 ||
1048return InstructionsState::invalid();
1049 }
elseif (
auto *EI = dyn_cast<ExtractElementInst>(
I)) {
1051return InstructionsState::invalid();
1052 }
elseif (
auto *LI = dyn_cast<LoadInst>(
I)) {
1053auto *BaseLI = cast<LoadInst>(MainOp);
1054if (!LI->isSimple() || !BaseLI->isSimple())
1055return InstructionsState::invalid();
1056 }
elseif (
auto *Call = dyn_cast<CallInst>(
I)) {
1057auto *
CallBase = cast<CallInst>(MainOp);
1059return InstructionsState::invalid();
1060if (Call->hasOperandBundles() &&
1062 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1063 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1066return InstructionsState::invalid();
1069return InstructionsState::invalid();
1072if (Mappings.
size() != BaseMappings.
size() ||
1073 Mappings.
front().ISA != BaseMappings.
front().ISA ||
1074 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
1075 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
1076 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
1077 Mappings.
front().Shape.Parameters !=
1078 BaseMappings.
front().Shape.Parameters)
1079return InstructionsState::invalid();
1084return InstructionsState::invalid();
1087return InstructionsState(MainOp, AltOp);
1090/// \returns true if all of the values in \p VL have the same type or false 1097/// \returns True if in-tree use also needs extract. This refers to 1098/// possible scalar operand in vectorized instruction. 1106case Instruction::Load: {
1107LoadInst *LI = cast<LoadInst>(UserInst);
1110case Instruction::Store: {
1111StoreInst *SI = cast<StoreInst>(UserInst);
1112return (SI->getPointerOperand() == Scalar);
1114case Instruction::Call: {
1115CallInst *CI = cast<CallInst>(UserInst);
1118 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1119 Arg.value().get() == Scalar;
1127/// \returns the AA location that is being access by the instruction. 1131if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
1136/// \returns True if the instruction is not a volatile or atomic load/store. 1138if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
1139return LI->isSimple();
1141return SI->isSimple();
1143return !
MI->isVolatile();
1147/// Shuffles \p Mask in accordance with the given \p SubMask. 1148/// \param ExtendingManyInputs Supports reshuffling of the mask with not only 1149/// one but two input vectors. 1151bool ExtendingManyInputs =
false) {
1155 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
1156// Check if input scalars were extended to match the size of other node. 1158"SubMask with many inputs support must be larger than the mask.");
1160 Mask.append(SubMask.
begin(), SubMask.
end());
1164int TermValue = std::min(Mask.size(), SubMask.
size());
1165for (
intI = 0, E = SubMask.
size();
I < E; ++
I) {
1167 (!ExtendingManyInputs &&
1168 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
1170 NewMask[
I] = Mask[SubMask[
I]];
1175/// Order may have elements assigned special value (size) which is out of 1176/// bounds. Such indices only appear on places which correspond to undef values 1177/// (see canReuseExtract for details) and used in order to avoid undef values 1178/// have effect on operands ordering. 1179/// The first loop below simply finds all unused indices and then the next loop 1180/// nest assigns these indices for undef values positions. 1181/// As an example below Order has two undef positions and they have assigned 1182/// values 3 and 7 respectively: 1183/// before: 6 9 5 4 9 2 1 0 1184/// after: 6 3 5 4 7 2 1 0 1186constunsigned Sz = Order.
size();
1189for (
unsignedI = 0;
I < Sz; ++
I) {
1191 UnusedIndices.
reset(Order[
I]);
1193 MaskedIndices.
set(
I);
1195if (MaskedIndices.
none())
1198"Non-synced masked/available indices.");
1202assert(
Idx >= 0 &&
"Indices must be synced.");
1209/// \returns a bitset for selecting opcodes. false for Opcode0 and true for 1213Type *ScalarTy = VL[0]->getType();
1216for (
unsigned Lane : seq<unsigned>(VL.
size())) {
1217if (isa<PoisonValue>(VL[Lane]))
1219if (cast<Instruction>(VL[Lane])->
getOpcode() == Opcode1)
1220 OpcodeMask.
set(Lane * ScalarTyNumElements,
1221 Lane * ScalarTyNumElements + ScalarTyNumElements);
1231constunsigned E = Indices.
size();
1233for (
unsignedI = 0;
I < E; ++
I)
1234 Mask[Indices[
I]] =
I;
1237/// Reorders the list of scalars in accordance with the given \p Mask. 1240assert(!Mask.empty() &&
"Expected non-empty mask.");
1244for (
unsignedI = 0, E = Prev.
size();
I < E; ++
I)
1246 Scalars[Mask[
I]] = Prev[
I];
1249/// Checks if the provided value does not require scheduling. It does not 1250/// require scheduling if this is not an instruction or it is an instruction 1251/// that does not read/write memory and all operands are either not instructions 1252/// or phi nodes or instructions from different blocks. 1254auto *
I = dyn_cast<Instruction>(V);
1259 auto *IO = dyn_cast<Instruction>(V);
1262 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1266/// Checks if the provided value does not require scheduling. It does not 1267/// require scheduling if this is not an instruction or it is an instruction 1268/// that does not read/write memory and all users are phi nodes or instructions 1269/// from the different blocks. 1271auto *
I = dyn_cast<Instruction>(V);
1274// Limits the number of uses to save compile time. 1275return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1277 auto *IU = dyn_cast<Instruction>(U);
1280 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1284/// Checks if the specified value does not require scheduling. It does not 1285/// require scheduling if all operands and all users do not need to be scheduled 1286/// in the current basic block. 1291/// Checks if the specified array of instructions does not require scheduling. 1292/// It is so if all either instructions have operands that do not require 1293/// scheduling or their users do not require scheduling since they are phis or 1294/// in other basic blocks. 1296return !VL.
empty() &&
1300/// Returns true if widened type of \p Ty elements with size \p Sz represents 1301/// full vector type, i.e. adding extra element results in extra parts upon type 1312return NumParts > 0 && NumParts < Sz &&
has_single_bit(Sz / NumParts) &&
1316namespaceslpvectorizer {
1318/// Bottom Up SLP Vectorizer. 1326 /// Tracks the state we can represent the loads in the given sequence. 1345 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1346 AC(AC), DB(DB),
DL(
DL), ORE(ORE),
1349// Use the vector register size specified by the target unless overridden 1350// by a command-line option. 1351// TODO: It would be better to limit the vectorization factor based on 1352// data type rather than just register size. For example, x86 AVX has 1353// 256-bit registers, but it does not support integer operations 1354// at that width (that requires AVX2). 1368 /// Vectorize the tree that starts with the elements in \p VL. 1369 /// Returns the vectorized root. 1372 /// Vectorize the tree but with the list of externally used values \p 1373 /// ExternallyUsedValues. Values in this MapVector can be replaced but the 1374 /// generated extractvalue instructions. 1379 /// \returns the cost incurred by unwanted spills and fills, caused by 1380 /// holding live values over call sites. 1383 /// \returns the vectorization cost of the subtree that starts at \p VL. 1384 /// A negative number means that this is profitable. 1387 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for 1388 /// the purpose of scheduling and extraction in the \p UserIgnoreLst. 1392 /// Construct a vectorizable tree that starts at \p Roots. 1395 /// Returns whether the root node has in-tree uses. 1397return !VectorizableTree.
empty() &&
1398 !VectorizableTree.
front()->UserTreeIndices.empty();
1401 /// Return the scalars of the root node. 1403assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1404return VectorizableTree.
front()->Scalars;
1407 /// Returns the type/is-signed info for the root node in the graph without 1410const TreeEntry &Root = *VectorizableTree.
front().get();
1411if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1412 !Root.Scalars.front()->getType()->isIntegerTy())
1414auto It = MinBWs.
find(&Root);
1415if (It != MinBWs.
end())
1419if (Root.getOpcode() == Instruction::ZExt ||
1420 Root.getOpcode() == Instruction::SExt)
1421return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1422 Root.getOpcode() == Instruction::SExt);
1426 /// Checks if the root graph node can be emitted with narrower bitwidth at 1427 /// codegen and returns it signedness, if so. 1429return MinBWs.
at(VectorizableTree.
front().get()).second;
1432 /// Returns reduction type after minbitdth analysis. 1434if (ReductionBitWidth == 0 ||
1435 !VectorizableTree.
front()->Scalars.front()->getType()->isIntegerTy() ||
1436 ReductionBitWidth >=
1437DL->getTypeSizeInBits(
1438 VectorizableTree.
front()->Scalars.front()->getType()))
1440 VectorizableTree.
front()->Scalars.front()->getType(),
1441 VectorizableTree.
front()->getVectorFactor());
1444 VectorizableTree.
front()->Scalars.front()->getContext(),
1446 VectorizableTree.
front()->getVectorFactor());
1449 /// Builds external uses of the vectorized scalars, i.e. the list of 1450 /// vectorized scalars to be extracted, their lanes and their scalar users. \p 1451 /// ExternallyUsedValues contains additional list of external uses to handle 1452 /// vectorization of reductions. 1456 /// Transforms graph nodes to target specific representations, if profitable. 1459 /// Clear the internal data structures that are created by 'buildTree'. 1461 VectorizableTree.
clear();
1462 ScalarToTreeEntry.clear();
1463 MultiNodeScalars.clear();
1465 NonScheduledFirst.
clear();
1466 EntryToLastInstruction.clear();
1467 LoadEntriesToVectorize.
clear();
1468 IsGraphTransformMode =
false;
1469 GatheredLoadsEntriesFirst.reset();
1470 ExternalUses.
clear();
1471 ExternalUsesAsOriginalScalar.clear();
1472for (
auto &Iter : BlocksSchedules) {
1473 BlockScheduling *BS = Iter.second.get();
1477 ReductionBitWidth = 0;
1479 CastMaxMinBWSizes.reset();
1480 ExtraBitWidthNodes.
clear();
1481 InstrElementSize.clear();
1482 UserIgnoreList =
nullptr;
1483 PostponedGathers.
clear();
1484 ValueToGatherNodes.
clear();
1489 /// Returns the base graph size, before any transformations. 1492 /// Perform LICM and CSE on the newly generated gather sequences. 1495 /// Does this non-empty order represent an identity order? Identity 1496 /// should be represented as an empty order, so this is used to 1497 /// decide if we can canonicalize a computed order. Undef elements 1498 /// (represented as size) are ignored. 1500assert(!Order.
empty() &&
"expected non-empty order");
1501constunsigned Sz = Order.
size();
1503returnP.value() ==
P.index() ||
P.value() == Sz;
1507 /// Checks if the specified gather tree entry \p TE can be represented as a 1508 /// shuffled vector entry + (possibly) permutation with other gathers. It 1509 /// implements the checks only for possibly ordered scalars (Loads, 1510 /// ExtractElement, ExtractValue), which can be part of the graph. 1513 /// Sort loads into increasing pointers offsets to allow greater clustering. 1516 /// Gets reordering data for the given tree entry. If the entry is vectorized 1517 /// - just return ReorderIndices, otherwise check if the scalars can be 1518 /// reordered and return the most optimal order. 1519 /// \return std::nullopt if ordering is not important, empty order, if 1520 /// identity order is important, or the actual order. 1521 /// \param TopToBottom If true, include the order of vectorized stores and 1522 /// insertelement nodes, otherwise skip them. 1526 /// Reorders the current graph to the most profitable order starting from the 1527 /// root node to the leaf nodes. The best order is chosen only from the nodes 1528 /// of the same size (vectorization factor). Smaller nodes are considered 1529 /// parts of subgraph with smaller VF and they are reordered independently. We 1530 /// can make it because we still need to extend smaller nodes to the wider VF 1531 /// and we can merge reordering shuffles with the widening shuffles. 1534 /// Reorders the current graph to the most profitable order starting from 1535 /// leaves to the root. It allows to rotate small subgraphs and reduce the 1536 /// number of reshuffles if the leaf nodes use the same order. In this case we 1537 /// can merge the orders and just shuffle user node instead of shuffling its 1538 /// operands. Plus, even the leaf nodes have different orders, it allows to 1539 /// sink reordering in the graph closer to the root node and merge it later 1540 /// during analysis. 1543 /// \return The vector element size in bits to use when vectorizing the 1544 /// expression tree ending at \p V. If V is a store, the size is the width of 1545 /// the stored value. Otherwise, the size is the width of the largest loaded 1546 /// value reaching V. This method is used by the vectorizer to calculate 1547 /// vectorization factors. 1550 /// Compute the minimum type sizes required to represent the entries in a 1551 /// vectorizable tree. 1554// \returns maximum vector register size as set by TTI or overridden by cl::opt. 1556return MaxVecRegSize;
1559// \returns minimum vector register size as set by cl::opt. 1561return MinVecRegSize;
1571return MaxVF ? MaxVF : UINT_MAX;
1574 /// Check if homogeneous aggregate is isomorphic to some VectorType. 1575 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like 1576 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, 1577 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. 1579 /// \returns number of elements in vector if isomorphism exists, 0 otherwise. 1582 /// \returns True if the VectorizableTree is both tiny and not fully 1583 /// vectorizable. We do not vectorize such trees. 1586 /// Checks if the graph and all its subgraphs cannot be better vectorized. 1587 /// It may happen, if all gather nodes are loads and they cannot be 1588 /// "clusterized". In this case even subgraphs cannot be vectorized more 1589 /// effectively than the base graph. 1592 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values 1593 /// can be load combined in the backend. Load combining may not be allowed in 1594 /// the IR optimizer, so we do not want to alter the pattern. For example, 1595 /// partially transforming a scalar bswap() pattern into vector code is 1596 /// effectively impossible for the backend to undo. 1597 /// TODO: If load combining is allowed in the IR optimizer, this analysis 1598 /// may not be necessary. 1601 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values 1602 /// can be load combined in the backend. Load combining may not be allowed in 1603 /// the IR optimizer, so we do not want to alter the pattern. For example, 1604 /// partially transforming a scalar bswap() pattern into vector code is 1605 /// effectively impossible for the backend to undo. 1606 /// TODO: If load combining is allowed in the IR optimizer, this analysis 1607 /// may not be necessary. 1610 /// Checks if the given array of loads can be represented as a vectorized, 1611 /// scatter or just simple gather. 1612 /// \param VL list of loads. 1613 /// \param VL0 main load value. 1614 /// \param Order returned order of load instructions. 1615 /// \param PointerOps returned list of pointer operands. 1616 /// \param BestVF return best vector factor, if recursive check found better 1617 /// vectorization sequences rather than masked gather. 1618 /// \param TryRecursiveCheck used to check if long masked gather can be 1619 /// represented as a serie of loads/insert subvector, if profitable. 1623unsigned *BestVF =
nullptr,
1624bool TryRecursiveCheck =
true)
const;
1626 /// Registers non-vectorizable sequence of loads 1631 /// Checks if the given loads sequence is known as not vectorizable 1632template <
typename T>
1639 /// This structure holds any data we need about the edges being traversed 1640 /// during buildTree_rec(). We keep track of: 1641 /// (i) the user TreeEntry index, and 1642 /// (ii) the index of the edge. 1647 /// The user TreeEntry. 1649 /// The operand index of the use. 1660 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1669 /// A helper class used for scoring candidates for two consecutive lanes. 1675int NumLanes;
// Total number of lanes (aka vectorization factor). 1676int MaxLevel;
// The maximum recursion depth for accumulating score. 1682 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1683 MaxLevel(MaxLevel) {}
1685// The hard-coded scores listed here are not very important, though it shall 1686// be higher for better matches to improve the resulting cost. When 1687// computing the scores of matching one sub-tree with another, we are 1688// basically counting the number of values that are matching. So even if all 1689// scores are set to 1, we would still get a decent matching result. 1690// However, sometimes we have to break ties. For example we may have to 1691// choose between matching loads vs matching opcodes. This is what these 1692// scores are helping us with: they provide the order of preference. Also, 1693// this is important if the scalar is externally used or used in another 1694// tree entry node in the different lane. 1696 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). 1698 /// The same load multiple times. This should have a better score than 1699 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it 1700 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for 1701 /// a vector load and 1.0 for a broadcast. 1703 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). 1705 /// A load candidate for masked gather. 1707 /// ExtractElementInst from same vector and consecutive indexes. 1709 /// ExtractElementInst from same vector and reversed indices. 1713 /// Instructions with the same opcode. 1715 /// Instructions with alt opcodes (e.g, add + sub). 1717 /// Identical instructions (a.k.a. splat or broadcast). 1719 /// Matching with an undef is preferable to failing. 1721 /// Score for failing to find a decent match. 1723 /// Score if all users are vectorized. 1726 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. 1727 /// \p U1 and \p U2 are the users of \p V1 and \p V2. 1728 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p 1737if (isa<LoadInst>(V1)) {
1738// Retruns true if the users of V1 and V2 won't need to be extracted. 1739auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1740// Bail out if we have too many uses to save compilation time. 1744auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1746 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1749return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1751// A broadcast of a load can be cheaper on some targets. 1752if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1754 ((
int)V1->getNumUses() == NumLanes ||
1755 AllUsersAreInternal(V1, V2)))
1761auto CheckSameEntryOrFail = [&]() {
1762if (
const TreeEntry *TE1 = R.getTreeEntry(V1);
1763 TE1 && TE1 == R.getTreeEntry(V2))
1768auto *LI1 = dyn_cast<LoadInst>(V1);
1769auto *LI2 = dyn_cast<LoadInst>(V2);
1771if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1773return CheckSameEntryOrFail();
1776 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1777 LI2->getPointerOperand(),
DL, SE,
/*StrictCheck=*/true);
1778if (!Dist || *Dist == 0) {
1781 R.TTI->isLegalMaskedGather(
1784return CheckSameEntryOrFail();
1786// The distance is too large - still may be profitable to use masked 1788if (std::abs(*Dist) > NumLanes / 2)
1790// This still will detect consecutive loads, but we might have "holes" 1791// in some cases. It is ok for non-power-2 vectorization and may produce 1792// better results. It should not affect current vectorization. 1797auto *C1 = dyn_cast<Constant>(V1);
1798auto *C2 = dyn_cast<Constant>(V2);
1802// Extracts from consecutive indexes of the same vector better score as 1803// the extracts could be optimized away. 1807// Undefs are always profitable for extractelements. 1808// Compiler can easily combine poison and extractelement <non-poison> or 1809// undef and extractelement <poison>. But combining undef + 1810// extractelement <non-poison-but-may-produce-poison> requires some 1812if (isa<UndefValue>(V2))
1821// Undefs are always profitable for extractelements. 1829int Dist = Idx2 - Idx1;
1830// The distance is too large - still may be profitable to use 1832if (std::abs(Dist) == 0)
1834if (std::abs(Dist) > NumLanes / 2)
1841return CheckSameEntryOrFail();
1844auto *I1 = dyn_cast<Instruction>(V1);
1845auto *I2 = dyn_cast<Instruction>(V2);
1847if (I1->getParent() != I2->getParent())
1848return CheckSameEntryOrFail();
1853// Note: Only consider instructions with <= 2 operands to avoid 1854// complexity explosion. 1856 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.
empty() ||
1857 !S.isAltShuffle()) &&
1859return isa<PoisonValue>(V) ||
1860 cast<Instruction>(V)->getNumOperands() ==
1861 S.getMainOp()->getNumOperands();
1867if (I1 && isa<PoisonValue>(V2))
1870if (isa<UndefValue>(V2))
1873return CheckSameEntryOrFail();
1876 /// Go through the operands of \p LHS and \p RHS recursively until 1877 /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are 1878 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands 1879 /// of \p U1 and \p U2), except at the beginning of the recursion where 1880 /// these are set to nullptr. 1884 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] 1889 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at 1890 /// each level recursively, accumulating the score. It starts from matching 1891 /// the additions at level 0, then moves on to the loads (level 1). The 1892 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and 1893 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while 1894 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail. 1895 /// Please note that the order of the operands does not matter, as we 1896 /// evaluate the score of all profitable combinations of operands. In 1897 /// other words the score of G1 and G4 is the same as G1 and G2. This 1898 /// heuristic is based on ideas described in: 1899 /// Look-ahead SLP: Auto-vectorization in the presence of commutative 1900 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, 1906// Get the shallow score of V1 and V2. 1907int ShallowScoreAtThisLevel =
1910// If reached MaxLevel, 1911// or if V1 and V2 are not instructions, 1912// or if they are SPLAT, 1913// or if they are not consecutive, 1914// or if profitable to vectorize loads or extractelements, early return 1916auto *I1 = dyn_cast<Instruction>(
LHS);
1917auto *I2 = dyn_cast<Instruction>(
RHS);
1918if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1920 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1921 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1922 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1923 ShallowScoreAtThisLevel))
1924return ShallowScoreAtThisLevel;
1925assert(I1 && I2 &&
"Should have early exited.");
1927// Contains the I2 operand indexes that got matched with I1 operands. 1930// Recursion towards the operands of I1 and I2. We are trying all possible 1931// operand pairs, and keeping track of the best score. 1932for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1933 OpIdx1 != NumOperands1; ++OpIdx1) {
1934// Try to pair op1I with the best operand of I2. 1936unsigned MaxOpIdx2 = 0;
1937bool FoundBest =
false;
1938// If I2 is commutative try all combinations. 1941 ? I2->getNumOperands()
1942 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1943assert(FromIdx <= ToIdx &&
"Bad index");
1944for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1945// Skip operands already paired with OpIdx1. 1946if (Op2Used.
count(OpIdx2))
1948// Recursively calculate the cost at each level 1951 I1, I2, CurrLevel + 1, {});
1952// Look for the best score. 1954 TmpScore > MaxTmpScore) {
1955 MaxTmpScore = TmpScore;
1961// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. 1962 Op2Used.
insert(MaxOpIdx2);
1963 ShallowScoreAtThisLevel += MaxTmpScore;
1966return ShallowScoreAtThisLevel;
1969 /// A helper data structure to hold the operands of a vector of instructions. 1970 /// This supports a fixed vector length for all operand vectors. 1972 /// For each operand we need (i) the value, and (ii) the opcode that it 1973 /// would be attached to if the expression was in a left-linearized form. 1974 /// This is required to avoid illegal operand reordering. 1979 /// Op1 Op2 Linearized + Op2 1980 /// \ / ----------> |/ 1983 /// Op1 - Op2 (0 + Op1) - Op2 1986 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. 1988 /// Another way to think of this is to track all the operations across the 1989 /// path from the operand all the way to the root of the tree and to 1990 /// calculate the operation that corresponds to this path. For example, the 1991 /// path from Op2 to the root crosses the RHS of the '-', therefore the 1992 /// corresponding operation is a '-' (which matches the one in the 1993 /// linearized tree, as shown above). 1995 /// For lack of a better term, we refer to this operation as Accumulated 1996 /// Path Operation (APO). 1998 OperandData() =
default;
1999 OperandData(
Value *V,
bool APO,
bool IsUsed)
2000 : V(V), APO(APO), IsUsed(IsUsed) {}
2001 /// The operand value. 2003 /// TreeEntries only allow a single opcode, or an alternate sequence of 2004 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the 2005 /// APO. It is set to 'true' if 'V' is attached to an inverse operation 2006 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise 2009 /// Helper data for the reordering function. 2013 /// During operand reordering, we are trying to select the operand at lane 2014 /// that matches best with the operand at the neighboring lane. Our 2015 /// selection is based on the type of value we are looking for. For example, 2016 /// if the neighboring lane has a load, we need to look for a load that is 2017 /// accessing a consecutive address. These strategies are summarized in the 2018 /// 'ReorderingMode' enumerator. 2019enum class ReorderingMode {
2020 Load,
///< Matching loads to consecutive memory addresses 2021 Opcode,
///< Matching instructions based on opcode (same or alternate) 2023Splat,
///< Matching the same instruction multiple times (broadcast) 2024Failed,
///< We failed to create a vectorizable group 2029 /// A vector of operand vectors. 2031 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0] 2032 /// is not IntrinsicInst, ArgSize is User::getNumOperands. 2033unsigned ArgSize = 0;
2039constLoop *L =
nullptr;
2041 /// \returns the operand data at \p OpIdx and \p Lane. 2042 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
2043return OpsVec[OpIdx][Lane];
2046 /// \returns the operand data at \p OpIdx and \p Lane. Const version. 2047const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const{
2048return OpsVec[OpIdx][Lane];
2051 /// Clears the used flag for all entries. 2053for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
2054 OpIdx != NumOperands; ++OpIdx)
2055for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2057 OpsVec[OpIdx][Lane].IsUsed =
false;
2060 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. 2061void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
2062std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2065 /// \param Lane lane of the operands under analysis. 2066 /// \param OpIdx operand index in \p Lane lane we're looking the best 2068 /// \param Idx operand index of the current candidate value. 2069 /// \returns The additional score due to possible broadcasting of the 2070 /// elements in the lane. It is more profitable to have power-of-2 unique 2071 /// elements in the lane, it will be vectorized with higher probability 2072 /// after removing duplicates. Currently the SLP vectorizer supports only 2073 /// vectorization of the power-of-2 number of unique scalars. 2074int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsignedIdx,
2076Value *IdxLaneV = getData(
Idx, Lane).V;
2077if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2078 isa<ExtractElementInst>(IdxLaneV))
2081for (
unsigned Ln : seq<unsigned>(getNumLanes())) {
2084Value *OpIdxLnV = getData(OpIdx, Ln).V;
2085if (!isa<Instruction>(OpIdxLnV))
2089unsigned UniquesCount = Uniques.
size();
2090auto IdxIt = Uniques.
find(IdxLaneV);
2091unsigned UniquesCntWithIdxLaneV =
2092 IdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2093Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2094auto OpIdxIt = Uniques.
find(OpIdxLaneV);
2095unsigned UniquesCntWithOpIdxLaneV =
2096 OpIdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2097if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2099return std::min(
bit_ceil(UniquesCntWithOpIdxLaneV) -
2100 UniquesCntWithOpIdxLaneV,
2101 UniquesCntWithOpIdxLaneV -
2103 ((IdxIt != Uniques.
end() && UsedLanes.
test(IdxIt->second))
2104 ? UniquesCntWithIdxLaneV -
bit_floor(UniquesCntWithIdxLaneV)
2105 :
bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2108 /// \param Lane lane of the operands under analysis. 2109 /// \param OpIdx operand index in \p Lane lane we're looking the best 2111 /// \param Idx operand index of the current candidate value. 2112 /// \returns The additional score for the scalar which users are all 2114int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsignedIdx)
const{
2115Value *IdxLaneV = getData(
Idx, Lane).V;
2116Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2117// Do not care about number of uses for vector-like instructions 2118// (extractelement/extractvalue with constant indices), they are extracts 2119// themselves and already externally used. Vectorization of such 2120// instructions does not add extra extractelement instruction, just may 2125auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2126if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2128return R.areAllUsersVectorized(IdxLaneI)
2133 /// Score scaling factor for fully compatible instructions but with 2134 /// different number of external uses. Allows better selection of the 2135 /// instructions with less external uses. 2136staticconstint ScoreScaleFactor = 10;
2138 /// \Returns the look-ahead score, which tells us how much the sub-trees 2139 /// rooted at \p LHS and \p RHS match, the more they match the higher the 2140 /// score. This helps break ties in an informed way when we cannot decide on 2141 /// the order of the operands by just considering the immediate 2144int Lane,
unsigned OpIdx,
unsignedIdx,
2148// Keep track of the instruction stack as we recurse into the operands 2149// during the look-ahead score exploration. 2152/*CurrLevel=*/1, MainAltOps);
2154int SplatScore = getSplatScore(Lane, OpIdx,
Idx, UsedLanes);
2155if (Score <= -SplatScore) {
2159 Score += SplatScore;
2160// Scale score to see the difference between different operands 2161// and similar operands but all vectorized/not all vectorized 2162// uses. It does not affect actual selection of the best 2163// compatible operand in general, just allows to select the 2164// operand with all vectorized uses. 2165 Score *= ScoreScaleFactor;
2166 Score += getExternalUseScore(Lane, OpIdx,
Idx);
2173 /// Best defined scores per lanes between the passes. Used to choose the 2174 /// best operand (with the highest score) between the passes. 2175 /// The key - {Operand Index, Lane}. 2176 /// The value - the best score between the passes for the lane and the 2181// Search all operands in Ops[*][Lane] for the one that matches best 2182// Ops[OpIdx][LastLane] and return its opreand index. 2183// If no good match can be found, return std::nullopt. 2184 std::optional<unsigned>
2185 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
2189unsigned NumOperands = getNumOperands();
2191// The operand of the previous lane at OpIdx. 2192Value *OpLastLane = getData(OpIdx, LastLane).V;
2194// Our strategy mode for OpIdx. 2195 ReorderingMode RMode = ReorderingModes[OpIdx];
2196if (RMode == ReorderingMode::Failed)
2199// The linearized opcode of the operand at OpIdx, Lane. 2200bool OpIdxAPO = getData(OpIdx, Lane).APO;
2202// The best operand index and its score. 2203// Sometimes we have more than one option (e.g., Opcode and Undefs), so we 2204// are using the score to differentiate between the two. 2206 std::optional<unsigned>
Idx;
2210 BestScoresPerLanes.
try_emplace(std::make_pair(OpIdx, Lane), 0)
2213// Track if the operand must be marked as used. If the operand is set to 2214// Score 1 explicitly (because of non power-of-2 unique scalars, we may 2215// want to reestimate the operands again on the following iterations). 2216bool IsUsed = RMode == ReorderingMode::Splat ||
2217 RMode == ReorderingMode::Constant ||
2218 RMode == ReorderingMode::Load;
2219// Iterate through all unused operands and look for the best. 2220for (
unsignedIdx = 0;
Idx != NumOperands; ++
Idx) {
2221// Get the operand at Idx and Lane. 2222 OperandData &OpData = getData(
Idx, Lane);
2224bool OpAPO = OpData.APO;
2226// Skip already selected operands. 2230// Skip if we are trying to move the operand to a position with a 2231// different opcode in the linearized tree form. This would break the 2233if (OpAPO != OpIdxAPO)
2236// Look for an operand that matches the current mode. 2238case ReorderingMode::Load:
2239case ReorderingMode::Opcode: {
2240bool LeftToRight = Lane > LastLane;
2241Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
2242Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
2243int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2244 OpIdx,
Idx, IsUsed, UsedLanes);
2245if (Score >
static_cast<int>(BestOp.Score) ||
2246 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
2249 BestOp.Score = Score;
2250 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2254case ReorderingMode::Constant:
2255if (isa<Constant>(
Op) ||
2256 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
2258if (isa<Constant>(
Op)) {
2260 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2263if (isa<UndefValue>(
Op) || !isa<Constant>(
Op))
2267case ReorderingMode::Splat:
2268if (
Op == OpLastLane || (!BestOp.Score && isa<Constant>(
Op))) {
2269 IsUsed =
Op == OpLastLane;
2270if (
Op == OpLastLane) {
2272 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2278case ReorderingMode::Failed:
2284 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2287// If we could not find a good match return std::nullopt. 2291 /// Helper for reorderOperandVecs. 2292 /// \returns the lane that we should start reordering from. This is the one 2293 /// which has the least number of operands that can freely move about or 2294 /// less profitable because it already has the most optimal set of operands. 2295unsigned getBestLaneToStartReordering()
const{
2296unsigned Min = UINT_MAX;
2297unsigned SameOpNumber = 0;
2298// std::pair<unsigned, unsigned> is used to implement a simple voting 2299// algorithm and choose the lane with the least number of operands that 2300// can freely move about or less profitable because it already has the 2301// most optimal set of operands. The first unsigned is a counter for 2302// voting, the second unsigned is the counter of lanes with instructions 2303// with same/alternate opcodes and same parent basic block. 2305// Try to be closer to the original results, if we have multiple lanes 2306// with same cost. If 2 lanes have the same cost, use the one with the 2308for (
intI = getNumLanes();
I > 0; --
I) {
2309unsigned Lane =
I - 1;
2310 OperandsOrderData NumFreeOpsHash =
2311 getMaxNumOperandsThatCanBeReordered(Lane);
2312// Compare the number of operands that can move and choose the one with 2314if (NumFreeOpsHash.NumOfAPOs < Min) {
2315 Min = NumFreeOpsHash.NumOfAPOs;
2316 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2318 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2319 }
elseif (NumFreeOpsHash.NumOfAPOs == Min &&
2320 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2321// Select the most optimal lane in terms of number of operands that 2322// should be moved around. 2323 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2324 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2325 }
elseif (NumFreeOpsHash.NumOfAPOs == Min &&
2326 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2327auto [It, Inserted] =
2328 HashMap.
try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2333// Select the lane with the minimum counter. 2334unsigned BestLane = 0;
2335unsigned CntMin = UINT_MAX;
2337if (
Data.second.first < CntMin) {
2338 CntMin =
Data.second.first;
2339 BestLane =
Data.second.second;
2345 /// Data structure that helps to reorder operands. 2346structOperandsOrderData {
2347 /// The best number of operands with the same APOs, which can be 2349unsigned NumOfAPOs = UINT_MAX;
2350 /// Number of operands with the same/alternate instruction opcode and 2352unsigned NumOpsWithSameOpcodeParent = 0;
2353 /// Hash for the actual operands ordering. 2354 /// Used to count operands, actually their position id and opcode 2355 /// value. It is used in the voting mechanism to find the lane with the 2356 /// least number of operands that can freely move about or less profitable 2357 /// because it already has the most optimal set of operands. Can be 2358 /// replaced with SmallVector<unsigned> instead but hash code is faster 2359 /// and requires less memory. 2362 /// \returns the maximum number of operands that are allowed to be reordered 2363 /// for \p Lane and the number of compatible instructions(with the same 2364 /// parent/opcode). This is used as a heuristic for selecting the first lane 2365 /// to start operand reordering. 2366 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const{
2367unsigned CntTrue = 0;
2368unsigned NumOperands = getNumOperands();
2369// Operands with the same APO can be reordered. We therefore need to count 2370// how many of them we have for each APO, like this: Cnt[APO] = x. 2371// Since we only have two APOs, namely true and false, we can avoid using 2372// a map. Instead we can simply count the number of operands that 2373// correspond to one of them (in this case the 'true' APO), and calculate 2374// the other by subtracting it from the total number of operands. 2375// Operands with the same instruction opcode and parent are more 2376// profitable since we don't need to move them in many cases, with a high 2377// probability such lane already can be vectorized effectively. 2378bool AllUndefs =
true;
2379unsigned NumOpsWithSameOpcodeParent = 0;
2383for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384const OperandData &OpData = getData(OpIdx, Lane);
2387// Use Boyer-Moore majority voting for finding the majority opcode and 2388// the number of times it occurs. 2389if (
auto *
I = dyn_cast<Instruction>(OpData.V)) {
2391I->getParent() != Parent) {
2392if (NumOpsWithSameOpcodeParent == 0) {
2393 NumOpsWithSameOpcodeParent = 1;
2395 Parent =
I->getParent();
2397 --NumOpsWithSameOpcodeParent;
2400 ++NumOpsWithSameOpcodeParent;
2404 Hash,
hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2405 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2409 OperandsOrderData
Data;
2410Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2411Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2416 /// Go through the instructions in VL and append their operands. 2419assert((empty() || VL.
size() == getNumLanes()) &&
2420"Expected same number of lanes");
2421assert(S.valid() &&
"InstructionsState is invalid.");
2422// IntrinsicInst::isCommutative returns true if swapping the first "two" 2423// arguments to the intrinsic produces the same result. 2424constexprunsigned IntrinsicNumOperands = 2;
2427 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
2428 OpsVec.
resize(NumOperands);
2429unsigned NumLanes = VL.
size();
2430for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2431 OpsVec[OpIdx].
resize(NumLanes);
2432for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2433assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2434"Expected instruction or poison value");
2435// Our tree has just 3 nodes: the root and two operands. 2436// It is therefore trivial to get the APO. We only need to check the 2437// opcode of VL[Lane] and whether the operand at OpIdx is the LHS or 2438// RHS operand. The LHS operand of both add and sub is never attached 2439// to an inversese operation in the linearized form, therefore its APO 2440// is false. The RHS is true only if VL[Lane] is an inverse operation. 2442// Since operand reordering is performed on groups of commutative 2443// operations or alternating sequences (e.g., +, -), we can safely 2444// tell the inverse operations by checking commutativity. 2445if (isa<PoisonValue>(VL[Lane])) {
2446if (
auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
2448 OpsVec[OpIdx][Lane] = {EI->getVectorOperand(),
true,
false};
2451 }
elseif (
auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
2453 OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(),
true,
false};
2457 OpsVec[OpIdx][Lane] = {
2462bool IsInverseOperation = !
isCommutative(cast<Instruction>(VL[Lane]));
2463bool APO = (OpIdx == 0) ?
false : IsInverseOperation;
2464 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2470 /// \returns the number of operands. 2471unsigned getNumOperands()
const{
return ArgSize; }
2473 /// \returns the number of lanes. 2474unsigned getNumLanes()
const{
return OpsVec[0].
size(); }
2476 /// \returns the operand value at \p OpIdx and \p Lane. 2477Value *getValue(
unsigned OpIdx,
unsigned Lane)
const{
2478return getData(OpIdx, Lane).V;
2481 /// \returns true if the data structure is empty. 2482bool empty()
const{
return OpsVec.
empty(); }
2484 /// Clears the data. 2485void clear() { OpsVec.
clear(); }
2487 /// \Returns true if there are enough operands identical to \p Op to fill 2488 /// the whole vector (it is mixed with constants or loop invariant values). 2489 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow. 2490bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
2491assert(
Op == getValue(OpIdx, Lane) &&
2492"Op is expected to be getValue(OpIdx, Lane).");
2493// Small number of loads - try load matching. 2494if (isa<LoadInst>(
Op) && getNumLanes() == 2 && getNumOperands() == 2)
2496bool OpAPO = getData(OpIdx, Lane).APO;
2497bool IsInvariant = L && L->isLoopInvariant(
Op);
2499for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2502// This is set to true if we found a candidate for broadcast at Lane. 2503bool FoundCandidate =
false;
2504for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2505 OperandData &
Data = getData(OpI, Ln);
2506if (
Data.APO != OpAPO ||
Data.IsUsed)
2508Value *OpILane = getValue(OpI, Lane);
2509bool IsConstantOp = isa<Constant>(OpILane);
2510// Consider the broadcast candidate if: 2511// 1. Same value is found in one of the operands. 2513// 2. The operand in the given lane is not constant but there is a 2514// constant operand in another lane (which can be moved to the 2515// given lane). In this case we can represent it as a simple 2516// permutation of constant and broadcast. 2518 ((Lns > 2 && isa<Constant>(
Data.V)) ||
2519// 2.1. If we have only 2 lanes, need to check that value in the 2520// next lane does not build same opcode sequence. 2523 isa<Constant>(
Data.V)))) ||
2524// 3. The operand in the current lane is loop invariant (can be 2525// hoisted out) and another operand is also a loop invariant 2526// (though not a constant). In this case the whole vector can be 2528// FIXME: need to teach the cost model about this case for better 2530 (IsInvariant && !isa<Constant>(
Data.V) &&
2532 L->isLoopInvariant(
Data.V))) {
2533 FoundCandidate =
true;
2543return getNumLanes() == 2 || Cnt > 1;
2546 /// Checks if there is at least single compatible operand in lanes other 2547 /// than \p Lane, compatible with the operand \p Op. 2548bool canBeVectorized(
Instruction *
Op,
unsigned OpIdx,
unsigned Lane)
const{
2549assert(
Op == getValue(OpIdx, Lane) &&
2550"Op is expected to be getValue(OpIdx, Lane).");
2551bool OpAPO = getData(OpIdx, Lane).APO;
2552for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2555if (
any_of(seq<unsigned>(getNumOperands()), [&](
unsigned OpI) {
2556const OperandData &
Data = getData(OpI, Ln);
2557if (
Data.APO != OpAPO ||
Data.IsUsed)
2559Value *OpILn = getValue(OpI, Ln);
2560return (L && L->isLoopInvariant(OpILn)) ||
2570 /// Initialize with all the operands of the instruction vector \p RootVL. 2573 : TLI(*R.TLI),
DL(*R.
DL), SE(*R.SE), R(R),
2574 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
2575// Append all the operands of RootVL. 2576 appendOperandsOfVL(RootVL, S);
2579 /// \Returns a value vector with the operands across all lanes for the 2580 /// opearnd at \p OpIdx. 2583assert(OpsVec[OpIdx].
size() == getNumLanes() &&
2584"Expected same num of lanes across all operands");
2585for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2586 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2590// Performs operand reordering for 2 or more operands. 2591// The original operands are in OrigOps[OpIdx][Lane]. 2592// The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. 2594unsigned NumOperands = getNumOperands();
2595unsigned NumLanes = getNumLanes();
2596// Each operand has its own mode. We are using this mode to help us select 2597// the instructions for each lane, so that they match best with the ones 2598// we have selected so far. 2601// This is a greedy single-pass algorithm. We are going over each lane 2602// once and deciding on the best order right away with no back-tracking. 2603// However, in order to increase its effectiveness, we start with the lane 2604// that has operands that can move the least. For example, given the 2606// Lane 0 : A[0] = B[0] + C[0] // Visited 3rd 2607// Lane 1 : A[1] = C[1] - B[1] // Visited 1st 2608// Lane 2 : A[2] = B[2] + C[2] // Visited 2nd 2609// Lane 3 : A[3] = C[3] - B[3] // Visited 4th 2610// we will start at Lane 1, since the operands of the subtraction cannot 2611// be reordered. Then we will visit the rest of the lanes in a circular 2612// fashion. That is, Lanes 2, then Lane 0, and finally Lane 3. 2614// Find the first lane that we will start our search from. 2615unsigned FirstLane = getBestLaneToStartReordering();
2617// Initialize the modes. 2618for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2619Value *OpLane0 = getValue(OpIdx, FirstLane);
2620// Keep track if we have instructions with all the same opcode on one 2622if (
auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2623// Check if OpLane0 should be broadcast. 2624if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2625 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2626 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2627elseif (isa<LoadInst>(OpILane0))
2628 ReorderingModes[OpIdx] = ReorderingMode::Load;
2630 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2631 }
elseif (isa<Constant>(OpLane0)) {
2632 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2633 }
elseif (isa<Argument>(OpLane0)) {
2634// Our best hope is a Splat. It may save some cost in some cases. 2635 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2641// Check that we don't have same operands. No need to reorder if operands 2642// are just perfect diamond or shuffled diamond match. Do not do it only 2643// for possible broadcasts or non-power of 2 number of scalars (just for 2645auto &&SkipReordering = [
this]() {
2648for (
const OperandData &
Data : Op0)
2652if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
2657// TODO: Check if we can remove a check for non-power-2 number of 2658// scalars after full support of non-power-2 vectorization. 2659return UniqueValues.
size() != 2 &&
2661 UniqueValues.
size());
2664// If the initial strategy fails for any of the operand indexes, then we 2665// perform reordering again in a second pass. This helps avoid assigning 2666// high priority to the failed strategy, and should improve reordering for 2667// the non-failed operand indexes. 2669// Check if no need to reorder operands since they're are perfect or 2670// shuffled diamond match. 2671// Need to do it to avoid extra external use cost counting for 2672// shuffled matches, which may cause regressions. 2673if (SkipReordering())
2675// Skip the second pass if the first pass did not fail. 2676bool StrategyFailed =
false;
2677// Mark all operand data as free to use. 2679// We keep the original operand order for the FirstLane, so reorder the 2680// rest of the lanes. We are visiting the nodes in a circular fashion, 2681// using FirstLane as the center point and increasing the radius 2684for (
unsignedI = 0;
I < NumOperands; ++
I)
2685 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
2688 UsedLanes.
set(FirstLane);
2689for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2690// Visit the lane on the right and then the lane on the left. 2692int Lane = FirstLane +
Direction * Distance;
2693if (Lane < 0 || Lane >= (
int)NumLanes)
2695 UsedLanes.
set(Lane);
2697assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
2699// Look for a good match for each operand. 2700for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2701// Search for the operand that matches SortedOps[OpIdx][Lane-1]. 2702 std::optional<unsigned> BestIdx =
2703 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2704 MainAltOps[OpIdx], UsedLanes);
2705// By not selecting a value, we allow the operands that follow to 2706// select a better matching value. We will get a non-null value in 2707// the next run of getBestOperand(). 2709// Swap the current operand with the one returned by 2711 swap(OpIdx, *BestIdx, Lane);
2713// Enable the second pass. 2714 StrategyFailed =
true;
2716// Try to get the alternate opcode and follow it during analysis. 2717if (MainAltOps[OpIdx].
size() != 2) {
2718 OperandData &AltOp = getData(OpIdx, Lane);
2719 InstructionsState OpS =
2721if (OpS && OpS.isAltShuffle())
2727// Skip second pass if the strategy did not fail. 2733#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2736case ReorderingMode::Load:
2738case ReorderingMode::Opcode:
2740case ReorderingMode::Constant:
2742case ReorderingMode::Splat:
2744case ReorderingMode::Failed:
2765constunsigned Indent = 2;
2768OS <<
"Operand " << Cnt++ <<
"\n";
2769for (
const OperandData &OpData : OpDataVec) {
2771if (
Value *V = OpData.V)
2775OS <<
", APO:" << OpData.APO <<
"}\n";
2787 /// Evaluate each pair in \p Candidates and return index into \p Candidates 2788 /// for a pair which have highest score deemed to have best chance to form 2789 /// root of profitable tree to vectorize. Return std::nullopt if no candidate 2790 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit 2791 /// of the cost, considered to be good enough score. 2797int BestScore = Limit;
2798 std::optional<int> Index;
2799for (
intI : seq<int>(0, Candidates.size())) {
2801 Candidates[
I].second,
2802/*U1=*/nullptr,
/*U2=*/nullptr,
2803/*CurrLevel=*/1, {});
2804if (Score > BestScore) {
2812 /// Checks if the instruction is marked for deletion. 2815 /// Removes an instruction from its block and eventually deletes it. 2816 /// It's like Instruction::eraseFromParent() except that the actual deletion 2817 /// is delayed until BoUpSLP is destructed. 2819 DeletedInstructions.insert(
I);
2822 /// Remove instructions from the parent function and clear the operands of \p 2823 /// DeadVals instructions, marking for deletion trivially dead operands. 2824template <
typename T>
2827for (
T *V : DeadVals) {
2828auto *
I = cast<Instruction>(V);
2829 DeletedInstructions.insert(
I);
2832for (
T *V : DeadVals) {
2833if (!V || !Processed.
insert(V).second)
2835auto *
I = cast<Instruction>(V);
2838if (
const TreeEntry *Entry = getTreeEntry(
I)) {
2839 Entries.push_back(Entry);
2840auto It = MultiNodeScalars.find(
I);
2841if (It != MultiNodeScalars.end())
2842 Entries.append(It->second.begin(), It->second.end());
2844for (
Use &U :
I->operands()) {
2845if (
auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2846 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2848 (Entries.empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
2849return Entry->VectorizedValue == OpI;
2853I->dropAllReferences();
2855for (
T *V : DeadVals) {
2856auto *
I = cast<Instruction>(V);
2862 cast<Instruction>(U.getUser()));
2864"trying to erase instruction with users.");
2865I->removeFromParent();
2868// Process the dead instruction list until empty. 2869while (!DeadInsts.
empty()) {
2872if (!VI || !VI->getParent())
2875"Live instruction found in dead worklist!");
2876assert(VI->use_empty() &&
"Instructions with uses are not dead.");
2878// Don't lose the debug info while deleting the instructions. 2881// Null out all of the instruction's operands to see if any operand 2882// becomes dead as we go. 2883for (
Use &OpU : VI->operands()) {
2884Value *OpV = OpU.get();
2892// If the operand is an instruction that became dead as we nulled out 2893// the operand, and if it is 'trivially' dead, delete it in a future 2895if (
auto *OpI = dyn_cast<Instruction>(OpV))
2896if (!DeletedInstructions.contains(OpI) &&
2901 VI->removeFromParent();
2902 DeletedInstructions.insert(VI);
2907 /// Checks if the instruction was already analyzed for being possible 2910return AnalyzedReductionsRoots.count(
I);
2912 /// Register given instruction as already analyzed for being possible 2915 AnalyzedReductionsRoots.insert(
I);
2917 /// Checks if the provided list of reduced values was checked already for 2922 /// Adds the list of reduced values to list of already checked values for the 2927 /// Clear the list of the analyzed reduction root instructions. 2929 AnalyzedReductionsRoots.clear();
2930 AnalyzedReductionVals.
clear();
2931 AnalyzedMinBWVals.
clear();
2933 /// Checks if the given value is gathered in one of the nodes. 2937 /// Checks if the given value is gathered in one of the nodes. 2941 /// Checks if the specified value was not schedule. 2943return NonScheduledFirst.
contains(V);
2946 /// Check if the value is vectorized in the tree. 2952 /// Determine if a node \p E in can be demoted to a smaller type with a 2953 /// truncation. We collect the entries that will be demoted in ToDemote. 2954 /// \param E Node for analysis 2955 /// \param ToDemote indices of the nodes to be demoted. 2956bool collectValuesToDemote(
2957const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
2960bool &IsProfitableToDemote,
bool IsTruncRoot)
const;
2962 /// Check if the operands on the edges \p Edges of the \p UserTE allows 2963 /// reordering (i.e. the operands can be reordered because they have only one 2964 /// user and reordarable). 2965 /// \param ReorderableGathers List of all gather nodes that require reordering 2966 /// (e.g., gather of extractlements or partially vectorizable loads). 2967 /// \param GatherOps List of gather operand nodes for \p UserTE that require 2968 /// reordering, subset of \p NonVectorized. 2970 canReorderOperands(TreeEntry *UserTE,
2975 /// Checks if the given \p TE is a gather node with clustered reused scalars 2976 /// and reorders it per given \p Mask. 2977void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2979 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, 2980 /// if any. If it is not vectorized (gather node), returns nullptr. 2981 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
2983 TreeEntry *TE =
nullptr;
2985 TE = getTreeEntry(V);
2986if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2988auto It = MultiNodeScalars.find(V);
2989if (It != MultiNodeScalars.end()) {
2990for (TreeEntry *E : It->second) {
2991if (
is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2999if (It != VL.
end()) {
3000assert(
TE->isSame(VL) &&
"Expected same scalars.");
3006 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, 3007 /// if any. If it is not vectorized (gather node), returns nullptr. 3008const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
3009unsigned OpIdx)
const{
3010returnconst_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
3011const_cast<TreeEntry *
>(UserTE), OpIdx);
3014 /// Checks if all users of \p I are the part of the vectorization tree. 3015bool areAllUsersVectorized(
3019 /// Return information about the vector formed for the specified index 3020 /// of a vector of (the same) instruction. 3023 /// \ returns the graph entry for the \p Idx operand of the \p E entry. 3024const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsignedIdx)
const;
3026 /// Gets the root instruction for the given node. If the node is a strided 3027 /// load/store node with the reverse order, the root instruction is the last 3029Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
3031 /// \returns Cast context for the given graph node. 3033 getCastContextHint(
const TreeEntry &TE)
const;
3035 /// \returns the cost of the vectorizable entry. 3040 /// This is the recursive part of buildTree. 3042const EdgeInfo &EI,
unsigned InterleaveFactor = 0);
3044 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can 3045 /// be vectorized to use the original vector (or aggregate "bitcast" to a 3046 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise 3047 /// returns false, setting \p CurrentOrder to either an empty vector or a 3048 /// non-identity permutation that allows to reuse extract instructions. 3049 /// \param ResizeAllowed indicates whether it is allowed to handle subvector 3053bool ResizeAllowed =
false)
const;
3055 /// Vectorize a single entry in the tree. 3056 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to 3057 /// avoid issues with def-use order. 3060 /// Returns vectorized operand node, that matches the order of the scalars 3061 /// operand number \p NodeIdx in entry \p E. 3062 TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
unsigned NodeIdx);
3063const TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
3064unsigned NodeIdx)
const{
3065returnconst_cast<BoUpSLP *
>(
this)->getMatchedVectorizedOperand(E, NodeIdx);
3068 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry 3070 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to 3071 /// avoid issues with def-use order. 3072Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
3074 /// Create a new vector from a list of scalar values. Produces a sequence 3075 /// which exploits values reused across lanes, and arranges the inserts 3076 /// for ease of later optimization. 3077template <
typename BVTy,
typename ResTy,
typename...
Args>
3078 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
3080 /// Create a new vector from a list of scalar values. Produces a sequence 3081 /// which exploits values reused across lanes, and arranges the inserts 3082 /// for ease of later optimization. 3083Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy,
3086 /// Returns the instruction in the bundle, which can be used as a base point 3087 /// for scheduling. Usually it is the last instruction in the bundle, except 3088 /// for the case when all operands are external (in this case, it is the first 3089 /// instruction in the list). 3090Instruction &getLastInstructionInBundle(
const TreeEntry *E);
3092 /// Tries to find extractelement instructions with constant indices from fixed 3093 /// vector type and gather such instructions into a bunch, which highly likely 3094 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt 3095 /// was successful, the matched scalars are replaced by poison values in \p VL 3096 /// for future analysis. 3097 std::optional<TargetTransformInfo::ShuffleKind>
3101 /// Tries to find extractelement instructions with constant indices from fixed 3102 /// vector type and gather such instructions into a bunch, which highly likely 3103 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt 3104 /// was successful, the matched scalars are replaced by poison values in \p VL 3105 /// for future analysis. 3109unsigned NumParts)
const;
3111 /// Checks if the gathered \p VL can be represented as a single register 3112 /// shuffle(s) of previous tree entries. 3113 /// \param TE Tree entry checked for permutation. 3114 /// \param VL List of scalars (a subset of the TE scalar), checked for 3115 /// permutations. Must form single-register vector. 3116 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also 3117 /// commands to build the mask using the original vector value, without 3118 /// relying on the potential reordering. 3119 /// \returns ShuffleKind, if gathered values can be represented as shuffles of 3120 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. 3121 std::optional<TargetTransformInfo::ShuffleKind>
3122 isGatherShuffledSingleRegisterEntry(
3127 /// Checks if the gathered \p VL can be represented as multi-register 3128 /// shuffle(s) of previous tree entries. 3129 /// \param TE Tree entry checked for permutation. 3130 /// \param VL List of scalars (a subset of the TE scalar), checked for 3132 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also 3133 /// commands to build the mask using the original vector value, without 3134 /// relying on the potential reordering. 3135 /// \returns per-register series of ShuffleKind, if gathered values can be 3136 /// represented as shuffles of previous tree entries. \p Mask is filled with 3137 /// the shuffle mask (also on per-register base). 3139 isGatherShuffledEntry(
3142unsigned NumParts,
bool ForOrder =
false);
3144 /// \returns the cost of gathering (inserting) the values in \p VL into a 3146 /// \param ForPoisonSrc true if initial vector is poison, false otherwise. 3148Type *ScalarTy)
const;
3150 /// Set the Builder insert point to one after the last instruction in 3152void setInsertPointAfterBundle(
const TreeEntry *E);
3154 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not 3155 /// specified, the starting vector value is poison. 3160 /// \returns whether the VectorizableTree is fully vectorizable and will 3161 /// be beneficial even the tree height is tiny. 3162bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
3164 /// Run through the list of all gathered loads in the graph and try to find 3165 /// vector loads/masked gathers instead of regular gathers. Later these loads 3166 /// are reshufled to build final gathered nodes. 3167void tryToVectorizeGatheredLoads(
3172 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the 3173 /// users of \p TE and collects the stores. It returns the map from the store 3174 /// pointers to the collected stores. 3176 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
3178 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the 3179 /// stores in \p StoresVec can form a vector instruction. If so it returns 3180 /// true and populates \p ReorderIndices with the shuffle indices of the 3181 /// stores when compared to the sorted vector. 3185 /// Iterates through the users of \p TE, looking for scalar stores that can be 3186 /// potentially vectorized in a future SLP-tree. If found, it keeps track of 3187 /// their order and builds an order index vector for each store bundle. It 3188 /// returns all these order vectors found. 3189 /// We run this after the tree has formed, otherwise we may come across user 3190 /// instructions that are not yet in the tree. 3192 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
3194 /// Tries to reorder the gathering node for better vectorization 3196void reorderGatherNode(TreeEntry &TE);
3200 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3202 /// \returns Common mask for reorder indices and reused scalars. 3210 /// \returns true if the scalars in VL are equal to this entry. 3217 [Scalars](
Value *V,
int Idx) {
3218 return (isa<UndefValue>(V) &&
3219 Idx == PoisonMaskElem) ||
3220 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3223if (!ReorderIndices.empty()) {
3224// TODO: implement matching if the nodes are just reordered, still can 3225// treat the vector as the same if the list of scalars matches VL 3226// directly, without reordering. 3230return IsSame(Scalars, Mask);
3231if (VL.
size() == ReuseShuffleIndices.size()) {
3233return IsSame(Scalars, Mask);
3237return IsSame(Scalars, ReuseShuffleIndices);
3240bool isOperandGatherNode(
const EdgeInfo &UserEI)
const{
3241returnisGather() && !UserTreeIndices.empty() &&
3242 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3243 UserTreeIndices.front().UserTE == UserEI.UserTE;
3246 /// \returns true if current entry has same operands as \p TE. 3247bool hasEqualOperands(
const TreeEntry &TE)
const{
3248if (
TE.getNumOperands() != getNumOperands())
3251for (
unsignedI = 0, E = getNumOperands();
I < E; ++
I) {
3252unsigned PrevCount =
Used.count();
3253for (
unsigned K = 0;
K < E; ++
K) {
3256if (getOperand(K) ==
TE.getOperand(
I)) {
3261// Check if we actually found the matching operand. 3262if (PrevCount ==
Used.count())
3268 /// \return Final vectorization factor for the node. Defined by the total 3269 /// number of vectorized scalars, including those, used several times in the 3270 /// entry and counted in the \a ReuseShuffleIndices, if any. 3271unsigned getVectorFactor()
const{
3272if (!ReuseShuffleIndices.empty())
3273return ReuseShuffleIndices.size();
3274return Scalars.
size();
3277 /// Checks if the current node is a gather node. 3278boolisGather()
const{
return State == NeedToGather; }
3280 /// A vector of scalars. 3283 /// The Scalars are vectorized into this value. It is initialized to Null. 3286 /// New vector phi instructions emitted for the vectorized phi nodes. 3289 /// Do we need to gather this sequence or vectorize it 3290 /// (either with vector instruction or with scatter/gather 3291 /// intrinsics for store/load)? 3293 Vectorize,
///< The node is regularly vectorized. 3294 ScatterVectorize,
///< Masked scatter/gather node. 3295 StridedVectorize,
///< Strided loads (and stores) 3296 NeedToGather,
///< Gather/buildvector node. 3297 CombinedVectorize,
///< Vectorized node, combined with its user into more 3298 ///< complex node like select/cmp to minmax, mul/add to 3299 ///< fma, etc. Must be used for the following nodes in 3300 ///< the pattern, not the very first one. 3304 /// List of combined opcodes supported by the vectorizer. 3305enum CombinedOpcode {
3307MinMax = Instruction::OtherOpsEnd + 1,
3309 CombinedOpcode CombinedOp = NotCombinedOp;
3311 /// Does this sequence require some shuffling? 3314 /// Does this entry require reordering? 3317 /// Points back to the VectorizableTree. 3319 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has 3320 /// to be a pointer and needs to be able to initialize the child iterator. 3321 /// Thus we need a reference back to the container to translate the indices 3323 VecTreeTy &Container;
3325 /// The TreeEntry index containing the user of this entry. We can actually 3326 /// have multiple users so the data structure is not truly a tree. 3329 /// The index of this treeEntry in VectorizableTree. 3332 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from 3333 /// other nodes as a series of insertvector instructions. 3337 /// The operands of each instruction in each lane Operands[op_index][lane]. 3338 /// Note: This helps avoid the replication of the code that performs the 3339 /// reordering of operands during buildTree_rec() and vectorizeTree(). 3342 /// MainOp and AltOp are recorded inside. S should be obtained from 3344 InstructionsState S = InstructionsState::invalid();
3346 /// Interleaving factor for interleaved loads Vectorize nodes. 3347unsigned InterleaveFactor = 0;
3350 /// Returns interleave factor for interleave nodes. 3351unsigned getInterleaveFactor()
const{
return InterleaveFactor; }
3352 /// Sets interleaving factor for the interleaving nodes. 3353void setInterleave(
unsigned Factor) { InterleaveFactor = Factor; }
3355 /// Set this bundle's \p OpIdx'th operand to \p OpVL. 3359assert(Operands[OpIdx].empty() &&
"Already resized?");
3361"Number of operands is greater than the number of scalars.");
3366 /// Set this bundle's operand from Scalars. 3367void setOperand(
constBoUpSLP &R,
bool RequireReorder =
false) {
3368 VLOperands Ops(Scalars, S, R);
3371for (
unsignedI : seq<unsigned>(S.getMainOp()->getNumOperands()))
3372 setOperand(
I, Ops.getVL(
I));
3375 /// Reorders operands of the node to the given mask \p Mask. 3381 /// \returns the \p OpIdx operand of this TreeEntry. 3387 /// \returns the \p OpIdx operand of this TreeEntry. 3393 /// \returns the number of operands. 3394unsigned getNumOperands()
const{
returnOperands.size(); }
3396 /// \return the single \p OpIdx operand. 3397Value *getSingleOperand(
unsigned OpIdx)
const{
3399assert(!Operands[OpIdx].empty() &&
"No operand available");
3403 /// Some of the instructions in the list have alternate opcodes. 3404bool isAltShuffle()
const{
return S.isAltShuffle(); }
3406bool isOpcodeOrAlt(
Instruction *
I)
const{
return S.isOpcodeOrAlt(
I); }
3408 /// Chooses the correct key for scheduling data. If \p Op has the same (or 3409 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is 3412auto *
I = dyn_cast<Instruction>(
Op);
3413if (
I && isOpcodeOrAlt(
I))
3415return S.getMainOp();
3418void setOperations(
const InstructionsState &S) {
3419assert(S &&
"InstructionsState is invalid.");
3423Instruction *getMainOp()
const{
return S.getMainOp(); }
3425Instruction *getAltOp()
const{
return S.getAltOp(); }
3427 /// The main/alternate opcodes for the list of instructions. 3428unsigned getOpcode()
const{
return S.
getOpcode(); }
3430unsigned getAltOpcode()
const{
return S.getAltOpcode(); }
3432bool hasState()
const{
return S.valid(); }
3434 /// When ReuseReorderShuffleIndices is empty it just returns position of \p 3435 /// V within vector of Scalars. Otherwise, try to remap on its reuse index. 3436int findLaneForValue(
Value *V)
const{
3437unsigned FoundLane = getVectorFactor();
3438for (
auto *It =
find(Scalars, V), *
End = Scalars.end(); It !=
End;
3439 std::advance(It, 1)) {
3442 FoundLane = std::distance(Scalars.begin(), It);
3443assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
3444if (!ReorderIndices.
empty())
3445 FoundLane = ReorderIndices[FoundLane];
3446assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
3447if (ReuseShuffleIndices.
empty())
3449if (
auto *RIt =
find(ReuseShuffleIndices, FoundLane);
3450 RIt != ReuseShuffleIndices.
end()) {
3451 FoundLane = std::distance(ReuseShuffleIndices.
begin(), RIt);
3455assert(FoundLane < getVectorFactor() &&
"Unable to find given value.");
3459 /// Build a shuffle mask for graph entry which represents a merge of main 3460 /// and alternate operations. 3467 /// Return true if this is a non-power-of-2 node. 3468bool isNonPowOf2Vec()
const{
3470return IsNonPowerOf2;
3473 /// Return true if this is a node, which tries to vectorize number of 3474 /// elements, forming whole vectors. 3479assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
3480"Reshuffling not supported with non-power-of-2 vectors yet.");
3481return IsNonPowerOf2;
3484Value *getOrdered(
unsigned Idx)
const{
3485assert(
isGather() &&
"Must be used only for buildvectors/gathers.");
3486if (ReorderIndices.
empty())
3497for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
3498dbgs() <<
"Operand " << OpI <<
":\n";
3499for (
constValue *V : Operands[OpI])
3502dbgs() <<
"Scalars: \n";
3503for (
Value *V : Scalars)
3508if (InterleaveFactor > 0) {
3509dbgs() <<
"Vectorize with interleave factor " << InterleaveFactor
3512dbgs() <<
"Vectorize\n";
3515case ScatterVectorize:
3516dbgs() <<
"ScatterVectorize\n";
3518case StridedVectorize:
3519dbgs() <<
"StridedVectorize\n";
3522dbgs() <<
"NeedToGather\n";
3524case CombinedVectorize:
3525dbgs() <<
"CombinedVectorize\n";
3529dbgs() <<
"MainOp: " << *S.getMainOp() <<
"\n";
3530dbgs() <<
"AltOp: " << *S.getAltOp() <<
"\n";
3532dbgs() <<
"MainOp: NULL\n";
3533dbgs() <<
"AltOp: NULL\n";
3535dbgs() <<
"VectorizedValue: ";
3537dbgs() << *VectorizedValue <<
"\n";
3540dbgs() <<
"ReuseShuffleIndices: ";
3541if (ReuseShuffleIndices.
empty())
3544for (
int ReuseIdx : ReuseShuffleIndices)
3545dbgs() << ReuseIdx <<
", ";
3547dbgs() <<
"ReorderIndices: ";
3548for (
unsigned ReorderIdx : ReorderIndices)
3549dbgs() << ReorderIdx <<
", ";
3551dbgs() <<
"UserTreeIndices: ";
3552for (
constauto &EInfo : UserTreeIndices)
3553dbgs() << EInfo <<
", ";
3555if (!CombinedEntriesWithIndices.
empty()) {
3556dbgs() <<
"Combined entries: ";
3558dbgs() <<
"Entry index " <<
P.first <<
" with offset " <<
P.second;
3567void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
3570dbgs() <<
"SLP: " << Banner <<
":\n";
3572dbgs() <<
"SLP: Costs:\n";
3573dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
3574dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
3575dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
3576dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = " 3577 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
3581 /// Create a new VectorizableTree entry. 3583 std::optional<ScheduleData *> Bundle,
3584const InstructionsState &S,
3585const EdgeInfo &UserTreeIdx,
3588unsigned InterleaveFactor = 0) {
3589 TreeEntry::EntryState EntryState =
3590 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3591 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3592 ReuseShuffleIndices, ReorderIndices);
3593if (E && InterleaveFactor > 0)
3594 E->setInterleave(InterleaveFactor);
3599 TreeEntry::EntryState EntryState,
3600 std::optional<ScheduleData *> Bundle,
3601const InstructionsState &S,
3602const EdgeInfo &UserTreeIdx,
3605assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3606 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3607"Need to vectorize gather entry?");
3608// Gathered loads still gathered? Do not create entry, use the original one. 3609if (GatheredLoadsEntriesFirst.has_value() &&
3610 EntryState == TreeEntry::NeedToGather && S &&
3611 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3612 !UserTreeIdx.UserTE)
3614 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
3615 TreeEntry *
Last = VectorizableTree.
back().get();
3616Last->Idx = VectorizableTree.
size() - 1;
3617Last->State = EntryState;
3618// FIXME: Remove once support for ReuseShuffleIndices has been implemented 3619// for non-power-of-two vectors. 3622 ReuseShuffleIndices.empty()) &&
3623"Reshuffling scalars not yet supported for nodes with padding");
3624Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3625 ReuseShuffleIndices.end());
3626if (ReorderIndices.
empty()) {
3629Last->setOperations(S);
3631// Reorder scalars and build final mask. 3632Last->Scalars.assign(VL.
size(),
nullptr);
3635 if (Idx >= VL.size())
3636 return UndefValue::get(VL.front()->getType());
3641Last->setOperations(S);
3642Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
3644if (!
Last->isGather()) {
3645for (
Value *V : VL) {
3646if (isa<PoisonValue>(V))
3648const TreeEntry *
TE = getTreeEntry(V);
3650"Scalar already in tree!");
3653 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
3656 ScalarToTreeEntry[
V] =
Last;
3658// Update the scheduler bundle to point to this TreeEntry. 3659 ScheduleData *BundleMember = *Bundle;
3660assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3663"Bundle and VL out of sync");
3665for (
Value *V : VL) {
3670 BundleMember->TE =
Last;
3671 BundleMember = BundleMember->NextInBundle;
3674assert(!BundleMember &&
"Bundle and VL out of sync");
3676// Build a map for gathered scalars to the nodes where they are used. 3677bool AllConstsOrCasts =
true;
3680auto *
I = dyn_cast<CastInst>(V);
3681 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
3682if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3683 !UserTreeIdx.UserTE->isGather())
3686if (AllConstsOrCasts)
3688 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3689 MustGather.
insert(VL.begin(), VL.end());
3692if (UserTreeIdx.UserTE)
3693Last->UserTreeIndices.push_back(UserTreeIdx);
3697 /// -- Vectorization State -- 3698 /// Holds all of the tree entries. 3699 TreeEntry::VecTreeTy VectorizableTree;
3704for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3705 VectorizableTree[
Id]->dump();
3711 TreeEntry *getTreeEntry(
Value *V) {
3712assert(V &&
"V cannot be nullptr.");
3713return ScalarToTreeEntry.lookup(V);
3716const TreeEntry *getTreeEntry(
Value *V)
const{
3717assert(V &&
"V cannot be nullptr.");
3718return ScalarToTreeEntry.lookup(V);
3721 /// Check that the operand node of alternate node does not generate 3722 /// buildvector sequence. If it is, then probably not worth it to build 3723 /// alternate shuffle, if number of buildvector operands + alternate 3724 /// instruction > than the number of buildvector instructions. 3725 /// \param S the instructions state of the analyzed values. 3726 /// \param VL list of the instructions with alternate opcodes. 3727bool areAltOperandsProfitable(
const InstructionsState &S,
3730 /// Checks if the specified list of the instructions/values can be vectorized 3731 /// and fills required data before actual scheduling of the instructions. 3732 TreeEntry::EntryState
3734bool IsScatterVectorizeUserTE,
3738 /// Maps a specific scalar to its tree entry. 3741 /// List of scalars, used in several vectorize nodes, and the list of the 3745 /// Maps a value to the proposed vectorizable size. 3748 /// A list of scalars that we found that we need to keep as scalars. 3751 /// A set of first non-schedulable values. 3754 /// A map between the vectorized entries and the last instructions in the 3755 /// bundles. The bundles are built in use order, not in the def order of the 3756 /// instructions. So, we cannot rely directly on the last instruction in the 3757 /// bundle being the last instruction in the program order during 3758 /// vectorization process since the basic blocks are affected, need to 3759 /// pre-gather them before. 3762 /// List of gather nodes, depending on other gather/vector nodes, which should 3763 /// be emitted after the vector instruction emission process to correctly 3764 /// handle order of the vector instructions and shuffles. 3767usingValueToGatherNodesMap =
3769 ValueToGatherNodesMap ValueToGatherNodes;
3771 /// A list of the load entries (node indices), which can be vectorized using 3772 /// strided or masked gather approach, but attempted to be represented as 3773 /// contiguous loads. 3776 /// true if graph nodes transforming mode is on. 3777bool IsGraphTransformMode =
false;
3779 /// The index of the first gathered load entry in the VectorizeTree. 3780 std::optional<unsigned> GatheredLoadsEntriesFirst;
3782 /// This POD struct describes one external user in the vectorized tree. 3787// Which scalar in our function. 3790// Which user that uses the scalar. 3793// Which lane does the scalar belong to. 3798 /// Checks if two instructions may access the same memory. 3800 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it 3801 /// is invariant in the calling loop. 3806// First check if the result is already in the cache. 3807 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3808auto It = AliasCache.
find(Key);
3809if (It != AliasCache.
end())
3812// Store the result in the cache. 3814 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3818usingAliasCacheKey = std::pair<Instruction *, Instruction *>;
3820 /// Cache for alias results. 3821 /// TODO: consider moving this to the AliasAnalysis itself. 3824// Cache for pointerMayBeCaptured calls inside AA. This is preserved 3825// globally through SLP because we don't perform any action which 3826// invalidates capture results. 3829 /// Temporary store for deleted instructions. Instructions will be deleted 3830 /// eventually when the BoUpSLP is destructed. The deferral is required to 3831 /// ensure that there are no incorrect collisions in the AliasCache, which 3832 /// can happen if a new instruction is allocated at the same address as a 3833 /// previously deleted instruction. 3836 /// Set of the instruction, being analyzed already for reductions. 3839 /// Set of hashes for the list of reduction values already being analyzed. 3842 /// Values, already been analyzed for mininmal bitwidth and found to be 3846 /// A list of values that need to extracted out of the tree. 3847 /// This list holds pairs of (Internal Scalar : External User). External User 3848 /// can be nullptr, it means that this Internal Scalar will be used later, 3849 /// after vectorization. 3850 UserList ExternalUses;
3852 /// A list of GEPs which can be reaplced by scalar GEPs instead of 3853 /// extractelement instructions. 3856 /// Values used only by @llvm.assume calls. 3859 /// Holds all of the instructions that we gathered, shuffle instructions and 3860 /// extractelements. 3863 /// A list of blocks that we are going to CSE. 3866 /// List of hashes of vector of loads, which are known to be non vectorizable. 3869 /// Contains all scheduling relevant data for an instruction. 3870 /// A ScheduleData either represents a single instruction or a member of an 3871 /// instruction bundle (= a group of instructions which is combined into a 3872 /// vector instruction). 3874// The initial value for the dependency counters. It means that the 3875// dependencies are not calculated yet. 3876enum { InvalidDeps = -1 };
3878 ScheduleData() =
default;
3881 FirstInBundle =
this;
3882 NextInBundle =
nullptr;
3883 NextLoadStore =
nullptr;
3885 SchedulingRegionID = BlockSchedulingRegionID;
3886 clearDependencies();
3891 /// Verify basic self consistency properties 3893if (hasValidDependencies()) {
3894assert(UnscheduledDeps <= Dependencies &&
"invariant");
3896assert(UnscheduledDeps == Dependencies &&
"invariant");
3900assert(isSchedulingEntity() &&
3901"unexpected scheduled state");
3902for (
const ScheduleData *BundleMember =
this; BundleMember;
3903 BundleMember = BundleMember->NextInBundle) {
3904assert(BundleMember->hasValidDependencies() &&
3905 BundleMember->UnscheduledDeps == 0 &&
3906"unexpected scheduled state");
3907assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
3908"only bundle is marked scheduled");
3912assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3913"all bundle members must be in same basic block");
3916 /// Returns true if the dependency information has been calculated. 3917 /// Note that depenendency validity can vary between instructions within 3918 /// a single bundle. 3919bool hasValidDependencies()
const{
return Dependencies != InvalidDeps; }
3921 /// Returns true for single instructions and for bundle representatives 3922 /// (= the head of a bundle). 3923bool isSchedulingEntity()
const{
return FirstInBundle ==
this; }
3925 /// Returns true if it represents an instruction bundle and not only a 3926 /// single instruction. 3927bool isPartOfBundle()
const{
3928return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
3931 /// Returns true if it is ready for scheduling, i.e. it has no more 3932 /// unscheduled depending instructions/bundles. 3933bool isReady()
const{
3934assert(isSchedulingEntity() &&
3935"can't consider non-scheduling entity for ready list");
3936return unscheduledDepsInBundle() == 0 && !IsScheduled;
3939 /// Modifies the number of unscheduled dependencies for this instruction, 3940 /// and returns the number of remaining dependencies for the containing 3942int incrementUnscheduledDeps(
int Incr) {
3943assert(hasValidDependencies() &&
3944"increment of unscheduled deps would be meaningless");
3945 UnscheduledDeps += Incr;
3946return FirstInBundle->unscheduledDepsInBundle();
3949 /// Sets the number of unscheduled dependencies to the number of 3951void resetUnscheduledDeps() {
3952 UnscheduledDeps = Dependencies;
3955 /// Clears all dependency information. 3956void clearDependencies() {
3957 Dependencies = InvalidDeps;
3958 resetUnscheduledDeps();
3959 MemoryDependencies.clear();
3960 ControlDependencies.clear();
3963int unscheduledDepsInBundle()
const{
3964assert(isSchedulingEntity() &&
"only meaningful on the bundle");
3966for (
const ScheduleData *BundleMember =
this; BundleMember;
3967 BundleMember = BundleMember->NextInBundle) {
3968if (BundleMember->UnscheduledDeps == InvalidDeps)
3970 Sum += BundleMember->UnscheduledDeps;
3976if (!isSchedulingEntity()) {
3978 }
elseif (NextInBundle) {
3980 ScheduleData *SD = NextInBundle;
3982 os <<
';' << *SD->Inst;
3983 SD = SD->NextInBundle;
3995 /// The TreeEntry that this instruction corresponds to. 3996 TreeEntry *
TE =
nullptr;
3998 /// Points to the head in an instruction bundle (and always to this for 3999 /// single instructions). 4000 ScheduleData *FirstInBundle =
nullptr;
4002 /// Single linked list of all instructions in a bundle. Null if it is a 4003 /// single instruction. 4004 ScheduleData *NextInBundle =
nullptr;
4006 /// Single linked list of all memory instructions (e.g. load, store, call) 4007 /// in the block - until the end of the scheduling region. 4008 ScheduleData *NextLoadStore =
nullptr;
4010 /// The dependent memory instructions. 4011 /// This list is derived on demand in calculateDependencies(). 4014 /// List of instructions which this instruction could be control dependent 4015 /// on. Allowing such nodes to be scheduled below this one could introduce 4016 /// a runtime fault which didn't exist in the original program. 4017 /// ex: this is a load or udiv following a readonly call which inf loops 4020 /// This ScheduleData is in the current scheduling region if this matches 4021 /// the current SchedulingRegionID of BlockScheduling. 4022int SchedulingRegionID = 0;
4024 /// Used for getting a "good" final ordering of instructions. 4025int SchedulingPriority = 0;
4027 /// The number of dependencies. Constitutes of the number of users of the 4028 /// instruction plus the number of dependent memory instructions (if any). 4029 /// This value is calculated on demand. 4030 /// If InvalidDeps, the number of dependencies is not calculated yet. 4031int Dependencies = InvalidDeps;
4033 /// The number of dependencies minus the number of dependencies of scheduled 4034 /// instructions. As soon as this is zero, the instruction/bundle gets ready 4036 /// Note that this is negative as long as Dependencies is not calculated. 4037int UnscheduledDeps = InvalidDeps;
4039 /// True if this instruction is scheduled (or considered as scheduled in the 4041bool IsScheduled =
false;
4046const BoUpSLP::ScheduleData &SD) {
4055 /// Contains all scheduling data for a basic block. 4056 /// It does not schedules instructions, which are not memory read/write 4057 /// instructions and their operands are either constants, or arguments, or 4058 /// phis, or instructions from others blocks, or their users are phis or from 4059 /// the other blocks. The resulting vector instructions can be placed at the 4060 /// beginning of the basic block without scheduling (if operands does not need 4061 /// to be scheduled) or at the end of the block (if users are outside of the 4062 /// block). It allows to save some compile time and memory used by the 4064 /// ScheduleData is assigned for each instruction in between the boundaries of 4065 /// the tree entry, even for those, which are not part of the graph. It is 4066 /// required to correctly follow the dependencies between the instructions and 4067 /// their correct scheduling. The ScheduleData is not allocated for the 4068 /// instructions, which do not require scheduling, like phis, nodes with 4069 /// extractelements/insertelements only or nodes with instructions, with 4070 /// uses/operands outside of the block. 4071structBlockScheduling {
4073 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
4077 ScheduleStart =
nullptr;
4078 ScheduleEnd =
nullptr;
4079 FirstLoadStoreInRegion =
nullptr;
4080 LastLoadStoreInRegion =
nullptr;
4081 RegionHasStackSave =
false;
4083// Reduce the maximum schedule region size by the size of the 4084// previous scheduling run. 4085 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4088 ScheduleRegionSize = 0;
4090// Make a new scheduling region, i.e. all existing ScheduleData is not 4091// in the new region yet. 4092 ++SchedulingRegionID;
4096if (BB !=
I->getParent())
4097// Avoid lookup if can't possibly be in map. 4099 ScheduleData *SD = ScheduleDataMap.lookup(
I);
4100if (SD && isInSchedulingRegion(SD))
4105 ScheduleData *getScheduleData(
Value *V) {
4106if (
auto *
I = dyn_cast<Instruction>(V))
4107return getScheduleData(
I);
4111bool isInSchedulingRegion(ScheduleData *SD)
const{
4112return SD->SchedulingRegionID == SchedulingRegionID;
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {

        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after
            // decrementing, so we can put the dependent instruction
            // into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be reordered.
          auto *In = BundleMember->Inst;
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, In));
          assert(Lane >= 0 && "Lane not set");

          // Since the vectorization tree is being built recursively, this
          // assertion ensures that the tree entry has all operands set before
          // reaching this code. A couple of exceptions known at the moment are
          // extracts where their second (immediate) operand is not added.
          // Since immediates do not affect scheduler behavior this is
          // considered okay.
          assert(
              In &&
              (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
               In->getNumOperands() == TE->getNumOperands()) &&
              "Missed TreeEntry operands?");

          for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand reordering
          // has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }

        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }
    /// Verify basic self consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        SD->verify();
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
      }
    }
    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            SD->isReady()) {
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP: initially in ready list: " << *SD << "\n");
        }
      }
    }
    /// Build a bundle from the ScheduleData nodes corresponding to the
    /// scalar instruction for each lane.
    ScheduleData *buildBundle(ArrayRef<Value *> VL);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleData *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

    /// Allocates schedule data chunk.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();
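
    // Illustrative flow (sketch based on the declarations above): during tree
    // construction, tryScheduleBundle() extends the scheduling region to cover
    // the bundle, initializes ScheduleData for newly covered instructions,
    // calculates dependencies, and dry-runs the scheduling. If a bundle member
    // can never become ready (a cyclic dependency), the bundle is taken apart
    // again via cancelScheduling().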
    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleData *> ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the
    /// common case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    /// Make sure that the initial SchedulingRegionID is greater than the
    /// initial SchedulingRegionID in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };
  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;

  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
  /// sorted SmallVectors of unsigned.
  struct OrdersTypeDenseMapInfo {
4361staticunsigned getHashValue(
constOrdersType &V) {
4370// Analysis and block reference. 4382unsigned MaxVecRegSize;
// This is set by TTI or overridden by cl::opt. 4383unsigned MinVecRegSize;
// Set by cl::opt (default: 128). 4385 /// Instruction builder to construct the vectorized tree. 4388 /// A map of scalar integer values to the smallest bit width with which they 4389 /// can legally be represented. The values map to (width, signed) pairs, 4390 /// where "width" indicates the minimum bit width and "signed" is True if the 4391 /// value must be signed-extended, rather than zero-extended, back to its 4395 /// Final size of the reduced vector, if the current graph represents the 4396 /// input for the reduction and it was possible to narrow the size of the 4398unsigned ReductionBitWidth = 0;
4400 /// Canonical graph size before the transformations. 4401unsigned BaseGraphSize = 1;
4403 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of 4404 /// type sizes, used in the tree. 4405 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4407 /// Indices of the vectorized nodes, which supposed to be the roots of the new 4408 /// bitwidth analysis attempt, like trunc, IToFP or ICmp. 4412}
// end namespace slpvectorizer 4417 /// NodeRef has to be a pointer per the GraphWriter. 4422 /// Add the VectorizableTree to the index iterator to be able to return 4423 /// TreeEntry pointers. 4424structChildIteratorType
4426 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4437return R.VectorizableTree[0].get();
4441return {
N->UserTreeIndices.begin(),
N->Container};
4445return {
N->UserTreeIndices.end(),
N->Container};
4448 /// For the node iterator we just need to turn the TreeEntry iterator into a 4449 /// TreeEntry* iterator so that it dereferences to NodeRef. 4450classnodes_iterator {
4461booloperator!=(
const nodes_iterator &N2)
const{
return N2.It != It; }
4465return nodes_iterator(R->VectorizableTree.begin());
4469return nodes_iterator(R->VectorizableTree.end());
4483OS << Entry->Idx <<
".\n";
4486for (
auto *V : Entry->Scalars) {
4488if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
4489 return EU.Scalar == V;
4499if (Entry->isGather())
4501if (Entry->State == TreeEntry::ScatterVectorize ||
4502 Entry->State == TreeEntry::StridedVectorize)
4508}
// end namespace llvm 4512for (
auto *
I : DeletedInstructions) {
4513if (!
I->getParent()) {
4514// Temporarily insert instruction back to erase them from parent and 4517// Phi nodes must be the very first instructions in the block. 4518I->insertBefore(
F->getEntryBlock(),
4519F->getEntryBlock().getFirstNonPHIIt());
4521I->insertBefore(
F->getEntryBlock().getTerminator()->getIterator());
4524for (
Use &U :
I->operands()) {
4525auto *
Op = dyn_cast<Instruction>(U.get());
4526if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
4530I->dropAllReferences();
4532for (
auto *
I : DeletedInstructions) {
4534"trying to erase instruction with users.");
4535I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  verifyFunction(*F);
#endif
}

/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node. The
/// procedure transforms this mask in accordance with the given \p Mask.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
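
// For example, with Reuses = {0, 1, 2, 3} and Mask = {2, 3, 0, 1} the loop
// above yields Reuses = {2, 3, 0, 1}: the previous value at position I is
// moved to position Mask[I].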
4561/// Reorders the given \p Order according to the given \p Mask. \p Order - is 4562/// the original order of the scalars. Procedure transforms the provided order 4563/// in accordance with the given \p Mask. If the resulting \p Order is just an 4564/// identity order, \p Order is cleared. 4566bool BottomOrder =
false) {
4567assert(!Mask.empty() &&
"Expected non-empty mask.");
4568unsigned Sz = Mask.size();
4573 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
4575 PrevOrder.
swap(Order);
4578for (
unsignedI = 0;
I < Sz; ++
I)
4580 Order[
I] = PrevOrder[Mask[
I]];
4593 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
4603for (
unsignedI = 0;
I < Sz; ++
I)
4605 Order[MaskOrder[
I]] =
I;
4609std::optional<BoUpSLP::OrdersType>
4611assert(TE.isGather() &&
"Expected gather node only.");
4612// Try to find subvector extract/insert patterns and reorder only such 4615Type *ScalarTy = GatheredScalars.
front()->getType();
4616int NumScalars = GatheredScalars.
size();
4621if (NumParts == 0 || NumParts >= NumScalars ||
4622 VecTy->getNumElements() % NumParts != 0 ||
4624 VecTy->getNumElements() / NumParts))
4630 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4632 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4634// No shuffled operands - ignore. 4635if (GatherShuffles.
empty() && ExtractShuffles.
empty())
4637OrdersType CurrentOrder(NumScalars, NumScalars);
4638if (GatherShuffles.
size() == 1 &&
4640 Entries.front().front()->isSame(TE.Scalars)) {
4641// Perfect match in the graph, will reuse the previously vectorized 4643 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
4654// Exclusive broadcast mask - ignore. 4655if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
4656 (Entries.size() != 1 ||
4657 Entries.front().front()->ReorderIndices.empty())) ||
4658 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
4664for (
intI : seq<int>(0, NumParts)) {
4665if (ShuffledSubMasks.
test(
I))
4667constint VF = GetVF(
I);
4672// Shuffle of at least 2 vectors - ignore. 4673if (
any_of(Slice, [&](
intI) {
returnI != NumScalars; })) {
4674 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4675 ShuffledSubMasks.
set(
I);
4678// Try to include as much elements from the mask as possible. 4679int FirstMin = INT_MAX;
4680int SecondVecFound =
false;
4681for (
int K : seq<int>(Limit)) {
4682intIdx = Mask[
I * PartSz + K];
4684Value *V = GatheredScalars[
I * PartSz + K];
4686 SecondVecFound =
true;
4695 SecondVecFound =
true;
4699 FirstMin = (FirstMin / PartSz) * PartSz;
4700// Shuffle of at least 2 vectors - ignore. 4701if (SecondVecFound) {
4702 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4703 ShuffledSubMasks.
set(
I);
4706for (
int K : seq<int>(Limit)) {
4707intIdx = Mask[
I * PartSz + K];
4712 SecondVecFound =
true;
4715if (CurrentOrder[
I * PartSz +
Idx] >
4716static_cast<unsigned>(
I * PartSz + K) &&
4717 CurrentOrder[
I * PartSz +
Idx] !=
4718static_cast<unsigned>(
I * PartSz +
Idx))
4719 CurrentOrder[
I * PartSz +
Idx] =
I * PartSz + K;
4721// Shuffle of at least 2 vectors - ignore. 4722if (SecondVecFound) {
4723 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4724 ShuffledSubMasks.
set(
I);
4730if (!ExtractShuffles.
empty())
4731 TransformMaskToOrder(
4732 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsignedI) {
4733if (!ExtractShuffles[
I])
4736unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
4737for (
unsignedIdx : seq<unsigned>(Sz)) {
4738int K =
I * PartSz +
Idx;
4741if (!TE.ReuseShuffleIndices.empty())
4742 K = TE.ReuseShuffleIndices[K];
4745if (!TE.ReorderIndices.empty())
4746 K = std::distance(TE.ReorderIndices.begin(),
4747find(TE.ReorderIndices, K));
4748auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4751 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4753 .getKnownMinValue());
4757// Check special corner case - single shuffle of the same entry. 4758if (GatherShuffles.
size() == 1 && NumParts != 1) {
4759if (ShuffledSubMasks.
any())
4761 PartSz = NumScalars;
4764if (!Entries.empty())
4765 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsignedI) {
4766if (!GatherShuffles[
I])
4768return std::max(Entries[
I].front()->getVectorFactor(),
4769 Entries[
I].back()->getVectorFactor());
4773if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4775return std::move(CurrentOrder);
}

static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}
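
// Example (illustrative): two single-index GEPs off the same base pointer,
// such as "gep i32, ptr %base, i64 1" and "gep i32, ptr %base, i64 2", are
// compatible; pointers derived from different underlying objects are not.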
/// Calculates minimal alignment as a common alignment.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}

/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  assert(!Order.empty() &&
         "Order is empty. Please check it before using isReverseOrder.");
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}
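
// For example, Order = {3, 2, 1, 0} is a reverse order for Sz == 4; elements
// equal to Sz act as "don't care" slots, so {3, 4, 1, 0} also counts as
// reversed.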
4814/// Checks if the provided list of pointers \p Pointers represents the strided 4815/// pointers for type ElemTy. If they are not, std::nullopt is returned. 4816/// Otherwise, if \p Inst is not specified, just initialized optional value is 4817/// returned to show that the pointers represent strided pointers. If \p Inst 4818/// specified, the runtime stride is materialized before the given \p Inst. 4819/// \returns std::nullopt if the pointers are not pointers with the runtime 4820/// stride, nullptr or actual stride value, otherwise. 4821static std::optional<Value *>
4827constSCEV *PtrSCEVLowest =
nullptr;
4828constSCEV *PtrSCEVHighest =
nullptr;
4829// Find lower/upper pointers from the PointerOps (i.e. with lowest and highest 4836if (!PtrSCEVLowest && !PtrSCEVHighest) {
4837 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4841if (isa<SCEVCouldNotCompute>(Diff))
4844 PtrSCEVLowest = PtrSCEV;
4848if (isa<SCEVCouldNotCompute>(Diff1))
4851 PtrSCEVHighest = PtrSCEV;
4855// Dist = PtrSCEVHighest - PtrSCEVLowest; 4857if (isa<SCEVCouldNotCompute>(Dist))
4859intSize =
DL.getTypeStoreSize(ElemTy);
4860auto TryGetStride = [&](
constSCEV *Dist,
4861constSCEV *Multiplier) ->
constSCEV * {
4862if (
constauto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4863if (M->getOperand(0) == Multiplier)
4864return M->getOperand(1);
4865if (M->getOperand(1) == Multiplier)
4866return M->getOperand(0);
4869if (Multiplier == Dist)
4873// Stride_in_elements = Dist / element_size * (num_elems - 1). 4874constSCEV *Stride =
nullptr;
4875if (
Size != 1 || SCEVs.
size() > 2) {
4877 Stride = TryGetStride(Dist, Sz);
4881if (!Stride || isa<SCEVConstant>(Stride))
4883// Iterate through all pointers and check if all distances are 4884// unique multiple of Stride. 4885usingDistOrdPair = std::pair<int64_t, int>;
4887 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
4889bool IsConsecutive =
true;
4890for (
constSCEV *PtrSCEV : SCEVs) {
4892if (PtrSCEV != PtrSCEVLowest) {
4894constSCEV *Coeff = TryGetStride(Diff, Stride);
4897constauto *SC = dyn_cast<SCEVConstant>(Coeff);
4898if (!SC || isa<SCEVCouldNotCompute>(SC))
4904 Dist = SC->getAPInt().getZExtValue();
4906// If the strides are not the same or repeated, we can't vectorize. 4909auto Res = Offsets.emplace(Dist, Cnt);
4912// Consecutive order if the inserted element is the last one. 4913 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4916if (Offsets.size() != SCEVs.
size())
4918 SortedIndices.
clear();
4919if (!IsConsecutive) {
4920// Fill SortedIndices array only if it is non-consecutive. 4923for (
const std::pair<int64_t, int> &Pair : Offsets) {
4924 SortedIndices[Cnt] = Pair.second;
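      // Illustrative example (sketch): for pointers p, p + 3*S, p + S,
      // p + 2*S with a runtime stride S, the offsets from the lowest pointer
      // are the multiples 0, 3, 1, 2 of S; sorted by multiple, the
      // (multiple, original index) pairs are (0,0), (1,2), (2,3), (3,1), so
      // the loop above fills SortedIndices with {0, 2, 3, 1}.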
4934static std::pair<InstructionCost, InstructionCost>
/// Returns the cost of the shuffle instructions with the given \p Kind, vector
/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for
/// insert subvector pattern.
int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4953 Mask, NumSrcElts, NumSubElts,
Index)) {
4954if (
Index + NumSubElts > NumSrcElts &&
4955Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
/// Correctly creates insert_subvector, checking that the index is a multiple
/// of the subvector's length. Otherwise, generates a shuffle using
/// \p Generator or the default shuffle.
if (
Index % SubVecVF == 0) {
4975// Create shuffle, insertvector requires that index is multiple of 4976// the subvector length. 4979 std::iota(
Mask.begin(),
Mask.end(), 0);
4980for (
unsignedI : seq<unsigned>(SubVecVF))
4983 Vec = Generator(Vec, V, Mask);
4985// 1. Resize V to the size of Vec. 4987 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
/// Correctly creates extract_subvector, checking that the index is a multiple
/// of the subvector's length. Otherwise, generates a shuffle using
/// \p Generator or the default shuffle.
unsigned SubVecVF,
unsignedIndex) {
5000if (
Index % SubVecVF == 0) {
5005// Create shuffle, extract_subvector requires that index is multiple of 5006// the subvector length. 5008 std::iota(Mask.begin(), Mask.end(),
Index);
5016unsigned *BestVF,
bool TryRecursiveCheck)
const{
5017// Check that a vectorized load would load the same memory as a scalar 5018// load. For example, we don't want to vectorize loads that are smaller 5019// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 5020// treats loading/storing it as an i8 struct. If we vectorize loads/stores 5021// from such a struct, we read/write packed bits disagreeing with the 5022// unvectorized version. 5029if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
5032// Make sure all loads in the bundle are simple - we can't vectorize 5033// atomic or volatile loads. 5035constunsigned Sz = VL.
size();
5037auto *POIter = PointerOps.
begin();
5038for (
Value *V : VL) {
5039auto *L = dyn_cast<LoadInst>(V);
5040if (!L || !L->isSimple())
5042 *POIter = L->getPointerOperand();
5047// Check the order of pointer operands or that all pointers are the same. 5051Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5072 Ptr0 = PointerOps.
front();
5073 PtrN = PointerOps.
back();
5075 Ptr0 = PointerOps[Order.
front()];
5076 PtrN = PointerOps[Order.
back()];
5078 std::optional<int> Diff =
5080// Check that the sorted loads are consecutive. 5081if (
static_cast<unsigned>(*Diff) == Sz - 1)
5086// Simple check if not a strided access - clear order. 5087bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5088// Try to generate strided load node if: 5089// 1. Target with strided load support is detected. 5090// 2. The number of loads is greater than MinProfitableStridedLoads, 5091// or the potential stride <= MaxProfitableLoadStride and the 5092// potential stride is power-of-2 (to avoid perf regressions for the very 5093// small number of loads) and max distance > number of loads, or potential 5095// 3. The loads are ordered, or number of unordered loads <= 5096// MaxProfitableUnorderedLoads, or loads are in reversed order. 5097// (this check is to avoid extra costs for very expensive shuffles). 5098// 4. Any pointer operand is an instruction with the users outside of the 5099// current graph (for masked gathers extra extractelement instructions 5100// might be required). 5101auto IsAnyPointerUsedOutGraph =
5102 IsPossibleStrided &&
any_of(PointerOps, [&](
Value *V) {
5103return isa<Instruction>(V) &&
any_of(V->users(), [&](
User *U) {
5104 return !getTreeEntry(U) && !MustGather.contains(U);
5107constunsigned AbsoluteDiff = std::abs(*Diff);
5108if (IsPossibleStrided &&
5109 (IsAnyPointerUsedOutGraph ||
5110 (AbsoluteDiff > Sz &&
5114 *Diff == -(
static_cast<int>(Sz) - 1))) {
5115int Stride = *Diff /
static_cast<int>(Sz - 1);
5116if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
5121// Iterate through all pointers and check if all distances are 5122// unique multiple of Dist. 5130// If the strides are not the same or repeated, we can't 5132if (((Dist / Stride) * Stride) != Dist ||
5133 !Dists.
insert(Dist).second)
5136if (Dists.
size() == Sz)
  // Compare the cost of loads + shuffles rather than strided/masked gather
  // loads. Returns true if the vectorized + shuffles representation is better
  // than just gather.
  auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment,
5147bool ProfitableGatherPointers) {
5150// Compare masked gather cost and loads + insert subvector costs. 5152auto [ScalarGEPCost, VectorGEPCost] =
5154 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
5155// Estimate the cost of masked gather GEP. If not a splat, roughly 5156// estimate as a buildvector, otherwise estimate as splat. 5160 VecTy->getNumElements());
5162 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.
size() - 1 ||
5168 PtrVecTy, DemandedElts,
/*Insert=*/true,
/*Extract=*/false,
CostKind);
5173/*Insert=*/true,
/*Extract=*/false,
CostKind) +
5175// The cost of scalar loads. 5183// The cost of masked gather. 5187/*VariableMask=*/false, CommonAlignment,
CostKind) +
5188 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5193// The list of loads is small or perform partial check already - directly 5194// compare masked gather cost and gather cost. 5195constexprunsigned ListLimit = 4;
5196if (!TryRecursiveCheck || VL.
size() < ListLimit)
5199// FIXME: The following code has not been updated for non-power-of-2 5200// vectors (and not whole registers). The splitting logic here does not 5201// cover the original vector if the vector factor is not a power of two. 5205unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
5208// Iterate through possible vectorization factors and check if vectorized + 5209// shuffles is better than just gather. 5215for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End; Cnt += VF) {
5221/*TryRecursiveCheck=*/false);
5222// Check that the sorted loads are consecutive. 5228 DemandedElts.
setBits(Cnt, Cnt + VF);
        // If the reorder is needed - consider as a high-cost masked gather
        // for now.
        // All loads gathered - try smaller VF.
        // Can be vectorized later as a series of loads/insertelements.
        if (!DemandedElts.
isZero()) {
5248for (
unsignedIdx : seq<unsigned>(VL.
size()))
5249if (DemandedElts[
Idx])
5256auto *LI0 = cast<LoadInst>(VL[
I * VF]);
5261 LI0->getPointerOperand(),
5262 Instruction::GetElementPtr,
CostKind, ScalarTy,
5266if (
static_cast<unsigned>(
5267count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5268 PointerOps.
size() - 1 ||
5275/*Insert=*/true,
/*Extract=*/false,
CostKind);
5280/*Insert=*/true,
/*Extract=*/false,
CostKind) +
5288 LI0->getPointerAddressSpace(),
CostKind,
5294 LI0->getPointerOperand(),
5295/*VariableMask=*/false,
5301 LI0->getPointerOperand(),
5302/*VariableMask=*/false,
5307// Gathers are already calculated - ignore. 5311for (
intIdx : seq<int>(0, VL.
size()))
5318// If masked gather cost is higher - better to vectorize, so 5319// consider it as a gather node. It will be better estimated 5321if (MaskedGatherCost >= VecLdCost &&
5330// TODO: need to improve analysis of the pointers, if not all of them are 5331// GEPs or have > 2 operands, we end up with a gather node, which just 5332// increases the cost. 5334bool ProfitableGatherPointers =
5335 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
5336return L->isLoopInvariant(V);
5338if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
5339auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
5341 (
GEP &&
GEP->getNumOperands() == 2 &&
5342 isa<Constant, Instruction>(
GEP->getOperand(1)));
5344// Check if potential masked gather can be represented as series 5345// of loads + insertsubvectors. 5346// If masked gather cost is higher - better to vectorize, so 5347// consider it as a gather node. It will be better estimated 5349if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5350 ProfitableGatherPointers))
5363"Expected list of pointer operands.");
5364// Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each 5365// Ptr into, sort and return the sorted indices with values next to one 5373 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5375 SortedIndices.
clear();
5377auto Key = std::make_pair(BBs[Cnt + 1],
5381 std::optional<int> Diff = getPointersDiff(
5382 ElemTy, std::get<0>(Base.front()), ElemTy,
5384/*StrictCheck=*/true);
5388 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5393// If we haven't found enough to usefully cluster, return early. 5394if (Bases.
size() > VL.
size() / 2 - 1)
5397// Not found already - add a new Base 5398 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5405if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5406 Bases.
front().second.size() == VL.
size()))
5409// For each of the bases sort the pointers by Offset and check if any of the 5410// base become consecutively allocated. 5411auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5420 FirstPointers.
insert(P1);
5421 SecondPointers.
insert(P2);
5427"Unable to find matching root.");
5430for (
auto &
Base : Bases) {
5431for (
auto &Vec :
Base.second) {
5432if (Vec.size() > 1) {
5433stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5434const std::tuple<Value *, int, unsigned> &
Y) {
5435return std::get<1>(
X) < std::get<1>(
Y);
5437int InitialOffset = std::get<1>(Vec[0]);
5438bool AnyConsecutive =
5440return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5442// Fill SortedIndices array only if it looks worth-while to sort the 5449 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5453for (
auto &
T : Bases)
5454for (
constauto &Vec :
T.second)
5455for (
constauto &
P : Vec)
5459"Expected SortedIndices to be the size of VL");
5463std::optional<BoUpSLP::OrdersType>
5465assert(TE.isGather() &&
"Expected gather node only.");
5466Type *ScalarTy = TE.Scalars[0]->getType();
5469 Ptrs.
reserve(TE.Scalars.size());
5471 BBs.
reserve(TE.Scalars.size());
5472for (
Value *V : TE.Scalars) {
5473auto *L = dyn_cast<LoadInst>(V);
5474if (!L || !L->isSimple())
5481if (!LoadEntriesToVectorize.
contains(TE.Idx) &&
5483return std::move(Order);
5487/// Check if two insertelement instructions are from the same buildvector. 5491// Instructions must be from the same basic blocks. 5494// Checks if 2 insertelements are from the same buildvector. 5495if (VU->
getType() != V->getType())
5497// Multiple used inserts are separate nodes. 5504if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5506// Go through the vector operand of insertelement instructions trying to find 5507// either VU as the original vector for IE2 or V as the original vector for 5510 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
5511bool IsReusedIdx =
false;
5513if (IE2 == VU && !IE1)
5515if (IE1 == V && !IE2)
5516return V->hasOneUse();
5517if (IE1 && IE1 != V) {
5519 IsReusedIdx |= ReusedIdx.
test(Idx1);
5520 ReusedIdx.
set(Idx1);
5521if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
5524 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5526if (IE2 && IE2 != VU) {
5528 IsReusedIdx |= ReusedIdx.
test(Idx2);
5529 ReusedIdx.
set(Idx2);
5530if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5533 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5535 }
while (!IsReusedIdx && (IE1 || IE2));
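// Illustrative example:
//   %v0 = insertelement <4 x float> poison, float %a, i64 0
//   %v1 = insertelement <4 x float> %v0, float %b, i64 1
// The walk above recognizes %v0 and %v1 as parts of the same buildvector
// sequence; inserting into an already used index sets IsReusedIdx and the
// pair is treated as separate buildvectors.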
5539std::optional<BoUpSLP::OrdersType>
5541// No need to reorder if need to shuffle reuses, still need to shuffle the 5543if (!TE.ReuseShuffleIndices.empty()) {
5544// FIXME: Support ReuseShuffleIndices for non-power-of-two vectors. 5545assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI) &&
5546"Reshuffling scalars not yet supported for nodes with padding");
5550// Check if reuse shuffle indices can be improved by reordering. 5551// For this, check that reuse mask is "clustered", i.e. each scalar values 5552// is used once in each submask of size <number_of_scalars>. 5553// Example: 4 scalar values. 5554// ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered. 5555// 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because 5556// element 3 is used twice in the second submask. 5557unsigned Sz = TE.Scalars.size();
5559if (std::optional<OrdersType> CurrentOrder =
5565OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5566unsigned Sz = TE.Scalars.size();
5567for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
5570 Res[
Idx + K * Sz] =
I + K * Sz;
5572return std::move(Res);
5575if (Sz == 2 && TE.getVectorFactor() == 4 &&
5577 2 * TE.getVectorFactor())) == 1)
5582if (TE.ReorderIndices.empty())
5583 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5586::addMask(ReorderMask, TE.ReuseShuffleIndices);
5587unsigned VF = ReorderMask.
size();
5591for (
unsignedI = 0;
I < VF;
I += Sz) {
5593unsigned UndefCnt = 0;
5594unsigned Limit = std::min(Sz, VF -
I);
5603 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
5607for (
unsigned K = 0; K < NumParts; ++K) {
5608unsignedIdx = Val + Sz * K;
5610 ResOrder[
Idx] =
I + K;
5613return std::move(ResOrder);
5615unsigned VF = TE.getVectorFactor();
5616// Try build correct order for extractelement instructions. 5618 TE.ReuseShuffleIndices.end());
5619if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
5621 if (isa<PoisonValue>(V))
5623 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5624 return Idx && *Idx < Sz;
5626assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported " 5627"by BinaryOperator and CastInst.");
5629if (TE.ReorderIndices.empty())
5630 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5633for (
unsignedI = 0;
I < VF; ++
I) {
5634int &
Idx = ReusedMask[
I];
5637Value *V = TE.Scalars[ReorderMask[
Idx]];
5639Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
5642// Build the order of the VF size, need to reorder reuses shuffles, they are 5643// always of VF size. 5645 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
5646auto *It = ResOrder.
begin();
5647for (
unsigned K = 0; K < VF; K += Sz) {
5651 std::iota(SubMask.begin(), SubMask.end(), 0);
5653transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
5654 std::advance(It, Sz);
5659return std::nullopt;
// No need to reorder. 5660return std::move(ResOrder);
5662if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5663any_of(TE.UserTreeIndices,
5665 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5669if ((TE.State == TreeEntry::Vectorize ||
5670 TE.State == TreeEntry::StridedVectorize) &&
5671 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5672 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5673assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported by " 5674"BinaryOperator and CastInst.");
5675return TE.ReorderIndices;
5677if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5678if (!TE.ReorderIndices.empty())
5679return TE.ReorderIndices;
5682for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
5683if (!V->hasNUsesOrMore(1))
5685auto *
II = dyn_cast<InsertElementInst>(*V->user_begin());
5690while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
5692II = dyn_cast<InsertElementInst>(
II->getOperand(0));
5698assert(BB1 != BB2 &&
"Expected different basic blocks.");
5699auto *NodeA = DT->
getNode(BB1);
5700auto *NodeB = DT->
getNode(BB2);
5701assert(NodeA &&
"Should only process reachable instructions");
5702assert(NodeB &&
"Should only process reachable instructions");
5703assert((NodeA == NodeB) ==
5704 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5705"Different nodes should have different DFS numbers");
5706return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5708auto PHICompare = [&](
unsigned I1,
unsigned I2) {
5709Value *V1 = TE.Scalars[I1];
5710Value *V2 = TE.Scalars[I2];
5711if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
5713if (isa<PoisonValue>(V1))
5715if (isa<PoisonValue>(V2))
5721auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
5722auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5723if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5724return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5725 FirstUserOfPhi2->getParent());
5726auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5727auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5728auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5729auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5735if (UserBVHead[I1] && !UserBVHead[I2])
5739if (UserBVHead[I1] == UserBVHead[I2])
5742return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
5744return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5751auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5752auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5753auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5754auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5757if (EE1->getOperand(0) == EE2->getOperand(0))
5761if (Inst1 && Inst2) {
5769"Expected either instructions or arguments vector operands.");
5770return P1->getArgNo() < P2->getArgNo();
5775 std::iota(Phis.
begin(), Phis.
end(), 0);
5778return std::nullopt;
// No need to reorder. 5779return std::move(Phis);
5781if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
5783// TODO: add analysis of other gather nodes with extractelement 5784// instructions and other values/instructions, not only undefs. 5785if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
5786 (
all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5787any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5789 auto *EE = dyn_cast<ExtractElementInst>(V);
5790 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5792// Check that gather of extractelements can be represented as 5793// just a shuffle of a single vector. 5796 canReuseExtract(TE.Scalars, CurrentOrder,
/*ResizeAllowed=*/true);
5797if (Reuse || !CurrentOrder.
empty())
5798return std::move(CurrentOrder);
5800// If the gather node is <undef, v, .., poison> and 5801// insertelement poison, v, 0 [+ permute] 5803// insertelement poison, v, n - try to reorder. 5804// If rotating the whole graph, exclude the permute cost, the whole graph 5805// might be transformed. 5806int Sz = TE.Scalars.size();
5808count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5810find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
5811if (It == TE.Scalars.begin())
5814if (It != TE.Scalars.end()) {
5816unsignedIdx = std::distance(TE.Scalars.begin(), It);
5831if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5834return std::move(Order);
5840if (TE.Scalars.size() >= 3)
5843// Check if can include the order of vectorized loads. For masked gathers do 5844// extra analysis later, so include such nodes into a special list. 5845if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
5849 CurrentOrder, PointerOps);
5851return std::move(CurrentOrder);
  // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
  // has been audited for correctness with non-power-of-two vectors.

/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz, which are not identity submasks.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (
unsignedI = Sz,
E = Mask.size();
I <
E;
I += Sz) {
5871if (Cluster != FirstCluster)
5877void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const{
5878// Reorder reuses mask. 5880constunsigned Sz =
TE.Scalars.size();
  // For vectorized and non-clustered reuses no need to do anything else.
  if (!
TE.isGather() ||
5890// Clear reorder since it is going to be applied to the new mask. 5891TE.ReorderIndices.clear();
5892// Try to improve gathered nodes with clustered reuses, if possible. 5897// Fill the reuses mask with the identity submasks. 5898for (
auto *It =
TE.ReuseShuffleIndices.begin(),
5899 *
End =
TE.ReuseShuffleIndices.end();
5900 It !=
End; std::advance(It, Sz))
5901 std::iota(It, std::next(It, Sz), 0);
5907"Expected same size of orders");
5908unsigned Sz = Order.
size();
5910for (
unsignedIdx : seq<unsigned>(0, Sz)) {
5911if (Order[
Idx] != Sz)
5912 UsedIndices.
set(Order[
Idx]);
5914if (SecondaryOrder.
empty()) {
5915for (
unsignedIdx : seq<unsigned>(0, Sz))
5916if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5919for (
unsignedIdx : seq<unsigned>(0, Sz))
5920if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5921 !UsedIndices.
test(SecondaryOrder[
Idx]))
5922 Order[
Idx] = SecondaryOrder[
Idx];
5927// Maps VF to the graph nodes. 5929// ExtractElement gather nodes which can be vectorized and need to handle 5933// Phi nodes can have preferred ordering based on their result users 5936// AltShuffles can also have a preferred ordering that leads to fewer 5937// instructions, e.g., the addsub instruction in x86. 5940// Maps a TreeEntry to the reorder indices of external users. 5942 ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores, loads, extracts + some gathering
  // of extracts.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
5948// Look for external users that will probably be vectorized. 5950 findExternalStoreUsersReorderIndices(TE.get());
5951if (!ExternalUserReorderIndices.
empty()) {
5952 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5954 std::move(ExternalUserReorderIndices));
5957// Patterns like [fadd,fsub] can be combined into a single instruction in 5958// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need 5959// to take into account their order when looking for the most used order. 5960if (TE->hasState() && TE->isAltShuffle()) {
5963unsigned Opcode0 = TE->getOpcode();
5964unsigned Opcode1 = TE->getAltOpcode();
5966// If this pattern is supported by the target then we consider the order. 5967if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5968 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5971// TODO: Check the reverse order too. 5974if (std::optional<OrdersType> CurrentOrder =
5976// Do not include ordering for nodes used in the alt opcode vectorization, 5977// better to reorder them during bottom-to-top stage. If follow the order 5978// here, it causes reordering of the whole graph though actually it is 5979// profitable just to reorder the subgraph that starts from the alternate 5980// opcode vectorization node. Such nodes already end-up with the shuffle 5981// instruction and it is just enough to change this shuffle rather than 5982// rotate the scalars for the whole graph. 5984const TreeEntry *UserTE = TE.get();
5986if (UserTE->UserTreeIndices.size() != 1)
5989 return EI.UserTE->State == TreeEntry::Vectorize &&
5990 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5993 UserTE = UserTE->UserTreeIndices.back().UserTE;
5996 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5997if (!(TE->State == TreeEntry::Vectorize ||
5998 TE->State == TreeEntry::StridedVectorize) ||
5999 !TE->ReuseShuffleIndices.empty())
6000 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
6001if (TE->State == TreeEntry::Vectorize &&
6002 TE->getOpcode() == Instruction::PHI)
6003 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
6007// Reorder the graph nodes according to their vectorization factor. 6008for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
6009 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
6010auto It = VFToOrderedEntries.
find(VF);
6011if (It == VFToOrderedEntries.
end())
6013// Try to find the most profitable order. We just are looking for the most 6014// used order and reorder scalar elements in the nodes according to this 6015// mostly used order. 6017// Delete VF entry upon exit. 6020// All operands are reordered and used only in this node - propagate the 6021// most used order to the user node. 6026for (
const TreeEntry *OpTE : OrderedEntries) {
6027// No need to reorder this nodes, still need to extend and to use shuffle, 6028// just need to merge reordering shuffle and the reuse shuffle. 6029if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6031// Count number of orders uses. 6032constauto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6034if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6035auto It = GathersToOrders.find(OpTE);
6036if (It != GathersToOrders.end())
6039if (OpTE->hasState() && OpTE->isAltShuffle()) {
6040auto It = AltShufflesToOrders.find(OpTE);
6041if (It != AltShufflesToOrders.end())
6044if (OpTE->State == TreeEntry::Vectorize &&
6045 OpTE->getOpcode() == Instruction::PHI) {
6046auto It = PhisToOrders.
find(OpTE);
6047if (It != PhisToOrders.
end())
6050return OpTE->ReorderIndices;
6052// First consider the order of the external scalar users. 6053auto It = ExternalUserReorderMap.
find(OpTE);
6054if (It != ExternalUserReorderMap.
end()) {
6055constauto &ExternalUserReorderIndices = It->second;
6056// If the OpTE vector factor != number of scalars - use natural order, 6057// it is an attempt to reorder node with reused scalars but with 6059if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6060 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
6061 ExternalUserReorderIndices.size();
6063for (
constOrdersType &ExtOrder : ExternalUserReorderIndices)
6064 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6066// No other useful reorder data in this entry. 6070// Stores actually store the mask, not the order, need to invert. 6071if (OpTE->State == TreeEntry::Vectorize &&
6072 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6073assert(!OpTE->isAltShuffle() &&
6074"Alternate instructions are only supported by BinaryOperator " 6078unsignedE = Order.size();
6081 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6084 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6086 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6089if (OrdersUses.empty())
6091// Choose the most used order. 6092unsigned IdentityCnt = 0;
6093unsigned FilledIdentityCnt = 0;
6095for (
auto &Pair : OrdersUses) {
6097if (!Pair.first.empty())
6098 FilledIdentityCnt += Pair.second;
6099 IdentityCnt += Pair.second;
6104unsigned Cnt = IdentityCnt;
6105for (
auto &Pair : OrdersUses) {
6106// Prefer identity order. But, if filled identity found (non-empty order) 6107// with same number of uses, as the new candidate order, we can choose 6108// this candidate order. 6109if (Cnt < Pair.second ||
6110 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6111 Cnt == Pair.second && !BestOrder.
empty() &&
6114 BestOrder = Pair.first;
6120// Set order of the user node. 6127unsignedE = BestOrder.
size();
6129 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6131// Do an actual reordering, if profitable. 6132for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6133// Just do the reordering for the nodes with the given VF. 6134if (TE->Scalars.size() != VF) {
6135if (TE->ReuseShuffleIndices.size() == VF) {
6136// Need to reorder the reuses masks of the operands with smaller VF to 6137// be able to find the match between the graph nodes and scalar 6138// operands of the given node during vectorization/cost estimation. 6141 return EI.UserTE->Scalars.size() == VF ||
6142 EI.UserTE->Scalars.size() ==
6145"All users must be of VF size.");
6148// ShuffleVectorInst does not do reorderOperands (and it should not 6149// because ShuffleVectorInst supports only a limited set of 6150// patterns). Only do reorderNodeWithReuses if all of the users are 6151// not ShuffleVectorInst. 6153 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6158 return isa<ShuffleVectorInst>(
6159 EI.UserTE->getMainOp());
6161"Does not know how to reorder.");
6163// Update ordering of the operands with the smaller VF than the given 6165 reorderNodeWithReuses(*TE, Mask);
6169if ((TE->State == TreeEntry::Vectorize ||
6170 TE->State == TreeEntry::StridedVectorize) &&
6173 (
SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6174assert(!TE->isAltShuffle() &&
6175"Alternate instructions are only supported by BinaryOperator " 6177// Build correct orders for extract{element,value}, loads and 6180if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6181 TE->reorderOperands(Mask);
6183// Reorder the node and its operands. 6184 TE->reorderOperands(Mask);
6185assert(TE->ReorderIndices.empty() &&
6186"Expected empty reorder sequence.");
6189if (!TE->ReuseShuffleIndices.empty()) {
6190// Apply reversed order to keep the original ordering of the reused 6191// elements to avoid extra reorder indices shuffling. 6196addMask(NewReuses, TE->ReuseShuffleIndices);
6197 TE->ReuseShuffleIndices.swap(NewReuses);
6203bool BoUpSLP::canReorderOperands(
6204 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6207for (
unsignedI = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
6208if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
6209return OpData.first ==
I &&
6210 (OpData.second->State == TreeEntry::Vectorize ||
6211 OpData.second->State == TreeEntry::StridedVectorize);
6214if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
6215// Do not reorder if operand node is used by many user nodes. 6216if (
any_of(TE->UserTreeIndices,
6217 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6219// Add the node to the list of the ordered nodes with the identity 6221 Edges.emplace_back(
I, TE);
6222// Add ScatterVectorize nodes to the list of operands, where just 6223// reordering of the scalars is required. Similar to the gathers, so 6224// simply add to the list of gathered ops. 6225// If there are reused scalars, process this node as a regular vectorize 6226// node, just reorder reuses mask. 6227if (TE->State != TreeEntry::Vectorize &&
6228 TE->State != TreeEntry::StridedVectorize &&
6229 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6233 TreeEntry *
Gather =
nullptr;
6235 [&
Gather, UserTE,
I](TreeEntry *TE) {
6236assert(TE->State != TreeEntry::Vectorize &&
6237 TE->State != TreeEntry::StridedVectorize &&
6238"Only non-vectorized nodes are expected.");
6239if (
any_of(TE->UserTreeIndices,
6240 [UserTE,
I](
const EdgeInfo &EI) {
6241 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6243assert(TE->isSame(UserTE->getOperand(
I)) &&
6244"Operand entry does not match operands.");
  // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads, extracts without alternate operands
  // + some gathering of extracts.
  for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6266if (TE->State != TreeEntry::Vectorize &&
6267 TE->State != TreeEntry::StridedVectorize)
6269if (std::optional<OrdersType> CurrentOrder =
6271 OrderedEntries.
insert(TE.get());
6272if (!(TE->State == TreeEntry::Vectorize ||
6273 TE->State == TreeEntry::StridedVectorize) ||
6274 !TE->ReuseShuffleIndices.empty())
6275 GathersToOrders.
insert(TE.get());
6279// 1. Propagate order to the graph nodes, which use only reordered nodes. 6280// I.e., if the node has operands, that are reordered, try to make at least 6281// one operand order in the natural order and reorder others + reorder the 6284while (!OrderedEntries.
empty()) {
6285// 1. Filter out only reordered nodes. 6286// 2. If the entry has multiple uses - skip it and jump to the next node. 6289for (TreeEntry *TE : OrderedEntries) {
6290if (!(TE->State == TreeEntry::Vectorize ||
6291 TE->State == TreeEntry::StridedVectorize ||
6292 (TE->isGather() && GathersToOrders.
contains(TE))) ||
6293 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6296 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6298 !Visited.
insert(TE).second) {
6302// Build a map between user nodes and their operands order to speedup 6303// search. The graph currently does not provide this dependency directly. 6304for (
EdgeInfo &EI : TE->UserTreeIndices)
6307// Erase filtered entries. 6308for (TreeEntry *TE : Filtered)
6309 OrderedEntries.remove(TE);
6311 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6313sort(UsersVec, [](
constauto &Data1,
constauto &Data2) {
6314return Data1.first->Idx > Data2.first->Idx;
6316for (
auto &
Data : UsersVec) {
6317// Check that operands are used only in the User node. 6319if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
6321for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6322 OrderedEntries.remove(
Op.second);
6325// All operands are reordered and used only in this node - propagate the 6326// most used order to the user node. 6330// Do the analysis for each tree entry only once, otherwise the order of 6331// the same node my be considered several times, though might be not 6335for (
constauto &
Op :
Data.second) {
6336 TreeEntry *OpTE =
Op.second;
6337if (!VisitedOps.
insert(OpTE).second)
6339if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6342if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6345return OpTE->ReorderIndices;
6347// The order is partially ordered, skip it in favor of fully non-ordered 6349if (Order.size() == 1)
6352Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
6353 return P.second == OpTE;
6355// Stores actually store the mask, not the order, need to invert. 6356if (OpTE->State == TreeEntry::Vectorize &&
6357 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6358assert(!OpTE->isAltShuffle() &&
6359"Alternate instructions are only supported by BinaryOperator " 6363unsignedE = Order.size();
6366 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6369 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6372 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6374auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
6375constauto AllowsReordering = [&](
const TreeEntry *TE) {
6376if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6377 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6378 (IgnoreReorder && TE->Idx == 0))
6380if (TE->isGather()) {
6389for (
constEdgeInfo &EI : OpTE->UserTreeIndices) {
6390 TreeEntry *UserTE = EI.
UserTE;
6391if (!VisitedUsers.
insert(UserTE).second)
6393// May reorder user node if it requires reordering, has reused 6394// scalars, is an alternate op vectorize node or its op nodes require 6396if (AllowsReordering(UserTE))
6398// Check if users allow reordering. 6399// Currently look up just 1 level of operands to avoid increase of 6401// Profitable to reorder if definitely more operands allow 6402// reordering rather than those with natural order. 6405 Ops, [UserTE, &AllowsReordering](
6406const std::pair<unsigned, TreeEntry *> &
Op) {
6407return AllowsReordering(
Op.second) &&
6410 return EI.UserTE == UserTE;
6412 })) <= Ops.
size() / 2)
6413 ++Res.first->second;
6416if (OrdersUses.empty()) {
6417for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6418 OrderedEntries.remove(
Op.second);
6421// Choose the most used order. 6422unsigned IdentityCnt = 0;
6423unsigned VF =
Data.second.front().second->getVectorFactor();
6425for (
auto &Pair : OrdersUses) {
6427 IdentityCnt += Pair.second;
6432unsigned Cnt = IdentityCnt;
6433for (
auto &Pair : OrdersUses) {
6434// Prefer identity order. But, if filled identity found (non-empty 6435// order) with same number of uses, as the new candidate order, we can 6436// choose this candidate order. 6437if (Cnt < Pair.second) {
6439 BestOrder = Pair.first;
6445// Set order of the user node. 6447for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6448 OrderedEntries.remove(
Op.second);
6452// Erase operands from OrderedEntries list and adjust their orders. 6457unsignedE = BestOrder.
size();
6459 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6461for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
6462 TreeEntry *TE =
Op.second;
6463 OrderedEntries.remove(TE);
6464if (!VisitedOps.
insert(TE).second)
6466if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
6467 reorderNodeWithReuses(*TE, Mask);
6470// Gathers are processed separately. 6471if (TE->State != TreeEntry::Vectorize &&
6472 TE->State != TreeEntry::StridedVectorize &&
6473 (TE->State != TreeEntry::ScatterVectorize ||
6474 TE->ReorderIndices.empty()))
6476assert((BestOrder.
size() == TE->ReorderIndices.size() ||
6477 TE->ReorderIndices.empty()) &&
6478"Non-matching sizes of user/operand entries.");
6480if (IgnoreReorder && TE == VectorizableTree.front().get())
6481 IgnoreReorder =
false;
6483// For gathers just need to reorder its scalars. 6484for (TreeEntry *
Gather : GatherOps) {
6486"Unexpected reordering of gathers.");
6487if (!
Gather->ReuseShuffleIndices.empty()) {
6488// Just reorder reuses indices. 6493 OrderedEntries.remove(
Gather);
6495// Reorder operands of the user node and set the ordering for the user 6497if (
Data.first->State != TreeEntry::Vectorize ||
6498 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6499Data.first->getMainOp()) ||
6500Data.first->isAltShuffle())
6501Data.first->reorderOperands(Mask);
6502if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
6503Data.first->isAltShuffle() ||
6504Data.first->State == TreeEntry::StridedVectorize) {
6507/*BottomOrder=*/true);
6508if (
Data.first->ReuseShuffleIndices.empty() &&
6509 !
Data.first->ReorderIndices.empty() &&
6510 !
Data.first->isAltShuffle()) {
6511// Insert user node to the list to try to sink reordering deeper in 6513 OrderedEntries.insert(
Data.first);
6520// If the reordering is unnecessary, just remove the reorder. 6521if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6522 VectorizableTree.front()->ReuseShuffleIndices.empty())
6523 VectorizableTree.front()->ReorderIndices.clear();
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
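// Note: with a reversed strided order such as {3, 2, 1, 0},
// ReorderIndices.front() names the lane the reordering places first, so the
// entry's root instruction is taken from that lane rather than from
// Scalars.front().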
6538// Collect the values that we need to extract from the tree. 6539for (
auto &TEPtr : VectorizableTree) {
6540 TreeEntry *Entry = TEPtr.get();
6542// No need to handle users of gathered values. 6543if (Entry->isGather())
6547for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6548Value *Scalar = Entry->Scalars[Lane];
6549if (!isa<Instruction>(Scalar))
6551// All uses must be replaced already? No need to do it again. 6552auto It = ScalarToExtUses.
find(Scalar);
6553if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
6556// Check if the scalar is externally used as an extra arg. 6557constauto ExtI = ExternallyUsedValues.
find(Scalar);
6558if (ExtI != ExternallyUsedValues.
end()) {
6559int FoundLane = Entry->findLaneForValue(Scalar);
6561 << FoundLane <<
" from " << *Scalar <<
".\n");
6562 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
6563 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
6566for (
User *U : Scalar->users()) {
6573// Ignore users in the user ignore list. 6574if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6577// Skip in-tree scalars that become vectors 6578if (TreeEntry *UseEntry = getTreeEntry(U)) {
6579// Some in-tree scalars will remain as scalar in vectorized 6580// instructions. If that is the case, the one in FoundLane will 6582if (UseEntry->State == TreeEntry::ScatterVectorize ||
6584 Scalar, getRootEntryInstruction(*UseEntry), TLI,
TTI)) {
6587assert(!UseEntry->isGather() &&
"Bad state");
6591if (It != ScalarToExtUses.
end()) {
6592 ExternalUses[It->second].User =
nullptr;
6597if (U && Scalar->hasNUsesOrMore(
UsesLimit))
6599int FoundLane = Entry->findLaneForValue(Scalar);
6601 <<
" from lane " << FoundLane <<
" from " << *Scalar
6603 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
6604 ExternalUses.emplace_back(Scalar, U, FoundLane);
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    if (V->hasNUsesOrMore(UsesLimit))
      continue;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // Test whether we can handle the store. V might be a global, which could
      // be used in a different function.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entry if already vectorized.
      if (getTreeEntry(U))
        continue;

      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(), Ptr}];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap)
    Res[I++].swap(P.second);
  return Res;
}
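// The result is one candidate store group per (basic block, stored type,
// underlying pointer object) key, with at most one store contributed per lane
// of the tree entry; these groups feed findExternalStoreUsersReorderIndices()
// below.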
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoreVec can form a vector by sorting them
  // and checking whether they are consecutive.

  // To avoid calling getPointersDiff() while sorting we create a vector of
  // pairs {store, offset from first} and sort this instead.
  SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
  StoreInst *S0 = StoresVec.front();
  StoreOffsetVec.emplace_back(0, 0);
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0->getValueOperand()->getType(),
                        S0->getPointerOperand(),
                        SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    if (Diff)
      StoreOffsetVec.emplace_back(*Diff, Idx);
  }

  // Check if the stores are consecutive by checking if their difference is 1.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec, [](const std::pair<int, unsigned> &L,
                          const std::pair<int, unsigned> &R) {
    return L.first < R.first;
  });
  unsigned Idx = 0;
  int PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }

  // Calculate the shuffle indices according to their offset against the sorted
  // StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  unsigned I = 0;
  for (const auto &P : StoreOffsetVec) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
    ++I;
  }
  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(), so we are following the
  // same convention here.
  if (IsIdentity)
    ReorderIndices.clear();

  return true;
}
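// Note: the offsets of a consecutive group are N consecutive integers (they
// may be negative when StoresVec[0] is not the lowest-addressed store), so
// ReorderIndices[Lane] is the rank of that lane's store by address. E.g. lane
// offsets {1, -1, 2, 0} give ReorderIndices = {2, 0, 3, 1}, while already
// sorted stores give the identity, encoded as the empty order.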
  for (unsigned Idx : Order)
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and push
  // it into `ExternalReorderIndices`
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;

    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;

    // We now know that the scalars in StoresVec can form a vector instruction,
    // so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
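// Each returned order describes one external store group that could consume
// this entry as a whole vector; identity orders are encoded as empty, matching
// the convention used by the in-tree reordering (reorderTopToBottom() /
// reorderBottomToTop()).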
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
6775/// Tries to find subvector of loads and builds new vector of only loads if can 6789for (
Value *V : VL) {
6790auto *LI = dyn_cast<LoadInst>(V);
6793if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6796for (
auto [Map,
Data] :
zip(ClusteredDistToLoad, ClusteredLoads)) {
6797assert(LI->getParent() ==
Data.front().first->getParent() &&
6798 LI->getType() ==
Data.front().first->getType() &&
6802"Expected loads with the same type, same parent and same " 6803"underlying pointer.");
6805 LI->getType(), LI->getPointerOperand(),
Data.front().first->getType(),
6806Data.front().first->getPointerOperand(),
DL, SE,
6807/*StrictCheck=*/true);
6810auto It = Map.find(*Dist);
6811if (It != Map.end() && It->second != LI)
6813if (It == Map.end()) {
6814Data.emplace_back(LI, *Dist);
6815 Map.try_emplace(*Dist, LI);
6825auto FindMatchingLoads =
6830int &
Offset,
unsigned &Start) {
6832return GatheredLoads.
end();
6842 std::optional<int> Dist =
6844Data.front().first->getType(),
6845Data.front().first->getPointerOperand(),
DL, SE,
6846/*StrictCheck=*/true);
6851for (std::pair<LoadInst *, int>
P :
Data) {
6855// Found matching gathered loads - check if all loads are unique or 6856// can be effectively vectorized. 6857unsigned NumUniques = 0;
6858for (
auto [Cnt, Pair] :
enumerate(Loads)) {
6859bool Used = DataLoads.
contains(Pair.first);
6860if (!Used && !DataDists.
contains(*Dist + Pair.second)) {
6864 Repeated.insert(Cnt);
6867if (NumUniques > 0 &&
6868 (Loads.
size() == NumUniques ||
6869 (Loads.
size() - NumUniques >= 2 &&
6870 Loads.
size() - NumUniques >= Loads.
size() / 2 &&
6876return std::next(GatheredLoads.
begin(),
Idx);
6880return GatheredLoads.
end();
6882for (
ArrayRef<std::pair<LoadInst *, int>>
Data : ClusteredLoads) {
6886auto *It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
6888while (It != GatheredLoads.
end()) {
6889assert(!LocalToAdd.
empty() &&
"Expected some elements to add.");
6890for (
unsignedIdx : LocalToAdd)
6892 ToAdd.
insert(LocalToAdd.begin(), LocalToAdd.end());
6893 It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
Offset,
6897 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6901for (
unsignedIdx : seq<unsigned>(
Data.size())) {
6910 GatheredLoads, [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6911return PD.front().first->getParent() == LI->
getParent() &&
6912 PD.front().first->getType() == LI->
getType();
6914while (It != GatheredLoads.
end()) {
6917 std::next(It), GatheredLoads.
end(),
6918 [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6919 return PD.front().first->getParent() == LI->getParent() &&
6920 PD.front().first->getType() == LI->getType();
6924 GatheredLoads.emplace_back().append(
Data.begin(),
Data.end());
6925 AddNewLoads(GatheredLoads.emplace_back());
6930void BoUpSLP::tryToVectorizeGatheredLoads(
6933 8> &GatheredLoads) {
6934 GatheredLoadsEntriesFirst = VectorizableTree.size();
6937 LoadEntriesToVectorize.
size());
6938for (
auto [
Idx, Set] :
zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6939Set.insert(VectorizableTree[
Idx]->Scalars.begin(),
6940 VectorizableTree[
Idx]->Scalars.end());
6942// Sort loads by distance. 6943auto LoadSorter = [](
const std::pair<LoadInst *, int> &L1,
6944const std::pair<LoadInst *, int> &L2) {
6945return L1.second > L2.second;
6951Align Alignment = computeCommonAlignment<LoadInst>(Values);
6960bool Final,
unsigned MaxVF) {
6962unsigned StartIdx = 0;
6967 *
TTI, Loads.
front()->getType(), MaxVF);
6969 *
TTI, Loads.
front()->getType(), NumElts - 1)) {
6975if (Final && CandidateVFs.
empty())
6978unsigned BestVF = Final ? CandidateVFs.
back() : 0;
6979for (
unsigned NumElts : CandidateVFs) {
6980if (Final && NumElts > BestVF)
6983for (
unsigned Cnt = StartIdx,
E = Loads.
size(); Cnt <
E;
6987if (VectorizedLoads.count(Slice.
front()) ||
6988 VectorizedLoads.count(Slice.
back()) ||
6991// Check if it is profitable to try vectorizing gathered loads. It is 6992// profitable if we have more than 3 consecutive loads or if we have 6993// less but all users are vectorized or deleted. 6994bool AllowToVectorize =
false;
6995// Check if it is profitable to vectorize 2-elements loads. 7001// If single use/user - allow to vectorize. 7004// 1. Check if number of uses equals number of users. 7005// 2. All users are deleted. 7006// 3. The load broadcasts are not allowed or the load is not 7008if (
static_cast<unsignedint>(std::distance(
7009 LI->user_begin(), LI->user_end())) != LI->getNumUses())
7011if (!IsLegalBroadcastLoad)
7015for (
User *U : LI->users()) {
7016if (
auto *UI = dyn_cast<Instruction>(U); UI &&
isDeleted(UI))
7018if (
const TreeEntry *UTE = getTreeEntry(U)) {
7019for (
intI : seq<int>(UTE->getNumOperands())) {
7021 [LI](
Value *V) { return V == LI; }))
7022// Found legal broadcast - do not vectorize. 7030 AllowToVectorize = CheckIfAllowed(Slice);
7034any_of(ValueToGatherNodes.at(Slice.front()),
7035 [=](
const TreeEntry *TE) {
7036 return TE->Scalars.size() == 2 &&
7037 ((TE->Scalars.front() == Slice.front() &&
7038 TE->Scalars.back() == Slice.back()) ||
7039 (TE->Scalars.front() == Slice.back() &&
7040 TE->Scalars.back() == Slice.front()));
7045if (AllowToVectorize) {
7048// Try to build vector load. 7050reinterpret_cast<Value *
const*
>(Slice.begin()), Slice.size());
7052 PointerOps, &BestVF);
7054 (BestVF > 1 &&
static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7056if (MaskedGatherVectorized.
empty() ||
7057 Cnt >= MaskedGatherVectorized.
back() + NumElts)
7062Results.emplace_back(Values, LS);
7063 VectorizedLoads.insert(Slice.begin(), Slice.end());
7064// If we vectorized initial block, no need to try to vectorize it 7067 StartIdx += NumElts;
7069// Check if the whole array was vectorized already - exit. 7070if (StartIdx >= Loads.
size())
7072// Erase last masked gather candidate, if another candidate within 7073// the range is found to be better. 7074if (!MaskedGatherVectorized.
empty() &&
7075 Cnt < MaskedGatherVectorized.
back() + NumElts)
7081if (!AllowToVectorize || BestVF == 0)
7084// Mark masked gathers candidates as vectorized, if any. 7085for (
unsigned Cnt : MaskedGatherVectorized) {
7087 Cnt, std::min<unsigned>(NumElts, Loads.
size() - Cnt));
7091 VectorizedLoads.insert(Slice.
begin(), Slice.
end());
7092// If we vectorized initial block, no need to try to vectorize it again. 7094 StartIdx += NumElts;
7098if (!VectorizedLoads.contains(LI))
7099 NonVectorized.push_back(LI);
7103auto ProcessGatheredLoads =
7108for (
ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7109if (LoadsDists.size() <= 1) {
7110 NonVectorized.
push_back(LoadsDists.back().first);
7115transform(LoadsDists, OriginalLoads.begin(),
7116 [](
const std::pair<LoadInst *, int> &L) ->
LoadInst * {
7121unsigned MaxConsecutiveDistance = 0;
7122unsigned CurrentConsecutiveDist = 1;
7123int LastDist = LocalLoadsDists.
front().second;
7124bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7125for (
const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7126if (getTreeEntry(
L.first))
7129"Expected first distance always not less than second");
7130if (
static_cast<unsigned>(LastDist -
L.second) ==
7131 CurrentConsecutiveDist) {
7132 ++CurrentConsecutiveDist;
7133 MaxConsecutiveDistance =
7134 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7138if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7141 CurrentConsecutiveDist = 1;
7145if (Loads.
size() <= 1)
7147if (AllowMaskedGather)
7148 MaxConsecutiveDistance = Loads.
size();
7149elseif (MaxConsecutiveDistance < 2)
7154 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7155 Final, MaxConsecutiveDistance);
7157 OriginalLoads.size() == Loads.
size() &&
7158 MaxConsecutiveDistance == Loads.
size() &&
7163 VectorizedLoads.
clear();
7167 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7168 UnsortedNonVectorized, Final,
7169 OriginalLoads.size());
7170if (SortedNonVectorized.
size() >= UnsortedNonVectorized.
size()) {
7171 SortedNonVectorized.
swap(UnsortedNonVectorized);
7177 << Slice.
size() <<
")\n");
7178if (
any_of(Slice, [&](
Value *V) {
return getTreeEntry(V); })) {
7179for (
Value *L : Slice)
7180if (!getTreeEntry(L))
7181 SortedNonVectorized.
push_back(cast<LoadInst>(L));
7185// Select maximum VF as a maximum of user gathered nodes and 7186// distance between scalar loads in these nodes. 7187unsigned MaxVF = Slice.size();
7188unsigned UserMaxVF = 0;
7189unsigned InterleaveFactor = 0;
7193// Found distance between segments of the interleaved loads. 7194 std::optional<unsigned> InterleavedLoadsDistance = 0;
7196 std::optional<unsigned> CommonVF = 0;
7200for (
const TreeEntry *
E : ValueToGatherNodes.at(V)) {
7201 UserMaxVF = std::max<unsigned>(UserMaxVF,
E->Scalars.size());
7204 UserMaxVF = std::max<unsigned>(UserMaxVF,
Idx - Pos + 1);
7206if (*CommonVF == 0) {
7207 CommonVF =
E->Scalars.size();
7210if (*CommonVF !=
E->Scalars.size())
7213// Check if the load is the part of the interleaved load. 7214if (Pos !=
Idx && InterleavedLoadsDistance) {
7217 if (isa<Constant>(V))
7219 if (getTreeEntry(V))
7221 const auto &Nodes = ValueToGatherNodes.at(V);
7222 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7223 !is_contained(Slice, V);
7225 InterleavedLoadsDistance.reset();
7229if (*InterleavedLoadsDistance == 0) {
7230 InterleavedLoadsDistance =
Idx - Pos;
7233if ((
Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7234 (
Idx - Pos) / *InterleavedLoadsDistance < Order)
7235 InterleavedLoadsDistance.reset();
7236 Order = (
Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7240 DeinterleavedNodes.
clear();
7241// Check if the large load represents interleaved load operation. 7242if (InterleavedLoadsDistance.value_or(0) > 1 &&
7243 CommonVF.value_or(0) != 0) {
7244 InterleaveFactor =
bit_ceil(*InterleavedLoadsDistance);
7245unsigned VF = *CommonVF;
7248// Segmented load detected - vectorize at maximum vector factor. 7249if (InterleaveFactor <= Slice.size() &&
7253 cast<LoadInst>(Slice.front())->getAlign(),
7254 cast<LoadInst>(Slice.front())
7258 UserMaxVF = InterleaveFactor * VF;
7260 InterleaveFactor = 0;
7263// Cannot represent the loads as consecutive vectorizable nodes - 7265unsigned ConsecutiveNodesSize = 0;
7266if (!LoadEntriesToVectorize.
empty() && InterleaveFactor == 0 &&
7267any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7268 [&, Slice = Slice](
constauto &
P) {
7270return std::get<1>(
P).contains(V);
7272if (It == Slice.end())
7275 VectorizableTree[std::get<0>(
P)]->Scalars;
7276 ConsecutiveNodesSize += VL.
size();
7277unsigned Start = std::distance(Slice.begin(), It);
7278unsigned Sz = Slice.size() - Start;
7279return Sz < VL.
size() ||
7280 Slice.slice(std::distance(Slice.begin(), It),
7284// Try to build long masked gather loads. 7286if (InterleaveFactor == 0 &&
7287any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7288 [&, Slice = Slice](
unsignedIdx) {
7290 SmallVector<Value *> PointerOps;
7291 return canVectorizeLoads(
7292 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7293 Slice[Idx * UserMaxVF], Order,
7295 LoadsState::ScatterVectorize;
7298if (Slice.size() != ConsecutiveNodesSize)
7299 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7301for (
unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7302bool IsVectorized =
true;
7303for (
unsignedI = 0,
E = Slice.size();
I <
E;
I += VF) {
7306if (getTreeEntry(SubSlice.
front()))
7308// Check if the subslice is to be-vectorized entry, which is not 7310if (
any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7313 VectorizableTree[std::get<0>(
P)]
7318unsigned Sz = VectorizableTree.size();
7319 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7320if (Sz == VectorizableTree.size()) {
7321 IsVectorized =
false;
7322// Try non-interleaved vectorization with smaller vector 7324if (InterleaveFactor > 0) {
7325 VF = 2 * (MaxVF / InterleaveFactor);
7326 InterleaveFactor = 0;
7335 NonVectorized.
append(SortedNonVectorized);
7337return NonVectorized;
7339for (
constauto &GLs : GatheredLoads) {
7340constauto &
Ref = GLs.second;
7342if (!
Ref.empty() && !NonVectorized.
empty() &&
7344Ref.begin(),
Ref.end(), 0u,
7346ArrayRef<std::pair<LoadInst *, int>> LoadsDists) ->
unsigned {
7347 return S + LoadsDists.size();
7348 }) != NonVectorized.
size() &&
7349 IsMaskedGatherSupported(NonVectorized)) {
7351for (
LoadInst *LI : NonVectorized) {
7352// Reinsert non-vectorized loads to other list of loads with the same 7358// Final attempt to vectorize non-vectorized loads. 7359 (void)ProcessGatheredLoads(FinalGatheredLoads,
/*Final=*/true);
7362// Try to vectorize postponed load entries, previously marked as gathered. 7363for (
unsignedIdx : LoadEntriesToVectorize) {
7364const TreeEntry &
E = *VectorizableTree[
Idx];
7366// Avoid reordering, if possible. 7367if (!
E.ReorderIndices.empty()) {
7368// Build a mask out of the reorder indices and reorder scalars per this 7374 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7376// If no new entries created, consider it as no gathered loads entries must be 7378if (
static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7379 VectorizableTree.size())
7380 GatheredLoadsEntriesFirst.reset();
/// \return true if the specified list of values has only one instruction that
/// requires scheduling, false otherwise.
static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return false;
  }
  return NeedsScheduling;
}
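// E.g. in a bundle {phi, phi, add, phi} only the add requires scheduling, so
// this helper returns true; if two or more values need scheduling it returns
// false.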
7401/// Generates key/subkey pair for the given value to provide effective sorting 7402/// of the values and better detection of the vectorizable values sequences. The 7403/// keys/subkeys can be used for better sorting of the values themselves (keys) 7404/// and in values subgroups (subkeys). 7408bool AllowAlternate) {
7411// Sort the loads by the distance between the pointers. 7412if (
auto *LI = dyn_cast<LoadInst>(V)) {
7415 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
7419// Sort extracts by the vector operands. 7420if (isa<ExtractElementInst, UndefValue>(V))
7422if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
7424 !isa<UndefValue>(EI->getIndexOperand()))
7427 }
elseif (
auto *
I = dyn_cast<Instruction>(V)) {
7428// Sort other instructions just by the opcodes except for CMPInst. 7429// For CMP also sort by the predicate kind. 7430if ((isa<BinaryOperator, CastInst>(
I)) &&
7440 : cast<CastInst>(
I)->getOperand(0)->getType()));
7441// For casts, look through the only operand to improve compile time. 7442if (isa<CastInst>(
I)) {
7443 std::pair<size_t, size_t> OpVals =
7445/*AllowAlternate=*/true);
7449 }
elseif (
auto *CI = dyn_cast<CmpInst>(
I)) {
7451if (CI->isCommutative())
7457 }
elseif (
auto *Call = dyn_cast<CallInst>(
I)) {
7471 }
elseif (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
7472if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7473 SubKey =
hash_value(Gep->getPointerOperand());
7477 !isa<ConstantInt>(
I->getOperand(1))) {
7478// Do not try to vectorize instructions with potentially high cost. 7485return std::make_pair(Key, SubKey);
7488/// Checks if the specified instruction \p I is an alternate operation for 7489/// the given \p MainOp and \p AltOp instructions. 7495bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
7497unsigned Opcode0 = S.getOpcode();
7498unsigned Opcode1 = S.getAltOpcode();
7500// If this pattern is supported by the target then consider it profitable. 7502 Opcode0, Opcode1, OpcodeMask))
7505for (
unsignedI : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7507// Prepare the operand vector. 7508for (
Value *V : VL) {
7509if (isa<PoisonValue>(V)) {
7514Operands.back().push_back(cast<Instruction>(V)->getOperand(
I));
7518// Try find best operands candidates. 7519for (
unsignedI : seq<unsigned>(0, VL.size() - 1)) {
7525switch (Res.value_or(0)) {
7540constexprunsigned NumAltInsts = 3;
// main + alt + shuffle. 7541unsigned NonInstCnt = 0;
7542// Estimate number of instructions, required for the vectorized node and for 7543// the buildvector node. 7544unsigned UndefCnt = 0;
7545// Count the number of extra shuffles, required for vector nodes. 7546unsigned ExtraShuffleInsts = 0;
7547// Check that operands do not contain same values and create either perfect 7548// diamond match or shuffled match. 7550// Do not count same operands twice. 7555 return is_contained(Operands.back(), V);
7558 ++ExtraShuffleInsts;
7562// Vectorize node, if: 7563// 1. at least single operand is constant or splat. 7564// 2. Operands have many loop invariants (the instructions are not loop 7566// 3. At least single unique operands is supposed to vectorized. 7575if (isa<Constant, ExtractElementInst>(V) ||
7576 getTreeEntry(V) || (L &&
L->isLoopInvariant(V))) {
7577 if (isa<UndefValue>(V))
7582// Found first duplicate - need to add shuffle. 7583if (!Res.second && Res.first->second == 1)
7584 ++ExtraShuffleInsts;
7585 ++Res.first->getSecond();
7586if (
auto *
I = dyn_cast<Instruction>(V))
7587 UniqueOpcodes.
insert(
I->getOpcode());
7591returnnone_of(Uniques, [&](
constauto &
P) {
7592returnP.first->hasNUsesOrMore(
P.second + 1) &&
7594 return getTreeEntry(U) || Uniques.contains(U);
         // Do not vectorize node, if estimated number of vector instructions is
         // more than estimated number of buildvector instructions. Number of
         // vector operands is number of vector instructions + number of vector
         // instructions for operands (buildvectors). Number of buildvector
         // instructions is just number_of_operands * number_of_scalars.
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
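// Worked example of the estimate above, assuming a 2-element alternate node of
// binary operators: the vectorized form is charged NumAltInsts == 3 (main op,
// alternate op, blending shuffle) plus the unique operand opcodes and any
// extra shuffles, while the buildvector form is charged
// getNumOperands() * VL.size() == 2 * 2 == 4 buildvector instructions; the
// node is kept as a vectorizable alternate node only if the vector-side
// estimate stays below that.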
7608BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7610bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7613"Expected instructions with same/alternate opcodes only.");
7615unsigned ShuffleOrOp =
7616 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }

    return TreeEntry::Vectorize;
  }
7640case Instruction::ExtractValue:
7641case Instruction::ExtractElement: {
7642bool Reuse = canReuseExtract(VL, CurrentOrder);
7643// FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and 7644// non-full registers). 7646return TreeEntry::NeedToGather;
7647if (Reuse || !CurrentOrder.empty())
7648return TreeEntry::Vectorize;
7650return TreeEntry::NeedToGather;
7652case Instruction::InsertElement: {
7653// Check that we have a buildvector and not a shuffle of 2 or more 7654// different vectors. 7656for (
Value *V : VL) {
7657 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
7659"Non-constant or undef index?");
7663return !SourceVectors.contains(V);
7665// Found 2nd source vector - cancel. 7667"different source vectors.\n");
7668return TreeEntry::NeedToGather;
7672// The last InsertElement can have multiple uses. 7673return SourceVectors.contains(V) && !
V->hasOneUse();
7678return TreeEntry::NeedToGather;
7681return TreeEntry::Vectorize;
7683case Instruction::Load: {
7684// Check that a vectorized load would load the same memory as a scalar 7685// load. For example, we don't want to vectorize loads that are smaller 7686// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 7687// treats loading/storing it as an i8 struct. If we vectorize loads/stores 7688// from such a struct, we read/write packed bits disagreeing with the 7689// unvectorized version. 7692return TreeEntry::Vectorize;
7694if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7695// Delay slow vectorized nodes for better vectorization attempts. 7696 LoadEntriesToVectorize.insert(VectorizableTree.size());
7697return TreeEntry::NeedToGather;
7699return TreeEntry::ScatterVectorize;
7701if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7702// Delay slow vectorized nodes for better vectorization attempts. 7703 LoadEntriesToVectorize.insert(VectorizableTree.size());
7704return TreeEntry::NeedToGather;
7706return TreeEntry::StridedVectorize;
7710if (
DL->getTypeSizeInBits(ScalarTy) !=
7711DL->getTypeAllocSizeInBits(ScalarTy))
7712LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
7714auto *LI = dyn_cast<LoadInst>(V);
7715return !LI || !LI->isSimple();
7722return TreeEntry::NeedToGather;
7726case Instruction::ZExt:
7727case Instruction::SExt:
7728case Instruction::FPToUI:
7729case Instruction::FPToSI:
7730case Instruction::FPExt:
7731case Instruction::PtrToInt:
7732case Instruction::IntToPtr:
7733case Instruction::SIToFP:
7734case Instruction::UIToFP:
7735case Instruction::Trunc:
7736case Instruction::FPTrunc:
7737case Instruction::BitCast: {
7739for (
Value *V : VL) {
7740if (isa<PoisonValue>(V))
7742Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7745dbgs() <<
"SLP: Gathering casts with different src types.\n");
7746return TreeEntry::NeedToGather;
7749return TreeEntry::Vectorize;
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
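  // E.g. {icmp slt %a, %b, icmp sgt %c, %d} is accepted here because sgt is
  // the swapped form of slt; operands of the compares that use the swapped
  // predicate are commuted later, when the node's operands are collected.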
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
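  // E.g. a pair of fadd instructions without the 'fast' flag is gathered here
  // when the target-specific FP safety check above also fires; with fast-math
  // on every scalar the bundle stays vectorizable.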
7797case Instruction::GetElementPtr: {
7798// We don't combine GEPs with complicated (nested) indexing. 7799for (
Value *V : VL) {
7800auto *
I = dyn_cast<GetElementPtrInst>(V);
7803if (
I->getNumOperands() != 2) {
7804LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
7805return TreeEntry::NeedToGather;
7809// We can't combine several GEPs into one vector if they operate on 7811Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7812for (
Value *V : VL) {
7813auto *
GEP = dyn_cast<GEPOperator>(V);
7816Type *CurTy =
GEP->getSourceElementType();
7818LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
7819return TreeEntry::NeedToGather;
7823// We don't combine GEPs with non-constant indexes. 7825for (
Value *V : VL) {
7826auto *
I = dyn_cast<GetElementPtrInst>(V);
7829auto *
Op =
I->getOperand(1);
7830if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7831 (
Op->getType() != Ty1 &&
7832 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7833Op->getType()->getScalarSizeInBits() >
7834DL->getIndexSizeInBits(
7835V->getType()->getPointerAddressSpace())))) {
7837dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
7838return TreeEntry::NeedToGather;
7842return TreeEntry::Vectorize;
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }

    // Check the order of pointer operands.
    if (sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    return TreeEntry::NeedToGather;
  }
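  // E.g. four i32 stores to p+2, p+0, p+3, p+1 form a single consecutive run:
  // Ptr0 = p+0, PtrN = p+3, so *Dist == 3 == VL.size() - 1 and the bundle is
  // vectorized with a non-empty CurrentOrder describing the required swizzle.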
7886case Instruction::Call: {
7887if (S.getMainOp()->getType()->isFloatingPointTy() &&
7889auto *
I = dyn_cast<Instruction>(V);
7890returnI && !
I->isFast();
7892return TreeEntry::NeedToGather;
7893// Check if the calls are all to the same vectorizable intrinsic or 7901false/*HasGlobalPred*/);
7906return TreeEntry::NeedToGather;
7911for (
unsigned J = 0; J != NumArgs; ++J)
7914for (
Value *V : VL) {
7915CallInst *CI2 = dyn_cast<CallInst>(V);
7923return TreeEntry::NeedToGather;
7925// Some intrinsics have scalar arguments and should be same in order for 7926// them to be vectorized. 7927for (
unsigned J = 0; J != NumArgs; ++J) {
7930if (ScalarArgs[J] != A1J) {
7932 <<
"SLP: mismatched arguments in call:" << *CI
7933 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
7934return TreeEntry::NeedToGather;
7938// Verify that the bundle operands are identical between the two calls. 7943LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
7944 <<
"!=" << *V <<
'\n');
7945return TreeEntry::NeedToGather;
7949return TreeEntry::Vectorize;
7951case Instruction::ShuffleVector: {
7952if (!S.isAltShuffle()) {
7953// REVEC can support non alternate shuffle. 7955return TreeEntry::Vectorize;
7956// If this is not an alternate sequence of opcode like add-sub 7957// then do not vectorize this instruction. 7959return TreeEntry::NeedToGather;
7964 <<
"SLP: ShuffleVector not vectorized, operands are buildvector and " 7965"the whole alt sequence is not profitable.\n");
7966return TreeEntry::NeedToGather;
7969return TreeEntry::Vectorize;
7973return TreeEntry::NeedToGather;
7978/// Allows to correctly handle operands of the phi nodes based on the \p Main 7979/// PHINode order of incoming basic blocks/values. 7987 PHIHandler() =
delete;
7989 : DT(DT), Main(Main), Phis(Phis),
7990Operands(Main->getNumIncomingValues(),
7992void buildOperands() {
7993constexprunsigned FastLimit = 4;
8001// Prepare the operand vector. 8003auto *
P = dyn_cast<PHINode>(V);
8005assert(isa<PoisonValue>(V) &&
8006"Expected isa instruction or poison value.");
8010if (
P->getIncomingBlock(
I) == InBB)
8025Blocks.try_emplace(InBB).first->second.push_back(
I);
8028if (isa<PoisonValue>(V)) {
8033auto *
P = cast<PHINode>(V);
8034for (
unsignedI : seq<unsigned>(0,
P->getNumIncomingValues())) {
8042auto It =
Blocks.find(InBB);
8049if (
P.getSecond().size() <= 1)
8051unsigned BasicI =
P.getSecond().front();
8054 [&](
constauto &Data) {
8055return !Data.value() ||
8056 Data.value() ==
Operands[BasicI][Data.index()];
8058"Expected empty operands list.");
8068const EdgeInfo &UserTreeIdx,
8069unsigned InterleaveFactor) {
8075auto TryToFindDuplicates = [&](
const InstructionsState &S,
8076bool DoNotFail =
false) {
8077// Check that every instruction appears once in this bundle. 8079for (
Value *V : VL) {
8086auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
8091size_t NumUniqueScalarValues = UniqueValues.
size();
8094if (NumUniqueScalarValues == VL.size() &&
8096 ReuseShuffleIndices.
clear();
8098// FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. 8099if ((UserTreeIdx.UserTE &&
8100 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI)) ||
8103"for nodes with padding.\n");
8104 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8108if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8109 (UniquePositions.size() == 1 &&
all_of(UniqueValues, [](
Value *V) {
8112if (DoNotFail && UniquePositions.size() > 1 &&
8113 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8114all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8115// Find the number of elements, which forms full vectors. 8117 *
TTI, UniqueValues.
front()->getType(), UniqueValues.
size());
8118if (PWSz == VL.size()) {
8119 ReuseShuffleIndices.
clear();
8121 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
8123 PWSz - UniqueValues.
size(),
8125// Check that extended with poisons operations are still valid for 8126// vectorization (div/rem are not allowed). 8129 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8132 VL = NonUniqueValueVL;
8137 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8147// Don't go into catchswitch blocks, which can happen with PHIs. 8148// Such blocks can only have PHIs and the catchswitch. There is no 8149// place to insert a shuffle if we need to, so just avoid that issue. 8150if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8152 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8156// Check if this is a duplicate of another entry. 8158if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8161if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8162auto It = MultiNodeScalars.
find(S.getMainOp());
8163if (It != MultiNodeScalars.
end()) {
8164auto *TEIt =
find_if(It->getSecond(),
8165 [&](TreeEntry *ME) { return ME->isSame(VL); });
8166if (TEIt != It->getSecond().end())
8177if (TryToFindDuplicates(S))
8178 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8179 ReuseShuffleIndices);
8183 Nodes.
insert(getTreeEntry(S.getMainOp()));
8184for (
const TreeEntry *E : MultiNodeScalars.
lookup(S.getMainOp()))
8187if (
any_of(Nodes, [&](
const TreeEntry *E) {
8189 [&](
Value *V) { return Values.contains(V); }))
8194all_of(VL, [&](
Value *V) {
return EValues.contains(V); }));
8197if (TryToFindDuplicates(S))
8198 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8199 ReuseShuffleIndices);
8203// Record the reuse of the tree node. FIXME, currently this is only 8204// used to properly draw the graph rather than for the actual 8206 E->UserTreeIndices.push_back(UserTreeIdx);
8207LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.getMainOp()
8214// Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of 8215// a load), in which case peek through to include it in the tree, without 8216// ballooning over-budget. 8218 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8223 cast<Instruction>(
I)->getOpcode() == S.getOpcode();
8225LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
8226if (TryToFindDuplicates(S))
8227 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8228 ReuseShuffleIndices);
8232// Don't handle scalable vectors 8233if (S && S.getOpcode() == Instruction::ExtractElement &&
8234 isa<ScalableVectorType>(
8235 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8236LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
8237if (TryToFindDuplicates(S))
8238 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8239 ReuseShuffleIndices);
8243// Don't handle vectors. 8246 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8250// If all of the operands are identical or constant we have a simple solution. 8251// If we deal with insert/extract instructions, they all must have constant 8252// indices, otherwise we should gather them, not try to vectorize. 8253// If alternate op node with 2 elements with gathered operands - do not 8255auto &&NotProfitableForVectorization = [&S,
this,
8257if (!S || !S.isAltShuffle() || VL.size() > 2)
8263// Check if all operands are extracts, part of vector node or can build a 8264// regular vectorize node. 8266for (
Value *V : VL) {
8267auto *
I = cast<Instruction>(V);
8269 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8274if ((IsCommutative &&
8275 std::accumulate(InstsCount.
begin(), InstsCount.
end(), 0) < 2) ||
8277all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
8279assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
8281auto *
I1 = cast<Instruction>(VL.front());
8282auto *I2 = cast<Instruction>(VL.back());
8283for (
intOp : seq<int>(S.getMainOp()->getNumOperands()))
8285 I2->getOperand(
Op));
8287 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
8289 })) >= S.getMainOp()->getNumOperands() / 2)
8291if (S.getMainOp()->getNumOperands() > 2)
8294// Check permuted operands. 8298 I2->getOperand((
Op + 1) % E));
8300 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
8309bool IsScatterVectorizeUserTE =
8310 UserTreeIdx.UserTE &&
8311 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8313bool AreScatterAllGEPSameBlock =
8314 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8318auto *
I = dyn_cast<GetElementPtrInst>(V);
8323return BB ==
I->getParent() &&
I->getNumOperands() == 2;
8326sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8328bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8331 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8334 NotProfitableForVectorization(VL)) {
8335LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
8336if (TryToFindDuplicates(S))
8337 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8338 ReuseShuffleIndices);
8342// Don't vectorize ephemeral values. 8343if (S && !EphValues.
empty()) {
8344for (
Value *V : VL) {
8345if (EphValues.
count(V)) {
8347 <<
") is ephemeral.\n");
8348 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8354// We now know that this is a vector of instructions of the same type from 8357// Check that none of the instructions in the bundle are already in the tree. 8358for (
Value *V : VL) {
8359if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8362if (getTreeEntry(V)) {
8364 <<
") is already in tree.\n");
8365if (TryToFindDuplicates(S))
8366 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8367 ReuseShuffleIndices);
8372// The reduction nodes (stored in UserIgnoreList) also should stay scalar. 8373if (UserIgnoreList && !UserIgnoreList->empty()) {
8374for (
Value *V : VL) {
8375if (UserIgnoreList->contains(V)) {
8377if (TryToFindDuplicates(S))
8378 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8379 ReuseShuffleIndices);
8385// Special processing for sorted pointers for ScatterVectorize node with 8386// constant indeces only. 8387if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8388assert(VL.front()->getType()->isPointerTy() &&
8389count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8390"Expected pointers only.");
8391// Reset S to make it GetElementPtr kind of node. 8392constauto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
8393assert(It != VL.end() &&
"Expected at least one GEP.");
8397// Check that all of the users of the scalars that we want to vectorize are 8405// Don't go into unreachable blocks. They may contain instructions with 8406// dependency cycles which confuse the final scheduling. 8407// Do not vectorize EH and non-returning blocks, not profitable in most 8410 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8414// Check that every instruction appears once in this bundle. 8415if (!TryToFindDuplicates(S,
/*DoNotFail=*/true))
8418// Perform specific checks for each particular instruction kind. 8421 TreeEntry::EntryState State = getScalarsVectorizationState(
8422 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8423if (State == TreeEntry::NeedToGather) {
8424 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8425 ReuseShuffleIndices);
8429auto &BSRef = BlocksSchedules[BB];
8431 BSRef = std::make_unique<BlockScheduling>(BB);
8433 BlockScheduling &BS = *BSRef;
8435 std::optional<ScheduleData *> Bundle =
8436 BS.tryScheduleBundle(UniqueValues,
this, S);
8437#ifdef EXPENSIVE_CHECKS 8438// Make sure we didn't break any internal invariants 8442LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
8443assert((!BS.getScheduleData(VL0) ||
8444 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8445"tryScheduleBundle should cancelScheduling on failure");
8446 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8447 ReuseShuffleIndices);
8448 NonScheduledFirst.insert(VL.front());
8449if (S.getOpcode() == Instruction::Load &&
8450 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8454LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
8456unsigned ShuffleOrOp =
8457 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
8458auto CreateOperandNodes = [&](TreeEntry *
TE,
constauto &
Operands) {
8459// Postpone PHI nodes creation 8461for (
unsignedI : seq<unsigned>(
Operands.size())) {
8466if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8471for (
unsignedI : PHIOps)
8474switch (ShuffleOrOp) {
8475case Instruction::PHI: {
8476auto *PH = cast<PHINode>(VL0);
8479 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8483// Keeps the reordered operands to avoid code duplication. 8484 PHIHandler Handler(*DT, PH, VL);
8485 Handler.buildOperands();
8486for (
unsignedI : seq<unsigned>(PH->getNumOperands()))
8487TE->setOperand(
I, Handler.getOperands(
I));
8489for (
unsignedI : seq<unsigned>(PH->getNumOperands()))
8494case Instruction::ExtractValue:
8495case Instruction::ExtractElement: {
8496if (CurrentOrder.empty()) {
8497LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
8500dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence " 8502for (
unsignedIdx : CurrentOrder)
8508// Insert new order with initial value 0, if it does not exist, 8509// otherwise return the iterator to the existing one. 8510 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8511 ReuseShuffleIndices, CurrentOrder);
8513"(ExtractValueInst/ExtractElementInst).\n";
8515// This is a special case, as it does not gather, but at the same time 8516// we are not extending buildTree_rec() towards the operands. 8517TE->setOperand(*
this);
8520case Instruction::InsertElement: {
8521assert(ReuseShuffleIndices.
empty() &&
"All inserts should be unique");
8523auto OrdCompare = [](
const std::pair<int, int> &
P1,
8524const std::pair<int, int> &P2) {
8525returnP1.first > P2.first;
8528decltype(OrdCompare)>
8529 Indices(OrdCompare);
8530for (
intI = 0, E = VL.size();
I < E; ++
I) {
8532 Indices.emplace(
Idx,
I);
8534OrdersType CurrentOrder(VL.size(), VL.size());
8535bool IsIdentity =
true;
8536for (
intI = 0, E = VL.size();
I < E; ++
I) {
8537 CurrentOrder[Indices.top().second] =
I;
8538 IsIdentity &= Indices.top().second ==
I;
8542 CurrentOrder.clear();
8543 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8545LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (InsertElementInst).\n";
8548TE->setOperand(*
this);
8549 buildTree_rec(
TE->getOperand(1),
Depth + 1, {TE, 1});
8552case Instruction::Load: {
8553// Check that a vectorized load would load the same memory as a scalar 8554// load. For example, we don't want to vectorize loads that are smaller 8555// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 8556// treats loading/storing it as an i8 struct. If we vectorize loads/stores 8557// from such a struct, we read/write packed bits disagreeing with the 8558// unvectorized version. 8559 TreeEntry *
TE =
nullptr;
8562case TreeEntry::Vectorize:
8563TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8564 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8565if (CurrentOrder.empty())
8570 <<
"SLP: added a new TreeEntry (jumbled LoadInst).\n";
8573case TreeEntry::StridedVectorize:
8574// Vectorizing non-consecutive loads with `llvm.masked.gather`. 8575TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8576 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8577LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (strided LoadInst).\n";
8580case TreeEntry::ScatterVectorize:
8581// Vectorizing non-consecutive loads with `llvm.masked.gather`. 8582TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8583 UserTreeIdx, ReuseShuffleIndices);
8586 <<
"SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8589case TreeEntry::CombinedVectorize:
8590case TreeEntry::NeedToGather:
8593TE->setOperand(*
this);
8594if (State == TreeEntry::ScatterVectorize)
8595 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
8598case Instruction::ZExt:
8599case Instruction::SExt:
8600case Instruction::FPToUI:
8601case Instruction::FPToSI:
8602case Instruction::FPExt:
8603case Instruction::PtrToInt:
8604case Instruction::IntToPtr:
8605case Instruction::SIToFP:
8606case Instruction::UIToFP:
8607case Instruction::Trunc:
8608case Instruction::FPTrunc:
8609case Instruction::BitCast: {
8610auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8611 std::make_pair(std::numeric_limits<unsigned>::min(),
8612 std::numeric_limits<unsigned>::max()));
8613if (ShuffleOrOp == Instruction::ZExt ||
8614 ShuffleOrOp == Instruction::SExt) {
8615 CastMaxMinBWSizes = std::make_pair(
8621 }
elseif (ShuffleOrOp == Instruction::Trunc) {
8622 CastMaxMinBWSizes = std::make_pair(
8629 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8630 ReuseShuffleIndices);
8634TE->setOperand(*
this);
8636 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8637if (ShuffleOrOp == Instruction::Trunc) {
8638 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8639 }
elseif (ShuffleOrOp == Instruction::SIToFP ||
8640 ShuffleOrOp == Instruction::UIToFP) {
8641unsigned NumSignBits =
8643if (
auto *OpI = dyn_cast<Instruction>(VL0->
getOperand(0))) {
8645 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
8647if (NumSignBits * 2 >=
8649 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8653case Instruction::ICmp:
8654case Instruction::FCmp: {
8655// Check that all of the compares have the same predicate. 8657 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8658 ReuseShuffleIndices);
8663 VLOperands Ops(VL, S, *
this);
8665// Commutative predicate - collect + sort operands of the instructions 8666// so that each side is more likely to have the same opcode. 8668"Commutative Predicate mismatch");
8671Right = Ops.getVL(1);
8673// Collect operands - commute if it uses the swapped predicate. 8674for (
Value *V : VL) {
8675if (isa<PoisonValue>(V)) {
8680auto *
Cmp = cast<CmpInst>(V);
8683if (
Cmp->getPredicate() != P0)
8686Right.push_back(RHS);
8693if (ShuffleOrOp == Instruction::ICmp) {
8694unsigned NumSignBits0 =
8696if (NumSignBits0 * 2 >=
8698 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8699unsigned NumSignBits1 =
8701if (NumSignBits1 * 2 >=
8703 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->
Idx);
8707case Instruction::Select:
8708case Instruction::FNeg:
8709case Instruction::Add:
8710case Instruction::FAdd:
8711case Instruction::Sub:
8712case Instruction::FSub:
8713case Instruction::Mul:
8714case Instruction::FMul:
8715case Instruction::UDiv:
8716case Instruction::SDiv:
8717case Instruction::FDiv:
8718case Instruction::URem:
8719case Instruction::SRem:
8720case Instruction::FRem:
8721case Instruction::Shl:
8722case Instruction::LShr:
8723case Instruction::AShr:
8724case Instruction::And:
8725case Instruction::Or:
8726case Instruction::Xor:
8727case Instruction::Freeze: {
8728 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8729 ReuseShuffleIndices);
8731dbgs() <<
"SLP: added a new TreeEntry " 8732"(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8737 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8740case Instruction::GetElementPtr: {
8741 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8742 ReuseShuffleIndices);
8743LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (GetElementPtrInst).\n";
8746// Prepare the operand vector for pointer operands. 8747for (
Value *V : VL) {
8748auto *
GEP = dyn_cast<GetElementPtrInst>(V);
8753Operands.front().push_back(
GEP->getPointerOperand());
8756// Need to cast all indices to the same type before vectorization to 8758// Required to be able to find correct matches between different gather 8759// nodes and reuse the vectorized values rather than trying to gather them 8764 [VL0Ty, IndexIdx](
Value *V) {
8765auto *
GEP = dyn_cast<GetElementPtrInst>(V);
8768return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
8772 ->getPointerOperandType()
8774// Prepare the operand vector. 8775for (
Value *V : VL) {
8776auto *
I = dyn_cast<GetElementPtrInst>(V);
8779 ConstantInt::get(Ty, 0,
/*isSigned=*/false));
8782auto *
Op =
I->getOperand(IndexIdx);
8783auto *CI = dyn_cast<ConstantInt>(
Op);
8788 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8792for (
unsignedI = 0, Ops =
Operands.size();
I < Ops; ++
I)
8796case Instruction::Store: {
8797bool Consecutive = CurrentOrder.empty();
8800 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8801 ReuseShuffleIndices, CurrentOrder);
8807dbgs() <<
"SLP: added a new TreeEntry (jumbled StoreInst).\n";
8809TE->setOperand(*
this);
8810 buildTree_rec(
TE->getOperand(0),
Depth + 1, {TE, 0});
8813case Instruction::Call: {
8814// Check if the calls are all to the same vectorizable intrinsic or 8819 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8820 ReuseShuffleIndices);
8824for (
unsignedI : seq<unsigned>(CI->
arg_size())) {
8825// For scalar operands no need to create an entry since no need to 8829 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8833case Instruction::ShuffleVector: {
8834 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8835 ReuseShuffleIndices);
8836if (S.isAltShuffle()) {
8837LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (isAltShuffle).\n";
8842dbgs() <<
"SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8846// Reorder operands if reordering would enable vectorization. 8847auto *CI = dyn_cast<CmpInst>(VL0);
8849return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8851auto *MainCI = cast<CmpInst>(S.getMainOp());
8852auto *AltCI = cast<CmpInst>(S.getAltOp());
8856"Expected different main/alternate predicates.");
8858// Collect operands - commute if it uses the swapped predicate or 8859// alternate operation. 8860for (
Value *V : VL) {
8861if (isa<PoisonValue>(V)) {
8866auto *
Cmp = cast<CmpInst>(V);
8878Right.push_back(RHS);
8887TE->setOperand(*
this, isa<BinaryOperator>(VL0) || CI);
8889 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8902while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8905if (
auto *ST = dyn_cast<StructType>(EltTy)) {
8906// Check that struct is homogeneous. 8907for (
constauto *Ty : ST->elements())
8908if (Ty != *ST->element_begin())
8910N *= ST->getNumElements();
8911 EltTy = *ST->element_begin();
8912 }
elseif (
auto *AT = dyn_cast<ArrayType>(EltTy)) {
8913N *= AT->getNumElements();
8914 EltTy = AT->getElementType();
8916auto *VT = cast<FixedVectorType>(EltTy);
8917N *= VT->getNumElements();
8918 EltTy = VT->getElementType();
8925if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8933bool ResizeAllowed)
const{
8934constauto *It =
find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8935assert(It != VL.
end() &&
"Expected at least one extract instruction.");
8936auto *E0 = cast<Instruction>(*It);
8938all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8940// Check if all of the extracts come from the same vector and from the 8942Value *Vec = E0->getOperand(0);
8944 CurrentOrder.
clear();
8946// We have to extract from a vector/aggregate with the same number of elements. 8948if (E0->getOpcode() == Instruction::ExtractValue) {
8952// Check if load can be rewritten as load of vector. 8953LoadInst *LI = dyn_cast<LoadInst>(Vec);
8957 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
8960unsigned E = VL.
size();
8961if (!ResizeAllowed && NElts != E)
8964unsigned MinIdx = NElts, MaxIdx = 0;
8966auto *Inst = dyn_cast<Instruction>(V);
8969if (Inst->getOperand(0) != Vec)
8971if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
8972if (isa<UndefValue>(EE->getIndexOperand()))
8977constunsigned ExtIdx = *
Idx;
8980 Indices[
I] = ExtIdx;
8986if (MaxIdx - MinIdx + 1 > E)
8991// Check that all of the indices extract from the correct offset. 8992bool ShouldKeepOrder =
true;
8993// Assign to all items the initial value E + 1 so we can check if the extract 8994// instruction index was used already. 8995// Also, later we can check that all the indices are used and we have a 8996// consecutive access in the extract instructions, by checking that no 8997// element of CurrentOrder still has value E + 1. 8998 CurrentOrder.
assign(E, E);
8999for (
unsignedI = 0;
I < E; ++
I) {
9002constunsigned ExtIdx = Indices[
I] - MinIdx;
9003if (CurrentOrder[ExtIdx] != E) {
9004 CurrentOrder.
clear();
9007 ShouldKeepOrder &= ExtIdx ==
I;
9008 CurrentOrder[ExtIdx] =
I;
9011 CurrentOrder.
clear();
9013return ShouldKeepOrder;
9016bool BoUpSLP::areAllUsersVectorized(
9018return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
9020 return ScalarToTreeEntry.contains(U) ||
9021 isVectorLikeInstWithConstOps(U) ||
9022 (isa<ExtractElementInst>(U) && MustGather.contains(U));
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);

  auto Shape = VFShape::get(CI->getFunctionType(),
                            ElementCount::getFixed(VecTy->getNumElements()),
                            false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
  auto LibCost = IntrinsicCost;
  if (!CI->isNoBuiltin() && VecFunc) {
    // Calculate the cost of the vector library call.
    // If the corresponding vector call is cheaper, return its cost.
    LibCost =
        TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
  }
  return {IntrinsicCost, LibCost};
}
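// Illustrative example: for a call widened to VF = 4 that maps to
// llvm.sqrt.f32, the first member is the cost of the <4 x float> llvm.sqrt
// intrinsic and the second is the cost of a matching vector-library routine
// (if TLI provides one, e.g. an SVML/SLEEF-style binding); callers then pick
// whichever of the two turns out cheaper.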
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
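// Example (illustrative): for Scalars = {add, sub, add, sub} with IsAltOp
// matching the subs and no reordering, the resulting mask is
// <0, Sz + 1, 2, Sz + 3> with Sz == 4, i.e. <0, 5, 2, 7>: even lanes come
// from the "main" (add) vector and odd lanes from the "alternate" (sub)
// vector when the final select-like shuffle is emitted.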
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    // ... (exact or swapped matches against MainCI/AltCI are resolved first)
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    // Classify as the alternate operation when the predicate does not match
    // the main predicate, directly or swapped.
    return MainP != P && MainP != SwappedP;
  }
  const auto *Op0 = Ops.front();

  const bool IsConstant = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // TODO: We should allow undef elements here
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
/// The base class for shuffle instruction emission and shuffle cost estimation.
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements.
  /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
  /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
  /// of 8.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
9183 /// Checks if the mask is an identity mask. 9184 /// \param IsStrict if is true the function returns false if mask size does 9185 /// not match vector size. 9188int Limit =
Mask.size();
9194// Consider extract subvector starting from index 0. 9198// All VF-size submasks are identity (e.g. 9199// <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4). 9200if (Limit % VF == 0 &&
all_of(seq<int>(0, Limit / VF), [=](
intIdx) {
9210 /// Tries to combine 2 different masks into single one. 9211 /// \param LocalVF Vector length of the permuted input vector. \p Mask may 9212 /// change the size of the vector, \p LocalVF is the original size of the 9213 /// shuffled vector. 9216unsigned VF =
Mask.size();
9218for (
intI = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
9221int MaskedIdx =
Mask[ExtMask[
I] % VF];
9228 /// Looks through shuffles trying to reduce final number of shuffles in the 9229 /// code. The function looks through the previously emitted shuffle 9230 /// instructions and properly mark indices in mask as undef. 9231 /// For example, given the code 9233 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0> 9234 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0> 9236 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will 9237 /// look through %s1 and %s2 and select vectors %0 and %1 with mask 9238 /// <0, 1, 2, 3> for the shuffle. 9239 /// If 2 operands are of different size, the smallest one will be resized and 9240 /// the mask recalculated properly. 9241 /// For example, given the code 9243 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0> 9244 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0> 9246 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will 9247 /// look through %s1 and %s2 and select vectors %0 and %1 with mask 9248 /// <0, 1, 2, 3> for the shuffle. 9249 /// So, it tries to transform permutations to simple vector merge, if 9251 /// \param V The input vector which must be shuffled using the given \p Mask. 9252 /// If the better candidate is found, \p V is set to this best candidate 9254 /// \param Mask The input mask for the shuffle. If the best candidate is found 9255 /// during looking-through-shuffles attempt, it is updated accordingly. 9256 /// \param SinglePermute true if the shuffle operation is originally a 9257 /// single-value-permutation. In this case the look-through-shuffles procedure 9258 /// may look for resizing shuffles as the best candidates. 9259 /// \return true if the shuffle results in the non-resizing identity shuffle 9260 /// (and thus can be ignored), false - otherwise. 9262bool SinglePermute) {
9266while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
9267// Exit if not a fixed vector type or changing size shuffle. 9268auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9271// Remember the identity or broadcast mask, if it is not a resizing 9272// shuffle. If no better candidates are found, this Op and Mask will be 9273// used in the final shuffle. 9274if (isIdentityMask(Mask, SVTy,
/*IsStrict=*/false)) {
9275if (!IdentityOp || !SinglePermute ||
9276 (isIdentityMask(Mask, SVTy,
/*IsStrict=*/true) &&
9278 IdentityMask.
size()))) {
9280// Store current mask in the IdentityMask so later we did not lost 9281// this info if IdentityOp is selected as the best candidate for the 9283 IdentityMask.
assign(Mask);
9286// Remember the broadcast mask. If no better candidates are found, this Op 9287// and Mask will be used in the final shuffle. 9288// Zero splat can be used as identity too, since it might be used with 9289// mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling. 9290// E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is 9291// expensive, the analysis founds out, that the source vector is just a 9292// broadcast, this original mask can be transformed to identity mask <0, 9295// %0 = shuffle %v, poison, zeroinitalizer 9296// %res = shuffle %0, poison, <3, 1, 2, 0> 9298// may be transformed to 9300// %0 = shuffle %v, poison, zeroinitalizer 9301// %res = shuffle %0, poison, <0, 1, 2, 3> 9303if (SV->isZeroEltSplat()) {
9305 IdentityMask.
assign(Mask);
9307int LocalVF =
Mask.size();
9309 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9310 LocalVF = SVOpTy->getNumElements();
9314static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
9316 ExtMask[
Idx] = SV->getMaskValue(
I);
9326if (!IsOp1Undef && !IsOp2Undef) {
9327// Update mask and mark undef elems. 9328for (
int &
I : Mask) {
9331if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
9338 combineMasks(LocalVF, ShuffleMask, Mask);
9339Mask.swap(ShuffleMask);
9341Op = SV->getOperand(0);
9343Op = SV->getOperand(1);
9345if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
9346 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9351"Expected masks of same sizes.");
9352// Clear known poison elements. 9356Mask.swap(IdentityMask);
9357auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9358return SinglePermute &&
9359 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
9360/*IsStrict=*/true) ||
9361 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
9362 Shuffle->isZeroEltSplat() &&
9372 /// Smart shuffle instruction emission, walks through shuffles trees and 9373 /// tries to find the best matching vector for the actual shuffle 9375template <
typename T,
typename ShuffleBuilderTy>
9377 ShuffleBuilderTy &Builder,
Type *ScalarTy) {
9378assert(V1 &&
"Expected at least one vector value.");
9381if (ScalarTyNumElements != 1) {
9387 Builder.resizeToMatch(V1, V2);
9389if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
9390 VF = FTy->getNumElements();
9391if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9394// Peek through shuffles. 9398 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
9401for (
intI = 0,
E =
Mask.size();
I <
E; ++
I) {
9403 CombinedMask1[
I] =
Mask[
I];
9405 CombinedMask2[
I] =
Mask[
I] - VF;
9412 (void)peekThroughShuffles(Op1, CombinedMask1,
/*SinglePermute=*/false);
9413 (void)peekThroughShuffles(Op2, CombinedMask2,
/*SinglePermute=*/false);
9414// Check if we have 2 resizing shuffles - need to peek through operands 9416if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9417if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9422 ExtMask1[
Idx] = SV1->getMaskValue(
I);
9425 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9427 ExtMask1, UseMask::SecondArg);
9432 ExtMask2[
Idx] = SV2->getMaskValue(
I);
9435 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9437 ExtMask2, UseMask::SecondArg);
9438if (SV1->getOperand(0)->getType() ==
9439 SV2->getOperand(0)->getType() &&
9440 SV1->getOperand(0)->getType() != SV1->getType() &&
9443 Op1 = SV1->getOperand(0);
9444 Op2 = SV2->getOperand(0);
9446int LocalVF = ShuffleMask1.size();
9447if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
9448 LocalVF = FTy->getNumElements();
9449 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9450 CombinedMask1.swap(ShuffleMask1);
9452 LocalVF = ShuffleMask2.size();
9453if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
9454 LocalVF = FTy->getNumElements();
9455 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9456 CombinedMask2.swap(ShuffleMask2);
9459 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
9460 Builder.resizeToMatch(Op1, Op2);
9461 VF = std::max(cast<VectorType>(Op1->
getType())
9463 .getKnownMinValue(),
9464 cast<VectorType>(Op2->
getType())
9466 .getKnownMinValue());
9467for (
intI = 0,
E =
Mask.size();
I <
E; ++
I) {
9470"Expected undefined mask element");
9471 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
9477 isa<ShuffleVectorInst>(Op1) &&
9478 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9480return Builder.createIdentity(Op1);
9481return Builder.createShuffleVector(
9485if (isa<PoisonValue>(V1))
9486return Builder.createPoison(
9487 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
9488bool IsIdentity = peekThroughShuffles(V1, NewMask,
/*SinglePermute=*/true);
9489assert(V1 &&
"Expected non-null value after looking through shuffles.");
9492return Builder.createShuffleVector(V1, NewMask);
9493return Builder.createIdentity(V1);
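  // Illustrative walk-through of the look-through logic above (IR sketch):
  //
  //   %s1 = shufflevector <2 x ty> %a, <2 x ty> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  //   %s2 = shufflevector <2 x ty> %b, <2 x ty> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  //
  // A requested shuffle of %s1/%s2 with mask <0, 1, 4, 5> peeks through both
  // resizing shuffles, picks %a and %b directly and emits the single
  //   shufflevector <2 x ty> %a, <2 x ty> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  // instead of three shuffle instructions.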
  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned I : seq<unsigned>(CommonMask.size()))
      if (Mask[I] != PoisonMaskElem)
        CommonMask[I] = I;
  }
};
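// Note (illustrative): this helper is called right after a shuffle with
// \p Mask has been emitted or its cost accounted for. Every lane I the
// shuffle actually produced (Mask[I] != poison) now lives at position I of
// the new vector, so the common mask is rewritten to the identity index for
// those lanes. E.g. CommonMask = Mask = <3, 1, poison, 0> becomes
// <0, 1, poison, 3>.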
/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as plane wide unit-stride load/store since all the
  // loads/stores are known to be from/to adjacent locations.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: estimate costs for pointer related costs when vectorizing to
    // a wide load/store.
    // Scalar cost is estimated as a set of pointers with known relationship
    // between them.
    // For vector code we will use BasePtr as argument for the wide load/store
    // but we also need to account all the instructions which are going to
    // stay in vectorized code due to uses outside of these scalar
    // loads/stores.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity assume Ptr to stay in vectorized code if it's not a
      // GEP instruction. We don't care since it's cost considered free.
      // TODO: We should check for any uses outside of vectorizable tree
      // rather than just single use.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have
      // any savings on that.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Case 1: Ptrs are the arguments of loads that we are going to transform
    // into masked gather load intrinsic.
    // All the scalar GEPs will be removed as a result of vectorization.
    // For any external uses of some lanes extract element instructions will
    // be generated (which cost is estimated separately).
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP)
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(),
                               SmallVector<const Value *>(BaseGEP->indices()),
                               VecTy, CostKind);
  }

  return std::make_pair(ScalarCost, VecCost);
}
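// Illustrative example for case (2) above: four adjacent scalar loads fed by
// GEPs %p, %p+1, %p+2, %p+3 are costed on the scalar side as a unit-stride
// pointer chain rooted at %p, while the vector side only needs %p itself as
// the address of the wide load; any GEP that must survive (e.g. because it
// has users outside this bundle) is added back into the vector-side cost.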
9589void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9590assert(
TE.isGather() &&
TE.ReorderIndices.empty() &&
9591"Expected gather node without reordering.");
9595// Do not reorder nodes if it small (just 2 elements), all-constant or all 9596// instructions have same opcode already. 9597if (
TE.Scalars.size() == 2 || (
TE.hasState() && !
TE.isAltShuffle()) ||
9601if (
any_of(seq<unsigned>(
TE.Idx), [&](
unsignedIdx) {
9602 return VectorizableTree[Idx]->isSame(TE.Scalars);
9606auto GenerateLoadsSubkey = [&](
size_tKey,
LoadInst *LI) {
9611auto LIt = LoadsMap.
find(std::make_pair(Key,
Ptr));
9612if (LIt != LoadsMap.
end()) {
9616/*StrictCheck=*/true))
9626if (LIt->second.size() > 2) {
9628hash_value(LIt->second.back()->getPointerOperand());
9634 LoadsMap.
try_emplace(std::make_pair(Key,
Ptr)).first->second.push_back(LI);
9639bool IsOrdered =
true;
9640unsigned NumInstructions = 0;
9641// Try to "cluster" scalar instructions, to be able to build extra vectorized 9645if (
auto *Inst = dyn_cast<Instruction>(V);
9646 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9649/*AllowAlternate=*/false);
9652auto &Container = SortedValues[
Key];
9653if (IsOrdered && !KeyToIndex.
contains(V) &&
9654 !(isa<Constant, ExtractElementInst>(V) ||
9656 ((Container.contains(
Idx) &&
9657 KeyToIndex.
at(Container[
Idx].back()).back() !=
I - 1) ||
9658 (!Container.empty() && !Container.contains(
Idx) &&
9659 KeyToIndex.
at(Container.back().second.back()).back() !=
I - 1)))
9661auto &KTI = KeyToIndex[
V];
9663 Container[
Idx].push_back(V);
9668if (!IsOrdered && NumInstructions > 1) {
9670TE.ReorderIndices.resize(
TE.Scalars.size(),
TE.Scalars.size());
9671for (
constauto &
D : SortedValues) {
9672for (
constauto &
P :
D.second) {
9674for (
Value *V :
P.second) {
9677TE.ReorderIndices[Cnt +
K] =
Idx;
9678TE.Scalars[Cnt +
K] =
V;
9680 Sz += Indices.
size();
9681 Cnt += Indices.
size();
9683if (Sz > 1 && isa<Instruction>(
P.second.front())) {
9685 *
TTI,
TE.Scalars.front()->getType(), Sz);
9687for (
unsignedI : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9689 }
elseif (!
P.second.empty() &&
isConstant(
P.second.front())) {
9690for (
unsignedI : seq<unsigned>(Cnt - Sz, Cnt))
9696// Reuses always require shuffles, so consider it as profitable. 9697if (!
TE.ReuseShuffleIndices.empty() ||
TE.ReorderIndices.empty())
9699// Do simple cost estimation. 9702auto *ScalarTy =
TE.Scalars.front()->getType();
9704for (
auto [
Idx, Sz] : SubVectors) {
9708if (
auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9710// If ScalarTy is FixedVectorType, we should use CreateInsertVector instead 9711// of CreateInsertElement. 9713for (
unsignedI : seq<unsigned>(
TE.Scalars.size()))
9722int Sz =
TE.Scalars.size();
9724TE.ReorderIndices.end());
9725for (
unsignedI : seq<unsigned>(Sz)) {
9727if (isa<PoisonValue>(V)) {
9730 ReorderMask[
I] =
I +
TE.ReorderIndices.size();
9734any_of(ReorderMask, [&](
intI) {
returnI >= Sz; })
9737 VecTy, ReorderMask);
9740for (
unsignedI : seq<unsigned>(Sz)) {
9744if (!isa<PoisonValue>(V))
9747 ReorderMask[
I] =
I + Sz;
9751 VecTy, DemandedElts,
/*Insert=*/true,
/*Extract=*/false,
CostKind);
9754if (
Cost >= BVCost) {
9757TE.ReorderIndices.clear();
9763 BaseGraphSize = VectorizableTree.size();
9764// Turn graph transforming mode on and off, when done. 9765classGraphTransformModeRAAI {
9766bool &SavedIsGraphTransformMode;
9769 GraphTransformModeRAAI(
bool &IsGraphTransformMode)
9770 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9771 IsGraphTransformMode =
true;
9773 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode =
false; }
9774 } TransformContext(IsGraphTransformMode);
9775// Operands are profitable if they are: 9776// 1. At least one constant 9780// 3. Results in good vectorization opportunity, i.e. may generate vector 9781// nodes and reduce cost of the graph. 9783const InstructionsState &S) {
9785for (
unsignedOp : seq<unsigned>(S.getMainOp()->getNumOperands()))
9787 I2->getOperand(
Op));
9789 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
9791 [](
const std::pair<Value *, Value *> &
P) {
9792return isa<Constant>(
P.first) ||
9793 isa<Constant>(
P.second) ||
P.first ==
P.second;
9799// Try to reorder gather nodes for better vectorization opportunities. 9800for (
unsignedIdx : seq<unsigned>(BaseGraphSize)) {
9801 TreeEntry &E = *VectorizableTree[
Idx];
9803 reorderGatherNode(E);
9806// The tree may grow here, so iterate over nodes, built before. 9807for (
unsignedIdx : seq<unsigned>(BaseGraphSize)) {
9808 TreeEntry &E = *VectorizableTree[
Idx];
9813// Do not try partial vectorization for small nodes (<= 2), nodes with the 9814// same opcode and same parent block or all constants. 9815if (VL.
size() <= 2 || LoadEntriesToVectorize.contains(
Idx) ||
9816 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
9820// Try to find vectorizable sequences and transform them into a series of 9821// insertvector instructions. 9822unsigned StartIdx = 0;
9827 *
TTI, VL.
front()->getType(), VF - 1)) {
9828if (StartIdx + VF >
End)
9831for (
unsigned Cnt = StartIdx; Cnt + VF <=
End; Cnt += VF) {
9833// If any instruction is vectorized already - do not try again. 9834// Reuse the existing node, if it fully matches the slice. 9835if (
const TreeEntry *SE = getTreeEntry(Slice.
front());
9836 SE || getTreeEntry(Slice.
back())) {
9839if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9842// Constant already handled effectively - skip. 9845// Do not try to vectorize small splats (less than vector register and 9846// only with the single non-undef element). 9848if (Slices.
empty() || !IsSplat ||
9850 Slice.
front()->getType(), VF)),
9853 Slice.
front()->getType(), 2 * VF)),
9856static_cast<long>(isa<UndefValue>(Slice.
front()) ? VF - 1
9862 (S.getOpcode() == Instruction::Load &&
9864 (S.getOpcode() != Instruction::Load &&
9868// Try to vectorize reduced values or if all users are vectorized. 9869// For expensive instructions extra extracts might be profitable. 9870if ((!UserIgnoreList || E.Idx != 0) &&
9874if (isa<PoisonValue>(V))
9876return areAllUsersVectorized(cast<Instruction>(V),
9880if (S.getOpcode() == Instruction::Load) {
9885// Do not vectorize gathers. 9890// If reductions and the scalars from the root node are 9891// analyzed - mark as non-vectorizable reduction. 9892if (UserIgnoreList && E.Idx == 0)
9897 }
elseif (S.getOpcode() == Instruction::ExtractElement ||
9900 !CheckOperandsProfitability(
9903 IsaPred<Instruction>)),
9905// Do not vectorize extractelements (handled effectively 9906// alread). Do not vectorize non-profitable instructions (with 9907// low cost and non-vectorizable operands.) 9914auto AddCombinedNode = [&](
unsignedIdx,
unsigned Cnt,
unsigned Sz) {
9915 E.CombinedEntriesWithIndices.emplace_back(
Idx, Cnt);
9917 StartIdx = Cnt + Sz;
9921for (
auto [Cnt, Sz] : Slices) {
9923// If any instruction is vectorized already - do not try again. 9924if (TreeEntry *SE = getTreeEntry(Slice.
front());
9925 SE || getTreeEntry(Slice.
back())) {
9928if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9930 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9931 AddCombinedNode(SE->Idx, Cnt, Sz);
9934unsigned PrevSize = VectorizableTree.size();
9935 [[maybe_unused]]
unsigned PrevEntriesSize =
9936 LoadEntriesToVectorize.size();
9937 buildTree_rec(Slice, 0,
EdgeInfo(&E, UINT_MAX));
9938if (PrevSize + 1 == VectorizableTree.size() &&
9939 VectorizableTree[PrevSize]->isGather() &&
9940 VectorizableTree[PrevSize]->hasState() &&
9941 VectorizableTree[PrevSize]->getOpcode() !=
9942 Instruction::ExtractElement &&
9944if (UserIgnoreList && E.Idx == 0 && VF == 2)
9946 VectorizableTree.pop_back();
9947assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9948"LoadEntriesToVectorize expected to remain the same");
9951 AddCombinedNode(PrevSize, Cnt, Sz);
9954// Restore ordering, if no extra vectorization happened. 9955if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9958 E.ReorderIndices.clear();
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
      // operands.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as strided
      // load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind) +
            TTI->getShuffleCost(TTI::SK_Reverse, VecTy, {}, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
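        // Illustrative IR sketch of the trade-off being costed here:
        //   %v = load <4 x i32>, ptr %p                                  ; consecutive
        //   %r = shufflevector <4 x i32> %v, poison, <i32 3, i32 2, i32 1, i32 0>
        // may instead be emitted as a single strided load that starts at the
        // last element and walks backwards with stride -1, saving the explicit
        // reverse shuffle when the target makes that cheaper.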
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as
      // strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind) +
            TTI->getShuffleCost(TTI::SK_Reverse, VecTy, {}, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive store -
          // transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) -> unsigned {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(Mask, Factor, Mask.size()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }
          return 0;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
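        // For example, a reorder mask following the canonical factor-2
        // interleave pattern for 8 elements, <0, 4, 1, 5, 2, 6, 3, 7>, is
        // recognized here (assuming the target reports such interleaved
        // accesses as legal), so codegen can later use an interleaved store
        // instead of a shuffle followed by a plain wide store.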
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
      if (MinMaxID == Intrinsic::not_intrinsic)
        break;
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
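        // Illustrative pattern handled here (IR sketch): when every scalar in
        // the node looks like
        //   %c = icmp slt i32 %a, %b
        //   %m = select i1 %c, i32 %a, i32 %b
        // the select node is costed and emitted as a combined smin-style
        // MinMax operation, and the feeding compare node is folded into it
        // rather than being vectorized as a separate tree entry.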
10064if (LoadEntriesToVectorize.empty()) {
10065// Single load node - exit. 10066if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
10067 VectorizableTree.front()->getOpcode() == Instruction::Load)
10069// Small graph with small VF - exit. 10070constexprunsigned SmallTree = 3;
10071constexprunsigned SmallVF = 2;
10072if ((VectorizableTree.size() <= SmallTree &&
10073 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10074 (VectorizableTree.size() <= 2 && UserIgnoreList))
10077if (VectorizableTree.front()->isNonPowOf2Vec() &&
10081 [](
const std::unique_ptr<TreeEntry> &TE) {
10082return TE->isGather() && TE->hasState() &&
10083 TE->getOpcode() == Instruction::Load &&
10089// A list of loads to be gathered during the vectorization process. We can 10090// try to vectorize them at the end, if profitable. 10095for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10096 TreeEntry &E = *TE;
10098 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
10099 (!E.hasState() &&
any_of(E.Scalars,
10101 return isa<LoadInst>(V) &&
10102 !isVectorized(V) &&
10103 !isDeleted(cast<Instruction>(V));
10106for (
Value *V : E.Scalars) {
10107auto *LI = dyn_cast<LoadInst>(V);
10113 *
this, V, *DL, *SE, *
TTI,
10114 GatheredLoads[std::make_tuple(
10121// Try to vectorize gathered loads if this is not just a gather of loads. 10122if (!GatheredLoads.
empty())
10123 tryToVectorizeGatheredLoads(GatheredLoads);
10126/// Merges shuffle masks and emits final shuffle instruction, if required. It 10127/// supports shuffling of 2 input vectors. It implements lazy shuffles emission, 10128/// when the actual shuffle instruction is generated only if this is actually 10129/// required. Otherwise, the shuffle instruction emission is delayed till the 10130/// end of the process, to reduce the number of emitted instructions and further 10131/// analysis/transformations. 10133bool IsFinalized =
false;
10142 /// While set, still trying to estimate the cost for the same nodes and we 10143 /// can delay actual cost estimation (virtual shuffle instruction emission). 10144 /// May help better estimate the cost if same nodes must be permuted + allows 10145 /// to move most of the long shuffles cost estimation to TTI. 10146bool SameNodesEstimated =
true;
10155if (
auto *VTy = dyn_cast<VectorType>(Ty))
10169// Found the broadcasting of the single scalar, calculate the cost as 10171constauto *It =
find_if_not(VL, IsaPred<UndefValue>);
10172assert(It != VL.
end() &&
"Expected at least one non-undef value.");
10173// Add broadcast for non-identity shuffle only. 10175count(VL, *It) > 1 &&
10178if (isa<FixedVectorType>(ScalarTy)) {
10183 cast<FixedVectorType>(ScalarTy));
10186 CostKind, std::distance(VL.
begin(), It),
10192 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10199 VecTy, ShuffleMask, CostKind,
10200/*Index=*/0,
/*SubTp=*/nullptr,
10204 (
all_of(Gathers, IsaPred<UndefValue>)
10206 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
10210 /// Compute the cost of creating a vector containing the extracted values from 10214ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10215unsigned NumParts) {
10216assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
10218 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
10219 auto *EE = dyn_cast<ExtractElementInst>(V);
10222 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10225 return std::max(Sz, VecTy->getNumElements());
10227// FIXME: this must be moved to TTI for better estimation. 10231 -> std::optional<TTI::ShuffleKind> {
10232if (NumElts <= EltsPerVector)
10233return std::nullopt;
10235alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10237 if (I == PoisonMaskElem)
10239 return std::min(S, I);
10242int OffsetReg1 = OffsetReg0;
10244// Check that if trying to permute same single/2 input vectors. 10246int FirstRegId = -1;
10247 Indices.assign(1, OffsetReg0);
10251intIdx =
I - OffsetReg0;
10253 (
Idx / NumElts) * NumParts + (
Idx % NumElts) / EltsPerVector;
10255 FirstRegId = RegId;
10256 RegIndices.
insert(RegId);
10257if (RegIndices.
size() > 2)
10258return std::nullopt;
10259if (RegIndices.
size() == 2) {
10261if (Indices.
size() == 1) {
10264 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10266 if (I == PoisonMaskElem)
10268 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10269 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10270 if (RegId == FirstRegId)
10272 return std::min(S, I);
10275 Indices.push_back(OffsetReg1 % NumElts);
10277Idx =
I - OffsetReg1;
10279I = (
Idx % NumElts) % EltsPerVector +
10280 (RegId == FirstRegId ? 0 : EltsPerVector);
10286// Process extracts in blocks of EltsPerVector to check if the source vector 10287// operand can be re-used directly. If not, add the cost of creating a 10288// shuffle to extract the values into a vector register. 10289for (
unsigned Part : seq<unsigned>(NumParts)) {
10290if (!ShuffleKinds[Part])
10293 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
10297 std::optional<TTI::ShuffleKind> RegShuffleKind =
10298 CheckPerRegistersShuffle(SubMask, Indices);
10299if (!RegShuffleKind) {
10302 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
10315 *R.TTI, VL.
front()->getType(),
alignTo(NumElts, EltsPerVector));
10316for (
unsignedIdx : Indices) {
10317assert((
Idx + EltsPerVector) <= BaseVF &&
10318"SK_ExtractSubvector index out of range");
10323// Second attempt to check, if just a permute is better estimated than 10324// subvector extract. 10329if (OriginalCost <
Cost)
10330Cost = OriginalCost;
10334 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given 10335 /// mask \p Mask, register number \p Part, that includes \p SliceSize 10337void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
10339unsigned SliceSize) {
10340if (SameNodesEstimated) {
10341// Delay the cost estimation if the same nodes are reshuffling. 10342// If we already requested the cost of reshuffling of E1 and E2 before, no 10343// need to estimate another cost with the sub-Mask, instead include this 10344// sub-Mask into the CommonMask to estimate it later and avoid double cost 10346if ((InVectors.
size() == 2 &&
10347 cast<const TreeEntry *>(InVectors.
front()) == &E1 &&
10348 cast<const TreeEntry *>(InVectors.
back()) == E2) ||
10349 (!E2 && cast<const TreeEntry *>(InVectors.
front()) == &E1)) {
10350unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
10353"Expected all poisoned elements.");
10355copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
10358// Found non-matching nodes - need to estimate the cost for the matched 10359// and transform mask. 10360Cost += createShuffle(InVectors.
front(),
10361 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
10363 transformMaskAfterShuffle(CommonMask, CommonMask);
10364 }
elseif (InVectors.
size() == 2) {
10365Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10366 transformMaskAfterShuffle(CommonMask, CommonMask);
10368 SameNodesEstimated =
false;
10369if (!E2 && InVectors.
size() == 1) {
10370unsigned VF = E1.getVectorFactor();
10373 cast<FixedVectorType>(V1->
getType())->getNumElements());
10375constauto *E = cast<const TreeEntry *>(InVectors.
front());
10376 VF = std::max(VF, E->getVectorFactor());
10378for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10380 CommonMask[
Idx] = Mask[
Idx] + VF;
10381Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
10382 transformMaskAfterShuffle(CommonMask, CommonMask);
10384autoP = InVectors.
front();
10385Cost += createShuffle(&E1, E2, Mask);
10386unsigned VF = Mask.size();
10391constauto *E = cast<const TreeEntry *>(
P);
10392 VF = std::max(VF, E->getVectorFactor());
10394for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10396 CommonMask[
Idx] =
Idx + (InVectors.
empty() ? 0 : VF);
10397Cost += createShuffle(
P, InVectors.
front(), CommonMask);
10398 transformMaskAfterShuffle(CommonMask, CommonMask);
10402classShuffleCostBuilder {
10405staticbool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
10407return Mask.empty() ||
10408 (VF == Mask.size() &&
10416 ~ShuffleCostBuilder() =
default;
10419// Empty mask or identity mask are free. 10421 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10422if (isEmptyOrIdentity(Mask, VF))
10425 cast<VectorType>(V1->
getType()), Mask);
10428// Empty mask or identity mask are free. 10430 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10431if (isEmptyOrIdentity(Mask, VF))
10434 cast<VectorType>(V1->
getType()), Mask);
10440void resizeToMatch(
Value *&,
Value *&)
const{}
10443 /// Smart shuffle instruction emission, walks through shuffles trees and 10444 /// tries to find the best matching vector for the actual shuffle 10450 ShuffleCostBuilder Builder(
TTI);
10453unsigned CommonVF = Mask.size();
10455auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
10459Type *EScalarTy = E.Scalars.front()->getType();
10460bool IsSigned =
true;
10461if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10463 IsSigned = It->second.second;
10465if (EScalarTy != ScalarTy) {
10466unsigned CastOpcode = Instruction::Trunc;
10467unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10468unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10470 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10478if (isa<Constant>(V))
10480auto *VecTy = cast<VectorType>(V->getType());
10482if (EScalarTy != ScalarTy) {
10484unsigned CastOpcode = Instruction::Trunc;
10485unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10486unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10488 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10495if (!V1 && !V2 && !P2.
isNull()) {
10496// Shuffle 2 entry nodes. 10497const TreeEntry *E = cast<const TreeEntry *>(P1);
10498unsigned VF = E->getVectorFactor();
10499const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10500 CommonVF = std::max(VF, E2->getVectorFactor());
10503return Idx < 2 * static_cast<int>(CommonVF);
10505"All elements in mask must be less than 2 * CommonVF.");
10506if (E->Scalars.size() == E2->Scalars.size()) {
10510for (
int &
Idx : CommonMask) {
10513if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
10515elseif (
Idx >=
static_cast<int>(CommonVF))
10516Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
10520 CommonVF = E->Scalars.size();
10521 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10522 GetNodeMinBWAffectedCost(*E2, CommonVF);
10524 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10525 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10529 }
elseif (!V1 && P2.
isNull()) {
10530// Shuffle single entry node. 10531const TreeEntry *E = cast<const TreeEntry *>(P1);
10532unsigned VF = E->getVectorFactor();
10536 [=](
intIdx) {
return Idx < static_cast<int>(CommonVF); }) &&
10537"All elements in mask must be less than CommonVF.");
10538if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10540assert(!EMask.
empty() &&
"Expected non-empty common mask.");
10541for (
int &
Idx : CommonMask) {
10545 CommonVF = E->Scalars.size();
10546 }
elseif (
unsigned Factor = E->getInterleaveFactor();
10547 Factor > 0 && E->Scalars.size() != Mask.size() &&
10550// Deinterleaved nodes are free. 10551 std::iota(CommonMask.
begin(), CommonMask.
end(), 0);
10553 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10555// Not identity/broadcast? Try to see if the original vector is better. 10556if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10557 CommonVF == CommonMask.
size() &&
10559 [](
constauto &&
P) {
10561static_cast<unsigned>(
P.value()) !=
P.index();
10569 }
elseif (V1 && P2.
isNull()) {
10570// Shuffle single vector. 10571 ExtraCost += GetValueMinBWAffectedCost(V1);
10572 CommonVF = getVF(V1);
10575 [=](
intIdx) {
return Idx < static_cast<int>(CommonVF); }) &&
10576"All elements in mask must be less than CommonVF.");
10577 }
elseif (V1 && !V2) {
10578// Shuffle vector and tree node. 10579unsigned VF = getVF(V1);
10580const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10581 CommonVF = std::max(VF, E2->getVectorFactor());
10584return Idx < 2 * static_cast<int>(CommonVF);
10586"All elements in mask must be less than 2 * CommonVF.");
10587if (E2->Scalars.size() == VF && VF != CommonVF) {
10589assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
10590for (
int &
Idx : CommonMask) {
10593if (
Idx >=
static_cast<int>(CommonVF))
10594Idx = E2Mask[
Idx - CommonVF] + VF;
10598 ExtraCost += GetValueMinBWAffectedCost(V1);
10600 ExtraCost += GetNodeMinBWAffectedCost(
10601 *E2, std::min(CommonVF, E2->getVectorFactor()));
10603 }
elseif (!V1 && V2) {
10604// Shuffle vector and tree node. 10605unsigned VF = getVF(V2);
10606const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10607 CommonVF = std::max(VF, E1->getVectorFactor());
10610return Idx < 2 * static_cast<int>(CommonVF);
10612"All elements in mask must be less than 2 * CommonVF.");
10613if (E1->Scalars.size() == VF && VF != CommonVF) {
10615assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
10616for (
int &
Idx : CommonMask) {
10619if (
Idx >=
static_cast<int>(CommonVF))
10620Idx = E1Mask[
Idx - CommonVF] + VF;
10626 ExtraCost += GetNodeMinBWAffectedCost(
10627 *E1, std::min(CommonVF, E1->getVectorFactor()));
10629 ExtraCost += GetValueMinBWAffectedCost(V2);
10632assert(V1 && V2 &&
"Expected both vectors.");
10633unsigned VF = getVF(V1);
10634 CommonVF = std::max(VF, getVF(V2));
10637return Idx < 2 * static_cast<int>(CommonVF);
10639"All elements in mask must be less than 2 * CommonVF.");
10641 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10642if (V1->
getType() != V2->getType()) {
10646if (cast<VectorType>(V1->
getType())->getElementType() != ScalarTy)
10648if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10652 InVectors.
front() =
10654if (InVectors.
size() == 2)
10656return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10657 V1, V2, CommonMask, Builder, ScalarTy);
10664 : BaseShuffleAnalysis(ScalarTy),
TTI(
TTI),
10665 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10666 CheckedExtracts(CheckedExtracts) {}
10668ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10669unsigned NumParts,
bool &UseVecBaseAsInput) {
10670 UseVecBaseAsInput =
false;
10673Value *VecBase =
nullptr;
10675if (!E->ReorderIndices.empty()) {
10677 E->ReorderIndices.end());
10680// Check if it can be considered reused if same extractelements were 10681// vectorized already. 10682bool PrevNodeFound =
any_of(
10684 [&](
const std::unique_ptr<TreeEntry> &TE) {
10685 return ((TE->hasState() && !TE->isAltShuffle() &&
10686 TE->getOpcode() == Instruction::ExtractElement) ||
10688 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10689 return VL.size() > Data.index() &&
10690 (Mask[Data.index()] == PoisonMaskElem ||
10691 isa<UndefValue>(VL[Data.index()]) ||
10692 Data.value() == VL[Data.index()]);
10697for (
unsigned Part : seq<unsigned>(NumParts)) {
10699ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10702// Ignore non-extractelement scalars. 10703if (isa<UndefValue>(V) ||
10706// If all users of instruction are going to be vectorized and this 10707// instruction itself is not going to be vectorized, consider this 10708// instruction as dead and remove its cost from the final cost of the 10710// Also, avoid adjusting the cost for extractelements with multiple uses 10711// in different graph entries. 10712auto *EE = cast<ExtractElementInst>(V);
10713 VecBase = EE->getVectorOperand();
10714 UniqueBases.
insert(VecBase);
10715const TreeEntry *VE = R.getTreeEntry(V);
10716if (!CheckedExtracts.
insert(V).second ||
10717 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10720 return isa<GetElementPtrInst>(U) &&
10721 !R.areAllUsersVectorized(cast<Instruction>(U),
10729unsignedIdx = *EEIdx;
10730// Take credit for instruction that will become dead. 10731if (EE->hasOneUse() || !PrevNodeFound) {
10733if (isa<SExtInst, ZExtInst>(Ext) &&
10734all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10735// Use getExtractWithExtendCost() to calculate the cost of 10736// extractelement/ext pair. 10739 EE->getVectorOperandType(),
Idx);
10740// Add back the cost of s|zext which is subtracted separately. 10742 Ext->getOpcode(), Ext->getType(), EE->getType(),
10751// Check that gather of extractelements can be represented as just a 10752// shuffle of a single/two vectors the scalars are extracted from. 10753// Found the bunch of extractelement instructions that must be gathered 10754// into a vector and can be represented as a permutation elements in a 10755// single input vector or of 2 input vectors. 10756// Done for reused if same extractelements were vectorized already. 10758Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10761 transformMaskAfterShuffle(CommonMask, CommonMask);
10762 SameNodesEstimated =
false;
10763if (NumParts != 1 && UniqueBases.
size() != 1) {
10764 UseVecBaseAsInput =
true;
10770 /// Checks if the specified entry \p E needs to be delayed because of its 10771 /// dependency nodes. 10772 std::optional<InstructionCost>
10775// No need to delay the cost estimation during analysis. 10776return std::nullopt;
10782return Idx < static_cast<int>(E1.getVectorFactor());
10784"Expected single vector shuffle mask.");
10788if (InVectors.
empty()) {
10789 CommonMask.
assign(Mask.begin(), Mask.end());
10790 InVectors.
assign({&E1, &E2});
10793assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10796if (NumParts == 0 || NumParts >= Mask.size() ||
10797 MaskVecTy->getNumElements() % NumParts != 0 ||
10799 MaskVecTy->getNumElements() / NumParts))
10804unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10805 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10808if (InVectors.
empty()) {
10809 CommonMask.
assign(Mask.begin(), Mask.end());
10810 InVectors.
assign(1, &E1);
10813assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10816if (NumParts == 0 || NumParts >= Mask.size() ||
10817 MaskVecTy->getNumElements() % NumParts != 0 ||
10819 MaskVecTy->getNumElements() / NumParts))
10824unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10825 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
10826if (!SameNodesEstimated && InVectors.
size() == 1)
10829 /// Adds 2 input vectors and the mask for their shuffling. 10831// May come only for shuffling of 2 vectors with extractelements, already 10832// handled in adjustExtracts. 10838auto *EI = cast<ExtractElementInst>(
10839 cast<const TreeEntry *>(InVectors.
front())
10840 ->getOrdered(
P.index()));
10841return EI->getVectorOperand() == V1 ||
10842 EI->getVectorOperand() == V2;
10844"Expected extractelement vectors.");
10846 /// Adds another one input vector and the mask for the shuffling. 10848if (InVectors.
empty()) {
10850"Expected empty input mask/vectors.");
10851 CommonMask.
assign(Mask.begin(), Mask.end());
10852 InVectors.
assign(1, V1);
10856// No need to add vectors here, already handled them in adjustExtracts. 10857assert(InVectors.
size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10858 !CommonMask.
empty() &&
10861Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10862 ->getOrdered(
P.index());
10864returnP.value() == Mask[
P.index()] ||
10865 isa<UndefValue>(Scalar);
10866if (isa<Constant>(V1))
10868auto *EI = cast<ExtractElementInst>(Scalar);
10869return EI->getVectorOperand() == V1;
10871"Expected only tree entry for extractelement vectors.");
10875"Expected only tree entries from extracts/reused buildvectors.");
10876unsigned VF = getVF(V1);
10877if (InVectors.
size() == 2) {
10878Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10879 transformMaskAfterShuffle(CommonMask, CommonMask);
10880 VF = std::max<unsigned>(VF, CommonMask.
size());
10881 }
elseif (
constauto *InTE =
10882 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
10883 VF = std::max(VF, InTE->getVectorFactor());
10886 VF, cast<FixedVectorType>(cast<Value *>(InVectors.
front())->getType())
10887 ->getNumElements());
10890for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10892 CommonMask[
Idx] = Mask[
Idx] + VF;
10895Value *Root =
nullptr) {
10896Cost += getBuildVectorCost(VL, Root);
10898// FIXME: Need to find a way to avoid use of getNullValue here. 10900unsigned VF = VL.
size();
10902 VF = std::min(VF, MaskVF);
10904if (isa<UndefValue>(V)) {
10910if (
auto *VecTy = dyn_cast<FixedVectorType>(Vals.
front()->getType())) {
10912// When REVEC is enabled, we need to expand vector types into scalar 10917Type *ScalarTy = V->getType()->getScalarType();
10919if (isa<PoisonValue>(V))
10921elseif (isa<UndefValue>(V))
10925 std::fill_n(NewVals.
begin() +
I * VecTyNumElements, VecTyNumElements,
10928 Vals.
swap(NewVals);
10934 cast<FixedVectorType>(Root->
getType())->getNumElements()),
10938 /// Finalize emission of the shuffles. 10941ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10947if (InVectors.
size() == 2)
10948Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10950Cost += createShuffle(Vec,
nullptr, CommonMask);
10951 transformMaskAfterShuffle(CommonMask, CommonMask);
10953"Expected vector length for the final value before action.");
10954Value *V = cast<Value *>(Vec);
10955 Action(V, CommonMask);
10956 InVectors.
front() = V;
10958if (!SubVectors.empty()) {
10960if (InVectors.
size() == 2)
10961Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10963Cost += createShuffle(Vec,
nullptr, CommonMask);
10964 transformMaskAfterShuffle(CommonMask, CommonMask);
10965// Add subvectors permutation cost. 10966if (!SubVectorsMask.
empty()) {
10968"Expected same size of masks for subvectors and common mask.");
10970copy(SubVectorsMask, SVMask.begin());
10971for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
10974 I1 = I2 + CommonMask.
size();
10981for (
auto [E,
Idx] : SubVectors) {
10982Type *EScalarTy = E->Scalars.front()->getType();
10983bool IsSigned =
true;
10984if (
auto It =
R.MinBWs.find(E); It !=
R.MinBWs.end()) {
10987 IsSigned = It->second.second;
10989if (ScalarTy != EScalarTy) {
10990unsigned CastOpcode = Instruction::Trunc;
10991unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
10992unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
10994 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
11004if (!CommonMask.
empty()) {
11005 std::iota(std::next(CommonMask.
begin(),
Idx),
11006 std::next(CommonMask.
begin(),
Idx + E->getVectorFactor()),
11012if (!ExtMask.
empty()) {
11013if (CommonMask.
empty()) {
11017for (
intI = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
11020 NewMask[
I] = CommonMask[ExtMask[
I]];
11022 CommonMask.
swap(NewMask);
11025if (CommonMask.
empty()) {
11026assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
11030 createShuffle(InVectors.
front(),
11031 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
11037"Shuffle construction must be finalized.");
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
    return VE;
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}
TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}

/// Builds the arguments types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
static SmallVector<Type *>
buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
                       const unsigned VF, unsigned MinBW,
unsigned MinBW,
11106// If we have computed a smaller type for the expression, update VecTy so 11107// that the costs will be accurate. 11108auto It = MinBWs.
find(E);
11109Type *OrigScalarTy = ScalarTy;
11110if (It != MinBWs.
end()) {
11111auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11117unsigned EntryVF = E->getVectorFactor();
11120if (E->isGather()) {
11123if (isa<InsertElementInst>(VL[0]))
11125if (isa<CmpInst>(VL.
front()))
11126 ScalarTy = VL.
front()->getType();
11127return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11128 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
11132if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11135if (E->getOpcode() == Instruction::Store) {
11136// For stores the order is actually a mask. 11137 NewMask.
resize(E->ReorderIndices.size());
11138copy(E->ReorderIndices, NewMask.
begin());
11144if (!E->ReuseShuffleIndices.empty())
11145::addMask(Mask, E->ReuseShuffleIndices);
11149assert((E->State == TreeEntry::Vectorize ||
11150 E->State == TreeEntry::ScatterVectorize ||
11151 E->State == TreeEntry::StridedVectorize) &&
11155 (E->getOpcode() == Instruction::GetElementPtr &&
11156 E->getMainOp()->getType()->isPointerTy())) &&
11159unsigned ShuffleOrOp =
11160 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
11161if (E->CombinedOp != TreeEntry::NotCombinedOp)
11162 ShuffleOrOp = E->CombinedOp;
11164constunsigned Sz = UniqueValues.
size();
11166for (
unsignedI = 0;
I < Sz; ++
I) {
11167if (isa<Instruction>(UniqueValues[
I]) && getTreeEntry(UniqueValues[
I]) == E)
11169 UsedScalars.set(
I);
11171auto GetCastContextHint = [&](
Value *
V) {
11172if (
const TreeEntry *OpTE = getTreeEntry(V))
11173return getCastContextHint(*OpTE);
11174 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
11175if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11176 !SrcState.isAltShuffle())
11183// Calculate the cost of this instruction. 11185if (isa<CastInst, CallInst>(VL0)) {
11186// For some of the instructions no need to calculate cost for each 11187// particular instruction, we can use the cost of the single 11188// instruction x total number of scalar instructions. 11189 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11191for (
unsignedI = 0;
I < Sz; ++
I) {
11192if (UsedScalars.test(
I))
11194 ScalarCost += ScalarEltCost(
I);
11199// Check if the current node must be resized, if the parent node is not 11203 (E->getOpcode() != Instruction::Load ||
11204 !E->UserTreeIndices.empty())) {
11205const EdgeInfo &EI =
11206 *
find_if(E->UserTreeIndices, [](
const EdgeInfo &EI) {
11207 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11209if (EI.UserTE->getOpcode() != Instruction::Select ||
11211auto UserBWIt = MinBWs.
find(EI.UserTE);
11212Type *UserScalarTy =
11213 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11214if (UserBWIt != MinBWs.
end())
11216 UserBWIt->second.first);
11217if (ScalarTy != UserScalarTy) {
11218unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
11219unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
11221auto *UserVecTy =
getWidenedType(UserScalarTy, E->Scalars.size());
11223 VecOpcode = Instruction::Trunc;
11226 It->second.second ? Instruction::SExt : Instruction::ZExt;
11233LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11234 ScalarCost,
"Calculated costs for Tree"));
11235return VecCost - ScalarCost;
11237// Calculate cost difference from vectorizing set of GEPs. 11238// Negative value means vectorizing is profitable. 11240assert((E->State == TreeEntry::Vectorize ||
11241 E->State == TreeEntry::StridedVectorize) &&
11242"Entry state expected to be Vectorize or StridedVectorize here.");
11246 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
11247LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11248"Calculated GEPs cost for Tree"));
11250return VecCost - ScalarCost;
11257Type *CanonicalType = Ty;
11264 {CanonicalType, CanonicalType});
11267// If the selects are the only uses of the compares, they will be 11268// dead and we can adjust the cost by removing their cost. 11269if (VI && SelectOnly) {
11271"Expected only for scalar type.");
11272auto *CI = cast<CmpInst>(
VI->getOperand(0));
11274 CI->
getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11275CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11276 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11278return IntrinsicCost;
11280switch (ShuffleOrOp) {
11281case Instruction::PHI: {
11282// Count reused scalars. 11285for (
Value *V : UniqueValues) {
11286auto *
PHI = dyn_cast<PHINode>(V);
11291for (
unsignedI = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
11295if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
11297if (!OpTE->ReuseShuffleIndices.empty())
11298 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11299 OpTE->Scalars.size());
11302return CommonCost - ScalarCost;
    case Instruction::ExtractValue:
    case Instruction::ExtractElement: {
11306auto GetScalarCost = [&](
unsignedIdx) {
11307if (isa<PoisonValue>(UniqueValues[
Idx]))
11310auto *
I = cast<Instruction>(UniqueValues[
Idx]);
11312if (ShuffleOrOp == Instruction::ExtractElement) {
11313auto *EE = cast<ExtractElementInst>(
I);
11314 SrcVecTy = EE->getVectorOperandType();
11316auto *EV = cast<ExtractValueInst>(
I);
11317Type *AggregateTy = EV->getAggregateOperand()->getType();
11319if (
auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11320 NumElts = ATy->getNumElements();
11325if (
I->hasOneUse()) {
11327if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11328all_of(
Ext->users(), IsaPred<GetElementPtrInst>)) {
11329// Use getExtractWithExtendCost() to calculate the cost of 11330// extractelement/ext pair. 11333// Subtract the cost of s|zext which is subtracted separately. 11335Ext->getOpcode(),
Ext->getType(),
I->getType(),
11343auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
11344return GetCostDiff(GetScalarCost, GetVectorCost);
11346case Instruction::InsertElement: {
11347assert(E->ReuseShuffleIndices.empty() &&
11348"Unique insertelements only are expected.");
11349auto *SrcVecTy = cast<FixedVectorType>(VL0->
getType());
11350unsignedconst NumElts = SrcVecTy->getNumElements();
11351unsignedconst NumScalars = VL.
size();
11357unsigned OffsetEnd = OffsetBeg;
11358 InsertMask[OffsetBeg] = 0;
11363elseif (OffsetEnd <
Idx)
11365 InsertMask[
Idx] =
I + 1;
11368if (NumOfParts > 0 && NumOfParts < NumElts)
11369 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11370unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11372unsignedOffset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11373unsigned InsertVecSz = std::min<unsigned>(
11375 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11376bool IsWholeSubvector =
11377 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11378// Check if we can safely insert a subvector. If it is not possible, just 11379// generate a whole-sized vector and shuffle the source vector and the new 11381if (OffsetBeg + InsertVecSz > VecSz) {
11382// Align OffsetBeg to generate correct mask. 11384 InsertVecSz = VecSz;
11388// TODO: Add support for Instruction::InsertValue. 11390if (!E->ReorderIndices.empty()) {
11395 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
11397bool IsIdentity =
true;
11399Mask.swap(PrevMask);
11400for (
unsignedI = 0;
I < NumScalars; ++
I) {
11402 DemandedElts.
setBit(InsertIdx);
11403 IsIdentity &= InsertIdx - OffsetBeg ==
I;
11404Mask[InsertIdx - OffsetBeg] =
I;
11406assert(
Offset < NumElts &&
"Failed to find vector index offset");
11410/*Insert*/true,
/*Extract*/false,
11413// First cost - resize to actual vector size if not identity shuffle or 11414// need to shift the vector. 11415// Do not calculate the cost if the actual size is the register size and 11416// we can merge this shuffle with the following SK_Select. 11420 InsertVecTy, Mask);
11421auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
11422 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11424// Second cost - permutation with subvector, if some elements are from the 11425// initial vector or inserting a subvector. 11426// TODO: Implement the analysis of the FirstInsert->getOperand(0) 11427// subvector of ActualVecTy. 11430buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11431if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
11432if (InsertVecSz != VecSz) {
11443for (
unsignedI = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
      unsigned Opcode = ShuffleOrOp;
      unsigned VecOpcode = Opcode;
          (SrcIt != MinBWs.end() || It != MinBWs.end())) {
        // Check if the values are candidates to demote.
        unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
        if (BWSz == SrcBWSz) {
          VecOpcode = Instruction::BitCast;
        } else if (BWSz < SrcBWSz) {
          VecOpcode = Instruction::Trunc;
        } else if (It != MinBWs.end()) {
          assert(BWSz > SrcBWSz && "Invalid cast!");
          VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
        } else if (SrcIt != MinBWs.end()) {
          assert(BWSz > SrcBWSz && "Invalid cast!");
          VecOpcode =
              SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
        } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
                   !SrcIt->second.second) {
          VecOpcode = Instruction::UIToFP;
        assert(Idx == 0 && "Expected 0 index only");
      // Do not count cost here if minimum bitwidth is in effect and it is just
      // a bitcast (here it is just a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
        bool IsArithmeticExtendedReduction =
            E->Idx == 0 && UserIgnoreList &&
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
        if (IsArithmeticExtendedReduction &&
            (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
            VecOpcode == Opcode ? VI : nullptr);
      return GetCostDiff(GetScalarCost, GetVectorCost);
11530case Instruction::FCmp:
11531case Instruction::ICmp:
11532case Instruction::Select: {
11536match(VL0, MatchCmp))
11542auto GetScalarCost = [&](
unsignedIdx) {
11543if (isa<PoisonValue>(UniqueValues[
Idx]))
11546auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
11552 !
match(VI, MatchCmp)) ||
11560 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11562 getOperandInfo(
VI->getOperand(1)), VI);
11565 ScalarCost = IntrinsicCost;
11574CostKind, getOperandInfo(E->getOperand(0)),
11575 getOperandInfo(E->getOperand(1)), VL0);
11576if (
auto *SI = dyn_cast<SelectInst>(VL0)) {
11579unsigned CondNumElements = CondType->getNumElements();
11581assert(VecTyNumElements >= CondNumElements &&
11582 VecTyNumElements % CondNumElements == 0 &&
11583"Cannot vectorize Instruction::Select");
11584if (CondNumElements != VecTyNumElements) {
11585// When the return type is i1 but the source is fixed vector type, we 11586// need to duplicate the condition value. 11593return VecCost + CommonCost;
11595return GetCostDiff(GetScalarCost, GetVectorCost);
11597case TreeEntry::MinMax: {
11598auto GetScalarCost = [&](
unsignedIdx) {
11599return GetMinMaxCost(OrigScalarTy);
11603return VecCost + CommonCost;
11605return GetCostDiff(GetScalarCost, GetVectorCost);
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      auto GetScalarCost = [&](unsigned Idx) {
        if (isa<PoisonValue>(UniqueValues[Idx]))
        auto *VI = cast<Instruction>(UniqueValues[Idx]);
        unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
        if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
          for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
        unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
                                       Op2Info, {}, nullptr, TLI) +
      return GetCostDiff(GetScalarCost, GetVectorCost);
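      // Illustrative sketch (not taken from the source): the ConstantInt check
      // above treats an 'and' with a low-bit mask as free once the lanes are
      // being demoted to a narrower type. For example, assuming a node demoted
      // to i8 (MinBWs width 8):
      //
      //   %m = and i32 %x, 255   ; countr_one(255) = 8 >= 8, so after
      //                          ; truncation to i8 the 'and' is a no-op.
      //
      // A mask such as 127 (countr_one = 7 < 8) would not qualify.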
    case Instruction::GetElementPtr: {
      return CommonCost + GetGEPCostDiff(VL, VL0);
11662case Instruction::Load: {
11663auto GetScalarCost = [&](
unsignedIdx) {
11664auto *
VI = cast<LoadInst>(UniqueValues[
Idx]);
11666VI->getAlign(),
VI->getPointerAddressSpace(),
11669auto *LI0 = cast<LoadInst>(VL0);
11673case TreeEntry::Vectorize:
11674if (
unsigned Factor = E->getInterleaveFactor()) {
11676 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11677 LI0->getPointerAddressSpace(),
CostKind);
11681 Instruction::Load, VecTy, LI0->getAlign(),
11685case TreeEntry::StridedVectorize: {
11686Align CommonAlignment =
11687 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11689 Instruction::Load, VecTy, LI0->getPointerOperand(),
11690/*VariableMask=*/false, CommonAlignment,
CostKind);
11693case TreeEntry::ScatterVectorize: {
11694Align CommonAlignment =
11695 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11697 Instruction::Load, VecTy, LI0->getPointerOperand(),
11698/*VariableMask=*/false, CommonAlignment,
CostKind);
11701case TreeEntry::CombinedVectorize:
11702case TreeEntry::NeedToGather:
11705return VecLdCost + CommonCost;
11709// If this node generates masked gather load then it is not a terminal node. 11710// Hence address operand cost is estimated separately. 11711if (E->State == TreeEntry::ScatterVectorize)
11714// Estimate cost of GEPs since this tree node is a terminator. 11717 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
11718returnCost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11720case Instruction::Store: {
11721bool IsReorder = !E->ReorderIndices.empty();
11722auto GetScalarCost = [=](
unsignedIdx) {
11723auto *
VI = cast<StoreInst>(VL[
Idx]);
11726VI->getAlign(),
VI->getPointerAddressSpace(),
11730 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11732// We know that we can merge the stores. Calculate the cost. 11734if (E->State == TreeEntry::StridedVectorize) {
11735Align CommonAlignment =
11736 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11738 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11739/*VariableMask=*/false, CommonAlignment,
CostKind);
11741assert(E->State == TreeEntry::Vectorize &&
11742"Expected either strided or consecutive stores.");
11743if (
unsigned Factor = E->getInterleaveFactor()) {
11744assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11745"No reused shuffles expected");
11748 Instruction::Store, VecTy, Factor, std::nullopt,
11749 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(),
CostKind);
11753 Instruction::Store, VecTy, BaseSI->getAlign(),
11754 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
11757return VecStCost + CommonCost;
11761unsignedIdx = IsReorder ? E->ReorderIndices[
I] :
I;
11762 PointerOps[
Idx] = cast<StoreInst>(V)->getPointerOperand();
11765return GetCostDiff(GetScalarCost, GetVectorCost) +
11766 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11768case Instruction::Call: {
11769auto GetScalarCost = [&](
unsignedIdx) {
11770auto *CI = cast<CallInst>(UniqueValues[
Idx]);
11781auto *CI = cast<CallInst>(VL0);
11785 It != MinBWs.
end() ? It->second.first : 0,
TTI);
11787return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11789return GetCostDiff(GetScalarCost, GetVectorCost);
11791case Instruction::ShuffleVector: {
11792if (!
SLPReVec || E->isAltShuffle())
11793assert(E->isAltShuffle() &&
11798 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11799"Invalid Shuffle Vector Operand");
11800// Try to find the previous shuffle node with the same operands and same 11801// main/alternate ops. 11802auto TryFindNodeWithEqualOperands = [=]() {
11803for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11806if (
TE->hasState() &&
TE->isAltShuffle() &&
11807 ((
TE->getOpcode() == E->getOpcode() &&
11808TE->getAltOpcode() == E->getAltOpcode()) ||
11809 (
TE->getOpcode() == E->getAltOpcode() &&
11810TE->getAltOpcode() == E->getOpcode())) &&
11811TE->hasEqualOperands(*E))
11816auto GetScalarCost = [&](
unsignedIdx) {
11817if (isa<PoisonValue>(UniqueValues[
Idx]))
11820auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
11821assert(E->isOpcodeOrAlt(VI) &&
"Unexpected main/alternate opcode");
11825// Need to clear CommonCost since the final shuffle cost is included into 11828// VecCost is equal to sum of the cost of creating 2 vectors 11829// and the cost of creating shuffle. 11831if (TryFindNodeWithEqualOperands()) {
11833dbgs() <<
"SLP: diamond match for alternate node found.\n";
11836// No need to add new vector costs here since we're going to reuse 11837// same main/alternate vector ops, just do different shuffling. 11840 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy,
CostKind);
11842 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
11843 }
elseif (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11845 VecCost = TTIRef.getCmpSelInstrCost(
11846 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(),
CostKind,
11847 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11849 VecCost += TTIRef.getCmpSelInstrCost(
11850 E->getOpcode(), VecTy, MaskTy,
11851 cast<CmpInst>(E->getAltOp())->getPredicate(),
CostKind,
11852 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11855Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11858auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
11859unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
11861DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11862if (SrcIt != MinBWs.
end()) {
11863 SrcBWSz = SrcIt->second.first;
11867if (BWSz <= SrcBWSz) {
11870 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11874 <<
"SLP: alternate extension, which should be truncated.\n";
11880 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11883 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11887 E->buildAltOpShuffleMask(
11889assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
        // Patterns like [fadd,fsub] can be combined into a single instruction
        // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
        // need to take into account their order when looking for the most used
        // order.
        unsigned Opcode0 = E->getOpcode();
        unsigned Opcode1 = E->getAltOpcode();
        // If this pattern is supported by the target then we consider the
        // order.
        if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
              VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
          return AltVecCost < VecCost ? AltVecCost : VecCost;
        // TODO: Check the reverse order too.
      if (SLPReVec && !E->isAltShuffle())
11916// If a group uses mask in order, the shufflevector can be 11917// eliminated by instcombine. Then the cost is 0. 11919"Not supported shufflevector usage.");
11920auto *SV = cast<ShuffleVectorInst>(VL.
front());
11921unsigned SVNumElements =
11922 cast<FixedVectorType>(SV->getOperand(0)->getType())
11923 ->getNumElements();
11924unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11929assert(isa<ShuffleVectorInst>(V) &&
11930"Not supported shufflevector usage.");
11931auto *SV = cast<ShuffleVectorInst>(V);
11933 [[maybe_unused]]
bool IsExtractSubvectorMask =
11934 SV->isExtractSubvectorMask(Index);
11935assert(IsExtractSubvectorMask &&
11936"Not supported shufflevector usage.");
11937if (NextIndex != Index)
11939 NextIndex += SV->getShuffleMask().size();
11942 return ::getShuffleCost(
11948return GetCostDiff(GetScalarCost, GetVectorCost);
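      // Illustrative sketch (not from the source): an alternate-opcode node
      // interleaves two opcodes across lanes and is emitted as two vector ops
      // plus one blend. For scalars a0+b0, a1-b1, a2+b2, a3-b3 (assuming fadd
      // as the main op and fsub as the alternate op):
      //
      //   %add = fadd <4 x float> %a, %b
      //   %sub = fsub <4 x float> %a, %b
      //   %res = shufflevector <4 x float> %add, <4 x float> %sub,
      //                        <4 x i32> <i32 0, i32 5, i32 2, i32 7>
      //
      // This is the shape whose cost the VecCost/AltVecCost computation above
      // approximates; buildAltOpShuffleMask produces the blend mask.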
    case Instruction::Freeze:

bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
                    << VectorizableTree.size() << " is fully vectorizable.\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    return TE->isGather() &&
                   [this](Value *V) { return EphValues.contains(V); }) &&
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       (AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))

  if (VectorizableTree.size() != 2)

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather nodes if they have fewer scalar operands than the
  // initial tree element (may be profitable to shuffle the second gather) or
  // they are extractelements, which form shuffle.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
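// Illustrative sketch (not from the source): a typical "tiny" tree of height 2
// that the check above accepts is a vectorizable store node whose single
// operand node gathers the stored values, e.g.:
//
//   store i32 %x, ptr %p
//   store i32 %y, ptr %q      ; with %q contiguous after %p
//
// Here VectorizableTree[0] is the 2-wide store bundle and VectorizableTree[1]
// holds its operands; if that second node is a cheap gather (splat, constants,
// loads or extractelements), the tree is still considered fully vectorizable.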
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the
  // path through operand 0 of any 'or'. Also, peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
                            ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)

  // Check if the input is an extended load of the required or/shift expression.
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||

  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();

  // Everything matched - assume that we can fold the whole sequence using
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
                                    /* MatchOr */ false);
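// Illustrative sketch (not from the source): the or/shl/zext-of-load pattern
// that this load-combine check looks for, in LLVM IR:
//
//   %b0 = zext i8 %l0 to i32
//   %b1 = zext i8 %l1 to i32
//   %s1 = shl i32 %b1, 8          ; shift amount is a multiple of 8
//   %or = or i32 %b0, %s1         ; ... continued for the remaining bytes
//
// The backend is expected to fold such a chain into a single wide load, so the
// SLP cost model conservatively assumes load combining will happen and skips
// vectorizing the tree rooted at this reduction.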
12061// Peek through a final sequence of stores and check if all operations are 12062// likely to be load-combined. 12063unsigned NumElts = Stores.
size();
12064for (
Value *Scalar : Stores) {
  // Graph is empty - do nothing.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");

  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for vectorization; we can skip it if the cost threshold is the
  // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
  // gathers/buildvectors.
  constexpr int Limit = 4;
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
12119// Check if any of the gather node forms an insertelement buildvector 12121bool IsAllowedSingleBVNode =
12122 VectorizableTree.size() > 1 ||
12123 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
12124 !VectorizableTree.front()->isAltShuffle() &&
12125 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12126 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12128if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
12129return TE->isGather() &&
all_of(TE->Scalars, [&](
Value *V) {
12130 return isa<ExtractElementInst, UndefValue>(V) ||
12131 (IsAllowedSingleBVNode &&
12132 !V->hasNUsesOrMore(UsesLimit) &&
12133 any_of(V->users(), IsaPred<InsertElementInst>));
12138if (VectorizableTree.back()->isGather() &&
12139 VectorizableTree.back()->hasState() &&
12140 VectorizableTree.back()->isAltShuffle() &&
12141 VectorizableTree.back()->getVectorFactor() > 2 &&
12143 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12145getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12146 VectorizableTree.back()->getVectorFactor()),
12148/*Insert=*/true,
/*Extract=*/false,
12152// Otherwise, we can't vectorize the tree. It is both tiny and not fully 12159constexprunsigned SmallTree = 3;
12160if (VectorizableTree.front()->isNonPowOf2Vec() &&
12163 [](
const std::unique_ptr<TreeEntry> &TE) {
12164return TE->isGather() && TE->hasState() &&
12165 TE->getOpcode() == Instruction::Load &&
12173 TreeEntry &E = *VectorizableTree[
Idx];
12176if (E.hasState() && E.getOpcode() != Instruction::Load)
12186// Walk from the bottom of the tree to the top, tracking which values are 12187// live. When we see a call instruction that is not part of our tree, 12188// query TTI to see if there is a cost to keeping values live over it 12189// (for example, if spills and fills are required). 12190unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12196// The entries in VectorizableTree are not necessarily ordered by their 12197// position in basic blocks. Collect them and order them by dominance so later 12198// instructions are guaranteed to be visited first. For instructions in 12199// different basic blocks, we only scan to the beginning of the block, so 12200// their order does not matter, as long as all instructions in a basic block 12201// are grouped together. Using dominance ensures a deterministic order. 12203for (
constauto &TEPtr : VectorizableTree) {
12204if (TEPtr->State != TreeEntry::Vectorize)
12206Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12212auto *NodeA = DT->
getNode(
A->getParent());
12213auto *NodeB = DT->
getNode(
B->getParent());
12214assert(NodeA &&
"Should only process reachable instructions");
12215assert(NodeB &&
"Should only process reachable instructions");
12216assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12217"Different nodes should have different DFS numbers");
12219return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12220returnB->comesBefore(
A);
12229// Update LiveValues. 12230 LiveValues.
erase(PrevInst);
12231for (
auto &J : PrevInst->
operands()) {
12232if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12233 LiveValues.
insert(cast<Instruction>(&*J));
12237dbgs() <<
"SLP: #LV: " << LiveValues.
size();
12238for (
auto *
X : LiveValues)
12239dbgs() <<
" " <<
X->getName();
12240dbgs() <<
", Looking at ";
12244// Now find the sequence of instructions between PrevInst and Inst. 12245unsigned NumCalls = 0;
12249while (InstIt != PrevInstIt) {
12250if (PrevInstIt == PrevInst->
getParent()->rend()) {
12251 PrevInstIt = Inst->getParent()->rbegin();
12256if (
auto *
II = dyn_cast<IntrinsicInst>(
I)) {
12257if (
II->isAssumeLikeIntrinsic())
12265if (IntrCost < CallCost)
12271// Debug information does not impact spill cost. 12272if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12273 &*PrevInstIt != PrevInst)
12281for (
auto *
II : LiveValues) {
12282auto *ScalarTy =
II->getType();
12283if (
auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12284 ScalarTy = VectorTy->getElementType();
12296/// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the 12297/// buildvector sequence. 12302constauto *I1 = IE1;
12303constauto *I2 = IE2;
12315if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12317 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12318if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
12320 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12321 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12326/// Returns incoming Value *, if the requested type is Value * too, or a default 12327/// value, otherwise. 12329template <
typename U>
12330static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
12333template <
typename U>
12334static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps:
/// 1. If the Base vector is not an undef vector, resize the very first mask to
/// have a common VF and perform the action for 2 input vectors (including the
/// non-undef Base). Other shuffle masks are combined with the result of the
/// first stage and processed as a shuffle of 2 elements.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combining the masks properly between the steps.
template <typename T>
12358assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
12360auto VMIt = std::next(ShuffleMask.begin());
12363buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12365if (!IsBaseUndef.
all()) {
12366// Base is not undef, need to combine it with the next subvectors. 12367 std::pair<T *, bool> Res =
12368 ResizeAction(ShuffleMask.begin()->first, Mask,
/*ForSingleMask=*/false);
12370for (
unsignedIdx = 0, VF = Mask.size();
Idx < VF; ++
Idx) {
12374 Mask[
Idx] = (Res.second ?
Idx : Mask[
Idx]) + VF;
12376 [[maybe_unused]]
auto *V = ValueSelect::get<T *>(
Base);
12377assert((!V || GetVF(V) == Mask.size()) &&
12378"Expected base vector of VF number of elements.");
12379 Prev = Action(Mask, {
nullptr, Res.first});
12380 }
elseif (ShuffleMask.size() == 1) {
12381// Base is undef and only 1 vector is shuffled - perform the action only for 12382// single vector, if the mask is not the identity mask. 12383 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12384/*ForSingleMask=*/true);
12386// Identity mask is found. 12389 Prev = Action(Mask, {ShuffleMask.begin()->first});
12391// Base is undef and at least 2 input vectors shuffled - perform 2 vectors 12392// shuffles step by step, combining shuffle between the steps. 12393unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12394unsigned Vec2VF = GetVF(VMIt->first);
12395if (Vec1VF == Vec2VF) {
12396// No need to resize the input vectors since they are of the same size, we 12397// can shuffle them directly. 12399for (
unsignedI = 0, VF = Mask.size();
I < VF; ++
I) {
12402 Mask[
I] = SecMask[
I] + Vec1VF;
12405 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12407// Vectors of different sizes - resize and reshuffle. 12408 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12409/*ForSingleMask=*/false);
12410 std::pair<T *, bool> Res2 =
12411 ResizeAction(VMIt->first, VMIt->second,
/*ForSingleMask=*/false);
12413for (
unsignedI = 0, VF = Mask.size();
I < VF; ++
I) {
12420 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
12423 Prev = Action(Mask, {Res1.first, Res2.first});
12425 VMIt = std::next(VMIt);
12427 [[maybe_unused]]
bool IsBaseNotUndef = !IsBaseUndef.
all();
12428// Perform requested actions for the remaining masks/vectors. 12429for (
auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12430// Shuffle other input vectors, if any. 12431 std::pair<T *, bool> Res =
12432 ResizeAction(VMIt->first, VMIt->second,
/*ForSingleMask=*/false);
12434for (
unsignedI = 0, VF = Mask.size();
I < VF; ++
I) {
12437"Multiple uses of scalars.");
12438 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
12443 Prev = Action(Mask, {Prev, Res.first});
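// Illustrative sketch (not from the source): how two shuffle masks are merged
// in the two-vector step above. With Mask selecting from the first vector and
// SecMask from the second (both of VF 4), lanes taken from the second vector
// are rebased by VF before the combined Action is invoked:
//
//   Mask    = {0, 1, poison, poison}
//   SecMask = {poison, poison, 2, 3}
//   merged  = {0, 1, 2 + VF, 3 + VF}   // i.e. {0, 1, 6, 7}
//
// which is exactly the operand layout a two-source shufflevector expects.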
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  /// The parent vectors and shuffle mask for the given list of inserts.

                    << VectorizableTree.size() << ".\n");
  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12467for (
unsignedI = 0, E = VectorizableTree.size();
I < E; ++
I) {
12468 TreeEntry &TE = *VectorizableTree[
I];
12469// No need to count the cost for combined entries, they are combined and 12470// just skip their cost. 12471if (TE.State == TreeEntry::CombinedVectorize) {
12473dbgs() <<
"SLP: Skipping cost for combined node that starts with " 12474 << *TE.Scalars[0] <<
".\n";
12475 TE.dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12478if (TE.isGather() && TE.hasState()) {
12479if (
const TreeEntry *E = getTreeEntry(TE.getMainOp());
12480 E && E->getVectorFactor() == TE.getVectorFactor() &&
12481 E->isSame(TE.Scalars)) {
12482// Some gather nodes might be absolutely the same as some vectorizable 12483// nodes after reordering, need to handle it. 12486 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12491// Exclude cost of gather loads nodes which are not used. These nodes were 12492// built as part of the final attempt to vectorize gathered loads. 12493assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12494"Expected gather nodes with users only.");
12500 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12509 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12512// Keep track {Scalar, Index, User} tuple. 12513// On AArch64, this helps in fusing a mov instruction, associated with 12514// extractelement, with fmul in the backend so that extractelement is free. 12516for (ExternalUser &EU : ExternalUses) {
12517 ScalarUserAndIdx.
emplace_back(EU.Scalar, EU.User, EU.Lane);
12519for (ExternalUser &EU : ExternalUses) {
12520// Uses by ephemeral values are free (because the ephemeral value will be 12521// removed prior to code generation, and so the extraction will be 12522// removed as well). 12523if (EphValues.
count(EU.User))
12526// Used in unreachable blocks or in EH pads (rarely executed) or is 12527// terminated with unreachable instruction. 12529 EU.User ? cast<Instruction>(EU.User)->
getParent() :
nullptr;
12532 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12535// We only add extract cost once for the same scalar. 12536if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12537 !ExtractCostCalculated.
insert(EU.Scalar).second)
12540// No extract cost for vector "scalar" 12541if (isa<FixedVectorType>(EU.Scalar->getType()))
12544// If found user is an insertelement, do not calculate extract cost but try 12545// to detect it as a final shuffled/identity match. 12546if (
auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12548if (
auto *FTy = dyn_cast<FixedVectorType>(VU->
getType())) {
12549if (!UsedInserts.
insert(VU).second)
12553const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12556 [
this, VU](
const ShuffledInsertData<const TreeEntry *> &
Data) {
12557// Checks if 2 insertelements are from the same buildvector. 12561Value *Op0 =
II->getOperand(0);
12562if (getTreeEntry(
II) && !getTreeEntry(Op0))
12568if (It == ShuffledInserts.
end()) {
12570Data.InsertElements.emplace_back(VU);
12572 VecId = ShuffledInserts.
size() - 1;
12573auto It = MinBWs.
find(ScalarTE);
12574if (It != MinBWs.
end() &&
12576 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
12578unsigned BWSz = It->second.first;
12579unsigned DstBWSz =
DL->getTypeSizeInBits(FTy->getElementType());
12582 VecOpcode = Instruction::Trunc;
12585 It->second.second ? Instruction::SExt : Instruction::ZExt;
12590 FTy->getNumElements()),
12593 <<
" for extending externally used vector with " 12594"non-equal minimum bitwidth.\n");
12599 It->InsertElements.front() = VU;
12600 VecId = std::distance(ShuffledInserts.
begin(), It);
12602int InIdx = *InsertIdx;
12604 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12607 Mask[InIdx] = EU.Lane;
12608 DemandedElts[VecId].setBit(InIdx);
12615// If we plan to rewrite the tree in a smaller type, we will need to sign 12616// extend the extracted value back to the original type. Here, we account 12617// for the extract and the added cost of the sign extend if needed. 12620const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12621auto It = MinBWs.
find(Entry);
12622if (It != MinBWs.
end()) {
12625 ? Instruction::ZExt
12626 : Instruction::SExt;
12633 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12635// Leave the scalar instructions as is if they are cheaper than extracts. 12636if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12637 Entry->getOpcode() == Instruction::Load) {
12638// Checks if the user of the external scalar is phi in loop body. 12639auto IsPhiInLoop = [&](
const ExternalUser &U) {
12640if (
auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12641auto *
I = cast<Instruction>(U.Scalar);
12642constLoop *L = LI->getLoopFor(Phi->getParent());
12643return L && (Phi->getParent() ==
I->getParent() ||
12644 L == LI->getLoopFor(
I->getParent()));
12648if (!ValueToExtUses) {
12649 ValueToExtUses.emplace();
12651// Ignore phis in loops. 12652if (IsPhiInLoop(
P.value()))
12655 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
12658// Can use original instruction, if no operands vectorized or they are 12659// marked as externally used already. 12660auto *Inst = cast<Instruction>(EU.Scalar);
12662auto OperandIsScalar = [&](
Value *V) {
12663if (!getTreeEntry(V)) {
12664// Some extractelements might be not vectorized, but 12665// transformed into shuffle and removed from the function, 12666// consider it here. 12667if (
auto *EE = dyn_cast<ExtractElementInst>(V))
12668return !EE->hasOneUse() || !MustGather.contains(EE);
12671return ValueToExtUses->contains(V);
12673bool CanBeUsedAsScalar =
all_of(Inst->operands(), OperandIsScalar);
12674bool CanBeUsedAsScalarCast =
false;
12675if (
auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12676if (
auto *
Op = dyn_cast<Instruction>(CI->
getOperand(0));
12677Op &&
all_of(
Op->operands(), OperandIsScalar)) {
12679 (getTreeEntry(
Op) && !ValueToExtUses->contains(
Op))
12682if (ScalarCost + OpCost <= ExtraCost) {
12683 CanBeUsedAsScalar = CanBeUsedAsScalarCast =
true;
12684 ScalarCost += OpCost;
12688if (CanBeUsedAsScalar) {
12689bool KeepScalar = ScalarCost <= ExtraCost;
12690// Try to keep original scalar if the user is the phi node from the same 12691// block as the root phis, currently vectorized. It allows to keep 12692// better ordering info of PHIs, being vectorized currently. 12693bool IsProfitablePHIUser =
12695 VectorizableTree.front()->Scalars.size() > 2)) &&
12696 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12700 auto *PHIUser = dyn_cast<PHINode>(U);
12701 return (!PHIUser ||
12702 PHIUser->getParent() !=
12704 VectorizableTree.front()->getMainOp())
12709 return ValueToExtUses->contains(V);
12711if (IsProfitablePHIUser) {
12715 (!GatheredLoadsEntriesFirst.has_value() ||
12716 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12717unsigned ScalarUsesCount =
count_if(Entry->Scalars, [&](
Value *V) {
12718 return ValueToExtUses->contains(V);
12720auto It = ExtractsCount.
find(Entry);
12721if (It != ExtractsCount.
end()) {
12722assert(ScalarUsesCount >= It->getSecond().size() &&
12723"Expected total number of external uses not less than " 12724"number of scalar uses.");
12725 ScalarUsesCount -= It->getSecond().size();
12727// Keep original scalar if number of externally used instructions in 12728// the same entry is not power of 2. It may help to do some extra 12729// vectorization for now. 12730 KeepScalar = ScalarUsesCount <= 1 || !
has_single_bit(ScalarUsesCount);
12733 ExternalUsesAsOriginalScalar.
insert(EU.Scalar);
12735 auto It = ValueToExtUses->find(V);
12736 if (It != ValueToExtUses->end()) {
12737// Replace all uses to avoid compiler crash. 12738 ExternalUses[It->second].User = nullptr;
12741 ExtraCost = ScalarCost;
12742if (!IsPhiInLoop(EU))
12743 ExtractsCount[Entry].
insert(Inst);
12744if (CanBeUsedAsScalarCast) {
12745 ScalarOpsFromCasts.
insert(Inst->getOperand(0));
12746// Update the users of the operands of the cast operand to avoid 12748if (
auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12750 auto It = ValueToExtUses->find(V);
12751 if (It != ValueToExtUses->end()) {
12752// Replace all uses to avoid compiler crash. 12753 ExternalUses[It->second].User = nullptr;
12762 ExtractCost += ExtraCost;
12764// Insert externals for extract of operands of casts to be emitted as scalars 12765// instead of extractelement. 12766for (
Value *V : ScalarOpsFromCasts) {
12767 ExternalUsesAsOriginalScalar.
insert(V);
12768if (
const TreeEntry *E = getTreeEntry(V)) {
12769 ExternalUses.emplace_back(V,
nullptr, E->findLaneForValue(V));
12772// Add reduced value cost, if resized. 12773if (!VectorizedVals.
empty()) {
12774const TreeEntry &Root = *VectorizableTree.front();
12775auto BWIt = MinBWs.find(&Root);
12776if (BWIt != MinBWs.end()) {
12777Type *DstTy = Root.Scalars.front()->getType();
12780 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12781if (OriginalSz != SrcSz) {
12782unsigned Opcode = Instruction::Trunc;
12783if (OriginalSz > SrcSz)
12784 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12786if (
auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12798Cost += SpillCost + ExtractCost;
12802unsigned VF =
Mask.size();
12803unsigned VecVF =
TE->getVectorFactor();
12805 (
any_of(Mask, [VF](
intIdx) {
returnIdx >=
static_cast<int>(VF); }) ||
12808 std::copy(
Mask.begin(), std::next(
Mask.begin(), std::min(VF, VecVF)),
12814dbgs() <<
"SLP: Adding cost " <<
C 12815 <<
" for final shuffle of insertelement external users.\n";
12816TE->dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12818return std::make_pair(TE,
true);
12820return std::make_pair(TE,
false);
12822// Calculate the cost of the reshuffled vectors, if any. 12823for (
intI = 0, E = ShuffledInserts.size();
I < E; ++
I) {
12824Value *
Base = ShuffledInserts[
I].InsertElements.front()->getOperand(0);
12825autoVector = ShuffledInserts[
I].ValueMasks.takeVector();
12829assert((TEs.size() == 1 || TEs.size() == 2) &&
12830"Expected exactly 1 or 2 tree entries.");
12831if (TEs.size() == 1) {
12833 VF = TEs.front()->getVectorFactor();
12834auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12838 (
Data.index() < VF &&
12839static_cast<int>(
Data.index()) ==
Data.value());
12844 <<
" for final shuffle of insertelement " 12845"external users.\n";
12846 TEs.front()->
dump();
12847dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12853 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12854 VF = TEs.front()->getVectorFactor();
12858auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12862 <<
" for final shuffle of vector node and external " 12863"insertelement users.\n";
12864if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12865dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12871 (void)performExtractsShuffleAction<const TreeEntry>(
12873 [](
const TreeEntry *E) {
return E->getVectorFactor(); }, ResizeToVF,
12874 EstimateShufflesCost);
12876 cast<FixedVectorType>(
12877 ShuffledInserts[
I].InsertElements.front()->getType()),
12883// Add the cost for reduced value resize (if required). 12884if (ReductionBitWidth != 0) {
12885assert(UserIgnoreList &&
"Expected reduction tree.");
12886const TreeEntry &E = *VectorizableTree.front();
12887auto It = MinBWs.find(&E);
12888if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12889unsigned SrcSize = It->second.first;
12890unsigned DstSize = ReductionBitWidth;
12891unsigned Opcode = Instruction::Trunc;
12892if (SrcSize < DstSize) {
12893bool IsArithmeticExtendedReduction =
12895auto *
I = cast<Instruction>(V);
12896returnis_contained({Instruction::Add, Instruction::FAdd,
12897 Instruction::Mul, Instruction::FMul,
12898 Instruction::And, Instruction::Or,
12902if (IsArithmeticExtendedReduction)
12904 Instruction::BitCast;
// Handle it by getExtendedReductionCost 12906 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12908if (Opcode != Instruction::BitCast) {
12915switch (E.getOpcode()) {
12916case Instruction::SExt:
12917case Instruction::ZExt:
12918case Instruction::Trunc: {
12919const TreeEntry *OpTE = getOperandEntry(&E, 0);
12920 CCH = getCastContextHint(*OpTE);
12930 <<
" for final resize for reduction from " << SrcVecTy
12931 <<
" to " << DstVecTy <<
"\n";
12932dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12941OS <<
"SLP: Spill Cost = " << SpillCost <<
".\n" 12942 <<
"SLP: Extract Cost = " << ExtractCost <<
".\n" 12943 <<
"SLP: Total Cost = " <<
Cost <<
".\n";
12947ViewGraph(
this,
"SLP" +
F->getName(),
false, Str);
/// Tries to find extractelement instructions with constant indices from fixed
/// vector type and gather such instructions into a bunch, which highly likely
/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
12961// Scan list of gathered scalars for extractelements that can be represented 12965for (
intI = 0, E = VL.
size();
I < E; ++
I) {
12966auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
12968if (isa<UndefValue>(VL[
I]))
12972auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12973if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12986 ExtractMask.reset(*
Idx);
12991 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
12993// Sort the vector operands by the maximum number of uses in extractelements. 12996stable_sort(Vectors, [](
constauto &P1,
constauto &P2) {
12997returnP1.second.size() > P2.second.size();
12999// Find the best pair of the vectors or a single vector. 13000constint UndefSz = UndefVectorExtracts.
size();
13001unsigned SingleMax = 0;
13002unsigned PairMax = 0;
13003if (!Vectors.
empty()) {
13004 SingleMax = Vectors.
front().second.size() + UndefSz;
13005if (Vectors.
size() > 1) {
13006auto *ItNext = std::next(Vectors.
begin());
13007 PairMax = SingleMax + ItNext->second.size();
13010if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
13011return std::nullopt;
13012// Check if better to perform a shuffle of 2 vectors or just of a single 13017if (SingleMax >= PairMax && SingleMax) {
13018for (
intIdx : Vectors.
front().second)
13020 }
elseif (!Vectors.
empty()) {
13021for (
unsignedIdx : {0, 1})
13022for (
intIdx : Vectors[
Idx].second)
13025// Add extracts from undefs too. 13026for (
intIdx : UndefVectorExtracts)
13028// Check that gather of extractelements can be represented as just a 13029// shuffle of a single/two vectors the scalars are extracted from. 13030 std::optional<TTI::ShuffleKind> Res =
13033// TODO: try to check other subsets if possible. 13034// Restore the original VL if attempt was not successful. 13036return std::nullopt;
13038// Restore unused scalars from mask, if some of the extractelements were not 13039// selected for shuffle. 13040for (
intI = 0, E = GatheredExtracts.size();
I < E; ++
I) {
13042 isa<UndefValue>(GatheredExtracts[
I])) {
13046auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
13047if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
13048 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
13055/// Tries to find extractelement instructions with constant indices from fixed 13056/// vector type and gather such instructions into a bunch, which highly likely 13057/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was 13058/// successful, the matched scalars are replaced by poison values in \p VL for 13059/// future analysis. 13063unsigned NumParts)
const{
13064assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
13068for (
unsigned Part : seq<unsigned>(NumParts)) {
13069// Scan list of gathered scalars for extractelements that can be represented 13074 std::optional<TTI::ShuffleKind> Res =
13075 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
13076 ShufflesRes[Part] = Res;
13077copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
13079if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
13080return Res.has_value();
13082 ShufflesRes.clear();
13086std::optional<TargetTransformInfo::ShuffleKind>
13087BoUpSLP::isGatherShuffledSingleRegisterEntry(
13091// TODO: currently checking only for Scalars in the tree entry, need to count 13092// reused elements too for better cost estimation. 13093const EdgeInfo &TEUseEI =
TE == VectorizableTree.front().get()
13094 ? EdgeInfo(
const_cast<TreeEntry *
>(TE), 0)
13095 :
TE->UserTreeIndices.front();
13096constInstruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13098// Main node of PHI entries keeps the correct order of operands/incoming 13100if (
auto *
PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13101 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13104 TEInsertBlock = TEInsertPt->
getParent();
13107return std::nullopt;
13108auto *NodeUI = DT->
getNode(TEInsertBlock);
13109assert(NodeUI &&
"Should only process reachable instructions");
13111auto CheckOrdering = [&](
constInstruction *InsertPt) {
13112// Argument InsertPt is an instruction where vector code for some other 13113// tree entry (one that shares one or more scalars with TE) is going to be 13114// generated. This lambda returns true if insertion point of vector code 13115// for the TE dominates that point (otherwise dependency is the other way 13116// around). The other node is not limited to be of a gather kind. Gather 13117// nodes are not scheduled and their vector code is inserted before their 13118// first user. If user is PHI, that is supposed to be at the end of a 13119// predecessor block. Otherwise it is the last instruction among scalars of 13120// the user node. So, instead of checking dependency between instructions 13121// themselves, we check dependency between their insertion points for vector 13122// code (since each scalar instruction ends up as a lane of a vector 13125auto *NodeEUI = DT->
getNode(InsertBlock);
13128assert((NodeUI == NodeEUI) ==
13129 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13130"Different nodes should have different DFS numbers");
13131// Check the order of the gather nodes users. 13132if (TEInsertPt->
getParent() != InsertBlock &&
13135if (TEInsertPt->
getParent() == InsertBlock &&
13140// Find all tree entries used by the gathered values. If no common entries 13141// found - not a shuffle. 13142// Here we build a set of tree nodes for each gathered value and trying to 13143// find the intersection between these sets. If we have at least one common 13144// tree node for each gathered value - we have just a permutation of the 13145// single vector. If we have 2 different sets, we're in situation where we 13146// have a permutation of 2 input vectors. 13149for (
Value *V : VL) {
13152// Build a list of tree entries where V is used. 13154for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13155if (TEPtr == TE || TEPtr->Idx == 0)
13158 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
13159"Must contain at least single gathered value.");
13160assert(TEPtr->UserTreeIndices.size() == 1 &&
13161"Expected only single user of a gather node.");
13162const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13164PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13167 : &getLastInstructionInBundle(UseEI.UserTE);
13168if (TEInsertPt == InsertPt) {
13169// If 2 gathers are operands of the same entry (regardless of whether 13170// user is PHI or else), compare operands indices, use the earlier one 13172if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13174// If the user instruction is used for some reason in different 13175// vectorized nodes - make it depend on index. 13176if (TEUseEI.UserTE != UseEI.UserTE &&
13177 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13181// Check if the user node of the TE comes after user node of TEPtr, 13182// otherwise TEPtr depends on TE. 13183if ((TEInsertBlock != InsertPt->
getParent() ||
13184 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13185 !CheckOrdering(InsertPt))
13189if (
const TreeEntry *VTE = getTreeEntry(V)) {
13190if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13191if (VTE->State != TreeEntry::Vectorize) {
13192auto It = MultiNodeScalars.
find(V);
13193if (It == MultiNodeScalars.
end())
13195 VTE = *It->getSecond().begin();
13196// Iterate through all vectorized nodes. 13197auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
13198 return MTE->State == TreeEntry::Vectorize;
13200if (MIt == It->getSecond().end())
13205if (
none_of(
TE->CombinedEntriesWithIndices,
13206 [&](
constauto &
P) { return P.first == VTE->Idx; })) {
13207Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13208if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13215if (UsedTEs.
empty()) {
13216// The first iteration, just insert the list of nodes to vector. 13220// Need to check if there are any previously used tree nodes which use V. 13221// If there are no such nodes, consider that we have another one input 13226// Do we have a non-empty intersection of previously listed tree entries 13227// and tree entries using current V? 13229if (!VToTEs.
empty()) {
13230// Yes, write the new subset and continue analysis for the next 13235 VToTEs = SavedVToTEs;
13238// No non-empty intersection found - need to add a second set of possible 13241// If the number of input vectors is greater than 2 - not a permutation, 13242// fallback to the regular gather. 13243// TODO: support multiple reshuffled nodes. 13244if (UsedTEs.
size() == 2)
13246 UsedTEs.push_back(SavedVToTEs);
13253if (UsedTEs.
empty()) {
13255return std::nullopt;
13259if (UsedTEs.
size() == 1) {
13260// Keep the order to avoid non-determinism. 13262 UsedTEs.front().
end());
13263sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
13264return TE1->Idx < TE2->Idx;
13266// Try to find the perfect match in another gather node at first. 13267auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
13268return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
13270if (It != FirstEntries.end() &&
13271 ((*It)->getVectorFactor() == VL.size() ||
13272 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
13273TE->ReuseShuffleIndices.size() == VL.size() &&
13274 (*It)->isSame(
TE->Scalars)))) {
13275 Entries.push_back(*It);
13276if ((*It)->getVectorFactor() == VL.size()) {
13277 std::iota(std::next(
Mask.begin(), Part * VL.size()),
13278 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
13283// Clear undef scalars. 13284for (
unsignedI : seq<unsigned>(VL.size()))
13285if (isa<PoisonValue>(VL[
I]))
13289// No perfect match, just shuffle, so choose the first tree node from the 13291 Entries.push_back(FirstEntries.front());
13292 VF = FirstEntries.front()->getVectorFactor();
13294// Try to find nodes with the same vector factor. 13295assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
13296// Keep the order of tree nodes to avoid non-determinism. 13298for (
const TreeEntry *TE : UsedTEs.front()) {
13299unsigned VF =
TE->getVectorFactor();
13300auto It = VFToTE.
find(VF);
13301if (It != VFToTE.
end()) {
13302if (It->second->Idx >
TE->Idx)
13303 It->getSecond() =
TE;
13308// Same, keep the order to avoid non-determinism. 13310 UsedTEs.back().
end());
13311sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
13312return TE1->Idx < TE2->Idx;
13314for (
const TreeEntry *TE : SecondEntries) {
13315auto It = VFToTE.
find(
TE->getVectorFactor());
13316if (It != VFToTE.
end()) {
13318 Entries.push_back(It->second);
13319 Entries.push_back(TE);
13323// No 2 source vectors with the same vector factor - just choose 2 with max 13325if (Entries.empty()) {
13327 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
13328 return TE1->Idx < TE2->Idx;
13330 Entries.push_back(SecondEntries.front());
13331 VF = std::max(Entries.front()->getVectorFactor(),
13332 Entries.back()->getVectorFactor());
13334 VF = Entries.front()->getVectorFactor();
13338bool IsSplatOrUndefs =
isSplat(VL) ||
all_of(VL, IsaPred<UndefValue>);
13339// Checks if the 2 PHIs are compatible in terms of high possibility to be 13341auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
13342auto *
PHI = cast<PHINode>(V);
13343auto *PHI1 = cast<PHINode>(V1);
13344// Check that all incoming values are compatible/from same parent (if they 13345// are instructions). 13346// The incoming values are compatible if they all are constants, or 13347// instruction with the same/alternate opcodes from the same basic block. 13348for (
intI = 0, E =
PHI->getNumIncomingValues();
I < E; ++
I) {
13350Value *In1 = PHI1->getIncomingValue(
I);
13355if (cast<Instruction>(In)->
getParent() !=
13361// Check if the value can be ignored during analysis for shuffled gathers. 13362// We suppose it is better to ignore instruction, which do not form splats, 13363// are not vectorized/not extractelements (these instructions will be handled 13364// by extractelements processing) or may form vector node in future. 13365auto MightBeIgnored = [=](
Value *
V) {
13366auto *
I = dyn_cast<Instruction>(V);
13367returnI && !IsSplatOrUndefs && !ScalarToTreeEntry.
count(
I) &&
13369 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
13371// Check that the neighbor instruction may form a full vector node with the 13372// current instruction V. It is possible, if they have same/alternate opcode 13373// and same parent basic block. 13374auto NeighborMightBeIgnored = [&](
Value *
V,
intIdx) {
13376bool UsedInSameVTE =
false;
13377auto It = UsedValuesEntry.
find(V1);
13378if (It != UsedValuesEntry.
end())
13379 UsedInSameVTE = It->second == UsedValuesEntry.
find(V)->second;
13380returnV != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13382 cast<Instruction>(V)->getParent() ==
13383 cast<Instruction>(V1)->getParent() &&
13384 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13386// Build a shuffle mask for better cost estimation and vector emission. 13389for (
intI = 0, E = VL.size();
I < E; ++
I) {
13391auto It = UsedValuesEntry.
find(V);
13392if (It == UsedValuesEntry.
end())
13394// Do not try to shuffle scalars, if they are constants, or instructions 13395// that can be vectorized as a result of the following vector build 13398 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
13399 (
I != E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
13401unsignedIdx = It->second;
13405// Iterate through all shuffled scalars and select entries, which can be used 13406// for final shuffle. 13408for (
unsignedI = 0, Sz = Entries.size();
I < Sz; ++
I) {
13409if (!UsedIdxs.test(
I))
13411// Fix the entry number for the given scalar. If it is the first entry, set 13412// Pair.first to 0, otherwise to 1 (currently select at max 2 nodes). 13413// These indices are used when calculating final shuffle mask as the vector 13415for (std::pair<unsigned, int> &Pair : EntryLanes)
13417 Pair.first = TempEntries.
size();
13420 Entries.swap(TempEntries);
13421if (EntryLanes.size() == Entries.size() &&
13423 .
slice(Part * VL.size(),
13424 std::min<int>(VL.size(),
TE->Scalars.size())))) {
13425// We may have here 1 or 2 entries only. If the number of scalars is equal 13426// to the number of entries, no need to do the analysis, it is not very 13427// profitable. Since VL is not the same as TE->Scalars, it means we already 13428// have some shuffles before. Cut off not profitable case. 13430return std::nullopt;
13432// Build the final mask, check for the identity shuffle, if possible. 13433bool IsIdentity = Entries.size() == 1;
13434// Pair.first is the offset to the vector, while Pair.second is the index of 13435// scalar in the list. 13436for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
13437unsignedIdx = Part * VL.size() + Pair.second;
13440 (ForOrder ? std::distance(
13441 Entries[Pair.first]->Scalars.begin(),
13442find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13443 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13444 IsIdentity &=
Mask[
Idx] == Pair.second;
13446if (ForOrder || IsIdentity || Entries.empty()) {
13447switch (Entries.size()) {
13449if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13453if (EntryLanes.size() > 2 || VL.size() <= 2)
13459 }
elseif (!isa<VectorType>(VL.front()->getType()) &&
13460 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13461// Do the cost estimation if shuffle beneficial than buildvector. 13463 std::next(
Mask.begin(), (Part + 1) * VL.size()));
13464int MinElement = SubMask.
front(), MaxElement = SubMask.
front();
13465for (
intIdx : SubMask) {
13473assert(MaxElement >= 0 && MinElement >= 0 &&
13474 MaxElement % VF >= MinElement % VF &&
13475"Expected at least single element.");
13476unsigned NewVF = std::max<unsigned>(
13478 (MaxElement % VF) -
13479 (MinElement % VF) + 1));
13484Idx = ((
Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
13485 (
Idx >=
static_cast<int>(VF) ? NewVF : 0);
13493auto *MaskVecTy =
getWidenedType(VL.front()->getType(), SubMask.size());
13494auto GetShuffleCost = [&,
13498if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13500 Mask, Entries.front()->getInterleaveFactor()))
13502 return ::getShuffleCost(
TTI,
13507InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13510if (Entries.size() == 1 || !Entries[0]->isGather()) {
13511 FirstShuffleCost = ShuffleCost;
13513// Transform mask to include only first entry. 13515bool IsIdentity =
true;
13517if (
Idx >=
static_cast<int>(NewVF)) {
13522 IsIdentity &=
static_cast<int>(
I) ==
Idx;
13526 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13528 MaskVecTy, DemandedElts,
/*Insert=*/true,
13533if (Entries.size() == 1 || !Entries[1]->isGather()) {
13534 SecondShuffleCost = ShuffleCost;
13536// Transform mask to include only first entry. 13538bool IsIdentity =
true;
13540if (
Idx <
static_cast<int>(NewVF) &&
Idx >= 0) {
13546 IsIdentity &=
static_cast<int>(
I) ==
Idx;
13551 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13553 MaskVecTy, DemandedElts,
/*Insert=*/true,
13563const TreeEntry *BestEntry =
nullptr;
13564if (FirstShuffleCost < ShuffleCost) {
13565 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
13566 std::next(
Mask.begin(), (Part + 1) * VL.size()),
13568 if (Idx >= static_cast<int>(VF))
13569 Idx = PoisonMaskElem;
13571 BestEntry = Entries.front();
13572 ShuffleCost = FirstShuffleCost;
13574if (SecondShuffleCost < ShuffleCost) {
13575 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
13576 std::next(
Mask.begin(), (Part + 1) * VL.size()),
13578 if (Idx < static_cast<int>(VF))
13579 Idx = PoisonMaskElem;
13583 BestEntry = Entries[1];
13584 ShuffleCost = SecondShuffleCost;
13586if (BuildVectorCost >= ShuffleCost) {
13589 Entries.push_back(BestEntry);
13596// Clear the corresponding mask elements. 13597 std::fill(std::next(
Mask.begin(), Part * VL.size()),
13599return std::nullopt;
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(VectorizableTree,
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndices.size() == 1 ||
          TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (!TE->UserTreeIndices.empty() &&
      TE->UserTreeIndices.front().UserTE->isGather() &&
      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
    assert(((TE->hasState() &&
             TE->getOpcode() == Instruction::ExtractElement) ||
            isSplat(TE->Scalars)) &&
           "Expected splat or extractelements only node.");
    return {};
  }
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TTI::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res, [](const std::optional<TTI::ShuffleKind> &SK) {
        return !SK;
      })) {
    Entries.clear();
    return {};
  }
  return Res;
}
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  APInt ShuffledElements = APInt::getZero(VL.size());
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
    }
    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                    I, Constant::getNullValue(VecTy), V);
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBit(I);
      continue;
    }
    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }
    DuplicateNonConst = true;
    ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
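  // Added illustrative example (not from the upstream sources): for
  // VL = {a, b, a, c} the repeated 'a' is not inserted a second time; lane 2
  // of ShuffleMask points back at lane 0, so the estimate becomes three
  // insertelements plus one final single-source permute rather than four
  // insertelements.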
  if (ForPoisonSrc) {
    if (isa<FixedVectorType>(ScalarTy)) {
      // We don't need to insert elements one by one. Instead, we can insert
      // the entire vector into the destination.
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      for (unsigned I : seq<unsigned>(VL.size()))
        if (!ShuffledElements[I])
          Cost += TTI->getShuffleCost(
              TTI::SK_InsertSubvector, VecTy, {}, CostKind,
              I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
    } else {
      Cost += TTI->getScalarizationOverhead(VecTy,
                                            /*DemandedElts*/ ~ShuffledElements,
                                            /*Insert*/ true,
                                            /*Extract*/ false, CostKind);
    }
  }
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);
  return Cost;
}
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
  if (Res)
    return *Res;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices or gathered loads).
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           E->getOpcode() == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (E->getOpcode() == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (GatheredLoadsEntriesFirst.has_value() &&
               E->getOpcode() == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };
  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              isVectorLikeInstWithConstOps(I)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  // Set insertpoint for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      E->getOpcode() == Instruction::Load) {
    Res = FindFirstInst();
    return *Res;
  }

  // Set the insert point to the beginning of the basic block if the entry
  // should not be scheduled.
  if ((E->getOpcode() == Instruction::GetElementPtr &&
       any_of(E->Scalars,
              [](Value *V) {
                return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
              })) ||
      all_of(E->Scalars,
             [](Value *V) {
               return isa<PoisonValue>(V) ||
                      (!isVectorLikeInstWithConstOps(V) &&
                       isUsedOutsideBlock(V));
             }) ||
      (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
         return isa<ExtractElementInst, UndefValue>(V) ||
                areAllOperandsNonInsts(V);
       })))
    Res = FindLastInst();
  else
    Res = FindFirstInst();

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (BlocksSchedules.count(BB) && !E->isGather()) {
    Value *V = E->isOneOf(E->Scalars.back());
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        Res = Bundle->Inst;
  }

  // LastInst can still be null at this point if there's either not an entry
  // for BB in BlocksSchedules or there's no ScheduleData available for
  // VL.back(). This can be the case if buildTree_rec aborts for various
  // reasons (e.g., the maximum recursion depth is reached, the maximum region
  // size is reached, etc.). ScheduleData is initialized in the scheduling
  // "dry-run".
  //
  // If this happens, we can still find the last instruction by brute force. We
  // iterate forwards from Front (inclusive) until we either see all
  // instructions in the bundle or reach the end of the block. If Front is the
  // last instruction in program order, LastInst will be set to Front, and we
  // will visit all the remaining instructions in the block.
  //
  // One of the reasons we exit early from buildTree_rec is to place an upper
  // bound on compile-time. Thus, taking an additional compile-time hit here is
  // not ideal. However, this should be exceedingly rare since it requires that
  // we both exit early from buildTree_rec and that the bundle be out-of-order
  // (causing us to iterate all the way to the end of the block).
  if (!Res)
    Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  return *Res;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || doesNotNeedToSchedule(E->Scalars)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(LastInst->getParent(),
                           std::next(LastInst->getIterator()));
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from current block and/or the blocks which are
  // part of the current loop. These instructions will be inserted at the end to
  // make it possible to optimize loops and hoist invariant instructions out of
  // the loops body with better chances for success.
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }
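  // Added illustrative note (not from the upstream sources): if the insertion
  // point is inside a loop and VL mixes loop-invariant scalars with a value
  // defined in that same loop, the in-loop lane is recorded in
  // PostponedIndices and its insertelement is emitted last, so the chain of
  // invariant insertelements built before it can still be hoisted out of the
  // loop.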
  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        Value *Op = CI->getOperand(0);
        if (auto *IOp = dyn_cast<Instruction>(Op);
            !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
          Scalar = Op;
      }
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }
    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      auto *II = dyn_cast<IntrinsicInst>(Vec);
      if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
        return Vec;
      InsElt = II;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (auto *SI = dyn_cast<Instruction>(Scalar))
          UserOp = SI;
        if (UserOp) {
          unsigned FoundLane = Entry->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, FoundLane);
        }
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->hasNUses(0) &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, in the end to make
  // it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  return Vec;
}

/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
/// when the actual shuffle instruction is generated only if this is actually
/// required. Otherwise, the shuffle instruction emission is delayed till the
/// end of the process, to reduce the number of emitted instructions and further
/// analysis/transformations.
/// The class also will look through the previously emitted shuffle instructions
/// and properly mark indices in mask as undef.
/// For example, given the code
///
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
///
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
/// will look through %s1 and %s2 and emit
///
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
///
/// If 2 operands are of different size, the smallest one will be resized and
/// the mask recalculated properly.
/// For example, given the code
///
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
///
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
/// will look through %s1 and %s2 and emit
///
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
class ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks. It is built during
  /// analysis and actual emission of shuffle vector instructions.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
  /// operands; if the 3rd is going to be added, the first 2 are combined into a
  /// shuffle with \p CommonMask mask, the first operand is set to be the
  /// resulting shuffle and the second operand is set to be the newly added
  /// operand. The \p CommonMask is transformed in the proper way after that.
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
14110 /// Holds all of the instructions that we gathered. 14112 /// A list of blocks that we are going to CSE. 14121 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14122 CSEBlocks(CSEBlocks),
DL(
DL) {}
14123 ~ShuffleIRBuilder() =
default;
14124 /// Creates shufflevector for the 2 operands with the given mask. 14126if (V1->
getType() != V2->getType()) {
14129"Expected integer vector types only.");
14130if (V1->
getType() != V2->getType()) {
14131if (cast<VectorType>(V2->getType())
14133 ->getIntegerBitWidth() < cast<VectorType>(V1->
getType())
14135 ->getIntegerBitWidth())
14144if (
auto *
I = dyn_cast<Instruction>(Vec)) {
14145 GatherShuffleExtractSeq.
insert(
I);
14146 CSEBlocks.
insert(
I->getParent());
14150 /// Creates permutation of the single vector operand with the given mask, if 14151 /// it is not identity mask. 14155unsigned VF = Mask.size();
14156unsigned LocalVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
14160if (
auto *
I = dyn_cast<Instruction>(Vec)) {
14161 GatherShuffleExtractSeq.
insert(
I);
14162 CSEBlocks.
insert(
I->getParent());
14167Value *createPoison(
Type *Ty,
unsigned VF) {
14170 /// Resizes 2 input vector to match the sizes, if the they are not equal 14171 /// yet. The smallest vector is resized to the size of the larger vector. 14173if (V1->
getType() == V2->getType())
14175int V1VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
14176int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14177int VF = std::max(V1VF, V2VF);
14178int MinVF = std::min(V1VF, V2VF);
14180 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
14182Value *&
Op = MinVF == V1VF ? V1 : V2;
14184if (
auto *
I = dyn_cast<Instruction>(
Op)) {
14185 GatherShuffleExtractSeq.
insert(
I);
14186 CSEBlocks.
insert(
I->getParent());
14195 /// Smart shuffle instruction emission, walks through shuffles trees and 14196 /// tries to find the best matching vector for the actual shuffle 14199assert(V1 &&
"Expected at least one vector value.");
14200 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14201 R.CSEBlocks, *R.DL);
14202return BaseShuffleAnalysis::createShuffle<Value *>(
14203 V1, V2, Mask, ShuffleBuilder, ScalarTy);
14206 /// Cast value \p V to the vector type with the same number of elements, but 14207 /// the base type \p ScalarTy. 14209 std::optional<bool> IsSigned = std::nullopt) {
14210auto *VecTy = cast<VectorType>(V->getType());
14221 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14223 /// Adjusts extractelements after reusing them. 14225ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14226unsigned NumParts,
bool &UseVecBaseAsInput) {
14227 UseVecBaseAsInput =
false;
14229Value *VecBase =
nullptr;
14231if (!E->ReorderIndices.empty()) {
14233 E->ReorderIndices.end());
14236for (
intI = 0, Sz = Mask.size();
I < Sz; ++
I) {
14240auto *EI = cast<ExtractElementInst>(VL[
I]);
14241 VecBase = EI->getVectorOperand();
14242if (
const TreeEntry *TE = R.getTreeEntry(VecBase))
14243 VecBase = TE->VectorizedValue;
14244assert(VecBase &&
"Expected vectorized value.");
14245 UniqueBases.
insert(VecBase);
14246// If the only one use is vectorized - can delete the extractelement 14248if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14249 (NumParts != 1 &&
count(VL, EI) > 1) ||
14251 const TreeEntry *UTE = R.getTreeEntry(U);
14252 return !UTE || R.MultiNodeScalars.contains(U) ||
14253 (isa<GetElementPtrInst>(U) &&
14254 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14255 count_if(R.VectorizableTree,
14256 [&](const std::unique_ptr<TreeEntry> &TE) {
14257 return any_of(TE->UserTreeIndices,
14258 [&](const EdgeInfo &Edge) {
14259 return Edge.UserTE == UTE;
14261 is_contained(VL, EI);
14265 R.eraseInstruction(EI);
14267if (NumParts == 1 || UniqueBases.
size() == 1) {
14268assert(VecBase &&
"Expected vectorized value.");
14269return castToScalarTyElem(VecBase);
14271 UseVecBaseAsInput =
true;
14277// Perform multi-register vector shuffle, joining them into a single virtual 14279// Need to shuffle each part independently and then insert all this parts 14280// into a long virtual vector register, forming the original vector. 14281Value *Vec =
nullptr;
14284for (
unsigned Part : seq<unsigned>(NumParts)) {
14288constexprint MaxBases = 2;
14290auto VLMask =
zip(SubVL, SubMask);
14291constunsigned VF = std::accumulate(
14292 VLMask.begin(), VLMask.end(), 0U, [&](
unsigned S,
constauto &
D) {
14293 if (std::get<1>(D) == PoisonMaskElem)
14296 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14297 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14298 VecOp = TE->VectorizedValue;
14299 assert(VecOp &&
"Expected vectorized value.");
14300 const unsigned Size =
14301 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14302 return std::max(S, Size);
14304for (
constauto [V,
I] : VLMask) {
14307Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14308if (
const TreeEntry *TE = R.getTreeEntry(VecOp))
14309 VecOp = TE->VectorizedValue;
14310assert(VecOp &&
"Expected vectorized value.");
14311 VecOp = castToScalarTyElem(VecOp);
14312 Bases[
I / VF] = VecOp;
14318 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14319 TransformToIdentity(SubMask);
14321 SubVec = Bases.front();
14328Mask.slice(
P * SliceSize,
14335"Expected first part or all previous parts masked.");
14336copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14339 cast<FixedVectorType>(Vec->
getType())->getNumElements();
14342 cast<FixedVectorType>(SubVec->
getType())->getNumElements();
14343 NewVF = std::max(NewVF, SubVecVF);
14346for (
int &
Idx : SubMask)
14349copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14350 Vec = createShuffle(Vec, SubVec, VecMask);
14351 TransformToIdentity(VecMask);
14357 /// Checks if the specified entry \p E needs to be delayed because of its 14358 /// dependency nodes. 14359 std::optional<Value *>
14362// No need to delay emission if all deps are ready. 14365 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
14367return std::nullopt;
14368// Postpone gather emission, will be emitted after the end of the 14369// process to keep correct order. 14376 /// Adds 2 input vectors (in form of tree entries) and the mask for their 14379Value *V1 = E1.VectorizedValue;
14381 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
14382 if (isa<PoisonValue>(V))
14384 return !isKnownNonNegative(
14385 V, SimplifyQuery(*R.DL));
14387Value *V2 = E2.VectorizedValue;
14388if (V2->getType()->isIntOrIntVectorTy())
14389 V2 = castToScalarTyElem(V2,
any_of(E2.Scalars, [&](
Value *V) {
14390 if (isa<PoisonValue>(V))
14392 return !isKnownNonNegative(
14393 V, SimplifyQuery(*R.DL));
14397 /// Adds single input vector (in form of tree entry) and the mask for its 14400Value *V1 = E1.VectorizedValue;
14402 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
14403 if (isa<PoisonValue>(V))
14405 return !isKnownNonNegative(
14406 V, SimplifyQuery(*R.DL));
14410 /// Adds 2 input vectors and the mask for their shuffling. 14412assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
14414 isa<FixedVectorType>(V2->getType()) &&
14415"castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14416 V1 = castToScalarTyElem(V1);
14417 V2 = castToScalarTyElem(V2);
14418if (InVectors.
empty()) {
14421 CommonMask.
assign(Mask.begin(), Mask.end());
14425if (InVectors.
size() == 2) {
14426 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14427 transformMaskAfterShuffle(CommonMask, CommonMask);
14428 }
elseif (cast<FixedVectorType>(Vec->
getType())->getNumElements() !=
14430 Vec = createShuffle(Vec,
nullptr, CommonMask);
14431 transformMaskAfterShuffle(CommonMask, CommonMask);
14433 V1 = createShuffle(V1, V2, Mask);
14434unsigned VF = std::max(getVF(V1), getVF(Vec));
14435for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14437 CommonMask[
Idx] =
Idx + VF;
14438 InVectors.
front() = Vec;
14439if (InVectors.
size() == 2)
14440 InVectors.
back() = V1;
14444 /// Adds another one input vector and the mask for the shuffling. 14447"castToScalarTyElem expects V1 to be FixedVectorType");
14448 V1 = castToScalarTyElem(V1);
14449if (InVectors.
empty()) {
14451 CommonMask.
assign(Mask.begin(), Mask.end());
14454constauto *It =
find(InVectors, V1);
14455if (It == InVectors.
end()) {
14456if (InVectors.
size() == 2 ||
14459if (InVectors.
size() == 2) {
14460 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
14461 transformMaskAfterShuffle(CommonMask, CommonMask);
14462 }
elseif (cast<FixedVectorType>(V->getType())->getNumElements() !=
14463 CommonMask.
size()) {
14464 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
14465 transformMaskAfterShuffle(CommonMask, CommonMask);
14467unsigned VF = std::max(CommonMask.
size(), Mask.size());
14468for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14471 V->getType() != V1->
getType()
14473 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
14474 ->getNumElements();
14475if (V->getType() != V1->
getType())
14476 V1 = createShuffle(V1,
nullptr, Mask);
14477 InVectors.
front() = V;
14478if (InVectors.
size() == 2)
14479 InVectors.
back() = V1;
14484// Check if second vector is required if the used elements are already 14485// used from the first one. 14486for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14493for (
Value *V : InVectors)
14494 VF = std::max(VF, getVF(V));
14495for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14497 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.begin() ? 0 : VF);
14499 /// Adds another one input vector and the mask for the shuffling. 14506Value *Root =
nullptr) {
14507return R.gather(VL, Root, ScalarTy,
14509return createShuffle(V1, V2, Mask);
14513 /// Finalize emission of the shuffles. 14514 /// \param Action the action (if any) to be performed before final applying of 14515 /// the \p ExtMask mask. 14518ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14524if (InVectors.
size() == 2) {
14525 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14528 Vec = createShuffle(Vec,
nullptr, CommonMask);
14530 transformMaskAfterShuffle(CommonMask, CommonMask);
14532"Expected vector length for the final value before action.");
14533unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
14536 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14537 Vec = createShuffle(Vec,
nullptr, ResizeMask);
14539 Action(Vec, CommonMask);
14540 InVectors.
front() = Vec;
14542if (!SubVectors.empty()) {
14544if (InVectors.
size() == 2) {
14545 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14548 Vec = createShuffle(Vec,
nullptr, CommonMask);
14550 transformMaskAfterShuffle(CommonMask, CommonMask);
14551auto CreateSubVectors = [&](
Value *Vec,
14553for (
auto [E,
Idx] : SubVectors) {
14554Value *
V = E->VectorizedValue;
14555if (
V->getType()->isIntOrIntVectorTy())
14556 V = castToScalarTyElem(V,
any_of(E->Scalars, [&](
Value *V) {
14557 if (isa<PoisonValue>(V))
14559 return !isKnownNonNegative(
14560 V, SimplifyQuery(*R.DL));
14564 Builder, Vec, V, InsertionIndex,
14565 std::bind(&ShuffleInstructionBuilder::createShuffle,
this, _1, _2,
14567if (!CommonMask.
empty()) {
14568 std::iota(std::next(CommonMask.
begin(),
Idx),
14569 std::next(CommonMask.
begin(),
Idx + E->getVectorFactor()),
14575if (SubVectorsMask.
empty()) {
14576 Vec = CreateSubVectors(Vec, CommonMask);
14579copy(SubVectorsMask, SVMask.begin());
14580for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
14583I1 = I2 + CommonMask.
size();
14588 Vec = createShuffle(InsertVec, Vec, SVMask);
14589 transformMaskAfterShuffle(CommonMask, SVMask);
14591 InVectors.
front() = Vec;
14594if (!ExtMask.
empty()) {
14595if (CommonMask.
empty()) {
14599for (
intI = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
14602 NewMask[
I] = CommonMask[ExtMask[
I]];
14604 CommonMask.
swap(NewMask);
14607if (CommonMask.
empty()) {
14608assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
14609return InVectors.
front();
14611if (InVectors.
size() == 2)
14612return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
14613return createShuffle(InVectors.
front(),
nullptr, CommonMask);
14618"Shuffle construction must be finalized.");
14622BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(
const TreeEntry *E,
14626// Special processing for GEPs bundle, which may include non-gep values. 14627if (!S && VL.
front()->getType()->isPointerTy()) {
14628constauto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
14634auto CheckSameVE = [&](
const TreeEntry *VE) {
14635return VE->isSame(VL) &&
14636 (
any_of(VE->UserTreeIndices,
14637 [E, NodeIdx](
const EdgeInfo &EI) {
14638 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14641 [E, NodeIdx, VE](
const std::unique_ptr<TreeEntry> &TE) {
14642return TE->isOperandGatherNode(
14643 {
const_cast<TreeEntry *
>(E), NodeIdx}) &&
14644 VE->isSame(TE->Scalars);
14647 TreeEntry *VE = getTreeEntry(S.getMainOp());
14648if (VE && CheckSameVE(VE))
14650auto It = MultiNodeScalars.
find(S.getMainOp());
14651if (It != MultiNodeScalars.
end()) {
14652auto *
I =
find_if(It->getSecond(), [&](
const TreeEntry *TE) {
14653 return TE != VE && CheckSameVE(TE);
14655if (
I != It->getSecond().end())
14661Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
14662bool PostponedPHIs) {
14664constunsigned VF = VL.size();
14665if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14667// V may be affected by MinBWs. 14668// We want ShuffleInstructionBuilder to correctly support REVEC. The key 14669// factor is the number of elements, not their type. 14670Type *ScalarTy = cast<VectorType>(
V->getType())->getElementType();
14672 ShuffleInstructionBuilder ShuffleBuilder(
14676 ShuffleBuilder.add(V, Mask);
14678 E->CombinedEntriesWithIndices.size());
14679transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14680 [&](
constauto &
P) {
14681 return std::make_pair(VectorizableTree[P.first].get(),
14684assert((E->CombinedEntriesWithIndices.empty() ||
14685 E->ReorderIndices.empty()) &&
14686"Expected either combined subnodes or reordering");
14687return ShuffleBuilder.finalize({}, SubVectors, {});
14691 cast<FixedVectorType>(
V->getType())->getNumElements()) {
    if (!VE->ReuseShuffleIndices.empty()) {
      // Reshuffle to get only unique values.
      // If some of the scalars are duplicated in the vectorization
      // tree entry, we do not vectorize them but instead generate a
      // mask for the reuses. But if there are several users of the
      // same entry, they may have different vectorization factors.
      // This is especially important for PHI nodes. In this case, we
      // need to adapt the resulting instruction for the user
      // vectorization factor and have to reshuffle it again to take
      // only unique elements of the vector. Without this code the
      // function incorrectly returns reduced vector instruction with
      // the same elements, not with the unique ones.
      //
      // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
      // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
      //
      // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
      SmallVector<int> Mask(VF, PoisonMaskElem);
      for (auto [I, V] : enumerate(VL)) {
        if (isa<PoisonValue>(V))
          continue;
        Mask[I] = VE->findLaneForValue(V);
      }
      V = FinalShuffle(V, Mask);
14719assert(VF < cast<FixedVectorType>(
V->getType())->getNumElements() &&
14720"Expected vectorization factor less " 14721"than original vector size.");
14723 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14724V = FinalShuffle(V, UniformMask);
14727// Need to update the operand gather node, if actually the operand is not a 14728// vectorized node, but the buildvector/gather node, which matches one of 14729// the vectorized nodes. 14730if (
find_if(VE->UserTreeIndices, [&](
const EdgeInfo &EI) {
14731 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14732 }) == VE->UserTreeIndices.end()) {
14734find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
14735returnTE->isGather() &&
TE->UserTreeIndices.front().UserTE == E &&
14736TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14738assert(It != VectorizableTree.end() &&
"Expected gather node operand.");
14739 (*It)->VectorizedValue =
V;
14744// Find the corresponding gather entry and vectorize it. 14745// Allows to be more accurate with tree/graph transformations, checks for the 14746// correctness of the transformations in many cases. 14748 [E, NodeIdx](
const std::unique_ptr<TreeEntry> &TE) {
14749returnTE->isOperandGatherNode({E, NodeIdx});
14751assert(
I != VectorizableTree.end() &&
"Gather node is not in the graph.");
14752assert(
I->get()->UserTreeIndices.size() == 1 &&
14753"Expected only single user for the gather node.");
14754assert(
I->get()->isSame(VL) &&
"Expected same list of scalars.");
14758template <
typename BVTy,
typename ResTy,
typename...
Args>
14759ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
14761assert(E->isGather() &&
"Expected gather node.");
14762unsigned VF = E->getVectorFactor();
14764bool NeedFreeze =
false;
14766 E->ReuseShuffleIndices.end());
14768// Clear values, to be replaced by insertvector instructions. 14769for (
auto [EIdx,
Idx] : E->CombinedEntriesWithIndices)
14771 .slice(
Idx, VectorizableTree[EIdx]->getVectorFactor()),
14774 E->CombinedEntriesWithIndices.size());
14775transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14776 [&](
constauto &
P) {
14777 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14779// Build a mask out of the reorder indices and reorder scalars per this 14782 E->ReorderIndices.end());
14783if (!ReorderMask.empty())
14787// Transform non-clustered elements in the mask to poison (-1). 14788// "Clustered" operations will be reordered using this mask later. 14789if (!SubVectors.empty() && !SubVectorsMask.
empty()) {
14790for (
unsignedI : seq<unsigned>(GatheredScalars.size()))
14791if (E->Scalars[
I] == GatheredScalars[ReorderMask[
I]])
14794 SubVectorsMask.
clear();
14798unsignedI,
unsigned SliceSize,
14799bool IsNotPoisonous) {
14801 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14804 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14805unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14806if (UserTE->getNumOperands() != 2)
14808if (!IsNotPoisonous) {
14810find_if(VectorizableTree, [=](
const std::unique_ptr<TreeEntry> &TE) {
14811returnfind_if(
TE->UserTreeIndices, [=](
const EdgeInfo &EI) {
14812 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14813 }) !=
TE->UserTreeIndices.end();
14815if (It == VectorizableTree.end())
14818if (!(*It)->ReorderIndices.empty()) {
14822if (!
all_of(
zip(GatheredScalars, GS), [&](
constauto &
P) {
14823Value *V0 = std::get<0>(
P);
14824Value *V1 = std::get<1>(
P);
14825return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14826 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14832if ((
Mask.size() < InputVF &&
14835 (
Mask.size() == InputVF &&
14838 std::next(
Mask.begin(),
I * SliceSize),
14839 std::next(
Mask.begin(),
14846 std::next(
Mask.begin(),
I * SliceSize),
14847 std::next(
Mask.begin(),
14853 BVTy ShuffleBuilder(ScalarTy, Params...);
14854 ResTy Res = ResTy();
14858Value *ExtractVecBase =
nullptr;
14859bool UseVecBaseAsInput =
false;
14862Type *OrigScalarTy = GatheredScalars.front()->getType();
14865if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14870if (!
all_of(GatheredScalars, IsaPred<UndefValue>)) {
14871// Check for gathered extracts. 14872bool Resized =
false;
14874 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14875if (!ExtractShuffles.
empty()) {
14880if (
constauto *TE = getTreeEntry(
14881 cast<ExtractElementInst>(StoredGS[
Idx])->getVectorOperand()))
14884if (std::optional<ResTy> Delayed =
14885 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14886// Delay emission of gathers which are not ready yet. 14887 PostponedGathers.
insert(E);
14888// Postpone gather emission, will be emitted after the end of the 14889// process to keep correct order. 14892if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
14893 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14894 ExtractVecBase = VecBase;
14895if (
auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14896if (VF == VecBaseTy->getNumElements() &&
14897 GatheredScalars.size() != VF) {
14899 GatheredScalars.append(VF - GatheredScalars.size(),
14904// Gather extracts after we check for full matched gathers only. 14905if (!ExtractShuffles.
empty() || !E->hasState() ||
14906 E->getOpcode() != Instruction::Load ||
14907 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
14908any_of(E->Scalars, IsaPred<LoadInst>)) &&
14911 return isa<LoadInst>(V) && getTreeEntry(V);
14913 (E->hasState() && E->isAltShuffle()) ||
14914all_of(E->Scalars, [
this](
Value *V) { return getTreeEntry(V); }) ||
14916 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14918 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14920if (!GatherShuffles.
empty()) {
14921if (std::optional<ResTy> Delayed =
14922 ShuffleBuilder.needToDelay(E, Entries)) {
14923// Delay emission of gathers which are not ready yet. 14924 PostponedGathers.
insert(E);
14925// Postpone gather emission, will be emitted after the end of the 14926// process to keep correct order. 14929if (GatherShuffles.
size() == 1 &&
14931 Entries.front().front()->isSame(E->Scalars)) {
14932// Perfect match in the graph, will reuse the previously vectorized 14934LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle " 14936// Restore the mask for previous partially matched values. 14937Mask.resize(E->Scalars.size());
14938const TreeEntry *FrontTE = Entries.front().front();
14939if (FrontTE->ReorderIndices.empty() &&
14940 ((FrontTE->ReuseShuffleIndices.empty() &&
14941 E->Scalars.size() == FrontTE->Scalars.size()) ||
14942 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14943 std::iota(
Mask.begin(),
Mask.end(), 0);
14946if (isa<PoisonValue>(V)) {
14950Mask[
I] = FrontTE->findLaneForValue(V);
14953 ShuffleBuilder.add(*FrontTE, Mask);
14954// Full matched entry found, no need to insert subvectors. 14955 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
14959if (GatheredScalars.size() != VF &&
14961returnany_of(TEs, [&](
const TreeEntry *TE) {
14962returnTE->getVectorFactor() == VF;
14965 GatheredScalars.append(VF - GatheredScalars.size(),
14968// Remove shuffled elements from list of gathers. 14969for (
intI = 0, Sz =
Mask.size();
I < Sz; ++
I) {
14977bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such scalars.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
14985// Gather unique non-const values and all constant values. 14986// For repeated values, just shuffle them. 14987int NumNonConsts = 0;
14990if (isa<UndefValue>(V)) {
14991if (!isa<PoisonValue>(V)) {
15006 Scalars.
front() = OrigV;
15010 Scalars[Res.first->second] = OrigV;
15011 ReuseMask[
I] = Res.first->second;
15014if (NumNonConsts == 1) {
15015// Restore single insert element. 15019if (!UndefPos.
empty() && UndefPos.
front() == 0)
15022 ReuseMask[SinglePos] = SinglePos;
15023 }
elseif (!UndefPos.
empty() && IsSplat) {
15024// For undef values, try to replace them with the simple broadcast. 15025// We can do it if the broadcasted value is guaranteed to be 15026// non-poisonous, or by freezing the incoming scalar value first. 15028return !isa<UndefValue>(V) &&
15030 (E->UserTreeIndices.size() == 1 &&
15032// Check if the value already used in the same operation in 15033// one of the nodes already. 15034 return E->UserTreeIndices.front().EdgeIdx !=
15035 U.getOperandNo() &&
15037 E->UserTreeIndices.front().UserTE->Scalars,
15041if (It != Scalars.
end()) {
15042// Replace undefs by the non-poisoned scalars and emit broadcast. 15043int Pos = std::distance(Scalars.
begin(), It);
15044for (
intI : UndefPos) {
15045// Set the undef position to the non-poisoned scalar. 15046 ReuseMask[
I] = Pos;
15047// Replace the undef by the poison, in the mask it is replaced by 15048// non-poisoned scalar already. 15053// Replace undefs by the poisons, emit broadcast and then emit 15055for (
intI : UndefPos) {
15057if (isa<UndefValue>(Scalars[
I]))
15064if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
15065bool IsNonPoisoned =
true;
15066bool IsUsedInExpr =
true;
15067Value *Vec1 =
nullptr;
15068if (!ExtractShuffles.
empty()) {
15069// Gather of extractelements can be represented as just a shuffle of 15070// a single/two vectors the scalars are extracted from. 15071// Find input vectors. 15072Value *Vec2 =
nullptr;
15073for (
unsignedI = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
15077if (UseVecBaseAsInput) {
15078 Vec1 = ExtractVecBase;
15080for (
unsignedI = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
15083if (isa<UndefValue>(E->Scalars[
I]))
15085auto *EI = cast<ExtractElementInst>(StoredGS[
I]);
15086Value *VecOp = EI->getVectorOperand();
15087if (
constauto *TE = getTreeEntry(VecOp))
15088if (
TE->VectorizedValue)
15089 VecOp =
TE->VectorizedValue;
15092 }
elseif (Vec1 != VecOp) {
15093assert((!Vec2 || Vec2 == VecOp) &&
15094"Expected only 1 or 2 vectors shuffle.");
15100 IsUsedInExpr =
false;
15103 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15106 IsUsedInExpr &= FindReusedSplat(
15108 cast<FixedVectorType>(Vec1->
getType())->getNumElements(), 0,
15109 ExtractMask.size(), IsNotPoisonedVec);
15110 ShuffleBuilder.add(Vec1, ExtractMask,
/*ForExtracts=*/true);
15111 IsNonPoisoned &= IsNotPoisonedVec;
15113 IsUsedInExpr =
false;
15115/*ForExtracts=*/true);
15118if (!GatherShuffles.
empty()) {
15121for (
constauto [
I, TEs] :
enumerate(Entries)) {
15124"No shuffles with empty entries list expected.");
15128"Expected shuffle of 1 or 2 entries.");
15132copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
15133if (TEs.
size() == 1) {
15134bool IsNotPoisonedVec =
15135 TEs.
front()->VectorizedValue
15139 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I,
15140 SliceSize, IsNotPoisonedVec);
15141 ShuffleBuilder.add(*TEs.
front(), VecMask);
15142 IsNonPoisoned &= IsNotPoisonedVec;
15144 IsUsedInExpr =
false;
15145 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
15146if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
15153// Try to figure out best way to combine values: build a shuffle and insert 15154// elements or just build several shuffles. 15155// Insert non-constant scalars. 15157int EMSz = ExtractMask.size();
15158int MSz =
Mask.size();
15159// Try to build constant vector and shuffle with it only if currently we 15160// have a single permutation and more than 1 scalar constants. 15161bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
15162bool IsIdentityShuffle =
15163 ((UseVecBaseAsInput ||
15165 [](
const std::optional<TTI::ShuffleKind> &SK) {
15169none_of(ExtractMask, [&](
intI) {
returnI >= EMSz; }) &&
15171 (!GatherShuffles.
empty() &&
15173 [](
const std::optional<TTI::ShuffleKind> &SK) {
15177none_of(Mask, [&](
intI) {
returnI >= MSz; }) &&
15179bool EnoughConstsForShuffle =
15183return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15187return isa<Constant>(V) && !isa<UndefValue>(V);
15189 (!IsIdentityShuffle ||
15190 (GatheredScalars.size() == 2 &&
15192 [](
Value *V) {
return !isa<UndefValue>(V); })) ||
15194return isa<Constant>(V) && !isa<PoisonValue>(V);
15196// NonConstants array contains just non-constant values, GatheredScalars 15197// contains only constant to build final vector and then shuffle. 15198for (
intI = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
15199if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[
I]))
15204// Generate constants for final shuffle and build a mask for them. 15205if (!
all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15207 TryPackScalars(GatheredScalars, BVMask,
/*IsRootPoison=*/true);
15208Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15209 ShuffleBuilder.add(BV, BVMask);
15212return isa<PoisonValue>(V) ||
15213 (IsSingleShuffle && ((IsIdentityShuffle &&
15214 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15216 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15219 Res = ShuffleBuilder.finalize(
15220 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15222 TryPackScalars(NonConstants, Mask,
/*IsRootPoison=*/false);
15223 Vec = ShuffleBuilder.gather(NonConstants,
Mask.size(), Vec);
15226// Gather unique scalars and all constants. 15228 TryPackScalars(GatheredScalars, ReuseMask,
/*IsRootPoison=*/true);
15229Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
15230 ShuffleBuilder.add(BV, ReuseMask);
15231 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15234// Gather all constants. 15236for (
auto [
I, V] :
enumerate(GatheredScalars)) {
15237if (!isa<PoisonValue>(V))
15240Value *BV = ShuffleBuilder.gather(GatheredScalars);
15241 ShuffleBuilder.add(BV, Mask);
15242 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15247 Res = ShuffleBuilder.createFreeze(Res);
15251Value *BoUpSLP::createBuildVector(
const TreeEntry *E,
Type *ScalarTy,
15252bool PostponedPHIs) {
15253for (
auto [EIdx,
_] : E->CombinedEntriesWithIndices)
15255return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15259/// \returns \p I after propagating metadata from \p VL only for instructions in 15264if (isa<Instruction>(V))
15272if (E->VectorizedValue &&
15273 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15274 E->isAltShuffle())) {
15275LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *E->Scalars[0] <<
".\n");
15276return E->VectorizedValue;
15279Value *
V = E->Scalars.front();
15280Type *ScalarTy =
V->getType();
15281if (!isa<CmpInst>(V))
15283auto It = MinBWs.
find(E);
15284if (It != MinBWs.
end()) {
15285auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15291if (E->isGather()) {
15292// Set insert point for non-reduction initial nodes. 15293if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
15294 setInsertPointAfterBundle(E);
15295Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15296 E->VectorizedValue = Vec;
15300bool IsReverseOrder =
15302auto FinalShuffle = [&](
Value *
V,
const TreeEntry *E) {
15303 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *
this);
15304if (E->getOpcode() == Instruction::Store &&
15305 E->State == TreeEntry::Vectorize) {
15307ArrayRef(
reinterpret_cast<constint *
>(E->ReorderIndices.begin()),
15308 E->ReorderIndices.size());
15309 ShuffleBuilder.add(V, Mask);
15310 }
elseif (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15311 ShuffleBuilder.addOrdered(V, {});
15313 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15316 E->CombinedEntriesWithIndices.size());
15318 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
constauto &
P) {
15319 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15322 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15323"Expected either combined subnodes or reordering");
15324return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15327assert(!E->isGather() &&
"Unhandled state");
15328unsigned ShuffleOrOp =
15329 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
15331auto GetOperandSignedness = [&](
unsignedIdx) {
15332const TreeEntry *OpE = getOperandEntry(E,
Idx);
15333bool IsSigned =
false;
15334auto It = MinBWs.
find(OpE);
15335if (It != MinBWs.
end())
15336 IsSigned = It->second.second;
15339 if (isa<PoisonValue>(V))
15341 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15345switch (ShuffleOrOp) {
15346case Instruction::PHI: {
15347assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15348 E != VectorizableTree.front().get() ||
15349 !E->UserTreeIndices.empty()) &&
15350"PHI reordering is free.");
15351if (PostponedPHIs && E->VectorizedValue)
15352return E->VectorizedValue;
15353auto *PH = cast<PHINode>(VL0);
15355 PH->getParent()->getFirstNonPHIIt());
15357if (PostponedPHIs || !E->VectorizedValue) {
15362// Adjust insertion point once all PHI's have been generated. 15364 PH->getParent()->getFirstInsertionPt());
15367V = FinalShuffle(V, E);
15369 E->VectorizedValue =
V;
15373PHINode *NewPhi = cast<PHINode>(E->PHI);
15374// If phi node is fully emitted - exit. 15378// PHINodes may have multiple entries from the same block. We want to 15379// visit every block once. 15382for (
unsignedI : seq<unsigned>(0, PH->getNumIncomingValues())) {
15386// Stop emission if all incoming values are generated. 15392if (!VisitedBBs.
insert(IBB).second) {
15399Value *Vec = vectorizeOperand(E,
I,
/*PostponedPHIs=*/true);
15400if (VecTy != Vec->
getType()) {
15402 MinBWs.
contains(getOperandEntry(E,
I))) &&
15403"Expected item in MinBWs.");
15404 Vec = Builder.
CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
15410"Invalid number of incoming values");
15411assert(E->VectorizedValue &&
"Expected vectorized value.");
15412return E->VectorizedValue;
15415case Instruction::ExtractElement: {
15416Value *
V = E->getSingleOperand(0);
15417if (
const TreeEntry *TE = getTreeEntry(V))
15418V =
TE->VectorizedValue;
15419 setInsertPointAfterBundle(E);
15420V = FinalShuffle(V, E);
15421 E->VectorizedValue =
V;
15424case Instruction::ExtractValue: {
15425auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15430 NewV = FinalShuffle(NewV, E);
15431 E->VectorizedValue = NewV;
15434case Instruction::InsertElement: {
15435assert(E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
15437Value *
V = vectorizeOperand(E, 1, PostponedPHIs);
15439Type *ScalarTy =
Op.front()->getType();
15440if (cast<VectorType>(
V->getType())->getElementType() != ScalarTy) {
15442 std::pair<unsigned, bool> Res = MinBWs.
lookup(getOperandEntry(E, 1));
15443assert(Res.first > 0 &&
"Expected item in MinBWs.");
15448 cast<FixedVectorType>(
V->getType())->getNumElements()),
15452// Create InsertVector shuffle if necessary 15453auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
15454 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15456constunsigned NumElts =
15457 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15458constunsigned NumScalars = E->Scalars.size();
15461assert(
Offset < NumElts &&
"Failed to find vector index offset");
15463// Create shuffle to resize vector 15465if (!E->ReorderIndices.empty()) {
15470 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
15472// Create InsertVector shuffle if necessary 15473bool IsIdentity =
true;
15475Mask.swap(PrevMask);
15476for (
unsignedI = 0;
I < NumScalars; ++
I) {
15479 IsIdentity &= InsertIdx -
Offset ==
I;
15482if (!IsIdentity || NumElts != NumScalars) {
15484bool IsVNonPoisonous =
15487if (NumElts != NumScalars &&
Offset == 0) {
15488// Follow all insert element instructions from the current buildvector 15496 InsertMask[*InsertIdx] = *InsertIdx;
15497if (!
Ins->hasOneUse())
15499Ins = dyn_cast_or_null<InsertElementInst>(
15500Ins->getUniqueUndroppableUser());
15503buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15505 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15508if (!IsFirstPoison.
all()) {
15510for (
unsignedI = 0;
I < NumElts;
I++) {
15512 IsFirstUndef.
test(
I)) {
15513if (IsVNonPoisonous) {
15514 InsertMask[
I] =
I < NumScalars ?
I : 0;
15519if (
Idx >= NumScalars)
15520Idx = NumScalars - 1;
15521 InsertMask[
I] = NumScalars +
Idx;
15535if (
auto *
I = dyn_cast<Instruction>(V)) {
15536 GatherShuffleExtractSeq.
insert(
I);
15537 CSEBlocks.
insert(
I->getParent());
15542for (
unsignedI = 0;
I < NumElts;
I++) {
15547buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15550if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
15551 NumElts != NumScalars) {
15552if (IsFirstUndef.
all()) {
15555 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15556if (!IsFirstPoison.
all()) {
15557for (
unsignedI = 0;
I < NumElts;
I++) {
15559 InsertMask[
I] =
I + NumElts;
15566 InsertMask, cast<Instruction>(E->Scalars.back())->
getName());
15567if (
auto *
I = dyn_cast<Instruction>(V)) {
15568 GatherShuffleExtractSeq.
insert(
I);
15569 CSEBlocks.
insert(
I->getParent());
15574 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15575for (
unsignedI = 0;
I < NumElts;
I++) {
15579 InsertMask[
I] += NumElts;
15582 FirstInsert->getOperand(0), V, InsertMask,
15583 cast<Instruction>(E->Scalars.back())->getName());
15584if (
auto *
I = dyn_cast<Instruction>(V)) {
15585 GatherShuffleExtractSeq.
insert(
I);
15586 CSEBlocks.
insert(
I->getParent());
15591 ++NumVectorInstructions;
15592 E->VectorizedValue =
V;
15595case Instruction::ZExt:
15596case Instruction::SExt:
15597case Instruction::FPToUI:
15598case Instruction::FPToSI:
15599case Instruction::FPExt:
15600case Instruction::PtrToInt:
15601case Instruction::IntToPtr:
15602case Instruction::SIToFP:
15603case Instruction::UIToFP:
15604case Instruction::Trunc:
15605case Instruction::FPTrunc:
15606case Instruction::BitCast: {
15607 setInsertPointAfterBundle(E);
15609Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15610if (E->VectorizedValue) {
15612return E->VectorizedValue;
15615auto *CI = cast<CastInst>(VL0);
15617Type *SrcScalarTy = cast<VectorType>(InVec->
getType())->getElementType();
15618auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
15620 (SrcIt != MinBWs.
end() || It != MinBWs.
end() ||
15622// Check if the values are candidates to demote. 15623unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
15624if (SrcIt != MinBWs.
end())
15625 SrcBWSz = SrcIt->second.first;
15627if (BWSz == SrcBWSz) {
15628 VecOpcode = Instruction::BitCast;
15629 }
elseif (BWSz < SrcBWSz) {
15630 VecOpcode = Instruction::Trunc;
15631 }
elseif (It != MinBWs.
end()) {
15632assert(BWSz > SrcBWSz &&
"Invalid cast!");
15633 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15634 }
elseif (SrcIt != MinBWs.
end()) {
15635assert(BWSz > SrcBWSz &&
"Invalid cast!");
15637 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15639 }
elseif (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
15640 !SrcIt->second.second) {
15641 VecOpcode = Instruction::UIToFP;
15643Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15645 : Builder.
CreateCast(VecOpcode, InVec, VecTy);
15646V = FinalShuffle(V, E);
15648 E->VectorizedValue =
V;
15649 ++NumVectorInstructions;
15652case Instruction::FCmp:
15653case Instruction::ICmp: {
15654 setInsertPointAfterBundle(E);
15656Value *
L = vectorizeOperand(E, 0, PostponedPHIs);
15657if (E->VectorizedValue) {
15659return E->VectorizedValue;
15661Value *
R = vectorizeOperand(E, 1, PostponedPHIs);
15662if (E->VectorizedValue) {
15664return E->VectorizedValue;
15666if (
L->getType() !=
R->getType()) {
15668 getOperandEntry(E, 1)->
isGather() ||
15669 MinBWs.
contains(getOperandEntry(E, 0)) ||
15670 MinBWs.
contains(getOperandEntry(E, 1))) &&
15671"Expected item in MinBWs.");
15672if (cast<VectorType>(
L->getType())
15674 ->getIntegerBitWidth() < cast<VectorType>(
R->getType())
15676 ->getIntegerBitWidth()) {
15677Type *CastTy =
R->getType();
15680Type *CastTy =
L->getType();
15688if (
auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.
end())
15689 ICmp->setSameSign(
/*B=*/false);
15690// Do not cast for cmps. 15691 VecTy = cast<FixedVectorType>(
V->getType());
15692V = FinalShuffle(V, E);
15694 E->VectorizedValue =
V;
15695 ++NumVectorInstructions;
15698case Instruction::Select: {
15699 setInsertPointAfterBundle(E);
15701Value *
Cond = vectorizeOperand(E, 0, PostponedPHIs);
15702if (E->VectorizedValue) {
15704return E->VectorizedValue;
15706Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15707if (E->VectorizedValue) {
15709return E->VectorizedValue;
15711Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15712if (E->VectorizedValue) {
15714return E->VectorizedValue;
15718 getOperandEntry(E, 2)->
isGather() ||
15719 MinBWs.
contains(getOperandEntry(E, 1)) ||
15720 MinBWs.
contains(getOperandEntry(E, 2))) &&
15721"Expected item in MinBWs.");
15723 True = Builder.
CreateIntCast(True, VecTy, GetOperandSignedness(1));
15724if (False->
getType() != VecTy)
15725 False = Builder.
CreateIntCast(False, VecTy, GetOperandSignedness(2));
15730assert(TrueNumElements >= CondNumElements &&
15731 TrueNumElements % CondNumElements == 0 &&
15732"Cannot vectorize Instruction::Select");
15734"Cannot vectorize Instruction::Select");
15735if (CondNumElements != TrueNumElements) {
15736// When the return type is i1 but the source is fixed vector type, we 15737// need to duplicate the condition value. 15743"Cannot vectorize Instruction::Select");
15745V = FinalShuffle(V, E);
15747 E->VectorizedValue =
V;
15748 ++NumVectorInstructions;
15751case Instruction::FNeg: {
15752 setInsertPointAfterBundle(E);
15754Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
15756if (E->VectorizedValue) {
15758return E->VectorizedValue;
15764if (
auto *
I = dyn_cast<Instruction>(V))
15767V = FinalShuffle(V, E);
15769 E->VectorizedValue =
V;
15770 ++NumVectorInstructions;
15774case Instruction::Freeze: {
15775 setInsertPointAfterBundle(E);
15777Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
15779if (E->VectorizedValue) {
15781return E->VectorizedValue;
15784if (
Op->getType() != VecTy) {
15786 MinBWs.
contains(getOperandEntry(E, 0))) &&
15787"Expected item in MinBWs.");
15791V = FinalShuffle(V, E);
15793 E->VectorizedValue =
V;
15794 ++NumVectorInstructions;
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      setInsertPointAfterBundle(E);

      Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        return E->VectorizedValue;
      }
      Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
      if (E->VectorizedValue) {
        return E->VectorizedValue;
      }
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          ArrayRef<Value *> Ops = E->getOperand(I);
          if (all_of(Ops, [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              })) {
            Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
            E->VectorizedValue = V;
            ++NumVectorInstructions;
            return V;
          }
        }
      }
      if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
        assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
                getOperandEntry(E, 1)->isGather() ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        if (LHS->getType() != VecTy)
          LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
        if (RHS->getType() != VecTy)
          RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
      }

      Value *V = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        // Drop nuw flags for abs(sub(commutative), true).
        if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
            any_of(E->Scalars, [](Value *V) {
              return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      }

      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
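    // Loads are emitted according to the tree entry state: consecutive
    // (Vectorize) loads become a single wide load, StridedVectorize loads are
    // lowered to llvm.experimental.vp.strided.load, and ScatterVectorize loads
    // become a masked gather over a vector of pointers.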
    case Instruction::Load: {
      // Loads are inserted at the head of the tree because we don't want to
      // sink them all the way down past store instructions.
      setInsertPointAfterBundle(E);

      LoadInst *LI = cast<LoadInst>(VL0);
      Instruction *NewLI;
      Value *PO = LI->getPointerOperand();
      if (E->State == TreeEntry::Vectorize) {
        NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
      } else if (E->State == TreeEntry::StridedVectorize) {
        Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
        Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
        PO = IsReverseOrder ? PtrN : Ptr0;
        Type *StrideTy = DL->getIndexType(PO->getType());
        Value *StrideVal;
        if (Diff) {
          int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
          StrideVal =
              ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                             DL->getTypeAllocSize(ScalarTy));
        } else {
          SmallVector<Value *> PointerOps(E->Scalars.size());
          transform(E->Scalars, PointerOps.begin(), [](Value *V) {
            return cast<LoadInst>(V)->getPointerOperand();
          });
          std::optional<Value *> Stride =
              calculateRtStride(PointerOps, ScalarTy, *DL, *SE,
                                &*Builder.GetInsertPoint());
          Value *NewStride =
              Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
          StrideVal = Builder.CreateMul(
              NewStride,
              ConstantInt::get(
                  StrideTy,
                  (IsReverseOrder ? -1 : 1) *
                      static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
        }
        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
        NewLI = Builder.CreateIntrinsic(
            Intrinsic::experimental_vp_strided_load,
            {VecTy, PO->getType(), StrideTy},
            {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
             Builder.getInt32(E->Scalars.size())});
      } else {
        assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
        Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          return E->VectorizedValue;
        }
        if (isa<FixedVectorType>(ScalarTy)) {
          // CreateMaskedGather expects VecTy and VecPtr have the same size. We
          // need to expand VecPtr if ScalarTy is a vector type.
          unsigned ScalarTyNumElements =
              cast<FixedVectorType>(ScalarTy)->getNumElements();
          unsigned VecTyNumElements =
              cast<FixedVectorType>(VecTy)->getNumElements();
          assert(VecTyNumElements % ScalarTyNumElements == 0 &&
                 "Cannot expand getelementptr.");
          unsigned VF = VecTyNumElements / ScalarTyNumElements;
          SmallVector<Constant *> Indices(VecTyNumElements);
          transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
            return Builder.getInt64(I % ScalarTyNumElements);
          });
          VecPtr = Builder.CreateGEP(
              VecTy->getElementType(),
              Builder.CreateShuffleVector(
                  VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
              ConstantVector::get(Indices));
        }
        // Use the minimum alignment of the gathered loads.
        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
        NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
      }
      Value *V = ::propagateMetadata(NewLI, E->Scalars);

      V = FinalShuffle(V, E);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
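    // Stores mirror the load handling: consecutive stores become one wide
    // store, while strided stores are lowered to
    // llvm.experimental.vp.strided.store with an element stride derived from
    // the scalar type's allocation size.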
    case Instruction::Store: {
      auto *SI = cast<StoreInst>(VL0);

      setInsertPointAfterBundle(E);

      Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
      if (VecValue->getType() != VecTy)
        VecValue =
            Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
      VecValue = FinalShuffle(VecValue, E);

      Value *Ptr = SI->getPointerOperand();
      Instruction *ST;
      if (E->State == TreeEntry::Vectorize) {
        ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
      } else {
        assert(E->State == TreeEntry::StridedVectorize &&
               "Expected either strided or consecutive stores.");
        if (!E->ReorderIndices.empty()) {
          SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
          Ptr = SI->getPointerOperand();
        }
        Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
        Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
        auto *Inst = Builder.CreateIntrinsic(
            Intrinsic::experimental_vp_strided_store,
            {VecTy, Ptr->getType(), StrideTy},
            {VecValue, Ptr,
             ConstantInt::get(
                 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
             Builder.getAllOnesMask(VecTy->getElementCount()),
             Builder.getInt32(E->Scalars.size())});
        Inst->addParamAttr(
            /*ArgNo=*/1,
            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
        ST = Inst;
      }

      Value *V = ::propagateMetadata(ST, E->Scalars);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
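    // GEPs are vectorized operand by operand; the resulting vector GEP keeps
    // the source element type of the first scalar GEP, and metadata is only
    // propagated from the scalars that really are getelementptr instructions.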
    case Instruction::GetElementPtr: {
      auto *GEP0 = cast<GetElementPtrInst>(VL0);
      setInsertPointAfterBundle(E);

      Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        return E->VectorizedValue;
      }

      SmallVector<Value *> OpVecs;
      for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
        Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
        if (E->VectorizedValue) {
          return E->VectorizedValue;
        }
        OpVecs.push_back(OpVec);
      }

      Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
      if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
        SmallVector<Value *> GEPs;
        for (Value *V : E->Scalars) {
          if (isa<GetElementPtrInst>(V))
            GEPs.push_back(V);
        }
        V = ::propagateMetadata(I, GEPs);
      }

      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
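    // Calls are emitted either as a vector intrinsic or as a vector library
    // function (found via VFDatabase), whichever the cost model considered
    // cheaper; intrinsic operands that must stay scalar are passed through
    // unvectorized.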
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      setInsertPointAfterBundle(E);

      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                          VecCallCosts.first <= VecCallCosts.second;

      Value *ScalarArg = nullptr;
      SmallVector<Value *> OpVecs;
      SmallVector<Type *, 2> TysForDecl;
      // Add return type if intrinsic is overloaded on it.
      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
        TysForDecl.push_back(VecTy);
      auto *CEI = cast<CallInst>(VL0);
      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
        // Some intrinsics have scalar arguments. This argument should not be
        // vectorized.
        if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
          ScalarArg = CEI->getArgOperand(I);
          // If we decided to reduce the bitwidth of the abs intrinsic, its
          // second argument must be set to false (do not return poison if the
          // value is the signed minimum).
          if (ID == Intrinsic::abs && It != MinBWs.end() &&
              It->second.first < DL->getTypeSizeInBits(CEI->getType()))
            ScalarArg = Builder.getFalse();
          OpVecs.push_back(ScalarArg);
          continue;
        }

        Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
        if (E->VectorizedValue) {
          return E->VectorizedValue;
        }
        ScalarArg = CEI->getArgOperand(I);
        if (cast<VectorType>(OpVec->getType())->getElementType() !=
                ScalarArg->getType()->getScalarType() &&
            It == MinBWs.end()) {
          auto *CastTy =
              getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
          OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
        } else if (It != MinBWs.end()) {
          OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
        }
        OpVecs.push_back(OpVec);
      }

      Function *CF;
      if (!UseIntrinsic) {
        VFShape Shape = VFShape::get(
            CI->getFunctionType(),
            ElementCount::getFixed(VecTy->getNumElements()),
            false /*HasGlobalPred*/);
        CF = VFDatabase(*CI).getVectorizedFunction(Shape);
      } else {
        CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
      }

      Value *V = Builder.CreateCall(CF, OpVecs);
      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
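    // ShuffleVector entries cover two cases: plain shufflevector instructions
    // (when REVEC is enabled) and alternate-opcode nodes, where two vector
    // instructions are emitted and then blended with a mask produced by
    // buildAltOpShuffleMask.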
    case Instruction::ShuffleVector: {
      Value *V;
      if (SLPReVec && !E->isAltShuffle()) {
        setInsertPointAfterBundle(E);
        Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          return E->VectorizedValue;
        }
        SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
        if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
          assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
                 "Not supported shufflevector usage.");
          SmallVector<int> NewMask(ThisMask.size());
          transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
            return SVSrc->getShuffleMask()[Mask];
          });
          V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
        } else {
          V = Builder.CreateShuffleVector(Src, ThisMask);
        }
        if (auto *I = dyn_cast<Instruction>(V))
          V = ::propagateMetadata(I, E->Scalars);
        V = FinalShuffle(V, E);
      } else {
        assert(E->isAltShuffle() &&
               ((Instruction::isBinaryOp(E->getOpcode()) &&
                 Instruction::isBinaryOp(E->getAltOpcode())) ||
                (Instruction::isCast(E->getOpcode()) &&
                 Instruction::isCast(E->getAltOpcode())) ||
                (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
               "Invalid Shuffle Vector Operand");
        Value *LHS = nullptr, *RHS = nullptr;
        if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0, PostponedPHIs);
          if (E->VectorizedValue) {
            return E->VectorizedValue;
          }
          RHS = vectorizeOperand(E, 1, PostponedPHIs);
        } else {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0, PostponedPHIs);
        }
        if (E->VectorizedValue) {
          return E->VectorizedValue;
        }
        if (LHS && RHS &&
            ((Instruction::isBinaryOp(E->getOpcode()) &&
              (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
             (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
          assert((It != MinBWs.end() ||
                  getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                  getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                  MinBWs.contains(getOperandEntry(E, 0)) ||
                  MinBWs.contains(getOperandEntry(E, 1))) &&
                 "Expected item in MinBWs.");
          Type *CastTy = VecTy;
          if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
            if (cast<VectorType>(LHS->getType())
                    ->getElementType()
                    ->getIntegerBitWidth() <
                cast<VectorType>(RHS->getType())
                    ->getElementType()
                    ->getIntegerBitWidth())
              CastTy = RHS->getType();
            else
              CastTy = LHS->getType();
          }
          if (LHS->getType() != CastTy)
            LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
          if (RHS->getType() != CastTy)
            RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
        }

        Value *V0, *V1;
        if (Instruction::isBinaryOp(E->getOpcode())) {
          V0 = Builder.CreateBinOp(
              static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
          V1 = Builder.CreateBinOp(
              static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
        } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
          V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
          auto *AltCI = cast<CmpInst>(E->getAltOp());
          CmpInst::Predicate AltPred = AltCI->getPredicate();
          V1 = Builder.CreateCmp(AltPred, LHS, RHS);
        } else {
          if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
            unsigned SrcBWSz = DL->getTypeSizeInBits(
                cast<VectorType>(LHS->getType())->getElementType());
            unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
            if (BWSz <= SrcBWSz) {
              if (BWSz < SrcBWSz)
                LHS = Builder.CreateIntCast(LHS, VecTy, It->second.second);
              assert(LHS->getType() == VecTy &&
                     "Expected same type as operand.");
              if (auto *I = dyn_cast<Instruction>(LHS))
                LHS = ::propagateMetadata(I, E->Scalars);
              LHS = FinalShuffle(LHS, E);
              E->VectorizedValue = LHS;
              ++NumVectorInstructions;
              return LHS;
            }
          }
          V0 = Builder.CreateCast(
              static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
          V1 = Builder.CreateCast(
              static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
        }
        // Add V0 and V1 to later analysis to try to find and remove matching
        // instruction, if any.
        for (Value *V : {V0, V1}) {
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }

        // Create shuffle to take alternate operations from the vector.
        // Also, gather up main and alt scalar ops to propagate IR flags to
        // each vector operation.
        SmallVector<int> Mask;
        SmallVector<Value *> OpScalars, AltScalars;
        E->buildAltOpShuffleMask(
            [E, this](Instruction *I) {
              assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
              return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                            *TLI);
            },
            Mask, &OpScalars, &AltScalars);

        auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
          // Drop nuw flags for abs(sub(commutative), true).
          if (auto *I = dyn_cast<Instruction>(Vec);
              I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
              any_of(E->Scalars, [](Value *V) {
                if (isa<PoisonValue>(V))
                  return false;
                auto *IV = cast<Instruction>(V);
                return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
              }))
            I->setHasNoUnsignedWrap(/*b=*/false);
        };
        DropNuwFlag(V0, E->getOpcode());
        DropNuwFlag(V1, E->getAltOpcode());

        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
        }
        V = Builder.CreateShuffleVector(V0, V1, Mask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          V = ::propagateMetadata(I, E->Scalars);
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
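  // What follows drives the per-entry emission above for the whole graph:
  // schedule all blocks, emit gathered loads and postponed PHI operands,
  // materialize postponed gather nodes, extract externally used scalars and
  // finally erase the now-dead scalar instructions.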
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
  // need to rebuild it.
  EntryToLastInstruction.clear();
  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst &&
        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
      assert((!TE->UserTreeIndices.empty() ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
    }
  }
  // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
  (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the temp
  // emitted allocas with actual vector instructions.
  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found gather node which is absolutely the same as one of the
        // vectorized nodes. It may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were some
    // unresolved dependencies at the moment when the stub instruction was
    // emitted. In a case when any of these dependencies turn out to be an
    // operand of another PHI, coming from this same block, the position of the
    // stub instruction would become invalid. This is because the source vector
    // that was supposed to feed this gather node was inserted at the end of
    // the block [after the stub instruction]. So we need to adjust the
    // insertion point again, to the end of the block.
    if (isa<PHINode>(UserI)) {
      // Insert before all users.
      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
16385if (
auto *VecI = dyn_cast<Instruction>(Vec);
16390if (Vec->
getType() != PrevVec->getType()) {
16392 PrevVec->getType()->isIntOrIntVectorTy() &&
16393"Expected integer vector types only.");
16394 std::optional<bool> IsSigned;
16395for (
Value *V : TE->Scalars) {
16396if (
const TreeEntry *BaseTE = getTreeEntry(V)) {
16397auto It = MinBWs.
find(BaseTE);
16398if (It != MinBWs.
end()) {
16399 IsSigned = IsSigned.value_or(
false) || It->second.second;
16403for (
const TreeEntry *MNTE : MultiNodeScalars.
lookup(V)) {
16404auto It = MinBWs.
find(MNTE);
16405if (It != MinBWs.
end()) {
16406 IsSigned = IsSigned.value_or(
false) || It->second.second;
16411if (IsSigned.value_or(
false))
16413// Scan through gather nodes. 16414for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16415auto It = MinBWs.
find(BVE);
16416if (It != MinBWs.
end()) {
16417 IsSigned = IsSigned.value_or(
false) || It->second.second;
16422if (IsSigned.value_or(
false))
16424if (
auto *EE = dyn_cast<ExtractElementInst>(V)) {
16426 IsSigned.value_or(
false) ||
16430if (IsSigned.value_or(
false))
16434if (IsSigned.value_or(
false)) {
16435// Final attempt - check user node. 16436auto It = MinBWs.
find(TE->UserTreeIndices.front().UserTE);
16437if (It != MinBWs.
end())
16438 IsSigned = It->second.second;
16441"Expected user node or perfect diamond match in MinBWs.");
16445 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
16446// Replace the stub vector node, if it was used before for one of the 16447// buildvector nodes already. 16448auto It = PostponedValues.
find(PrevVec);
16449if (It != PostponedValues.
end()) {
16450for (TreeEntry *VTE : It->getSecond())
16451 VTE->VectorizedValue = Vec;
  // Maps vector instruction to original insertelement instruction.
  // Maps extract Scalar to the corresponding extractelement instruction in the
  // basic block. Only one extractelement per block should be emitted.
  // Extract all of the elements with the external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUW. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
16482// Non-instruction pointers are not deleted, just skip them. 16483if (E->getOpcode() == Instruction::GetElementPtr &&
16484 !isa<GetElementPtrInst>(Scalar))
16487Value *Vec = E->VectorizedValue;
16488assert(Vec &&
"Can't find vectorizable value");
16491auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
16492if (Scalar->getType() != Vec->
getType()) {
16494Value *ExV =
nullptr;
16495auto *Inst = dyn_cast<Instruction>(Scalar);
16496bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.
contains(Inst);
16497auto It = ScalarToEEs.
find(Scalar);
16498if (It != ScalarToEEs.
end()) {
16499// No need to emit many extracts, just move the only one in the 16501auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16503if (EEIt != It->second.end()) {
16504Value *PrevV = EEIt->second.first;
16505if (
auto *
I = dyn_cast<Instruction>(PrevV);
16506I && !ReplaceInst &&
16511if (
auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16515 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16519// "Reuse" the existing extract to improve final codegen. 16521// Leave the instruction as is, if it cheaper extracts and all 16522// operands are scalar. 16523if (
auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16524 IgnoredExtracts.
insert(EE);
16527auto *CloneInst = Inst->clone();
16528 CloneInst->insertBefore(Inst->getIterator());
16529if (Inst->hasName())
        } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                   ES && isa<Instruction>(Vec)) {
16535Value *V = ES->getVectorOperand();
16536auto *IVec = cast<Instruction>(Vec);
16537if (
const TreeEntry *ETE = getTreeEntry(V))
16538 V = ETE->VectorizedValue;
16539if (
auto *
IV = dyn_cast<Instruction>(V);
16540 !
IV ||
IV == Vec ||
IV->getParent() != IVec->getParent() ||
16541IV->comesBefore(IVec))
        } else if (auto *VecTy =
                       dyn_cast<FixedVectorType>(Scalar->getType())) {
          // When REVEC is enabled, we need to extract a vector.
          // Note: The element size of Scalar may be different from the
          // element size of Vec.
                                   ExternalUse.Lane * VecTyNumElements);
16557// If necessary, sign-extend or zero-extend ScalarRoot 16558// to the larger type. 16560if (Scalar->getType() != Ex->
getType())
16562 Ex, Scalar->getType(),
16564auto *
I = dyn_cast<Instruction>(Ex);
16566 : &
F->getEntryBlock(),
16567 std::make_pair(Ex, ExV));
16569// The then branch of the previous if may produce constants, since 0 16570// operand might be a constant. 16571if (
auto *ExI = dyn_cast<Instruction>(Ex);
16573 GatherShuffleExtractSeq.
insert(ExI);
16574 CSEBlocks.
insert(ExI->getParent());
16578assert(isa<FixedVectorType>(Scalar->getType()) &&
16579 isa<InsertElementInst>(Scalar) &&
16580"In-tree scalar of vector type is not insertelement?");
16581auto *IE = cast<InsertElementInst>(Scalar);
16585// If User == nullptr, the Scalar remains as scalar in vectorized 16586// instructions or is used as extra arg. Generate ExtractElement instruction 16587// and update the record for this scalar in ExternallyUsedValues. 16589if (!ScalarsWithNullptrUser.
insert(Scalar).second)
16593 ExternalUsesAsOriginalScalar.
contains(Scalar) ||
16596 if (ExternalUsesAsOriginalScalar.contains(U))
16598 TreeEntry *UseEntry = getTreeEntry(U);
16600 (UseEntry->State == TreeEntry::Vectorize ||
16602 TreeEntry::StridedVectorize) &&
16603 (E->State == TreeEntry::Vectorize ||
16604 E->State == TreeEntry::StridedVectorize) &&
16605 doesInTreeUserNeedToExtract(
16606 Scalar, getRootEntryInstruction(*UseEntry),
16609"Scalar with nullptr User must be registered in " 16610"ExternallyUsedValues map or remain as scalar in vectorized " 16612if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
16613if (
auto *
PHI = dyn_cast<PHINode>(VecI)) {
16614if (
PHI->getParent()->isLandingPad())
16618PHI->getParent()->getLandingPadInst()->getIterator()));
16621PHI->getParent()->getFirstNonPHIIt());
16624 std::next(VecI->getIterator()));
16629Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16630// Required to update internally referenced instructions. 16631if (Scalar != NewInst) {
16632assert((!isa<ExtractElementInst>(Scalar) ||
16633 !IgnoredExtracts.
contains(cast<ExtractElementInst>(Scalar))) &&
16634"Extractelements should not be replaced.");
16635 Scalar->replaceAllUsesWith(NewInst);
16640if (
auto *VU = dyn_cast<InsertElementInst>(
User);
16642// Skip if the scalar is another vector op or Vec is not an instruction. 16643if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16644if (
auto *FTy = dyn_cast<FixedVectorType>(
User->
getType())) {
16645if (!UsedInserts.
insert(VU).second)
16647// Need to use original vector, if the root is truncated. 16648auto BWIt = MinBWs.
find(E);
16650auto *ScalarTy = FTy->getElementType();
16651auto Key = std::make_pair(Vec, ScalarTy);
16652auto VecIt = VectorCasts.
find(Key);
16653if (VecIt == VectorCasts.
end()) {
16655if (
auto *IVec = dyn_cast<PHINode>(Vec)) {
16656if (IVec->getParent()->isLandingPad())
16658 std::next(IVec->getParent()
16659 ->getLandingPadInst()
16663 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
16671 cast<FixedVectorType>(Vec->
getType())->getNumElements()),
16672 BWIt->second.second);
16675 Vec = VecIt->second;
16682 ShuffledInserts, [VU](
const ShuffledInsertData<Value *> &
Data) {
16683// Checks if 2 insertelements are from the same buildvector. 16689unsignedIdx = *InsertIdx;
16690if (It == ShuffledInserts.
end()) {
16692 It = std::next(ShuffledInserts.
begin(),
16693 ShuffledInserts.
size() - 1);
16698 Mask[
Idx] = ExternalUse.Lane;
16699 It->InsertElements.push_back(cast<InsertElementInst>(
User));
16706// Generate extracts for out-of-tree users. 16707// Find the insertion point for the extractelement lane. 16708if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
16710for (
unsignedI : seq<unsigned>(0, PH->getNumIncomingValues())) {
16711if (PH->getIncomingValue(
I) == Scalar) {
16713 PH->getIncomingBlock(
I)->getTerminator();
16714if (isa<CatchSwitchInst>(IncomingTerminator)) {
16716 std::next(VecI->getIterator()));
16720Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16721 PH->setOperand(
I, NewInst);
16726Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16731Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16741int VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
16742for (
intI = 0, E = Mask.size();
I < E; ++
I) {
16744 CombinedMask1[
I] = Mask[
I];
16746 CombinedMask2[
I] = Mask[
I] - VF;
16749 cast<VectorType>(V1->
getType())->getElementType(), Builder, *
this);
16750 ShuffleBuilder.
add(V1, CombinedMask1);
16752 ShuffleBuilder.
add(V2, CombinedMask2);
16753return ShuffleBuilder.
finalize({}, {}, {});
16757bool ForSingleMask) {
16758unsigned VF = Mask.size();
16759unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
16761if (
any_of(Mask, [VF](
intIdx) {
returnIdx >=
static_cast<int>(VF); })) {
16762 Vec = CreateShuffle(Vec,
nullptr, Mask);
16763return std::make_pair(Vec,
true);
16765if (!ForSingleMask) {
16767for (
unsignedI = 0;
I < VF; ++
I) {
16769 ResizeMask[Mask[
I]] = Mask[
I];
16771 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
16775return std::make_pair(Vec,
false);
16777// Perform shuffling of the vectorize tree entries for better handling of 16778// external extracts. 16779for (
intI = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
16780// Find the first and the last instruction in the list of insertelements. 16785autoVector = ShuffledInserts[
I].ValueMasks.takeVector();
16786Value *NewInst = performExtractsShuffleAction<Value>(
16790 return cast<VectorType>(Vec->getType())
16791 ->getElementCount()
16792 .getKnownMinValue();
16797 assert((Vals.size() == 1 || Vals.size() == 2) &&
16798"Expected exactly 1 or 2 input values.");
16799 if (Vals.size() == 1) {
16800// Do not create shuffle if the mask is a simple identity 16801// non-resizing mask. 16802 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16803 ->getNumElements() ||
16804 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16805 return CreateShuffle(Vals.front(), nullptr, Mask);
16806 return Vals.front();
16808return CreateShuffle(Vals.
front() ? Vals.
front()
16810 Vals.
back(), Mask);
16812auto It = ShuffledInserts[
I].InsertElements.
rbegin();
16813// Rebuild buildvector chain. 16815if (It != ShuffledInserts[
I].InsertElements.
rend())
16818while (It != ShuffledInserts[
I].InsertElements.
rend()) {
16819assert(
II &&
"Must be an insertelement instruction.");
16824II = dyn_cast<InsertElementInst>(
II->getOperand(0));
16827II->replaceUsesOfWith(
II->getOperand(0), NewInst);
16828if (
auto *NewI = dyn_cast<Instruction>(NewInst))
16829if (
II->getParent() == NewI->getParent() &&
II->comesBefore(NewI))
16830II->moveAfter(NewI);
16833 LastInsert->replaceAllUsesWith(NewInst);
16835 IE->replaceUsesOfWith(IE->getOperand(0),
16837 IE->replaceUsesOfWith(IE->getOperand(1),
16841 CSEBlocks.
insert(LastInsert->getParent());
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (isa<PoisonValue>(Scalar))
        continue;
#ifndef NDEBUG
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          // It is legal to delete users in the ignorelist.
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      RemovedInsts.push_back(I);
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);
16893// Clear up reduction references, if any. 16894if (UserIgnoreList) {
16896const TreeEntry *
IE = getTreeEntry(
I);
16898 !(VectorizableTree.front()->isGather() &&
16899 !
IE->UserTreeIndices.empty() &&
16900 (ValueToGatherNodes.lookup(
I).contains(
16901 VectorizableTree.front().get()) ||
16903 [&](
const EdgeInfo &EI) {
16904 return EI.UserTE == VectorizableTree.front().get() &&
16905 EI.EdgeIdx == UINT_MAX;
16907 !(GatheredLoadsEntriesFirst.has_value() &&
16908IE->Idx >= *GatheredLoadsEntriesFirst &&
16909 VectorizableTree.front()->isGather() &&
16914// Do not replace condition of the logical op in form select <cond>. 16915 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16916 (match(U.getUser(), m_LogicalAnd()) ||
16917 match(U.getUser(), m_LogicalOr())) &&
16918 U.getOperandNo() == 0;
16919 if (IsPoisoningLogicalOp) {
16920 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16923return UserIgnoreList->contains(
U.getUser());
16925// Replace conditions of the poisoning logical ops with the non-poison 16931// Retain to-be-deleted instructions for some debug-info bookkeeping and alias 16932// cache correctness. 16933// NOTE: removeInstructionAndOperands only marks the instruction for deletion 16934// - instructions are not deleted until later. 16935 removeInstructionsAndOperands(
ArrayRef(RemovedInsts));
16938 InstrElementSize.
clear();
16940const TreeEntry &RootTE = *VectorizableTree.front();
16941Value *Vec = RootTE.VectorizedValue;
16942if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16943 It != MinBWs.end() &&
16944 ReductionBitWidth != It->second.first) {
16947 ReductionRoot->getIterator());
16951 cast<VectorType>(Vec->
getType())->getElementCount()),
16952 It->second.second);
16959 <<
" gather sequences instructions.\n");
16960// LICM InsertElementInst sequences. 16965// Check if this block is inside a loop. 16966Loop *L = LI->getLoopFor(
I->getParent());
16970// Check if it has a preheader. 16971BasicBlock *PreHeader = L->getLoopPreheader();
16975// If the vector or the element that we insert into it are 16976// instructions that are defined in this basic block then we can't 16977// hoist this instruction. 16979 auto *OpI = dyn_cast<Instruction>(V);
16980 return OpI && L->contains(OpI);
16984// We can hoist this instruction. Move it to the pre-header. 16986 CSEBlocks.
insert(PreHeader);
16989// Make a list of all reachable blocks in our CSE queue. 16998// Sort blocks by domination. This ensures we visit a block after all blocks 16999// dominating it are visited. 17001assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
17002"Different nodes should have different DFS numbers");
17003returnA->getDFSNumIn() <
B->getDFSNumIn();
  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector operands
  // and its mask indices are the same as in the first one or undefs. E.g.
  // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
  // poison, <0, 0, 0, 0>.
  auto &&IsIdenticalOrLessDefined = [&](Instruction *I1, Instruction *I2,
                                        SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used vector
    // registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(getWidenedType(
                   SI1->getType()->getElementType(),
                   SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge identical
  // instructions. TODO: We can further optimize this scan if we split the
  // instructions into different buckets based on the insert lane.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the
      // visited instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced)
        Visited.push_back(&In);
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
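// buildBundle links the ScheduleData of every value in VL into a single
// chained bundle; the first member acts as the scheduling entity for the
// whole group.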
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}

// Groups the instructions to a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                           ScheduleData *Bundle) {
17151// The scheduling region got new instructions at the lower end (or it is a 17152// new region for the first bundle). This makes it necessary to 17153// recalculate all dependencies. 17154// It is seldom that this needs to be done a second time after adding the 17155// initial bundle to the region. 17156if (ScheduleEnd != OldScheduleEnd) {
17157for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode())
17158if (ScheduleData *SD = getScheduleData(
I))
17159 SD->clearDependencies();
17164 <<
" in block " << BB->
getName() <<
"\n");
17165 calculateDependencies(Bundle,
/*InsertInReadyList=*/true, SLP);
17170 initialFillReadyList(ReadyInsts);
17173// Now try to schedule the new bundle or (if no bundle) just calculate 17174// dependencies. As soon as the bundle is "ready" it means that there are no 17175// cyclic dependencies and we can schedule it. Note that's important that we 17176// don't "schedule" the bundle yet (see cancelScheduling). 17177while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17178 !ReadyInsts.empty()) {
17179 ScheduleData *Picked = ReadyInsts.pop_back_val();
17180assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17181"must be ready to schedule");
17182 schedule(Picked, ReadyInsts);
17186// Make sure that the scheduling region contains all 17187// instructions of the bundle. 17188for (
Value *V : VL) {
17191if (!extendSchedulingRegion(V, S)) {
17192// If the scheduling region got new instructions at the lower end (or it 17193// is a new region for the first bundle). This makes it necessary to 17194// recalculate all dependencies. 17195// Otherwise the compiler may crash trying to incorrectly calculate 17196// dependencies and emit instruction in the wrong order at the actual 17198 TryScheduleBundleImpl(
/*ReSchedule=*/false,
nullptr);
17199return std::nullopt;
17203bool ReSchedule =
false;
17204for (
Value *V : VL) {
17207 ScheduleData *BundleMember = getScheduleData(V);
17209"no ScheduleData for bundle member (maybe not in same basic block)");
17211// Make sure we don't leave the pieces of the bundle in the ready list when 17212// whole bundle might not be ready. 17213 ReadyInsts.remove(BundleMember);
    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
17226 TryScheduleBundleImpl(ReSchedule, Bundle);
17227if (!Bundle->isReady()) {
17228 cancelScheduling(VL, S.getMainOp());
17229return std::nullopt;
17242 ScheduleData *Bundle = getScheduleData(OpValue);
17243LLVM_DEBUG(
dbgs() <<
"SLP: cancel scheduling of " << *Bundle <<
"\n");
17244assert(!Bundle->IsScheduled &&
17245"Can't cancel bundle which is already scheduled");
17246assert(Bundle->isSchedulingEntity() &&
17248"tried to unbundle something which is not a bundle");
17250// Remove the bundle from the ready list. 17251if (Bundle->isReady())
17252 ReadyInsts.remove(Bundle);
17254// Un-bundle: make single instructions out of the bundle. 17255 ScheduleData *BundleMember = Bundle;
17256while (BundleMember) {
17257assert(BundleMember->FirstInBundle == Bundle &&
"corrupt bundle links");
17258 BundleMember->FirstInBundle = BundleMember;
17259 ScheduleData *Next = BundleMember->NextInBundle;
17260 BundleMember->NextInBundle =
nullptr;
17261 BundleMember->TE =
nullptr;
17262if (BundleMember->unscheduledDepsInBundle() == 0) {
17263 ReadyInsts.insert(BundleMember);
17265 BundleMember = Next;
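// ScheduleData objects are allocated in fixed-size chunks that are never freed
// individually; a new chunk is only allocated when the current one is full.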
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
17278bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17279Value *V,
const InstructionsState &S) {
17281assert(
I &&
"bundle member must be an instruction");
17284"phi nodes/insertelements/extractelements/extractvalues don't need to " 17286if (getScheduleData(
I))
17288if (!ScheduleStart) {
17289// It's the first instruction in the new region. 17290 initScheduleData(
I,
I->getNextNode(),
nullptr,
nullptr);
17292 ScheduleEnd =
I->getNextNode();
17293assert(ScheduleEnd &&
"tried to vectorize a terminator?");
17294LLVM_DEBUG(
dbgs() <<
"SLP: initialize schedule region to " << *
I <<
"\n");
17297// Search up and down at the same time, because we don't know if the new 17298// instruction is above or below the existing scheduling region. 17299// Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted 17300// against the budget. Otherwise debug info could affect codegen. 17302 ++ScheduleStart->getIterator().getReverse();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
17311 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17312 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17313while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=
I &&
17315if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17316LLVM_DEBUG(
dbgs() <<
"SLP: exceeded schedule region size limit\n");
17323 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17324 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17326if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==
I)) {
17327assert(
I->getParent() == ScheduleStart->getParent() &&
17328"Instruction is in wrong basic block.");
17329 initScheduleData(
I, ScheduleStart,
nullptr, FirstLoadStoreInRegion);
17335assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==
I)) &&
17336"Expected to reach top of the basic block or instruction down the " 17338assert(
I->getParent() == ScheduleEnd->getParent() &&
17339"Instruction is in wrong basic block.");
17340 initScheduleData(ScheduleEnd,
I->getNextNode(), LastLoadStoreInRegion,
17342 ScheduleEnd =
I->getNextNode();
17343assert(ScheduleEnd &&
"tried to vectorize a terminator?");
17344LLVM_DEBUG(
dbgs() <<
"SLP: extend schedule region end to " << *
I <<
"\n");
17348void BoUpSLP::BlockScheduling::initScheduleData(
Instruction *FromI,
17350 ScheduleData *PrevLoadStore,
17351 ScheduleData *NextLoadStore) {
17352 ScheduleData *CurrentLoadStore = PrevLoadStore;
17354// No need to allocate data for non-schedulable instructions. 17357 ScheduleData *SD = ScheduleDataMap.lookup(
I);
17359 SD = allocateScheduleDataChunks();
17360 ScheduleDataMap[
I] = SD;
17362assert(!isInSchedulingRegion(SD) &&
17363"new ScheduleData already in scheduling region");
17364 SD->init(SchedulingRegionID,
I);
17366if (
I->mayReadOrWriteMemory() &&
17367 (!isa<IntrinsicInst>(
I) ||
17368 (cast<IntrinsicInst>(
I)->getIntrinsicID() != Intrinsic::sideeffect &&
17369 cast<IntrinsicInst>(
I)->getIntrinsicID() !=
17370 Intrinsic::pseudoprobe))) {
17371// Update the linked list of memory accessing instructions. 17372if (CurrentLoadStore) {
17373 CurrentLoadStore->NextLoadStore = SD;
17375 FirstLoadStoreInRegion = SD;
17377 CurrentLoadStore = SD;
17380if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
17381match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
17382 RegionHasStackSave =
true;
17384if (NextLoadStore) {
17385if (CurrentLoadStore)
17386 CurrentLoadStore->NextLoadStore = NextLoadStore;
17388 LastLoadStoreInRegion = CurrentLoadStore;
17392void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17393bool InsertInReadyList,
17395assert(SD->isSchedulingEntity());
17400while (!WorkList.
empty()) {
17402for (ScheduleData *BundleMember = SD; BundleMember;
17403 BundleMember = BundleMember->NextInBundle) {
17404assert(isInSchedulingRegion(BundleMember));
17405if (BundleMember->hasValidDependencies())
17410 BundleMember->Dependencies = 0;
17411 BundleMember->resetUnscheduledDeps();
17413// Handle def-use chain dependencies. 17414for (
User *U : BundleMember->Inst->
users()) {
17415if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17416 BundleMember->Dependencies++;
17417 ScheduleData *DestBundle = UseSD->FirstInBundle;
17418if (!DestBundle->IsScheduled)
17419 BundleMember->incrementUnscheduledDeps(1);
17420if (!DestBundle->hasValidDependencies())
17426auto *DepDest = getScheduleData(
I);
17427assert(DepDest &&
"must be in schedule window");
17428 DepDest->ControlDependencies.push_back(BundleMember);
17429 BundleMember->Dependencies++;
17430 ScheduleData *DestBundle = DepDest->FirstInBundle;
17431if (!DestBundle->IsScheduled)
17432 BundleMember->incrementUnscheduledDeps(1);
17433if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (isSafeToSpeculativelyExecute(I))
            continue;

          // Add the dependency
          MakeControlDependent(I);

          if (!isGuaranteedToTransferExecutionToSuccessor(I))
            // Everything past here must be control dependent on I.
            break;
        }
      }

      if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from reordering above a preceding stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              // Any allocas past here must be control dependent on I, and I
              // must be memory dependent on BundleMember->Inst.
              break;

            if (!isa<AllocaInst>(I))
              continue;

            // Add the dependency
            MakeControlDependent(I);
          }
        }

        // In addition to the cases handled just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Avoiding moving allocas below stackrestore is currently
        // thought to be conservatism. Moving loads/stores below a stackrestore
        // can lead to incorrect code.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;

            // Add the dependency
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
17503"NextLoadStore list for non memory effecting bundle?");
17505bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17506unsigned NumAliased = 0;
17507unsigned DistToSrc = 1;
17509for (; DepDest; DepDest = DepDest->NextLoadStore) {
17510assert(isInSchedulingRegion(DepDest));
17512// We have two limits to reduce the complexity: 17513// 1) AliasedCheckLimit: It's a small limit to reduce calls to 17514// SLP->isAliased (which is the expensive part in this loop). 17515// 2) MaxMemDepDistance: It's for very large blocks and it aborts 17516// the whole loop (even if the loop is fast, it's quadratic). 17517// It's important for the loop break condition (see below) to 17518// check this limit even between two read-only instructions. 17520 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17522 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17524// We increment the counter only if the locations are aliased 17525// (instead of counting all alias checks). This gives a better 17526// balance between reduced runtime and accurate dependencies. 17529 DepDest->MemoryDependencies.push_back(BundleMember);
17530 BundleMember->Dependencies++;
17531 ScheduleData *DestBundle = DepDest->FirstInBundle;
17532if (!DestBundle->IsScheduled) {
17533 BundleMember->incrementUnscheduledDeps(1);
17535if (!DestBundle->hasValidDependencies()) {
17540// Example, explaining the loop break condition: Let's assume our 17541// starting instruction is i0 and MaxMemDepDistance = 3. 17544// i0,i1,i2,i3,i4,i5,i6,i7,i8 17547// MaxMemDepDistance let us stop alias-checking at i3 and we add 17548// dependencies from i0 to i3,i4,.. (even if they are not aliased). 17549// Previously we already added dependencies from i3 to i6,i7,i8 17550// (because of MaxMemDepDistance). As we added a dependency from 17551// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 17552// and we can abort this loop at i6. 17558if (InsertInReadyList && SD->isReady()) {
17559 ReadyInsts.insert(SD);
17566void BoUpSLP::BlockScheduling::resetSchedule() {
17568"tried to reset schedule on block which has not been scheduled");
17569for (
Instruction *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
17570if (ScheduleData *SD = getScheduleData(
I)) {
17571assert(isInSchedulingRegion(SD) &&
17572"ScheduleData not in scheduling region");
17573 SD->IsScheduled =
false;
17574 SD->resetUnscheduledDeps();
17577 ReadyInsts.clear();
17580void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17581if (!BS->ScheduleStart)
17584LLVM_DEBUG(
dbgs() <<
"SLP: schedule block " << BS->BB->getName() <<
"\n");
17586// A key point - if we got here, pre-scheduling was able to find a valid 17587// scheduling of the sub-graph of the scheduling window which consists 17588// of all vector bundles and their transitive users. As such, we do not 17589// need to reschedule anything *outside of* that subgraph. 17591 BS->resetSchedule();
17593// For the real scheduling we use a more sophisticated ready-list: it is 17594// sorted by the original instruction location. This lets the final schedule 17595// be as close as possible to the original instruction order. 17596// WARNING: If changing this order causes a correctness issue, that means 17597// there is some missing dependence edge in the schedule data graph. 17598structScheduleDataCompare {
17599bool operator()(ScheduleData *SD1, ScheduleData *SD2)
const{
17600return SD2->SchedulingPriority < SD1->SchedulingPriority;
17603 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);
17626// Do the "real" scheduling. 17627while (!ReadyInsts.empty()) {
17628 ScheduleData *Picked = *ReadyInsts.begin();
17629 ReadyInsts.erase(ReadyInsts.begin());
17631// Move the scheduled instruction(s) to their dedicated places, if not 17633for (ScheduleData *BundleMember = Picked; BundleMember;
17634 BundleMember = BundleMember->NextInBundle) {
17638 LastScheduledInst = PickedInst;
17641 BS->schedule(Picked, ReadyInsts);
  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto Width = 0u;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load, update MaxWidth to reflect the
    // width of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and from the same basic block as the
    // user or the use is a PHI node, we add it to the worklist.
    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                 BinaryOperator, UnaryOperator>(I)) {
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent());
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V. Otherwise, return the
  // maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
17750// We can always demote constants. 17751if (
all_of(E.Scalars, IsaPred<Constant>))
17754unsigned OrigBitWidth =
17755DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17761// Check if the node was analyzed already and must keep its original bitwidth. 17762if (NodesToKeepBWs.
contains(E.Idx))
17765// If the value is not a vectorized instruction in the expression and not used 17766// by the insertelement instruction and not used in multiple vector nodes, it 17767// cannot be demoted. 17768bool IsSignedNode =
any_of(E.Scalars, [&](
Value *R) {
17769 if (isa<PoisonValue>(R))
17771 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17773auto IsPotentiallyTruncated = [&](
Value *V,
unsigned &
BitWidth) ->
bool {
17774if (isa<PoisonValue>(V))
17778// For lat shuffle of sext/zext with many uses need to check the extra bit 17779// for unsigned values, otherwise may have incorrect casting for reused 17782if ((!IsSignedNode || IsSignedVal) && OrigBitWidth >
BitWidth) {
17788unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17791if (
auto *
I = dyn_cast<Instruction>(V)) {
17793unsigned BitWidth2 =
17794 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17795while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17801 BitWidth1 = std::min(BitWidth1, BitWidth2);
17806auto FinalAnalysis = [&,
TTI =
TTI]() {
17807if (!IsProfitableToDemote)
17810 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(
BitWidth)));
17812if (Res && E.isGather()) {
17813// Check possible extractelement instructions bases and final vector 17816for (
Value *V : E.Scalars) {
17817auto *EE = dyn_cast<ExtractElementInst>(V);
17820 UniqueBases.
insert(EE->getVectorOperand());
17822constunsigned VF = E.Scalars.size();
17823Type *OrigScalarTy = E.Scalars.front()->getType();
17824if (UniqueBases.
size() <= 2 ||
17832if (E.isGather() || !Visited.
insert(&E).second ||
17834 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
17835 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17838return FinalAnalysis();
17841 return !all_of(V->users(), [=](User *U) {
17842 return getTreeEntry(U) ||
17843 (E.Idx == 0 && UserIgnoreList &&
17844 UserIgnoreList->contains(U)) ||
17845 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17846 !U->getType()->isScalableTy() &&
17847 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17848 }) && !IsPotentiallyTruncated(V,
BitWidth);
17855unsigned InitLevel = MaxDepthLevel;
17857unsigned Level = InitLevel;
17858if (!collectValuesToDemote(*
Op, IsProfitableToDemoteRoot,
BitWidth,
17859 ToDemote, Visited, NodesToKeepBWs, Level,
17860 IsProfitableToDemote, IsTruncRoot)) {
17861if (!IsProfitableToDemote)
17864if (!FinalAnalysis())
17868 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17872auto AttemptCheckBitwidth =
17874// Try all bitwidth < OrigBitWidth. 17876unsigned BestFailBitwidth = 0;
17878if (Checker(
BitWidth, OrigBitWidth))
17880if (BestFailBitwidth == 0 && FinalAnalysis())
17884if (BestFailBitwidth == 0) {
17895auto TryProcessInstruction =
17901 (void)
for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17904// Several vectorized uses? Check if we can truncate it, otherwise - 17906if (E.UserTreeIndices.size() > 1 &&
17907 !
all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17910bool NeedToExit =
false;
17911if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17915if (!ProcessOperands(
Operands, NeedToExit))
17922// Record the entry that we can demote. 17924return IsProfitableToDemote;
17926switch (E.getOpcode()) {
17928// We can always demote truncations and extensions. Since truncations can 17929// seed additional demotion, we save the truncated value. 17930case Instruction::Trunc:
17931if (IsProfitableToDemoteRoot)
17932 IsProfitableToDemote =
true;
17933return TryProcessInstruction(
BitWidth);
17934case Instruction::ZExt:
17935case Instruction::SExt:
17936 IsProfitableToDemote =
true;
17937return TryProcessInstruction(
BitWidth);
17939// We can demote certain binary operations if we can demote both of their 17941case Instruction::Add:
17942case Instruction::Sub:
17943case Instruction::Mul:
17944case Instruction::And:
17945case Instruction::Or:
17946case Instruction::Xor: {
17947return TryProcessInstruction(
17948BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17950case Instruction::Freeze:
17951return TryProcessInstruction(
BitWidth, getOperandEntry(&E, 0));
17952case Instruction::Shl: {
17953// If we are truncating the result of this SHL, and if it's a shift of an 17954// inrange amount, we can always perform a SHL in a smaller type. 17957 if (isa<PoisonValue>(V))
17959 auto *I = cast<Instruction>(V);
17960 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17961 return AmtKnownBits.getMaxValue().ult(BitWidth);
17964return TryProcessInstruction(
17965BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
17967case Instruction::LShr: {
17968// If this is a truncate of a logical shr, we can truncate it to a smaller 17969// lshr iff we know that the bits we would otherwise be shifting in are 17971auto LShrChecker = [&](
unsignedBitWidth,
unsigned OrigBitWidth) {
17973 if (isa<PoisonValue>(V))
17975 auto *I = cast<Instruction>(V);
17976 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17977 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17978 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17979 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
17980 SimplifyQuery(*DL));
17983return TryProcessInstruction(
17984BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17987case Instruction::AShr: {
17988// If this is a truncate of an arithmetic shr, we can truncate it to a 17989// smaller ashr iff we know that all the bits from the sign bit of the 17990// original type and the sign bit of the truncate type are similar. 17991auto AShrChecker = [&](
unsignedBitWidth,
unsigned OrigBitWidth) {
17993 if (isa<PoisonValue>(V))
17995 auto *I = cast<Instruction>(V);
17996 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17997 unsigned ShiftedBits = OrigBitWidth - BitWidth;
17998 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17999 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18003return TryProcessInstruction(
18004BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
18007case Instruction::UDiv:
18008case Instruction::URem: {
18009// UDiv and URem can be truncated if all the truncated bits are zero. 18010auto Checker = [&](
unsignedBitWidth,
unsigned OrigBitWidth) {
18013 auto *I = cast<Instruction>(V);
18014 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18015 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
18016 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18019return TryProcessInstruction(
18020BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
18023// We can demote selects if we can demote their true and false values. 18024case Instruction::Select: {
18025return TryProcessInstruction(
18026BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18029// We can demote phis if we can demote all their incoming operands. Note that 18030// we don't need to worry about cycles since we ensure single use above. 18031case Instruction::PHI: {
18032constunsigned NumOps = E.getNumOperands();
18035 std::bind(&BoUpSLP::getOperandEntry,
this, &E, _1));
18037return TryProcessInstruction(
BitWidth, Ops);
  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    SmallVector<const TreeEntry *> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations. The checker is
    // invoked for every legal candidate width; whenever the estimated
    // intrinsic cost improves, the candidate is remembered:
    //
    //   if (Cost < BestCost) {
    //     BestCost = Cost;
    //     BestBitWidth = BitWidth;
    //   }
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }
  // Otherwise, conservatively give up.
  default:
    break;
  }

  return FinalAnalysis();
}
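// Illustrative sketch (not from the original source) of what a successful
// demotion enables: in a bundle of
//   %t = trunc i32 (add (zext i8 %a to i32), (zext i8 %b to i32)) to i16
// only 9 bits of the add are significant, so the add can be performed
// directly on <VF x i16> lanes and the surrounding casts collapse.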
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 &&
       any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
              [NodeIdx](const EdgeInfo &EI) {
                return EI.UserTE->Idx > NodeIdx;
              })))
    return;

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // resize to the final type.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> ToDemote;
  SmallVector<unsigned> RootDemotes;
  DenseSet<unsigned> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    // Check if the root is trunc and the next node is gather/buildvector, then
    // keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    const TreeEntry *TE = getTreeEntry(U);
                    const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                    if (TE == UserTE || !TE)
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(UserTE->getMainOp()))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    auto It = MinBWs.find(TE);
                    if (It != MinBWs.end() && It->second.first > UserTESz)
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision. It would be safe to truncate the
    // roots of the expression to this width.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bits are not
    // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
    // True.
    // Determine if the sign bit of all the roots is known to be zero. If not,
    // IsKnownPositive is set to False.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type. Otherwise, if we know the sign bit is zero, we will
      // zero-extend the root instead.
      //
      // FIXME: This is somewhat suboptimal, as there will be cases where
      // adding one to the maximum bit width will yield a larger-than-necessary
      // type. In general, we need to add an extra bit only if we can't prove
      // that the upper bit of the original type is equal to the upper bit of
      // the proposed smaller type. If these two bits are the same (either zero
      // or one) we know that sign-extending from the smaller type will result
      // in the same value. Here, since we can't yet prove this, we are just
      // making the proposed smaller type larger to ensure correctness.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;
    // If the original type is large, but the reduced type does not improve the
    // register usage - ignore it.
    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote and
    // additional roots that require investigating in Roots.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
           (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
            DL->getTypeSizeInBits(TreeRootIT) /
                    DL->getTypeSizeInBits(
                        E.getMainOp()->getOperand(0)->getType()) >
                2))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };
  // If we can truncate the root, we must collect additional values that might
  // be demoted as a result. That is, those seeded by truncations we will
  // modify.
  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        unsigned BitWidth2 = BitWidth1;
        if (APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
            !Mask.isAllOnes())
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
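  // Illustrative sketch (not from the original source): for a reduction such
  // as
  //   %v = zext <8 x i1> %m to <8 x i32>
  //   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v)
  // only one significant bit per lane is reduced, so ReductionBitWidth can be
  // set to 1 and the whole reduction later lowered via ctpop on the bitcast
  // mask, as the comment above describes.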
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }
    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            unsigned OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return (EI.UserTE->hasState() &&
                           EI.UserTE->getOpcode() == Instruction::ICmp) &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC &&
                                   (IC->isSigned() ||
                                    !isKnownNonNegative(IC->getOperand(0),
                                                        SimplifyQuery(*DL)) ||
                                    !isKnownNonNegative(IC->getOperand(1),
                                                        SimplifyQuery(*DL)));
                          });
                 });
    }
    // If the maximum bit width we compute is less than the width of the roots'
    // type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
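// Illustrative sketch (not from the original source): after this analysis,
// MinBWs might map the tree entry for a bundle of i32 adds to {16, /*IsSigned*/
// false}, meaning codegen is allowed to build the bundle as <VF x i16> and
// zero-extend once at the root instead of operating on <VF x i32>.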
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  DL = &F.getDataLayout();

  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
  // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
  // delete instructions.

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // Start new block - clear the list of reduction roots.
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed)
    R.optimizeGatherSequence();
  return Changed;
}
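// Usage note (assumption, not stated in this file): the per-block seeding and
// vectorization above is what runs for a standalone invocation such as
//   opt -passes=slp-vectorizer -S input.ll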
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e.
    // almost all vector lanes are used.
    return false;
  }

  SmallPtrSet<Value *, 16> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  if (Cost < -SLPCostThreshold) {
    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  const unsigned Mean =
      std::accumulate(Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                      [&](uint64_t V,
                          const std::pair<unsigned, unsigned> &Val) {
                        unsigned Size = First ? Val.first : Val.second;
                        return V + Size;
                      }) /
      Sizes.size();
  if (Mean == 0)
    return true;
  const uint64_t Dev =
      std::accumulate(Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                      [&](uint64_t V,
                          const std::pair<unsigned, unsigned> &Val) {
                        unsigned P = First ? Val.first : Val.second;
                        if (P == 0)
                          return V;
                        return V + (P - Mean) * (P - Mean);
                      }) /
      Sizes.size();
  return Dev * 81 / (Mean * Mean) == 0;
}
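// Note on the integer arithmetic above: with unsigned division, the condition
// "Dev * 81 / (Mean * Mean) == 0" holds exactly when Dev * 81 < Mean * Mean,
// i.e. the average squared deviation is below (Mean / 9)^2. For example, sizes
// {4, 4, 5} give Mean = 4 and Dev = 0 after integer division, so the check
// passes.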
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store twice.
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, Distance of the store
  // address relative to base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18725// Collect the chain into a list. 18729 PrevDist =
Data.second;
18730if (
Idx !=
Set.size() - 1)
18735Operands.push_back(Stores[DataVar.first]);
18736 PrevDist = DataVar.second;
18741 .
insert({Operands.front(),
18742 cast<StoreInst>(Operands.front())->getValueOperand(),
18744 cast<StoreInst>(Operands.back())->getValueOperand(),
18749unsigned MaxVecRegSize =
R.getMaxVecRegSize();
18750unsigned EltSize =
R.getVectorElementSize(
Operands[0]);
18754 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18756Type *StoreTy =
Store->getValueOperand()->getType();
18757Type *ValueTy = StoreTy;
18758if (
auto *Trunc = dyn_cast<TruncInst>(
Store->getValueOperand()))
18759 ValueTy = Trunc->getSrcTy();
18760unsigned MinVF = std::max<unsigned>(
18762R.getMinVF(
DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18765if (MaxVF < MinVF) {
18766LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
18768 <<
"MinVF (" << MinVF <<
")\n");
18772unsigned NonPowerOf2VF = 0;
18774// First try vectorizing with a non-power-of-2 VF. At the moment, only 18775// consider cases where VF + 1 is a power-of-2, i.e. almost all vector 18777unsigned CandVF = std::clamp<unsigned>(
Operands.size(), MinVF, MaxVF);
18779 NonPowerOf2VF = CandVF;
18780assert(NonPowerOf2VF != MaxVF &&
18781"Non-power-of-2 VF should not be equal to MaxVF");
18785unsigned MaxRegVF = MaxVF;
18787if (MaxVF < MinVF) {
18788LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
18790 <<
"MinVF (" << MinVF <<
")\n");
18796unsignedSize = MinVF;
18798 VF =
Size > MaxVF ? NonPowerOf2VF :
Size;
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return Size > (First ? P.first : P.second);
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
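      // Note (inferred from the helpers above, not stated in the original):
      // each RangeSizes entry keeps two per-store states, with .first consulted
      // for attempts whose VF is at least MaxRegVF and .second for smaller VFs;
      // 0 means "already vectorized", while a non-zero value records the best
      // tree size seen so far for that store.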
18827bool RepeatChanged =
false;
18828bool AnyProfitableGraph =
false;
18829for (
unsignedSize : CandidateVFs) {
18830 AnyProfitableGraph =
false;
18831unsigned StartIdx = std::distance(
18832 RangeSizes.begin(),
18833find_if(RangeSizes, std::bind(IsNotVectorized,
Size >= MaxRegVF,
18834 std::placeholders::_1)));
18835while (StartIdx <
End) {
18837 std::distance(RangeSizes.begin(),
18838find_if(RangeSizes.drop_front(StartIdx),
18839 std::bind(IsVectorized,
Size >= MaxRegVF,
18840 std::placeholders::_1)));
18841unsigned Sz = EndIdx >=
End ?
End : EndIdx;
18842for (
unsigned Cnt = StartIdx; Cnt +
Size <= Sz;) {
18844Size >= MaxRegVF)) {
18851return cast<StoreInst>(V)
18852 ->getValueOperand()
18854 cast<StoreInst>(Slice.
front())
18855 ->getValueOperand()
18858"Expected all operands of same type.");
18859if (!NonSchedulable.empty()) {
18860auto [NonSchedSizeMax, NonSchedSizeMin] =
18861 NonSchedulable.lookup(Slice.
front());
18862if (NonSchedSizeMax > 0 && NonSchedSizeMin <=
Size) {
18863 Cnt += NonSchedSizeMax;
18868 std::optional<bool> Res =
18869 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18873 .first->getSecond()
18876// Mark the vectorized stores so that we don't vectorize them 18879// Mark the vectorized stores so that we don't vectorize them 18881 AnyProfitableGraph = RepeatChanged = Changed =
true;
18882// If we vectorized initial block, no need to try to vectorize 18885 [](std::pair<unsigned, unsigned> &
P) {
18886 P.first = P.second = 0;
18888if (Cnt < StartIdx + MinVF) {
18889for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18890 [](std::pair<unsigned, unsigned> &
P) {
18891 P.first = P.second = 0;
18893 StartIdx = Cnt +
Size;
18895if (Cnt > Sz -
Size - MinVF) {
18897 [](std::pair<unsigned, unsigned> &
P) {
18898 P.first = P.second = 0;
18907if (
Size > 2 && Res &&
18909 std::bind(VFIsProfitable,
Size >= MaxRegVF, TreeSize,
18910 std::placeholders::_1))) {
18914// Check for the very big VFs that we're not rebuilding same 18915// trees, just with larger number of elements. 18916if (
Size > MaxRegVF && TreeSize > 1 &&
18918 std::bind(FirstSizeSame, TreeSize,
18919 std::placeholders::_1))) {
18921while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18927 [&](std::pair<unsigned, unsigned> &
P) {
18928 if (Size >= MaxRegVF)
18929 P.second = std::max(P.second, TreeSize);
18931 P.first = std::max(P.first, TreeSize);
18934 AnyProfitableGraph =
true;
18938if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18939 AnyProfitableGraph =
true;
18940 StartIdx = std::distance(
18941 RangeSizes.begin(),
18942find_if(RangeSizes.drop_front(Sz),
18943 std::bind(IsNotVectorized,
Size >= MaxRegVF,
18944 std::placeholders::_1)));
18949// All values vectorized - exit. 18950if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
18951returnP.first == 0 &&
P.second == 0;
18954// Check if tried all attempts or no need for the last attempts at all. 18955if (Repeat >= MaxAttempts ||
18956 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18958constexprunsigned StoresLimit = 64;
18959constunsigned MaxTotalNum = std::min<unsigned>(
18961static_cast<unsigned>(
18964 RangeSizes.begin(),
18965find_if(RangeSizes, std::bind(IsNotVectorized,
true,
18966 std::placeholders::_1))) +
18968unsigned VF =
bit_ceil(CandidateVFs.front()) * 2;
18971 CandidateVFs.clear();
18973 CandidateVFs.push_back(Limit);
18974if (VF > MaxTotalNum || VF >= StoresLimit)
18976for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &
P) {
18978P.first = std::max(
P.second,
P.first);
18980// Last attempt to vectorize max number of elements, if all previous 18981// attempts were unsuccessful because of the cost issues. 18982 CandidateVFs.push_back(VF);
18987// Stores pair (first: index of the store into Stores array ref, address of 18988// which taken as base, second: sorted set of pairs {index, dist}, which are 18989// indices of stores in the set and their store location distances relative to 18990// the base address). 18992// Need to store the index of the very first store separately, since the set 18993// may be reordered after the insertion and the first store may be moved. This 18994// container allows to reduce number of calls of getPointersDiff() function. 18996// Inserts the specified store SI with the given index Idx to the set of the 18997// stores. If the store with the same distance is found already - stop 18998// insertion, try to vectorize already found stores. If some stores from this 18999// sequence were not vectorized - try to vectorize them with the new store 19000// later. But this logic is applied only to the stores, that come before the 19001// previous store with the same distance. 19008// - Scan this from the last to first store. The very first bunch of stores is 19009// {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores 19011// - The next store in the list - #1 - has the same distance from store #5 as 19013// - Try to vectorize sequence of stores 4,2,3,5. 19014// - If all these stores are vectorized - just drop them. 19015// - If some of them are not vectorized (say, #3 and #5), do extra analysis. 19016// - Start new stores sequence. 19017// The new bunch of stores is {1, {1, 0}}. 19018// - Add the stores from previous sequence, that were not vectorized. 19019// Here we consider the stores in the reversed order, rather they are used in 19020// the IR (Stores are reversed already, see vectorizeStoreChains() function). 19021// Store #3 can be added -> comes after store #4 with the same distance as 19023// Store #5 cannot be added - comes before store #4. 19024// This logic allows to improve the compile time, we assume that the stores 19025// after previous store with the same distance most likely have memory 19026// dependencies and no need to waste compile time to try to vectorize them. 19027// - Try to vectorize the sequence {1, {1, 0}, {3, 2}}. 19029for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19031 Stores[
Set.first]->getValueOperand()->getType(),
19032 Stores[
Set.first]->getPointerOperand(),
19033SI->getValueOperand()->getType(),
SI->getPointerOperand(), *
DL, *SE,
19034/*StrictCheck=*/true);
19037auto It =
Set.second.find(std::make_pair(
Idx, *Diff));
19038if (It ==
Set.second.end()) {
19039Set.second.emplace(
Idx, *Diff);
19042// Try to vectorize the first found set to avoid duplicate analysis. 19043 TryToVectorize(
Set.second);
19044unsigned ItIdx = It->first;
19045int ItDist = It->second;
19046 StoreIndexToDistSet PrevSet;
19047copy_if(
Set.second, std::inserter(PrevSet, PrevSet.end()),
19048 [&](
const std::pair<unsigned, int> &Pair) {
19049 return Pair.first > ItIdx;
19053Set.second.emplace(
Idx, 0);
19054// Insert stores that followed previous match to try to vectorize them 19056unsigned StartIdx = ItIdx + 1;
19058// Distances to previously found dup store (or this store, since they 19059// store to the same addresses). 19061for (
const std::pair<unsigned, int> &Pair :
reverse(PrevSet)) {
19062// Do not try to vectorize sequences, we already tried. 19063if (VectorizedStores.
contains(Stores[Pair.first]))
19065unsigned BI = Pair.first - StartIdx;
19066 UsedStores.set(BI);
19067 Dists[BI] = Pair.second - ItDist;
19069for (
unsignedI = StartIdx;
I <
Idx; ++
I) {
19070unsigned BI =
I - StartIdx;
19071if (UsedStores.test(BI))
19072Set.second.emplace(
I, Dists[BI]);
19076auto &Res = SortedStores.emplace_back();
19078 Res.second.emplace(
Idx, 0);
19080Type *PrevValTy =
nullptr;
19082if (
R.isDeleted(SI))
19085 PrevValTy =
SI->getValueOperand()->getType();
19086// Check that we do not try to vectorize stores of different types. 19087if (PrevValTy !=
SI->getValueOperand()->getType()) {
19088for (
auto &Set : SortedStores)
19089 TryToVectorize(
Set.second);
19090 SortedStores.clear();
19091 PrevValTy =
SI->getValueOperand()->getType();
19093 FillStoresSet(
I, SI);
19096// Final vectorization attempt. 19097for (
auto &Set : SortedStores)
19098 TryToVectorize(
Set.second);
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
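// Illustrative sketch (not from the original source): for a block containing
//   store i32 %a, ptr %p
//   store i32 %b, ptr %q   ; %q is a gep of %p
// both stores land in the same Stores bucket (same underlying object), which
// is exactly the seed shape vectorizeStoreChains() later tries to turn into a
// single <2 x i32> store.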
19145LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = " 19146 << VL.
size() <<
".\n");
19148// Check that all of the parts are instructions of the same type, 19149// we permit an alternate opcode via InstructionsState. 19155// Make sure invalid types (including vector type) are rejected before 19156// determining vectorization factor for scalar instructions. 19157for (
Value *V : VL) {
19158Type *Ty =
V->getType();
19160// NOTE: the following will give user internal llvm type name, which may 19162R.getORE()->emit([&]() {
19163 std::string TypeStr;
19167 <<
"Cannot SLP vectorize list: type " 19168 << TypeStr +
" is unsupported by vectorizer";
19175unsigned Sz =
R.getVectorElementSize(I0);
19176unsigned MinVF =
R.getMinVF(Sz);
19177unsigned MaxVF = std::max<unsigned>(
19179 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19181R.getORE()->emit([&]() {
19183 <<
"Cannot SLP vectorize list: vectorization factor " 19184 <<
"less than 2 is not supported";
19189bool Changed =
false;
19190bool CandidateFound =
false;
19193unsigned NextInst = 0, MaxInst = VL.size();
19194for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19196// No actual vectorization should happen, if number of parts is the same as 19197// provided vectorization factor (i.e. the scalar type is used for vector 19198// code during codegen). 19202for (
unsignedI = NextInst;
I < MaxInst; ++
I) {
19203unsigned ActualVF = std::min(MaxInst -
I, VF);
19208if (MaxVFOnly && ActualVF < MaxVF)
19210if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
19215for (
Value *V : VL.drop_front(
I)) {
19216// Check that a previous iteration of this loop did not delete the 19218if (
auto *Inst = dyn_cast<Instruction>(V);
19219 !Inst || !
R.isDeleted(Inst)) {
19222if (
Idx == ActualVF)
19226// Not enough vectorizable instructions - exit. 19227if (
Idx != ActualVF)
19230LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations " 19234if (
R.isTreeTinyAndNotFullyVectorizable())
19236R.reorderTopToBottom();
19237R.reorderBottomToTop(
19238/*IgnoreReorder=*/!isa<InsertElementInst>(Ops.
front()) &&
19239 !
R.doesRootHaveInTreeUses());
19241R.buildExternalUses();
19243R.computeMinimumValueSizes();
19245 CandidateFound =
true;
19246 MinCost = std::min(MinCost,
Cost);
19249 <<
" for VF=" << ActualVF <<
"\n");
19253 cast<Instruction>(Ops[0]))
19254 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
19255 <<
" and with tree size " 19256 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
19259// Move to the next bundle. 19267if (!Changed && CandidateFound) {
19268R.getORE()->emit([&]() {
19270 <<
"List vectorization was possible but not beneficial with cost " 19271 <<
ore::NV(
"Cost", MinCost) <<
" >= " 19274 }
elseif (!Changed) {
19275R.getORE()->emit([&]() {
19277 <<
"Cannot SLP vectorize list: vectorization was impossible" 19278 <<
" with available vectorization factors";
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
19338/// Model horizontal reductions. 19340/// A horizontal reduction is a tree of reduction instructions that has values 19341/// that can be put into a vector as its leaves. For example: 19348/// This tree has "mul" as its leaf values and "+" as its reduction 19349/// instructions. A reduction can feed into a store or a binary operation 19367 ReductionOpsListType ReductionOps;
19368 /// List of possibly reduced values. 19370 /// Maps reduced value to the corresponding reduction operation. 19373 /// The type of reduction operation. 19375 /// Checks if the optimization of original scalar identity operations on 19376 /// matched horizontal reductions is enabled and allowed. 19377bool IsSupportedHorRdxIdentityOp =
false;
19384// And/or are potentially poison-safe logical patterns like: 19385// select x, y, false 19386// select x, true, y 19388return isa<SelectInst>(
I) &&
19392 /// Checks if instruction is associative and can be vectorized. 19394if (Kind == RecurKind::None)
19397// Integer ops that map to select instructions or intrinsics are fine. 19402if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19403// FP min/max are associative except for NaN and -0.0. We do not 19404// have to rule out -0.0 here because the intrinsic semantics do not 19405// specify a fixed result for it. 19406returnI->getFastMathFlags().noNaNs();
19409if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19412returnI->isAssociative();
19416// Poison-safe 'or' takes the form: select X, true, Y 19417// To make that work with the normal operand processing, we skip the 19418// true value operand. 19419// TODO: Change the code and data structures to handle this without a hack. 19421returnI->getOperand(2);
19422returnI->getOperand(
Index);
19425 /// Creates reduction operation with the current opcode. 19429case RecurKind::Or: {
19437case RecurKind::And: {
19445case RecurKind::Add:
19446case RecurKind::Mul:
19447case RecurKind::Xor:
19448case RecurKind::FAdd:
19449case RecurKind::FMul: {
19454case RecurKind::SMax:
19455case RecurKind::SMin:
19456case RecurKind::UMax:
19457case RecurKind::UMin:
19464case RecurKind::FMax:
19465case RecurKind::FMin:
19466case RecurKind::FMaximum:
19467case RecurKind::FMinimum: {
19476 /// Creates reduction operation with the current opcode with the IR flags 19477 /// from \p ReductionOps, dropping nuw/nsw flags. 19480const ReductionOpsListType &ReductionOps) {
19481bool UseSelect = ReductionOps.size() == 2 ||
19483 (ReductionOps.size() == 1 &&
19484any_of(ReductionOps.front(), IsaPred<SelectInst>));
19485assert((!UseSelect || ReductionOps.size() != 2 ||
19486 isa<SelectInst>(ReductionOps[1][0])) &&
19487"Expected cmp + select pairs for reduction");
19490if (
auto *Sel = dyn_cast<SelectInst>(
Op)) {
19492/*IncludeWrapFlags=*/false);
19494/*IncludeWrapFlags=*/false);
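  // Illustrative note (not from the original source): createOp() above emits
  // either a plain binary operator (e.g. "add %lhs, %rhs" for RecurKind::Add)
  // or the matching min/max intrinsic / cmp+select pair for the integer and
  // FP min/max kinds, so callers never special-case the reduction kind when
  // stitching partial results together.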
  /// Determines the recurrence kind implemented by the given instruction.
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    // This matches either cmp+select or intrinsics. SLP is expected to handle
    // either form.
    // TODO: If we are canonicalizing to intrinsics, we can remove several
    // special-case paths that deal with selects.
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for min/max pattern based on instructions producing
      // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
      // During the intermediate stages of SLP, it's very common to have
      // pattern like this (since optimizeGatherSequence is run only once
      // at the end):
      // %1 = extractelement <2 x i32> %a, i32 0
      // %2 = extractelement <2 x i32> %a, i32 1
      // %cond = icmp sgt i32 %1, %2
      // %3 = extractelement <2 x i32> %a, i32 0
      // %4 = extractelement <2 x i32> %a, i32 1
      // %select = select i1 %cond, i32 %3, i32 %4
      CmpInst::Predicate Pred;
      Instruction *L1;
      Instruction *L2;

      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      Value *Cond = Select->getCondition();

      // TODO: Support inverse predicates.
      if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
        if (!isa<ExtractElementInst>(RHS) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
        if (!isa<ExtractElementInst>(LHS) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)))
          return RecurKind::None;
      } else {
        if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
          return RecurKind::None;
        if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      }

      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
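  // Illustrative sketch (not from the original source): scalar code such as
  //   %m = select i1 (icmp sgt i32 %x, %y), i32 %x, i32 %y
  // is classified as RecurKind::SMax, while a plain chain of adds
  // (%a + %b + %c + %d) is classified as RecurKind::Add.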
  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
  }
19668 /// Try to find a reduction tree. 19672 RdxKind = HorizontalReduction::getRdxKind(Root);
19673if (!isVectorizable(RdxKind, Root))
19676// Analyze "regular" integer/FP types for reductions - no target-specific 19677// types or pointers. 19682// Though the ultimate reduction may have multiple uses, its condition must 19683// have only single use. 19684if (
auto *Sel = dyn_cast<SelectInst>(Root))
19685if (!Sel->getCondition()->hasOneUse())
19688 ReductionRoot = Root;
19690// Iterate through all the operands of the possible reduction tree and 19691// gather all the reduced values, sorting them by their value id. 19693bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19695 1, std::make_pair(Root, 0));
19696// Checks if the operands of the \p TreeN instruction are also reduction 19697// operations or should be treated as reduced values or an extra argument, 19698// which is not part of the reduction. 19703for (
intI :
reverse(seq<int>(getFirstOperandIndex(TreeN),
19704 getNumberOfOperands(TreeN)))) {
19705Value *EdgeVal = getRdxOperand(TreeN,
I);
19706 ReducedValsToOps[EdgeVal].push_back(TreeN);
19707auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19708// If the edge is not an instruction, or it is different from the main 19709// reduction opcode or has too many uses - possible reduced value. 19710// Also, do not try to reduce const values, if the operation is not 19714 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19715 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19716 !isVectorizable(RdxKind, EdgeInst) ||
19717 (
R.isAnalyzedReductionRoot(EdgeInst) &&
19718all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19719 PossibleReducedVals.push_back(EdgeVal);
19722 ReductionOps.push_back(EdgeInst);
19725// Try to regroup reduced values so that it gets more profitable to try to 19726// reduce them. Values are grouped by their value ids, instructions - by 19727// instruction op id and/or alternate op id, plus do extra analysis for 19728// loads (grouping them by the distabce between pointers) and cmp 19729// instructions (grouping them by the predicate). 19733 PossibleReducedVals;
19734 initReductionOps(Root);
19738auto GenerateLoadsSubkey = [&](
size_tKey,
LoadInst *LI) {
19742if (!LoadKeyUsed.
insert(Key).second) {
19743auto LIt = LoadsMap.
find(std::make_pair(Key,
Ptr));
19744if (LIt != LoadsMap.
end()) {
19745for (
LoadInst *RLI : LIt->second) {
19748/*StrictCheck=*/true))
19751for (
LoadInst *RLI : LIt->second) {
19758if (LIt->second.size() > 2) {
19760hash_value(LIt->second.back()->getPointerOperand());
19766 .first->second.push_back(LI);
19770while (!Worklist.empty()) {
19771auto [TreeN, Level] = Worklist.pop_back_val();
19774 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19775 addReductionOps(TreeN);
19776// Add reduction values. The values are sorted for better vectorization 19778for (
Value *V : PossibleRedVals) {
19781/*AllowAlternate=*/false);
19782 ++PossibleReducedVals[
Key][
Idx]
19783 .
insert(std::make_pair(V, 0))
19787 Worklist.emplace_back(
I,
I->getParent() == BB ? 0 : Level + 1);
19789auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
19790// Sort values by the total number of values kinds to start the reduction 19791// from the longest possible reduced values sequences. 19792for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
19793auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
19795for (
auto It = PossibleRedVals.begin(),
E = PossibleRedVals.end();
19798auto RedValsVect = It->second.takeVector();
19800for (
const std::pair<Value *, unsigned> &Data : RedValsVect)
19801 PossibleRedValsVect.
back().append(Data.second, Data.first);
19803stable_sort(PossibleRedValsVect, [](
constauto &P1,
constauto &P2) {
19804returnP1.size() > P2.size();
19809 (!isGoodForReduction(Data) &&
19810 (!isa<LoadInst>(Data.front()) ||
19811 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19813 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19815 cast<LoadInst>(ReducedVals[NewIdx].front())
19817 NewIdx = ReducedVals.
size();
19820 ReducedVals[NewIdx].
append(Data.rbegin(), Data.rend());
19823// Sort the reduced values by number of same/alternate opcode and/or pointer 19826returnP1.size() > P2.
size();
19831 /// Attempt to vectorize the tree found by matchAssociativeReduction. 19835constexprunsigned RegMaxNumber = 4;
19836constexprunsigned RedValsMaxNumber = 128;
19837// If there are a sufficient number of reduction values, reduce 19838// to a nearby power-of-2. We can safely generate oversized 19839// vectors and rely on the backend to split them to legal sizes. 19840if (
unsigned NumReducedVals = std::accumulate(
19841 ReducedVals.
begin(), ReducedVals.
end(), 0,
19843 if (!isGoodForReduction(Vals))
19845 return Num + Vals.size();
19847 NumReducedVals < ReductionLimit &&
19851for (ReductionOpsType &RdxOps : ReductionOps)
19852for (
Value *RdxOp : RdxOps)
19853V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19861// Track the reduced values in case if they are replaced by extractelement 19862// because of the vectorization. 19864 ReducedVals.
front().size());
19866// The compare instruction of a min/max is the insertion point for new 19867// instructions and may be replaced with a new compare instruction. 19868auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
19869assert(isa<SelectInst>(RdxRootInst) &&
19870"Expected min/max reduction to have select root instruction");
19871Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19872assert(isa<Instruction>(ScalarCond) &&
19873"Expected min/max reduction to have compare condition");
19874return cast<Instruction>(ScalarCond);
19877bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
19878 return isBoolLogicOp(cast<Instruction>(V));
19880// Return new VectorizedTree, based on previous value. 19881auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
19882if (VectorizedTree) {
19883// Update the final value in the reduction. 19885 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19886if (AnyBoolLogicOp) {
19887auto It = ReducedValsToOps.
find(VectorizedTree);
19888auto It1 = ReducedValsToOps.
find(Res);
19889if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
19891 (It != ReducedValsToOps.
end() &&
19893 return isBoolLogicOp(I) &&
19894 getRdxOperand(I, 0) == VectorizedTree;
19898 (It1 != ReducedValsToOps.
end() &&
19900 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19904 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
19908return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
19911// Initialize the final value in the reduction. 19915 ReductionOps.front().size());
19916for (ReductionOpsType &RdxOps : ReductionOps)
19917for (
Value *RdxOp : RdxOps) {
19920 IgnoreList.insert(RdxOp);
19922// Intersect the fast-math-flags from all reduction operations. 19925for (
Value *U : IgnoreList)
19926if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
19927 RdxFMF &= FPMO->getFastMathFlags();
19928bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19930// Need to track reduced vals, they may be changed during vectorization of 19933for (
Value *V : Candidates)
19934 TrackedVals.try_emplace(V, V);
19938auto *It = MV.
find(V);
19939assert(It != MV.
end() &&
"Unable to find given key.");
19944// List of the values that were reduced in other trees as part of gather 19945// nodes and thus requiring extract if fully vectorized in other trees. 19948bool CheckForReusedReductionOps =
false;
19949// Try to vectorize elements based on their type. 19953for (
unsignedI = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
19955 InstructionsState S = States[
I];
19959for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
19960Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19961// Check if the reduction value was not overriden by the extractelement 19962// instruction because of the vectorization and exclude it, if it is not 19963// compatible with other values. 19964// Also check if the instruction was folded to constant/other value. 19965auto *Inst = dyn_cast<Instruction>(RdxVal);
19967 (!S || !S.isOpcodeOrAlt(Inst))) ||
19971 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19973bool ShuffledExtracts =
false;
19974// Try to handle shuffled extractelements. 19975if (S && S.getOpcode() == Instruction::ExtractElement &&
19976 !S.isAltShuffle() &&
I + 1 <
E) {
19978for (
Value *RV : ReducedVals[
I + 1]) {
19979Value *RdxVal = TrackedVals.at(RV);
19980// Check if the reduction value was not overriden by the 19981// extractelement instruction because of the vectorization and 19982// exclude it, if it is not compatible with other values. 19983auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19986 CommonCandidates.push_back(RdxVal);
19987 TrackedToOrig.try_emplace(RdxVal, RV);
19992 Candidates.
swap(CommonCandidates);
19993 ShuffledExtracts =
true;
19997// Emit code for constant values. 20000Value *OrigV = TrackedToOrig.at(Candidates.
front());
20001 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20003 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
20004Value *OrigV = TrackedToOrig.at(VC);
20005 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20006if (
auto *ResI = dyn_cast<Instruction>(Res))
20007V.analyzedReductionRoot(ResI);
20009 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
20013unsigned NumReducedVals = Candidates.
size();
20014if (NumReducedVals < ReductionLimit &&
20015 (NumReducedVals < 2 || !
isSplat(Candidates)))
20018// Check if we support repeated scalar values processing (optimization of 20019// original scalar identity operations on matched horizontal reductions). 20020 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20021 RdxKind != RecurKind::FMul &&
20022 RdxKind != RecurKind::FMulAdd;
20023// Gather same values. 20025if (IsSupportedHorRdxIdentityOp)
20026for (
Value *V : Candidates) {
20027Value *OrigV = TrackedToOrig.at(V);
20028 ++SameValuesCounter.
try_emplace(OrigV).first->second;
20030// Used to check if the reduced values used same number of times. In this 20031// case the compiler may produce better code. E.g. if reduced values are 20032// aabbccdd (8 x values), then the first node of the tree will have a node 20033// for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>. 20034// Plus, the final reduction will be performed on <8 x aabbccdd>. 20035// Instead compiler may build <4 x abcd> tree immediately, + reduction (4 20037// Currently it only handles add/fadd/xor. and/or/min/max do not require 20038// this analysis, other operations may require an extra estimation of 20039// the profitability. 20040bool SameScaleFactor =
false;
20041bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20042 SameValuesCounter.
size() != Candidates.size();
20044if (OptReusedScalars) {
20046 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20047 RdxKind == RecurKind::Xor) &&
20049 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
20050returnP.second == SameValuesCounter.
front().second;
20052 Candidates.resize(SameValuesCounter.
size());
20053transform(SameValuesCounter, Candidates.begin(),
20054 [&](
constauto &
P) { return TrackedVals.at(P.first); });
20055 NumReducedVals = Candidates.size();
20056// Have a reduction of the same element. 20057if (NumReducedVals == 1) {
20058Value *OrigV = TrackedToOrig.at(Candidates.front());
20059unsigned Cnt = At(SameValuesCounter, OrigV);
20061 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20062 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20063 VectorizedVals.try_emplace(OrigV, Cnt);
20064 ExternallyUsedValues.
insert(OrigV);
20069unsigned MaxVecRegSize =
V.getMaxVecRegSize();
20070unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
20071constunsigned MaxElts = std::clamp<unsigned>(
20073 RegMaxNumber * RedValsMaxNumber);
20075unsigned ReduxWidth = NumReducedVals;
20076auto GetVectorFactor = [&, &
TTI = *
TTI](
unsigned ReduxWidth) {
20077unsigned NumParts, NumRegs;
20078Type *ScalarTy = Candidates.front()->getType();
20085while (NumParts > NumRegs) {
20086assert(ReduxWidth > 0 &&
"ReduxWidth is unexpectedly 0.");
20093if (NumParts > NumRegs / 2)
20098 ReduxWidth = GetVectorFactor(ReduxWidth);
20099 ReduxWidth = std::min(ReduxWidth, MaxElts);
20102unsigned Pos = Start;
20103// Restarts vectorization attempt with lower vector factor. 20104unsigned PrevReduxWidth = ReduxWidth;
20105bool CheckForReusedReductionOpsLocal =
false;
20106auto AdjustReducedVals = [&](
bool IgnoreVL =
false) {
20107bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
20108if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20109// Check if any of the reduction ops are gathered. If so, worth 20110// trying again with less number of reduction ops. 20111 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20114if (Pos < NumReducedVals - ReduxWidth + 1)
20115return IsAnyRedOpGathered;
20119 ReduxWidth = GetVectorFactor(ReduxWidth);
20120return IsAnyRedOpGathered;
20122bool AnyVectorized =
false;
20124while (Pos < NumReducedVals - ReduxWidth + 1 &&
20125 ReduxWidth >= ReductionLimit) {
20126// Dependency in tree of the reduction ops - drop this attempt, try 20128if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20130 CheckForReusedReductionOps =
true;
20133 PrevReduxWidth = ReduxWidth;
20135// Been analyzed already - skip. 20136if (IgnoredCandidates.
contains(std::make_pair(Pos, ReduxWidth)) ||
20139 std::make_pair(Pos,
bit_floor(ReduxWidth))) ||
20141 std::make_pair(Pos + (ReduxWidth -
bit_floor(ReduxWidth)),
20143V.areAnalyzedReductionVals(VL)) {
20144 (void)AdjustReducedVals(
/*IgnoreVL=*/true);
20147// Early exit if any of the reduction values were deleted during 20148// previous vectorization attempts. 20150auto *RedValI = dyn_cast<Instruction>(RedVal);
20153returnV.isDeleted(RedValI);
20156V.buildTree(VL, IgnoreList);
20157if (
V.isTreeTinyAndNotFullyVectorizable(
/*ForReduction=*/true)) {
20158if (!AdjustReducedVals())
20159V.analyzedReductionVals(VL);
20162if (
V.isLoadCombineReductionCandidate(RdxKind)) {
20163if (!AdjustReducedVals())
20164V.analyzedReductionVals(VL);
20167V.reorderTopToBottom();
20168// No need to reorder the root node at all. 20169V.reorderBottomToTop(
/*IgnoreReorder=*/true);
20170// Keep extracted other reduction values, if they are used in the 20171// vectorization trees. 20173 ExternallyUsedValues);
20174// The reduction root is used as the insertion point for new 20175// instructions, so set it as externally used to prevent it from being 20177 LocalExternallyUsedValues.insert(ReductionRoot);
20178for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
20179if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
20181for (
Value *V : ReducedVals[Cnt])
20182if (isa<Instruction>(V))
20183 LocalExternallyUsedValues.insert(TrackedVals[V]);
20185if (!IsSupportedHorRdxIdentityOp) {
20186// Number of uses of the candidates in the vector of values. 20188"Reused values counter map is not empty");
20189for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20190if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20192Value *
V = Candidates[Cnt];
20193Value *OrigV = TrackedToOrig.at(V);
20194 ++SameValuesCounter.
try_emplace(OrigV).first->second;
20199// Gather externally used values. 20201for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20202if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20204Value *RdxVal = Candidates[Cnt];
20205if (
auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20206 RdxVal = It->second;
20207if (!Visited.
insert(RdxVal).second)
20209// Check if the scalar was vectorized as part of the vectorization 20210// tree but not the top node. 20211if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
20212 LocalExternallyUsedValues.insert(RdxVal);
20215Value *OrigV = TrackedToOrig.at(RdxVal);
20217 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20218if (NumOps != ReducedValsToOps.
at(OrigV).size())
20219 LocalExternallyUsedValues.insert(RdxVal);
20221// Do not need the list of reused scalars in regular mode anymore. 20222if (!IsSupportedHorRdxIdentityOp)
20223 SameValuesCounter.
clear();
20224for (
Value *RdxVal : VL)
20225if (RequiredExtract.
contains(RdxVal))
20226 LocalExternallyUsedValues.insert(RdxVal);
20227V.buildExternalUses(LocalExternallyUsedValues);
20229V.computeMinimumValueSizes();
20234 getReductionCost(
TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20237 <<
" for reduction\n");
20241V.getORE()->emit([&]() {
20243 ReducedValsToOps.
at(VL[0]).front())
20244 <<
"Vectorizing horizontal reduction is possible " 20245 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
20246 <<
" and threshold " 20249if (!AdjustReducedVals()) {
20250V.analyzedReductionVals(VL);
20251unsignedOffset = Pos == Start ? Pos : Pos - 1;
20252if (ReduxWidth > ReductionLimit &&
V.isTreeNotExtendable()) {
20253// Add subvectors of VL to the list of the analyzed values. 20255 *
TTI, VL.front()->getType(), ReduxWidth - 1);
20256 VF >= ReductionLimit;
20258 *
TTI, VL.front()->getType(), VF - 1)) {
20260V.getCanonicalGraphSize() !=
V.getTreeSize())
20262for (
unsignedIdx : seq<unsigned>(ReduxWidth - VF))
20270LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:" 20271 <<
Cost <<
". (HorRdx)\n");
20272V.getORE()->emit([&]() {
20274 ReducedValsToOps.
at(VL[0]).front())
20275 <<
"Vectorized horizontal reduction with cost " 20277 <<
ore::NV(
"TreeSize",
V.getTreeSize());
20282// Emit a reduction. If the root is a select (min/max idiom), the insert 20283// point is the compare condition of that select. 20284Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20287 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20289// Vectorize a tree. 20290Value *VectorizedRoot =
20291V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20292// Update TrackedToOrig mapping, since the tracked values might be 20294for (
Value *RdxVal : Candidates) {
20295Value *OrigVal = TrackedToOrig.at(RdxVal);
20296Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20297if (TransformedRdxVal != RdxVal)
20298 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20303// To prevent poison from leaking across what used to be sequential, 20304// safe, scalar boolean logic operations, the reduction operand must be 20307 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }

        Value *ReducedSubTree;
        Type *ScalarTy = VL.front()->getType();
        if (isa<FixedVectorType>(ScalarTy)) {
          unsigned ScalarTyNumElements = getNumElements(ScalarTy);
          ReducedSubTree = PoisonValue::get(getWidenedType(
              VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
          for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
            // Do reduction for each lane.
            // e.g., do reduce add for
            //   VL[0] = <4 x Ty> <a, b, c, d>
            //   VL[1] = <4 x Ty> <e, f, g, h>
            //   Lane[0] = <2 x Ty> <a, e>
            //   Lane[1] = <2 x Ty> <b, f>
            //   Lane[2] = <2 x Ty> <c, g>
            //   Lane[3] = <2 x Ty> <d, h>
            //   result[0] = reduce add Lane[0]
            //   result[1] = reduce add Lane[1]
            //   result[2] = reduce add Lane[2]
            //   result[3] = reduce add Lane[3]
            SmallVector<int, 16> Mask =
                createStrideMask(I, ScalarTyNumElements, VL.size());
            Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
            ReducedSubTree = Builder.CreateInsertElement(
                ReducedSubTree,
                emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
          }
        } else {
          ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
                                         RdxRootInst->getType());
        }
        if (ReducedSubTree->getType() != VL.front()->getType()) {
          assert(ReducedSubTree->getType() != VL.front()->getType() &&
                 "Expected different reduction type.");
          ReducedSubTree =
              Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
                                    V.isSignedMinBitwidthRootNode());
        }
        // Improved analysis for add/fadd/xor reductions with same scale factor
        // for all operands of reductions. We can emit scalar ops for them
        // instead.
        if (OptReusedScalars && SameScaleFactor)
          ReducedSubTree = emitScaleForReusedOps(
              ReducedSubTree, Builder, SameValuesCounter.front().second);

        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
        // Count vectorized reduced values to exclude them from final reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
    }
    if (OptReusedScalars && !AnyVectorized) {
      for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
        Value *RdxVal = TrackedVals.at(P.first);
        Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
        VectorizedVals.try_emplace(P.first, P.second);
      }
    }
    if (VectorizedTree) {
      // Reorder operands of bool logical op in the natural order to avoid
      // possible problem with poison propagation. If not possible to reorder
      // (both operands are originally RHS), emit an extra freeze instruction
      // for the LHS operand.
      // I.e., if we have original code like this:
      //   RedOp1 = select i1 ?, i1 LHS, i1 false
      //   RedOp2 = select i1 RHS, i1 ?, i1 false
      //
      // Then, we swap LHS/RHS to create a new op that matches the poison
      // semantics of the original code.
      //
      // If we have original code like this and both values could be poison:
      //   RedOp1 = select i1 ?, i1 LHS, i1 false
      //   RedOp2 = select i1 ?, i1 RHS, i1 false
      //
      // Then, we must freeze LHS in the new op.
      auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                   Instruction *RedOp1,
                                                   Instruction *RedOp2,
                                                   bool InitStep) {
        if (!AnyBoolLogicOp)
          return;
        if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                      getRdxOperand(RedOp1, 0) == LHS ||
                                      isGuaranteedNotToBePoison(LHS, AC)))
          return;
        if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                      getRdxOperand(RedOp2, 0) == RHS ||
                                      isGuaranteedNotToBePoison(RHS, AC))) {
          std::swap(LHS, RHS);
          return;
        }
        if (LHS != VectorizedTree)
          LHS = Builder.CreateFreeze(LHS);
      };
      // Finish the reduction.
      // Need to add extra arguments and not vectorized possible reduction
      // values.
      // Try to avoid dependencies between the scalar remainders after
      // reductions.
      auto FinalGen =
          [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
              bool InitStep) {
            unsigned Sz = InstVals.size();
            SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                                     Sz % 2);
            for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
              Value *RdxVal1 = InstVals[I].second;
              Value *StableRdxVal1 = RdxVal1;
              auto It1 = TrackedVals.find(RdxVal1);
              if (It1 != TrackedVals.end())
                StableRdxVal1 = It1->second;
              Value *RdxVal2 = InstVals[I + 1].second;
              Value *StableRdxVal2 = RdxVal2;
              auto It2 = TrackedVals.find(RdxVal2);
              if (It2 != TrackedVals.end())
                StableRdxVal2 = It2->second;
              // To prevent poison from leaking across what used to be
              // sequential, safe, scalar boolean logic operations, the
              // reduction operand must be frozen.
              FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                                InstVals[I + 1].first, InitStep);
              Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                         StableRdxVal2, "op.rdx", ReductionOps);
              ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
            }
            if (Sz % 2 == 1)
              ExtraReds[Sz / 2] = InstVals.back();
            return ExtraReds;
          };
      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                   VectorizedTree);
      SmallPtrSet<Value *, 8> Visited;
      for (Value *RdxVal : Candidates) {
        if (!Visited.insert(RdxVal).second)
          continue;
        unsigned NumOps = VectorizedVals.lookup(RdxVal);
        for (Instruction *RedOp :
             ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
          ExtraReductions.emplace_back(RedOp, RdxVal);
      }
      // Iterate through all not-vectorized reduction values/extra arguments.
      bool InitStep = true;
      while (ExtraReductions.size() > 1) {
        SmallVector<std::pair<Instruction *, Value *>> NewReds =
            FinalGen(ExtraReductions, InitStep);
        ExtraReductions.swap(NewReds);
        InitStep = false;
      }
      VectorizedTree = ExtraReductions.front().second;
      ReductionRoot->replaceAllUsesWith(VectorizedTree);

      // The original scalar reduction is expected to have no remaining
      // uses outside the reduction tree itself. Assert that we got this
      // correct, replace internal uses with undef, and mark for eventual
      // deletion.
#ifndef NDEBUG
      SmallSet<Value *, 4> IgnoreSet;
      for (ArrayRef<Value *> RdxOps : ReductionOps)
        IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
#endif
      for (ArrayRef<Value *> RdxOps : ReductionOps) {
        for (Value *Ignore : RdxOps) {
          if (!Ignore)
            continue;
#ifndef NDEBUG
          for (auto *U : Ignore->users()) {
            assert(IgnoreSet.count(U) &&
                   "All users must be either in the reduction ops list.");
          }
#endif
          if (!Ignore->use_empty()) {
            Value *P = PoisonValue::get(Ignore->getType());
            Ignore->replaceAllUsesWith(P);
          }
        }
        V.removeInstructionsAndOperands(RdxOps);
      }
    } else if (!CheckForReusedReductionOps) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
    }
    return VectorizedTree;
  }
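  // Illustrative standalone sketch (not part of this pass): why the boolean
  // reduction operands above are reordered or frozen. The original select-based
  // form never evaluates (and never propagates poison from) the guarded
  // operand, while a reassociated plain `and` would use both sides
  // unconditionally. The C++ analogue below shows the same ordering concern
  // with short-circuit evaluation; `guardedDivide` is a made-up example.
  //
  //   #include <cassert>
  //
  //   bool guardedDivide(int X) {
  //     // Safe: the division only happens when X != 0 (short-circuit &&).
  //     return X != 0 && 10 / X > 1;
  //     // Rewriting this as `(10 / X > 1) & (X != 0)` would evaluate 10 / X
  //     // even for X == 0, which is why the rewritten reduction freezes or
  //     // swaps such operands.
  //   }
  //
  //   int main() {
  //     assert(guardedDivide(0) == false); // never divides by zero
  //     assert(guardedDivide(2) == true);  // 10 / 2 = 5 > 1
  //     return 0;
  //   }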
  /// Calculate the cost of a reduction.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R) {
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at the compile time.
    bool AllConsts = allConstant(ReducedVals);
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
            continue;
          }
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      if (!AllConsts) {
        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
          // ...
          for (unsigned I : seq<unsigned>(ReducedVals.size())) {
            // ...
          }
        } else {
          Type *RedTy = ScalarTy;
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          if (RType == RedTy) {
            // ...
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        // ...
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      // ...
      ScalarCost = EvaluateScalarCost([&]() {
        // ...
      });
      break;
    }
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
  /// Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        DestTy->getScalarType() != FTy->getScalarType()) {
      // Convert vector_reduce_add(ZExt(<n x i1>)) to
      // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
      Value *V = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
    }
    ++NumVectorInstructions;
    return createSimpleReduction(Builder, VectorizedValue, RdxKind);
  }
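  // Illustrative standalone sketch (independent of LLVM): summing an <n x i1>
  // vector equals the popcount of the packed bits, which is the identity the
  // ctpop rewrite above relies on. `sumBools`/`packAndPopcount` are made-up
  // helper names; the sketch assumes at most 64 elements.
  //
  //   #include <bit>
  //   #include <cassert>
  //   #include <cstdint>
  //   #include <vector>
  //
  //   unsigned sumBools(const std::vector<bool> &Bits) {
  //     unsigned Sum = 0;
  //     for (bool B : Bits)
  //       Sum += B;                              // vector_reduce_add(zext(<n x i1>))
  //     return Sum;
  //   }
  //
  //   unsigned packAndPopcount(const std::vector<bool> &Bits) {
  //     std::uint64_t Packed = 0;
  //     for (std::size_t I = 0; I < Bits.size(); ++I)
  //       Packed |= static_cast<std::uint64_t>(Bits[I]) << I; // bitcast <n x i1> to iN
  //     return std::popcount(Packed);                         // ctpop(iN)
  //   }
  //
  //   int main() {
  //     std::vector<bool> Bits = {true, false, true, true, false, true};
  //     assert(sumBools(Bits) == packAndPopcount(Bits));
  //     return 0;
  //   }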
  /// Emits optimized code for unique scalar value reused \p Cnt times.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::IAnyOf:
    case RecurKind::FAnyOf:
    case RecurKind::IFindLastIV:
    case RecurKind::FFindLastIV:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
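  // Illustrative standalone sketch (not LLVM code): the scalar identities used
  // above for a value repeated Cnt times inside a reduction. Function names
  // are made up; the checks simply replay each identity against a loop.
  //
  //   #include <algorithm>
  //   #include <cassert>
  //   #include <climits>
  //
  //   int addRepeated(int X, unsigned Cnt) { return X * static_cast<int>(Cnt); } // add: x*n
  //   int xorRepeated(int X, unsigned Cnt) { return (Cnt % 2 == 0) ? 0 : X; }    // xor: parity
  //   int minRepeated(int X, unsigned /*Cnt*/) { return X; }                     // min/max/and/or: x
  //
  //   int main() {
  //     for (unsigned Cnt = 1; Cnt <= 5; ++Cnt) {
  //       int Add = 0, Xor = 0, Min = INT_MAX;
  //       for (unsigned I = 0; I < Cnt; ++I) {
  //         Add += 7;
  //         Xor ^= 7;
  //         Min = std::min(Min, 7);
  //       }
  //       assert(Add == addRepeated(7, Cnt));
  //       assert(Xor == xorRepeated(7, Cnt));
  //       assert(Min == minRepeated(7, Cnt));
  //     }
  //     return 0;
  //   }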
  /// Emits actual operation for the scalar identity values, found during
  /// horizontal reduction analysis.
  Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                       BoUpSLP &R,
                       const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                       const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          VectorizedValue,
          getWidenedType(VL.front()->getType(), VTy->getNumElements()),
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x xor x = 0.
      // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if elements 4th and 6th elements have even number of repeats.
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::IAnyOf:
    case RecurKind::FAnyOf:
    case RecurKind::IFindLastIV:
    case RecurKind::FFindLastIV:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
};
} // end anonymous namespace

/// Gets recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}
static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0)) // check homogeneity
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
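// Illustrative standalone sketch (separate from the helper above): the total
// element count of a homogeneous nested aggregate is the product of the counts
// at each level, e.g. [2 x {<2 x float>, <2 x float>}] -> 2 * 2 * 2 = 8. The
// TypeDesc representation below is made up and assumes homogeneity.
//
//   #include <cassert>
//   #include <cstdint>
//   #include <optional>
//   #include <vector>
//
//   struct TypeDesc {
//     unsigned Count = 1;             // elements at this level
//     std::vector<TypeDesc> Elements; // assumed homogeneous (all same shape)
//   };
//
//   std::optional<std::uint64_t> aggregateSize(const TypeDesc &T) {
//     if (T.Elements.empty())
//       return T.Count;               // leaf: a scalar (1) or fixed vector (N)
//     auto Inner = aggregateSize(T.Elements.front());
//     if (!Inner)
//       return std::nullopt;
//     return std::uint64_t{T.Count} * *Inner;
//   }
//
//   int main() {
//     TypeDesc Vec2{2, {}};                     // <2 x float>
//     TypeDesc Struct2{2, {Vec2, Vec2}};        // {<2 x float>, <2 x float>}
//     TypeDesc Array2{2, {Struct2, Struct2}};   // [2 x {...}]
//     assert(aggregateSize(Array2) == 8u);
//     return 0;
//   }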
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

/// Recognize construction of vectors like
///  %ra = insertelement <4 x float> poison, float %s0, i32 0
///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
/// starting from the last insertelement or insertvalue instruction.
///
/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
///
/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
///
/// \return true if it matches.
static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}
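// Illustrative standalone sketch (not the walker above): walking a chain of
// "insert(value, index)" nodes backwards from the last insert fills one slot
// per insert, which is the shape of a buildvector sequence. InsertNode and
// collectBuildVector are made-up names.
//
//   #include <cassert>
//   #include <cstddef>
//   #include <vector>
//
//   struct InsertNode {
//     const InsertNode *Prev = nullptr; // operand 0: the vector being built
//     int Value = 0;                    // operand 1: the inserted scalar
//     std::size_t Index = 0;            // constant insertion index
//   };
//
//   bool collectBuildVector(const InsertNode *Last, std::vector<int> &Opds) {
//     for (; Last; Last = Last->Prev) {
//       if (Last->Index >= Opds.size())
//         return false;                 // index out of range: no match
//       Opds[Last->Index] = Last->Value;
//     }
//     return true;
//   }
//
//   int main() {
//     // %a = insert poison, 10, 0 ; %b = insert %a, 20, 1 ; %c = insert %b, 30, 2
//     InsertNode A{nullptr, 10, 0}, B{&A, 20, 1}, C{&B, 30, 2};
//     std::vector<int> Opds(3, 0);
//     assert(collectBuildVector(&C, Opds));
//     assert(Opds[0] == 10 && Opds[1] == 20 && Opds[2] == 30);
//     return 0;
//   }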
/// Try and get a reduction instruction from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBL = LI->getLoopFor(ParentBB);
  if (!BBL)
    return nullptr;
  BasicBlock *BBLatch = BBL->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from
  // that. This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
/// We could have an initial reduction that is not an add.
///  r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
/// \Returns the new root if found, which may be nullptr if not an instruction.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// \p Returns the first operand of \p I that does not match \p Phi. If
/// operand is not an instruction it returns nullptr.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \Returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
21030bool TryOperandsAsNewSeeds =
P && isa<BinaryOperator>(Root);
21032if (Root->
getParent() != BB || isa<PHINode>(Root))
21035// If we can find a secondary reduction root, use that instead. 21036auto SelectRoot = [&]() {
21044// Start analysis starting from Root instruction. If horizontal reduction is 21045// found, try to vectorize it. If it is not a horizontal reduction or 21046// vectorization is not possible or not effective, and currently analyzed 21047// instruction is a binary operation, try to vectorize the operands, using 21048// pre-order DFS traversal order. If the operands were not vectorized, repeat 21049// the same procedure considering each operand as a possible root of the 21050// horizontal reduction. 21051// Interrupt the process if the Root instruction itself was vectorized or all 21052// sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. 21053// If a horizintal reduction was not matched or vectorized we collect 21054// instructions for possible later attempts for vectorization. 21055 std::queue<std::pair<Instruction *, unsigned>>
Stack;
21056Stack.emplace(SelectRoot(), 0);
21060if (
R.isAnalyzedReductionRoot(Inst))
21065if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *
DL, *TLI))
21067return HorRdx.tryToReduce(R, *
DL,
TTI, *TLI, AC);
21069auto TryAppendToPostponedInsts = [&](
Instruction *FutureSeed) {
21070if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21075// Do not collect CmpInst or InsertElementInst/InsertValueInst as their 21076// analysis is done separately. 21077if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21082while (!
Stack.empty()) {
21085 std::tie(Inst, Level) =
Stack.front();
21087// Do not try to analyze instruction that has already been vectorized. 21088// This may happen when we vectorize instruction operands on a previous 21089// iteration while stack was populated before that happened. 21090if (
R.isDeleted(Inst))
21092if (
Value *VectorizedV = TryToReduce(Inst)) {
21094if (
auto *
I = dyn_cast<Instruction>(VectorizedV)) {
21095// Try to find another reduction. 21099if (
R.isDeleted(Inst))
21102// We could not vectorize `Inst` so try to use it as a future seed. 21103if (!TryAppendToPostponedInsts(Inst)) {
21109// Try to vectorize operands. 21110// Continue analysis for the instruction from the same basic block only to 21111// save compile time. 21114if (VisitedInstrs.
insert(
Op).second)
21115if (
auto *
I = dyn_cast<Instruction>(
Op))
21116// Do not try to vectorize CmpInst operands, this is done 21118if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
I) &&
21119 !
R.isDeleted(
I) &&
I->getParent() == BB)
21128bool Res = vectorizeHorReduction(
P, Root, BB, R, PostponedInsts);
21129 Res |= tryToVectorize(PostponedInsts, R);
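// Illustrative standalone sketch of the traversal shape above: a worklist is
// seeded with the root; when a node cannot be reduced, its operands are
// re-queued as new candidate roots, up to a fixed recursion depth. The Node
// type and countReduced are made up for illustration.
//
//   #include <cassert>
//   #include <queue>
//   #include <utility>
//   #include <vector>
//
//   struct Node {
//     bool Reducible = false;
//     std::vector<Node *> Operands;
//   };
//
//   unsigned countReduced(Node *Root, unsigned MaxDepth) {
//     unsigned Reduced = 0;
//     std::queue<std::pair<Node *, unsigned>> Stack; // (node, depth), FIFO
//     Stack.emplace(Root, 0u);
//     while (!Stack.empty()) {
//       auto [N, Level] = Stack.front();
//       Stack.pop();
//       if (N->Reducible) {
//         ++Reduced;            // a reduction was matched and "vectorized"
//         continue;
//       }
//       if (++Level < MaxDepth) // otherwise descend into operands as new seeds
//         for (Node *Op : N->Operands)
//           Stack.emplace(Op, Level);
//     }
//     return Reduced;
//   }
//
//   int main() {
//     Node Leaf1{true, {}}, Leaf2{true, {}}, Mid{false, {&Leaf1, &Leaf2}};
//     Node Root{false, {&Mid}};
//     assert(countReduced(&Root, /*MaxDepth=*/4) == 2);
//     assert(countReduced(&Root, /*MaxDepth=*/1) == 0);
//     return 0;
//   }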
bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements base on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(*SameTypeIt, *IncIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-state attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with the
    // size of maximal register at first.
    // 2. Try to vectorize remaining instructions with the same type, if
    // possible. This may result in the better vectorization results rather than
    // if we try just to vectorize instructions with the same/alternate opcodes.
    // 3. Final attempt to try to vectorize all instructions with the
    // same/alternate ops only, this may result in some extra final
    // vectorization.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      /// \Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
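// Illustrative standalone sketch of the scan pattern used above: after
// sorting, walk the list and collect maximal runs of mutually "compatible"
// elements, handing each run to an action. groupRuns is a made-up name and the
// demo groups plain ints by parity.
//
//   #include <cassert>
//   #include <functional>
//   #include <vector>
//
//   template <typename T>
//   unsigned groupRuns(std::vector<T> Sorted,
//                      std::function<bool(const T &, const T &)> AreCompatible,
//                      std::function<void(const std::vector<T> &)> Action) {
//     unsigned NumRuns = 0;
//     for (std::size_t I = 0; I < Sorted.size();) {
//       std::vector<T> Run{Sorted[I]};
//       std::size_t J = I + 1;
//       for (; J < Sorted.size() && AreCompatible(Sorted[I], Sorted[J]); ++J)
//         Run.push_back(Sorted[J]);
//       Action(Run);
//       ++NumRuns;
//       I = J; // start over at the next element of a different "kind"
//     }
//     return NumRuns;
//   }
//
//   int main() {
//     auto SameParity = [](const int &A, const int &B) { return A % 2 == B % 2; };
//     std::vector<int> RunSizes;
//     unsigned N = groupRuns<int>({2, 4, 6, 1, 3, 8}, SameParity,
//                                 [&](const std::vector<int> &Run) {
//                                   RunSizes.push_back(static_cast<int>(Run.size()));
//                                 });
//     assert(N == 3 && RunSizes[0] == 3 && RunSizes[1] == 2 && RunSizes[2] == 1);
//     return 0;
//   }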
/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps have same/swapped predicates and most compatible
/// corresponding operands. If IsCompatibility is false, function implements
/// strict weak ordering relation between two cmp instructions, returning true
/// if the first instruction is "less" than the second, i.e. its predicate is
/// less than the predicate of the second or the operands IDs are less than the
/// operands IDs of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
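// Illustrative standalone sketch of the compareCmp<IsCompatibility> idea: one
// template serves both as a strict weak ordering (IsCompatibility = false) and
// as an equivalence/compatibility check (IsCompatibility = true), by flipping
// what "keys are equal" and "first key is smaller" return. The Key struct and
// compareKeys are made up.
//
//   #include <cassert>
//
//   struct Key { int TypeID; int Bits; };
//
//   template <bool IsCompatibility>
//   bool compareKeys(const Key &A, const Key &B) {
//     if (A.TypeID < B.TypeID)
//       return !IsCompatibility; // "less" for ordering, "not compatible" otherwise
//     if (A.TypeID > B.TypeID)
//       return false;
//     if (A.Bits < B.Bits)
//       return !IsCompatibility;
//     if (A.Bits > B.Bits)
//       return false;
//     return IsCompatibility;    // equal keys: not "less", but compatible
//   }
//
//   int main() {
//     Key A{1, 32}, B{1, 64}, C{1, 32};
//     assert(compareKeys<false>(A, B));  // A sorts before B
//     assert(!compareKeys<false>(A, C)); // equal keys are not "less"
//     assert(compareKeys<true>(A, C));   // ...but they are compatible
//     assert(!compareKeys<true>(A, B));
//     return 0;
//   }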
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |=
          vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;

  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. Allows better to identify the chains that can be vectorized in the
  // better way.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers, we don't care about other type.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode({I1, I2}, *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();
  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
  // also vectorizes `PostProcessCmps`.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if `I` is an instruction without users, like terminator, or
  // function call with ignored return value, store. Ignore unused instructions
  // (basing on instruction type, except for CallInst and InvokeInst).
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable type. The num of elements is unknown at
    // compile-time for scalable type.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for the deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        // TODO: Collect the skipped incoming values and try to vectorize them
        // after processing BB.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize chain in store, if this is the only store to the
        // address in the block.
        // TODO: This is just a temporarily solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number of
    // elements is based on the size of the index expression, rather than the
    // size of the GEP itself (the target's pointer size).
    auto It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set a candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them or their index is optimized to constant value.
      // If so, they are marked as deleted, so remove them from the set of
      // candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
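// Illustrative standalone sketch of the candidate filtering above: pairs whose
// addresses differ by a compile-time constant are dropped, because one can be
// computed from the other and makes a poor bottom-up seed. Plain integer
// offsets stand in for SCEV expressions; filterConstantDiffPairs is made up.
//
//   #include <cassert>
//   #include <optional>
//   #include <set>
//   #include <vector>
//
//   // Stand-in for SE->getMinusSCEV(): same base + offsets gives a constant diff.
//   struct Addr { int BaseID; int Offset; };
//   std::optional<int> constantDiff(const Addr &A, const Addr &B) {
//     if (A.BaseID != B.BaseID)
//       return std::nullopt;
//     return A.Offset - B.Offset;
//   }
//
//   std::set<std::size_t> filterConstantDiffPairs(const std::vector<Addr> &GEPs) {
//     std::set<std::size_t> Candidates;
//     for (std::size_t I = 0; I < GEPs.size(); ++I)
//       Candidates.insert(I);
//     for (std::size_t I = 0; I < GEPs.size(); ++I)
//       for (std::size_t J = I + 1; J < GEPs.size(); ++J)
//         if (constantDiff(GEPs[I], GEPs[J])) {
//           Candidates.erase(I);
//           Candidates.erase(J);
//         }
//     return Candidates;
//   }
//
//   int main() {
//     // a+0 and a+4 differ by a constant -> both removed; b+8 survives.
//     std::vector<Addr> GEPs = {{/*a*/ 1, 0}, {/*a*/ 1, 4}, {/*b*/ 2, 8}};
//     auto Kept = filterConstantDiffPairs(GEPs);
//     assert(Kept.size() == 1 && *Kept.begin() == 2);
//     return 0;
//   }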
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
        DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        return static_cast<bool>(getSameOpcode({I1, I2}, *TLI));
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stores to the same addresses several times, in this case need
    // to follow the stores order (reversed to meet the memory dependecies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
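As a rough illustration of the mask composition such a helper performs (a sketch only, assuming the usual convention that each SubMask element indexes into the existing Mask and that PoisonMaskElem marks undefined lanes; the function name is hypothetical and the snippet relies on the LLVM ADT headers this file already includes):
static void composeMaskSketch(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
  // Each defined SubMask lane picks a lane of the previously accumulated Mask.
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  for (unsigned I = 0, E = SubMask.size(); I != E; ++I)
    if (SubMask[I] != PoisonMaskElem)
      NewMask[I] = Mask.empty() ? SubMask[I] : Mask[SubMask[I]];
  Mask.swap(NewMask);
}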
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is a multiple of the subvector's length.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
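A minimal sketch of such a check, assuming an order of size N counts as reversed when every defined element satisfies Order[I] == N - 1 - I (elements equal to N treated as unspecified); the helper name is hypothetical:
static bool isReverseOrderSketch(ArrayRef<unsigned> Order) {
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [Sz](const auto &P) {
    // An element equal to Sz is an "undefined" placeholder and is ignored.
    return P.value() == Sz || P.value() == Sz - 1 - P.index();
  });
}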
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
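A small usage example for the APInt bit-manipulation calls listed above, of the kind used to describe demanded vector elements (values are illustrative):
APInt Demanded = APInt::getZero(/*numBits=*/8); // one bit per vector element
Demanded.setBits(/*loBit=*/0, /*hiBit=*/4);     // elements 0..3 are demanded
Demanded.clearBit(2);                           // element 2 turned out to be dead
assert(!Demanded.isZero() && !Demanded.isAllOnes());
APInt AllDemanded = APInt::getAllOnes(/*numBits=*/8);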
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
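An illustrative use of the ArrayRef views listed above (Vals is a placeholder array; the same operations are applied to bundles of scalars in the pass):
int Vals[] = {0, 1, 2, 3, 4, 5};
ArrayRef<int> A(Vals);
ArrayRef<int> Head = A.take_front(3); // {0, 1, 2}
ArrayRef<int> Tail = A.drop_front(2); // {2, 3, 4, 5}
ArrayRef<int> Mid  = A.slice(1, 3);   // {1, 2, 3}
bool Same = Head.equals(A.drop_back(3)); // true: both views are {0, 1, 2}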
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
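A short example of the predicate helpers listed above (Cmp is a hypothetical CmpInst pointer):
CmpInst::Predicate Pred    = Cmp->getPredicate();        // e.g. ICMP_SGT
CmpInst::Predicate Swapped = Cmp->getSwappedPredicate(); // ICMP_SLT: same test with operands swapped
CmpInst::Predicate Inverse = Cmp->getInversePredicate(); // ICMP_SLE: logical negation of the test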
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
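An illustrative IRBuilder sequence, not taken from this pass, showing how four hypothetical scalars S0..S3 could be gathered into a <4 x i32> vector (InsertPt is a hypothetical insertion-point instruction):
IRBuilder<> Builder(InsertPt);
Type *I32 = Builder.getIntNTy(32);
Value *Vec = PoisonValue::get(FixedVectorType::get(I32, /*NumElts=*/4));
Value *Scalars[] = {S0, S1, S2, S3};
for (unsigned I = 0; I != 4; ++I)
  Vec = Builder.CreateInsertElement(Vec, Scalars[I], Builder.getInt32(I));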
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory referenced by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
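Example calls to the static mask queries listed above on small constant masks, with the expected results per their documented semantics:
int Rev[]   = {3, 2, 1, 0};
int Splat[] = {0, 0, 0, 0};
int Ext[]   = {2, 3};
int ExtractIndex = 0;
bool IsRev     = ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4);        // true
bool IsSplat   = ShuffleVectorInst::isZeroEltSplatMask(Splat, /*NumSrcElts=*/4); // true
bool IsExtract = ShuffleVectorInst::isExtractSubvectorMask(Ext, /*NumSrcElts=*/4,
                                                           ExtractIndex);        // true, ExtractIndex == 2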
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
@ OK_UniformConstantValue
@ OK_NonUniformConstantValue
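An illustrative cost query against the TargetTransformInfo interface listed above (TTIRef and Ctx are hypothetical; the SLP cost model sums numbers of exactly this kind when deciding whether a tree is profitable):
auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), /*NumElts=*/4);
InstructionCost AddCost = TTIRef.getArithmeticInstrCost(
    Instruction::Add, VecTy, TargetTransformInfo::TCK_RecipThroughput);
InstructionCost RevCost = TTIRef.getShuffleCost(
    TargetTransformInfo::SK_Reverse, VecTy, /*Mask=*/{},
    TargetTransformInfo::TCK_RecipThroughput);
bool ShuffleDominates = RevCost > AddCost;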
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state in which we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
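A small example of the PatternMatch helpers listed above, recognizing the shape (A * B) + C rooted at a hypothetical Value *V (operand order as written; a commutative matcher would also accept C + A*B):
using namespace llvm::PatternMatch;
Value *A = nullptr, *B = nullptr, *C = nullptr;
bool IsMulAdd = match(V, m_Add(m_Mul(m_Value(A), m_Value(B)), m_Value(C)));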
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
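Example use (L0 and L1 are hypothetical LoadInst pointers; DL and SE are the usual analyses): two loads of the same element type are consecutive when their pointers differ by exactly one element, which is the check behind consecutive-load detection.
std::optional<int> Diff =
    getPointersDiff(L0->getType(), L0->getPointerOperand(),
                    L1->getType(), L1->getPointerOperand(), DL, SE,
                    /*StrictCheck=*/true);
bool Consecutive = Diff && *Diff == 1;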
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff every element of A is also an element of B.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Propagate metadata common to all scalars in VL onto I. Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
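A hedged sketch of the typical call site (names assumed): after emitting one wide load for a bundle of scalar loads, the metadata common to all scalars is intersected onto the new instruction.
  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  static Instruction *attachCommonMetadata(Instruction *VecLoad,
                                           ArrayRef<Value *> ScalarLoads) {
    // Keeps only the metadata (tbaa, alias.scope, noalias, fpmath,
    // nontemporal, access_group) that holds for every scalar in the bundle.
    return propagateMetadata(VecLoad, ScalarLoads);
  }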
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
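A sketch of the semantics, assuming Indices is a permutation of 0..N-1: element I of the reordered sequence comes from position Indices[I], so the inverse shuffle mask sends Indices[I] back to lane I (the helper name below is illustrative, not this file's implementation).
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"

  static void invertPermutation(llvm::ArrayRef<unsigned> Indices,
                                llvm::SmallVectorImpl<int> &Mask) {
    Mask.assign(Indices.size(), /*PoisonMaskElem*/ -1);
    for (unsigned I = 0, E = Indices.size(); I != E; ++I)
      Mask[Indices[I]] = I;
  }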
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
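Concrete values make these two mask helpers easier to read; the numbers below follow directly from their definitions, with VF = 4 chosen arbitrarily.
  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  void buildExampleMasks() {
    // Elements 0, 2, 4 and 6 of the source vector: <0, 2, 4, 6>.
    SmallVector<int, 16> Strided =
        createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
    // Each of the 4 lanes repeated twice: <0, 0, 1, 1, 2, 2, 3, 3>.
    SmallVector<int, 16> Replicated =
        createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/4);
    (void)Strided;
    (void)Replicated;
  }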
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type with Sz elements of type Ty represents a full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
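A hedged sketch (builder and names assumed): the widened arithmetic instruction may only carry flags such as nsw/nuw or fast-math flags if every scalar it replaces had them, which is the intersection this helper computes.
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Transforms/Utils/LoopUtils.h"
  using namespace llvm;

  static Value *buildVectorAdd(IRBuilderBase &B, Value *VecL, Value *VecR,
                               ArrayRef<Value *> ScalarAdds) {
    Value *VecAdd = B.CreateAdd(VecL, VecR);
    // nsw/nuw survive only if every scalar add in the bundle carried them.
    propagateIRFlags(VecAdd, ScalarAdds);
    return VecAdd;
  }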
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
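A small sketch (function name assumed) of how this predicate mapping is used when a min/max recurrence is expanded to compare + select.
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Transforms/Utils/LoopUtils.h"
  using namespace llvm;

  static Value *expandSignedMax(IRBuilderBase &B, Value *A, Value *C) {
    // For RecurKind::SMax this returns ICmpInst::ICMP_SGT.
    CmpInst::Predicate Pred = getMinMaxReductionPredicate(RecurKind::SMax);
    Value *Cmp = B.CreateICmp(Pred, A, C);
    return B.CreateSelect(Cmp, A, C);
  }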
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
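A hedged sketch of the kind of narrowing question this answers (helper name and the i32-to-i16 choice are assumptions): a 32-bit value can be truncated to 16 bits and sign-extended back unchanged when its top 17 bits are all copies of the sign bit.
  #include "llvm/Analysis/ValueTracking.h"
  using namespace llvm;

  // Assumes V has type i32.
  static bool fitsInSigned16(const Value *V, const DataLayout &DL) {
    // The value round-trips through i16 iff it has more than 32 - 16 sign bits.
    return ComputeNumSignBits(V, DL) > 32u - 16u;
  }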
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
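These specializations exist so the generic GraphWriter machinery can draw the vectorizable tree; a minimal sketch of the effect (the wrapper name and graph title are assumptions, and BoUpSLP is only visible inside this file).
  #include "llvm/Support/GraphWriter.h"

  // Emits a DOT graph of R's vectorizable tree and runs the viewer, driven
  // by the GraphTraits/DOTGraphTraits specializations above.
  static void viewSLPTree(BoUpSLP &R) {
    llvm::ViewGraph(&R, "slp-tree");
  }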
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
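A short sketch of how an edge is described (variable names assumed): the pair (UserTE, EdgeIdx) says "this bundle feeds operand EdgeIdx of the user tree entry", and dump()/operator<< exist purely for debug output.
  // Only meaningful inside this file, where BoUpSLP::TreeEntry is visible
  // and DEBUG_TYPE is defined.
  static void describeUse(BoUpSLP::TreeEntry *UserTE, unsigned OperandIdx) {
    BoUpSLP::EdgeInfo EI(UserTE, OperandIdx);
    LLVM_DEBUG(EI.dump()); // Debug-print the user entry and operand index.
  }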