LLVM 20.0.0git
SLPVectorizer.cpp
//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
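//
// For illustration (a minimal sketch of the transformation, not taken from
// this file): four consecutive i32 stores such as
//   store i32 %a0, ptr %p
//   store i32 %a1, ptr %p1   ; %p1 = getelementptr i32, ptr %p, i64 1
//   store i32 %a2, ptr %p2   ; %p2 = getelementptr i32, ptr %p, i64 2
//   store i32 %a3, ptr %p3   ; %p3 = getelementptr i32, ptr %p, i64 3
// may be combined, when deemed profitable by the cost model, into a single
//   store <4 x i32> %vec, ptr %p
// after the scalar operands have been gathered into %vec.
//
//===----------------------------------------------------------------------===//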

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
              "Controls which SLP graphs should be vectorized.");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
    MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
                           cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
    ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
                             cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering this one is used less frequently, so
// the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit on the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important things to filter here are types which are invalid in
/// LLVM vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// Returns the type of the given value/instruction \p V. If it is a store,
/// returns the type of its value operand; for Cmp - the type of the compare
/// operands; and for insertelement - the type of the inserted operand.
/// Otherwise, just the type of the value is returned.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}

/// \returns the number of elements for Ty.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}

/// \returns the vector type of ScalarTy based on vectorization factor.
static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
  return FixedVectorType::get(ScalarTy->getScalarType(),
                              VF * getNumElements(ScalarTy));
}
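// For example (illustrative): with ScalarTy == i32 and VF == 4 this produces
// <4 x i32>; with ScalarTy == <2 x i32> (the REVEC case) and VF == 4 it
// produces <8 x i32>, i.e. the scalar element type widened by VF times the
// number of elements already present in ScalarTy.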

/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_ceil(Sz);
  // Find the number of elements, which forms full vectors.
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}
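// Worked example (illustrative): for Sz == 6 with NumParts == 2,
// divideCeil(6, 2) == 3 and bit_ceil(3) == 4, so the result is 4 * 2 == 8 --
// the smallest element count >= 6 that still fills each part with a
// power-of-two number of elements.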

/// Returns the number of elements of the given type \p Ty, not greater than
/// \p Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_floor(Sz);
  // Find the number of elements, which forms full vectors.
  unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}
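// Worked example (illustrative): for Sz == 6 with NumParts == 2,
// RegVF == bit_ceil(divideCeil(6, 2)) == 4, so the result is (6 / 4) * 4 == 4
// -- the largest element count <= 6 that is a whole multiple of the
// per-register VF.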

static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  // The ShuffleBuilder implementation uses shufflevector to splat an
  // "element". But an element has a different meaning for SLP (scalar) and
  // REVEC (vector). We need to expand Mask into masks which shufflevector can
  // use directly.
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}
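// Worked example (illustrative): with VecTyNumElements == 2 and
// Mask == {1, 0}, each scalar index is expanded into a pair of vector-element
// indices, giving NewMask == {2, 3, 0, 1}.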

/// \returns the number of groups of shufflevector
/// A group has the following features
/// 1. All of the values in a group are shufflevectors.
/// 2. The mask of every shufflevector is an isExtractSubvectorMask.
/// 3. Together, the masks of the shufflevectors use all of the elements of the
///    source.
/// e.g., it is 1 group (%0)
/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// it is 2 groups (%3 and %4)
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// it is 0 groups
/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}

/// \returns a shufflevector mask which is used to vectorize shufflevectors
/// e.g.,
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// the result is
/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

/// Returns power-of-2 number of elements in a single register (part), given
/// the total number of elements \p Size and number of registers (parts) \p
/// NumParts.
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
  return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
}

/// Returns correct remaining number of elements, considering total amount \p
/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
/// and current register (part) \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}
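// Worked example (illustrative): for Size == 10 split into NumParts == 3,
// getPartNumElems(10, 3) == bit_ceil(divideCeil(10, 3)) == 4, so parts 0 and 1
// hold 4 elements each, and getNumElems(10, 4, /*Part=*/2) == min(4, 10 - 8)
// == 2 elements remain for the last part.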

#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  Instruction *I0 = cast<Instruction>(*It);
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (Value *V : iterator_range(It, VL.end())) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;

    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal
  // integer/FP constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}

/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}

template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using Offset as base offset for index.
/// \returns std::nullopt if the index is not an immediate.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  int Index = Offset;

  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
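// Worked example (illustrative): for
//   %r = insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2
// the aggregate indices are flattened as (0 * 2 + 1) * 3 + 2 == 5, so
// getElementIndex returns 5.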

namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg,    ///< The mask is expected to be for permutation of 1-2 vectors,
               ///< check for the mask elements for the first argument (mask
               ///< indices are in range [0:VF)).
  SecondArg,   ///< The mask is expected to be for permutation of 2 vectors,
               ///< check for the mask elements for the second argument (mask
               ///< indices are in range [VF:2*VF)).
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused
               ///< since they're already marked as used in the mask.
};
} // namespace

/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
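// Worked example (illustrative): with VF == 4, Mask == {0, 5, PoisonMaskElem,
// 2} and UseMask::FirstArg, bits 0 and 2 are cleared because those lanes of
// the first argument are referenced by the mask, while bits 1 and 3 stay set.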
632
633/// Checks if the given value is actually an undefined constant vector.
634/// Also, if the \p UseMask is not empty, tries to check if the non-masked
635/// elements actually mask the insertelement buildvector, if any.
636template <bool IsPoisonOnly = false>
637staticSmallBitVectorisUndefVector(constValue *V,
638constSmallBitVector &UseMask = {}) {
639SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(),true);
640usingT = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
641if (isa<T>(V))
642return Res;
643auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
644if (!VecTy)
645return Res.reset();
646auto *C = dyn_cast<Constant>(V);
647if (!C) {
648if (!UseMask.empty()) {
649constValue *Base =V;
650while (auto *II = dyn_cast<InsertElementInst>(Base)) {
651Base =II->getOperand(0);
652if (isa<T>(II->getOperand(1)))
653continue;
654 std::optional<unsigned>Idx =getElementIndex(II);
655if (!Idx) {
656 Res.reset();
657return Res;
658 }
659if (*Idx < UseMask.size() && !UseMask.test(*Idx))
660 Res.reset(*Idx);
661 }
662// TODO: Add analysis for shuffles here too.
663if (V ==Base) {
664 Res.reset();
665 }else {
666SmallBitVector SubMask(UseMask.size(),false);
667 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
668 }
669 }else {
670 Res.reset();
671 }
672return Res;
673 }
674for (unsignedI = 0, E = VecTy->getNumElements();I != E; ++I) {
675if (Constant *Elem =C->getAggregateElement(I))
676if (!isa<T>(Elem) &&
677 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
678 Res.reset(I);
679 }
680return Res;
681}
682
683/// Checks if the vector of instructions can be represented as a shuffle, like:
684/// %x0 = extractelement <4 x i8> %x, i32 0
685/// %x3 = extractelement <4 x i8> %x, i32 3
686/// %y1 = extractelement <4 x i8> %y, i32 1
687/// %y2 = extractelement <4 x i8> %y, i32 2
688/// %x0x0 = mul i8 %x0, %x0
689/// %x3x3 = mul i8 %x3, %x3
690/// %y1y1 = mul i8 %y1, %y1
691/// %y2y2 = mul i8 %y2, %y2
692/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
693/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
694/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
695/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
696/// ret <4 x i8> %ins4
697/// can be transformed into:
698/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
699/// i32 6>
700/// %2 = mul <4 x i8> %1, %1
701/// ret <4 x i8> %2
702/// Mask will return the Shuffle Mask equivalent to the extracted elements.
703/// TODO: Can we split off and reuse the shuffle mask detection from
704/// ShuffleVectorInst/getShuffleCost?
705static std::optional<TargetTransformInfo::ShuffleKind>
706isFixedVectorShuffle(ArrayRef<Value *> VL,SmallVectorImpl<int> &Mask,
707AssumptionCache *AC) {
708constauto *It =find_if(VL, IsaPred<ExtractElementInst>);
709if (It == VL.end())
710return std::nullopt;
711unsignedSize =
712 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S,Value *V) {
713 auto *EI = dyn_cast<ExtractElementInst>(V);
714 if (!EI)
715 return S;
716 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
717 if (!VTy)
718 return S;
719 return std::max(S, VTy->getNumElements());
720 });
721
722Value *Vec1 =nullptr;
723Value *Vec2 =nullptr;
724bool HasNonUndefVec =any_of(VL, [&](Value *V) {
725auto *EE = dyn_cast<ExtractElementInst>(V);
726if (!EE)
727returnfalse;
728Value *Vec = EE->getVectorOperand();
729if (isa<UndefValue>(Vec))
730returnfalse;
731returnisGuaranteedNotToBePoison(Vec, AC);
732 });
733enum ShuffleMode {Unknown,Select, Permute };
734 ShuffleMode CommonShuffleMode =Unknown;
735 Mask.assign(VL.size(),PoisonMaskElem);
736for (unsignedI = 0, E = VL.size();I < E; ++I) {
737// Undef can be represented as an undef element in a vector.
738if (isa<UndefValue>(VL[I]))
739continue;
740auto *EI = cast<ExtractElementInst>(VL[I]);
741if (isa<ScalableVectorType>(EI->getVectorOperandType()))
742return std::nullopt;
743auto *Vec = EI->getVectorOperand();
744// We can extractelement from undef or poison vector.
745if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
746continue;
747// All vector operands must have the same number of vector elements.
748if (isa<UndefValue>(Vec)) {
749 Mask[I] =I;
750 }else {
751if (isa<UndefValue>(EI->getIndexOperand()))
752continue;
753auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
754if (!Idx)
755return std::nullopt;
756// Undefined behavior if Idx is negative or >= Size.
757if (Idx->getValue().uge(Size))
758continue;
759unsigned IntIdx =Idx->getValue().getZExtValue();
760 Mask[I] = IntIdx;
761 }
762if (isUndefVector(Vec).all() && HasNonUndefVec)
763continue;
764// For correct shuffling we have to have at most 2 different vector operands
765// in all extractelement instructions.
766if (!Vec1 || Vec1 == Vec) {
767 Vec1 = Vec;
768 }elseif (!Vec2 || Vec2 == Vec) {
769 Vec2 = Vec;
770 Mask[I] +=Size;
771 }else {
772return std::nullopt;
773 }
774if (CommonShuffleMode == Permute)
775continue;
776// If the extract index is not the same as the operation number, it is a
777// permutation.
778if (Mask[I] %Size !=I) {
779 CommonShuffleMode = Permute;
780continue;
781 }
782 CommonShuffleMode =Select;
783 }
784// If we're not crossing lanes in different vectors, consider it as blending.
785if (CommonShuffleMode ==Select && Vec2)
786returnTargetTransformInfo::SK_Select;
787// If Vec2 was never used, we have a permutation of a single vector, otherwise
788// we have permutation of 2 vectors.
789return Vec2 ?TargetTransformInfo::SK_PermuteTwoSrc
790 :TargetTransformInfo::SK_PermuteSingleSrc;
791}
792
793/// \returns True if Extract{Value,Element} instruction extracts element Idx.
794static std::optional<unsigned>getExtractIndex(Instruction *E) {
795unsigned Opcode = E->getOpcode();
796assert((Opcode == Instruction::ExtractElement ||
797 Opcode == Instruction::ExtractValue) &&
798"Expected extractelement or extractvalue instruction.");
799if (Opcode == Instruction::ExtractElement) {
800auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
801if (!CI)
802return std::nullopt;
803return CI->getZExtValue();
804 }
805auto *EI = cast<ExtractValueInst>(E);
806if (EI->getNumIndices() != 1)
807return std::nullopt;
808return *EI->idx_begin();
809}
810
811namespace{
812
813/// Main data required for vectorization of instructions.
814classInstructionsState {
815 /// The main/alternate instruction. MainOp is also VL0.
816Instruction *MainOp =nullptr;
817Instruction *AltOp =nullptr;
818
819public:
820Instruction *getMainOp() const{
821assert(valid() &&"InstructionsState is invalid.");
822return MainOp;
823 }
824
825Instruction *getAltOp() const{
826assert(valid() &&"InstructionsState is invalid.");
827return AltOp;
828 }
829
830 /// The main/alternate opcodes for the list of instructions.
831unsignedgetOpcode() const{return getMainOp()->getOpcode(); }
832
833unsigned getAltOpcode() const{return getAltOp()->getOpcode(); }
834
835 /// Some of the instructions in the list have alternate opcodes.
836bool isAltShuffle() const{return getMainOp() != getAltOp(); }
837
838bool isOpcodeOrAlt(Instruction *I) const{
839unsigned CheckedOpcode =I->getOpcode();
840returngetOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
841 }
842
843 /// Checks if the current state is valid, i.e. has non-null MainOp
844bool valid() const{return MainOp && AltOp; }
845
846explicitoperatorbool() const{return valid(); }
847
848 InstructionsState() =delete;
849 InstructionsState(Instruction *MainOp,Instruction *AltOp)
850 : MainOp(MainOp), AltOp(AltOp) {}
851static InstructionsState invalid() {return {nullptr,nullptr}; }
852};
853
854}// end anonymous namespace
855
856/// \returns true if \p Opcode is allowed as part of the main/alternate
857/// instruction for SLP vectorization.
858///
859/// Example of unsupported opcode is SDIV that can potentially cause UB if the
860/// "shuffled out" lane would result in division by zero.
861staticboolisValidForAlternation(unsigned Opcode) {
862if (Instruction::isIntDivRem(Opcode))
863returnfalse;
864
865returntrue;
866}
867
868static InstructionsStategetSameOpcode(ArrayRef<Value *> VL,
869constTargetLibraryInfo &TLI);
870
871/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
872/// compatible instructions or constants, or just some other regular values.
873staticboolareCompatibleCmpOps(Value *BaseOp0,Value *BaseOp1,Value *Op0,
874Value *Op1,constTargetLibraryInfo &TLI) {
875return (isConstant(BaseOp0) &&isConstant(Op0)) ||
876 (isConstant(BaseOp1) &&isConstant(Op1)) ||
877 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
878 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
879 BaseOp0 == Op0 || BaseOp1 == Op1 ||
880getSameOpcode({BaseOp0, Op0}, TLI) ||
881getSameOpcode({BaseOp1, Op1}, TLI);
882}
883
884/// \returns true if a compare instruction \p CI has similar "look" and
885/// same predicate as \p BaseCI, "as is" or with its operands and predicate
886/// swapped, false otherwise.
887staticboolisCmpSameOrSwapped(constCmpInst *BaseCI,constCmpInst *CI,
888constTargetLibraryInfo &TLI) {
889assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
890"Assessing comparisons of different types?");
891CmpInst::Predicate BasePred = BaseCI->getPredicate();
892CmpInst::Predicate Pred = CI->getPredicate();
893CmpInst::Predicate SwappedPred =CmpInst::getSwappedPredicate(Pred);
894
895Value *BaseOp0 = BaseCI->getOperand(0);
896Value *BaseOp1 = BaseCI->getOperand(1);
897Value *Op0 = CI->getOperand(0);
898Value *Op1 = CI->getOperand(1);
899
900return (BasePred == Pred &&
901areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
902 (BasePred == SwappedPred &&
903areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
904}
905
906/// \returns analysis of the Instructions in \p VL described in
907/// InstructionsState, the Opcode that we suppose the whole list
908/// could be vectorized even if its structure is diverse.
909static InstructionsStategetSameOpcode(ArrayRef<Value *> VL,
910constTargetLibraryInfo &TLI) {
911// Make sure these are all Instructions.
912if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
913return InstructionsState::invalid();
914
915auto *It =find_if(VL, IsaPred<Instruction>);
916if (It == VL.end())
917return InstructionsState::invalid();
918
919Instruction *MainOp = cast<Instruction>(*It);
920unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
921if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
922 (VL.size() == 2 && InstCnt < 2))
923return InstructionsState::invalid();
924
925bool IsCastOp = isa<CastInst>(MainOp);
926bool IsBinOp = isa<BinaryOperator>(MainOp);
927bool IsCmpOp = isa<CmpInst>(MainOp);
928CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
929 :CmpInst::BAD_ICMP_PREDICATE;
930Instruction *AltOp = MainOp;
931unsigned Opcode = MainOp->getOpcode();
932unsigned AltOpcode = Opcode;
933
934bool SwappedPredsCompatible = IsCmpOp && [&]() {
935SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
936 UniquePreds.insert(BasePred);
937 UniqueNonSwappedPreds.insert(BasePred);
938for (Value *V : VL) {
939auto *I = dyn_cast<CmpInst>(V);
940if (!I)
941returnfalse;
942CmpInst::Predicate CurrentPred =I->getPredicate();
943CmpInst::Predicate SwappedCurrentPred =
944CmpInst::getSwappedPredicate(CurrentPred);
945 UniqueNonSwappedPreds.insert(CurrentPred);
946if (!UniquePreds.contains(CurrentPred) &&
947 !UniquePreds.contains(SwappedCurrentPred))
948 UniquePreds.insert(CurrentPred);
949 }
950// Total number of predicates > 2, but if consider swapped predicates
951// compatible only 2, consider swappable predicates as compatible opcodes,
952// not alternate.
953return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
954 }();
955// Check for one alternate opcode from another BinaryOperator.
956// TODO - generalize to support all operators (types, calls etc.).
957Intrinsic::ID BaseID = 0;
958SmallVector<VFInfo> BaseMappings;
959if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
960 BaseID =getVectorIntrinsicIDForCall(CallBase, &TLI);
961 BaseMappings =VFDatabase(*CallBase).getMappings(*CallBase);
962if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
963return InstructionsState::invalid();
964 }
965bool AnyPoison = InstCnt != VL.size();
966// Check MainOp too to be sure that it matches the requirements for the
967// instructions.
968for (Value *V :iterator_range(It, VL.end())) {
969auto *I = dyn_cast<Instruction>(V);
970if (!I)
971continue;
972
973// Cannot combine poison and divisions.
974// TODO: do some smart analysis of the CallInsts to exclude divide-like
975// intrinsics/functions only.
976if (AnyPoison && (I->isIntDivRem() ||I->isFPDivRem() || isa<CallInst>(I)))
977return InstructionsState::invalid();
978unsigned InstOpcode =I->getOpcode();
979if (IsBinOp && isa<BinaryOperator>(I)) {
980if (InstOpcode == Opcode || InstOpcode == AltOpcode)
981continue;
982if (Opcode == AltOpcode &&isValidForAlternation(InstOpcode) &&
983isValidForAlternation(Opcode)) {
984 AltOpcode = InstOpcode;
985 AltOp =I;
986continue;
987 }
988 }elseif (IsCastOp && isa<CastInst>(I)) {
989Value *Op0 = MainOp->getOperand(0);
990Type *Ty0 = Op0->getType();
991Value *Op1 =I->getOperand(0);
992Type *Ty1 = Op1->getType();
993if (Ty0 == Ty1) {
994if (InstOpcode == Opcode || InstOpcode == AltOpcode)
995continue;
996if (Opcode == AltOpcode) {
997assert(isValidForAlternation(Opcode) &&
998isValidForAlternation(InstOpcode) &&
999"Cast isn't safe for alternation, logic needs to be updated!");
1000 AltOpcode = InstOpcode;
1001 AltOp =I;
1002continue;
1003 }
1004 }
1005 }elseif (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1006auto *BaseInst = cast<CmpInst>(MainOp);
1007Type *Ty0 = BaseInst->getOperand(0)->getType();
1008Type *Ty1 = Inst->getOperand(0)->getType();
1009if (Ty0 == Ty1) {
1010assert(InstOpcode == Opcode &&"Expected same CmpInst opcode.");
1011assert(InstOpcode == AltOpcode &&
1012"Alternate instructions are only supported by BinaryOperator "
1013"and CastInst.");
1014// Check for compatible operands. If the corresponding operands are not
1015// compatible - need to perform alternate vectorization.
1016CmpInst::Predicate CurrentPred = Inst->getPredicate();
1017CmpInst::Predicate SwappedCurrentPred =
1018CmpInst::getSwappedPredicate(CurrentPred);
1019
1020if ((VL.size() == 2 || SwappedPredsCompatible) &&
1021 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1022continue;
1023
1024if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1025continue;
1026auto *AltInst = cast<CmpInst>(AltOp);
1027if (MainOp != AltOp) {
1028if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1029continue;
1030 }elseif (BasePred != CurrentPred) {
1031assert(
1032isValidForAlternation(InstOpcode) &&
1033"CmpInst isn't safe for alternation, logic needs to be updated!");
1034 AltOp =I;
1035continue;
1036 }
1037CmpInst::Predicate AltPred = AltInst->getPredicate();
1038if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1039 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1040continue;
1041 }
1042 }elseif (InstOpcode == Opcode) {
1043assert(InstOpcode == AltOpcode &&
1044"Alternate instructions are only supported by BinaryOperator and "
1045"CastInst.");
1046if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1047if (Gep->getNumOperands() != 2 ||
1048 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1049return InstructionsState::invalid();
1050 }elseif (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1051if (!isVectorLikeInstWithConstOps(EI))
1052return InstructionsState::invalid();
1053 }elseif (auto *LI = dyn_cast<LoadInst>(I)) {
1054auto *BaseLI = cast<LoadInst>(MainOp);
1055if (!LI->isSimple() || !BaseLI->isSimple())
1056return InstructionsState::invalid();
1057 }elseif (auto *Call = dyn_cast<CallInst>(I)) {
1058auto *CallBase = cast<CallInst>(MainOp);
1059if (Call->getCalledFunction() !=CallBase->getCalledFunction())
1060return InstructionsState::invalid();
1061if (Call->hasOperandBundles() &&
1062 (!CallBase->hasOperandBundles() ||
1063 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1064 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1065CallBase->op_begin() +
1066CallBase->getBundleOperandsStartIndex())))
1067return InstructionsState::invalid();
1068Intrinsic::IDID =getVectorIntrinsicIDForCall(Call, &TLI);
1069if (ID != BaseID)
1070return InstructionsState::invalid();
1071if (!ID) {
1072SmallVector<VFInfo> Mappings =VFDatabase(*Call).getMappings(*Call);
1073if (Mappings.size() != BaseMappings.size() ||
1074 Mappings.front().ISA != BaseMappings.front().ISA ||
1075 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1076 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1077 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1078 Mappings.front().Shape.Parameters !=
1079 BaseMappings.front().Shape.Parameters)
1080return InstructionsState::invalid();
1081 }
1082 }
1083continue;
1084 }
1085return InstructionsState::invalid();
1086 }
1087
1088return InstructionsState(MainOp, AltOp);
1089}
1090
1091/// \returns true if all of the values in \p VL have the same type or false
1092/// otherwise.
1093staticboolallSameType(ArrayRef<Value *> VL) {
1094Type *Ty = VL.front()->getType();
1095returnall_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1096}
1097
1098/// \returns True if in-tree use also needs extract. This refers to
1099/// possible scalar operand in vectorized instruction.
1100staticbooldoesInTreeUserNeedToExtract(Value *Scalar,Instruction *UserInst,
1101TargetLibraryInfo *TLI,
1102constTargetTransformInfo *TTI) {
1103if (!UserInst)
1104returnfalse;
1105unsigned Opcode = UserInst->getOpcode();
1106switch (Opcode) {
1107case Instruction::Load: {
1108LoadInst *LI = cast<LoadInst>(UserInst);
1109return (LI->getPointerOperand() == Scalar);
1110 }
1111case Instruction::Store: {
1112StoreInst *SI = cast<StoreInst>(UserInst);
1113return (SI->getPointerOperand() == Scalar);
1114 }
1115case Instruction::Call: {
1116CallInst *CI = cast<CallInst>(UserInst);
1117Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
1118returnany_of(enumerate(CI->args()), [&](auto &&Arg) {
1119 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1120 Arg.value().get() == Scalar;
1121 });
1122 }
1123default:
1124returnfalse;
1125 }
1126}
1127
1128/// \returns the AA location that is being access by the instruction.
1129staticMemoryLocationgetLocation(Instruction *I) {
1130if (StoreInst *SI = dyn_cast<StoreInst>(I))
1131returnMemoryLocation::get(SI);
1132if (LoadInst *LI = dyn_cast<LoadInst>(I))
1133returnMemoryLocation::get(LI);
1134returnMemoryLocation();
1135}
1136
1137/// \returns True if the instruction is not a volatile or atomic load/store.
1138staticboolisSimple(Instruction *I) {
1139if (LoadInst *LI = dyn_cast<LoadInst>(I))
1140return LI->isSimple();
1141if (StoreInst *SI = dyn_cast<StoreInst>(I))
1142return SI->isSimple();
1143if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1144return !MI->isVolatile();
1145returntrue;
1146}
1147
1148/// Shuffles \p Mask in accordance with the given \p SubMask.
1149/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1150/// one but two input vectors.
1151staticvoidaddMask(SmallVectorImpl<int> &Mask,ArrayRef<int> SubMask,
1152bool ExtendingManyInputs =false) {
1153if (SubMask.empty())
1154return;
1155assert(
1156 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1157// Check if input scalars were extended to match the size of other node.
1158 (SubMask.size() == Mask.size() && Mask.back() ==PoisonMaskElem)) &&
1159"SubMask with many inputs support must be larger than the mask.");
1160if (Mask.empty()) {
1161 Mask.append(SubMask.begin(), SubMask.end());
1162return;
1163 }
1164SmallVector<int> NewMask(SubMask.size(),PoisonMaskElem);
1165int TermValue = std::min(Mask.size(), SubMask.size());
1166for (intI = 0, E = SubMask.size();I < E; ++I) {
1167if (SubMask[I] ==PoisonMaskElem ||
1168 (!ExtendingManyInputs &&
1169 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1170continue;
1171 NewMask[I] = Mask[SubMask[I]];
1172 }
1173 Mask.swap(NewMask);
1174}
1175
1176/// Order may have elements assigned special value (size) which is out of
1177/// bounds. Such indices only appear on places which correspond to undef values
1178/// (see canReuseExtract for details) and used in order to avoid undef values
1179/// have effect on operands ordering.
1180/// The first loop below simply finds all unused indices and then the next loop
1181/// nest assigns these indices for undef values positions.
1182/// As an example below Order has two undef positions and they have assigned
1183/// values 3 and 7 respectively:
1184/// before: 6 9 5 4 9 2 1 0
1185/// after: 6 3 5 4 7 2 1 0
1186staticvoidfixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1187constunsigned Sz = Order.size();
1188SmallBitVector UnusedIndices(Sz,/*t=*/true);
1189SmallBitVector MaskedIndices(Sz);
1190for (unsignedI = 0;I < Sz; ++I) {
1191if (Order[I] < Sz)
1192 UnusedIndices.reset(Order[I]);
1193else
1194 MaskedIndices.set(I);
1195 }
1196if (MaskedIndices.none())
1197return;
1198assert(UnusedIndices.count() == MaskedIndices.count() &&
1199"Non-synced masked/available indices.");
1200intIdx = UnusedIndices.find_first();
1201int MIdx = MaskedIndices.find_first();
1202while (MIdx >= 0) {
1203assert(Idx >= 0 &&"Indices must be synced.");
1204 Order[MIdx] =Idx;
1205Idx = UnusedIndices.find_next(Idx);
1206 MIdx = MaskedIndices.find_next(MIdx);
1207 }
1208}
1209
1210/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1211/// Opcode1.
1212staticSmallBitVectorgetAltInstrMask(ArrayRef<Value *> VL,unsigned Opcode0,
1213unsigned Opcode1) {
1214Type *ScalarTy = VL[0]->getType();
1215unsigned ScalarTyNumElements =getNumElements(ScalarTy);
1216SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements,false);
1217for (unsigned Lane : seq<unsigned>(VL.size())) {
1218if (isa<PoisonValue>(VL[Lane]))
1219continue;
1220if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1221 OpcodeMask.set(Lane * ScalarTyNumElements,
1222 Lane * ScalarTyNumElements + ScalarTyNumElements);
1223 }
1224return OpcodeMask;
1225}
1226
1227namespacellvm {
1228
1229staticvoidinversePermutation(ArrayRef<unsigned> Indices,
1230SmallVectorImpl<int> &Mask) {
1231 Mask.clear();
1232constunsigned E = Indices.size();
1233 Mask.resize(E,PoisonMaskElem);
1234for (unsignedI = 0;I < E; ++I)
1235 Mask[Indices[I]] =I;
1236}
1237
1238/// Reorders the list of scalars in accordance with the given \p Mask.
1239staticvoidreorderScalars(SmallVectorImpl<Value *> &Scalars,
1240ArrayRef<int> Mask) {
1241assert(!Mask.empty() &&"Expected non-empty mask.");
1242SmallVector<Value *> Prev(Scalars.size(),
1243PoisonValue::get(Scalars.front()->getType()));
1244 Prev.swap(Scalars);
1245for (unsignedI = 0, E = Prev.size();I < E; ++I)
1246if (Mask[I] !=PoisonMaskElem)
1247 Scalars[Mask[I]] = Prev[I];
1248}
1249
1250/// Checks if the provided value does not require scheduling. It does not
1251/// require scheduling if this is not an instruction or it is an instruction
1252/// that does not read/write memory and all operands are either not instructions
1253/// or phi nodes or instructions from different blocks.
1254staticboolareAllOperandsNonInsts(Value *V) {
1255auto *I = dyn_cast<Instruction>(V);
1256if (!I)
1257returntrue;
1258return !mayHaveNonDefUseDependency(*I) &&
1259all_of(I->operands(), [I](Value *V) {
1260 auto *IO = dyn_cast<Instruction>(V);
1261 if (!IO)
1262 return true;
1263 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1264 });
1265}
1266
1267/// Checks if the provided value does not require scheduling. It does not
1268/// require scheduling if this is not an instruction or it is an instruction
1269/// that does not read/write memory and all users are phi nodes or instructions
1270/// from the different blocks.
1271staticboolisUsedOutsideBlock(Value *V) {
1272auto *I = dyn_cast<Instruction>(V);
1273if (!I)
1274returntrue;
1275// Limits the number of uses to save compile time.
1276return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1277all_of(I->users(), [I](User *U) {
1278 auto *IU = dyn_cast<Instruction>(U);
1279 if (!IU)
1280 return true;
1281 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1282 });
1283}
1284
1285/// Checks if the specified value does not require scheduling. It does not
1286/// require scheduling if all operands and all users do not need to be scheduled
1287/// in the current basic block.
1288staticbooldoesNotNeedToBeScheduled(Value *V) {
1289returnareAllOperandsNonInsts(V) &&isUsedOutsideBlock(V);
1290}
1291
1292/// Checks if the specified array of instructions does not require scheduling.
1293/// It is so if all either instructions have operands that do not require
1294/// scheduling or their users do not require scheduling since they are phis or
1295/// in other basic blocks.
1296staticbooldoesNotNeedToSchedule(ArrayRef<Value *> VL) {
1297return !VL.empty() &&
1298 (all_of(VL,isUsedOutsideBlock) ||all_of(VL,areAllOperandsNonInsts));
1299}
1300
1301/// Returns true if widened type of \p Ty elements with size \p Sz represents
1302/// full vector type, i.e. adding extra element results in extra parts upon type
1303/// legalization.
1304staticboolhasFullVectorsOrPowerOf2(constTargetTransformInfo &TTI,Type *Ty,
1305unsigned Sz) {
1306if (Sz <= 1)
1307returnfalse;
1308if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1309returnfalse;
1310if (has_single_bit(Sz))
1311returntrue;
1312constunsigned NumParts =TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1313return NumParts > 0 && NumParts < Sz &&has_single_bit(Sz / NumParts) &&
1314 Sz % NumParts == 0;
1315}
1316
1317/// Returns number of parts, the type \p VecTy will be split at the codegen
1318/// phase. If the type is going to be scalarized or does not uses whole
1319/// registers, returns 1.
1320staticunsigned
1321getNumberOfParts(constTargetTransformInfo &TTI,VectorType *VecTy,
1322constunsigned Limit = std::numeric_limits<unsigned>::max()) {
1323unsigned NumParts =TTI.getNumberOfParts(VecTy);
1324if (NumParts == 0 || NumParts >= Limit)
1325return 1;
1326unsigned Sz =getNumElements(VecTy);
1327if (NumParts >= Sz || Sz % NumParts != 0 ||
1328 !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
1329return 1;
1330return NumParts;
1331}
1332
1333namespaceslpvectorizer {
1334
1335/// Bottom Up SLP Vectorizer.
1336classBoUpSLP {
1337structTreeEntry;
1338structScheduleData;
1339classShuffleCostEstimator;
1340classShuffleInstructionBuilder;
1341
1342public:
1343 /// Tracks the state we can represent the loads in the given sequence.
1344enum classLoadsState {
1345Gather,
1346Vectorize,
1347ScatterVectorize,
1348StridedVectorize
1349 };
1350
1351usingValueList =SmallVector<Value *, 8>;
1352usingInstrList =SmallVector<Instruction *, 16>;
1353usingValueSet =SmallPtrSet<Value *, 16>;
1354usingStoreList =SmallVector<StoreInst *, 8>;
1355usingExtraValueToDebugLocsMap =SmallDenseSet<Value *, 4>;
1356usingOrdersType =SmallVector<unsigned, 4>;
1357
1358BoUpSLP(Function *Func,ScalarEvolution *Se,TargetTransformInfo *Tti,
1359TargetLibraryInfo *TLi,AAResults *Aa,LoopInfo *Li,
1360DominatorTree *Dt,AssumptionCache *AC,DemandedBits *DB,
1361constDataLayout *DL,OptimizationRemarkEmitter *ORE)
1362 : BatchAA(*Aa),F(Func), SE(Se),TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1363 AC(AC), DB(DB),DL(DL), ORE(ORE),
1364 Builder(Se->getContext(),TargetFolder(*DL)) {
1365CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1366// Use the vector register size specified by the target unless overridden
1367// by a command-line option.
1368// TODO: It would be better to limit the vectorization factor based on
1369// data type rather than just register size. For example, x86 AVX has
1370// 256-bit registers, but it does not support integer operations
1371// at that width (that requires AVX2).
1372if (MaxVectorRegSizeOption.getNumOccurrences())
1373 MaxVecRegSize =MaxVectorRegSizeOption;
1374else
1375 MaxVecRegSize =
1376TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1377 .getFixedValue();
1378
1379if (MinVectorRegSizeOption.getNumOccurrences())
1380 MinVecRegSize =MinVectorRegSizeOption;
1381else
1382 MinVecRegSize =TTI->getMinVectorRegisterBitWidth();
1383 }
1384
1385 /// Vectorize the tree that starts with the elements in \p VL.
1386 /// Returns the vectorized root.
1387Value *vectorizeTree();
1388
1389 /// Vectorize the tree but with the list of externally used values \p
1390 /// ExternallyUsedValues. Values in this MapVector can be replaced but the
1391 /// generated extractvalue instructions.
1392Value *
1393vectorizeTree(constExtraValueToDebugLocsMap &ExternallyUsedValues,
1394Instruction *ReductionRoot =nullptr);
1395
1396 /// \returns the cost incurred by unwanted spills and fills, caused by
1397 /// holding live values over call sites.
1398InstructionCostgetSpillCost()const;
1399
1400 /// \returns the vectorization cost of the subtree that starts at \p VL.
1401 /// A negative number means that this is profitable.
1402InstructionCostgetTreeCost(ArrayRef<Value *> VectorizedVals = {});
1403
1404 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1405 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1406voidbuildTree(ArrayRef<Value *> Roots,
1407constSmallDenseSet<Value *> &UserIgnoreLst);
1408
1409 /// Construct a vectorizable tree that starts at \p Roots.
1410voidbuildTree(ArrayRef<Value *> Roots);
1411
1412 /// Returns whether the root node has in-tree uses.
1413booldoesRootHaveInTreeUses() const{
1414return !VectorizableTree.empty() &&
1415 !VectorizableTree.front()->UserTreeIndices.empty();
1416 }
1417
1418 /// Return the scalars of the root node.
1419ArrayRef<Value *>getRootNodeScalars() const{
1420assert(!VectorizableTree.empty() &&"No graph to get the first node from");
1421return VectorizableTree.front()->Scalars;
1422 }
1423
1424 /// Returns the type/is-signed info for the root node in the graph without
1425 /// casting.
1426 std::optional<std::pair<Type *, bool>>getRootNodeTypeWithNoCast() const{
1427const TreeEntry &Root = *VectorizableTree.front().get();
1428if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1429 !Root.Scalars.front()->getType()->isIntegerTy())
1430return std::nullopt;
1431auto It = MinBWs.find(&Root);
1432if (It != MinBWs.end())
1433return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
1434 It->second.first),
1435 It->second.second);
1436if (Root.getOpcode() == Instruction::ZExt ||
1437 Root.getOpcode() == Instruction::SExt)
1438return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1439 Root.getOpcode() == Instruction::SExt);
1440return std::nullopt;
1441 }
1442
1443 /// Checks if the root graph node can be emitted with narrower bitwidth at
1444 /// codegen and returns it signedness, if so.
1445boolisSignedMinBitwidthRootNode() const{
1446return MinBWs.at(VectorizableTree.front().get()).second;
1447 }
1448
1449 /// Returns reduction type after minbitdth analysis.
1450FixedVectorType *getReductionType() const{
1451if (ReductionBitWidth == 0 ||
1452 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1453 ReductionBitWidth >=
1454DL->getTypeSizeInBits(
1455 VectorizableTree.front()->Scalars.front()->getType()))
1456returngetWidenedType(
1457 VectorizableTree.front()->Scalars.front()->getType(),
1458 VectorizableTree.front()->getVectorFactor());
1459returngetWidenedType(
1460IntegerType::get(
1461 VectorizableTree.front()->Scalars.front()->getContext(),
1462 ReductionBitWidth),
1463 VectorizableTree.front()->getVectorFactor());
1464 }
1465
1466 /// Builds external uses of the vectorized scalars, i.e. the list of
1467 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1468 /// ExternallyUsedValues contains additional list of external uses to handle
1469 /// vectorization of reductions.
1470void
1471buildExternalUses(constExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1472
1473 /// Transforms graph nodes to target specific representations, if profitable.
1474voidtransformNodes();
1475
1476 /// Clear the internal data structures that are created by 'buildTree'.
1477voiddeleteTree() {
1478 VectorizableTree.clear();
1479 ScalarToTreeEntry.clear();
1480 MultiNodeScalars.clear();
1481 MustGather.clear();
1482 NonScheduledFirst.clear();
1483 EntryToLastInstruction.clear();
1484 LoadEntriesToVectorize.clear();
1485 IsGraphTransformMode =false;
1486 GatheredLoadsEntriesFirst.reset();
1487 ExternalUses.clear();
1488 ExternalUsesAsOriginalScalar.clear();
1489for (auto &Iter : BlocksSchedules) {
1490 BlockScheduling *BS = Iter.second.get();
1491 BS->clear();
1492 }
1493 MinBWs.clear();
1494 ReductionBitWidth = 0;
1495 BaseGraphSize = 1;
1496 CastMaxMinBWSizes.reset();
1497 ExtraBitWidthNodes.clear();
1498 InstrElementSize.clear();
1499 UserIgnoreList =nullptr;
1500 PostponedGathers.clear();
1501 ValueToGatherNodes.clear();
1502 }
1503
1504unsignedgetTreeSize() const{return VectorizableTree.size(); }
1505
1506 /// Returns the base graph size, before any transformations.
1507unsignedgetCanonicalGraphSize() const{return BaseGraphSize; }
1508
1509 /// Perform LICM and CSE on the newly generated gather sequences.
1510voidoptimizeGatherSequence();
1511
1512 /// Does this non-empty order represent an identity order? Identity
1513 /// should be represented as an empty order, so this is used to
1514 /// decide if we can canonicalize a computed order. Undef elements
1515 /// (represented as size) are ignored.
1516boolisIdentityOrder(ArrayRef<unsigned> Order) const{
1517assert(!Order.empty() &&"expected non-empty order");
1518constunsigned Sz = Order.size();
1519returnall_of(enumerate(Order), [&](constauto &P) {
1520returnP.value() ==P.index() ||P.value() == Sz;
1521 });
1522 }
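 // Illustration (hypothetical orders): with Sz == 4, {0, 1, 2, 3} and
 // {0, 4, 2, 3} are both identity orders (the value Sz == 4 marks an ignored
 // undef element), while {1, 0, 2, 3} is not.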
1523
1524 /// Checks if the specified gather tree entry \p TE can be represented as a
1525 /// shuffled vector entry + (possibly) permutation with other gathers. It
1526 /// implements the checks only for possibly ordered scalars (Loads,
1527 /// ExtractElement, ExtractValue), which can be part of the graph.
1528 std::optional<OrdersType>findReusedOrderedScalars(const TreeEntry &TE);
1529
 /// Sorts loads into increasing pointer offsets to allow greater clustering.
1531 std::optional<OrdersType>findPartiallyOrderedLoads(const TreeEntry &TE);
1532
1533 /// Gets reordering data for the given tree entry. If the entry is vectorized
1534 /// - just return ReorderIndices, otherwise check if the scalars can be
1535 /// reordered and return the most optimal order.
1536 /// \return std::nullopt if ordering is not important, empty order, if
1537 /// identity order is important, or the actual order.
1538 /// \param TopToBottom If true, include the order of vectorized stores and
1539 /// insertelement nodes, otherwise skip them.
1540 std::optional<OrdersType>getReorderingData(const TreeEntry &TE,
1541bool TopToBottom);
1542
 /// Reorders the current graph to the most profitable order starting from the
 /// root node to the leaf nodes. The best order is chosen only from the nodes
 /// of the same size (vectorization factor). Smaller nodes are considered
 /// parts of a subgraph with a smaller VF and are reordered independently. We
 /// can do this because we still need to extend smaller nodes to the wider VF
 /// and can merge the reordering shuffles with the widening shuffles.
1549voidreorderTopToBottom();
1550
 /// Reorders the current graph to the most profitable order starting from
 /// the leaves to the root. It allows rotating small subgraphs and reduces
 /// the number of reshuffles if the leaf nodes use the same order. In this
 /// case we can merge the orders and just shuffle the user node instead of
 /// shuffling its operands. Moreover, even if the leaf nodes have different
 /// orders, it allows sinking the reordering in the graph closer to the root
 /// node and merging it later during analysis.
1558voidreorderBottomToTop(bool IgnoreReorder =false);
1559
1560 /// \return The vector element size in bits to use when vectorizing the
1561 /// expression tree ending at \p V. If V is a store, the size is the width of
1562 /// the stored value. Otherwise, the size is the width of the largest loaded
1563 /// value reaching V. This method is used by the vectorizer to calculate
1564 /// vectorization factors.
1565unsignedgetVectorElementSize(Value *V);
1566
1567 /// Compute the minimum type sizes required to represent the entries in a
1568 /// vectorizable tree.
1569voidcomputeMinimumValueSizes();
1570
1571// \returns maximum vector register size as set by TTI or overridden by cl::opt.
1572unsignedgetMaxVecRegSize() const{
1573return MaxVecRegSize;
1574 }
1575
1576// \returns minimum vector register size as set by cl::opt.
1577unsignedgetMinVecRegSize() const{
1578return MinVecRegSize;
1579 }
1580
1581unsignedgetMinVF(unsigned Sz) const{
1582return std::max(2U,getMinVecRegSize() / Sz);
1583 }
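 // A worked example (hypothetical configuration): with getMinVecRegSize()
 // returning 128 bits and a scalar size Sz of 32 bits, getMinVF returns
 // max(2, 128 / 32) = 4; for Sz == 128 the result is clamped to the minimum
 // of 2.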
1584
1585unsignedgetMaximumVF(unsigned ElemWidth,unsigned Opcode) const{
1586unsigned MaxVF =MaxVFOption.getNumOccurrences() ?
1587MaxVFOption :TTI->getMaximumVF(ElemWidth, Opcode);
1588return MaxVF ? MaxVF : UINT_MAX;
1589 }
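 // For illustration (hypothetical target): if the MaxVFOption override is not
 // set and the target's TTI::getMaximumVF(ElemWidth, Opcode) returns 0 (no
 // preference), the returned cap is UINT_MAX, i.e. effectively unlimited.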
1590
1591 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1592 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1593 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1594 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1595 ///
1596 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1597unsignedcanMapToVector(Type *T)const;
1598
1599 /// \returns True if the VectorizableTree is both tiny and not fully
1600 /// vectorizable. We do not vectorize such trees.
1601boolisTreeTinyAndNotFullyVectorizable(bool ForReduction =false)const;
1602
1603 /// Checks if the graph and all its subgraphs cannot be better vectorized.
1604 /// It may happen, if all gather nodes are loads and they cannot be
1605 /// "clusterized". In this case even subgraphs cannot be vectorized more
1606 /// effectively than the base graph.
1607boolisTreeNotExtendable()const;
1608
1609 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1610 /// can be load combined in the backend. Load combining may not be allowed in
1611 /// the IR optimizer, so we do not want to alter the pattern. For example,
1612 /// partially transforming a scalar bswap() pattern into vector code is
1613 /// effectively impossible for the backend to undo.
1614 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1615 /// may not be necessary.
1616boolisLoadCombineReductionCandidate(RecurKind RdxKind)const;
1617
1618 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1619 /// can be load combined in the backend. Load combining may not be allowed in
1620 /// the IR optimizer, so we do not want to alter the pattern. For example,
1621 /// partially transforming a scalar bswap() pattern into vector code is
1622 /// effectively impossible for the backend to undo.
1623 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1624 /// may not be necessary.
1625boolisLoadCombineCandidate(ArrayRef<Value *> Stores)const;
1626
1627 /// Checks if the given array of loads can be represented as a vectorized,
1628 /// scatter or just simple gather.
1629 /// \param VL list of loads.
1630 /// \param VL0 main load value.
1631 /// \param Order returned order of load instructions.
1632 /// \param PointerOps returned list of pointer operands.
 /// \param BestVF returns the best vector factor, if the recursive check
 /// found better vectorization sequences than a masked gather.
 /// \param TryRecursiveCheck used to check if a long masked gather can be
 /// represented as a series of loads/insert-subvector operations, if
 /// profitable.
1637LoadsStatecanVectorizeLoads(ArrayRef<Value *> VL,constValue *VL0,
1638SmallVectorImpl<unsigned> &Order,
1639SmallVectorImpl<Value *> &PointerOps,
1640unsigned *BestVF =nullptr,
1641bool TryRecursiveCheck =true)const;
1642
 /// Registers a non-vectorizable sequence of loads.
1644template <typename T>voidregisterNonVectorizableLoads(ArrayRef<T *> VL) {
1645 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1646 }
1647
 /// Checks if the given load sequence is already known to be non-vectorizable.
1649template <typename T>
1650boolareKnownNonVectorizableLoads(ArrayRef<T *> VL) const{
1651return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1652 }
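 // Usage sketch (hypothetical caller, not part of this class): a client that
 // failed to vectorize a load bundle can memoize that result so later queries
 // on the same sequence are answered from the hash set:
 //   R.registerNonVectorizableLoads(ArrayRef<LoadInst *>(Loads));
 //   ...
 //   if (R.areKnownNonVectorizableLoads(ArrayRef<LoadInst *>(Loads)))
 //     return LoadsState::Gather; // skip re-analysis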
1653
1654OptimizationRemarkEmitter *getORE() {return ORE; }
1655
1656 /// This structure holds any data we need about the edges being traversed
1657 /// during buildTree_rec(). We keep track of:
1658 /// (i) the user TreeEntry index, and
1659 /// (ii) the index of the edge.
1660structEdgeInfo {
1661EdgeInfo() =default;
1662EdgeInfo(TreeEntry *UserTE,unsignedEdgeIdx)
1663 :UserTE(UserTE),EdgeIdx(EdgeIdx) {}
1664 /// The user TreeEntry.
1665 TreeEntry *UserTE =nullptr;
1666 /// The operand index of the use.
1667unsignedEdgeIdx = UINT_MAX;
1668#ifndef NDEBUG
1669friendinlineraw_ostream &operator<<(raw_ostream &OS,
1670constBoUpSLP::EdgeInfo &EI) {
1671 EI.dump(OS);
1672returnOS;
1673 }
1674 /// Debug print.
1675voiddump(raw_ostream &OS) const{
1676OS <<"{User:" << (UserTE ? std::to_string(UserTE->Idx) :"null")
1677 <<" EdgeIdx:" <<EdgeIdx <<"}";
1678 }
1679LLVM_DUMP_METHODvoiddump() const{dump(dbgs()); }
1680#endif
1681bool operator == (constEdgeInfo &Other) const{
1682returnUserTE ==Other.UserTE &&EdgeIdx ==Other.EdgeIdx;
1683 }
1684 };
1685
1686 /// A helper class used for scoring candidates for two consecutive lanes.
1687classLookAheadHeuristics {
1688constTargetLibraryInfo &TLI;
1689constDataLayout &DL;
1690ScalarEvolution &SE;
1691constBoUpSLP &R;
1692int NumLanes;// Total number of lanes (aka vectorization factor).
1693int MaxLevel;// The maximum recursion depth for accumulating score.
1694
1695public:
1696LookAheadHeuristics(constTargetLibraryInfo &TLI,constDataLayout &DL,
1697ScalarEvolution &SE,constBoUpSLP &R,int NumLanes,
1698int MaxLevel)
1699 : TLI(TLI),DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1700 MaxLevel(MaxLevel) {}
1701
// The hard-coded scores listed here are not very important, though they
// should be higher for better matches to improve the resulting cost. When
// computing the scores of matching one sub-tree with another, we are
// basically counting the number of values that are matching. So even if all
// scores were set to 1, we would still get a decent matching result.
// However, sometimes we have to break ties. For example, we may have to
// choose between matching loads vs. matching opcodes. This is what these
// scores help us with: they provide the order of preference. This also
// matters if the scalar is externally used or used in another tree entry
// node in a different lane.
1712
1713 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1714staticconstintScoreConsecutiveLoads = 4;
 /// The same load multiple times. This should have a better score than
 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
 /// for a vector load plus 1.0 for a broadcast.
1719staticconstintScoreSplatLoads = 3;
1720 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1721staticconstintScoreReversedLoads = 3;
1722 /// A load candidate for masked gather.
1723staticconstintScoreMaskedGatherCandidate = 1;
1724 /// ExtractElementInst from same vector and consecutive indexes.
1725staticconstintScoreConsecutiveExtracts = 4;
1726 /// ExtractElementInst from same vector and reversed indices.
1727staticconstintScoreReversedExtracts = 3;
1728 /// Constants.
1729staticconstintScoreConstants = 2;
1730 /// Instructions with the same opcode.
1731staticconstintScoreSameOpcode = 2;
 /// Instructions with alternate opcodes (e.g., add + sub).
1733staticconstintScoreAltOpcodes = 1;
1734 /// Identical instructions (a.k.a. splat or broadcast).
1735staticconstintScoreSplat = 1;
1736 /// Matching with an undef is preferable to failing.
1737staticconstintScoreUndef = 1;
1738 /// Score for failing to find a decent match.
1739staticconstintScoreFail = 0;
1740 /// Score if all users are vectorized.
1741staticconstintScoreAllUserVectorized = 1;
1742
1743 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1744 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1745 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1746 /// MainAltOps.
1747intgetShallowScore(Value *V1,Value *V2,Instruction *U1,Instruction *U2,
1748ArrayRef<Value *> MainAltOps) const{
1749if (!isValidElementType(V1->getType()) ||
1750 !isValidElementType(V2->getType()))
1751returnLookAheadHeuristics::ScoreFail;
1752
1753if (V1 == V2) {
1754if (isa<LoadInst>(V1)) {
// Returns true if the users of V1 and V2 won't need to be extracted.
1756auto AllUsersAreInternal = [U1, U2,this](Value *V1,Value *V2) {
1757// Bail out if we have too many uses to save compilation time.
1758if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1759returnfalse;
1760
1761auto AllUsersVectorized = [U1, U2,this](Value *V) {
1762returnllvm::all_of(V->users(), [U1, U2,this](Value *U) {
1763 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1764 });
1765 };
1766return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1767 };
1768// A broadcast of a load can be cheaper on some targets.
1769if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1770ElementCount::getFixed(NumLanes)) &&
1771 ((int)V1->getNumUses() == NumLanes ||
1772 AllUsersAreInternal(V1, V2)))
1773returnLookAheadHeuristics::ScoreSplatLoads;
1774 }
1775returnLookAheadHeuristics::ScoreSplat;
1776 }
1777
1778auto CheckSameEntryOrFail = [&]() {
1779if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1780 TE1 && TE1 == R.getTreeEntry(V2))
1781returnLookAheadHeuristics::ScoreSplatLoads;
1782returnLookAheadHeuristics::ScoreFail;
1783 };
1784
1785auto *LI1 = dyn_cast<LoadInst>(V1);
1786auto *LI2 = dyn_cast<LoadInst>(V2);
1787if (LI1 && LI2) {
1788if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1789 !LI2->isSimple())
1790return CheckSameEntryOrFail();
1791
1792 std::optional<int> Dist =getPointersDiff(
1793 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1794 LI2->getPointerOperand(),DL, SE,/*StrictCheck=*/true);
1795if (!Dist || *Dist == 0) {
1796if (getUnderlyingObject(LI1->getPointerOperand()) ==
1797getUnderlyingObject(LI2->getPointerOperand()) &&
1798 R.TTI->isLegalMaskedGather(
1799getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1800returnLookAheadHeuristics::ScoreMaskedGatherCandidate;
1801return CheckSameEntryOrFail();
1802 }
1803// The distance is too large - still may be profitable to use masked
1804// loads/gathers.
1805if (std::abs(*Dist) > NumLanes / 2)
1806returnLookAheadHeuristics::ScoreMaskedGatherCandidate;
// This will still detect consecutive loads, but we might have "holes"
// in some cases. It is ok for non-power-of-2 vectorization and may produce
// better results. It should not affect current vectorization.
1810return (*Dist > 0) ?LookAheadHeuristics::ScoreConsecutiveLoads
1811 :LookAheadHeuristics::ScoreReversedLoads;
1812 }
1813
1814auto *C1 = dyn_cast<Constant>(V1);
1815auto *C2 = dyn_cast<Constant>(V2);
1816if (C1 && C2)
1817returnLookAheadHeuristics::ScoreConstants;
1818
// Extracts from consecutive indexes of the same vector score better, as
// the extracts could be optimized away.
1821Value *EV1;
1822ConstantInt *Ex1Idx;
1823if (match(V1,m_ExtractElt(m_Value(EV1),m_ConstantInt(Ex1Idx)))) {
1824// Undefs are always profitable for extractelements.
1825// Compiler can easily combine poison and extractelement <non-poison> or
1826// undef and extractelement <poison>. But combining undef +
1827// extractelement <non-poison-but-may-produce-poison> requires some
1828// extra operations.
1829if (isa<UndefValue>(V2))
1830return (isa<PoisonValue>(V2) ||isUndefVector(EV1).all())
1831 ?LookAheadHeuristics::ScoreConsecutiveExtracts
1832 :LookAheadHeuristics::ScoreSameOpcode;
1833Value *EV2 =nullptr;
1834ConstantInt *Ex2Idx =nullptr;
1835if (match(V2,
1836m_ExtractElt(m_Value(EV2),m_CombineOr(m_ConstantInt(Ex2Idx),
1837m_Undef())))) {
1838// Undefs are always profitable for extractelements.
1839if (!Ex2Idx)
1840returnLookAheadHeuristics::ScoreConsecutiveExtracts;
1841if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1842returnLookAheadHeuristics::ScoreConsecutiveExtracts;
1843if (EV2 == EV1) {
1844int Idx1 = Ex1Idx->getZExtValue();
1845int Idx2 = Ex2Idx->getZExtValue();
1846int Dist = Idx2 - Idx1;
1847// The distance is too large - still may be profitable to use
1848// shuffles.
1849if (std::abs(Dist) == 0)
1850returnLookAheadHeuristics::ScoreSplat;
1851if (std::abs(Dist) > NumLanes / 2)
1852returnLookAheadHeuristics::ScoreSameOpcode;
1853return (Dist > 0) ?LookAheadHeuristics::ScoreConsecutiveExtracts
1854 :LookAheadHeuristics::ScoreReversedExtracts;
1855 }
1856returnLookAheadHeuristics::ScoreAltOpcodes;
1857 }
1858return CheckSameEntryOrFail();
1859 }
1860
1861auto *I1 = dyn_cast<Instruction>(V1);
1862auto *I2 = dyn_cast<Instruction>(V2);
1863if (I1 && I2) {
1864if (I1->getParent() != I2->getParent())
1865return CheckSameEntryOrFail();
1866SmallVector<Value *, 4> Ops(MainAltOps);
1867 Ops.push_back(I1);
1868 Ops.push_back(I2);
1869 InstructionsState S =getSameOpcode(Ops, TLI);
1870// Note: Only consider instructions with <= 2 operands to avoid
1871// complexity explosion.
1872if (S &&
1873 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
1874 !S.isAltShuffle()) &&
1875all_of(Ops, [&S](Value *V) {
1876return isa<PoisonValue>(V) ||
1877 cast<Instruction>(V)->getNumOperands() ==
1878 S.getMainOp()->getNumOperands();
1879 }))
1880return S.isAltShuffle() ?LookAheadHeuristics::ScoreAltOpcodes
1881 :LookAheadHeuristics::ScoreSameOpcode;
1882 }
1883
1884if (I1 && isa<PoisonValue>(V2))
1885returnLookAheadHeuristics::ScoreSameOpcode;
1886
1887if (isa<UndefValue>(V2))
1888returnLookAheadHeuristics::ScoreUndef;
1889
1890return CheckSameEntryOrFail();
1891 }
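    // A possible walk-through (hypothetical IR): for two simple loads of A[i]
    // and A[i+1] in the same block, getPointersDiff returns +1, so the pair
    // scores ScoreConsecutiveLoads (4); loads of A[i+1] and A[i], in that
    // order, score ScoreReversedLoads (3); two constants score ScoreConstants
    // (2); and, absent the extract-element and poison special cases, a value
    // paired with an undef scores ScoreUndef (1).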
1892
 /// Go through the operands of \p LHS and \p RHS recursively until
 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
 /// the users of \p LHS and \p RHS (that is, \p LHS and \p RHS are operands
 /// of \p U1 and \p U2), except at the beginning of the recursion where
 /// these are set to nullptr.
1898 ///
1899 /// For example:
1900 /// \verbatim
1901 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1902 /// \ / \ / \ / \ /
1903 /// + + + +
1904 /// G1 G2 G3 G4
1905 /// \endverbatim
1906 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1907 /// each level recursively, accumulating the score. It starts from matching
1908 /// the additions at level 0, then moves on to the loads (level 1). The
1909 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1910 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1911 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1912 /// Please note that the order of the operands does not matter, as we
1913 /// evaluate the score of all profitable combinations of operands. In
1914 /// other words the score of G1 and G4 is the same as G1 and G2. This
1915 /// heuristic is based on ideas described in:
1916 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1917 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1918 /// Luís F. W. Góes
1919intgetScoreAtLevelRec(Value *LHS,Value *RHS,Instruction *U1,
1920Instruction *U2,int CurrLevel,
1921ArrayRef<Value *> MainAltOps) const{
1922
1923// Get the shallow score of V1 and V2.
1924int ShallowScoreAtThisLevel =
1925getShallowScore(LHS,RHS, U1, U2, MainAltOps);
1926
1927// If reached MaxLevel,
1928// or if V1 and V2 are not instructions,
1929// or if they are SPLAT,
1930// or if they are not consecutive,
1931// or if profitable to vectorize loads or extractelements, early return
1932// the current cost.
1933auto *I1 = dyn_cast<Instruction>(LHS);
1934auto *I2 = dyn_cast<Instruction>(RHS);
1935if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1936 ShallowScoreAtThisLevel ==LookAheadHeuristics::ScoreFail ||
1937 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1938 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1939 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1940 ShallowScoreAtThisLevel))
1941return ShallowScoreAtThisLevel;
1942assert(I1 && I2 &&"Should have early exited.");
1943
1944// Contains the I2 operand indexes that got matched with I1 operands.
1945SmallSet<unsigned, 4> Op2Used;
1946
1947// Recursion towards the operands of I1 and I2. We are trying all possible
1948// operand pairs, and keeping track of the best score.
1949for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1950 OpIdx1 != NumOperands1; ++OpIdx1) {
1951// Try to pair op1I with the best operand of I2.
1952int MaxTmpScore = 0;
1953unsigned MaxOpIdx2 = 0;
1954bool FoundBest =false;
1955// If I2 is commutative try all combinations.
1956unsigned FromIdx =isCommutative(I2) ? 0 : OpIdx1;
1957unsigned ToIdx =isCommutative(I2)
1958 ? I2->getNumOperands()
1959 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1960assert(FromIdx <= ToIdx &&"Bad index");
1961for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1962// Skip operands already paired with OpIdx1.
1963if (Op2Used.count(OpIdx2))
1964continue;
1965// Recursively calculate the cost at each level
1966int TmpScore =
1967getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1968 I1, I2, CurrLevel + 1, {});
1969// Look for the best score.
1970if (TmpScore >LookAheadHeuristics::ScoreFail &&
1971 TmpScore > MaxTmpScore) {
1972 MaxTmpScore = TmpScore;
1973 MaxOpIdx2 = OpIdx2;
1974 FoundBest =true;
1975 }
1976 }
1977if (FoundBest) {
1978// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1979 Op2Used.insert(MaxOpIdx2);
1980 ShallowScoreAtThisLevel += MaxTmpScore;
1981 }
1982 }
1983return ShallowScoreAtThisLevel;
1984 }
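    // A possible walk-through (hypothetical IR, MaxLevel == 2): matching
    // G1 = A[0] + B[0] against G2 = A[1] + B[1] first scores the two
    // additions (ScoreSameOpcode == 2), then recurses into the operands and
    // adds ScoreConsecutiveLoads (4) for {A[0], A[1]} and again for
    // {B[0], B[1]}, giving a cumulative score of 10. Matching G1 against
    // C[0] + D[0] stays at 2 because no operand pair scores above ScoreFail.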
1985 };
1986 /// A helper data structure to hold the operands of a vector of instructions.
1987 /// This supports a fixed vector length for all operand vectors.
1988classVLOperands {
1989 /// For each operand we need (i) the value, and (ii) the opcode that it
1990 /// would be attached to if the expression was in a left-linearized form.
1991 /// This is required to avoid illegal operand reordering.
1992 /// For example:
1993 /// \verbatim
1994 /// 0 Op1
1995 /// |/
1996 /// Op1 Op2 Linearized + Op2
1997 /// \ / ----------> |/
1998 /// - -
1999 ///
2000 /// Op1 - Op2 (0 + Op1) - Op2
2001 /// \endverbatim
2002 ///
2003 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2004 ///
2005 /// Another way to think of this is to track all the operations across the
2006 /// path from the operand all the way to the root of the tree and to
2007 /// calculate the operation that corresponds to this path. For example, the
2008 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2009 /// corresponding operation is a '-' (which matches the one in the
2010 /// linearized tree, as shown above).
2011 ///
2012 /// For lack of a better term, we refer to this operation as Accumulated
2013 /// Path Operation (APO).
2014structOperandData {
2015 OperandData() =default;
2016 OperandData(Value *V,bool APO,bool IsUsed)
2017 : V(V), APO(APO), IsUsed(IsUsed) {}
2018 /// The operand value.
2019Value *V =nullptr;
 /// TreeEntries only allow a single opcode, or an alternate sequence of
 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
 /// (e.g., Add/Mul).
2025bool APO =false;
2026 /// Helper data for the reordering function.
2027bool IsUsed =false;
2028 };
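    // For illustration (hypothetical bundle): for the two lanes
    // {a0 - b0, a1 - b1}, operand 0 holds {a0, a1} with APO == false and
    // operand 1 holds {b0, b1} with APO == true, because the RHS of a
    // subtraction maps to an inverse operation in the left-linearized form;
    // for an add bundle every operand gets APO == false.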
2029
2030 /// During operand reordering, we are trying to select the operand at lane
2031 /// that matches best with the operand at the neighboring lane. Our
2032 /// selection is based on the type of value we are looking for. For example,
2033 /// if the neighboring lane has a load, we need to look for a load that is
2034 /// accessing a consecutive address. These strategies are summarized in the
2035 /// 'ReorderingMode' enumerator.
2036enum class ReorderingMode {
2037 Load,///< Matching loads to consecutive memory addresses
2038 Opcode,///< Matching instructions based on opcode (same or alternate)
2039Constant,///< Matching constants
2040Splat,///< Matching the same instruction multiple times (broadcast)
2041Failed,///< We failed to create a vectorizable group
2042 };
2043
2044usingOperandDataVec =SmallVector<OperandData, 2>;
2045
2046 /// A vector of operand vectors.
2047SmallVector<OperandDataVec, 4> OpsVec;
2048 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2049 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2050unsigned ArgSize = 0;
2051
2052constTargetLibraryInfo &TLI;
2053constDataLayout &DL;
2054ScalarEvolution &SE;
2055constBoUpSLP &R;
2056constLoop *L =nullptr;
2057
2058 /// \returns the operand data at \p OpIdx and \p Lane.
2059 OperandData &getData(unsigned OpIdx,unsigned Lane) {
2060return OpsVec[OpIdx][Lane];
2061 }
2062
2063 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2064const OperandData &getData(unsigned OpIdx,unsigned Lane) const{
2065return OpsVec[OpIdx][Lane];
2066 }
2067
2068 /// Clears the used flag for all entries.
2069void clearUsed() {
2070for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2071 OpIdx != NumOperands; ++OpIdx)
2072for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2073 ++Lane)
2074 OpsVec[OpIdx][Lane].IsUsed =false;
2075 }
2076
2077 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2078void swap(unsigned OpIdx1,unsigned OpIdx2,unsigned Lane) {
2079std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2080 }
2081
2082 /// \param Lane lane of the operands under analysis.
 /// \param OpIdx operand index in lane \p Lane for which we're looking for
 /// the best candidate.
2085 /// \param Idx operand index of the current candidate value.
 /// \returns The additional score due to possible broadcasting of the
 /// elements in the lane. It is more profitable to have a power-of-2 number
 /// of unique elements in the lane, since it will be vectorized with a
 /// higher probability after removing duplicates. Currently the SLP
 /// vectorizer supports only vectorization of a power-of-2 number of unique
 /// scalars.
2091int getSplatScore(unsigned Lane,unsigned OpIdx,unsignedIdx,
2092constSmallBitVector &UsedLanes) const{
2093Value *IdxLaneV = getData(Idx, Lane).V;
2094if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2095 isa<ExtractElementInst>(IdxLaneV))
2096return 0;
2097SmallDenseMap<Value *, unsigned, 4> Uniques;
2098for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2099if (Ln == Lane)
2100continue;
2101Value *OpIdxLnV = getData(OpIdx, Ln).V;
2102if (!isa<Instruction>(OpIdxLnV))
2103return 0;
2104 Uniques.try_emplace(OpIdxLnV, Ln);
2105 }
2106unsigned UniquesCount = Uniques.size();
2107auto IdxIt = Uniques.find(IdxLaneV);
2108unsigned UniquesCntWithIdxLaneV =
2109 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2110Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2111auto OpIdxIt = Uniques.find(OpIdxLaneV);
2112unsigned UniquesCntWithOpIdxLaneV =
2113 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2114if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2115return 0;
2116return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2117 UniquesCntWithOpIdxLaneV,
2118 UniquesCntWithOpIdxLaneV -
2119bit_floor(UniquesCntWithOpIdxLaneV)) -
2120 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2121 ? UniquesCntWithIdxLaneV -bit_floor(UniquesCntWithIdxLaneV)
2122 :bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2123 }
2124
2125 /// \param Lane lane of the operands under analysis.
 /// \param OpIdx operand index in lane \p Lane for which we're looking for
 /// the best candidate.
2128 /// \param Idx operand index of the current candidate value.
 /// \returns The additional score for the scalar whose users are all
 /// vectorized.
2131int getExternalUseScore(unsigned Lane,unsigned OpIdx,unsignedIdx) const{
2132Value *IdxLaneV = getData(Idx, Lane).V;
2133Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2134// Do not care about number of uses for vector-like instructions
2135// (extractelement/extractvalue with constant indices), they are extracts
2136// themselves and already externally used. Vectorization of such
2137// instructions does not add extra extractelement instruction, just may
2138// remove it.
2139if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2140isVectorLikeInstWithConstOps(OpIdxLaneV))
2141returnLookAheadHeuristics::ScoreAllUserVectorized;
2142auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2143if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2144return 0;
2145return R.areAllUsersVectorized(IdxLaneI)
2146 ?LookAheadHeuristics::ScoreAllUserVectorized
2147 : 0;
2148 }
2149
 /// Score scaling factor for fully compatible instructions but with
 /// different numbers of external uses. Allows better selection of the
 /// instructions with fewer external uses.
2153staticconstint ScoreScaleFactor = 10;
2154
2155 /// \Returns the look-ahead score, which tells us how much the sub-trees
2156 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2157 /// score. This helps break ties in an informed way when we cannot decide on
2158 /// the order of the operands by just considering the immediate
2159 /// predecessors.
2160int getLookAheadScore(Value *LHS,Value *RHS,ArrayRef<Value *> MainAltOps,
2161int Lane,unsigned OpIdx,unsignedIdx,
2162bool &IsUsed,constSmallBitVector &UsedLanes) {
2163LookAheadHeuristics LookAhead(TLI,DL, SE, R, getNumLanes(),
2164LookAheadMaxDepth);
2165// Keep track of the instruction stack as we recurse into the operands
2166// during the look-ahead score exploration.
2167int Score =
2168 LookAhead.getScoreAtLevelRec(LHS,RHS,/*U1=*/nullptr,/*U2=*/nullptr,
2169/*CurrLevel=*/1, MainAltOps);
2170if (Score) {
2171int SplatScore = getSplatScore(Lane, OpIdx,Idx, UsedLanes);
2172if (Score <= -SplatScore) {
2173// Failed score.
2174 Score = 0;
2175 }else {
2176 Score += SplatScore;
2177// Scale score to see the difference between different operands
2178// and similar operands but all vectorized/not all vectorized
2179// uses. It does not affect actual selection of the best
2180// compatible operand in general, just allows to select the
2181// operand with all vectorized uses.
2182 Score *= ScoreScaleFactor;
2183 Score += getExternalUseScore(Lane, OpIdx,Idx);
2184 IsUsed =true;
2185 }
2186 }
2187return Score;
2188 }
2189
2190 /// Best defined scores per lanes between the passes. Used to choose the
2191 /// best operand (with the highest score) between the passes.
2192 /// The key - {Operand Index, Lane}.
2193 /// The value - the best score between the passes for the lane and the
2194 /// operand.
2195SmallDenseMap<std::pair<unsigned, unsigned>,unsigned, 8>
2196 BestScoresPerLanes;
2197
// Search all operands in Ops[*][Lane] for the one that best matches
// Ops[OpIdx][LastLane] and return its operand index.
// If no good match can be found, return std::nullopt.
2201 std::optional<unsigned>
2202 getBestOperand(unsigned OpIdx,int Lane,int LastLane,
2203ArrayRef<ReorderingMode> ReorderingModes,
2204ArrayRef<Value *> MainAltOps,
2205constSmallBitVector &UsedLanes) {
2206unsigned NumOperands = getNumOperands();
2207
2208// The operand of the previous lane at OpIdx.
2209Value *OpLastLane = getData(OpIdx, LastLane).V;
2210
2211// Our strategy mode for OpIdx.
2212 ReorderingMode RMode = ReorderingModes[OpIdx];
2213if (RMode == ReorderingMode::Failed)
2214return std::nullopt;
2215
2216// The linearized opcode of the operand at OpIdx, Lane.
2217bool OpIdxAPO = getData(OpIdx, Lane).APO;
2218
2219// The best operand index and its score.
2220// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2221// are using the score to differentiate between the two.
2222structBestOpData {
2223 std::optional<unsigned>Idx;
2224unsigned Score = 0;
2225 } BestOp;
2226 BestOp.Score =
2227 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2228 .first->second;
2229
// Track if the operand must be marked as used. If the operand is set to
// Score 1 explicitly (because of a non-power-of-2 number of unique
// scalars), we may want to re-estimate the operands again on the following
// iterations.
2233bool IsUsed = RMode == ReorderingMode::Splat ||
2234 RMode == ReorderingMode::Constant ||
2235 RMode == ReorderingMode::Load;
2236// Iterate through all unused operands and look for the best.
2237for (unsignedIdx = 0;Idx != NumOperands; ++Idx) {
2238// Get the operand at Idx and Lane.
2239 OperandData &OpData = getData(Idx, Lane);
2240Value *Op = OpData.V;
2241bool OpAPO = OpData.APO;
2242
2243// Skip already selected operands.
2244if (OpData.IsUsed)
2245continue;
2246
2247// Skip if we are trying to move the operand to a position with a
2248// different opcode in the linearized tree form. This would break the
2249// semantics.
2250if (OpAPO != OpIdxAPO)
2251continue;
2252
2253// Look for an operand that matches the current mode.
2254switch (RMode) {
2255case ReorderingMode::Load:
2256case ReorderingMode::Opcode: {
2257bool LeftToRight = Lane > LastLane;
2258Value *OpLeft = (LeftToRight) ? OpLastLane :Op;
2259Value *OpRight = (LeftToRight) ?Op : OpLastLane;
2260int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2261 OpIdx,Idx, IsUsed, UsedLanes);
2262if (Score >static_cast<int>(BestOp.Score) ||
2263 (Score > 0 && Score ==static_cast<int>(BestOp.Score) &&
2264Idx == OpIdx)) {
2265 BestOp.Idx =Idx;
2266 BestOp.Score = Score;
2267 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2268 }
2269break;
2270 }
2271case ReorderingMode::Constant:
2272if (isa<Constant>(Op) ||
2273 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2274 BestOp.Idx =Idx;
2275if (isa<Constant>(Op)) {
2276 BestOp.Score =LookAheadHeuristics::ScoreConstants;
2277 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2278LookAheadHeuristics::ScoreConstants;
2279 }
2280if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2281 IsUsed =false;
2282 }
2283break;
2284case ReorderingMode::Splat:
2285if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2286 IsUsed =Op == OpLastLane;
2287if (Op == OpLastLane) {
2288 BestOp.Score =LookAheadHeuristics::ScoreSplat;
2289 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2290LookAheadHeuristics::ScoreSplat;
2291 }
2292 BestOp.Idx =Idx;
2293 }
2294break;
2295case ReorderingMode::Failed:
2296llvm_unreachable("Not expected Failed reordering mode.");
2297 }
2298 }
2299
2300if (BestOp.Idx) {
2301 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2302return BestOp.Idx;
2303 }
2304// If we could not find a good match return std::nullopt.
2305return std::nullopt;
2306 }
2307
2308 /// Helper for reorderOperandVecs.
 /// \returns the lane that we should start reordering from. This is the one
 /// that has the least number of operands that can freely move about, or is
 /// the least profitable to reorder because it already has the most optimal
 /// set of operands.
2312unsigned getBestLaneToStartReordering() const{
2313unsigned Min = UINT_MAX;
2314unsigned SameOpNumber = 0;
// std::pair<unsigned, unsigned> is used to implement a simple voting
// algorithm and choose the lane with the least number of operands that
// can freely move about, or that is less profitable to reorder because it
// already has the most optimal set of operands. The first unsigned is a
// counter for voting, the second unsigned is the counter of lanes with
// instructions with same/alternate opcodes and the same parent basic
// block.
2321MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
2322// Try to be closer to the original results, if we have multiple lanes
2323// with same cost. If 2 lanes have the same cost, use the one with the
2324// highest index.
2325for (intI = getNumLanes();I > 0; --I) {
2326unsigned Lane =I - 1;
2327 OperandsOrderData NumFreeOpsHash =
2328 getMaxNumOperandsThatCanBeReordered(Lane);
2329// Compare the number of operands that can move and choose the one with
2330// the least number.
2331if (NumFreeOpsHash.NumOfAPOs < Min) {
2332 Min = NumFreeOpsHash.NumOfAPOs;
2333 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2334 HashMap.clear();
2335 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2336 }elseif (NumFreeOpsHash.NumOfAPOs == Min &&
2337 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2338// Select the most optimal lane in terms of number of operands that
2339// should be moved around.
2340 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2341 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2342 }elseif (NumFreeOpsHash.NumOfAPOs == Min &&
2343 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2344auto [It, Inserted] =
2345 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2346if (!Inserted)
2347 ++It->second.first;
2348 }
2349 }
2350// Select the lane with the minimum counter.
2351unsigned BestLane = 0;
2352unsigned CntMin = UINT_MAX;
2353for (constauto &Data :reverse(HashMap)) {
2354if (Data.second.first < CntMin) {
2355 CntMin =Data.second.first;
2356 BestLane =Data.second.second;
2357 }
2358 }
2359return BestLane;
2360 }
2361
2362 /// Data structure that helps to reorder operands.
2363structOperandsOrderData {
2364 /// The best number of operands with the same APOs, which can be
2365 /// reordered.
2366unsigned NumOfAPOs = UINT_MAX;
2367 /// Number of operands with the same/alternate instruction opcode and
2368 /// parent.
2369unsigned NumOpsWithSameOpcodeParent = 0;
2370 /// Hash for the actual operands ordering.
 /// Used to count operands, actually their position id and opcode
 /// value. It is used in the voting mechanism to find the lane with the
 /// least number of operands that can freely move about, or that is less
 /// profitable because it already has the most optimal set of operands. Can
 /// be replaced with a SmallVector<unsigned> instead, but the hash code is
 /// faster and requires less memory.
2377unsigned Hash = 0;
2378 };
2379 /// \returns the maximum number of operands that are allowed to be reordered
2380 /// for \p Lane and the number of compatible instructions(with the same
2381 /// parent/opcode). This is used as a heuristic for selecting the first lane
2382 /// to start operand reordering.
2383 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const{
2384unsigned CntTrue = 0;
2385unsigned NumOperands = getNumOperands();
2386// Operands with the same APO can be reordered. We therefore need to count
2387// how many of them we have for each APO, like this: Cnt[APO] = x.
2388// Since we only have two APOs, namely true and false, we can avoid using
2389// a map. Instead we can simply count the number of operands that
2390// correspond to one of them (in this case the 'true' APO), and calculate
2391// the other by subtracting it from the total number of operands.
2392// Operands with the same instruction opcode and parent are more
2393// profitable since we don't need to move them in many cases, with a high
2394// probability such lane already can be vectorized effectively.
2395bool AllUndefs =true;
2396unsigned NumOpsWithSameOpcodeParent = 0;
2397Instruction *OpcodeI =nullptr;
2398BasicBlock *Parent =nullptr;
2399unsigned Hash = 0;
2400for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2401const OperandData &OpData = getData(OpIdx, Lane);
2402if (OpData.APO)
2403 ++CntTrue;
2404// Use Boyer-Moore majority voting for finding the majority opcode and
2405// the number of times it occurs.
2406if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2407if (!OpcodeI || !getSameOpcode({OpcodeI,I}, TLI) ||
2408I->getParent() != Parent) {
2409if (NumOpsWithSameOpcodeParent == 0) {
2410 NumOpsWithSameOpcodeParent = 1;
2411 OpcodeI =I;
2412 Parent =I->getParent();
2413 }else {
2414 --NumOpsWithSameOpcodeParent;
2415 }
2416 }else {
2417 ++NumOpsWithSameOpcodeParent;
2418 }
2419 }
2420 Hash =hash_combine(
2421 Hash,hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2422 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2423 }
2424if (AllUndefs)
2425return {};
2426 OperandsOrderDataData;
2427Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2428Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2429Data.Hash = Hash;
2430returnData;
2431 }
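    // For illustration (hypothetical lane): a two-operand subtraction lane
    // has CntTrue == 1, so NumOfAPOs == max(1, 2 - 1) == 1, while a
    // two-operand addition lane has CntTrue == 0 and NumOfAPOs == 2; the lane
    // with the smaller value wins the vote in getBestLaneToStartReordering().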
2432
2433 /// Go through the instructions in VL and append their operands.
2434void appendOperandsOfVL(ArrayRef<Value *> VL,const InstructionsState &S) {
2435assert(!VL.empty() &&"Bad VL");
2436assert((empty() || VL.size() == getNumLanes()) &&
2437"Expected same number of lanes");
2438assert(S.valid() &&"InstructionsState is invalid.");
2439// IntrinsicInst::isCommutative returns true if swapping the first "two"
2440// arguments to the intrinsic produces the same result.
2441constexprunsigned IntrinsicNumOperands = 2;
2442Instruction *MainOp = S.getMainOp();
2443unsigned NumOperands = MainOp->getNumOperands();
2444 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
2445 OpsVec.resize(NumOperands);
2446unsigned NumLanes = VL.size();
2447for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2448 OpsVec[OpIdx].resize(NumLanes);
2449for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2450assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2451"Expected instruction or poison value");
2452// Our tree has just 3 nodes: the root and two operands.
2453// It is therefore trivial to get the APO. We only need to check the
2454// opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
// RHS operand. The LHS operand of both add and sub is never attached
// to an inverse operation in the linearized form, therefore its APO
// is false. The RHS APO is true only if VL[Lane] is an inverse operation.
2458
2459// Since operand reordering is performed on groups of commutative
2460// operations or alternating sequences (e.g., +, -), we can safely
2461// tell the inverse operations by checking commutativity.
2462if (isa<PoisonValue>(VL[Lane])) {
2463if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
2464if (OpIdx == 0) {
2465 OpsVec[OpIdx][Lane] = {EI->getVectorOperand(),true,false};
2466continue;
2467 }
2468 }elseif (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
2469if (OpIdx == 0) {
2470 OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(),true,false};
2471continue;
2472 }
2473 }
2474 OpsVec[OpIdx][Lane] = {
2475PoisonValue::get(MainOp->getOperand(OpIdx)->getType()),true,
2476false};
2477continue;
2478 }
2479bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2480bool APO = (OpIdx == 0) ?false : IsInverseOperation;
2481 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2482 APO,false};
2483 }
2484 }
2485 }
2486
2487 /// \returns the number of operands.
2488unsigned getNumOperands() const{return ArgSize; }
2489
2490 /// \returns the number of lanes.
2491unsigned getNumLanes() const{return OpsVec[0].size(); }
2492
2493 /// \returns the operand value at \p OpIdx and \p Lane.
2494Value *getValue(unsigned OpIdx,unsigned Lane) const{
2495return getData(OpIdx, Lane).V;
2496 }
2497
2498 /// \returns true if the data structure is empty.
2499bool empty() const{return OpsVec.empty(); }
2500
2501 /// Clears the data.
2502void clear() { OpsVec.clear(); }
2503
2504 /// \Returns true if there are enough operands identical to \p Op to fill
 /// the whole vector (it may be mixed with constants or loop-invariant
 /// values).
 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
2507bool shouldBroadcast(Value *Op,unsigned OpIdx,unsigned Lane) {
2508assert(Op == getValue(OpIdx, Lane) &&
2509"Op is expected to be getValue(OpIdx, Lane).");
2510// Small number of loads - try load matching.
2511if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2512returnfalse;
2513bool OpAPO = getData(OpIdx, Lane).APO;
2514bool IsInvariant = L && L->isLoopInvariant(Op);
2515unsigned Cnt = 0;
2516for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2517if (Ln == Lane)
2518continue;
2519// This is set to true if we found a candidate for broadcast at Lane.
2520bool FoundCandidate =false;
2521for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2522 OperandData &Data = getData(OpI, Ln);
2523if (Data.APO != OpAPO ||Data.IsUsed)
2524continue;
2525Value *OpILane = getValue(OpI, Lane);
2526bool IsConstantOp = isa<Constant>(OpILane);
2527// Consider the broadcast candidate if:
2528// 1. Same value is found in one of the operands.
2529if (Data.V ==Op ||
2530// 2. The operand in the given lane is not constant but there is a
2531// constant operand in another lane (which can be moved to the
2532// given lane). In this case we can represent it as a simple
2533// permutation of constant and broadcast.
2534 (!IsConstantOp &&
2535 ((Lns > 2 && isa<Constant>(Data.V)) ||
2536// 2.1. If we have only 2 lanes, need to check that value in the
2537// next lane does not build same opcode sequence.
2538 (Lns == 2 &&
2539 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
2540 isa<Constant>(Data.V)))) ||
2541// 3. The operand in the current lane is loop invariant (can be
2542// hoisted out) and another operand is also a loop invariant
2543// (though not a constant). In this case the whole vector can be
2544// hoisted out.
2545// FIXME: need to teach the cost model about this case for better
2546// estimation.
2547 (IsInvariant && !isa<Constant>(Data.V) &&
2548 !getSameOpcode({Op,Data.V}, TLI) &&
2549 L->isLoopInvariant(Data.V))) {
2550 FoundCandidate =true;
2551Data.IsUsed =Data.V ==Op;
2552if (Data.V ==Op)
2553 ++Cnt;
2554break;
2555 }
2556 }
2557if (!FoundCandidate)
2558returnfalse;
2559 }
2560return getNumLanes() == 2 || Cnt > 1;
2561 }
2562
 /// Checks if there is at least one operand in a lane other than \p Lane
 /// that is compatible with the operand \p Op.
2565bool canBeVectorized(Instruction *Op,unsigned OpIdx,unsigned Lane) const{
2566assert(Op == getValue(OpIdx, Lane) &&
2567"Op is expected to be getValue(OpIdx, Lane).");
2568bool OpAPO = getData(OpIdx, Lane).APO;
2569for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2570if (Ln == Lane)
2571continue;
2572if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2573const OperandData &Data = getData(OpI, Ln);
2574if (Data.APO != OpAPO ||Data.IsUsed)
2575returntrue;
2576Value *OpILn = getValue(OpI, Ln);
2577return (L && L->isLoopInvariant(OpILn)) ||
2578 (getSameOpcode({Op, OpILn}, TLI) &&
2579allSameBlock({Op, OpILn}));
2580 }))
2581returntrue;
2582 }
2583returnfalse;
2584 }
2585
2586public:
2587 /// Initialize with all the operands of the instruction vector \p RootVL.
2588VLOperands(ArrayRef<Value *> RootVL,const InstructionsState &S,
2589constBoUpSLP &R)
2590 : TLI(*R.TLI),DL(*R.DL), SE(*R.SE), R(R),
2591 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
2592// Append all the operands of RootVL.
2593 appendOperandsOfVL(RootVL, S);
2594 }
2595
2596 /// \Returns a value vector with the operands across all lanes for the
 /// operand at \p OpIdx.
2598ValueListgetVL(unsigned OpIdx) const{
2599ValueList OpVL(OpsVec[OpIdx].size());
2600assert(OpsVec[OpIdx].size() == getNumLanes() &&
2601"Expected same num of lanes across all operands");
2602for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2603 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2604return OpVL;
2605 }
2606
2607// Performs operand reordering for 2 or more operands.
2608// The original operands are in OrigOps[OpIdx][Lane].
2609// The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2610voidreorder() {
2611unsigned NumOperands = getNumOperands();
2612unsigned NumLanes = getNumLanes();
2613// Each operand has its own mode. We are using this mode to help us select
2614// the instructions for each lane, so that they match best with the ones
2615// we have selected so far.
2616SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2617
2618// This is a greedy single-pass algorithm. We are going over each lane
2619// once and deciding on the best order right away with no back-tracking.
2620// However, in order to increase its effectiveness, we start with the lane
2621// that has operands that can move the least. For example, given the
2622// following lanes:
2623// Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2624// Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2625// Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2626// Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2627// we will start at Lane 1, since the operands of the subtraction cannot
2628// be reordered. Then we will visit the rest of the lanes in a circular
2629// fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2630
2631// Find the first lane that we will start our search from.
2632unsigned FirstLane = getBestLaneToStartReordering();
2633
2634// Initialize the modes.
2635for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2636Value *OpLane0 = getValue(OpIdx, FirstLane);
2637// Keep track if we have instructions with all the same opcode on one
2638// side.
2639if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2640// Check if OpLane0 should be broadcast.
2641if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2642 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2643 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2644elseif (isa<LoadInst>(OpILane0))
2645 ReorderingModes[OpIdx] = ReorderingMode::Load;
2646else
2647 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2648 }elseif (isa<Constant>(OpLane0)) {
2649 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2650 }elseif (isa<Argument>(OpLane0)) {
2651// Our best hope is a Splat. It may save some cost in some cases.
2652 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2653 }else {
2654llvm_unreachable("Unexpected value kind.");
2655 }
2656 }
2657
// Check that we don't have the same operands. There is no need to reorder
// if the operands are just a perfect diamond or shuffled diamond match. Do
// not do it only for possible broadcasts or a non-power-of-2 number of
// scalars (just for now).
2662auto &&SkipReordering = [this]() {
2663SmallPtrSet<Value *, 4> UniqueValues;
2664ArrayRef<OperandData> Op0 = OpsVec.front();
2665for (const OperandData &Data : Op0)
2666 UniqueValues.insert(Data.V);
2667for (ArrayRef<OperandData>Op :
2668ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
2669if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2670return !UniqueValues.contains(Data.V);
2671 }))
2672returnfalse;
2673 }
2674// TODO: Check if we can remove a check for non-power-2 number of
2675// scalars after full support of non-power-2 vectorization.
2676return UniqueValues.size() != 2 &&
2677hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
2678 UniqueValues.size());
2679 };
2680
2681// If the initial strategy fails for any of the operand indexes, then we
2682// perform reordering again in a second pass. This helps avoid assigning
2683// high priority to the failed strategy, and should improve reordering for
2684// the non-failed operand indexes.
2685for (intPass = 0;Pass != 2; ++Pass) {
// Check if there is no need to reorder the operands since they are a
// perfect or shuffled diamond match.
2688// Need to do it to avoid extra external use cost counting for
2689// shuffled matches, which may cause regressions.
2690if (SkipReordering())
2691break;
2692// Skip the second pass if the first pass did not fail.
2693bool StrategyFailed =false;
2694// Mark all operand data as free to use.
2695 clearUsed();
2696// We keep the original operand order for the FirstLane, so reorder the
2697// rest of the lanes. We are visiting the nodes in a circular fashion,
2698// using FirstLane as the center point and increasing the radius
2699// distance.
2700SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2701for (unsignedI = 0;I < NumOperands; ++I)
2702 MainAltOps[I].push_back(getData(I, FirstLane).V);
2703
2704SmallBitVector UsedLanes(NumLanes);
2705 UsedLanes.set(FirstLane);
2706for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2707// Visit the lane on the right and then the lane on the left.
2708for (intDirection : {+1, -1}) {
2709int Lane = FirstLane +Direction * Distance;
2710if (Lane < 0 || Lane >= (int)NumLanes)
2711continue;
2712 UsedLanes.set(Lane);
2713int LastLane = Lane -Direction;
2714assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2715"Out of bounds");
2716// Look for a good match for each operand.
2717for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2718// Search for the operand that matches SortedOps[OpIdx][Lane-1].
2719 std::optional<unsigned> BestIdx =
2720 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2721 MainAltOps[OpIdx], UsedLanes);
2722// By not selecting a value, we allow the operands that follow to
2723// select a better matching value. We will get a non-null value in
2724// the next run of getBestOperand().
2725if (BestIdx) {
2726// Swap the current operand with the one returned by
2727// getBestOperand().
2728 swap(OpIdx, *BestIdx, Lane);
2729 }else {
2730// Enable the second pass.
2731 StrategyFailed =true;
2732 }
2733// Try to get the alternate opcode and follow it during analysis.
2734if (MainAltOps[OpIdx].size() != 2) {
2735 OperandData &AltOp = getData(OpIdx, Lane);
2736 InstructionsState OpS =
2737getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2738if (OpS && OpS.isAltShuffle())
2739 MainAltOps[OpIdx].push_back(AltOp.V);
2740 }
2741 }
2742 }
2743 }
2744// Skip second pass if the strategy did not fail.
2745if (!StrategyFailed)
2746break;
2747 }
2748 }
2749
2750#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2751LLVM_DUMP_METHODstaticStringRefgetModeStr(ReorderingMode RMode) {
2752switch (RMode) {
2753case ReorderingMode::Load:
2754return"Load";
2755case ReorderingMode::Opcode:
2756return"Opcode";
2757case ReorderingMode::Constant:
2758return"Constant";
2759case ReorderingMode::Splat:
2760return"Splat";
2761case ReorderingMode::Failed:
2762return"Failed";
2763 }
2764llvm_unreachable("Unimplemented Reordering Type");
2765 }
2766
2767LLVM_DUMP_METHODstaticraw_ostream &printMode(ReorderingMode RMode,
2768raw_ostream &OS) {
2769returnOS <<getModeStr(RMode);
2770 }
2771
2772 /// Debug print.
2773LLVM_DUMP_METHODstaticvoiddumpMode(ReorderingMode RMode) {
2774printMode(RMode,dbgs());
2775 }
2776
2777friendraw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2778returnprintMode(RMode,OS);
2779 }
2780
2781LLVM_DUMP_METHODraw_ostream &print(raw_ostream &OS) const{
2782constunsigned Indent = 2;
2783unsigned Cnt = 0;
2784for (constOperandDataVec &OpDataVec : OpsVec) {
2785OS <<"Operand " << Cnt++ <<"\n";
2786for (const OperandData &OpData : OpDataVec) {
2787OS.indent(Indent) <<"{";
2788if (Value *V = OpData.V)
2789OS << *V;
2790else
2791OS <<"null";
2792OS <<", APO:" << OpData.APO <<"}\n";
2793 }
2794OS <<"\n";
2795 }
2796returnOS;
2797 }
2798
2799 /// Debug print.
2800LLVM_DUMP_METHODvoiddump() const{print(dbgs()); }
2801#endif
2802 };
2803
 /// Evaluate each pair in \p Candidates and return the index into \p
 /// Candidates of the pair with the highest score, deemed to have the best
 /// chance to form the root of a profitable tree to vectorize. Return
 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
 /// \param Limit Lower limit of the cost, considered to be a good enough
 /// score.
2809 std::optional<int>
2810findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2811int Limit =LookAheadHeuristics::ScoreFail) const{
2812LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this,/*NumLanes=*/2,
2813RootLookAheadMaxDepth);
2814int BestScore = Limit;
2815 std::optional<int> Index;
2816for (intI : seq<int>(0, Candidates.size())) {
2817int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2818 Candidates[I].second,
2819/*U1=*/nullptr,/*U2=*/nullptr,
2820/*CurrLevel=*/1, {});
2821if (Score > BestScore) {
2822 BestScore = Score;
2823 Index =I;
2824 }
2825 }
2826return Index;
2827 }
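 // Usage sketch (hypothetical caller): a client looking for the most
 // promising pair of seed values can score all plausible pairings and only
 // proceed if some pair beats ScoreFail:
 //   SmallVector<std::pair<Value *, Value *>> Candidates = {{A, B}, {A, C}};
 //   if (std::optional<int> Best = R.findBestRootPair(Candidates))
 //     startVectorizationFrom(Candidates[*Best]); // hypothetical helper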
2828
2829 /// Checks if the instruction is marked for deletion.
2830boolisDeleted(Instruction *I) const{return DeletedInstructions.count(I); }
2831
2832 /// Removes an instruction from its block and eventually deletes it.
2833 /// It's like Instruction::eraseFromParent() except that the actual deletion
2834 /// is delayed until BoUpSLP is destructed.
2835voideraseInstruction(Instruction *I) {
2836 DeletedInstructions.insert(I);
2837 }
2838
 2839 /// Remove instructions from the parent function and clear the operands of \p
 2840 /// DeadVals instructions, marking trivially dead operands for deletion.
2841template <typename T>
2842voidremoveInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2843SmallVector<WeakTrackingVH> DeadInsts;
2844for (T *V : DeadVals) {
2845auto *I = cast<Instruction>(V);
2846 DeletedInstructions.insert(I);
2847 }
2848DenseSet<Value *> Processed;
2849for (T *V : DeadVals) {
2850if (!V || !Processed.insert(V).second)
2851continue;
2852auto *I = cast<Instruction>(V);
2853salvageDebugInfo(*I);
2854SmallVector<const TreeEntry *> Entries;
2855if (const TreeEntry *Entry = getTreeEntry(I)) {
2856 Entries.push_back(Entry);
2857auto It = MultiNodeScalars.find(I);
2858if (It != MultiNodeScalars.end())
2859 Entries.append(It->second.begin(), It->second.end());
2860 }
2861for (Use &U :I->operands()) {
2862if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2863 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2864wouldInstructionBeTriviallyDead(OpI, TLI) &&
2865 (Entries.empty() ||none_of(Entries, [&](const TreeEntry *Entry) {
2866return Entry->VectorizedValue == OpI;
2867 })))
2868 DeadInsts.push_back(OpI);
2869 }
2870I->dropAllReferences();
2871 }
2872for (T *V : DeadVals) {
2873auto *I = cast<Instruction>(V);
2874if (!I->getParent())
2875continue;
2876assert((I->use_empty() ||all_of(I->uses(),
2877 [&](Use &U) {
2878 return isDeleted(
2879 cast<Instruction>(U.getUser()));
2880 })) &&
2881"trying to erase instruction with users.");
2882I->removeFromParent();
2883 SE->forgetValue(I);
2884 }
2885// Process the dead instruction list until empty.
2886while (!DeadInsts.empty()) {
2887Value *V = DeadInsts.pop_back_val();
2888Instruction *VI = cast_or_null<Instruction>(V);
2889if (!VI || !VI->getParent())
2890continue;
2891assert(isInstructionTriviallyDead(VI, TLI) &&
2892"Live instruction found in dead worklist!");
2893assert(VI->use_empty() &&"Instructions with uses are not dead.");
2894
2895// Don't lose the debug info while deleting the instructions.
2896salvageDebugInfo(*VI);
2897
2898// Null out all of the instruction's operands to see if any operand
2899// becomes dead as we go.
2900for (Use &OpU : VI->operands()) {
2901Value *OpV = OpU.get();
2902if (!OpV)
2903continue;
2904 OpU.set(nullptr);
2905
2906if (!OpV->use_empty())
2907continue;
2908
2909// If the operand is an instruction that became dead as we nulled out
2910// the operand, and if it is 'trivially' dead, delete it in a future
2911// loop iteration.
2912if (auto *OpI = dyn_cast<Instruction>(OpV))
2913if (!DeletedInstructions.contains(OpI) &&
2914isInstructionTriviallyDead(OpI, TLI))
2915 DeadInsts.push_back(OpI);
2916 }
2917
2918 VI->removeFromParent();
2919 DeletedInstructions.insert(VI);
2920 SE->forgetValue(VI);
2921 }
2922 }
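  /// Illustrative sketch (assumption, not part of the pass): erasing a bundle
  /// of scalar instructions that were fully replaced by vector code. `R` is a
  /// BoUpSLP instance and `collectReplacedScalars` is a hypothetical helper
  /// that gathers the dead scalars.
  /// \code
  ///   SmallVector<Instruction *> DeadScalars = collectReplacedScalars();
  ///   R.removeInstructionsAndOperands(ArrayRef<Instruction *>(DeadScalars));
  ///   // Operands that became trivially dead are queued and erased as well.
  /// \endcode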
2923
 2924 /// Checks if the instruction was already analyzed for being a possible
 2925 /// reduction root.
2926boolisAnalyzedReductionRoot(Instruction *I) const{
2927return AnalyzedReductionsRoots.count(I);
2928 }
 2929 /// Register the given instruction as already analyzed for being a possible
 2930 /// reduction root.
2931voidanalyzedReductionRoot(Instruction *I) {
2932 AnalyzedReductionsRoots.insert(I);
2933 }
 2934 /// Checks if the provided list of reduced values was already checked for
 2935 /// vectorization.
2936boolareAnalyzedReductionVals(ArrayRef<Value *> VL) const{
2937return AnalyzedReductionVals.contains(hash_value(VL));
2938 }
 2939 /// Adds the list of reduced values to the list of values already checked for
 2940 /// vectorization.
2941voidanalyzedReductionVals(ArrayRef<Value *> VL) {
2942 AnalyzedReductionVals.insert(hash_value(VL));
2943 }
2944 /// Clear the list of the analyzed reduction root instructions.
2945voidclearReductionData() {
2946 AnalyzedReductionsRoots.clear();
2947 AnalyzedReductionVals.clear();
2948 AnalyzedMinBWVals.clear();
2949 }
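  /// Illustrative sketch of how the hash-based memoization above is typically
  /// used (an assumption about the caller, presumably the horizontal-reduction
  /// driver): check the cache before re-running the expensive analysis.
  /// \code
  ///   if (!R.areAnalyzedReductionVals(ReducedVals)) {
  ///     R.analyzedReductionVals(ReducedVals);
  ///     // attempt to vectorize ReducedVals here
  ///   }
  /// \endcode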
 2950 /// Checks if any of the given values is gathered in one of the nodes.
2951boolisAnyGathered(constSmallDenseSet<Value *> &Vals) const{
2952returnany_of(MustGather, [&](Value *V) {return Vals.contains(V); });
2953 }
2954 /// Checks if the given value is gathered in one of the nodes.
2955boolisGathered(constValue *V) const{
2956return MustGather.contains(V);
2957 }
 2958 /// Checks if the specified value was not scheduled.
2959boolisNotScheduled(constValue *V) const{
2960return NonScheduledFirst.contains(V);
2961 }
2962
2963 /// Check if the value is vectorized in the tree.
2964boolisVectorized(Value *V) const{return getTreeEntry(V); }
2965
2966~BoUpSLP();
2967
2968private:
 2969 /// Determine if a node \p E can be demoted to a smaller type with a
2970 /// truncation. We collect the entries that will be demoted in ToDemote.
2971 /// \param E Node for analysis
2972 /// \param ToDemote indices of the nodes to be demoted.
2973bool collectValuesToDemote(
2974const TreeEntry &E,bool IsProfitableToDemoteRoot,unsigned &BitWidth,
2975SmallVectorImpl<unsigned> &ToDemote,DenseSet<const TreeEntry *> &Visited,
2976constSmallDenseSet<unsigned, 8> &NodesToKeepBWs,unsigned &MaxDepthLevel,
2977bool &IsProfitableToDemote,bool IsTruncRoot)const;
2978
 2979 /// Check if the operands on the edges \p Edges of the \p UserTE allow
 2980 /// reordering (i.e. the operands can be reordered because they have only one
 2981 /// user and are reorderable).
 2982 /// \param ReorderableGathers List of all gather nodes that require reordering
 2983 /// (e.g., gathers of extractelements or partially vectorizable loads).
2984 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2985 /// reordering, subset of \p NonVectorized.
2986bool
2987 canReorderOperands(TreeEntry *UserTE,
2988SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2989ArrayRef<TreeEntry *> ReorderableGathers,
2990SmallVectorImpl<TreeEntry *> &GatherOps);
2991
2992 /// Checks if the given \p TE is a gather node with clustered reused scalars
2993 /// and reorders it per given \p Mask.
2994void reorderNodeWithReuses(TreeEntry &TE,ArrayRef<int> Mask)const;
2995
2996 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2997 /// if any. If it is not vectorized (gather node), returns nullptr.
2998 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,unsigned OpIdx) {
2999ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
3000 TreeEntry *TE =nullptr;
3001constauto *It =find_if(VL, [&](Value *V) {
3002 TE = getTreeEntry(V);
3003if (TE &&is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
3004returntrue;
3005auto It = MultiNodeScalars.find(V);
3006if (It != MultiNodeScalars.end()) {
3007for (TreeEntry *E : It->second) {
3008if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
3009 TE = E;
3010returntrue;
3011 }
3012 }
3013 }
3014returnfalse;
3015 });
3016if (It != VL.end()) {
3017assert(TE->isSame(VL) &&"Expected same scalars.");
3018returnTE;
3019 }
3020returnnullptr;
3021 }
3022
3023 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
3024 /// if any. If it is not vectorized (gather node), returns nullptr.
3025const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
3026unsigned OpIdx) const{
3027returnconst_cast<BoUpSLP *>(this)->getVectorizedOperand(
3028const_cast<TreeEntry *>(UserTE), OpIdx);
3029 }
3030
3031 /// Checks if all users of \p I are the part of the vectorization tree.
3032bool areAllUsersVectorized(
3033Instruction *I,
3034constSmallDenseSet<Value *> *VectorizedVals =nullptr)const;
3035
3036 /// Return information about the vector formed for the specified index
3037 /// of a vector of (the same) instruction.
3038TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3039
 3040 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3041const TreeEntry *getOperandEntry(const TreeEntry *E,unsignedIdx)const;
3042
3043 /// Gets the root instruction for the given node. If the node is a strided
3044 /// load/store node with the reverse order, the root instruction is the last
3045 /// one.
3046Instruction *getRootEntryInstruction(const TreeEntry &Entry)const;
3047
3048 /// \returns Cast context for the given graph node.
3049TargetTransformInfo::CastContextHint
3050 getCastContextHint(const TreeEntry &TE)const;
3051
3052 /// \returns the cost of the vectorizable entry.
3053InstructionCost getEntryCost(const TreeEntry *E,
3054ArrayRef<Value *> VectorizedVals,
3055SmallPtrSetImpl<Value *> &CheckedExtracts);
3056
3057 /// This is the recursive part of buildTree.
3058void buildTree_rec(ArrayRef<Value *> Roots,unsignedDepth,
3059const EdgeInfo &EI,unsigned InterleaveFactor = 0);
3060
3061 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3062 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3063 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3064 /// returns false, setting \p CurrentOrder to either an empty vector or a
3065 /// non-identity permutation that allows to reuse extract instructions.
3066 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3067 /// extract order.
3068bool canReuseExtract(ArrayRef<Value *> VL,
3069SmallVectorImpl<unsigned> &CurrentOrder,
3070bool ResizeAllowed =false)const;
3071
3072 /// Vectorize a single entry in the tree.
 3073 /// \param PostponedPHIs true if we need to postpone emission of phi nodes to
 3074 /// avoid issues with def-use order.
3075Value *vectorizeTree(TreeEntry *E,bool PostponedPHIs);
3076
3077 /// Returns vectorized operand node, that matches the order of the scalars
3078 /// operand number \p NodeIdx in entry \p E.
3079 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,unsigned NodeIdx);
3080const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
3081unsigned NodeIdx) const{
3082returnconst_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
3083 }
3084
3085 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3086 /// \p E.
 3087 /// \param PostponedPHIs true if we need to postpone emission of phi nodes to
 3088 /// avoid issues with def-use order.
3089Value *vectorizeOperand(TreeEntry *E,unsigned NodeIdx,bool PostponedPHIs);
3090
3091 /// Create a new vector from a list of scalar values. Produces a sequence
3092 /// which exploits values reused across lanes, and arranges the inserts
3093 /// for ease of later optimization.
3094template <typename BVTy,typename ResTy,typename...Args>
3095 ResTy processBuildVector(const TreeEntry *E,Type *ScalarTy, Args &...Params);
3096
3097 /// Create a new vector from a list of scalar values. Produces a sequence
3098 /// which exploits values reused across lanes, and arranges the inserts
3099 /// for ease of later optimization.
3100Value *createBuildVector(const TreeEntry *E,Type *ScalarTy,
3101bool PostponedPHIs);
3102
3103 /// Returns the instruction in the bundle, which can be used as a base point
3104 /// for scheduling. Usually it is the last instruction in the bundle, except
3105 /// for the case when all operands are external (in this case, it is the first
3106 /// instruction in the list).
3107Instruction &getLastInstructionInBundle(const TreeEntry *E);
3108
3109 /// Tries to find extractelement instructions with constant indices from fixed
3110 /// vector type and gather such instructions into a bunch, which highly likely
3111 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3112 /// was successful, the matched scalars are replaced by poison values in \p VL
3113 /// for future analysis.
3114 std::optional<TargetTransformInfo::ShuffleKind>
3115 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3116SmallVectorImpl<int> &Mask)const;
3117
3118 /// Tries to find extractelement instructions with constant indices from fixed
3119 /// vector type and gather such instructions into a bunch, which highly likely
3120 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3121 /// was successful, the matched scalars are replaced by poison values in \p VL
3122 /// for future analysis.
3123SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3124 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3125SmallVectorImpl<int> &Mask,
3126unsigned NumParts)const;
3127
3128 /// Checks if the gathered \p VL can be represented as a single register
3129 /// shuffle(s) of previous tree entries.
3130 /// \param TE Tree entry checked for permutation.
 3131 /// \param VL List of scalars (a subset of the TE scalars), checked for
 3132 /// permutations. Must form a single-register vector.
3133 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3134 /// commands to build the mask using the original vector value, without
3135 /// relying on the potential reordering.
3136 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3137 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3138 std::optional<TargetTransformInfo::ShuffleKind>
3139 isGatherShuffledSingleRegisterEntry(
3140const TreeEntry *TE,ArrayRef<Value *> VL,MutableArrayRef<int> Mask,
3141SmallVectorImpl<const TreeEntry *> &Entries,unsigned Part,
3142bool ForOrder);
3143
3144 /// Checks if the gathered \p VL can be represented as multi-register
3145 /// shuffle(s) of previous tree entries.
3146 /// \param TE Tree entry checked for permutation.
 3147 /// \param VL List of scalars (a subset of the TE scalars), checked for
3148 /// permutations.
3149 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3150 /// commands to build the mask using the original vector value, without
3151 /// relying on the potential reordering.
3152 /// \returns per-register series of ShuffleKind, if gathered values can be
3153 /// represented as shuffles of previous tree entries. \p Mask is filled with
3154 /// the shuffle mask (also on per-register base).
3155SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3156 isGatherShuffledEntry(
3157const TreeEntry *TE,ArrayRef<Value *> VL,SmallVectorImpl<int> &Mask,
3158SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3159unsigned NumParts,bool ForOrder =false);
3160
3161 /// \returns the cost of gathering (inserting) the values in \p VL into a
3162 /// vector.
3163 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3164InstructionCost getGatherCost(ArrayRef<Value *> VL,bool ForPoisonSrc,
3165Type *ScalarTy)const;
3166
3167 /// Set the Builder insert point to one after the last instruction in
3168 /// the bundle
3169void setInsertPointAfterBundle(const TreeEntry *E);
3170
3171 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3172 /// specified, the starting vector value is poison.
3173Value *
3174 gather(ArrayRef<Value *> VL,Value *Root,Type *ScalarTy,
3175function_ref<Value *(Value *,Value *,ArrayRef<int>)> CreateShuffle);
3176
 3177 /// \returns whether the VectorizableTree is fully vectorizable and will
 3178 /// be beneficial even if the tree height is tiny.
3179bool isFullyVectorizableTinyTree(bool ForReduction)const;
3180
3181 /// Run through the list of all gathered loads in the graph and try to find
3182 /// vector loads/masked gathers instead of regular gathers. Later these loads
 3183 /// are reshuffled to build the final gathered nodes.
3184void tryToVectorizeGatheredLoads(
3185constSmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
3186SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
3187 8> &GatheredLoads);
3188
3189 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3190 /// users of \p TE and collects the stores. It returns the map from the store
3191 /// pointers to the collected stores.
3192SmallVector<SmallVector<StoreInst *>>
3193 collectUserStores(const BoUpSLP::TreeEntry *TE)const;
3194
3195 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3196 /// stores in \p StoresVec can form a vector instruction. If so it returns
3197 /// true and populates \p ReorderIndices with the shuffle indices of the
3198 /// stores when compared to the sorted vector.
3199bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3200OrdersType &ReorderIndices)const;
3201
3202 /// Iterates through the users of \p TE, looking for scalar stores that can be
3203 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3204 /// their order and builds an order index vector for each store bundle. It
3205 /// returns all these order vectors found.
3206 /// We run this after the tree has formed, otherwise we may come across user
3207 /// instructions that are not yet in the tree.
3208SmallVector<OrdersType, 1>
3209 findExternalStoreUsersReorderIndices(TreeEntry *TE)const;
3210
3211 /// Tries to reorder the gathering node for better vectorization
3212 /// opportunities.
3213void reorderGatherNode(TreeEntry &TE);
3214
3215structTreeEntry {
3216usingVecTreeTy =SmallVector<std::unique_ptr<TreeEntry>, 8>;
3217 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3218
3219 /// \returns Common mask for reorder indices and reused scalars.
3220SmallVector<int> getCommonMask() const{
3221SmallVector<int>Mask;
3222inversePermutation(ReorderIndices, Mask);
3223::addMask(Mask, ReuseShuffleIndices);
3224returnMask;
3225 }
3226
3227 /// \returns true if the scalars in VL are equal to this entry.
3228bool isSame(ArrayRef<Value *> VL) const{
3229auto &&IsSame = [VL](ArrayRef<Value *> Scalars,ArrayRef<int>Mask) {
3230if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3231return std::equal(VL.begin(), VL.end(), Scalars.begin());
3232return VL.size() ==Mask.size() &&
3233 std::equal(VL.begin(), VL.end(),Mask.begin(),
3234 [Scalars](Value *V,int Idx) {
3235 return (isa<UndefValue>(V) &&
3236 Idx == PoisonMaskElem) ||
3237 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3238 });
3239 };
3240if (!ReorderIndices.empty()) {
3241// TODO: implement matching if the nodes are just reordered, still can
3242// treat the vector as the same if the list of scalars matches VL
3243// directly, without reordering.
3244SmallVector<int>Mask;
3245inversePermutation(ReorderIndices, Mask);
3246if (VL.size() == Scalars.size())
3247return IsSame(Scalars, Mask);
3248if (VL.size() == ReuseShuffleIndices.size()) {
3249::addMask(Mask, ReuseShuffleIndices);
3250return IsSame(Scalars, Mask);
3251 }
3252returnfalse;
3253 }
3254return IsSame(Scalars, ReuseShuffleIndices);
3255 }
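    /// Illustrative note (assumed example values): with Scalars = {a, b} and
    /// ReuseShuffleIndices = {0, 1, 0, 1}, isSame({a, b, a, b}) returns true
    /// because every VL element equals Scalars[Mask[i]]; an undef element in
    /// VL also matches when the corresponding mask entry is PoisonMaskElem.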
3256
3257bool isOperandGatherNode(const EdgeInfo &UserEI) const{
3258returnisGather() && !UserTreeIndices.empty() &&
3259 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3260 UserTreeIndices.front().UserTE == UserEI.UserTE;
3261 }
3262
3263 /// \returns true if current entry has same operands as \p TE.
3264bool hasEqualOperands(const TreeEntry &TE) const{
3265if (TE.getNumOperands() != getNumOperands())
3266returnfalse;
3267SmallBitVectorUsed(getNumOperands());
3268for (unsignedI = 0, E = getNumOperands();I < E; ++I) {
3269unsigned PrevCount =Used.count();
3270for (unsigned K = 0;K < E; ++K) {
3271if (Used.test(K))
3272continue;
3273if (getOperand(K) ==TE.getOperand(I)) {
3274Used.set(K);
3275break;
3276 }
3277 }
3278// Check if we actually found the matching operand.
3279if (PrevCount ==Used.count())
3280returnfalse;
3281 }
3282returntrue;
3283 }
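    /// Illustrative note (assumed example): the comparison above is unordered,
    /// so an entry with operands {X, Y} and an entry with operands {Y, X}
    /// compare equal, as long as each operand list of one entry matches some
    /// not-yet-used operand list of the other.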
3284
 3285 /// \return Final vectorization factor for the node. Defined by the total
 3286 /// number of vectorized scalars, including those used several times in the
 3287 /// entry and counted in \a ReuseShuffleIndices, if any.
3288unsigned getVectorFactor() const{
3289if (!ReuseShuffleIndices.empty())
3290return ReuseShuffleIndices.size();
3291return Scalars.size();
3292 };
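    /// Illustrative note (assumed example values): with Scalars = {a, b, c, d}
    /// and a ReuseShuffleIndices of size 8 (each scalar used twice), the
    /// vector factor is 8, not Scalars.size() == 4.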
3293
3294 /// Checks if the current node is a gather node.
3295boolisGather() const{return State == NeedToGather; }
3296
3297 /// A vector of scalars.
3298ValueList Scalars;
3299
3300 /// The Scalars are vectorized into this value. It is initialized to Null.
3301WeakTrackingVH VectorizedValue =nullptr;
3302
3303 /// New vector phi instructions emitted for the vectorized phi nodes.
3304PHINode *PHI =nullptr;
3305
3306 /// Do we need to gather this sequence or vectorize it
3307 /// (either with vector instruction or with scatter/gather
3308 /// intrinsics for store/load)?
3309enum EntryState {
3310 Vectorize,///< The node is regularly vectorized.
3311 ScatterVectorize,///< Masked scatter/gather node.
3312 StridedVectorize,///< Strided loads (and stores)
3313 NeedToGather,///< Gather/buildvector node.
3314 CombinedVectorize,///< Vectorized node, combined with its user into more
3315 ///< complex node like select/cmp to minmax, mul/add to
3316 ///< fma, etc. Must be used for the following nodes in
3317 ///< the pattern, not the very first one.
3318 };
3319 EntryState State;
3320
3321 /// List of combined opcodes supported by the vectorizer.
3322enum CombinedOpcode {
3323 NotCombinedOp = -1,
3324MinMax = Instruction::OtherOpsEnd + 1,
3325 };
3326 CombinedOpcode CombinedOp = NotCombinedOp;
3327
3328 /// Does this sequence require some shuffling?
3329SmallVector<int, 4> ReuseShuffleIndices;
3330
3331 /// Does this entry require reordering?
3332SmallVector<unsigned, 4> ReorderIndices;
3333
3334 /// Points back to the VectorizableTree.
3335 ///
3336 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3337 /// to be a pointer and needs to be able to initialize the child iterator.
3338 /// Thus we need a reference back to the container to translate the indices
3339 /// to entries.
3340 VecTreeTy &Container;
3341
3342 /// The TreeEntry index containing the user of this entry. We can actually
3343 /// have multiple users so the data structure is not truly a tree.
3344SmallVector<EdgeInfo, 1> UserTreeIndices;
3345
3346 /// The index of this treeEntry in VectorizableTree.
3347unsignedIdx = 0;
3348
3349 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3350 /// other nodes as a series of insertvector instructions.
3351SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3352
3353private:
3354 /// The operands of each instruction in each lane Operands[op_index][lane].
3355 /// Note: This helps avoid the replication of the code that performs the
3356 /// reordering of operands during buildTree_rec() and vectorizeTree().
3357SmallVector<ValueList, 2>Operands;
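    /// Illustrative note (assumed example): for a bundle of two adds
    /// {add a0, b0; add a1, b1}, Operands[0] = {a0, a1} and
    /// Operands[1] = {b0, b1}, i.e. the outer index is the operand number and
    /// the inner index is the lane.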
3358
3359 /// MainOp and AltOp are recorded inside. S should be obtained from
3360 /// newTreeEntry.
3361 InstructionsState S = InstructionsState::invalid();
3362
3363 /// Interleaving factor for interleaved loads Vectorize nodes.
3364unsigned InterleaveFactor = 0;
3365
3366public:
3367 /// Returns interleave factor for interleave nodes.
3368unsigned getInterleaveFactor() const{return InterleaveFactor; }
3369 /// Sets interleaving factor for the interleaving nodes.
3370void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3371
3372 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3373void setOperand(unsigned OpIdx,ArrayRef<Value *> OpVL) {
3374if (Operands.size() < OpIdx + 1)
3375Operands.resize(OpIdx + 1);
3376assert(Operands[OpIdx].empty() &&"Already resized?");
3377assert(OpVL.size() <= Scalars.size() &&
3378"Number of operands is greater than the number of scalars.");
3379Operands[OpIdx].resize(OpVL.size());
3380copy(OpVL, Operands[OpIdx].begin());
3381 }
3382
3383 /// Set this bundle's operand from Scalars.
3384void setOperand(constBoUpSLP &R,bool RequireReorder =false) {
3385 VLOperands Ops(Scalars, S, R);
3386if (RequireReorder)
3387 Ops.reorder();
3388for (unsignedI : seq<unsigned>(S.getMainOp()->getNumOperands()))
3389 setOperand(I, Ops.getVL(I));
3390 }
3391
3392 /// Reorders operands of the node to the given mask \p Mask.
3393void reorderOperands(ArrayRef<int> Mask) {
3394for (ValueList &Operand : Operands)
3395reorderScalars(Operand, Mask);
3396 }
3397
3398 /// \returns the \p OpIdx operand of this TreeEntry.
3399ValueList &getOperand(unsigned OpIdx) {
3400assert(OpIdx <Operands.size() &&"Off bounds");
3401returnOperands[OpIdx];
3402 }
3403
3404 /// \returns the \p OpIdx operand of this TreeEntry.
3405ArrayRef<Value *> getOperand(unsigned OpIdx) const{
3406assert(OpIdx <Operands.size() &&"Off bounds");
3407returnOperands[OpIdx];
3408 }
3409
3410 /// \returns the number of operands.
3411unsigned getNumOperands() const{returnOperands.size(); }
3412
3413 /// \return the single \p OpIdx operand.
3414Value *getSingleOperand(unsigned OpIdx) const{
3415assert(OpIdx <Operands.size() &&"Off bounds");
3416assert(!Operands[OpIdx].empty() &&"No operand available");
3417returnOperands[OpIdx][0];
3418 }
3419
3420 /// Some of the instructions in the list have alternate opcodes.
3421bool isAltShuffle() const{return S.isAltShuffle(); }
3422
3423bool isOpcodeOrAlt(Instruction *I) const{return S.isOpcodeOrAlt(I); }
3424
 3425 /// Chooses the correct key for scheduling data. If \p Op has the same (or
 3426 /// alternate) opcode as this entry's main operation, the key is \p Op.
 3427 /// Otherwise the key is the main operation itself.
3428Value *isOneOf(Value *Op) const{
3429auto *I = dyn_cast<Instruction>(Op);
3430if (I && isOpcodeOrAlt(I))
3431returnOp;
3432return S.getMainOp();
3433 }
3434
3435void setOperations(const InstructionsState &S) {
3436assert(S &&"InstructionsState is invalid.");
3437 this->S = S;
3438 }
3439
3440Instruction *getMainOp() const{return S.getMainOp(); }
3441
3442Instruction *getAltOp() const{return S.getAltOp(); }
3443
3444 /// The main/alternate opcodes for the list of instructions.
3445unsigned getOpcode() const{return S.getOpcode(); }
3446
3447unsigned getAltOpcode() const{return S.getAltOpcode(); }
3448
3449bool hasState() const{return S.valid(); }
3450
 3451 /// When ReorderIndices/ReuseShuffleIndices are empty this just returns the
 3452 /// position of \p V within Scalars; otherwise it remaps it via those indices.
3453int findLaneForValue(Value *V) const{
3454unsigned FoundLane = getVectorFactor();
3455for (auto *It =find(Scalars, V), *End = Scalars.end(); It !=End;
3456 std::advance(It, 1)) {
3457if (*It != V)
3458continue;
3459 FoundLane = std::distance(Scalars.begin(), It);
3460assert(FoundLane < Scalars.size() &&"Couldn't find extract lane");
3461if (!ReorderIndices.empty())
3462 FoundLane = ReorderIndices[FoundLane];
3463assert(FoundLane < Scalars.size() &&"Couldn't find extract lane");
3464if (ReuseShuffleIndices.empty())
3465break;
3466if (auto *RIt =find(ReuseShuffleIndices, FoundLane);
3467 RIt != ReuseShuffleIndices.end()) {
3468 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3469break;
3470 }
3471 }
3472assert(FoundLane < getVectorFactor() &&"Unable to find given value.");
3473return FoundLane;
3474 }
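    /// Illustrative note (assumed example values): with Scalars = {a, b},
    /// empty ReorderIndices and ReuseShuffleIndices = {1, 0, 1, 0},
    /// findLaneForValue(b) first finds position 1 in Scalars and then remaps
    /// it to lane 0, the first reuse-mask entry equal to 1.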
3475
3476 /// Build a shuffle mask for graph entry which represents a merge of main
3477 /// and alternate operations.
3478void
3479 buildAltOpShuffleMask(constfunction_ref<bool(Instruction *)> IsAltOp,
3480SmallVectorImpl<int> &Mask,
3481SmallVectorImpl<Value *> *OpScalars =nullptr,
3482SmallVectorImpl<Value *> *AltScalars =nullptr)const;
3483
3484 /// Return true if this is a non-power-of-2 node.
3485bool isNonPowOf2Vec() const{
3486bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3487return IsNonPowerOf2;
3488 }
3489
 3490 /// Return true if the number of elements in this node neither forms whole
 3491 /// vector registers nor is a power of 2.
3492bool
3493 hasNonWholeRegisterOrNonPowerOf2Vec(constTargetTransformInfo &TTI) const{
3494bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3495TTI,getValueType(Scalars.front()), Scalars.size());
3496assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3497"Reshuffling not supported with non-power-of-2 vectors yet.");
3498return IsNonPowerOf2;
3499 }
3500
3501Value *getOrdered(unsigned Idx) const{
3502assert(isGather() &&"Must be used only for buildvectors/gathers.");
3503if (ReorderIndices.empty())
3504return Scalars[Idx];
3505SmallVector<int>Mask;
3506inversePermutation(ReorderIndices, Mask);
3507return Scalars[Mask[Idx]];
3508 }
3509
3510#ifndef NDEBUG
3511 /// Debug printer.
3512LLVM_DUMP_METHODvoiddump() const{
3513dbgs() <<Idx <<".\n";
3514for (unsigned OpI = 0, OpE =Operands.size(); OpI != OpE; ++OpI) {
3515dbgs() <<"Operand " << OpI <<":\n";
3516for (constValue *V : Operands[OpI])
3517dbgs().indent(2) << *V <<"\n";
3518 }
3519dbgs() <<"Scalars: \n";
3520for (Value *V : Scalars)
3521dbgs().indent(2) << *V <<"\n";
3522dbgs() <<"State: ";
3523switch (State) {
3524case Vectorize:
3525if (InterleaveFactor > 0) {
3526dbgs() <<"Vectorize with interleave factor " << InterleaveFactor
3527 <<"\n";
3528 }else {
3529dbgs() <<"Vectorize\n";
3530 }
3531break;
3532case ScatterVectorize:
3533dbgs() <<"ScatterVectorize\n";
3534break;
3535case StridedVectorize:
3536dbgs() <<"StridedVectorize\n";
3537break;
3538case NeedToGather:
3539dbgs() <<"NeedToGather\n";
3540break;
3541case CombinedVectorize:
3542dbgs() <<"CombinedVectorize\n";
3543break;
3544 }
3545if (S) {
3546dbgs() <<"MainOp: " << *S.getMainOp() <<"\n";
3547dbgs() <<"AltOp: " << *S.getAltOp() <<"\n";
3548 }else {
3549dbgs() <<"MainOp: NULL\n";
3550dbgs() <<"AltOp: NULL\n";
3551 }
3552dbgs() <<"VectorizedValue: ";
3553if (VectorizedValue)
3554dbgs() << *VectorizedValue <<"\n";
3555else
3556dbgs() <<"NULL\n";
3557dbgs() <<"ReuseShuffleIndices: ";
3558if (ReuseShuffleIndices.empty())
3559dbgs() <<"Empty";
3560else
3561for (int ReuseIdx : ReuseShuffleIndices)
3562dbgs() << ReuseIdx <<", ";
3563dbgs() <<"\n";
3564dbgs() <<"ReorderIndices: ";
3565for (unsigned ReorderIdx : ReorderIndices)
3566dbgs() << ReorderIdx <<", ";
3567dbgs() <<"\n";
3568dbgs() <<"UserTreeIndices: ";
3569for (constauto &EInfo : UserTreeIndices)
3570dbgs() << EInfo <<", ";
3571dbgs() <<"\n";
3572if (!CombinedEntriesWithIndices.empty()) {
3573dbgs() <<"Combined entries: ";
3574interleaveComma(CombinedEntriesWithIndices,dbgs(), [&](constauto &P) {
3575dbgs() <<"Entry index " <<P.first <<" with offset " <<P.second;
3576 });
3577dbgs() <<"\n";
3578 }
3579 }
3580#endif
3581 };
3582
3583#ifndef NDEBUG
3584void dumpTreeCosts(const TreeEntry *E,InstructionCost ReuseShuffleCost,
3585InstructionCost VecCost,InstructionCost ScalarCost,
3586StringRef Banner) const{
3587dbgs() <<"SLP: " << Banner <<":\n";
3588 E->dump();
3589dbgs() <<"SLP: Costs:\n";
3590dbgs() <<"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<"\n";
3591dbgs() <<"SLP: VectorCost = " << VecCost <<"\n";
3592dbgs() <<"SLP: ScalarCost = " << ScalarCost <<"\n";
3593dbgs() <<"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3594 << ReuseShuffleCost + VecCost - ScalarCost <<"\n";
3595 }
3596#endif
3597
3598 /// Create a new VectorizableTree entry.
3599 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3600 std::optional<ScheduleData *> Bundle,
3601const InstructionsState &S,
3602const EdgeInfo &UserTreeIdx,
3603ArrayRef<int> ReuseShuffleIndices = {},
3604ArrayRef<unsigned> ReorderIndices = {},
3605unsigned InterleaveFactor = 0) {
3606 TreeEntry::EntryState EntryState =
3607 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3608 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3609 ReuseShuffleIndices, ReorderIndices);
3610if (E && InterleaveFactor > 0)
3611 E->setInterleave(InterleaveFactor);
3612return E;
3613 }
3614
3615 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3616 TreeEntry::EntryState EntryState,
3617 std::optional<ScheduleData *> Bundle,
3618const InstructionsState &S,
3619const EdgeInfo &UserTreeIdx,
3620ArrayRef<int> ReuseShuffleIndices = {},
3621ArrayRef<unsigned> ReorderIndices = {}) {
3622assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3623 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3624"Need to vectorize gather entry?");
3625// Gathered loads still gathered? Do not create entry, use the original one.
3626if (GatheredLoadsEntriesFirst.has_value() &&
3627 EntryState == TreeEntry::NeedToGather && S &&
3628 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3629 !UserTreeIdx.UserTE)
3630returnnullptr;
3631 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3632 TreeEntry *Last = VectorizableTree.back().get();
3633Last->Idx = VectorizableTree.size() - 1;
3634Last->State = EntryState;
3635// FIXME: Remove once support for ReuseShuffleIndices has been implemented
3636// for non-power-of-two vectors.
3637assert(
3638 (hasFullVectorsOrPowerOf2(*TTI,getValueType(VL.front()), VL.size()) ||
3639 ReuseShuffleIndices.empty()) &&
3640"Reshuffling scalars not yet supported for nodes with padding");
3641Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3642 ReuseShuffleIndices.end());
3643if (ReorderIndices.empty()) {
3644Last->Scalars.assign(VL.begin(), VL.end());
3645if (S)
3646Last->setOperations(S);
3647 }else {
3648// Reorder scalars and build final mask.
3649Last->Scalars.assign(VL.size(),nullptr);
3650transform(ReorderIndices,Last->Scalars.begin(),
3651 [VL](unsignedIdx) ->Value * {
3652 if (Idx >= VL.size())
3653 return UndefValue::get(VL.front()->getType());
3654 return VL[Idx];
3655 });
3656 InstructionsState S =getSameOpcode(Last->Scalars, *TLI);
3657if (S)
3658Last->setOperations(S);
3659Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3660 }
3661if (!Last->isGather()) {
3662for (Value *V : VL) {
3663if (isa<PoisonValue>(V))
3664continue;
3665const TreeEntry *TE = getTreeEntry(V);
3666assert((!TE || TE ==Last ||doesNotNeedToBeScheduled(V)) &&
3667"Scalar already in tree!");
3668if (TE) {
3669if (TE !=Last)
3670 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3671continue;
3672 }
3673 ScalarToTreeEntry[V] =Last;
3674 }
3675// Update the scheduler bundle to point to this TreeEntry.
3676 ScheduleData *BundleMember = *Bundle;
3677assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3678isVectorLikeInstWithConstOps(S.getMainOp()) ||
3679doesNotNeedToSchedule(VL)) &&
3680"Bundle and VL out of sync");
3681if (BundleMember) {
3682for (Value *V : VL) {
3683if (doesNotNeedToBeScheduled(V))
3684continue;
3685if (!BundleMember)
3686continue;
3687 BundleMember->TE =Last;
3688 BundleMember = BundleMember->NextInBundle;
3689 }
3690 }
3691assert(!BundleMember &&"Bundle and VL out of sync");
3692 }else {
3693// Build a map for gathered scalars to the nodes where they are used.
3694bool AllConstsOrCasts =true;
3695for (Value *V : VL)
3696if (!isConstant(V)) {
3697auto *I = dyn_cast<CastInst>(V);
3698 AllConstsOrCasts &=I &&I->getType()->isIntegerTy();
3699if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3700 !UserTreeIdx.UserTE->isGather())
3701 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3702 }
3703if (AllConstsOrCasts)
3704 CastMaxMinBWSizes =
3705 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3706 MustGather.insert(VL.begin(), VL.end());
3707 }
3708
3709if (UserTreeIdx.UserTE)
3710Last->UserTreeIndices.push_back(UserTreeIdx);
3711returnLast;
3712 }
3713
3714 /// -- Vectorization State --
3715 /// Holds all of the tree entries.
3716 TreeEntry::VecTreeTy VectorizableTree;
3717
3718#ifndef NDEBUG
3719 /// Debug printer.
3720LLVM_DUMP_METHODvoid dumpVectorizableTree() const{
3721for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3722 VectorizableTree[Id]->dump();
3723dbgs() <<"\n";
3724 }
3725 }
3726#endif
3727
3728 TreeEntry *getTreeEntry(Value *V) {
3729assert(V &&"V cannot be nullptr.");
3730return ScalarToTreeEntry.lookup(V);
3731 }
3732
3733const TreeEntry *getTreeEntry(Value *V) const{
3734assert(V &&"V cannot be nullptr.");
3735return ScalarToTreeEntry.lookup(V);
3736 }
3737
 3738 /// Check that the operand node of an alternate node does not generate a
 3739 /// buildvector sequence. If it does, it is probably not worth building an
 3740 /// alternate shuffle when the number of buildvector operands plus the
 3741 /// alternate instruction exceeds the number of buildvector instructions.
3742 /// \param S the instructions state of the analyzed values.
3743 /// \param VL list of the instructions with alternate opcodes.
3744bool areAltOperandsProfitable(const InstructionsState &S,
3745ArrayRef<Value *> VL)const;
3746
3747 /// Checks if the specified list of the instructions/values can be vectorized
3748 /// and fills required data before actual scheduling of the instructions.
3749 TreeEntry::EntryState
3750 getScalarsVectorizationState(const InstructionsState &S,ArrayRef<Value *> VL,
3751bool IsScatterVectorizeUserTE,
3752OrdersType &CurrentOrder,
3753SmallVectorImpl<Value *> &PointerOps);
3754
3755 /// Maps a specific scalar to its tree entry.
3756SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3757
 3758 /// Maps scalars that are used in several vectorized nodes to the list of
 3759 /// those nodes.
3760SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3761
3762 /// Maps a value to the proposed vectorizable size.
3763SmallDenseMap<Value *, unsigned> InstrElementSize;
3764
3765 /// A list of scalars that we found that we need to keep as scalars.
3766ValueSet MustGather;
3767
3768 /// A set of first non-schedulable values.
3769ValueSet NonScheduledFirst;
3770
 3771 /// A map between the vectorized entries and the last instructions in the
 3772 /// bundles. The bundles are built in use order, not in the def order of the
 3773 /// instructions, so we cannot rely on the last instruction in the bundle
 3774 /// being the last instruction in program order during vectorization. Since
 3775 /// the basic blocks are modified in the process, these instructions need to
 3776 /// be pre-gathered beforehand.
3777DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3778
 3779 /// List of gather nodes that depend on other gather/vector nodes and should
 3780 /// be emitted after the vector instruction emission process to correctly
 3781 /// handle the order of the vector instructions and shuffles.
3782SetVector<const TreeEntry *> PostponedGathers;
3783
3784usingValueToGatherNodesMap =
3785DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3786 ValueToGatherNodesMap ValueToGatherNodes;
3787
 3788 /// A list of the load entries (node indices) that can be vectorized using a
 3789 /// strided or masked gather approach, but that we first attempt to represent
 3790 /// as contiguous loads.
3791SetVector<unsigned> LoadEntriesToVectorize;
3792
3793 /// true if graph nodes transforming mode is on.
3794bool IsGraphTransformMode =false;
3795
3796 /// The index of the first gathered load entry in the VectorizeTree.
3797 std::optional<unsigned> GatheredLoadsEntriesFirst;
3798
3799 /// This POD struct describes one external user in the vectorized tree.
3800structExternalUser {
3801 ExternalUser(Value *S,llvm::User *U,int L)
3802 :Scalar(S),User(U), Lane(L) {}
3803
3804// Which scalar in our function.
3805Value *Scalar;
3806
3807// Which user that uses the scalar.
3808llvm::User *User;
3809
3810// Which lane does the scalar belong to.
3811int Lane;
3812 };
3813usingUserList =SmallVector<ExternalUser, 16>;
3814
3815 /// Checks if two instructions may access the same memory.
3816 ///
3817 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3818 /// is invariant in the calling loop.
3819bool isAliased(constMemoryLocation &Loc1,Instruction *Inst1,
3820Instruction *Inst2) {
3821if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3822returntrue;
3823// First check if the result is already in the cache.
3824 AliasCacheKeyKey = std::make_pair(Inst1, Inst2);
3825auto It = AliasCache.find(Key);
3826if (It != AliasCache.end())
3827return It->second;
3828bool Aliased =isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3829// Store the result in the cache.
3830 AliasCache.try_emplace(Key, Aliased);
3831 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3832return Aliased;
3833 }
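  /// Illustrative sketch (assumed values): the result is cached for both key
  /// orders, so a later query with the instructions swapped is served from
  /// AliasCache without re-querying BatchAA.
  /// \code
  ///   bool A1 = isAliased(Loc1, I1, I2); // queries BatchAA, fills the cache
  ///   bool A2 = isAliased(Loc2, I2, I1); // cache hit, returns the same result
  /// \endcode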
3834
3835usingAliasCacheKey = std::pair<Instruction *, Instruction *>;
3836
3837 /// Cache for alias results.
3838 /// TODO: consider moving this to the AliasAnalysis itself.
3839DenseMap<AliasCacheKey, bool> AliasCache;
3840
3841// Cache for pointerMayBeCaptured calls inside AA. This is preserved
3842// globally through SLP because we don't perform any action which
3843// invalidates capture results.
3844BatchAAResults BatchAA;
3845
3846 /// Temporary store for deleted instructions. Instructions will be deleted
3847 /// eventually when the BoUpSLP is destructed. The deferral is required to
3848 /// ensure that there are no incorrect collisions in the AliasCache, which
3849 /// can happen if a new instruction is allocated at the same address as a
3850 /// previously deleted instruction.
3851DenseSet<Instruction *> DeletedInstructions;
3852
 3853 /// Set of the instructions already analyzed as possible reduction roots.
3854SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3855
3856 /// Set of hashes for the list of reduction values already being analyzed.
3857DenseSet<size_t> AnalyzedReductionVals;
3858
 3859 /// Values already analyzed for minimal bitwidth and found to be
 3860 /// non-profitable.
3861DenseSet<Value *> AnalyzedMinBWVals;
3862
 3863 /// A list of values that need to be extracted out of the tree.
3864 /// This list holds pairs of (Internal Scalar : External User). External User
3865 /// can be nullptr, it means that this Internal Scalar will be used later,
3866 /// after vectorization.
3867 UserList ExternalUses;
3868
 3869 /// A list of GEPs which can be replaced by scalar GEPs instead of
 3870 /// extractelement instructions.
3871SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3872
3873 /// Values used only by @llvm.assume calls.
3874SmallPtrSet<const Value *, 32> EphValues;
3875
3876 /// Holds all of the instructions that we gathered, shuffle instructions and
3877 /// extractelements.
3878SetVector<Instruction *> GatherShuffleExtractSeq;
3879
3880 /// A list of blocks that we are going to CSE.
3881DenseSet<BasicBlock *> CSEBlocks;
3882
 3883 /// List of hashes of vectors of loads which are known to be non-vectorizable.
3884DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3885
3886 /// Contains all scheduling relevant data for an instruction.
3887 /// A ScheduleData either represents a single instruction or a member of an
3888 /// instruction bundle (= a group of instructions which is combined into a
3889 /// vector instruction).
3890structScheduleData {
3891// The initial value for the dependency counters. It means that the
3892// dependencies are not calculated yet.
3893enum { InvalidDeps = -1 };
3894
3895 ScheduleData() =default;
3896
3897voidinit(int BlockSchedulingRegionID,Instruction *I) {
3898 FirstInBundle =this;
3899 NextInBundle =nullptr;
3900 NextLoadStore =nullptr;
3901 IsScheduled =false;
3902 SchedulingRegionID = BlockSchedulingRegionID;
3903 clearDependencies();
3904 Inst =I;
3905TE =nullptr;
3906 }
3907
3908 /// Verify basic self consistency properties
3909voidverify() {
3910if (hasValidDependencies()) {
3911assert(UnscheduledDeps <= Dependencies &&"invariant");
3912 }else {
3913assert(UnscheduledDeps == Dependencies &&"invariant");
3914 }
3915
3916if (IsScheduled) {
3917assert(isSchedulingEntity() &&
3918"unexpected scheduled state");
3919for (const ScheduleData *BundleMember =this; BundleMember;
3920 BundleMember = BundleMember->NextInBundle) {
3921assert(BundleMember->hasValidDependencies() &&
3922 BundleMember->UnscheduledDeps == 0 &&
3923"unexpected scheduled state");
3924assert((BundleMember ==this || !BundleMember->IsScheduled) &&
3925"only bundle is marked scheduled");
3926 }
3927 }
3928
3929assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3930"all bundle members must be in same basic block");
3931 }
3932
3933 /// Returns true if the dependency information has been calculated.
 3934 /// Note that dependency validity can vary between instructions within
3935 /// a single bundle.
3936bool hasValidDependencies() const{return Dependencies != InvalidDeps; }
3937
3938 /// Returns true for single instructions and for bundle representatives
3939 /// (= the head of a bundle).
3940bool isSchedulingEntity() const{return FirstInBundle ==this; }
3941
3942 /// Returns true if it represents an instruction bundle and not only a
3943 /// single instruction.
3944bool isPartOfBundle() const{
3945return NextInBundle !=nullptr || FirstInBundle !=this ||TE;
3946 }
3947
3948 /// Returns true if it is ready for scheduling, i.e. it has no more
3949 /// unscheduled depending instructions/bundles.
3950bool isReady() const{
3951assert(isSchedulingEntity() &&
3952"can't consider non-scheduling entity for ready list");
3953return unscheduledDepsInBundle() == 0 && !IsScheduled;
3954 }
3955
3956 /// Modifies the number of unscheduled dependencies for this instruction,
3957 /// and returns the number of remaining dependencies for the containing
3958 /// bundle.
3959int incrementUnscheduledDeps(int Incr) {
3960assert(hasValidDependencies() &&
3961"increment of unscheduled deps would be meaningless");
3962 UnscheduledDeps += Incr;
3963return FirstInBundle->unscheduledDepsInBundle();
3964 }
3965
3966 /// Sets the number of unscheduled dependencies to the number of
3967 /// dependencies.
3968void resetUnscheduledDeps() {
3969 UnscheduledDeps = Dependencies;
3970 }
3971
3972 /// Clears all dependency information.
3973void clearDependencies() {
3974 Dependencies = InvalidDeps;
3975 resetUnscheduledDeps();
3976 MemoryDependencies.clear();
3977 ControlDependencies.clear();
3978 }
3979
3980int unscheduledDepsInBundle() const{
3981assert(isSchedulingEntity() &&"only meaningful on the bundle");
3982int Sum = 0;
3983for (const ScheduleData *BundleMember =this; BundleMember;
3984 BundleMember = BundleMember->NextInBundle) {
3985if (BundleMember->UnscheduledDeps == InvalidDeps)
3986return InvalidDeps;
3987 Sum += BundleMember->UnscheduledDeps;
3988 }
3989return Sum;
3990 }
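    /// Illustrative note (assumed example): for a two-instruction bundle whose
    /// members have 3 and 1 unscheduled dependencies, unscheduledDepsInBundle()
    /// returns 4; incrementUnscheduledDeps(-1) on any member re-evaluates this
    /// bundle-wide sum, and the bundle becomes ready once it reaches 0 (see
    /// isReady()).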
3991
3992voiddump(raw_ostream &os) const{
3993if (!isSchedulingEntity()) {
3994 os <<"/ " << *Inst;
3995 }elseif (NextInBundle) {
3996 os <<'[' << *Inst;
3997 ScheduleData *SD = NextInBundle;
3998while (SD) {
3999 os <<';' << *SD->Inst;
4000 SD = SD->NextInBundle;
4001 }
4002 os <<']';
4003 }else {
4004 os << *Inst;
4005 }
4006 }
4007
4008LLVM_DUMP_METHODvoiddump() const{dump(dbgs()); }
4009
4010Instruction *Inst =nullptr;
4011
4012 /// The TreeEntry that this instruction corresponds to.
4013 TreeEntry *TE =nullptr;
4014
4015 /// Points to the head in an instruction bundle (and always to this for
4016 /// single instructions).
4017 ScheduleData *FirstInBundle =nullptr;
4018
4019 /// Single linked list of all instructions in a bundle. Null if it is a
4020 /// single instruction.
4021 ScheduleData *NextInBundle =nullptr;
4022
4023 /// Single linked list of all memory instructions (e.g. load, store, call)
4024 /// in the block - until the end of the scheduling region.
4025 ScheduleData *NextLoadStore =nullptr;
4026
4027 /// The dependent memory instructions.
4028 /// This list is derived on demand in calculateDependencies().
4029SmallVector<ScheduleData *, 4> MemoryDependencies;
4030
 4031 /// List of instructions which this instruction could be control dependent
 4032 /// on. Allowing such nodes to be scheduled below this one could introduce
 4033 /// a runtime fault which didn't exist in the original program.
 4034 /// e.g. this is a load or udiv following a readonly call which loops forever
4035SmallVector<ScheduleData *, 4> ControlDependencies;
4036
4037 /// This ScheduleData is in the current scheduling region if this matches
4038 /// the current SchedulingRegionID of BlockScheduling.
4039int SchedulingRegionID = 0;
4040
4041 /// Used for getting a "good" final ordering of instructions.
4042int SchedulingPriority = 0;
4043
 4044 /// The number of dependencies. Consists of the number of users of the
 4045 /// instruction plus the number of dependent memory instructions (if any).
4046 /// This value is calculated on demand.
4047 /// If InvalidDeps, the number of dependencies is not calculated yet.
4048int Dependencies = InvalidDeps;
4049
4050 /// The number of dependencies minus the number of dependencies of scheduled
4051 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4052 /// for scheduling.
4053 /// Note that this is negative as long as Dependencies is not calculated.
4054int UnscheduledDeps = InvalidDeps;
4055
4056 /// True if this instruction is scheduled (or considered as scheduled in the
4057 /// dry-run).
4058bool IsScheduled =false;
4059 };
4060
4061#ifndef NDEBUG
4062friendinlineraw_ostream &operator<<(raw_ostream &os,
4063const BoUpSLP::ScheduleData &SD) {
4064 SD.dump(os);
4065return os;
4066 }
4067#endif
4068
4069friendstructGraphTraits<BoUpSLP *>;
4070friendstructDOTGraphTraits<BoUpSLP *>;
4071
4072 /// Contains all scheduling data for a basic block.
 4073 /// It does not schedule instructions which are not memory read/write
 4074 /// instructions and whose operands are either constants, arguments, phis,
 4075 /// or instructions from other blocks, or whose users are phis or belong to
 4076 /// other blocks. The resulting vector instructions can be placed at the
 4077 /// beginning of the basic block without scheduling (if the operands do not
 4078 /// need to be scheduled) or at the end of the block (if the users are
 4079 /// outside of the block). This saves some compile time and memory used by
 4080 /// the compiler.
 4081 /// ScheduleData is assigned to each instruction between the boundaries of
 4082 /// the tree entry, even to those which are not part of the graph. It is
 4083 /// required to correctly follow the dependencies between the instructions
 4084 /// and to schedule them correctly. ScheduleData is not allocated for
 4085 /// instructions which do not require scheduling, like phis, nodes with only
 4086 /// extractelements/insertelements, or nodes whose instructions have
 4087 /// uses/operands outside of the block.
4088structBlockScheduling {
4089 BlockScheduling(BasicBlock *BB)
4090 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4091
4092void clear() {
4093 ReadyInsts.clear();
4094 ScheduleStart =nullptr;
4095 ScheduleEnd =nullptr;
4096 FirstLoadStoreInRegion =nullptr;
4097 LastLoadStoreInRegion =nullptr;
4098 RegionHasStackSave =false;
4099
4100// Reduce the maximum schedule region size by the size of the
4101// previous scheduling run.
4102 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4103if (ScheduleRegionSizeLimit <MinScheduleRegionSize)
4104 ScheduleRegionSizeLimit =MinScheduleRegionSize;
4105 ScheduleRegionSize = 0;
4106
4107// Make a new scheduling region, i.e. all existing ScheduleData is not
4108// in the new region yet.
4109 ++SchedulingRegionID;
4110 }
4111
4112 ScheduleData *getScheduleData(Instruction *I) {
4113if (BB !=I->getParent())
4114// Avoid lookup if can't possibly be in map.
4115returnnullptr;
4116 ScheduleData *SD = ScheduleDataMap.lookup(I);
4117if (SD && isInSchedulingRegion(SD))
4118return SD;
4119returnnullptr;
4120 }
4121
4122 ScheduleData *getScheduleData(Value *V) {
4123if (auto *I = dyn_cast<Instruction>(V))
4124return getScheduleData(I);
4125returnnullptr;
4126 }
4127
4128bool isInSchedulingRegion(ScheduleData *SD) const{
4129return SD->SchedulingRegionID == SchedulingRegionID;
4130 }
4131
4132 /// Marks an instruction as scheduled and puts all dependent ready
4133 /// instructions into the ready-list.
4134template <typename ReadyListType>
4135void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4136 SD->IsScheduled =true;
4137LLVM_DEBUG(dbgs() <<"SLP: schedule " << *SD <<"\n");
4138
4139for (ScheduleData *BundleMember = SD; BundleMember;
4140 BundleMember = BundleMember->NextInBundle) {
4141
4142// Handle the def-use chain dependencies.
4143
4144// Decrement the unscheduled counter and insert to ready list if ready.
4145auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
4146 ScheduleData *OpDef = getScheduleData(I);
4147if (OpDef && OpDef->hasValidDependencies() &&
4148 OpDef->incrementUnscheduledDeps(-1) == 0) {
4149// There are no more unscheduled dependencies after
4150// decrementing, so we can put the dependent instruction
4151// into the ready list.
4152 ScheduleData *DepBundle = OpDef->FirstInBundle;
4153assert(!DepBundle->IsScheduled &&
4154"already scheduled bundle gets ready");
4155 ReadyList.insert(DepBundle);
4156LLVM_DEBUG(dbgs()
4157 <<"SLP: gets ready (def): " << *DepBundle <<"\n");
4158 }
4159 };
4160
4161// If BundleMember is a vector bundle, its operands may have been
4162// reordered during buildTree(). We therefore need to get its operands
4163// through the TreeEntry.
4164if (TreeEntry *TE = BundleMember->TE) {
4165// Need to search for the lane since the tree entry can be reordered.
4166auto *In = BundleMember->Inst;
4167int Lane = std::distance(TE->Scalars.begin(),
4168find(TE->Scalars, In));
4169assert(Lane >= 0 &&"Lane not set");
4170
4171// Since vectorization tree is being built recursively this assertion
4172// ensures that the tree entry has all operands set before reaching
4173// this code. Couple of exceptions known at the moment are extracts
4174// where their second (immediate) operand is not added. Since
4175// immediates do not affect scheduler behavior this is considered
4176// okay.
4177assert(
4178 In &&
4179 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4180In->getNumOperands() ==TE->getNumOperands()) &&
4181"Missed TreeEntry operands?");
4182
4183for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
4184if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4185 DecrUnsched(I);
4186 }else {
4187// If BundleMember is a stand-alone instruction, no operand reordering
4188// has taken place, so we directly access its operands.
4189for (Use &U : BundleMember->Inst->operands())
4190if (auto *I = dyn_cast<Instruction>(U.get()))
4191 DecrUnsched(I);
4192 }
4193// Handle the memory dependencies.
4194for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4195if (MemoryDepSD->hasValidDependencies() &&
4196 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4197// There are no more unscheduled dependencies after decrementing,
4198// so we can put the dependent instruction into the ready list.
4199 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4200assert(!DepBundle->IsScheduled &&
4201"already scheduled bundle gets ready");
4202 ReadyList.insert(DepBundle);
4203LLVM_DEBUG(dbgs()
4204 <<"SLP: gets ready (mem): " << *DepBundle <<"\n");
4205 }
4206 }
4207// Handle the control dependencies.
4208for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4209if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4210// There are no more unscheduled dependencies after decrementing,
4211// so we can put the dependent instruction into the ready list.
4212 ScheduleData *DepBundle = DepSD->FirstInBundle;
4213assert(!DepBundle->IsScheduled &&
4214"already scheduled bundle gets ready");
4215 ReadyList.insert(DepBundle);
4216LLVM_DEBUG(dbgs()
4217 <<"SLP: gets ready (ctl): " << *DepBundle <<"\n");
4218 }
4219 }
4220 }
4221 }
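    /// Illustrative sketch (an assumption; the real driver is scheduleBlock()):
    /// a ready-list loop that repeatedly picks a ready bundle, places its
    /// instructions, and calls schedule() to release its dependents. `BS` is
    /// assumed to be a BlockScheduling instance with a prepared region.
    /// \code
    ///   SetVector<ScheduleData *> ReadyList;
    ///   BS.initialFillReadyList(ReadyList);
    ///   while (!ReadyList.empty()) {
    ///     ScheduleData *Picked = ReadyList.pop_back_val();
    ///     // move Picked's bundle members to their final position, then:
    ///     BS.schedule(Picked, ReadyList);
    ///   }
    /// \endcode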
4222
4223 /// Verify basic self consistency properties of the data structure.
4224voidverify() {
4225if (!ScheduleStart)
4226return;
4227
4228assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4229 ScheduleStart->comesBefore(ScheduleEnd) &&
4230"Not a valid scheduling region?");
4231
4232for (auto *I = ScheduleStart;I != ScheduleEnd;I =I->getNextNode()) {
4233auto *SD = getScheduleData(I);
4234if (!SD)
4235continue;
4236assert(isInSchedulingRegion(SD) &&
4237"primary schedule data not in window?");
4238assert(isInSchedulingRegion(SD->FirstInBundle) &&
4239"entire bundle in window!");
4240 SD->verify();
4241 }
4242
4243for (auto *SD : ReadyInsts) {
4244assert(SD->isSchedulingEntity() && SD->isReady() &&
4245"item in ready list not ready?");
4246 (void)SD;
4247 }
4248 }
4249
4250 /// Put all instructions into the ReadyList which are ready for scheduling.
4251template <typename ReadyListType>
4252void initialFillReadyList(ReadyListType &ReadyList) {
4253for (auto *I = ScheduleStart;I != ScheduleEnd;I =I->getNextNode()) {
4254 ScheduleData *SD = getScheduleData(I);
4255if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4256 SD->isReady()) {
4257 ReadyList.insert(SD);
4258LLVM_DEBUG(dbgs()
4259 <<"SLP: initially in ready list: " << *SD <<"\n");
4260 }
4261 }
4262 }
4263
4264 /// Build a bundle from the ScheduleData nodes corresponding to the
4265 /// scalar instruction for each lane.
4266 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4267
4268 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4269 /// cyclic dependencies. This is only a dry-run, no instructions are
4270 /// actually moved at this stage.
4271 /// \returns the scheduling bundle. The returned Optional value is not
4272 /// std::nullopt if \p VL is allowed to be scheduled.
4273 std::optional<ScheduleData *>
4274 tryScheduleBundle(ArrayRef<Value *> VL,BoUpSLP *SLP,
4275const InstructionsState &S);
4276
4277 /// Un-bundles a group of instructions.
4278void cancelScheduling(ArrayRef<Value *> VL,Value *OpValue);
4279
4280 /// Allocates schedule data chunk.
4281 ScheduleData *allocateScheduleDataChunks();
4282
4283 /// Extends the scheduling region so that V is inside the region.
4284 /// \returns true if the region size is within the limit.
4285bool extendSchedulingRegion(Value *V,const InstructionsState &S);
4286
4287 /// Initialize the ScheduleData structures for new instructions in the
4288 /// scheduling region.
4289void initScheduleData(Instruction *FromI,Instruction *ToI,
4290 ScheduleData *PrevLoadStore,
4291 ScheduleData *NextLoadStore);
4292
4293 /// Updates the dependency information of a bundle and of all instructions/
4294 /// bundles which depend on the original bundle.
4295void calculateDependencies(ScheduleData *SD,bool InsertInReadyList,
4296BoUpSLP *SLP);
4297
4298 /// Sets all instructions in the scheduling region to un-scheduled.
4299void resetSchedule();
4300
4301BasicBlock *BB;
4302
4303 /// Simple memory allocation for ScheduleData.
4304SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
4305
4306 /// The size of a ScheduleData array in ScheduleDataChunks.
4307int ChunkSize;
4308
4309 /// The allocator position in the current chunk, which is the last entry
4310 /// of ScheduleDataChunks.
4311int ChunkPos;
4312
4313 /// Attaches ScheduleData to Instruction.
4314 /// Note that the mapping survives during all vectorization iterations, i.e.
4315 /// ScheduleData structures are recycled.
4316DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
4317
4318 /// The ready-list for scheduling (only used for the dry-run).
4319SetVector<ScheduleData *> ReadyInsts;
4320
4321 /// The first instruction of the scheduling region.
4322Instruction *ScheduleStart =nullptr;
4323
4324 /// The first instruction _after_ the scheduling region.
4325Instruction *ScheduleEnd =nullptr;
4326
4327 /// The first memory accessing instruction in the scheduling region
4328 /// (can be null).
4329 ScheduleData *FirstLoadStoreInRegion =nullptr;
4330
4331 /// The last memory accessing instruction in the scheduling region
4332 /// (can be null).
4333 ScheduleData *LastLoadStoreInRegion =nullptr;
4334
4335 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4336 /// region? Used to optimize the dependence calculation for the
4337 /// common case where there isn't.
4338bool RegionHasStackSave =false;
4339
4340 /// The current size of the scheduling region.
4341int ScheduleRegionSize = 0;
4342
4343 /// The maximum size allowed for the scheduling region.
4344int ScheduleRegionSizeLimit =ScheduleRegionSizeBudget;
4345
4346 /// The ID of the scheduling region. For a new vectorization iteration this
4347 /// is incremented, which "removes" all ScheduleData from the region.
4348 /// Make sure that the initial SchedulingRegionID is greater than the
4349 /// initial SchedulingRegionID in ScheduleData (which is 0).
4350int SchedulingRegionID = 1;
4351 };
4352
4353 /// Attaches the BlockScheduling structures to basic blocks.
4354MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
4355
4356 /// Performs the "real" scheduling. Done before vectorization is actually
4357 /// performed in a basic block.
4358void scheduleBlock(BlockScheduling *BS);
4359
4360 /// List of users to ignore during scheduling and that don't need extracting.
4361constSmallDenseSet<Value *> *UserIgnoreList =nullptr;
4362
4363 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4364 /// sorted SmallVectors of unsigned.
4365structOrdersTypeDenseMapInfo {
4366staticOrdersType getEmptyKey() {
4367OrdersTypeV;
4368V.push_back(~1U);
4369returnV;
4370 }
4371
4372staticOrdersType getTombstoneKey() {
4373OrdersTypeV;
4374V.push_back(~2U);
4375returnV;
4376 }
4377
4378staticunsigned getHashValue(constOrdersType &V) {
4379returnstatic_cast<unsigned>(hash_combine_range(V.begin(),V.end()));
4380 }
4381
4382staticboolisEqual(constOrdersType &LHS,constOrdersType &RHS) {
4383returnLHS ==RHS;
4384 }
4385 };
4386
4387// Analysis and block reference.
4388Function *F;
4389ScalarEvolution *SE;
4390TargetTransformInfo *TTI;
4391TargetLibraryInfo *TLI;
4392LoopInfo *LI;
4393DominatorTree *DT;
4394AssumptionCache *AC;
4395DemandedBits *DB;
4396constDataLayout *DL;
4397OptimizationRemarkEmitter *ORE;
4398
4399unsigned MaxVecRegSize;// This is set by TTI or overridden by cl::opt.
4400unsigned MinVecRegSize;// Set by cl::opt (default: 128).
4401
4402 /// Instruction builder to construct the vectorized tree.
4403IRBuilder<TargetFolder> Builder;
4404
4405 /// A map of tree entries for scalar integer values to the smallest bit width
4406 /// with which they can legally be represented. The values map to (width, signed) pairs,
4407 /// where "width" indicates the minimum bit width and "signed" is True if the
4408 /// value must be signed-extended, rather than zero-extended, back to its
4409 /// original width.
4410DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
4411
4412 /// Final size of the reduced vector, if the current graph represents the
4413 /// input for the reduction and it was possible to narrow the size of the
4414 /// reduction.
4415unsigned ReductionBitWidth = 0;
4416
4417 /// Canonical graph size before the transformations.
4418unsigned BaseGraphSize = 1;
4419
4420 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4421 /// type sizes, used in the tree.
4422 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4423
4424 /// Indices of the vectorized nodes which are supposed to be the roots of a new
4425 /// bitwidth analysis attempt, such as trunc, IToFP or ICmp.
4426DenseSet<unsigned> ExtraBitWidthNodes;
4427};
4428
4429}// end namespace slpvectorizer
4430
4431template <>structGraphTraits<BoUpSLP *> {
4432usingTreeEntry = BoUpSLP::TreeEntry;
4433
4434 /// NodeRef has to be a pointer per the GraphWriter.
4435usingNodeRef =TreeEntry *;
4436
4437usingContainerTy =BoUpSLP::TreeEntry::VecTreeTy;
4438
4439 /// Add the VectorizableTree to the index iterator to be able to return
4440 /// TreeEntry pointers.
4441structChildIteratorType
4442 :publiciterator_adaptor_base<
4443 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4444ContainerTy &VectorizableTree;
4445
4446ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4447ContainerTy &VT)
4448 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4449
4450NodeRefoperator*() {returnI->UserTE; }
4451 };
4452
4453staticNodeRefgetEntryNode(BoUpSLP &R) {
4454return R.VectorizableTree[0].get();
4455 }
4456
4457static ChildIteratorTypechild_begin(NodeRefN) {
4458return {N->UserTreeIndices.begin(),N->Container};
4459 }
4460
4461static ChildIteratorTypechild_end(NodeRefN) {
4462return {N->UserTreeIndices.end(),N->Container};
4463 }
4464
4465 /// For the node iterator we just need to turn the TreeEntry iterator into a
4466 /// TreeEntry* iterator so that it dereferences to NodeRef.
4467classnodes_iterator {
4468usingItTy =ContainerTy::iterator;
4469ItTy It;
4470
4471public:
4472nodes_iterator(constItTy &It2) : It(It2) {}
4473NodeRefoperator*() {return It->get(); }
4474 nodes_iteratoroperator++() {
4475 ++It;
4476return *this;
4477 }
4478booloperator!=(const nodes_iterator &N2) const{return N2.It != It; }
4479 };
4480
4481static nodes_iteratornodes_begin(BoUpSLP *R) {
4482return nodes_iterator(R->VectorizableTree.begin());
4483 }
4484
4485static nodes_iteratornodes_end(BoUpSLP *R) {
4486return nodes_iterator(R->VectorizableTree.end());
4487 }
4488
4489staticunsignedsize(BoUpSLP *R) {return R->VectorizableTree.size(); }
4490};
4491
4492template <>structDOTGraphTraits<BoUpSLP *> :publicDefaultDOTGraphTraits {
4493usingTreeEntry = BoUpSLP::TreeEntry;
4494
4495DOTGraphTraits(bool IsSimple =false) :DefaultDOTGraphTraits(IsSimple) {}
4496
4497 std::stringgetNodeLabel(constTreeEntry *Entry,constBoUpSLP *R) {
4498 std::string Str;
4499raw_string_ostreamOS(Str);
4500OS << Entry->Idx <<".\n";
4501if (isSplat(Entry->Scalars))
4502OS <<"<splat> ";
4503for (auto *V : Entry->Scalars) {
4504OS << *V;
4505if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4506 return EU.Scalar == V;
4507 }))
4508OS <<" <extract>";
4509OS <<"\n";
4510 }
4511return Str;
4512 }
4513
4514static std::stringgetNodeAttributes(constTreeEntry *Entry,
4515constBoUpSLP *) {
4516if (Entry->isGather())
4517return"color=red";
4518if (Entry->State == TreeEntry::ScatterVectorize ||
4519 Entry->State == TreeEntry::StridedVectorize)
4520return"color=blue";
4521return"";
4522 }
4523};
4524
4525}// end namespace llvm
4526
4527BoUpSLP::~BoUpSLP() {
4528SmallVector<WeakTrackingVH> DeadInsts;
4529for (auto *I : DeletedInstructions) {
4530if (!I->getParent()) {
4531// Temporarily insert the instruction back so that it can be erased from its
4532// parent and from memory later.
4533if (isa<PHINode>(I))
4534// Phi nodes must be the very first instructions in the block.
4535I->insertBefore(F->getEntryBlock(),
4536F->getEntryBlock().getFirstNonPHIIt());
4537else
4538I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
4539continue;
4540 }
4541for (Use &U :I->operands()) {
4542auto *Op = dyn_cast<Instruction>(U.get());
4543if (Op && !DeletedInstructions.count(Op) &&Op->hasOneUser() &&
4544wouldInstructionBeTriviallyDead(Op, TLI))
4545 DeadInsts.emplace_back(Op);
4546 }
4547I->dropAllReferences();
4548 }
4549for (auto *I : DeletedInstructions) {
4550assert(I->use_empty() &&
4551"trying to erase instruction with users.");
4552I->eraseFromParent();
4553 }
4554
4555// Clean up any dead scalar code feeding the vectorized instructions.
4556RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4557
4558#ifdef EXPENSIVE_CHECKS
4559// If we could guarantee that this call is not extremely slow, we could
4560// remove the ifdef limitation (see PR47712).
4561assert(!verifyFunction(*F, &dbgs()));
4562#endif
4563}
4564
4565/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4566/// contains the original mask for the scalars reused in the node. The procedure
4567/// transforms this mask in accordance with the given \p Mask.
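/// Illustrative example (not part of the original source): with
/// Reuses = {2, 3, 0, 1} and Mask = {1, 0, 3, 2}, each element Prev[I] of the
/// old mask is moved to position Mask[I], giving Reuses = {3, 2, 1, 0}.
/// Elements whose mask entry is PoisonMaskElem are dropped, and destination
/// slots that are never written keep their previous value.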
4568staticvoidreorderReuses(SmallVectorImpl<int> &Reuses,ArrayRef<int> Mask) {
4569assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4570"Expected non-empty mask.");
4571SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4572 Prev.swap(Reuses);
4573for (unsignedI = 0,E = Prev.size();I <E; ++I)
4574if (Mask[I] !=PoisonMaskElem)
4575 Reuses[Mask[I]] = Prev[I];
4576}
4577
4578/// Reorders the given \p Order according to the given \p Mask. \p Order is
4579/// the original order of the scalars. The procedure transforms the provided
4580/// order in accordance with the given \p Mask. If the resulting \p Order is
4581/// just an identity order, \p Order is cleared.
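/// A sketch of the default (top-down) path, not taken from the original
/// source: for an empty \p Order (identity) and Mask = {2, 0, 1, PoisonMaskElem}
/// over 4 scalars, the intermediate MaskOrder becomes {1, 2, 0, 3}, which is
/// not an identity mask, so the resulting \p Order is {2, 0, 1, 3}.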
4582staticvoidreorderOrder(SmallVectorImpl<unsigned> &Order,ArrayRef<int> Mask,
4583bool BottomOrder =false) {
4584assert(!Mask.empty() &&"Expected non-empty mask.");
4585unsigned Sz = Mask.size();
4586if (BottomOrder) {
4587SmallVector<unsigned> PrevOrder;
4588if (Order.empty()) {
4589 PrevOrder.resize(Sz);
4590 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4591 }else {
4592 PrevOrder.swap(Order);
4593 }
4594 Order.assign(Sz, Sz);
4595for (unsignedI = 0;I < Sz; ++I)
4596if (Mask[I] !=PoisonMaskElem)
4597 Order[I] = PrevOrder[Mask[I]];
4598if (all_of(enumerate(Order), [&](constauto &Data) {
4599returnData.value() == Sz ||Data.index() ==Data.value();
4600 })) {
4601 Order.clear();
4602return;
4603 }
4604fixupOrderingIndices(Order);
4605return;
4606 }
4607SmallVector<int> MaskOrder;
4608if (Order.empty()) {
4609 MaskOrder.resize(Sz);
4610 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4611 }else {
4612inversePermutation(Order, MaskOrder);
4613 }
4614reorderReuses(MaskOrder, Mask);
4615if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4616 Order.clear();
4617return;
4618 }
4619 Order.assign(Sz, Sz);
4620for (unsignedI = 0;I < Sz; ++I)
4621if (MaskOrder[I] !=PoisonMaskElem)
4622 Order[MaskOrder[I]] =I;
4623fixupOrderingIndices(Order);
4624}
4625
4626std::optional<BoUpSLP::OrdersType>
4627BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4628assert(TE.isGather() &&"Expected gather node only.");
4629// Try to find subvector extract/insert patterns and reorder only such
4630// patterns.
4631SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4632Type *ScalarTy = GatheredScalars.front()->getType();
4633int NumScalars = GatheredScalars.size();
4634if (!isValidElementType(ScalarTy))
4635return std::nullopt;
4636auto *VecTy =getWidenedType(ScalarTy, NumScalars);
4637unsigned NumParts =::getNumberOfParts(*TTI, VecTy, NumScalars);
4638SmallVector<int> ExtractMask;
4639SmallVector<int> Mask;
4640SmallVector<SmallVector<const TreeEntry *>> Entries;
4641SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
4642 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4643SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
4644 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4645/*ForOrder=*/true);
4646// No shuffled operands - ignore.
4647if (GatherShuffles.empty() && ExtractShuffles.empty())
4648return std::nullopt;
4649OrdersType CurrentOrder(NumScalars, NumScalars);
4650if (GatherShuffles.size() == 1 &&
4651 *GatherShuffles.front() ==TTI::SK_PermuteSingleSrc &&
4652 Entries.front().front()->isSame(TE.Scalars)) {
4653// Perfect match in the graph, will reuse the previously vectorized
4654// node. Cost is 0.
4655 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4656return CurrentOrder;
4657 }
4658auto IsSplatMask = [](ArrayRef<int> Mask) {
4659int SingleElt =PoisonMaskElem;
4660returnall_of(Mask, [&](intI) {
4661if (SingleElt ==PoisonMaskElem &&I !=PoisonMaskElem)
4662 SingleElt =I;
4663returnI ==PoisonMaskElem ||I == SingleElt;
4664 });
4665 };
4666// Exclusive broadcast mask - ignore.
4667if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4668 (Entries.size() != 1 ||
4669 Entries.front().front()->ReorderIndices.empty())) ||
4670 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4671return std::nullopt;
4672SmallBitVector ShuffledSubMasks(NumParts);
4673auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4674ArrayRef<int> Mask,int PartSz,int NumParts,
4675function_ref<unsigned(unsigned)> GetVF) {
4676for (intI : seq<int>(0, NumParts)) {
4677if (ShuffledSubMasks.test(I))
4678continue;
4679constint VF = GetVF(I);
4680if (VF == 0)
4681continue;
4682unsigned Limit =getNumElems(CurrentOrder.size(), PartSz,I);
4683MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4684// Shuffle of at least 2 vectors - ignore.
4685if (any_of(Slice, [&](intI) {returnI != NumScalars; })) {
4686 std::fill(Slice.begin(), Slice.end(), NumScalars);
4687 ShuffledSubMasks.set(I);
4688continue;
4689 }
4690// Try to include as many elements from the mask as possible.
4691int FirstMin = INT_MAX;
4692bool SecondVecFound = false;
4693for (int K : seq<int>(Limit)) {
4694intIdx = Mask[I * PartSz + K];
4695if (Idx ==PoisonMaskElem) {
4696Value *V = GatheredScalars[I * PartSz + K];
4697if (isConstant(V) && !isa<PoisonValue>(V)) {
4698 SecondVecFound =true;
4699break;
4700 }
4701continue;
4702 }
4703if (Idx < VF) {
4704if (FirstMin >Idx)
4705 FirstMin =Idx;
4706 }else {
4707 SecondVecFound =true;
4708break;
4709 }
4710 }
4711 FirstMin = (FirstMin / PartSz) * PartSz;
4712// Shuffle of at least 2 vectors - ignore.
4713if (SecondVecFound) {
4714 std::fill(Slice.begin(), Slice.end(), NumScalars);
4715 ShuffledSubMasks.set(I);
4716continue;
4717 }
4718for (int K : seq<int>(Limit)) {
4719intIdx = Mask[I * PartSz + K];
4720if (Idx ==PoisonMaskElem)
4721continue;
4722Idx -= FirstMin;
4723if (Idx >= PartSz) {
4724 SecondVecFound =true;
4725break;
4726 }
4727if (CurrentOrder[I * PartSz +Idx] >
4728static_cast<unsigned>(I * PartSz + K) &&
4729 CurrentOrder[I * PartSz +Idx] !=
4730static_cast<unsigned>(I * PartSz +Idx))
4731 CurrentOrder[I * PartSz +Idx] =I * PartSz + K;
4732 }
4733// Shuffle of at least 2 vectors - ignore.
4734if (SecondVecFound) {
4735 std::fill(Slice.begin(), Slice.end(), NumScalars);
4736 ShuffledSubMasks.set(I);
4737continue;
4738 }
4739 }
4740 };
4741int PartSz =getPartNumElems(NumScalars, NumParts);
4742if (!ExtractShuffles.empty())
4743 TransformMaskToOrder(
4744 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsignedI) {
4745if (!ExtractShuffles[I])
4746return 0U;
4747unsigned VF = 0;
4748unsigned Sz =getNumElems(TE.getVectorFactor(), PartSz,I);
4749for (unsignedIdx : seq<unsigned>(Sz)) {
4750int K =I * PartSz +Idx;
4751if (ExtractMask[K] ==PoisonMaskElem)
4752continue;
4753if (!TE.ReuseShuffleIndices.empty())
4754 K = TE.ReuseShuffleIndices[K];
4755if (K ==PoisonMaskElem)
4756continue;
4757if (!TE.ReorderIndices.empty())
4758 K = std::distance(TE.ReorderIndices.begin(),
4759find(TE.ReorderIndices, K));
4760auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4761if (!EI)
4762continue;
4763 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4764 ->getElementCount()
4765 .getKnownMinValue());
4766 }
4767return VF;
4768 });
4769// Check special corner case - single shuffle of the same entry.
4770if (GatherShuffles.size() == 1 && NumParts != 1) {
4771if (ShuffledSubMasks.any())
4772return std::nullopt;
4773 PartSz = NumScalars;
4774 NumParts = 1;
4775 }
4776if (!Entries.empty())
4777 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsignedI) {
4778if (!GatherShuffles[I])
4779return 0U;
4780return std::max(Entries[I].front()->getVectorFactor(),
4781 Entries[I].back()->getVectorFactor());
4782 });
4783int NumUndefs =
4784count_if(CurrentOrder, [&](intIdx) {returnIdx == NumScalars; });
4785if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4786return std::nullopt;
4787return std::move(CurrentOrder);
4788}
4789
4790staticboolarePointersCompatible(Value *Ptr1,Value *Ptr2,
4791constTargetLibraryInfo &TLI,
4792bool CompareOpcodes =true) {
4793if (getUnderlyingObject(Ptr1,RecursionMaxDepth) !=
4794getUnderlyingObject(Ptr2,RecursionMaxDepth))
4795returnfalse;
4796auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4797auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4798return (!GEP1 || GEP1->getNumOperands() == 2) &&
4799 (!GEP2 || GEP2->getNumOperands() == 2) &&
4800 (((!GEP1 ||isConstant(GEP1->getOperand(1))) &&
4801 (!GEP2 ||isConstant(GEP2->getOperand(1)))) ||
4802 !CompareOpcodes ||
4803 (GEP1 && GEP2 &&
4804getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
4805}
4806
4807/// Calculates minimal alignment as a common alignment.
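/// Illustrative example (not from the original source): for a bundle of loads
/// aligned to 16, 8 and 4 bytes, the common alignment is 4, i.e. the minimum
/// alignment over the bundle.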
4808template <typename T>
4809staticAligncomputeCommonAlignment(ArrayRef<Value *> VL) {
4810Align CommonAlignment = cast<T>(VL.front())->getAlign();
4811for (Value *V : VL.drop_front())
4812 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4813return CommonAlignment;
4814}
4815
4816/// Check if \p Order represents reverse order.
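/// For example (illustrative only), with Sz == 4 both {3, 2, 1, 0} and
/// {3, Sz, 1, 0} are treated as reverse orders: entries equal to Sz act as
/// "don't care" positions.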
4817staticboolisReverseOrder(ArrayRef<unsigned> Order) {
4818assert(!Order.empty() &&
4819"Order is empty. Please check it before using isReverseOrder.");
4820unsigned Sz = Order.size();
4821returnall_of(enumerate(Order), [&](constauto &Pair) {
4822return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4823 });
4824}
4825
4826/// Checks if the provided list of pointers \p PointerOps represents strided
4827/// pointers for the element type \p ElemTy. If it does not, std::nullopt is
4828/// returned. Otherwise, if \p Inst is not specified, an engaged optional value
4829/// is returned to show that the pointers are strided. If \p Inst is specified,
4830/// the runtime stride is materialized before the given \p Inst.
4831/// \returns std::nullopt if the pointers do not have a runtime stride;
4832/// nullptr or the actual stride value otherwise.
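/// A sketch of the intended use, not from the original source and assuming
/// SCEV can order the pointers (e.g. the stride %n is known to be positive):
/// for i32 loads from %p, %p + 4*%n, %p + 8*%n and %p + 12*%n (byte offsets),
/// Dist is 12*%n, the element size is 4, and the recovered stride SCEV is %n.
/// The per-pointer byte offsets 0, 4, 8 and 12 are unique multiples of the
/// stride and already consecutive, so SortedIndices stays empty and the
/// returned optional is engaged; with \p Inst given, %n is expanded as the
/// stride value right before it.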
4833static std::optional<Value *>
4834calculateRtStride(ArrayRef<Value *> PointerOps,Type *ElemTy,
4835constDataLayout &DL,ScalarEvolution &SE,
4836SmallVectorImpl<unsigned> &SortedIndices,
4837Instruction *Inst =nullptr) {
4838SmallVector<const SCEV *> SCEVs;
4839constSCEV *PtrSCEVLowest =nullptr;
4840constSCEV *PtrSCEVHighest =nullptr;
4841// Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4842// addresses).
4843for (Value *Ptr : PointerOps) {
4844constSCEV *PtrSCEV = SE.getSCEV(Ptr);
4845if (!PtrSCEV)
4846return std::nullopt;
4847 SCEVs.push_back(PtrSCEV);
4848if (!PtrSCEVLowest && !PtrSCEVHighest) {
4849 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4850continue;
4851 }
4852constSCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4853if (isa<SCEVCouldNotCompute>(Diff))
4854return std::nullopt;
4855if (Diff->isNonConstantNegative()) {
4856 PtrSCEVLowest = PtrSCEV;
4857continue;
4858 }
4859constSCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4860if (isa<SCEVCouldNotCompute>(Diff1))
4861return std::nullopt;
4862if (Diff1->isNonConstantNegative()) {
4863 PtrSCEVHighest = PtrSCEV;
4864continue;
4865 }
4866 }
4867// Dist = PtrSCEVHighest - PtrSCEVLowest;
4868constSCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4869if (isa<SCEVCouldNotCompute>(Dist))
4870return std::nullopt;
4871intSize =DL.getTypeStoreSize(ElemTy);
4872auto TryGetStride = [&](constSCEV *Dist,
4873constSCEV *Multiplier) ->constSCEV * {
4874if (constauto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4875if (M->getOperand(0) == Multiplier)
4876return M->getOperand(1);
4877if (M->getOperand(1) == Multiplier)
4878return M->getOperand(0);
4879returnnullptr;
4880 }
4881if (Multiplier == Dist)
4882return SE.getConstant(Dist->getType(), 1);
4883return SE.getUDivExactExpr(Dist, Multiplier);
4884 };
4885// Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4886constSCEV *Stride =nullptr;
4887if (Size != 1 || SCEVs.size() > 2) {
4888constSCEV *Sz = SE.getConstant(Dist->getType(),Size * (SCEVs.size() - 1));
4889 Stride = TryGetStride(Dist, Sz);
4890if (!Stride)
4891return std::nullopt;
4892 }
4893if (!Stride || isa<SCEVConstant>(Stride))
4894return std::nullopt;
4895// Iterate through all pointers and check if all distances are
4896// unique multiples of Stride.
4897usingDistOrdPair = std::pair<int64_t, int>;
4898auto Compare =llvm::less_first();
4899 std::set<DistOrdPair,decltype(Compare)> Offsets(Compare);
4900int Cnt = 0;
4901bool IsConsecutive =true;
4902for (constSCEV *PtrSCEV : SCEVs) {
4903unsigned Dist = 0;
4904if (PtrSCEV != PtrSCEVLowest) {
4905constSCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4906constSCEV *Coeff = TryGetStride(Diff, Stride);
4907if (!Coeff)
4908return std::nullopt;
4909constauto *SC = dyn_cast<SCEVConstant>(Coeff);
4910if (!SC || isa<SCEVCouldNotCompute>(SC))
4911return std::nullopt;
4912if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4913 SE.getMulExpr(Stride, SC)))
4914 ->isZero())
4915return std::nullopt;
4916 Dist = SC->getAPInt().getZExtValue();
4917 }
4918// If the strides are not the same or repeated, we can't vectorize.
4919if ((Dist /Size) *Size != Dist || (Dist /Size) >= SCEVs.size())
4920return std::nullopt;
4921auto Res = Offsets.emplace(Dist, Cnt);
4922if (!Res.second)
4923return std::nullopt;
4924// Consecutive order if the inserted element is the last one.
4925 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4926 ++Cnt;
4927 }
4928if (Offsets.size() != SCEVs.size())
4929return std::nullopt;
4930 SortedIndices.clear();
4931if (!IsConsecutive) {
4932// Fill SortedIndices array only if it is non-consecutive.
4933 SortedIndices.resize(PointerOps.size());
4934 Cnt = 0;
4935for (const std::pair<int64_t, int> &Pair : Offsets) {
4936 SortedIndices[Cnt] = Pair.second;
4937 ++Cnt;
4938 }
4939 }
4940if (!Inst)
4941returnnullptr;
4942SCEVExpander Expander(SE,DL,"strided-load-vec");
4943return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4944}
4945
4946static std::pair<InstructionCost, InstructionCost>
4947getGEPCosts(constTargetTransformInfo &TTI,ArrayRef<Value *> Ptrs,
4948Value *BasePtr,unsigned Opcode,TTI::TargetCostKindCostKind,
4949Type *ScalarTy,VectorType *VecTy);
4950
4951/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4952/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
4953/// insert-subvector pattern.
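/// Roughly (a description of the special case below, not new behavior): for a
/// two-source permute whose mask ShuffleVectorInst::isInsertSubvectorMask
/// recognizes as inserting a subvector at offset Index, where the inserted
/// region runs past the end of \p Tp but still fits into a vector of
/// Mask.size() elements, the shuffle is re-costed as an SK_InsertSubvector of
/// \p Tp into a widened vector of Mask.size() elements, which is typically
/// modeled as cheaper.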
4954staticInstructionCost
4955getShuffleCost(constTargetTransformInfo &TTI,TTI::ShuffleKind Kind,
4956VectorType *Tp,ArrayRef<int> Mask = {},
4957TTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput,
4958intIndex = 0,VectorType *SubTp =nullptr,
4959ArrayRef<const Value *>Args = {}) {
4960if (Kind !=TTI::SK_PermuteTwoSrc)
4961returnTTI.getShuffleCost(Kind, Tp, Mask,CostKind,Index, SubTp, Args);
4962int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4963int NumSubElts;
4964if (Mask.size() > 2 &&ShuffleVectorInst::isInsertSubvectorMask(
4965 Mask, NumSrcElts, NumSubElts,Index)) {
4966if (Index + NumSubElts > NumSrcElts &&
4967Index + NumSrcElts <=static_cast<int>(Mask.size()))
4968returnTTI.getShuffleCost(
4969TTI::SK_InsertSubvector,
4970getWidenedType(Tp->getElementType(),Mask.size()), Mask,
4971TTI::TCK_RecipThroughput,Index, Tp);
4972 }
4973returnTTI.getShuffleCost(Kind, Tp, Mask,CostKind,Index, SubTp, Args);
4974}
4975
4976/// Correctly creates an insert_subvector, checking that the index is a multiple
4977/// of the subvector's length. Otherwise, generates a shuffle using \p Generator
4978/// or a default shuffle.
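/// Illustrative example (not from the original source): for Vec = <8 x float>,
/// V = <4 x float> and Index = 2, the index is not a multiple of 4, so the
/// shuffle path is taken with Mask = {0, 1, 8, 9, 10, 11, 6, 7}; without a
/// \p Generator, V is first widened to 8 lanes via a resize shuffle. For
/// Index = 4 the llvm.vector.insert intrinsic is emitted instead.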
4979staticValue *createInsertVector(
4980IRBuilderBase &Builder,Value *Vec,Value *V,unsignedIndex,
4981function_ref<Value *(Value *,Value *,ArrayRef<int>)> Generator = {}) {
4982constunsigned SubVecVF =getNumElements(V->getType());
4983if (Index % SubVecVF == 0) {
4984 Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
4985 Builder.getInt64(Index));
4986 }else {
4987// Create a shuffle; insert_subvector requires that the index is a multiple of
4988// the subvector length.
4989constunsigned VecVF =getNumElements(Vec->getType());
4990SmallVector<int>Mask(VecVF,PoisonMaskElem);
4991 std::iota(Mask.begin(),Mask.end(), 0);
4992for (unsignedI : seq<unsigned>(SubVecVF))
4993Mask[I +Index] =I + VecVF;
4994if (Generator) {
4995 Vec = Generator(Vec, V, Mask);
4996 }else {
4997// 1. Resize V to the size of Vec.
4998SmallVector<int> ResizeMask(VecVF,PoisonMaskElem);
4999 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
5000V = Builder.CreateShuffleVector(V, ResizeMask);
5001 Vec = Builder.CreateShuffleVector(Vec, V, Mask);
5002 }
5003 }
5004return Vec;
5005}
5006
5007/// Correctly creates an extract_subvector, checking that the index is a multiple
5008/// of the subvector's length. Otherwise, generates a shuffle that extracts the
5009/// requested subvector.
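/// Illustrative example (not from the original source): for Vec = <8 x i32>,
/// SubVecVF = 4 and Index = 2, the index is not a multiple of 4, so a
/// single-source shuffle with Mask = {2, 3, 4, 5} is emitted; for Index = 4
/// the llvm.vector.extract intrinsic is used instead.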
5010staticValue *createExtractVector(IRBuilderBase &Builder,Value *Vec,
5011unsigned SubVecVF,unsignedIndex) {
5012if (Index % SubVecVF == 0) {
5013VectorType *SubVecTy =
5014getWidenedType(Vec->getType()->getScalarType(), SubVecVF);
5015return Builder.CreateExtractVector(SubVecTy, Vec, Builder.getInt64(Index));
5016 }
5017// Create a shuffle; extract_subvector requires that the index is a multiple of
5018// the subvector length.
5019SmallVector<int> Mask(SubVecVF,PoisonMaskElem);
5020 std::iota(Mask.begin(), Mask.end(),Index);
5021return Builder.CreateShuffleVector(Vec, Mask);
5022}
5023
5024BoUpSLP::LoadsState
5025BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL,constValue *VL0,
5026SmallVectorImpl<unsigned> &Order,
5027SmallVectorImpl<Value *> &PointerOps,
5028unsigned *BestVF,bool TryRecursiveCheck) const{
5029// Check that a vectorized load would load the same memory as a scalar
5030// load. For example, we don't want to vectorize loads that are smaller
5031// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
5032// treats loading/storing it as an i8 struct. If we vectorize loads/stores
5033// from such a struct, we read/write packed bits disagreeing with the
5034// unvectorized version.
5035if (BestVF)
5036 *BestVF = 0;
5037if (areKnownNonVectorizableLoads(VL))
5038returnLoadsState::Gather;
5039Type *ScalarTy = VL0->getType();
5040
5041if (DL->getTypeSizeInBits(ScalarTy) !=DL->getTypeAllocSizeInBits(ScalarTy))
5042returnLoadsState::Gather;
5043
5044// Make sure all loads in the bundle are simple - we can't vectorize
5045// atomic or volatile loads.
5046 PointerOps.clear();
5047constunsigned Sz = VL.size();
5048 PointerOps.resize(Sz);
5049auto *POIter = PointerOps.begin();
5050for (Value *V : VL) {
5051auto *L = dyn_cast<LoadInst>(V);
5052if (!L || !L->isSimple())
5053returnLoadsState::Gather;
5054 *POIter = L->getPointerOperand();
5055 ++POIter;
5056 }
5057
5058 Order.clear();
5059// Check the order of pointer operands or that all pointers are the same.
5060bool IsSorted =sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
5061
5062auto *VecTy =getWidenedType(ScalarTy, Sz);
5063Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5064if (!IsSorted) {
5065if (Sz >MinProfitableStridedLoads &&TTI->isTypeLegal(VecTy)) {
5066if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
5067calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
5068returnLoadsState::StridedVectorize;
5069 }
5070
5071if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5072TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5073returnLoadsState::Gather;
5074
5075if (!all_of(PointerOps, [&](Value *P) {
5076returnarePointersCompatible(P, PointerOps.front(), *TLI);
5077 }))
5078returnLoadsState::Gather;
5079
5080 }else {
5081Value *Ptr0;
5082Value *PtrN;
5083if (Order.empty()) {
5084 Ptr0 = PointerOps.front();
5085 PtrN = PointerOps.back();
5086 }else {
5087 Ptr0 = PointerOps[Order.front()];
5088 PtrN = PointerOps[Order.back()];
5089 }
5090 std::optional<int> Diff =
5091getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5092// Check that the sorted loads are consecutive.
5093if (static_cast<unsigned>(*Diff) == Sz - 1)
5094returnLoadsState::Vectorize;
5095if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5096TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5097returnLoadsState::Gather;
5098// Simple check if not a strided access - clear order.
5099bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5100// Try to generate strided load node if:
5101// 1. Target with strided load support is detected.
5102// 2. The number of loads is greater than MinProfitableStridedLoads,
5103// or the potential stride <= MaxProfitableLoadStride and the
5104// potential stride is power-of-2 (to avoid perf regressions for the very
5105// small number of loads) and max distance > number of loads, or potential
5106// stride is -1.
5107// 3. The loads are ordered, or number of unordered loads <=
5108// MaxProfitableUnorderedLoads, or loads are in reversed order.
5109// (this check is to avoid extra costs for very expensive shuffles).
5110// 4. Any pointer operand is an instruction with the users outside of the
5111// current graph (for masked gathers extra extractelement instructions
5112// might be required).
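// For example (illustrative only, assuming the cl::opt limits keep their
// defaults): three loads at element offsets {0, 3, 6} give *Diff == 6, which
// is an exact multiple of Sz - 1 == 2 (stride 3), 6 > Sz == 3, 6 % Sz == 0 and
// 6 / Sz == 2 is a power of two, so the bundle becomes a strided load if the
// target reports strided load support for the type.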
5113auto IsAnyPointerUsedOutGraph =
5114 IsPossibleStrided &&any_of(PointerOps, [&](Value *V) {
5115return isa<Instruction>(V) &&any_of(V->users(), [&](User *U) {
5116 return !getTreeEntry(U) && !MustGather.contains(U);
5117 });
5118 });
5119constunsigned AbsoluteDiff = std::abs(*Diff);
5120if (IsPossibleStrided &&
5121 (IsAnyPointerUsedOutGraph ||
5122 (AbsoluteDiff > Sz &&
5123 (Sz >MinProfitableStridedLoads ||
5124 (AbsoluteDiff <=MaxProfitableLoadStride * Sz &&
5125 AbsoluteDiff % Sz == 0 &&has_single_bit(AbsoluteDiff / Sz)))) ||
5126 *Diff == -(static_cast<int>(Sz) - 1))) {
5127int Stride = *Diff /static_cast<int>(Sz - 1);
5128if (*Diff == Stride *static_cast<int>(Sz - 1)) {
5129Align Alignment =
5130 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
5131 ->getAlign();
5132if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
5133// Iterate through all pointers and check if all distances are
5134// unique multiples of Stride.
5135SmallSet<int, 4> Dists;
5136for (Value *Ptr : PointerOps) {
5137int Dist = 0;
5138if (Ptr == PtrN)
5139 Dist = *Diff;
5140elseif (Ptr != Ptr0)
5141 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy,Ptr, *DL, *SE);
5142// If the strides are not the same or repeated, we can't
5143// vectorize.
5144if (((Dist / Stride) * Stride) != Dist ||
5145 !Dists.insert(Dist).second)
5146break;
5147 }
5148if (Dists.size() == Sz)
5149returnLoadsState::StridedVectorize;
5150 }
5151 }
5152 }
5153 }
5154// Compare the cost of loads + shuffles against strided/masked gather loads.
5155// Returns true if the vectorized + shuffles representation is better than
5156// just a gather.
5157auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
5158unsigned *BestVF,
5159bool ProfitableGatherPointers) {
5160if (BestVF)
5161 *BestVF = 0;
5162// Compare masked gather cost and loads + insert subvector costs.
5163TTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
5164auto [ScalarGEPCost, VectorGEPCost] =
5165getGEPCosts(TTI, PointerOps, PointerOps.front(),
5166 Instruction::GetElementPtr,CostKind, ScalarTy, VecTy);
5167// Estimate the cost of masked gather GEP. If not a splat, roughly
5168// estimate as a buildvector, otherwise estimate as splat.
5169APInt DemandedElts =APInt::getAllOnes(VecTy->getNumElements());
5170VectorType *PtrVecTy =
5171getWidenedType(PointerOps.front()->getType()->getScalarType(),
5172 VecTy->getNumElements());
5173if (static_cast<unsigned>(count_if(
5174 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
5175any_of(PointerOps, [&](Value *V) {
5176returngetUnderlyingObject(V) !=
5177getUnderlyingObject(PointerOps.front());
5178 }))
5179 VectorGEPCost +=TTI.getScalarizationOverhead(
5180 PtrVecTy, DemandedElts,/*Insert=*/true,/*Extract=*/false,CostKind);
5181else
5182 VectorGEPCost +=
5183TTI.getScalarizationOverhead(
5184 PtrVecTy,APInt::getOneBitSet(VecTy->getNumElements(), 0),
5185/*Insert=*/true,/*Extract=*/false,CostKind) +
5186::getShuffleCost(TTI,TTI::SK_Broadcast, PtrVecTy, {},CostKind);
5187// The cost of scalar loads.
5188InstructionCost ScalarLoadsCost =
5189 std::accumulate(VL.begin(), VL.end(),InstructionCost(),
5190 [&](InstructionCostC,Value *V) {
5191returnC +TTI.getInstructionCost(
5192 cast<Instruction>(V),CostKind);
5193 }) +
5194 ScalarGEPCost;
5195// The cost of masked gather.
5196InstructionCost MaskedGatherCost =
5197TTI.getGatherScatterOpCost(
5198 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5199/*VariableMask=*/false, CommonAlignment,CostKind) +
5200 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5201InstructionCost GatherCost =
5202TTI.getScalarizationOverhead(VecTy, DemandedElts,/*Insert=*/true,
5203/*Extract=*/false,CostKind) +
5204 ScalarLoadsCost;
5205// If the list of loads is small, or we already perform only a partial check,
5206// directly compare the masked gather cost and the gather cost.
5207constexprunsigned ListLimit = 4;
5208if (!TryRecursiveCheck || VL.size() < ListLimit)
5209return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5210
5211// FIXME: The following code has not been updated for non-power-of-2
5212// vectors (and not whole registers). The splitting logic here does not
5213// cover the original vector if the vector factor is not a power of two.
5214if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
5215returnfalse;
5216
5217unsigned Sz =DL->getTypeSizeInBits(ScalarTy);
5218unsigned MinVF =getMinVF(2 * Sz);
5219 DemandedElts.clearAllBits();
5220// Iterate through possible vectorization factors and check if vectorized +
5221// shuffles is better than just gather.
5222for (unsigned VF =
5223getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
5224 VF >= MinVF;
5225 VF =getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
5226SmallVector<LoadsState> States;
5227for (unsigned Cnt = 0,End = VL.size(); Cnt + VF <=End; Cnt += VF) {
5228ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
5229SmallVector<unsigned> Order;
5230SmallVector<Value *> PointerOps;
5231LoadsState LS =
5232canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
5233/*TryRecursiveCheck=*/false);
5234// Check that the sorted loads are consecutive.
5235if (LS ==LoadsState::Gather) {
5236if (BestVF) {
5237 DemandedElts.setAllBits();
5238break;
5239 }
5240 DemandedElts.setBits(Cnt, Cnt + VF);
5241continue;
5242 }
5243// If reordering is needed - consider it a high-cost masked gather for now.
5244if ((LS ==LoadsState::Vectorize ||
5245 LS ==LoadsState::StridedVectorize) &&
5246 !Order.empty() && !isReverseOrder(Order))
5247 LS =LoadsState::ScatterVectorize;
5248 States.push_back(LS);
5249 }
5250if (DemandedElts.isAllOnes())
5251// All loads gathered - try smaller VF.
5252continue;
5253// Can be vectorized later as a series of loads/insertelements.
5254InstructionCost VecLdCost = 0;
5255if (!DemandedElts.isZero()) {
5256 VecLdCost =
5257TTI.getScalarizationOverhead(VecTy, DemandedElts,/*Insert=*/true,
5258/*Extract=*/false,CostKind) +
5259 ScalarGEPCost;
5260for (unsignedIdx : seq<unsigned>(VL.size()))
5261if (DemandedElts[Idx])
5262 VecLdCost +=
5263TTI.getInstructionCost(cast<Instruction>(VL[Idx]),CostKind);
5264 }
5265unsigned ScalarTyNumElements =getNumElements(ScalarTy);
5266auto *SubVecTy =getWidenedType(ScalarTy, VF);
5267for (auto [I, LS] :enumerate(States)) {
5268auto *LI0 = cast<LoadInst>(VL[I * VF]);
5269InstructionCost VectorGEPCost =
5270 (LS ==LoadsState::ScatterVectorize && ProfitableGatherPointers)
5271 ? 0
5272 :getGEPCosts(TTI,ArrayRef(PointerOps).slice(I * VF, VF),
5273 LI0->getPointerOperand(),
5274 Instruction::GetElementPtr,CostKind, ScalarTy,
5275 SubVecTy)
5276 .second;
5277if (LS ==LoadsState::ScatterVectorize) {
5278if (static_cast<unsigned>(
5279count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5280 PointerOps.size() - 1 ||
5281any_of(PointerOps, [&](Value *V) {
5282returngetUnderlyingObject(V) !=
5283getUnderlyingObject(PointerOps.front());
5284 }))
5285 VectorGEPCost +=TTI.getScalarizationOverhead(
5286 SubVecTy,APInt::getAllOnes(VF),
5287/*Insert=*/true,/*Extract=*/false,CostKind);
5288else
5289 VectorGEPCost +=
5290TTI.getScalarizationOverhead(
5291 SubVecTy,APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
5292/*Insert=*/true,/*Extract=*/false,CostKind) +
5293::getShuffleCost(TTI,TTI::SK_Broadcast, SubVecTy, {},
5294CostKind);
5295 }
5296switch (LS) {
5297caseLoadsState::Vectorize:
5298 VecLdCost +=
5299TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
5300 LI0->getPointerAddressSpace(),CostKind,
5301TTI::OperandValueInfo()) +
5302 VectorGEPCost;
5303break;
5304caseLoadsState::StridedVectorize:
5305 VecLdCost +=TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
5306 LI0->getPointerOperand(),
5307/*VariableMask=*/false,
5308 CommonAlignment,CostKind) +
5309 VectorGEPCost;
5310break;
5311caseLoadsState::ScatterVectorize:
5312 VecLdCost +=TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
5313 LI0->getPointerOperand(),
5314/*VariableMask=*/false,
5315 CommonAlignment,CostKind) +
5316 VectorGEPCost;
5317break;
5318caseLoadsState::Gather:
5319// Gathers are already calculated - ignore.
5320continue;
5321 }
5322SmallVector<int> ShuffleMask(VL.size());
5323for (intIdx : seq<int>(0, VL.size()))
5324 ShuffleMask[Idx] =Idx / VF ==I ? VL.size() +Idx % VF :Idx;
5325if (I > 0)
5326 VecLdCost +=
5327::getShuffleCost(TTI,TTI::SK_InsertSubvector, VecTy, ShuffleMask,
5328CostKind,I * VF, SubVecTy);
5329 }
5330// If masked gather cost is higher - better to vectorize, so
5331// consider it as a gather node. It will be better estimated
5332// later.
5333if (MaskedGatherCost >= VecLdCost &&
5334 VecLdCost - GatherCost < -SLPCostThreshold) {
5335if (BestVF)
5336 *BestVF = VF;
5337returntrue;
5338 }
5339 }
5340return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5341 };
5342// TODO: improve the analysis of the pointers; if not all of them are GEPs,
5343// or they have > 2 operands, we end up with a gather node, which just
5344// increases the cost.
5345Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
5346bool ProfitableGatherPointers =
5347 L && Sz > 2 &&static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
5348return L->isLoopInvariant(V);
5349 })) <= Sz / 2;
5350if (ProfitableGatherPointers ||all_of(PointerOps, [](Value *P) {
5351auto *GEP = dyn_cast<GetElementPtrInst>(P);
5352return (!GEP &&doesNotNeedToBeScheduled(P)) ||
5353 (GEP &&GEP->getNumOperands() == 2 &&
5354 isa<Constant, Instruction>(GEP->getOperand(1)));
5355 })) {
5356// Check if potential masked gather can be represented as series
5357// of loads + insertsubvectors.
5358// If masked gather cost is higher - better to vectorize, so
5359// consider it as a gather node. It will be better estimated
5360// later.
5361if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5362 ProfitableGatherPointers))
5363returnLoadsState::ScatterVectorize;
5364 }
5365
5366returnLoadsState::Gather;
5367}
5368
5369staticboolclusterSortPtrAccesses(ArrayRef<Value *> VL,
5370ArrayRef<BasicBlock *> BBs,Type *ElemTy,
5371constDataLayout &DL,ScalarEvolution &SE,
5372SmallVectorImpl<unsigned> &SortedIndices) {
5373assert(
5374all_of(VL, [](constValue *V) {return V->getType()->isPointerTy(); }) &&
5375"Expected list of pointer operands.");
5376// Map from bases to vectors of (Ptr, Offset, OrigIdx). Each Ptr is inserted
5377// into the vector of its base; the vectors are sorted and the sorted indices
5378// returned so that pointers of the same base end up next to one another.
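// Illustrative example (not from the original source): for six pointers in one
// block, three based on %A at element offsets {0, 1, 2} and three based on %B
// at offsets {0, 1, 2}, interleaved in VL as {A, B, A+1, B+1, A+2, B+2}, the
// pointers are grouped per base and each group's offsets are consecutive, so
// SortedIndices becomes {0, 2, 4, 1, 3, 5} (bases keep their order of first
// appearance in VL).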
5379SmallMapVector<std::pair<BasicBlock *, Value *>,
5380SmallVector<SmallVector<std::tuple<Value *, int, unsigned>>>, 8>
5381 Bases;
5382 Bases
5383 .try_emplace(std::make_pair(
5384 BBs.front(),getUnderlyingObject(VL.front(),RecursionMaxDepth)))
5385 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
5386
5387 SortedIndices.clear();
5388for (auto [Cnt,Ptr] :enumerate(VL.drop_front())) {
5389auto Key = std::make_pair(BBs[Cnt + 1],
5390getUnderlyingObject(Ptr,RecursionMaxDepth));
5391bool Found =any_of(Bases.try_emplace(Key).first->second,
5392 [&, &Cnt = Cnt, &Ptr =Ptr](auto &Base) {
5393 std::optional<int> Diff = getPointersDiff(
5394 ElemTy, std::get<0>(Base.front()), ElemTy,
5395 Ptr, DL, SE,
5396/*StrictCheck=*/true);
5397 if (!Diff)
5398 return false;
5399
5400 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5401 return true;
5402 });
5403
5404if (!Found) {
5405// If we haven't found enough to usefully cluster, return early.
5406if (Bases.size() > VL.size() / 2 - 1)
5407returnfalse;
5408
5409// Not found already - add a new Base
5410 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
5411 }
5412 }
5413
5414if (Bases.size() == VL.size())
5415returnfalse;
5416
5417if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
5418 Bases.front().second.size() == VL.size()))
5419returnfalse;
5420
5421// For each of the bases, sort the pointers by Offset and check whether the
5422// pointers within a base become consecutive.
5423auto ComparePointers = [](Value *Ptr1,Value *Ptr2) {
5424SmallPtrSet<Value *, 13> FirstPointers;
5425SmallPtrSet<Value *, 13> SecondPointers;
5426Value *P1 = Ptr1;
5427Value *P2 = Ptr2;
5428unsignedDepth = 0;
5429while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
5430if (P1 == P2 ||Depth >RecursionMaxDepth)
5431returnfalse;
5432 FirstPointers.insert(P1);
5433 SecondPointers.insert(P2);
5434 P1 =getUnderlyingObject(P1,/*MaxLookup=*/1);
5435 P2 =getUnderlyingObject(P2,/*MaxLookup=*/1);
5436 ++Depth;
5437 }
5438assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
5439"Unable to find matching root.");
5440return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
5441 };
5442for (auto &Base : Bases) {
5443for (auto &Vec :Base.second) {
5444if (Vec.size() > 1) {
5445stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
5446const std::tuple<Value *, int, unsigned> &Y) {
5447return std::get<1>(X) < std::get<1>(Y);
5448 });
5449int InitialOffset = std::get<1>(Vec[0]);
5450bool AnyConsecutive =
5451all_of(enumerate(Vec), [InitialOffset](constauto &P) {
5452return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
5453 });
5454// Fill the SortedIndices array only if it looks worthwhile to sort the
5455// pointers.
5456if (!AnyConsecutive)
5457returnfalse;
5458 }
5459 }
5460stable_sort(Base.second, [&](constauto &V1,constauto &V2) {
5461 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5462 });
5463 }
5464
5465for (auto &T : Bases)
5466for (constauto &Vec :T.second)
5467for (constauto &P : Vec)
5468 SortedIndices.push_back(std::get<2>(P));
5469
5470assert(SortedIndices.size() == VL.size() &&
5471"Expected SortedIndices to be the size of VL");
5472returntrue;
5473}
5474
5475std::optional<BoUpSLP::OrdersType>
5476BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5477assert(TE.isGather() &&"Expected gather node only.");
5478Type *ScalarTy = TE.Scalars[0]->getType();
5479
5480SmallVector<Value *> Ptrs;
5481 Ptrs.reserve(TE.Scalars.size());
5482SmallVector<BasicBlock *> BBs;
5483 BBs.reserve(TE.Scalars.size());
5484for (Value *V : TE.Scalars) {
5485auto *L = dyn_cast<LoadInst>(V);
5486if (!L || !L->isSimple())
5487return std::nullopt;
5488 Ptrs.push_back(L->getPointerOperand());
5489 BBs.push_back(L->getParent());
5490 }
5491
5492BoUpSLP::OrdersType Order;
5493if (!LoadEntriesToVectorize.contains(TE.Idx) &&
5494clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
5495return std::move(Order);
5496return std::nullopt;
5497}
5498
5499/// Check if two insertelement instructions are from the same buildvector.
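/// Illustrative example (not from the original source): in the chain
///   %v0 = insertelement <4 x float> poison, float %a, i64 0
///   %v1 = insertelement <4 x float> %v0, float %b, i64 1
///   %v2 = insertelement <4 x float> %v1, float %c, i64 2
/// with GetBaseOperand returning the vector operand, walking down from %v2
/// reaches %v1, so (%v2, %v1) are recognized as parts of the same buildvector,
/// provided the intermediate inserts have a single use and no insert index is
/// reused.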
5500staticboolareTwoInsertFromSameBuildVector(
5501InsertElementInst *VU,InsertElementInst *V,
5502function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5503// Instructions must be from the same basic block.
5504if (VU->getParent() != V->getParent())
5505returnfalse;
5506// Checks if 2 insertelements are from the same buildvector.
5507if (VU->getType() != V->getType())
5508returnfalse;
5509// Inserts with multiple uses are separate nodes.
5510if (!VU->hasOneUse() && !V->hasOneUse())
5511returnfalse;
5512auto *IE1 = VU;
5513auto *IE2 = V;
5514 std::optional<unsigned> Idx1 =getElementIndex(IE1);
5515 std::optional<unsigned> Idx2 =getElementIndex(IE2);
5516if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5517returnfalse;
5518// Go through the vector operand of insertelement instructions trying to find
5519// either VU as the original vector for IE2 or V as the original vector for
5520// IE1.
5521SmallBitVector ReusedIdx(
5522 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5523bool IsReusedIdx =false;
5524do {
5525if (IE2 == VU && !IE1)
5526return VU->hasOneUse();
5527if (IE1 == V && !IE2)
5528return V->hasOneUse();
5529if (IE1 && IE1 != V) {
5530unsigned Idx1 =getElementIndex(IE1).value_or(*Idx2);
5531 IsReusedIdx |= ReusedIdx.test(Idx1);
5532 ReusedIdx.set(Idx1);
5533if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5534 IE1 =nullptr;
5535else
5536 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5537 }
5538if (IE2 && IE2 != VU) {
5539unsigned Idx2 =getElementIndex(IE2).value_or(*Idx1);
5540 IsReusedIdx |= ReusedIdx.test(Idx2);
5541 ReusedIdx.set(Idx2);
5542if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5543 IE2 =nullptr;
5544else
5545 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5546 }
5547 }while (!IsReusedIdx && (IE1 || IE2));
5548returnfalse;
5549}
5550
5551std::optional<BoUpSLP::OrdersType>
5552BoUpSLP::getReorderingData(const TreeEntry &TE,bool TopToBottom) {
5553// No need to reorder if we need to shuffle reuses; the node still has to be
5554// shuffled anyway.
5555if (!TE.ReuseShuffleIndices.empty()) {
5556// FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5557assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
5558"Reshuffling scalars not yet supported for nodes with padding");
5559
5560if (isSplat(TE.Scalars))
5561return std::nullopt;
5562// Check if reuse shuffle indices can be improved by reordering.
5563// For this, check that the reuse mask is "clustered", i.e. each scalar value
5564// is used once in each submask of size <number_of_scalars>.
5565// Example: 4 scalar values.
5566// ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5567// 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5568// element 3 is used twice in the second submask.
5569unsigned Sz = TE.Scalars.size();
5570if (TE.isGather()) {
5571if (std::optional<OrdersType> CurrentOrder =
5572findReusedOrderedScalars(TE)) {
5573SmallVector<int> Mask;
5574fixupOrderingIndices(*CurrentOrder);
5575inversePermutation(*CurrentOrder, Mask);
5576::addMask(Mask, TE.ReuseShuffleIndices);
5577OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5578unsigned Sz = TE.Scalars.size();
5579for (int K = 0,E = TE.getVectorFactor() / Sz; K <E; ++K) {
5580for (auto [I,Idx] :enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5581if (Idx !=PoisonMaskElem)
5582 Res[Idx + K * Sz] =I + K * Sz;
5583 }
5584return std::move(Res);
5585 }
5586 }
5587if (Sz == 2 && TE.getVectorFactor() == 4 &&
5588::getNumberOfParts(*TTI,getWidenedType(TE.Scalars.front()->getType(),
5589 2 * TE.getVectorFactor())) == 1)
5590return std::nullopt;
5591if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5592 Sz)) {
5593SmallVector<int> ReorderMask(Sz,PoisonMaskElem);
5594if (TE.ReorderIndices.empty())
5595 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5596else
5597inversePermutation(TE.ReorderIndices, ReorderMask);
5598::addMask(ReorderMask, TE.ReuseShuffleIndices);
5599unsigned VF = ReorderMask.size();
5600OrdersType ResOrder(VF, VF);
5601unsigned NumParts =divideCeil(VF, Sz);
5602SmallBitVector UsedVals(NumParts);
5603for (unsignedI = 0;I < VF;I += Sz) {
5604int Val =PoisonMaskElem;
5605unsigned UndefCnt = 0;
5606unsigned Limit = std::min(Sz, VF -I);
5607if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5608 [&](intIdx) {
5609if (Val ==PoisonMaskElem &&Idx !=PoisonMaskElem)
5610 Val =Idx;
5611if (Idx ==PoisonMaskElem)
5612 ++UndefCnt;
5613returnIdx !=PoisonMaskElem &&Idx != Val;
5614 }) ||
5615 Val >=static_cast<int>(NumParts) || UsedVals.test(Val) ||
5616 UndefCnt > Sz / 2)
5617return std::nullopt;
5618 UsedVals.set(Val);
5619for (unsigned K = 0; K < NumParts; ++K) {
5620unsignedIdx = Val + Sz * K;
5621if (Idx < VF)
5622 ResOrder[Idx] =I + K;
5623 }
5624 }
5625return std::move(ResOrder);
5626 }
5627unsigned VF = TE.getVectorFactor();
5628// Try to build the correct order for extractelement instructions.
5629SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5630 TE.ReuseShuffleIndices.end());
5631if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
5632all_of(TE.Scalars, [Sz](Value *V) {
5633 if (isa<PoisonValue>(V))
5634 return true;
5635 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5636 return Idx && *Idx < Sz;
5637 })) {
5638assert(!TE.isAltShuffle() &&"Alternate instructions are only supported "
5639"by BinaryOperator and CastInst.");
5640SmallVector<int> ReorderMask(Sz,PoisonMaskElem);
5641if (TE.ReorderIndices.empty())
5642 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5643else
5644inversePermutation(TE.ReorderIndices, ReorderMask);
5645for (unsignedI = 0;I < VF; ++I) {
5646int &Idx = ReusedMask[I];
5647if (Idx ==PoisonMaskElem)
5648continue;
5649Value *V = TE.Scalars[ReorderMask[Idx]];
5650 std::optional<unsigned> EI =getExtractIndex(cast<Instruction>(V));
5651Idx = std::distance(ReorderMask.begin(),find(ReorderMask, *EI));
5652 }
5653 }
5654// Build an order of VF size; the reuse shuffles need to be reordered, and
5655// they are always of VF size.
5656OrdersType ResOrder(VF);
5657 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5658auto *It = ResOrder.begin();
5659for (unsigned K = 0; K < VF; K += Sz) {
5660OrdersType CurrentOrder(TE.ReorderIndices);
5661SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5662if (SubMask.front() ==PoisonMaskElem)
5663 std::iota(SubMask.begin(), SubMask.end(), 0);
5664reorderOrder(CurrentOrder, SubMask);
5665transform(CurrentOrder, It, [K](unsigned Pos) {return Pos + K; });
5666 std::advance(It, Sz);
5667 }
5668if (TE.isGather() &&all_of(enumerate(ResOrder), [](constauto &Data) {
5669returnData.index() ==Data.value();
5670 }))
5671return std::nullopt;// No need to reorder.
5672return std::move(ResOrder);
5673 }
5674if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5675any_of(TE.UserTreeIndices,
5676 [](constEdgeInfo &EI) {
5677 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5678 }) &&
5679 (TE.ReorderIndices.empty() ||isReverseOrder(TE.ReorderIndices)))
5680return std::nullopt;
5681if ((TE.State == TreeEntry::Vectorize ||
5682 TE.State == TreeEntry::StridedVectorize) &&
5683 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5684 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5685assert(!TE.isAltShuffle() &&"Alternate instructions are only supported by "
5686"BinaryOperator and CastInst.");
5687return TE.ReorderIndices;
5688 }
5689if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5690if (!TE.ReorderIndices.empty())
5691return TE.ReorderIndices;
5692
5693SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
5694for (auto [I, V] :zip(UserBVHead, TE.Scalars)) {
5695if (!V->hasNUsesOrMore(1))
5696continue;
5697auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
5698if (!II)
5699continue;
5700Instruction *BVHead =nullptr;
5701BasicBlock *BB =II->getParent();
5702while (II &&II->hasOneUse() &&II->getParent() == BB) {
5703 BVHead =II;
5704II = dyn_cast<InsertElementInst>(II->getOperand(0));
5705 }
5706I = BVHead;
5707 }
5708
5709auto CompareByBasicBlocks = [&](BasicBlock *BB1,BasicBlock *BB2) {
5710assert(BB1 != BB2 &&"Expected different basic blocks.");
5711auto *NodeA = DT->getNode(BB1);
5712auto *NodeB = DT->getNode(BB2);
5713assert(NodeA &&"Should only process reachable instructions");
5714assert(NodeB &&"Should only process reachable instructions");
5715assert((NodeA == NodeB) ==
5716 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5717"Different nodes should have different DFS numbers");
5718return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5719 };
5720auto PHICompare = [&](unsigned I1,unsigned I2) {
5721Value *V1 = TE.Scalars[I1];
5722Value *V2 = TE.Scalars[I2];
5723if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5724returnfalse;
5725if (isa<PoisonValue>(V1))
5726returntrue;
5727if (isa<PoisonValue>(V2))
5728returnfalse;
5729if (V1->getNumUses() < V2->getNumUses())
5730returntrue;
5731if (V1->getNumUses() > V2->getNumUses())
5732returnfalse;
5733auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5734auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5735if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5736return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5737 FirstUserOfPhi2->getParent());
5738auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5739auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5740auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5741auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5742if (IE1 && !IE2)
5743returntrue;
5744if (!IE1 && IE2)
5745returnfalse;
5746if (IE1 && IE2) {
5747if (UserBVHead[I1] && !UserBVHead[I2])
5748returntrue;
5749if (!UserBVHead[I1])
5750returnfalse;
5751if (UserBVHead[I1] == UserBVHead[I2])
5752returngetElementIndex(IE1) <getElementIndex(IE2);
5753if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
5754return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
5755 UserBVHead[I2]->getParent());
5756return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5757 }
5758if (EE1 && !EE2)
5759returntrue;
5760if (!EE1 && EE2)
5761returnfalse;
5762if (EE1 && EE2) {
5763auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5764auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5765auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5766auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5767if (!Inst2 && !P2)
5768return Inst1 || P1;
5769if (EE1->getOperand(0) == EE2->getOperand(0))
5770returngetElementIndex(EE1) <getElementIndex(EE2);
5771if (!Inst1 && Inst2)
5772returnfalse;
5773if (Inst1 && Inst2) {
5774if (Inst1->getParent() != Inst2->getParent())
5775return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
5776return Inst1->comesBefore(Inst2);
5777 }
5778if (!P1 && P2)
5779returnfalse;
5780assert(P1 && P2 &&
5781"Expected either instructions or arguments vector operands.");
5782return P1->getArgNo() < P2->getArgNo();
5783 }
5784returnfalse;
5785 };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
      allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that gather of extractelements can be represented as
      // just a shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse =
          canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    // insertelement poison, v, 0 [+ permute]
    // is cheaper than
    // insertelement poison, v, n - try to reorder.
    // If rotating the whole graph, exclude the permute cost, the whole graph
    // might be transformed.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
            PoisonValue::get(Ty), *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
            PoisonValue::get(Ty), *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
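    // Illustrative example for the splat reordering above: if insert-at-0
    // plus a permute is reported cheaper than insert-at-Idx, a gather like
    // <undef, v, undef, undef> returns the order {Sz, 0, Sz, Sz}, i.e. the
    // single non-constant lane (here lane 1) is requested to move to lane 0
    // and the remaining positions are left unassigned.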
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if we can include the order of vectorized loads. For masked
    // gathers do extra analysis later, so include such nodes into a special
    // list.
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                         CurrentOrder, PointerOps);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
        return std::move(CurrentOrder);
    }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been audited for correctness with non-power-of-two vectors.
    if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
      if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
        return CurrentOrder;
  }
  return std::nullopt;
}

/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz, which are not identity submasks.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}
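// Illustrative example for isRepeatedNonIdentityClusteredMask above: with
// Sz == 2, the mask <1, 0, 1, 0> is a repeated non-identity cluster (every
// pair is the swapped pair <1, 0>), while <0, 1, 0, 1> is rejected because
// its first cluster is an identity submask.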
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reuses, no need to do anything else.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
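// combineOrders (below) fills the unassigned slots (value == Sz) of Order:
// with no SecondaryOrder it assigns Idx to slot Idx when Idx is not already
// used as a target elsewhere in Order, otherwise it takes the secondary
// order's candidate when that target index is still free. Illustrative
// example with Sz == 4: Order = {4, 1, 4, 3} combined with the secondary
// order {0, 1, 2, 3} becomes {0, 1, 2, 3}.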
static void combineOrders(MutableArrayRef<unsigned> Order,
                          ArrayRef<unsigned> SecondaryOrder) {
  assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
         "Expected same size of orders");
  unsigned Sz = Order.size();
  SmallBitVector UsedIndices(Sz);
  for (unsigned Idx : seq<unsigned>(0, Sz)) {
    if (Order[Idx] != Sz)
      UsedIndices.set(Order[Idx]);
  }
  if (SecondaryOrder.empty()) {
    for (unsigned Idx : seq<unsigned>(0, Sz))
      if (Order[Idx] == Sz && !UsedIndices.test(Idx))
        Order[Idx] = Idx;
  } else {
    for (unsigned Idx : seq<unsigned>(0, Sz))
      if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
          !UsedIndices.test(SecondaryOrder[Idx]))
        Order[Idx] = SecondaryOrder[Idx];
  }
}
5937
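// reorderTopToBottom (below) walks the graph grouped by vector factor,
// collects the preferred orders reported by getReorderingData, by external
// store users, and by alt-shuffle and PHI nodes, picks the most frequently
// used (identity-biased) order per VF, and applies it to the matching tree
// entries, their operands and their reuse masks.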
5938voidBoUpSLP::reorderTopToBottom() {
5939// Maps VF to the graph nodes.
5940DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5941// ExtractElement gather nodes which can be vectorized and need to handle
5942// their ordering.
5943DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5944
5945// Phi nodes can have preferred ordering based on their result users
5946DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5947
5948// AltShuffles can also have a preferred ordering that leads to fewer
5949// instructions, e.g., the addsub instruction in x86.
5950DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5951
5952// Maps a TreeEntry to the reorder indices of external users.
5953DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5954 ExternalUserReorderMap;
5955// Find all reorderable nodes with the given VF.
5956// Currently the are vectorized stores,loads,extracts + some gathering of
5957// extracts.
5958for_each(VectorizableTree, [&, &TTIRef = *TTI](
5959const std::unique_ptr<TreeEntry> &TE) {
5960// Look for external users that will probably be vectorized.
5961SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5962 findExternalStoreUsersReorderIndices(TE.get());
5963if (!ExternalUserReorderIndices.empty()) {
5964 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5965 ExternalUserReorderMap.try_emplace(TE.get(),
5966 std::move(ExternalUserReorderIndices));
5967 }
5968
5969// Patterns like [fadd,fsub] can be combined into a single instruction in
5970// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5971// to take into account their order when looking for the most used order.
5972if (TE->hasState() && TE->isAltShuffle()) {
5973VectorType *VecTy =
5974getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5975unsigned Opcode0 = TE->getOpcode();
5976unsigned Opcode1 = TE->getAltOpcode();
5977SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5978// If this pattern is supported by the target then we consider the order.
5979if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5980 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5981 AltShufflesToOrders.try_emplace(TE.get(),OrdersType());
5982 }
5983// TODO: Check the reverse order too.
5984 }
5985
5986if (std::optional<OrdersType> CurrentOrder =
5987getReorderingData(*TE,/*TopToBottom=*/true)) {
5988// Do not include ordering for nodes used in the alt opcode vectorization,
5989// better to reorder them during bottom-to-top stage. If follow the order
5990// here, it causes reordering of the whole graph though actually it is
5991// profitable just to reorder the subgraph that starts from the alternate
5992// opcode vectorization node. Such nodes already end-up with the shuffle
5993// instruction and it is just enough to change this shuffle rather than
5994// rotate the scalars for the whole graph.
5995unsigned Cnt = 0;
5996const TreeEntry *UserTE = TE.get();
5997while (UserTE && Cnt <RecursionMaxDepth) {
5998if (UserTE->UserTreeIndices.size() != 1)
5999break;
6000if (all_of(UserTE->UserTreeIndices, [](constEdgeInfo &EI) {
6001 return EI.UserTE->State == TreeEntry::Vectorize &&
6002 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
6003 }))
6004return;
6005 UserTE = UserTE->UserTreeIndices.back().UserTE;
6006 ++Cnt;
6007 }
6008 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
6009if (!(TE->State == TreeEntry::Vectorize ||
6010 TE->State == TreeEntry::StridedVectorize) ||
6011 !TE->ReuseShuffleIndices.empty())
6012 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
6013if (TE->State == TreeEntry::Vectorize &&
6014 TE->getOpcode() == Instruction::PHI)
6015 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
6016 }
6017 });
6018
6019// Reorder the graph nodes according to their vectorization factor.
6020for (unsigned VF = VectorizableTree.front()->getVectorFactor();
6021 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
6022auto It = VFToOrderedEntries.find(VF);
6023if (It == VFToOrderedEntries.end())
6024continue;
6025// Try to find the most profitable order. We just are looking for the most
6026// used order and reorder scalar elements in the nodes according to this
6027// mostly used order.
6028ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
6029// Delete VF entry upon exit.
6030autoCleanup =make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
6031
6032// All operands are reordered and used only in this node - propagate the
6033// most used order to the user node.
6034MapVector<OrdersType,unsigned,
6035DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
6036 OrdersUses;
6037SmallPtrSet<const TreeEntry *, 4> VisitedOps;
6038for (const TreeEntry *OpTE : OrderedEntries) {
6039// No need to reorder this nodes, still need to extend and to use shuffle,
6040// just need to merge reordering shuffle and the reuse shuffle.
6041if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6042continue;
6043// Count number of orders uses.
6044constauto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6045 &PhisToOrders]() ->constOrdersType & {
6046if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6047auto It = GathersToOrders.find(OpTE);
6048if (It != GathersToOrders.end())
6049return It->second;
6050 }
6051if (OpTE->hasState() && OpTE->isAltShuffle()) {
6052auto It = AltShufflesToOrders.find(OpTE);
6053if (It != AltShufflesToOrders.end())
6054return It->second;
6055 }
6056if (OpTE->State == TreeEntry::Vectorize &&
6057 OpTE->getOpcode() == Instruction::PHI) {
6058auto It = PhisToOrders.find(OpTE);
6059if (It != PhisToOrders.end())
6060return It->second;
6061 }
6062return OpTE->ReorderIndices;
6063 }();
6064// First consider the order of the external scalar users.
6065auto It = ExternalUserReorderMap.find(OpTE);
6066if (It != ExternalUserReorderMap.end()) {
6067constauto &ExternalUserReorderIndices = It->second;
6068// If the OpTE vector factor != number of scalars - use natural order,
6069// it is an attempt to reorder node with reused scalars but with
6070// external uses.
6071if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6072 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
6073 ExternalUserReorderIndices.size();
6074 }else {
6075for (constOrdersType &ExtOrder : ExternalUserReorderIndices)
6076 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6077 }
6078// No other useful reorder data in this entry.
6079if (Order.empty())
6080continue;
6081 }
6082// Stores actually store the mask, not the order, need to invert.
6083if (OpTE->State == TreeEntry::Vectorize &&
6084 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6085assert(!OpTE->isAltShuffle() &&
6086"Alternate instructions are only supported by BinaryOperator "
6087"and CastInst.");
6088SmallVector<int> Mask;
6089inversePermutation(Order, Mask);
6090unsignedE = Order.size();
6091OrdersType CurrentOrder(E,E);
6092transform(Mask, CurrentOrder.begin(), [E](intIdx) {
6093 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6094 });
6095fixupOrderingIndices(CurrentOrder);
6096 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6097 }else {
6098 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6099 }
6100 }
6101if (OrdersUses.empty())
6102continue;
6103// Choose the most used order.
6104unsigned IdentityCnt = 0;
6105unsigned FilledIdentityCnt = 0;
6106OrdersType IdentityOrder(VF, VF);
6107for (auto &Pair : OrdersUses) {
6108if (Pair.first.empty() ||isIdentityOrder(Pair.first)) {
6109if (!Pair.first.empty())
6110 FilledIdentityCnt += Pair.second;
6111 IdentityCnt += Pair.second;
6112combineOrders(IdentityOrder, Pair.first);
6113 }
6114 }
6115MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6116unsigned Cnt = IdentityCnt;
6117for (auto &Pair : OrdersUses) {
6118// Prefer identity order. But, if filled identity found (non-empty order)
6119// with same number of uses, as the new candidate order, we can choose
6120// this candidate order.
6121if (Cnt < Pair.second ||
6122 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6123 Cnt == Pair.second && !BestOrder.empty() &&
6124isIdentityOrder(BestOrder))) {
6125combineOrders(Pair.first, BestOrder);
6126 BestOrder = Pair.first;
6127 Cnt = Pair.second;
6128 }else {
6129combineOrders(BestOrder, Pair.first);
6130 }
6131 }
6132// Set order of the user node.
6133if (isIdentityOrder(BestOrder))
6134continue;
6135fixupOrderingIndices(BestOrder);
6136SmallVector<int> Mask;
6137inversePermutation(BestOrder, Mask);
6138SmallVector<int> MaskOrder(BestOrder.size(),PoisonMaskElem);
6139unsignedE = BestOrder.size();
6140transform(BestOrder, MaskOrder.begin(), [E](unsignedI) {
6141 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6142 });
6143// Do an actual reordering, if profitable.
6144for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6145// Just do the reordering for the nodes with the given VF.
6146if (TE->Scalars.size() != VF) {
6147if (TE->ReuseShuffleIndices.size() == VF) {
6148// Need to reorder the reuses masks of the operands with smaller VF to
6149// be able to find the match between the graph nodes and scalar
6150// operands of the given node during vectorization/cost estimation.
6151assert(all_of(TE->UserTreeIndices,
6152 [VF, &TE](constEdgeInfo &EI) {
6153 return EI.UserTE->Scalars.size() == VF ||
6154 EI.UserTE->Scalars.size() ==
6155 TE->Scalars.size();
6156 }) &&
6157"All users must be of VF size.");
6158if (SLPReVec) {
6159assert(SLPReVec &&"Only supported by REVEC.");
6160// ShuffleVectorInst does not do reorderOperands (and it should not
6161// because ShuffleVectorInst supports only a limited set of
6162// patterns). Only do reorderNodeWithReuses if all of the users are
6163// not ShuffleVectorInst.
6164if (all_of(TE->UserTreeIndices, [&](constEdgeInfo &EI) {
6165 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6166 }))
6167continue;
6168assert(none_of(TE->UserTreeIndices,
6169 [&](constEdgeInfo &EI) {
6170 return isa<ShuffleVectorInst>(
6171 EI.UserTE->getMainOp());
6172 }) &&
6173"Does not know how to reorder.");
6174 }
6175// Update ordering of the operands with the smaller VF than the given
6176// one.
6177 reorderNodeWithReuses(*TE, Mask);
6178 }
6179continue;
6180 }
6181if ((TE->State == TreeEntry::Vectorize ||
6182 TE->State == TreeEntry::StridedVectorize) &&
6183 (isa<ExtractElementInst,ExtractValueInst,LoadInst,StoreInst,
6184InsertElementInst>(TE->getMainOp()) ||
6185 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6186assert(!TE->isAltShuffle() &&
6187"Alternate instructions are only supported by BinaryOperator "
6188"and CastInst.");
6189// Build correct orders for extract{element,value}, loads and
6190// stores.
6191reorderOrder(TE->ReorderIndices, Mask);
6192if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6193 TE->reorderOperands(Mask);
6194 }else {
6195// Reorder the node and its operands.
6196 TE->reorderOperands(Mask);
6197assert(TE->ReorderIndices.empty() &&
6198"Expected empty reorder sequence.");
6199reorderScalars(TE->Scalars, Mask);
6200 }
6201if (!TE->ReuseShuffleIndices.empty()) {
6202// Apply reversed order to keep the original ordering of the reused
6203// elements to avoid extra reorder indices shuffling.
6204OrdersType CurrentOrder;
6205reorderOrder(CurrentOrder, MaskOrder);
6206SmallVector<int> NewReuses;
6207inversePermutation(CurrentOrder, NewReuses);
6208addMask(NewReuses, TE->ReuseShuffleIndices);
6209 TE->ReuseShuffleIndices.swap(NewReuses);
6210 }
6211 }
6212 }
6213}
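// canReorderOperands (below) checks whether all operands of UserTE can follow
// a common reordering: an already-vectorized operand must be used only by
// UserTE, and an operand that maps to more than one non-vectorized (gather)
// node is rejected unless it is all-constant. Reorderable gather operands are
// collected into GatherOps.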
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize);
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required. Similar to the gathers, so
      // simply add to the list of gathered ops.
      // If there are reused scalars, process this node as a regular vectorize
      // node, just reorder reuses mask.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          TE->State != TreeEntry::StridedVectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}
6269
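// reorderBottomToTop (below) propagates orders from the leaves towards the
// root: for each user node whose operands are all reorderable it counts the
// orders requested by the operands, picks the most used (identity-biased)
// one, reorders the operands, the gathers and the user itself, and re-queues
// the user so the order can keep propagating towards the root. If the root's
// reordering turns out to be unnecessary it is dropped at the end.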
6270voidBoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
6271SetVector<TreeEntry *> OrderedEntries;
6272DenseSet<const TreeEntry *> GathersToOrders;
6273// Find all reorderable leaf nodes with the given VF.
6274// Currently the are vectorized loads,extracts without alternate operands +
6275// some gathering of extracts.
6276SmallVector<TreeEntry *> NonVectorized;
6277for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6278if (TE->State != TreeEntry::Vectorize &&
6279 TE->State != TreeEntry::StridedVectorize)
6280 NonVectorized.push_back(TE.get());
6281if (std::optional<OrdersType> CurrentOrder =
6282getReorderingData(*TE,/*TopToBottom=*/false)) {
6283 OrderedEntries.insert(TE.get());
6284if (!(TE->State == TreeEntry::Vectorize ||
6285 TE->State == TreeEntry::StridedVectorize) ||
6286 !TE->ReuseShuffleIndices.empty())
6287 GathersToOrders.insert(TE.get());
6288 }
6289 }
6290
6291// 1. Propagate order to the graph nodes, which use only reordered nodes.
6292// I.e., if the node has operands, that are reordered, try to make at least
6293// one operand order in the natural order and reorder others + reorder the
6294// user node itself.
6295SmallPtrSet<const TreeEntry *, 4> Visited;
6296while (!OrderedEntries.empty()) {
6297// 1. Filter out only reordered nodes.
6298// 2. If the entry has multiple uses - skip it and jump to the next node.
6299DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>Users;
6300SmallVector<TreeEntry *> Filtered;
6301for (TreeEntry *TE : OrderedEntries) {
6302if (!(TE->State == TreeEntry::Vectorize ||
6303 TE->State == TreeEntry::StridedVectorize ||
6304 (TE->isGather() && GathersToOrders.contains(TE))) ||
6305 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6306 !all_of(drop_begin(TE->UserTreeIndices),
6307 [TE](constEdgeInfo &EI) {
6308 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6309 }) ||
6310 !Visited.insert(TE).second) {
6311 Filtered.push_back(TE);
6312continue;
6313 }
6314// Build a map between user nodes and their operands order to speedup
6315// search. The graph currently does not provide this dependency directly.
6316for (EdgeInfo &EI : TE->UserTreeIndices)
6317Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
6318 }
6319// Erase filtered entries.
6320for (TreeEntry *TE : Filtered)
6321 OrderedEntries.remove(TE);
6322SmallVector<
6323 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6324 UsersVec(Users.begin(),Users.end());
6325sort(UsersVec, [](constauto &Data1,constauto &Data2) {
6326return Data1.first->Idx > Data2.first->Idx;
6327 });
6328for (auto &Data : UsersVec) {
6329// Check that operands are used only in the User node.
6330SmallVector<TreeEntry *> GatherOps;
6331if (!canReorderOperands(Data.first,Data.second, NonVectorized,
6332 GatherOps)) {
6333for (const std::pair<unsigned, TreeEntry *> &Op :Data.second)
6334 OrderedEntries.remove(Op.second);
6335continue;
6336 }
6337// All operands are reordered and used only in this node - propagate the
6338// most used order to the user node.
6339MapVector<OrdersType,unsigned,
6340DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
6341 OrdersUses;
6342// Do the analysis for each tree entry only once, otherwise the order of
6343// the same node my be considered several times, though might be not
6344// profitable.
6345SmallPtrSet<const TreeEntry *, 4> VisitedOps;
6346SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
6347for (constauto &Op :Data.second) {
6348 TreeEntry *OpTE =Op.second;
6349if (!VisitedOps.insert(OpTE).second)
6350continue;
6351if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6352continue;
6353constauto Order = [&]() ->constOrdersType {
6354if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6355returngetReorderingData(*OpTE,/*TopToBottom=*/false)
6356 .value_or(OrdersType(1));
6357return OpTE->ReorderIndices;
6358 }();
6359// The order is partially ordered, skip it in favor of fully non-ordered
6360// orders.
6361if (Order.size() == 1)
6362continue;
6363unsigned NumOps =count_if(
6364Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
6365 return P.second == OpTE;
6366 });
6367// Stores actually store the mask, not the order, need to invert.
6368if (OpTE->State == TreeEntry::Vectorize &&
6369 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6370assert(!OpTE->isAltShuffle() &&
6371"Alternate instructions are only supported by BinaryOperator "
6372"and CastInst.");
6373SmallVector<int> Mask;
6374inversePermutation(Order, Mask);
6375unsignedE = Order.size();
6376OrdersType CurrentOrder(E,E);
6377transform(Mask, CurrentOrder.begin(), [E](intIdx) {
6378 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6379 });
6380fixupOrderingIndices(CurrentOrder);
6381 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6382 NumOps;
6383 }else {
6384 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6385 }
6386auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
6387constauto AllowsReordering = [&](const TreeEntry *TE) {
6388if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6389 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6390 (IgnoreReorder && TE->Idx == 0))
6391returntrue;
6392if (TE->isGather()) {
6393if (GathersToOrders.contains(TE))
6394return !getReorderingData(*TE,/*TopToBottom=*/false)
6395 .value_or(OrdersType(1))
6396 .empty();
6397returntrue;
6398 }
6399returnfalse;
6400 };
6401for (constEdgeInfo &EI : OpTE->UserTreeIndices) {
6402 TreeEntry *UserTE = EI.UserTE;
6403if (!VisitedUsers.insert(UserTE).second)
6404continue;
6405// May reorder user node if it requires reordering, has reused
6406// scalars, is an alternate op vectorize node or its op nodes require
6407// reordering.
6408if (AllowsReordering(UserTE))
6409continue;
6410// Check if users allow reordering.
6411// Currently look up just 1 level of operands to avoid increase of
6412// the compile time.
6413// Profitable to reorder if definitely more operands allow
6414// reordering rather than those with natural order.
6415ArrayRef<std::pair<unsigned, TreeEntry *>> Ops =Users[UserTE];
6416if (static_cast<unsigned>(count_if(
6417 Ops, [UserTE, &AllowsReordering](
6418const std::pair<unsigned, TreeEntry *> &Op) {
6419return AllowsReordering(Op.second) &&
6420all_of(Op.second->UserTreeIndices,
6421 [UserTE](constEdgeInfo &EI) {
6422 return EI.UserTE == UserTE;
6423 });
6424 })) <= Ops.size() / 2)
6425 ++Res.first->second;
6426 }
6427 }
6428if (OrdersUses.empty()) {
6429for (const std::pair<unsigned, TreeEntry *> &Op :Data.second)
6430 OrderedEntries.remove(Op.second);
6431continue;
6432 }
6433// Choose the most used order.
6434unsigned IdentityCnt = 0;
6435unsigned VF =Data.second.front().second->getVectorFactor();
6436OrdersType IdentityOrder(VF, VF);
6437for (auto &Pair : OrdersUses) {
6438if (Pair.first.empty() ||isIdentityOrder(Pair.first)) {
6439 IdentityCnt += Pair.second;
6440combineOrders(IdentityOrder, Pair.first);
6441 }
6442 }
6443MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6444unsigned Cnt = IdentityCnt;
6445for (auto &Pair : OrdersUses) {
6446// Prefer identity order. But, if filled identity found (non-empty
6447// order) with same number of uses, as the new candidate order, we can
6448// choose this candidate order.
6449if (Cnt < Pair.second) {
6450combineOrders(Pair.first, BestOrder);
6451 BestOrder = Pair.first;
6452 Cnt = Pair.second;
6453 }else {
6454combineOrders(BestOrder, Pair.first);
6455 }
6456 }
6457// Set order of the user node.
6458if (isIdentityOrder(BestOrder)) {
6459for (const std::pair<unsigned, TreeEntry *> &Op :Data.second)
6460 OrderedEntries.remove(Op.second);
6461continue;
6462 }
6463fixupOrderingIndices(BestOrder);
6464// Erase operands from OrderedEntries list and adjust their orders.
6465 VisitedOps.clear();
6466SmallVector<int> Mask;
6467inversePermutation(BestOrder, Mask);
6468SmallVector<int> MaskOrder(BestOrder.size(),PoisonMaskElem);
6469unsignedE = BestOrder.size();
6470transform(BestOrder, MaskOrder.begin(), [E](unsignedI) {
6471 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6472 });
6473for (const std::pair<unsigned, TreeEntry *> &Op :Data.second) {
6474 TreeEntry *TE =Op.second;
6475 OrderedEntries.remove(TE);
6476if (!VisitedOps.insert(TE).second)
6477continue;
6478if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
6479 reorderNodeWithReuses(*TE, Mask);
6480continue;
6481 }
6482// Gathers are processed separately.
6483if (TE->State != TreeEntry::Vectorize &&
6484 TE->State != TreeEntry::StridedVectorize &&
6485 (TE->State != TreeEntry::ScatterVectorize ||
6486 TE->ReorderIndices.empty()))
6487continue;
6488assert((BestOrder.size() == TE->ReorderIndices.size() ||
6489 TE->ReorderIndices.empty()) &&
6490"Non-matching sizes of user/operand entries.");
6491reorderOrder(TE->ReorderIndices, Mask);
6492if (IgnoreReorder && TE == VectorizableTree.front().get())
6493 IgnoreReorder =false;
6494 }
6495// For gathers just need to reorder its scalars.
6496for (TreeEntry *Gather : GatherOps) {
6497assert(Gather->ReorderIndices.empty() &&
6498"Unexpected reordering of gathers.");
6499if (!Gather->ReuseShuffleIndices.empty()) {
6500// Just reorder reuses indices.
6501reorderReuses(Gather->ReuseShuffleIndices, Mask);
6502continue;
6503 }
6504reorderScalars(Gather->Scalars, Mask);
6505 OrderedEntries.remove(Gather);
6506 }
6507// Reorder operands of the user node and set the ordering for the user
6508// node itself.
6509if (Data.first->State != TreeEntry::Vectorize ||
6510 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6511Data.first->getMainOp()) ||
6512Data.first->isAltShuffle())
6513Data.first->reorderOperands(Mask);
6514if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
6515Data.first->isAltShuffle() ||
6516Data.first->State == TreeEntry::StridedVectorize) {
6517reorderScalars(Data.first->Scalars, Mask);
6518reorderOrder(Data.first->ReorderIndices, MaskOrder,
6519/*BottomOrder=*/true);
6520if (Data.first->ReuseShuffleIndices.empty() &&
6521 !Data.first->ReorderIndices.empty() &&
6522 !Data.first->isAltShuffle()) {
6523// Insert user node to the list to try to sink reordering deeper in
6524// the graph.
6525 OrderedEntries.insert(Data.first);
6526 }
6527 }else {
6528reorderOrder(Data.first->ReorderIndices, Mask);
6529 }
6530 }
6531 }
6532// If the reordering is unnecessary, just remove the reorder.
6533if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6534 VectorizableTree.front()->ReuseShuffleIndices.empty())
6535 VectorizableTree.front()->ReorderIndices.clear();
6536}
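// getRootEntryInstruction (below) returns the instruction that anchors the
// entry: normally the first scalar, but for reversed strided loads/stores the
// scalar selected by the first reorder index.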
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
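// buildExternalUses (below) records, for every vectorized scalar, the
// external users that will still need the scalar value after vectorization
// (extra arguments, users outside the tree, or in-tree users that keep using
// the scalar form), so extracts can be emitted later. A null user marks a
// scalar that must be extracted regardless of a concrete user.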
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        // Skip in-tree scalars that become vectors
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
              !doesInTreeUserNeedToExtract(
                  Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(!UseEntry->isGather() && "Bad state");
            continue;
          }
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, FoundLane);
        if (!U)
          break;
      }
    }
  }
}
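// collectUserStores (below) groups the stores that use this entry's scalars
// by (basic block, stored type, underlying pointer object), keeping at most
// one comparable store per lane, as candidates for vectorizable store groups.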
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>, 8>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    if (V->hasNUsesOrMore(UsesLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // Test whether we can handle the store. V might be a global, which could
      // be used in a different function.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entry if the user is already part of the tree.
      if (getTreeEntry(U))
        continue;

      Value *Ptr =
          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(), Ptr}];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(P.second);
    ++I;
  }
  return Res;
}
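// canFormVector (below) accepts a store group only if the pointer offsets
// from the first store are consecutive once sorted. Illustrative example:
// offsets {0, 2, 1, 3} sort to {0, 1, 2, 3}, so the group is accepted with
// ReorderIndices = {0, 2, 1, 3}; an already-ordered group yields an empty
// (identity) ReorderIndices.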
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoreVec can form a vector by sorting them
  // and checking whether they are consecutive.

  // To avoid calling getPointersDiff() while sorting we create a vector of
  // pairs {store, offset from first} and sort this instead.
  SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec.emplace_back(0, 0);
  Type *S0Ty = S0->getValueOperand()->getType();
  Value *S0Ptr = S0->getPointerOperand();
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    StoreOffsetVec.emplace_back(*Diff, Idx);
  }

  // Check if the stores are consecutive by checking if their difference is 1.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec,
       [](const std::pair<int, unsigned> &L,
          const std::pair<int, unsigned> &R) { return L.first < R.first; });
  unsigned Idx = 0;
  int PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }

  // Calculate the shuffle indices according to their offset against the sorted
  // StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(), so we are following the
  // same convention here.
  if (IsIdentity)
    ReorderIndices.clear();

  return true;
}
#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
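// findExternalStoreUsersReorderIndices (below) turns the store groups found
// by collectUserStores into reorder-index candidates: only groups with as
// many stores as lanes that pass canFormVector contribute an order.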
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and push
  // it into `ExternalReorderIndices`
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;

    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;

    // We now know that the scalars in StoresVec can form a vector instruction,
    // so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
6786
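// Note: the helper below clusters the loads in VL by parent block, type and
// underlying pointer object, records each load's pointer distance inside its
// cluster, and then merges clusters into the GatheredLoads lists when enough
// new (non-repeated) loads make the combined list worth vectorizing.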
6787/// Tries to find subvector of loads and builds new vector of only loads if can
6788/// be profitable.
6789staticvoidgatherPossiblyVectorizableLoads(
6790constBoUpSLP &R,ArrayRef<Value *> VL,constDataLayout &DL,
6791ScalarEvolution &SE,constTargetTransformInfo &TTI,
6792SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
6793bool AddNew =true) {
6794if (VL.empty())
6795return;
6796Type *ScalarTy =getValueType(VL.front());
6797if (!isValidElementType(ScalarTy))
6798return;
6799SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
6800SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
6801for (Value *V : VL) {
6802auto *LI = dyn_cast<LoadInst>(V);
6803if (!LI)
6804continue;
6805if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6806continue;
6807bool IsFound =false;
6808for (auto [Map,Data] :zip(ClusteredDistToLoad, ClusteredLoads)) {
6809assert(LI->getParent() ==Data.front().first->getParent() &&
6810 LI->getType() ==Data.front().first->getType() &&
6811getUnderlyingObject(LI->getPointerOperand(),RecursionMaxDepth) ==
6812getUnderlyingObject(Data.front().first->getPointerOperand(),
6813RecursionMaxDepth) &&
6814"Expected loads with the same type, same parent and same "
6815"underlying pointer.");
6816 std::optional<int> Dist =getPointersDiff(
6817 LI->getType(), LI->getPointerOperand(),Data.front().first->getType(),
6818Data.front().first->getPointerOperand(),DL, SE,
6819/*StrictCheck=*/true);
6820if (!Dist)
6821continue;
6822auto It = Map.find(*Dist);
6823if (It != Map.end() && It->second != LI)
6824continue;
6825if (It == Map.end()) {
6826Data.emplace_back(LI, *Dist);
6827 Map.try_emplace(*Dist, LI);
6828 }
6829 IsFound =true;
6830break;
6831 }
6832if (!IsFound) {
6833 ClusteredLoads.emplace_back().emplace_back(LI, 0);
6834 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
6835 }
6836 }
6837auto FindMatchingLoads =
6838 [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
6839SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
6840 &GatheredLoads,
6841SetVector<unsigned> &ToAdd,SetVector<unsigned> &Repeated,
6842int &Offset,unsigned &Start) {
6843if (Loads.empty())
6844return GatheredLoads.end();
6845SmallVector<std::pair<int, int>> Res;
6846LoadInst *LI = Loads.front().first;
6847for (auto [Idx,Data] :enumerate(GatheredLoads)) {
6848if (Idx < Start)
6849continue;
6850 ToAdd.clear();
6851if (LI->getParent() !=Data.front().first->getParent() ||
6852 LI->getType() !=Data.front().first->getType())
6853continue;
6854 std::optional<int> Dist =
6855getPointersDiff(LI->getType(), LI->getPointerOperand(),
6856Data.front().first->getType(),
6857Data.front().first->getPointerOperand(),DL, SE,
6858/*StrictCheck=*/true);
6859if (!Dist)
6860continue;
6861SmallSet<int, 4> DataDists;
6862SmallPtrSet<LoadInst *, 4> DataLoads;
6863for (std::pair<LoadInst *, int>P :Data) {
6864 DataDists.insert(P.second);
6865 DataLoads.insert(P.first);
6866 }
6867// Found matching gathered loads - check if all loads are unique or
6868// can be effectively vectorized.
6869unsigned NumUniques = 0;
6870for (auto [Cnt, Pair] :enumerate(Loads)) {
6871bool Used = DataLoads.contains(Pair.first);
6872if (!Used && !DataDists.contains(*Dist + Pair.second)) {
6873 ++NumUniques;
6874 ToAdd.insert(Cnt);
6875 }elseif (Used) {
6876 Repeated.insert(Cnt);
6877 }
6878 }
6879if (NumUniques > 0 &&
6880 (Loads.size() == NumUniques ||
6881 (Loads.size() - NumUniques >= 2 &&
6882 Loads.size() - NumUniques >= Loads.size() / 2 &&
6883 (has_single_bit(Data.size() + NumUniques) ||
6884bit_ceil(Data.size()) <
6885bit_ceil(Data.size() + NumUniques))))) {
6886Offset = *Dist;
6887 Start =Idx + 1;
6888return std::next(GatheredLoads.begin(),Idx);
6889 }
6890 }
6891 ToAdd.clear();
6892return GatheredLoads.end();
6893 };
6894for (ArrayRef<std::pair<LoadInst *, int>>Data : ClusteredLoads) {
6895unsigned Start = 0;
6896SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
6897intOffset = 0;
6898auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
6899Offset, Start);
6900while (It != GatheredLoads.end()) {
6901assert(!LocalToAdd.empty() &&"Expected some elements to add.");
6902for (unsignedIdx : LocalToAdd)
6903 It->emplace_back(Data[Idx].first,Data[Idx].second +Offset);
6904 ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
6905 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,Offset,
6906 Start);
6907 }
6908if (any_of(seq<unsigned>(Data.size()), [&](unsignedIdx) {
6909 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6910 })) {
6911auto AddNewLoads =
6912 [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
6913for (unsignedIdx : seq<unsigned>(Data.size())) {
6914if (ToAdd.contains(Idx) || Repeated.contains(Idx))
6915continue;
6916 Loads.push_back(Data[Idx]);
6917 }
6918 };
6919if (!AddNew) {
6920LoadInst *LI =Data.front().first;
6921 It =find_if(
6922 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6923return PD.front().first->getParent() == LI->getParent() &&
6924 PD.front().first->getType() == LI->getType();
6925 });
6926while (It != GatheredLoads.end()) {
6927 AddNewLoads(*It);
6928 It = std::find_if(
6929 std::next(It), GatheredLoads.end(),
6930 [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6931 return PD.front().first->getParent() == LI->getParent() &&
6932 PD.front().first->getType() == LI->getType();
6933 });
6934 }
6935 }
6936 GatheredLoads.emplace_back().append(Data.begin(),Data.end());
6937 AddNewLoads(GatheredLoads.emplace_back());
6938 }
6939 }
6940}
6941
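// tryToVectorizeGatheredLoads (below) takes the load clusters collected
// above, sorts each cluster by pointer distance, splits it into consecutive
// runs, and tries decreasing vector factors (including masked-gather and
// interleaved forms) on each run, building new tree entries for the slices
// that can be vectorized and remembering the loads that cannot.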
6942void BoUpSLP::tryToVectorizeGatheredLoads(
6943constSmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
6944SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
6945 8> &GatheredLoads) {
6946 GatheredLoadsEntriesFirst = VectorizableTree.size();
6947
6948SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
6949 LoadEntriesToVectorize.size());
6950for (auto [Idx, Set] :zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6951Set.insert(VectorizableTree[Idx]->Scalars.begin(),
6952 VectorizableTree[Idx]->Scalars.end());
6953
6954// Sort loads by distance.
6955auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
6956const std::pair<LoadInst *, int> &L2) {
6957return L1.second > L2.second;
6958 };
6959
6960auto IsMaskedGatherSupported = [&,TTI =TTI](ArrayRef<LoadInst *> Loads) {
6961ArrayRef<Value *> Values(reinterpret_cast<Value *const*>(Loads.begin()),
6962 Loads.size());
6963Align Alignment = computeCommonAlignment<LoadInst>(Values);
6964auto *Ty =getWidenedType(Loads.front()->getType(), Loads.size());
6965returnTTI->isLegalMaskedGather(Ty, Alignment) &&
6966 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
6967 };
6968
6969auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
6970BoUpSLP::ValueSet &VectorizedLoads,
6971SmallVectorImpl<LoadInst *> &NonVectorized,
6972bool Final,unsigned MaxVF) {
6973SmallVector<std::pair<ArrayRef<Value *>,LoadsState>>Results;
6974unsigned StartIdx = 0;
6975SmallVector<int> CandidateVFs;
6976if (VectorizeNonPowerOf2 &&has_single_bit(MaxVF + 1))
6977 CandidateVFs.push_back(MaxVF);
6978for (int NumElts =getFloorFullVectorNumberOfElements(
6979 *TTI, Loads.front()->getType(), MaxVF);
6980 NumElts > 1; NumElts =getFloorFullVectorNumberOfElements(
6981 *TTI, Loads.front()->getType(), NumElts - 1)) {
6982 CandidateVFs.push_back(NumElts);
6983if (VectorizeNonPowerOf2 && NumElts > 2)
6984 CandidateVFs.push_back(NumElts - 1);
6985 }
6986
6987if (Final && CandidateVFs.empty())
6988returnResults;
6989
6990unsigned BestVF = Final ? CandidateVFs.back() : 0;
6991for (unsigned NumElts : CandidateVFs) {
6992if (Final && NumElts > BestVF)
6993continue;
6994SmallVector<unsigned> MaskedGatherVectorized;
6995for (unsigned Cnt = StartIdx,E = Loads.size(); Cnt <E;
6996 ++Cnt) {
6997ArrayRef<LoadInst *> Slice =
6998ArrayRef(Loads).slice(Cnt, std::min(NumElts,E - Cnt));
6999if (VectorizedLoads.count(Slice.front()) ||
7000 VectorizedLoads.count(Slice.back()) ||
7001areKnownNonVectorizableLoads(Slice))
7002continue;
7003// Check if it is profitable to try vectorizing gathered loads. It is
7004// profitable if we have more than 3 consecutive loads or if we have
7005// less but all users are vectorized or deleted.
7006bool AllowToVectorize =false;
7007// Check if it is profitable to vectorize 2-elements loads.
7008if (NumElts == 2) {
7009bool IsLegalBroadcastLoad =TTI->isLegalBroadcastLoad(
7010 Slice.front()->getType(),ElementCount::getFixed(NumElts));
7011auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
7012for (LoadInst *LI : Slice) {
7013// If single use/user - allow to vectorize.
7014if (LI->hasOneUse())
7015continue;
7016// 1. Check if number of uses equals number of users.
7017// 2. All users are deleted.
7018// 3. The load broadcasts are not allowed or the load is not
7019// broadcasted.
7020if (static_cast<unsignedint>(std::distance(
7021 LI->user_begin(), LI->user_end())) != LI->getNumUses())
7022returnfalse;
7023if (!IsLegalBroadcastLoad)
7024continue;
7025if (LI->hasNUsesOrMore(UsesLimit))
7026returnfalse;
7027for (User *U : LI->users()) {
7028if (auto *UI = dyn_cast<Instruction>(U); UI &&isDeleted(UI))
7029continue;
7030if (const TreeEntry *UTE = getTreeEntry(U)) {
7031for (intI : seq<int>(UTE->getNumOperands())) {
7032if (all_of(UTE->getOperand(I),
7033 [LI](Value *V) { return V == LI; }))
7034// Found legal broadcast - do not vectorize.
7035returnfalse;
7036 }
7037 }
7038 }
7039 }
7040returntrue;
7041 };
7042 AllowToVectorize = CheckIfAllowed(Slice);
7043 }else {
7044 AllowToVectorize =
7045 (NumElts >= 3 ||
7046any_of(ValueToGatherNodes.at(Slice.front()),
7047 [=](const TreeEntry *TE) {
7048 return TE->Scalars.size() == 2 &&
7049 ((TE->Scalars.front() == Slice.front() &&
7050 TE->Scalars.back() == Slice.back()) ||
7051 (TE->Scalars.front() == Slice.back() &&
7052 TE->Scalars.back() == Slice.front()));
7053 })) &&
7054hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
7055 Slice.size());
7056 }
7057if (AllowToVectorize) {
7058SmallVector<Value *> PointerOps;
7059OrdersType CurrentOrder;
7060// Try to build vector load.
7061ArrayRef<Value *> Values(
7062reinterpret_cast<Value *const*>(Slice.begin()), Slice.size());
7063LoadsStateLS =canVectorizeLoads(Values, Slice.front(), CurrentOrder,
7064 PointerOps, &BestVF);
7065if (LS !=LoadsState::Gather ||
7066 (BestVF > 1 &&static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7067if (LS ==LoadsState::ScatterVectorize) {
7068if (MaskedGatherVectorized.empty() ||
7069 Cnt >= MaskedGatherVectorized.back() + NumElts)
7070 MaskedGatherVectorized.push_back(Cnt);
7071continue;
7072 }
7073if (LS !=LoadsState::Gather) {
7074Results.emplace_back(Values, LS);
7075 VectorizedLoads.insert(Slice.begin(), Slice.end());
7076// If we vectorized initial block, no need to try to vectorize it
7077// again.
7078if (Cnt == StartIdx)
7079 StartIdx += NumElts;
7080 }
7081// Check if the whole array was vectorized already - exit.
7082if (StartIdx >= Loads.size())
7083break;
7084// Erase last masked gather candidate, if another candidate within
7085// the range is found to be better.
7086if (!MaskedGatherVectorized.empty() &&
7087 Cnt < MaskedGatherVectorized.back() + NumElts)
7088 MaskedGatherVectorized.pop_back();
7089 Cnt += NumElts - 1;
7090continue;
7091 }
7092 }
7093if (!AllowToVectorize || BestVF == 0)
7094registerNonVectorizableLoads(Slice);
7095 }
7096// Mark masked gathers candidates as vectorized, if any.
7097for (unsigned Cnt : MaskedGatherVectorized) {
7098ArrayRef<LoadInst *> Slice =ArrayRef(Loads).slice(
7099 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
7100ArrayRef<Value *> Values(
7101reinterpret_cast<Value *const*>(Slice.begin()), Slice.size());
7102Results.emplace_back(Values,LoadsState::ScatterVectorize);
7103 VectorizedLoads.insert(Slice.begin(), Slice.end());
7104// If we vectorized initial block, no need to try to vectorize it again.
7105if (Cnt == StartIdx)
7106 StartIdx += NumElts;
7107 }
7108 }
7109for (LoadInst *LI : Loads) {
7110if (!VectorizedLoads.contains(LI))
7111 NonVectorized.push_back(LI);
7112 }
7113returnResults;
7114 };
7115auto ProcessGatheredLoads =
7116 [&, &TTI = *TTI](
7117ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
7118bool Final =false) {
7119SmallVector<LoadInst *> NonVectorized;
7120for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7121if (LoadsDists.size() <= 1) {
7122 NonVectorized.push_back(LoadsDists.back().first);
7123continue;
7124 }
7125SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
7126SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
7127transform(LoadsDists, OriginalLoads.begin(),
7128 [](const std::pair<LoadInst *, int> &L) ->LoadInst * {
7129 return L.first;
7130 });
7131stable_sort(LocalLoadsDists, LoadSorter);
7132SmallVector<LoadInst *> Loads;
7133unsigned MaxConsecutiveDistance = 0;
7134unsigned CurrentConsecutiveDist = 1;
7135int LastDist = LocalLoadsDists.front().second;
7136bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7137for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7138if (getTreeEntry(L.first))
7139continue;
7140assert(LastDist >=L.second &&
7141"Expected first distance always not less than second");
7142if (static_cast<unsigned>(LastDist -L.second) ==
7143 CurrentConsecutiveDist) {
7144 ++CurrentConsecutiveDist;
7145 MaxConsecutiveDistance =
7146 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7147 Loads.push_back(L.first);
7148continue;
7149 }
7150if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7151 !Loads.empty())
7152 Loads.pop_back();
7153 CurrentConsecutiveDist = 1;
7154 LastDist =L.second;
7155 Loads.push_back(L.first);
7156 }
7157if (Loads.size() <= 1)
7158continue;
7159if (AllowMaskedGather)
7160 MaxConsecutiveDistance = Loads.size();
7161elseif (MaxConsecutiveDistance < 2)
7162continue;
7163BoUpSLP::ValueSet VectorizedLoads;
7164SmallVector<LoadInst *> SortedNonVectorized;
7165SmallVector<std::pair<ArrayRef<Value *>,LoadsState>>Results =
7166 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7167 Final, MaxConsecutiveDistance);
7168if (!Results.empty() && !SortedNonVectorized.empty() &&
7169 OriginalLoads.size() == Loads.size() &&
7170 MaxConsecutiveDistance == Loads.size() &&
7171all_of(Results,
7172 [](const std::pair<ArrayRef<Value *>,LoadsState> &P) {
7173returnP.second ==LoadsState::ScatterVectorize;
7174 })) {
7175 VectorizedLoads.clear();
7176SmallVector<LoadInst *> UnsortedNonVectorized;
7177SmallVector<std::pair<ArrayRef<Value *>,LoadsState>>
7178 UnsortedResults =
7179 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7180 UnsortedNonVectorized, Final,
7181 OriginalLoads.size());
7182if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
7183 SortedNonVectorized.swap(UnsortedNonVectorized);
7184Results.swap(UnsortedResults);
7185 }
7186 }
7187for (auto [Slice,_] :Results) {
7188LLVM_DEBUG(dbgs() <<"SLP: Trying to vectorize gathered loads ("
7189 << Slice.size() <<")\n");
7190if (any_of(Slice, [&](Value *V) {return getTreeEntry(V); })) {
7191for (Value *L : Slice)
7192if (!getTreeEntry(L))
7193 SortedNonVectorized.push_back(cast<LoadInst>(L));
7194continue;
7195 }
7196
7197// Select maximum VF as a maximum of user gathered nodes and
7198// distance between scalar loads in these nodes.
7199unsigned MaxVF = Slice.size();
7200unsigned UserMaxVF = 0;
7201unsigned InterleaveFactor = 0;
7202if (MaxVF == 2) {
7203 UserMaxVF = MaxVF;
7204 }else {
7205// Found distance between segments of the interleaved loads.
7206 std::optional<unsigned> InterleavedLoadsDistance = 0;
7207unsigned Order = 0;
7208 std::optional<unsigned> CommonVF = 0;
7209DenseMap<const TreeEntry *, unsigned> EntryToPosition;
7210SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
7211for (auto [Idx, V] :enumerate(Slice)) {
7212for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
7213 UserMaxVF = std::max<unsigned>(UserMaxVF,E->Scalars.size());
7214unsigned Pos =
7215 EntryToPosition.try_emplace(E,Idx).first->second;
7216 UserMaxVF = std::max<unsigned>(UserMaxVF,Idx - Pos + 1);
7217if (CommonVF) {
7218if (*CommonVF == 0) {
7219 CommonVF =E->Scalars.size();
7220continue;
7221 }
7222if (*CommonVF !=E->Scalars.size())
7223 CommonVF.reset();
7224 }
7225// Check if the load is the part of the interleaved load.
7226if (Pos !=Idx && InterleavedLoadsDistance) {
7227if (!DeinterleavedNodes.contains(E) &&
7228any_of(E->Scalars, [&, Slice = Slice](Value *V) {
7229 if (isa<Constant>(V))
7230 return false;
7231 if (getTreeEntry(V))
7232 return true;
7233 const auto &Nodes = ValueToGatherNodes.at(V);
7234 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7235 !is_contained(Slice, V);
7236 })) {
7237 InterleavedLoadsDistance.reset();
7238continue;
7239 }
7240 DeinterleavedNodes.insert(E);
7241if (*InterleavedLoadsDistance == 0) {
7242 InterleavedLoadsDistance =Idx - Pos;
7243continue;
7244 }
7245if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7246 (Idx - Pos) / *InterleavedLoadsDistance < Order)
7247 InterleavedLoadsDistance.reset();
7248 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7249 }
7250 }
7251 }
7252 DeinterleavedNodes.clear();
7253// Check if the large load represents interleaved load operation.
7254if (InterleavedLoadsDistance.value_or(0) > 1 &&
7255 CommonVF.value_or(0) != 0) {
7256 InterleaveFactor =bit_ceil(*InterleavedLoadsDistance);
7257unsigned VF = *CommonVF;
7258OrdersType Order;
7259SmallVector<Value *> PointerOps;
7260// Segmented load detected - vectorize at maximum vector factor.
7261if (InterleaveFactor <= Slice.size() &&
7262TTI.isLegalInterleavedAccessType(
7263getWidenedType(Slice.front()->getType(), VF),
7264 InterleaveFactor,
7265 cast<LoadInst>(Slice.front())->getAlign(),
7266 cast<LoadInst>(Slice.front())
7267 ->getPointerAddressSpace()) &&
7268canVectorizeLoads(Slice, Slice.front(), Order,
7269 PointerOps) ==LoadsState::Vectorize) {
7270 UserMaxVF = InterleaveFactor * VF;
7271 }else {
7272 InterleaveFactor = 0;
7273 }
7274 }
7275// Cannot represent the loads as consecutive vectorizable nodes -
7276// just exit.
7277unsigned ConsecutiveNodesSize = 0;
7278if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
7279any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7280 [&, Slice = Slice](constauto &P) {
7281constauto *It =find_if(Slice, [&](Value *V) {
7282return std::get<1>(P).contains(V);
7283 });
7284if (It == Slice.end())
7285returnfalse;
7286ArrayRef<Value *> VL =
7287 VectorizableTree[std::get<0>(P)]->Scalars;
7288 ConsecutiveNodesSize += VL.size();
7289unsigned Start = std::distance(Slice.begin(), It);
7290unsigned Sz = Slice.size() - Start;
7291return Sz < VL.size() ||
7292 Slice.slice(std::distance(Slice.begin(), It),
7293 VL.size()) != VL;
7294 }))
7295continue;
7296// Try to build long masked gather loads.
7297 UserMaxVF =bit_ceil(UserMaxVF);
7298if (InterleaveFactor == 0 &&
7299any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7300 [&, Slice = Slice](unsignedIdx) {
7301 OrdersType Order;
7302 SmallVector<Value *> PointerOps;
7303 return canVectorizeLoads(
7304 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7305 Slice[Idx * UserMaxVF], Order,
7306 PointerOps) ==
7307 LoadsState::ScatterVectorize;
7308 }))
7309 UserMaxVF = MaxVF;
7310if (Slice.size() != ConsecutiveNodesSize)
7311 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7312 }
7313for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7314bool IsVectorized =true;
7315for (unsignedI = 0,E = Slice.size();I <E;I += VF) {
7316ArrayRef<Value *> SubSlice =
7317 Slice.slice(I, std::min(VF,E -I));
7318if (getTreeEntry(SubSlice.front()))
7319continue;
 7320// Check if the subslice belongs to a to-be-vectorized entry which is not
 7321// equal to this entry.
7322if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7323 [&](constauto &P) {
7324return !SubSlice.equals(
7325 VectorizableTree[std::get<0>(P)]
7326 ->Scalars) &&
7327set_is_subset(SubSlice, std::get<1>(P));
7328 }))
7329continue;
7330unsigned Sz = VectorizableTree.size();
7331 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7332if (Sz == VectorizableTree.size()) {
7333 IsVectorized =false;
7334// Try non-interleaved vectorization with smaller vector
7335// factor.
7336if (InterleaveFactor > 0) {
7337 VF = 2 * (MaxVF / InterleaveFactor);
7338 InterleaveFactor = 0;
7339 }
7340continue;
7341 }
7342 }
7343if (IsVectorized)
7344break;
7345 }
7346 }
7347 NonVectorized.append(SortedNonVectorized);
7348 }
7349return NonVectorized;
7350 };
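  // First process each group of gathered loads as collected. If some loads
  // are still not vectorized and the target supports masked gathers, regroup
  // the leftovers by base pointer and run one final attempt.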
7351for (constauto &GLs : GatheredLoads) {
7352constauto &Ref = GLs.second;
7353SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
7354if (!Ref.empty() && !NonVectorized.empty() &&
7355 std::accumulate(
7356Ref.begin(),Ref.end(), 0u,
7357 [](unsigned S,
7358ArrayRef<std::pair<LoadInst *, int>> LoadsDists) ->unsigned {
7359 return S + LoadsDists.size();
7360 }) != NonVectorized.size() &&
7361 IsMaskedGatherSupported(NonVectorized)) {
7362SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
7363for (LoadInst *LI : NonVectorized) {
 7364// Reinsert the non-vectorized loads into another list of loads with the
 7365// same base pointers.
7366gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
7367 FinalGatheredLoads,
7368/*AddNew=*/false);
7369 }
7370// Final attempt to vectorize non-vectorized loads.
7371 (void)ProcessGatheredLoads(FinalGatheredLoads,/*Final=*/true);
7372 }
7373 }
7374// Try to vectorize postponed load entries, previously marked as gathered.
7375for (unsignedIdx : LoadEntriesToVectorize) {
7376const TreeEntry &E = *VectorizableTree[Idx];
7377SmallVector<Value *> GatheredScalars(E.Scalars.begin(),E.Scalars.end());
7378// Avoid reordering, if possible.
7379if (!E.ReorderIndices.empty()) {
7380// Build a mask out of the reorder indices and reorder scalars per this
7381// mask.
7382SmallVector<int> ReorderMask;
7383inversePermutation(E.ReorderIndices, ReorderMask);
7384reorderScalars(GatheredScalars, ReorderMask);
7385 }
7386 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7387 }
 7388// If no new entries were created, treat it as if no gathered-load entries
 7389// need to be handled.
7390if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7391 VectorizableTree.size())
7392 GatheredLoadsEntriesFirst.reset();
7393}
7394
7395/// \return true if the specified list of values has only one instruction that
7396/// requires scheduling, false otherwise.
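/// E.g., it returns true when exactly one value in \p VL needs scheduling and
/// all the others are skipped by doesNotNeedToBeScheduled().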
7397#ifndef NDEBUG
7398static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
7399  Value *NeedsScheduling = nullptr;
7400  for (Value *V : VL) {
7401    if (doesNotNeedToBeScheduled(V))
7402      continue;
7403    if (!NeedsScheduling) {
7404      NeedsScheduling = V;
7405      continue;
7406    }
7407    return false;
7408  }
7409  return NeedsScheduling;
7410}
7411#endif
7412
7413/// Generates a key/subkey pair for the given value to provide effective
7414/// sorting of the values and better detection of vectorizable value
7415/// sequences. The keys/subkeys can be used for sorting the values themselves
7416/// (keys) and within value subgroups (subkeys).
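/// For example, simple loads share a key built from their type and the Load
/// opcode and get their subkey from \p LoadsSubkeyGenerator (so loads from
/// nearby addresses can be grouped), while compares fold the predicate, its
/// swapped form and the operand type into the subkey.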
7417static std::pair<size_t, size_t>generateKeySubkey(
7418Value *V,constTargetLibraryInfo *TLI,
7419function_ref<hash_code(size_t,LoadInst *)> LoadsSubkeyGenerator,
7420bool AllowAlternate) {
7421hash_code Key =hash_value(V->getValueID() + 2);
7422hash_code SubKey =hash_value(0);
7423// Sort the loads by the distance between the pointers.
7424if (auto *LI = dyn_cast<LoadInst>(V)) {
7425 Key =hash_combine(LI->getType(),hash_value(Instruction::Load), Key);
7426if (LI->isSimple())
7427 SubKey =hash_value(LoadsSubkeyGenerator(Key, LI));
7428else
7429 Key = SubKey =hash_value(LI);
7430 }elseif (isVectorLikeInstWithConstOps(V)) {
7431// Sort extracts by the vector operands.
7432if (isa<ExtractElementInst, UndefValue>(V))
7433 Key =hash_value(Value::UndefValueVal + 1);
7434if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
7435if (!isUndefVector(EI->getVectorOperand()).all() &&
7436 !isa<UndefValue>(EI->getIndexOperand()))
7437 SubKey =hash_value(EI->getVectorOperand());
7438 }
7439 }elseif (auto *I = dyn_cast<Instruction>(V)) {
7440// Sort other instructions just by the opcodes except for CMPInst.
7441// For CMP also sort by the predicate kind.
7442if ((isa<BinaryOperator, CastInst>(I)) &&
7443isValidForAlternation(I->getOpcode())) {
7444if (AllowAlternate)
7445 Key =hash_value(isa<BinaryOperator>(I) ? 1 : 0);
7446else
7447 Key =hash_combine(hash_value(I->getOpcode()), Key);
7448 SubKey =hash_combine(
7449hash_value(I->getOpcode()),hash_value(I->getType()),
7450hash_value(isa<BinaryOperator>(I)
7451 ?I->getType()
7452 : cast<CastInst>(I)->getOperand(0)->getType()));
7453// For casts, look through the only operand to improve compile time.
7454if (isa<CastInst>(I)) {
7455 std::pair<size_t, size_t> OpVals =
7456generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
7457/*AllowAlternate=*/true);
7458 Key =hash_combine(OpVals.first, Key);
7459 SubKey =hash_combine(OpVals.first, SubKey);
7460 }
7461 }elseif (auto *CI = dyn_cast<CmpInst>(I)) {
7462CmpInst::Predicate Pred = CI->getPredicate();
7463if (CI->isCommutative())
7464 Pred = std::min(Pred,CmpInst::getInversePredicate(Pred));
7465CmpInst::Predicate SwapPred =CmpInst::getSwappedPredicate(Pred);
7466 SubKey =hash_combine(hash_value(I->getOpcode()),hash_value(Pred),
7467hash_value(SwapPred),
7468hash_value(CI->getOperand(0)->getType()));
7469 }elseif (auto *Call = dyn_cast<CallInst>(I)) {
7470Intrinsic::IDID =getVectorIntrinsicIDForCall(Call, TLI);
7471if (isTriviallyVectorizable(ID)) {
7472 SubKey =hash_combine(hash_value(I->getOpcode()),hash_value(ID));
7473 }elseif (!VFDatabase(*Call).getMappings(*Call).empty()) {
7474 SubKey =hash_combine(hash_value(I->getOpcode()),
7475hash_value(Call->getCalledFunction()));
7476 }else {
7477 Key =hash_combine(hash_value(Call), Key);
7478 SubKey =hash_combine(hash_value(I->getOpcode()),hash_value(Call));
7479 }
7480for (constCallBase::BundleOpInfo &Op : Call->bundle_op_infos())
7481 SubKey =hash_combine(hash_value(Op.Begin),hash_value(Op.End),
7482hash_value(Op.Tag), SubKey);
7483 }elseif (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
7484if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7485 SubKey =hash_value(Gep->getPointerOperand());
7486else
7487 SubKey =hash_value(Gep);
7488 }elseif (BinaryOperator::isIntDivRem(I->getOpcode()) &&
7489 !isa<ConstantInt>(I->getOperand(1))) {
7490// Do not try to vectorize instructions with potentially high cost.
7491 SubKey =hash_value(I);
7492 }else {
7493 SubKey =hash_value(I->getOpcode());
7494 }
7495 Key =hash_combine(hash_value(I->getParent()), Key);
7496 }
7497return std::make_pair(Key, SubKey);
7498}
7499
7500/// Checks if the specified instruction \p I is an alternate operation for
7501/// the given \p MainOp and \p AltOp instructions.
7502static bool isAlternateInstruction(const Instruction *I,
7503                                   const Instruction *MainOp,
7504                                   const Instruction *AltOp,
7505                                   const TargetLibraryInfo &TLI);
7506
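// An alternate node mixes two opcodes, e.g. (IR sketch, not from the source):
//   %a = fadd float %x0, %y0
//   %b = fsub float %x1, %y1
// which can be vectorized as one vector fadd, one vector fsub and a
// shufflevector blending their lanes. The routine below estimates whether the
// operands make such a node profitable compared to keeping the scalars and
// building the vectors from them.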
7507bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
7508ArrayRef<Value *> VL) const{
7509unsigned Opcode0 = S.getOpcode();
7510unsigned Opcode1 = S.getAltOpcode();
7511SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
7512// If this pattern is supported by the target then consider it profitable.
7513if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
7514 Opcode0, Opcode1, OpcodeMask))
7515returntrue;
7516SmallVector<ValueList>Operands;
7517for (unsignedI : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7518Operands.emplace_back();
7519// Prepare the operand vector.
7520for (Value *V : VL) {
7521if (isa<PoisonValue>(V)) {
7522Operands.back().push_back(
7523PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
7524continue;
7525 }
7526Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
7527 }
7528 }
7529if (Operands.size() == 2) {
 7530// Try to find the best operand candidates.
7531for (unsignedI : seq<unsigned>(0, VL.size() - 1)) {
7532SmallVector<std::pair<Value *, Value *>> Candidates(3);
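      // Candidate 0 keeps the current pairing of the two operand rows;
      // candidates 1 and 2 test swapping lane I+1 or lane I between the rows,
      // and the best-scoring option is applied in the switch below.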
7533 Candidates[0] = std::make_pair(Operands[0][I],Operands[0][I + 1]);
7534 Candidates[1] = std::make_pair(Operands[0][I],Operands[1][I + 1]);
7535 Candidates[2] = std::make_pair(Operands[1][I],Operands[0][I + 1]);
7536 std::optional<int> Res =findBestRootPair(Candidates);
7537switch (Res.value_or(0)) {
7538case 0:
7539break;
7540case 1:
7541std::swap(Operands[0][I + 1],Operands[1][I + 1]);
7542break;
7543case 2:
7544std::swap(Operands[0][I],Operands[1][I]);
7545break;
7546default:
7547llvm_unreachable("Unexpected index.");
7548 }
7549 }
7550 }
7551DenseSet<unsigned> UniqueOpcodes;
7552constexprunsigned NumAltInsts = 3;// main + alt + shuffle.
7553unsigned NonInstCnt = 0;
 7554// Estimate the number of instructions required for the vectorized node and
 7555// for the buildvector node.
7556unsigned UndefCnt = 0;
7557// Count the number of extra shuffles, required for vector nodes.
7558unsigned ExtraShuffleInsts = 0;
 7559// Check that the operands do not contain the same values and form either a
 7560// perfect diamond match or a shuffled match.
7561if (Operands.size() == 2) {
7562// Do not count same operands twice.
7563if (Operands.front() ==Operands.back()) {
7564Operands.erase(Operands.begin());
7565 }elseif (!allConstant(Operands.front()) &&
7566all_of(Operands.front(), [&](Value *V) {
7567 return is_contained(Operands.back(), V);
7568 })) {
7569Operands.erase(Operands.begin());
7570 ++ExtraShuffleInsts;
7571 }
7572 }
7573constLoop *L = LI->getLoopFor(S.getMainOp()->getParent());
 7574// Vectorize the node if:
 7575// 1. At least a single operand is constant or a splat.
 7576// 2. Operands have many loop invariants (while the instructions themselves
 7577// are not loop invariant).
 7578// 3. At least a single unique operand is supposed to be vectorized.
7579returnnone_of(Operands,
7580 [&](ArrayRef<Value *>Op) {
7581if (allConstant(Op) ||
7582 (!isSplat(Op) &&allSameBlock(Op) &&allSameType(Op) &&
7583getSameOpcode(Op, *TLI)))
7584returnfalse;
7585DenseMap<Value *, unsigned> Uniques;
7586for (Value *V :Op) {
7587if (isa<Constant, ExtractElementInst>(V) ||
7588 getTreeEntry(V) || (L &&L->isLoopInvariant(V))) {
7589 if (isa<UndefValue>(V))
7590 ++UndefCnt;
7591 continue;
7592 }
7593auto Res = Uniques.try_emplace(V, 0);
7594// Found first duplicate - need to add shuffle.
7595if (!Res.second && Res.first->second == 1)
7596 ++ExtraShuffleInsts;
7597 ++Res.first->getSecond();
7598if (auto *I = dyn_cast<Instruction>(V))
7599 UniqueOpcodes.insert(I->getOpcode());
7600elseif (Res.second)
7601 ++NonInstCnt;
7602 }
7603returnnone_of(Uniques, [&](constauto &P) {
7604returnP.first->hasNUsesOrMore(P.second + 1) &&
7605none_of(P.first->users(), [&](User *U) {
7606 return getTreeEntry(U) || Uniques.contains(U);
7607 });
7608 });
7609 }) ||
 7610// Do not vectorize the node if the estimated number of vector instructions
 7611// is greater than the estimated number of buildvector instructions. The
 7612// number of vector operands is the number of vector instructions + the
 7613// number of vector instructions for the operands (buildvectors). The number
 7614// of buildvector instructions is just number_of_operands * number_of_scalars.
7615 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7616 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
7617 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7618}
7619
7620BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7621const InstructionsState &S,ArrayRef<Value *> VL,
7622bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7623SmallVectorImpl<Value *> &PointerOps) {
7624assert(S.getMainOp() &&
7625"Expected instructions with same/alternate opcodes only.");
7626
7627unsigned ShuffleOrOp =
7628 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
7629Instruction *VL0 = S.getMainOp();
7630switch (ShuffleOrOp) {
7631case Instruction::PHI: {
7632// Too many operands - gather, most probably won't be vectorized.
7633if (VL0->getNumOperands() >MaxPHINumOperands)
7634return TreeEntry::NeedToGather;
7635// Check for terminator values (e.g. invoke).
7636for (Value *V : VL) {
7637auto *PHI = dyn_cast<PHINode>(V);
7638if (!PHI)
7639continue;
7640for (Value *Incoming :PHI->incoming_values()) {
7641Instruction *Term = dyn_cast<Instruction>(Incoming);
7642if (Term &&Term->isTerminator()) {
7643LLVM_DEBUG(dbgs()
7644 <<"SLP: Need to swizzle PHINodes (terminator use).\n");
7645return TreeEntry::NeedToGather;
7646 }
7647 }
7648 }
7649
7650return TreeEntry::Vectorize;
7651 }
7652case Instruction::ExtractValue:
7653case Instruction::ExtractElement: {
7654bool Reuse = canReuseExtract(VL, CurrentOrder);
7655// FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
7656// non-full registers).
7657if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
7658return TreeEntry::NeedToGather;
7659if (Reuse || !CurrentOrder.empty())
7660return TreeEntry::Vectorize;
7661LLVM_DEBUG(dbgs() <<"SLP: Gather extract sequence.\n");
7662return TreeEntry::NeedToGather;
7663 }
7664case Instruction::InsertElement: {
7665// Check that we have a buildvector and not a shuffle of 2 or more
7666// different vectors.
7667ValueSet SourceVectors;
7668for (Value *V : VL) {
7669 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
7670assert(getElementIndex(V) != std::nullopt &&
7671"Non-constant or undef index?");
7672 }
7673
7674if (count_if(VL, [&SourceVectors](Value *V) {
7675return !SourceVectors.contains(V);
7676 }) >= 2) {
7677// Found 2nd source vector - cancel.
7678LLVM_DEBUG(dbgs() <<"SLP: Gather of insertelement vectors with "
7679"different source vectors.\n");
7680return TreeEntry::NeedToGather;
7681 }
7682
7683if (any_of(VL, [&SourceVectors](Value *V) {
7684// The last InsertElement can have multiple uses.
7685return SourceVectors.contains(V) && !V->hasOneUse();
7686 })) {
7687assert(SLPReVec &&"Only supported by REVEC.");
7688LLVM_DEBUG(dbgs() <<"SLP: Gather of insertelement vectors with "
7689"multiple uses.\n");
7690return TreeEntry::NeedToGather;
7691 }
7692
7693return TreeEntry::Vectorize;
7694 }
7695case Instruction::Load: {
7696// Check that a vectorized load would load the same memory as a scalar
7697// load. For example, we don't want to vectorize loads that are smaller
7698// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7699// treats loading/storing it as an i8 struct. If we vectorize loads/stores
7700// from such a struct, we read/write packed bits disagreeing with the
7701// unvectorized version.
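// E.g., a scalar i2 occupies a whole byte in memory (its alloc size), while
// four i2 elements in a vector are packed into a single byte, so the
// vectorized load would read different bits than the scalar loads.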
7702switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
7703caseLoadsState::Vectorize:
7704return TreeEntry::Vectorize;
7705caseLoadsState::ScatterVectorize:
7706if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7707// Delay slow vectorized nodes for better vectorization attempts.
7708 LoadEntriesToVectorize.insert(VectorizableTree.size());
7709return TreeEntry::NeedToGather;
7710 }
7711return TreeEntry::ScatterVectorize;
7712caseLoadsState::StridedVectorize:
7713if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7714// Delay slow vectorized nodes for better vectorization attempts.
7715 LoadEntriesToVectorize.insert(VectorizableTree.size());
7716return TreeEntry::NeedToGather;
7717 }
7718return TreeEntry::StridedVectorize;
7719caseLoadsState::Gather:
7720#ifndef NDEBUG
7721Type *ScalarTy = VL0->getType();
7722if (DL->getTypeSizeInBits(ScalarTy) !=
7723DL->getTypeAllocSizeInBits(ScalarTy))
7724LLVM_DEBUG(dbgs() <<"SLP: Gathering loads of non-packed type.\n");
7725elseif (any_of(VL, [](Value *V) {
7726auto *LI = dyn_cast<LoadInst>(V);
7727return !LI || !LI->isSimple();
7728 }))
7729LLVM_DEBUG(dbgs() <<"SLP: Gathering non-simple loads.\n");
7730else
7731LLVM_DEBUG(dbgs() <<"SLP: Gathering non-consecutive loads.\n");
7732#endif// NDEBUG
7733registerNonVectorizableLoads(VL);
7734return TreeEntry::NeedToGather;
7735 }
7736llvm_unreachable("Unexpected state of loads");
7737 }
7738case Instruction::ZExt:
7739case Instruction::SExt:
7740case Instruction::FPToUI:
7741case Instruction::FPToSI:
7742case Instruction::FPExt:
7743case Instruction::PtrToInt:
7744case Instruction::IntToPtr:
7745case Instruction::SIToFP:
7746case Instruction::UIToFP:
7747case Instruction::Trunc:
7748case Instruction::FPTrunc:
7749case Instruction::BitCast: {
7750Type *SrcTy = VL0->getOperand(0)->getType();
7751for (Value *V : VL) {
7752if (isa<PoisonValue>(V))
7753continue;
7754Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7755if (Ty != SrcTy || !isValidElementType(Ty)) {
7756LLVM_DEBUG(
7757dbgs() <<"SLP: Gathering casts with different src types.\n");
7758return TreeEntry::NeedToGather;
7759 }
7760 }
7761return TreeEntry::Vectorize;
7762 }
7763case Instruction::ICmp:
7764case Instruction::FCmp: {
7765// Check that all of the compares have the same predicate.
7766CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7767CmpInst::Predicate SwapP0 =CmpInst::getSwappedPredicate(P0);
7768Type *ComparedTy = VL0->getOperand(0)->getType();
7769for (Value *V : VL) {
7770if (isa<PoisonValue>(V))
7771continue;
7772auto *Cmp = cast<CmpInst>(V);
7773if ((Cmp->getPredicate() != P0 &&Cmp->getPredicate() != SwapP0) ||
7774Cmp->getOperand(0)->getType() != ComparedTy) {
7775LLVM_DEBUG(dbgs() <<"SLP: Gathering cmp with different predicate.\n");
7776return TreeEntry::NeedToGather;
7777 }
7778 }
7779return TreeEntry::Vectorize;
7780 }
7781case Instruction::Select:
7782case Instruction::FNeg:
7783case Instruction::Add:
7784case Instruction::FAdd:
7785case Instruction::Sub:
7786case Instruction::FSub:
7787case Instruction::Mul:
7788case Instruction::FMul:
7789case Instruction::UDiv:
7790case Instruction::SDiv:
7791case Instruction::FDiv:
7792case Instruction::URem:
7793case Instruction::SRem:
7794case Instruction::FRem:
7795case Instruction::Shl:
7796case Instruction::LShr:
7797case Instruction::AShr:
7798case Instruction::And:
7799case Instruction::Or:
7800case Instruction::Xor:
7801case Instruction::Freeze:
7802if (S.getMainOp()->getType()->isFloatingPointTy() &&
7803TTI->isFPVectorizationPotentiallyUnsafe() &&any_of(VL, [](Value *V) {
7804auto *I = dyn_cast<Instruction>(V);
7805returnI &&I->isBinaryOp() && !I->isFast();
7806 }))
7807return TreeEntry::NeedToGather;
7808return TreeEntry::Vectorize;
7809case Instruction::GetElementPtr: {
7810// We don't combine GEPs with complicated (nested) indexing.
7811for (Value *V : VL) {
7812auto *I = dyn_cast<GetElementPtrInst>(V);
7813if (!I)
7814continue;
7815if (I->getNumOperands() != 2) {
7816LLVM_DEBUG(dbgs() <<"SLP: not-vectorizable GEP (nested indexes).\n");
7817return TreeEntry::NeedToGather;
7818 }
7819 }
7820
7821// We can't combine several GEPs into one vector if they operate on
7822// different types.
7823Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7824for (Value *V : VL) {
7825auto *GEP = dyn_cast<GEPOperator>(V);
7826if (!GEP)
7827continue;
7828Type *CurTy =GEP->getSourceElementType();
7829if (Ty0 != CurTy) {
7830LLVM_DEBUG(dbgs() <<"SLP: not-vectorizable GEP (different types).\n");
7831return TreeEntry::NeedToGather;
7832 }
7833 }
7834
7835// We don't combine GEPs with non-constant indexes.
7836Type *Ty1 = VL0->getOperand(1)->getType();
7837for (Value *V : VL) {
7838auto *I = dyn_cast<GetElementPtrInst>(V);
7839if (!I)
7840continue;
7841auto *Op =I->getOperand(1);
7842if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7843 (Op->getType() != Ty1 &&
7844 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7845Op->getType()->getScalarSizeInBits() >
7846DL->getIndexSizeInBits(
7847V->getType()->getPointerAddressSpace())))) {
7848LLVM_DEBUG(
7849dbgs() <<"SLP: not-vectorizable GEP (non-constant indexes).\n");
7850return TreeEntry::NeedToGather;
7851 }
7852 }
7853
7854return TreeEntry::Vectorize;
7855 }
7856case Instruction::Store: {
7857// Check if the stores are consecutive or if we need to swizzle them.
7858llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7859// Avoid types that are padded when being allocated as scalars, while
7860// being packed together in a vector (such as i1).
7861if (DL->getTypeSizeInBits(ScalarTy) !=
7862DL->getTypeAllocSizeInBits(ScalarTy)) {
7863LLVM_DEBUG(dbgs() <<"SLP: Gathering stores of non-packed type.\n");
7864return TreeEntry::NeedToGather;
7865 }
7866// Make sure all stores in the bundle are simple - we can't vectorize
7867// atomic or volatile stores.
7868for (Value *V : VL) {
7869auto *SI = cast<StoreInst>(V);
7870if (!SI->isSimple()) {
7871LLVM_DEBUG(dbgs() <<"SLP: Gathering non-simple stores.\n");
7872return TreeEntry::NeedToGather;
7873 }
7874 PointerOps.push_back(SI->getPointerOperand());
7875 }
7876
7877// Check the order of pointer operands.
7878if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
7879Value *Ptr0;
7880Value *PtrN;
7881if (CurrentOrder.empty()) {
7882 Ptr0 = PointerOps.front();
7883 PtrN = PointerOps.back();
7884 }else {
7885 Ptr0 = PointerOps[CurrentOrder.front()];
7886 PtrN = PointerOps[CurrentOrder.back()];
7887 }
7888 std::optional<int> Dist =
7889getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7890// Check that the sorted pointer operands are consecutive.
7891if (static_cast<unsigned>(*Dist) == VL.size() - 1)
7892return TreeEntry::Vectorize;
7893 }
7894
7895LLVM_DEBUG(dbgs() <<"SLP: Non-consecutive store.\n");
7896return TreeEntry::NeedToGather;
7897 }
7898case Instruction::Call: {
7899if (S.getMainOp()->getType()->isFloatingPointTy() &&
7900TTI->isFPVectorizationPotentiallyUnsafe() &&any_of(VL, [](Value *V) {
7901auto *I = dyn_cast<Instruction>(V);
7902returnI && !I->isFast();
7903 }))
7904return TreeEntry::NeedToGather;
7905// Check if the calls are all to the same vectorizable intrinsic or
7906// library function.
7907CallInst *CI = cast<CallInst>(VL0);
7908Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
7909
7910VFShape Shape =VFShape::get(
7911 CI->getFunctionType(),
7912ElementCount::getFixed(static_cast<unsignedint>(VL.size())),
7913false/*HasGlobalPred*/);
7914Function *VecFunc =VFDatabase(*CI).getVectorizedFunction(Shape);
7915
7916if (!VecFunc && !isTriviallyVectorizable(ID)) {
7917LLVM_DEBUG(dbgs() <<"SLP: Non-vectorizable call.\n");
7918return TreeEntry::NeedToGather;
7919 }
7920Function *F = CI->getCalledFunction();
7921unsigned NumArgs = CI->arg_size();
7922SmallVector<Value *, 4> ScalarArgs(NumArgs,nullptr);
7923for (unsigned J = 0; J != NumArgs; ++J)
7924if (isVectorIntrinsicWithScalarOpAtArg(ID, J,TTI))
7925 ScalarArgs[J] = CI->getArgOperand(J);
7926for (Value *V : VL) {
7927CallInst *CI2 = dyn_cast<CallInst>(V);
7928if (!CI2 || CI2->getCalledFunction() !=F ||
7929getVectorIntrinsicIDForCall(CI2, TLI) !=ID ||
7930 (VecFunc &&
7931 VecFunc !=VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
7932 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
7933LLVM_DEBUG(dbgs() <<"SLP: mismatched calls:" << *CI <<"!=" << *V
7934 <<"\n");
7935return TreeEntry::NeedToGather;
7936 }
 7937// Some intrinsics have scalar arguments, and those arguments must be the
 7938// same across all calls for them to be vectorized.
7939for (unsigned J = 0; J != NumArgs; ++J) {
7940if (isVectorIntrinsicWithScalarOpAtArg(ID, J,TTI)) {
7941Value *A1J = CI2->getArgOperand(J);
7942if (ScalarArgs[J] != A1J) {
7943LLVM_DEBUG(dbgs()
7944 <<"SLP: mismatched arguments in call:" << *CI
7945 <<" argument " << ScalarArgs[J] <<"!=" << A1J <<"\n");
7946return TreeEntry::NeedToGather;
7947 }
7948 }
7949 }
7950// Verify that the bundle operands are identical between the two calls.
7951if (CI->hasOperandBundles() &&
7952 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
7953 CI->op_begin() + CI->getBundleOperandsEndIndex(),
7954 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
7955LLVM_DEBUG(dbgs() <<"SLP: mismatched bundle operands in calls:" << *CI
7956 <<"!=" << *V <<'\n');
7957return TreeEntry::NeedToGather;
7958 }
7959 }
7960
7961return TreeEntry::Vectorize;
7962 }
7963case Instruction::ShuffleVector: {
7964if (!S.isAltShuffle()) {
 7965// REVEC can support non-alternate shuffles.
7966if (SLPReVec &&getShufflevectorNumGroups(VL))
7967return TreeEntry::Vectorize;
 7968// If this is not an alternate sequence of opcodes like add-sub
 7969// then do not vectorize this instruction.
7970LLVM_DEBUG(dbgs() <<"SLP: ShuffleVector are not vectorized.\n");
7971return TreeEntry::NeedToGather;
7972 }
7973if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
7974LLVM_DEBUG(
7975dbgs()
7976 <<"SLP: ShuffleVector not vectorized, operands are buildvector and "
7977"the whole alt sequence is not profitable.\n");
7978return TreeEntry::NeedToGather;
7979 }
7980
7981return TreeEntry::Vectorize;
7982 }
7983default:
7984LLVM_DEBUG(dbgs() <<"SLP: Gathering unknown instruction.\n");
7985return TreeEntry::NeedToGather;
7986 }
7987}
7988
7989namespace{
7990/// Helps to correctly handle the operands of phi nodes, based on the \p Main
7991/// PHINode's order of incoming basic blocks/values.
7992class PHIHandler {
7993DominatorTree &DT;
7994PHINode *Main =nullptr;
7995SmallVector<Value *> Phis;
7996SmallVector<SmallVector<Value *>>Operands;
7997
7998public:
7999 PHIHandler() =delete;
8000 PHIHandler(DominatorTree &DT,PHINode *Main,ArrayRef<Value *> Phis)
8001 : DT(DT), Main(Main), Phis(Phis),
8002Operands(Main->getNumIncomingValues(),
8003SmallVector<Value *>(Phis.size(), nullptr)) {}
8004void buildOperands() {
8005constexprunsigned FastLimit = 4;
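    // With up to FastLimit incoming values the operands are filled directly
    // via getIncomingValueForBlock(); larger PHIs are handled below by
    // grouping the incoming blocks first to avoid repeated linear lookups.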
8006if (Main->getNumIncomingValues() <= FastLimit) {
8007for (unsignedI : seq<unsigned>(0, Main->getNumIncomingValues())) {
8008BasicBlock *InBB = Main->getIncomingBlock(I);
8009if (!DT.isReachableFromEntry(InBB)) {
8010Operands[I].assign(Phis.size(),PoisonValue::get(Main->getType()));
8011continue;
8012 }
8013// Prepare the operand vector.
8014for (auto [Idx, V] :enumerate(Phis)) {
8015auto *P = dyn_cast<PHINode>(V);
8016if (!P) {
8017assert(isa<PoisonValue>(V) &&
8018"Expected isa instruction or poison value.");
8019Operands[I][Idx] =V;
8020continue;
8021 }
8022if (P->getIncomingBlock(I) == InBB)
8023Operands[I][Idx] =P->getIncomingValue(I);
8024else
8025Operands[I][Idx] =P->getIncomingValueForBlock(InBB);
8026 }
8027 }
8028return;
8029 }
8030SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4>Blocks;
8031for (unsignedI : seq<unsigned>(0, Main->getNumIncomingValues())) {
8032BasicBlock *InBB = Main->getIncomingBlock(I);
8033if (!DT.isReachableFromEntry(InBB)) {
8034Operands[I].assign(Phis.size(),PoisonValue::get(Main->getType()));
8035continue;
8036 }
8037Blocks.try_emplace(InBB).first->second.push_back(I);
8038 }
8039for (auto [Idx, V] :enumerate(Phis)) {
8040if (isa<PoisonValue>(V)) {
8041for (unsignedI : seq<unsigned>(Main->getNumIncomingValues()))
8042Operands[I][Idx] =V;
8043continue;
8044 }
8045auto *P = cast<PHINode>(V);
8046for (unsignedI : seq<unsigned>(0,P->getNumIncomingValues())) {
8047BasicBlock *InBB =P->getIncomingBlock(I);
8048if (InBB == Main->getIncomingBlock(I)) {
8049if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
8050continue;
8051Operands[I][Idx] =P->getIncomingValue(I);
8052continue;
8053 }
8054auto It =Blocks.find(InBB);
8055if (It ==Blocks.end())
8056continue;
8057Operands[It->second.front()][Idx] =P->getIncomingValue(I);
8058 }
8059 }
8060for (constauto &P :Blocks) {
8061if (P.getSecond().size() <= 1)
8062continue;
8063unsigned BasicI =P.getSecond().front();
8064for (unsignedI :ArrayRef(P.getSecond()).drop_front()) {
8065assert(all_of(enumerate(Operands[I]),
8066 [&](constauto &Data) {
8067return !Data.value() ||
8068 Data.value() ==Operands[BasicI][Data.index()];
8069 }) &&
8070"Expected empty operands list.");
8071Operands[I] =Operands[BasicI];
8072 }
8073 }
8074 }
8075ArrayRef<Value *>getOperands(unsignedI) const{returnOperands[I]; }
8076};
8077}// namespace
8078
8079void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL,unsignedDepth,
8080const EdgeInfo &UserTreeIdx,
8081unsigned InterleaveFactor) {
8082assert((allConstant(VL) ||allSameType(VL)) &&"Invalid types!");
8083
8084SmallVector<int> ReuseShuffleIndices;
8085SmallVector<Value *> UniqueValues;
8086SmallVector<Value *> NonUniqueValueVL;
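  // Detect repeated scalars in VL: on success, ReuseShuffleIndices is filled
  // and VL is narrowed to the unique values (possibly padded with poison up
  // to a full register); on failure, a gather node is created and false is
  // returned.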
8087auto TryToFindDuplicates = [&](const InstructionsState &S,
8088bool DoNotFail =false) {
8089// Check that every instruction appears once in this bundle.
8090SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
8091for (Value *V : VL) {
8092if (isConstant(V)) {
8093 ReuseShuffleIndices.emplace_back(
8094 isa<PoisonValue>(V) ?PoisonMaskElem : UniqueValues.size());
8095 UniqueValues.emplace_back(V);
8096continue;
8097 }
8098auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
8099 ReuseShuffleIndices.emplace_back(Res.first->second);
8100if (Res.second)
8101 UniqueValues.emplace_back(V);
8102 }
8103size_t NumUniqueScalarValues = UniqueValues.size();
8104bool IsFullVectors =hasFullVectorsOrPowerOf2(
8105 *TTI,getValueType(UniqueValues.front()), NumUniqueScalarValues);
8106if (NumUniqueScalarValues == VL.size() &&
8107 (VectorizeNonPowerOf2 || IsFullVectors)) {
8108 ReuseShuffleIndices.clear();
8109 }else {
 8110// FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
8111if ((UserTreeIdx.UserTE &&
8112 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8113 !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
8114LLVM_DEBUG(dbgs() <<"SLP: Reshuffling scalars not yet supported "
8115"for nodes with padding.\n");
8116 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8117returnfalse;
8118 }
8119LLVM_DEBUG(dbgs() <<"SLP: Shuffle for reused scalars.\n");
8120if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8121 (UniquePositions.size() == 1 &&all_of(UniqueValues, [](Value *V) {
8122return isa<UndefValue>(V) || !isConstant(V);
8123 }))) {
8124if (DoNotFail && UniquePositions.size() > 1 &&
8125 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8126all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
 8127// Find the number of elements which form full vectors.
8128unsigned PWSz =getFullVectorNumberOfElements(
8129 *TTI, UniqueValues.front()->getType(), UniqueValues.size());
8130if (PWSz == VL.size()) {
8131 ReuseShuffleIndices.clear();
8132 }else {
8133 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
8134 NonUniqueValueVL.append(
8135 PWSz - UniqueValues.size(),
8136PoisonValue::get(UniqueValues.front()->getType()));
 8137// Check that the operations extended with poison values are still valid
 8138// for vectorization (div/rem are not allowed).
8139if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
8140LLVM_DEBUG(dbgs() <<"SLP: Scalar used twice in bundle.\n");
8141 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8142returnfalse;
8143 }
8144 VL = NonUniqueValueVL;
8145 }
8146returntrue;
8147 }
8148LLVM_DEBUG(dbgs() <<"SLP: Scalar used twice in bundle.\n");
8149 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8150returnfalse;
8151 }
8152 VL = UniqueValues;
8153 }
8154returntrue;
8155 };
8156
8157 InstructionsState S =getSameOpcode(VL, *TLI);
8158
8159// Don't go into catchswitch blocks, which can happen with PHIs.
8160// Such blocks can only have PHIs and the catchswitch. There is no
8161// place to insert a shuffle if we need to, so just avoid that issue.
8162if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8163LLVM_DEBUG(dbgs() <<"SLP: bundle in catchswitch block.\n");
8164 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8165return;
8166 }
8167
8168// Check if this is a duplicate of another entry.
8169if (S) {
8170if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8171LLVM_DEBUG(dbgs() <<"SLP: \tChecking bundle: " << *S.getMainOp()
8172 <<".\n");
8173if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8174auto It = MultiNodeScalars.find(S.getMainOp());
8175if (It != MultiNodeScalars.end()) {
8176auto *TEIt =find_if(It->getSecond(),
8177 [&](TreeEntry *ME) { return ME->isSame(VL); });
8178if (TEIt != It->getSecond().end())
8179 E = *TEIt;
8180else
8181 E =nullptr;
8182 }else {
8183 E =nullptr;
8184 }
8185 }
8186if (!E) {
8187if (!doesNotNeedToBeScheduled(S.getMainOp())) {
8188LLVM_DEBUG(dbgs() <<"SLP: Gathering due to partial overlap.\n");
8189if (TryToFindDuplicates(S))
8190 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8191 ReuseShuffleIndices);
8192return;
8193 }
8194SmallPtrSet<const TreeEntry *, 4> Nodes;
8195 Nodes.insert(getTreeEntry(S.getMainOp()));
8196for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
8197 Nodes.insert(E);
8198SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
8199if (any_of(Nodes, [&](const TreeEntry *E) {
8200if (all_of(E->Scalars,
8201 [&](Value *V) { return Values.contains(V); }))
8202returntrue;
8203SmallPtrSet<Value *, 8> EValues(E->Scalars.begin(),
8204 E->Scalars.end());
8205 return (
8206all_of(VL, [&](Value *V) {return EValues.contains(V); }));
8207 })) {
8208LLVM_DEBUG(dbgs() <<"SLP: Gathering due to full overlap.\n");
8209if (TryToFindDuplicates(S))
8210 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8211 ReuseShuffleIndices);
8212return;
8213 }
8214 }else {
8215// Record the reuse of the tree node. FIXME, currently this is only
8216// used to properly draw the graph rather than for the actual
8217// vectorization.
8218 E->UserTreeIndices.push_back(UserTreeIdx);
8219LLVM_DEBUG(dbgs() <<"SLP: Perfect diamond merge at " << *S.getMainOp()
8220 <<".\n");
8221return;
8222 }
8223 }
8224 }
8225
8226// Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
8227// a load), in which case peek through to include it in the tree, without
8228// ballooning over-budget.
8229if (Depth >=RecursionMaxDepth &&
8230 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8231 (match(S.getMainOp(),m_Load(m_Value())) ||
8232all_of(VL, [&S](constValue *I) {
8233returnmatch(I,
8234m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
8235 cast<Instruction>(I)->getOpcode() == S.getOpcode();
8236 })))) {
8237LLVM_DEBUG(dbgs() <<"SLP: Gathering due to max recursion depth.\n");
8238if (TryToFindDuplicates(S))
8239 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8240 ReuseShuffleIndices);
8241return;
8242 }
8243
8244// Don't handle scalable vectors
8245if (S && S.getOpcode() == Instruction::ExtractElement &&
8246 isa<ScalableVectorType>(
8247 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8248LLVM_DEBUG(dbgs() <<"SLP: Gathering due to scalable vector type.\n");
8249if (TryToFindDuplicates(S))
8250 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8251 ReuseShuffleIndices);
8252return;
8253 }
8254
8255// Don't handle vectors.
8256if (!SLPReVec &&getValueType(VL.front())->isVectorTy()) {
8257LLVM_DEBUG(dbgs() <<"SLP: Gathering due to vector type.\n");
8258 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8259return;
8260 }
8261
8262// If all of the operands are identical or constant we have a simple solution.
8263// If we deal with insert/extract instructions, they all must have constant
8264// indices, otherwise we should gather them, not try to vectorize.
 8265// If this is an alternate-op node with 2 elements and gathered operands -
 8266// do not vectorize.
8267auto &&NotProfitableForVectorization = [&S,this,
8268Depth](ArrayRef<Value *> VL) {
8269if (!S || !S.isAltShuffle() || VL.size() > 2)
8270returnfalse;
8271if (VectorizableTree.size() <MinTreeSize)
8272returnfalse;
8273if (Depth >=RecursionMaxDepth - 1)
8274returntrue;
 8275// Check if all operands are extracts, part of a vector node, or can build
 8276// a regular vectorizable node.
8277SmallVector<unsigned, 8> InstsCount;
8278for (Value *V : VL) {
8279auto *I = cast<Instruction>(V);
8280 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
8281 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8282 }));
8283 }
8284bool IsCommutative =
8285isCommutative(S.getMainOp()) ||isCommutative(S.getAltOp());
8286if ((IsCommutative &&
8287 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
8288 (!IsCommutative &&
8289all_of(InstsCount, [](unsigned ICnt) {return ICnt < 2; })))
8290returntrue;
8291assert(VL.size() == 2 &&"Expected only 2 alternate op instructions.");
8292SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
8293auto *I1 = cast<Instruction>(VL.front());
8294auto *I2 = cast<Instruction>(VL.back());
8295for (intOp : seq<int>(S.getMainOp()->getNumOperands()))
8296 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8297 I2->getOperand(Op));
8298if (static_cast<unsigned>(count_if(
8299 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8300returnfindBestRootPair(Cand,LookAheadHeuristics::ScoreSplat);
8301 })) >= S.getMainOp()->getNumOperands() / 2)
8302returnfalse;
8303if (S.getMainOp()->getNumOperands() > 2)
8304returntrue;
8305if (IsCommutative) {
8306// Check permuted operands.
8307 Candidates.clear();
8308for (intOp = 0, E = S.getMainOp()->getNumOperands();Op < E; ++Op)
8309 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8310 I2->getOperand((Op + 1) % E));
8311if (any_of(
8312 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8313returnfindBestRootPair(Cand,LookAheadHeuristics::ScoreSplat);
8314 }))
8315returnfalse;
8316 }
8317returntrue;
8318 };
8319SmallVector<unsigned> SortedIndices;
8320BasicBlock *BB =nullptr;
8321bool IsScatterVectorizeUserTE =
8322 UserTreeIdx.UserTE &&
8323 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8324bool AreAllSameBlock = S &&allSameBlock(VL);
8325bool AreScatterAllGEPSameBlock =
8326 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8327 VL.size() > 2 &&
8328all_of(VL,
8329 [&BB](Value *V) {
8330auto *I = dyn_cast<GetElementPtrInst>(V);
8331if (!I)
8332returndoesNotNeedToBeScheduled(V);
8333if (!BB)
8334 BB =I->getParent();
8335return BB ==I->getParent() &&I->getNumOperands() == 2;
8336 }) &&
8337 BB &&
8338sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8339 SortedIndices));
8340bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8341if (!AreAllSameInsts || (!S &&allConstant(VL)) ||isSplat(VL) ||
8342 (S &&
8343 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8344 S.getMainOp()) &&
8345 !all_of(VL,isVectorLikeInstWithConstOps)) ||
8346 NotProfitableForVectorization(VL)) {
8347LLVM_DEBUG(dbgs() <<"SLP: Gathering due to C,S,B,O, small shuffle. \n");
8348if (TryToFindDuplicates(S))
8349 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8350 ReuseShuffleIndices);
8351return;
8352 }
8353
8354// Don't vectorize ephemeral values.
8355if (S && !EphValues.empty()) {
8356for (Value *V : VL) {
8357if (EphValues.count(V)) {
8358LLVM_DEBUG(dbgs() <<"SLP: The instruction (" << *V
8359 <<") is ephemeral.\n");
8360 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8361return;
8362 }
8363 }
8364 }
8365
8366// We now know that this is a vector of instructions of the same type from
8367// the same block.
8368
8369// Check that none of the instructions in the bundle are already in the tree.
8370for (Value *V : VL) {
8371if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8372doesNotNeedToBeScheduled(V))
8373continue;
8374if (getTreeEntry(V)) {
8375LLVM_DEBUG(dbgs() <<"SLP: The instruction (" << *V
8376 <<") is already in tree.\n");
8377if (TryToFindDuplicates(S))
8378 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8379 ReuseShuffleIndices);
8380return;
8381 }
8382 }
8383
 8384// The reduction nodes (stored in UserIgnoreList) should also stay scalar.
8385if (UserIgnoreList && !UserIgnoreList->empty()) {
8386for (Value *V : VL) {
8387if (UserIgnoreList->contains(V)) {
8388LLVM_DEBUG(dbgs() <<"SLP: Gathering due to gathered scalar.\n");
8389if (TryToFindDuplicates(S))
8390 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8391 ReuseShuffleIndices);
8392return;
8393 }
8394 }
8395 }
8396
 8397// Special processing of sorted pointers for a ScatterVectorize node with
 8398// constant indices only.
8399if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8400assert(VL.front()->getType()->isPointerTy() &&
8401count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8402"Expected pointers only.");
 8403// Reset S to make it a GetElementPtr kind of node.
8404constauto *It =find_if(VL, IsaPred<GetElementPtrInst>);
8405assert(It != VL.end() &&"Expected at least one GEP.");
8406 S =getSameOpcode(*It, *TLI);
8407 }
8408
8409// Check that all of the users of the scalars that we want to vectorize are
8410// schedulable.
8411Instruction *VL0 = S.getMainOp();
8412 BB = VL0->getParent();
8413
8414if (S &&
8415 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
8416 !DT->isReachableFromEntry(BB))) {
8417// Don't go into unreachable blocks. They may contain instructions with
8418// dependency cycles which confuse the final scheduling.
8419// Do not vectorize EH and non-returning blocks, not profitable in most
8420// cases.
8421LLVM_DEBUG(dbgs() <<"SLP: bundle in unreachable block.\n");
8422 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8423return;
8424 }
8425
8426// Check that every instruction appears once in this bundle.
8427if (!TryToFindDuplicates(S,/*DoNotFail=*/true))
8428return;
8429
8430// Perform specific checks for each particular instruction kind.
8431OrdersType CurrentOrder;
8432SmallVector<Value *> PointerOps;
8433 TreeEntry::EntryState State = getScalarsVectorizationState(
8434 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8435if (State == TreeEntry::NeedToGather) {
8436 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8437 ReuseShuffleIndices);
8438return;
8439 }
8440
8441auto &BSRef = BlocksSchedules[BB];
8442if (!BSRef)
8443 BSRef = std::make_unique<BlockScheduling>(BB);
8444
8445 BlockScheduling &BS = *BSRef;
8446
8447 std::optional<ScheduleData *> Bundle =
8448 BS.tryScheduleBundle(UniqueValues,this, S);
8449#ifdef EXPENSIVE_CHECKS
8450// Make sure we didn't break any internal invariants
8451 BS.verify();
8452#endif
8453if (!Bundle) {
8454LLVM_DEBUG(dbgs() <<"SLP: We are not able to schedule this bundle!\n");
8455assert((!BS.getScheduleData(VL0) ||
8456 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8457"tryScheduleBundle should cancelScheduling on failure");
8458 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8459 ReuseShuffleIndices);
8460 NonScheduledFirst.insert(VL.front());
8461if (S.getOpcode() == Instruction::Load &&
8462 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8463registerNonVectorizableLoads(VL);
8464return;
8465 }
8466LLVM_DEBUG(dbgs() <<"SLP: We are able to schedule this bundle.\n");
8467
8468unsigned ShuffleOrOp =
8469 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
8470auto CreateOperandNodes = [&](TreeEntry *TE,constauto &Operands) {
8471// Postpone PHI nodes creation
8472SmallVector<unsigned> PHIOps;
8473for (unsignedI : seq<unsigned>(Operands.size())) {
8474ArrayRef<Value *>Op =Operands[I];
8475if (Op.empty())
8476continue;
8477 InstructionsState S =getSameOpcode(Op, *TLI);
8478if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8479 buildTree_rec(Op,Depth + 1, {TE,I});
8480else
8481 PHIOps.push_back(I);
8482 }
8483for (unsignedI : PHIOps)
8484 buildTree_rec(Operands[I],Depth + 1, {TE,I});
8485 };
8486switch (ShuffleOrOp) {
8487case Instruction::PHI: {
8488auto *PH = cast<PHINode>(VL0);
8489
8490 TreeEntry *TE =
8491 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8492LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (PHINode).\n";
8493TE->dump());
8494
8495// Keeps the reordered operands to avoid code duplication.
8496 PHIHandler Handler(*DT, PH, VL);
8497 Handler.buildOperands();
8498for (unsignedI : seq<unsigned>(PH->getNumOperands()))
8499TE->setOperand(I, Handler.getOperands(I));
8500SmallVector<ArrayRef<Value *>>Operands(PH->getNumOperands());
8501for (unsignedI : seq<unsigned>(PH->getNumOperands()))
8502Operands[I] = Handler.getOperands(I);
8503 CreateOperandNodes(TE,Operands);
8504return;
8505 }
8506case Instruction::ExtractValue:
8507case Instruction::ExtractElement: {
8508if (CurrentOrder.empty()) {
8509LLVM_DEBUG(dbgs() <<"SLP: Reusing or shuffling extract sequence.\n");
8510 }else {
8511LLVM_DEBUG({
8512dbgs() <<"SLP: Reusing or shuffling of reordered extract sequence "
8513"with order";
8514for (unsignedIdx : CurrentOrder)
8515dbgs() <<" " <<Idx;
8516dbgs() <<"\n";
8517 });
8518fixupOrderingIndices(CurrentOrder);
8519 }
8520// Insert new order with initial value 0, if it does not exist,
8521// otherwise return the iterator to the existing one.
8522 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8523 ReuseShuffleIndices, CurrentOrder);
8524LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry "
8525"(ExtractValueInst/ExtractElementInst).\n";
8526TE->dump());
8527// This is a special case, as it does not gather, but at the same time
8528// we are not extending buildTree_rec() towards the operands.
8529TE->setOperand(*this);
8530return;
8531 }
8532case Instruction::InsertElement: {
8533assert(ReuseShuffleIndices.empty() &&"All inserts should be unique");
8534
8535auto OrdCompare = [](const std::pair<int, int> &P1,
8536const std::pair<int, int> &P2) {
8537returnP1.first > P2.first;
8538 };
8539PriorityQueue<std::pair<int, int>,SmallVector<std::pair<int, int>>,
8540decltype(OrdCompare)>
8541 Indices(OrdCompare);
8542for (intI = 0, E = VL.size();I < E; ++I) {
8543unsignedIdx = *getElementIndex(VL[I]);
8544 Indices.emplace(Idx,I);
8545 }
8546OrdersType CurrentOrder(VL.size(), VL.size());
8547bool IsIdentity =true;
8548for (intI = 0, E = VL.size();I < E; ++I) {
8549 CurrentOrder[Indices.top().second] =I;
8550 IsIdentity &= Indices.top().second ==I;
8551 Indices.pop();
8552 }
8553if (IsIdentity)
8554 CurrentOrder.clear();
8555 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8556 {}, CurrentOrder);
8557LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (InsertElementInst).\n";
8558TE->dump());
8559
8560TE->setOperand(*this);
8561 buildTree_rec(TE->getOperand(1),Depth + 1, {TE, 1});
8562return;
8563 }
8564case Instruction::Load: {
8565// Check that a vectorized load would load the same memory as a scalar
8566// load. For example, we don't want to vectorize loads that are smaller
8567// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
8568// treats loading/storing it as an i8 struct. If we vectorize loads/stores
8569// from such a struct, we read/write packed bits disagreeing with the
8570// unvectorized version.
8571 TreeEntry *TE =nullptr;
8572fixupOrderingIndices(CurrentOrder);
8573switch (State) {
8574case TreeEntry::Vectorize:
8575TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8576 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8577if (CurrentOrder.empty())
8578LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (LoadInst).\n";
8579TE->dump());
8580else
8581LLVM_DEBUG(dbgs()
8582 <<"SLP: added a new TreeEntry (jumbled LoadInst).\n";
8583TE->dump());
8584break;
8585case TreeEntry::StridedVectorize:
 8586// Vectorizing non-consecutive loads as strided loads.
8587TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8588 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8589LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (strided LoadInst).\n";
8590TE->dump());
8591break;
8592case TreeEntry::ScatterVectorize:
8593// Vectorizing non-consecutive loads with `llvm.masked.gather`.
8594TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8595 UserTreeIdx, ReuseShuffleIndices);
8596LLVM_DEBUG(
8597dbgs()
8598 <<"SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8599TE->dump());
8600break;
8601case TreeEntry::CombinedVectorize:
8602case TreeEntry::NeedToGather:
8603llvm_unreachable("Unexpected loads state.");
8604 }
8605TE->setOperand(*this);
8606if (State == TreeEntry::ScatterVectorize)
8607 buildTree_rec(PointerOps,Depth + 1, {TE, 0});
8608return;
8609 }
8610case Instruction::ZExt:
8611case Instruction::SExt:
8612case Instruction::FPToUI:
8613case Instruction::FPToSI:
8614case Instruction::FPExt:
8615case Instruction::PtrToInt:
8616case Instruction::IntToPtr:
8617case Instruction::SIToFP:
8618case Instruction::UIToFP:
8619case Instruction::Trunc:
8620case Instruction::FPTrunc:
8621case Instruction::BitCast: {
8622auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8623 std::make_pair(std::numeric_limits<unsigned>::min(),
8624 std::numeric_limits<unsigned>::max()));
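    // Track the widest zext/sext destination and the narrowest source width
    // (and the reverse for trunc) seen so far; presumably consumed later by
    // the minimum-bit-width analysis.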
8625if (ShuffleOrOp == Instruction::ZExt ||
8626 ShuffleOrOp == Instruction::SExt) {
8627 CastMaxMinBWSizes = std::make_pair(
8628 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8629 PrevMaxBW),
8630 std::min<unsigned>(
8631 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8632 PrevMinBW));
8633 }elseif (ShuffleOrOp == Instruction::Trunc) {
8634 CastMaxMinBWSizes = std::make_pair(
8635 std::max<unsigned>(
8636 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8637 PrevMaxBW),
8638 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8639 PrevMinBW));
8640 }
8641 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8642 ReuseShuffleIndices);
8643LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (CastInst).\n";
8644TE->dump());
8645
8646TE->setOperand(*this);
8647for (unsignedI : seq<unsigned>(VL0->getNumOperands()))
8648 buildTree_rec(TE->getOperand(I),Depth + 1, {TE, I});
8649if (ShuffleOrOp == Instruction::Trunc) {
8650 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8651 }elseif (ShuffleOrOp == Instruction::SIToFP ||
8652 ShuffleOrOp == Instruction::UIToFP) {
8653unsigned NumSignBits =
8654ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC,nullptr, DT);
8655if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
8656APIntMask = DB->getDemandedBits(OpI);
8657 NumSignBits = std::max(NumSignBits,Mask.countl_zero());
8658 }
8659if (NumSignBits * 2 >=
8660 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8661 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8662 }
8663return;
8664 }
8665case Instruction::ICmp:
8666case Instruction::FCmp: {
8667// Check that all of the compares have the same predicate.
8668CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
8669 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8670 ReuseShuffleIndices);
8671LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (CmpInst).\n";
8672TE->dump());
8673
8674ValueListLeft,Right;
8675 VLOperands Ops(VL, S, *this);
8676if (cast<CmpInst>(VL0)->isCommutative()) {
8677// Commutative predicate - collect + sort operands of the instructions
8678// so that each side is more likely to have the same opcode.
8679assert(P0 ==CmpInst::getSwappedPredicate(P0) &&
8680"Commutative Predicate mismatch");
8681 Ops.reorder();
8682Left = Ops.getVL(0);
8683Right = Ops.getVL(1);
8684 }else {
8685// Collect operands - commute if it uses the swapped predicate.
8686for (Value *V : VL) {
8687if (isa<PoisonValue>(V)) {
8688Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
8689Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
8690continue;
8691 }
8692auto *Cmp = cast<CmpInst>(V);
8693Value *LHS =Cmp->getOperand(0);
8694Value *RHS =Cmp->getOperand(1);
8695if (Cmp->getPredicate() != P0)
8696std::swap(LHS, RHS);
8697Left.push_back(LHS);
8698Right.push_back(RHS);
8699 }
8700 }
8701TE->setOperand(0,Left);
8702TE->setOperand(1,Right);
8703 buildTree_rec(Left,Depth + 1, {TE, 0});
8704 buildTree_rec(Right,Depth + 1, {TE, 1});
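    // For integer compares, operands known to use at most half of their bit
    // width (per ComputeNumSignBits) are recorded in ExtraBitWidthNodes as
    // candidates for later bit-width shrinking.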
8705if (ShuffleOrOp == Instruction::ICmp) {
8706unsigned NumSignBits0 =
8707ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC,nullptr, DT);
8708if (NumSignBits0 * 2 >=
8709 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8710 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8711unsigned NumSignBits1 =
8712ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC,nullptr, DT);
8713if (NumSignBits1 * 2 >=
8714 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
8715 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
8716 }
8717return;
8718 }
8719case Instruction::Select:
8720case Instruction::FNeg:
8721case Instruction::Add:
8722case Instruction::FAdd:
8723case Instruction::Sub:
8724case Instruction::FSub:
8725case Instruction::Mul:
8726case Instruction::FMul:
8727case Instruction::UDiv:
8728case Instruction::SDiv:
8729case Instruction::FDiv:
8730case Instruction::URem:
8731case Instruction::SRem:
8732case Instruction::FRem:
8733case Instruction::Shl:
8734case Instruction::LShr:
8735case Instruction::AShr:
8736case Instruction::And:
8737case Instruction::Or:
8738case Instruction::Xor:
8739case Instruction::Freeze: {
8740 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8741 ReuseShuffleIndices);
8742LLVM_DEBUG(
8743dbgs() <<"SLP: added a new TreeEntry "
8744"(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8745TE->dump());
8746
8747TE->setOperand(*this, isa<BinaryOperator>(VL0) &&isCommutative(VL0));
8748for (unsignedI : seq<unsigned>(VL0->getNumOperands()))
8749 buildTree_rec(TE->getOperand(I),Depth + 1, {TE, I});
8750return;
8751 }
8752case Instruction::GetElementPtr: {
8753 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8754 ReuseShuffleIndices);
8755LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (GetElementPtrInst).\n";
8756TE->dump());
8757SmallVector<ValueList, 2>Operands(2);
8758// Prepare the operand vector for pointer operands.
8759for (Value *V : VL) {
8760auto *GEP = dyn_cast<GetElementPtrInst>(V);
8761if (!GEP) {
8762Operands.front().push_back(V);
8763continue;
8764 }
8765Operands.front().push_back(GEP->getPointerOperand());
8766 }
8767TE->setOperand(0,Operands.front());
8768// Need to cast all indices to the same type before vectorization to
8769// avoid crash.
8770// Required to be able to find correct matches between different gather
8771// nodes and reuse the vectorized values rather than trying to gather them
8772// again.
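// If every GEP already uses VL0's index type, that type is kept; otherwise
// the DataLayout index type of the pointer is used and constant indices are
// folded to it below.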
8773int IndexIdx = 1;
8774Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8775Type *Ty =all_of(VL,
8776 [VL0Ty, IndexIdx](Value *V) {
8777auto *GEP = dyn_cast<GetElementPtrInst>(V);
8778if (!GEP)
8779returntrue;
8780return VL0Ty ==GEP->getOperand(IndexIdx)->getType();
8781 })
8782 ? VL0Ty
8783 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
8784 ->getPointerOperandType()
8785 ->getScalarType());
8786// Prepare the operand vector.
8787for (Value *V : VL) {
8788auto *I = dyn_cast<GetElementPtrInst>(V);
8789if (!I) {
8790Operands.back().push_back(
8791 ConstantInt::get(Ty, 0,/*isSigned=*/false));
8792continue;
8793 }
8794auto *Op =I->getOperand(IndexIdx);
8795auto *CI = dyn_cast<ConstantInt>(Op);
8796if (!CI)
8797Operands.back().push_back(Op);
8798else
8799Operands.back().push_back(ConstantFoldIntegerCast(
8800 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8801 }
8802TE->setOperand(IndexIdx,Operands.back());
8803
8804for (unsignedI = 0, Ops =Operands.size();I < Ops; ++I)
8805 buildTree_rec(Operands[I],Depth + 1, {TE,I});
8806return;
8807 }
8808case Instruction::Store: {
8809bool Consecutive = CurrentOrder.empty();
8810if (!Consecutive)
8811fixupOrderingIndices(CurrentOrder);
8812 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8813 ReuseShuffleIndices, CurrentOrder);
8814if (Consecutive)
8815LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (StoreInst).\n";
8816TE->dump());
8817else
8818LLVM_DEBUG(
8819dbgs() <<"SLP: added a new TreeEntry (jumbled StoreInst).\n";
8820TE->dump());
8821TE->setOperand(*this);
8822 buildTree_rec(TE->getOperand(0),Depth + 1, {TE, 0});
8823return;
8824 }
8825case Instruction::Call: {
8826// Check if the calls are all to the same vectorizable intrinsic or
8827// library function.
8828CallInst *CI = cast<CallInst>(VL0);
8829Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
8830
8831 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8832 ReuseShuffleIndices);
8833LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (CallInst).\n";
8834TE->dump());
8835TE->setOperand(*this,isCommutative(VL0));
8836for (unsignedI : seq<unsigned>(CI->arg_size())) {
8837// For scalar operands there is no need to create an entry since they are
8838// not vectorized.
8839if (isVectorIntrinsicWithScalarOpAtArg(ID,I,TTI))
8840continue;
8841 buildTree_rec(TE->getOperand(I),Depth + 1, {TE, I});
8842 }
8843return;
8844 }
8845case Instruction::ShuffleVector: {
8846 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8847 ReuseShuffleIndices);
8848if (S.isAltShuffle()) {
8849LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (isAltShuffle).\n";
8850TE->dump());
8851 }else {
8852assert(SLPReVec &&"Only supported by REVEC.");
8853LLVM_DEBUG(
8854dbgs() <<"SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8855TE->dump());
8856 }
8857
8858// Reorder operands if reordering would enable vectorization.
8859auto *CI = dyn_cast<CmpInst>(VL0);
8860if (CI &&any_of(VL, [](Value *V) {
8861return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8862 })) {
8863auto *MainCI = cast<CmpInst>(S.getMainOp());
8864auto *AltCI = cast<CmpInst>(S.getAltOp());
8865CmpInst::Predicate MainP = MainCI->getPredicate();
8866CmpInst::Predicate AltP = AltCI->getPredicate();
8867assert(MainP != AltP &&
8868"Expected different main/alternate predicates.");
8869ValueListLeft,Right;
8870// Collect operands - commute if it uses the swapped predicate or
8871// alternate operation.
8872for (Value *V : VL) {
8873if (isa<PoisonValue>(V)) {
8874Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
8875Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
8876continue;
8877 }
8878auto *Cmp = cast<CmpInst>(V);
8879Value *LHS =Cmp->getOperand(0);
8880Value *RHS =Cmp->getOperand(1);
8881
8882if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
8883if (AltP ==CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8884std::swap(LHS, RHS);
8885 }else {
8886if (MainP ==CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8887std::swap(LHS, RHS);
8888 }
8889Left.push_back(LHS);
8890Right.push_back(RHS);
8891 }
8892TE->setOperand(0,Left);
8893TE->setOperand(1,Right);
8894 buildTree_rec(Left,Depth + 1, {TE, 0});
8895 buildTree_rec(Right,Depth + 1, {TE, 1});
8896return;
8897 }
8898
8899TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
8900for (unsignedI : seq<unsigned>(VL0->getNumOperands()))
8901 buildTree_rec(TE->getOperand(I),Depth + 1, {TE, I});
8902return;
8903 }
8904default:
8905break;
8906 }
8907llvm_unreachable("Unexpected vectorization of the instructions.");
8908}
8909
8910unsignedBoUpSLP::canMapToVector(Type *T) const{
8911unsignedN = 1;
8912Type *EltTy =T;
8913
8914while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8915if (EltTy->isEmptyTy())
8916return 0;
8917if (auto *ST = dyn_cast<StructType>(EltTy)) {
8918// Check that struct is homogeneous.
8919for (constauto *Ty : ST->elements())
8920if (Ty != *ST->element_begin())
8921return 0;
8922N *= ST->getNumElements();
8923 EltTy = *ST->element_begin();
8924 }elseif (auto *AT = dyn_cast<ArrayType>(EltTy)) {
8925N *= AT->getNumElements();
8926 EltTy = AT->getElementType();
8927 }else {
8928auto *VT = cast<FixedVectorType>(EltTy);
8929N *= VT->getNumElements();
8930 EltTy = VT->getElementType();
8931 }
8932 }
8933
8934if (!isValidElementType(EltTy))
8935return 0;
8936uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy,N));
8937if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8938 VTSize != DL->getTypeStoreSizeInBits(T))
8939return 0;
8940returnN;
8941}
8942
8943bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
8944SmallVectorImpl<unsigned> &CurrentOrder,
8945bool ResizeAllowed) const{
8946constauto *It =find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8947assert(It != VL.end() &&"Expected at least one extract instruction.");
8948auto *E0 = cast<Instruction>(*It);
8949assert(
8950all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8951"Invalid opcode");
8952// Check if all of the extracts come from the same vector and from the
8953// correct offset.
8954Value *Vec = E0->getOperand(0);
8955
8956 CurrentOrder.clear();
8957
8958// We have to extract from a vector/aggregate with the same number of elements.
8959unsigned NElts;
8960if (E0->getOpcode() == Instruction::ExtractValue) {
8961 NElts =canMapToVector(Vec->getType());
8962if (!NElts)
8963returnfalse;
8964// Check if load can be rewritten as load of vector.
8965LoadInst *LI = dyn_cast<LoadInst>(Vec);
8966if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
8967returnfalse;
8968 }else {
8969 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
8970 }
8971
8972unsigned E = VL.size();
8973if (!ResizeAllowed && NElts != E)
8974returnfalse;
8975SmallVector<int> Indices(E,PoisonMaskElem);
8976unsigned MinIdx = NElts, MaxIdx = 0;
8977for (auto [I, V] :enumerate(VL)) {
8978auto *Inst = dyn_cast<Instruction>(V);
8979if (!Inst)
8980continue;
8981if (Inst->getOperand(0) != Vec)
8982returnfalse;
8983if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
8984if (isa<UndefValue>(EE->getIndexOperand()))
8985continue;
8986 std::optional<unsigned>Idx =getExtractIndex(Inst);
8987if (!Idx)
8988returnfalse;
8989constunsigned ExtIdx = *Idx;
8990if (ExtIdx >= NElts)
8991continue;
8992 Indices[I] = ExtIdx;
8993if (MinIdx > ExtIdx)
8994 MinIdx = ExtIdx;
8995if (MaxIdx < ExtIdx)
8996 MaxIdx = ExtIdx;
8997 }
8998if (MaxIdx - MinIdx + 1 > E)
8999returnfalse;
9000if (MaxIdx + 1 <= E)
9001 MinIdx = 0;
9002
9003// Check that all of the indices extract from the correct offset.
9004bool ShouldKeepOrder =true;
9005// Assign to all items the initial value E so we can check if the extract
9006// instruction index was used already.
9007// Also, later we can check that all the indices are used and we have a
9008// consecutive access in the extract instructions, by checking that no
9009// element of CurrentOrder still has the value E.
9010 CurrentOrder.assign(E, E);
9011for (unsignedI = 0;I < E; ++I) {
9012if (Indices[I] ==PoisonMaskElem)
9013continue;
9014constunsigned ExtIdx = Indices[I] - MinIdx;
9015if (CurrentOrder[ExtIdx] != E) {
9016 CurrentOrder.clear();
9017returnfalse;
9018 }
9019 ShouldKeepOrder &= ExtIdx ==I;
9020 CurrentOrder[ExtIdx] =I;
9021 }
9022if (ShouldKeepOrder)
9023 CurrentOrder.clear();
9024
9025return ShouldKeepOrder;
9026}
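// A worked example for canReuseExtract (hypothetical values): with %v a
// <4 x i32> vector and
// \code
//   %e0 = extractelement <4 x i32> %v, i32 1
//   %e1 = extractelement <4 x i32> %v, i32 0
//   %e2 = extractelement <4 x i32> %v, i32 3
//   %e3 = extractelement <4 x i32> %v, i32 2
// \endcode
// all extracts come from the same source but the indices are not in identity
// order, so the function returns false and leaves CurrentOrder = {1, 0, 3, 2}.
// For indices {0, 1, 2, 3} it returns true with an empty CurrentOrder,
// meaning the extracts can be reused as-is.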
9027
9028bool BoUpSLP::areAllUsersVectorized(
9029Instruction *I,constSmallDenseSet<Value *> *VectorizedVals) const{
9030return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
9031all_of(I->users(), [this](User *U) {
9032 return ScalarToTreeEntry.contains(U) ||
9033 isVectorLikeInstWithConstOps(U) ||
9034 (isa<ExtractElementInst>(U) && MustGather.contains(U));
9035 });
9036}
9037
9038static std::pair<InstructionCost, InstructionCost>
9039getVectorCallCosts(CallInst *CI,FixedVectorType *VecTy,
9040TargetTransformInfo *TTI,TargetLibraryInfo *TLI,
9041ArrayRef<Type *> ArgTys) {
9042Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
9043
9044// Calculate the cost of the scalar and vector calls.
9045FastMathFlags FMF;
9046if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9047 FMF = FPCI->getFastMathFlags();
9048IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
9049auto IntrinsicCost =
9050TTI->getIntrinsicInstrCost(CostAttrs,TTI::TCK_RecipThroughput);
9051
9052auto Shape =VFShape::get(CI->getFunctionType(),
9053ElementCount::getFixed(VecTy->getNumElements()),
9054false/*HasGlobalPred*/);
9055Function *VecFunc =VFDatabase(*CI).getVectorizedFunction(Shape);
9056auto LibCost = IntrinsicCost;
9057if (!CI->isNoBuiltin() && VecFunc) {
9058// Calculate the cost of the vector library call.
9059// If the corresponding vector call is cheaper, return its cost.
9060 LibCost =
9061TTI->getCallInstrCost(nullptr, VecTy, ArgTys,TTI::TCK_RecipThroughput);
9062 }
9063return {IntrinsicCost, LibCost};
9064}
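// A sketch of how the returned pair is typically consumed by the cost model
// (illustrative only; the surrounding names are placeholders, not the exact
// call sites):
// \code
//   auto [IntrinsicCost, LibCost] =
//       getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
//   InstructionCost VecCallCost = std::min(IntrinsicCost, LibCost);
// \endcode
// i.e. the cheaper of the vector intrinsic and the vector library call is
// taken as the vector call cost.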
9065
9066void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
9067constfunction_ref<bool(Instruction *)> IsAltOp,SmallVectorImpl<int> &Mask,
9068SmallVectorImpl<Value *> *OpScalars,
9069SmallVectorImpl<Value *> *AltScalars) const{
9070unsigned Sz = Scalars.size();
9071Mask.assign(Sz,PoisonMaskElem);
9072SmallVector<int> OrderMask;
9073if (!ReorderIndices.empty())
9074inversePermutation(ReorderIndices, OrderMask);
9075for (unsignedI = 0;I < Sz; ++I) {
9076unsignedIdx =I;
9077if (!ReorderIndices.empty())
9078Idx = OrderMask[I];
9079if (isa<PoisonValue>(Scalars[Idx]))
9080continue;
9081auto *OpInst = cast<Instruction>(Scalars[Idx]);
9082if (IsAltOp(OpInst)) {
9083Mask[I] = Sz +Idx;
9084if (AltScalars)
9085 AltScalars->push_back(OpInst);
9086 }else {
9087Mask[I] =Idx;
9088if (OpScalars)
9089 OpScalars->push_back(OpInst);
9090 }
9091 }
9092if (!ReuseShuffleIndices.empty()) {
9093SmallVector<int> NewMask(ReuseShuffleIndices.size(),PoisonMaskElem);
9094transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](intIdx) {
9095 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
9096 });
9097Mask.swap(NewMask);
9098 }
9099}
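// A minimal worked example (hypothetical scalars): for an alternate-opcode
// node with Scalars = {add0, sub1, add2, sub3}, no reordering and no reused
// indices, and IsAltOp selecting the subtractions, the result is
// \code
//   OpScalars = {add0, add2}   AltScalars = {sub1, sub3}
//   Mask      = <0, 5, 2, 7>   // Sz = 4, alternate lanes are offset by Sz
// \endcode
// i.e. even lanes come from the vectorized main operation (indices 0..3) and
// odd lanes from the vectorized alternate operation (indices 4..7).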
9100
9101staticboolisAlternateInstruction(constInstruction *I,
9102constInstruction *MainOp,
9103constInstruction *AltOp,
9104constTargetLibraryInfo &TLI) {
9105if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
9106auto *AltCI = cast<CmpInst>(AltOp);
9107CmpInst::Predicate MainP = MainCI->getPredicate();
9108 [[maybe_unused]]CmpInst::Predicate AltP = AltCI->getPredicate();
9109assert(MainP != AltP &&"Expected different main/alternate predicates.");
9110auto *CI = cast<CmpInst>(I);
9111if (isCmpSameOrSwapped(MainCI, CI, TLI))
9112returnfalse;
9113if (isCmpSameOrSwapped(AltCI, CI, TLI))
9114returntrue;
9115CmpInst::PredicateP = CI->getPredicate();
9116CmpInst::Predicate SwappedP =CmpInst::getSwappedPredicate(P);
9117
9118assert((MainP ==P || AltP ==P || MainP == SwappedP || AltP == SwappedP) &&
9119"CmpInst expected to match either main or alternate predicate or "
9120"their swap.");
9121return MainP !=P && MainP != SwappedP;
9122 }
9123returnI->getOpcode() == AltOp->getOpcode();
9124}
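// For non-compare instructions the classification reduces to an opcode check;
// compares additionally accept the swapped-predicate form of the main or
// alternate predicate. A small hypothetical example:
// \code
//   MainOp: %x = add i32 %a, %b      AltOp: %y = sub i32 %a, %b
//   isAlternateInstruction(sub, MainOp, AltOp, TLI) == true
//   isAlternateInstruction(add, MainOp, AltOp, TLI) == false
// \endcode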
9125
9126TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
9127assert(!Ops.empty());
9128constauto *Op0 = Ops.front();
9129
9130constboolIsConstant =all_of(Ops, [](Value *V) {
9131// TODO: We should allow undef elements here
9132returnisConstant(V) && !isa<UndefValue>(V);
9133 });
9134constbool IsUniform =all_of(Ops, [=](Value *V) {
9135// TODO: We should allow undef elements here
9136returnV == Op0;
9137 });
9138constbool IsPowerOfTwo =all_of(Ops, [](Value *V) {
9139// TODO: We should allow undef elements here
9140if (auto *CI = dyn_cast<ConstantInt>(V))
9141return CI->getValue().isPowerOf2();
9142returnfalse;
9143 });
9144constbool IsNegatedPowerOfTwo =all_of(Ops, [](Value *V) {
9145// TODO: We should allow undef elements here
9146if (auto *CI = dyn_cast<ConstantInt>(V))
9147return CI->getValue().isNegatedPowerOf2();
9148returnfalse;
9149 });
9150
9151TTI::OperandValueKind VK =TTI::OK_AnyValue;
9152if (IsConstant && IsUniform)
9153 VK =TTI::OK_UniformConstantValue;
9154elseif (IsConstant)
9155 VK =TTI::OK_NonUniformConstantValue;
9156elseif (IsUniform)
9157 VK =TTI::OK_UniformValue;
9158
9159TTI::OperandValueProperties VP =TTI::OP_None;
9160 VP = IsPowerOfTwo ?TTI::OP_PowerOf2 : VP;
9161 VP = IsNegatedPowerOfTwo ?TTI::OP_NegatedPowerOf2 : VP;
9162
9163return {VK, VP};
9164}
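// A few illustrative classifications (hypothetical operand lists):
// \code
//   {4, 4, 4, 4}     -> {OK_UniformConstantValue,    OP_PowerOf2}
//   {2, 8, 16, 4}    -> {OK_NonUniformConstantValue, OP_PowerOf2}
//   {%x, %x, %x, %x} -> {OK_UniformValue,            OP_None}
//   {%x, %y, 3, %z}  -> {OK_AnyValue,                OP_None}
// \endcode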
9165
9166namespace{
9167/// The base class for shuffle instruction emission and shuffle cost estimation.
9168classBaseShuffleAnalysis {
9169protected:
9170Type *ScalarTy =nullptr;
9171
9172 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
9173
9174 /// V is expected to be a vectorized value.
9175 /// When REVEC is disabled, there is no difference between VF and
9176 /// VNumElements.
9177 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
9178 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
9179 /// of 8.
9180unsigned getVF(Value *V) const{
9181assert(V &&"V cannot be nullptr");
9182assert(isa<FixedVectorType>(V->getType()) &&
9183"V does not have FixedVectorType");
9184assert(ScalarTy &&"ScalarTy cannot be nullptr");
9185unsigned ScalarTyNumElements =getNumElements(ScalarTy);
9186unsigned VNumElements =
9187 cast<FixedVectorType>(V->getType())->getNumElements();
9188assert(VNumElements > ScalarTyNumElements &&
9189"the number of elements of V is not large enough");
9190assert(VNumElements % ScalarTyNumElements == 0 &&
9191"the number of elements of V is not a vectorized value");
9192return VNumElements / ScalarTyNumElements;
9193 }
9194
9195 /// Checks if the mask is an identity mask.
9196 /// \param IsStrict if is true the function returns false if mask size does
9197 /// not match vector size.
9198staticbool isIdentityMask(ArrayRef<int> Mask,constFixedVectorType *VecTy,
9199bool IsStrict) {
9200int Limit =Mask.size();
9201int VF = VecTy->getNumElements();
9202intIndex = -1;
9203if (VF == Limit &&ShuffleVectorInst::isIdentityMask(Mask, Limit))
9204returntrue;
9205if (!IsStrict) {
9206// Consider extract subvector starting from index 0.
9207if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF,Index) &&
9208Index == 0)
9209returntrue;
9210// All VF-size submasks are identity (e.g.
9211// <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
9212if (Limit % VF == 0 &&all_of(seq<int>(0, Limit / VF), [=](intIdx) {
9213ArrayRef<int> Slice =Mask.slice(Idx * VF, VF);
9214returnall_of(Slice, [](intI) {returnI ==PoisonMaskElem; }) ||
9215ShuffleVectorInst::isIdentityMask(Slice, VF);
9216 }))
9217returntrue;
9218 }
9219returnfalse;
9220 }
9221
 9222 /// Tries to combine 2 different masks into a single one.
9223 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
9224 /// change the size of the vector, \p LocalVF is the original size of the
9225 /// shuffled vector.
9226staticvoid combineMasks(unsigned LocalVF,SmallVectorImpl<int> &Mask,
9227ArrayRef<int> ExtMask) {
9228unsigned VF =Mask.size();
9229SmallVector<int> NewMask(ExtMask.size(),PoisonMaskElem);
9230for (intI = 0, Sz = ExtMask.size();I < Sz; ++I) {
9231if (ExtMask[I] ==PoisonMaskElem)
9232continue;
9233int MaskedIdx =Mask[ExtMask[I] % VF];
9234 NewMask[I] =
9235 MaskedIdx ==PoisonMaskElem ?PoisonMaskElem : MaskedIdx % LocalVF;
9236 }
9237Mask.swap(NewMask);
9238 }
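// A worked example of mask composition (hypothetical masks): with
// LocalVF = 2, an inner Mask = <1, 0> (a swap of a 2-element vector) and an
// outer ExtMask = <1, 0, 1, 0>, every outer lane is redirected through the
// inner mask:
// \code
//   NewMask[I] = Mask[ExtMask[I] % VF] % LocalVF   // VF = Mask.size() = 2
//   => NewMask = <0, 1, 0, 1>
// \endcode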
9239
 9240 /// Looks through shuffles trying to reduce the final number of shuffles in
 9241 /// the code. The function looks through the previously emitted shuffle
 9242 /// instructions and properly marks indices in the mask as undef.
9243 /// For example, given the code
9244 /// \code
9245 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
9246 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
9247 /// \endcode
 9248 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
9249 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9250 /// <0, 1, 2, 3> for the shuffle.
9251 /// If 2 operands are of different size, the smallest one will be resized and
9252 /// the mask recalculated properly.
9253 /// For example, given the code
9254 /// \code
9255 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
9256 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
9257 /// \endcode
 9258 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
9259 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9260 /// <0, 1, 2, 3> for the shuffle.
 9261 /// So, it tries to transform a permutation into a simple vector merge, if
 9262 /// possible.
9263 /// \param V The input vector which must be shuffled using the given \p Mask.
9264 /// If the better candidate is found, \p V is set to this best candidate
9265 /// vector.
9266 /// \param Mask The input mask for the shuffle. If the best candidate is found
9267 /// during looking-through-shuffles attempt, it is updated accordingly.
9268 /// \param SinglePermute true if the shuffle operation is originally a
9269 /// single-value-permutation. In this case the look-through-shuffles procedure
9270 /// may look for resizing shuffles as the best candidates.
9271 /// \return true if the shuffle results in the non-resizing identity shuffle
9272 /// (and thus can be ignored), false - otherwise.
9273staticbool peekThroughShuffles(Value *&V,SmallVectorImpl<int> &Mask,
9274bool SinglePermute) {
9275Value *Op =V;
9276ShuffleVectorInst *IdentityOp =nullptr;
9277SmallVector<int> IdentityMask;
9278while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
9279// Exit if this is not a fixed vector type or it is a size-changing shuffle.
9280auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9281if (!SVTy)
9282break;
9283// Remember the identity or broadcast mask, if it is not a resizing
9284// shuffle. If no better candidates are found, this Op and Mask will be
9285// used in the final shuffle.
9286if (isIdentityMask(Mask, SVTy,/*IsStrict=*/false)) {
9287if (!IdentityOp || !SinglePermute ||
9288 (isIdentityMask(Mask, SVTy,/*IsStrict=*/true) &&
9289 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
9290 IdentityMask.size()))) {
9291 IdentityOp = SV;
9292// Store the current mask in the IdentityMask so that we do not lose
9293// this info later if IdentityOp is selected as the best candidate for
9294// the permutation.
9295 IdentityMask.assign(Mask);
9296 }
9297 }
9298// Remember the broadcast mask. If no better candidates are found, this Op
9299// and Mask will be used in the final shuffle.
9300// Zero splat can be used as identity too, since it might be used with
9301// mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
9302// E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which is
9303// expensive, and the analysis finds out that the source vector is just a
9304// broadcast, this original mask can be transformed to the identity mask <0,
9305// 1, 2, 3>.
9306// \code
9307// %0 = shuffle %v, poison, zeroinitalizer
9308// %res = shuffle %0, poison, <3, 1, 2, 0>
9309// \endcode
9310// may be transformed to
9311// \code
9312// %0 = shuffle %v, poison, zeroinitalizer
9313// %res = shuffle %0, poison, <0, 1, 2, 3>
9314// \endcode
9315if (SV->isZeroEltSplat()) {
9316 IdentityOp = SV;
9317 IdentityMask.assign(Mask);
9318 }
9319int LocalVF =Mask.size();
9320if (auto *SVOpTy =
9321 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9322 LocalVF = SVOpTy->getNumElements();
9323SmallVector<int> ExtMask(Mask.size(),PoisonMaskElem);
9324for (auto [Idx,I] :enumerate(Mask)) {
9325if (I ==PoisonMaskElem ||
9326static_cast<unsigned>(I) >= SV->getShuffleMask().size())
9327continue;
9328 ExtMask[Idx] = SV->getMaskValue(I);
9329 }
9330bool IsOp1Undef =isUndefVector</*isPoisonOnly=*/true>(
9331 SV->getOperand(0),
9332buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
9333 .all();
9334bool IsOp2Undef =isUndefVector</*isPoisonOnly=*/true>(
9335 SV->getOperand(1),
9336buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
9337 .all();
9338if (!IsOp1Undef && !IsOp2Undef) {
9339// Update mask and mark undef elems.
9340for (int &I : Mask) {
9341if (I ==PoisonMaskElem)
9342continue;
9343if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
9344PoisonMaskElem)
9345I =PoisonMaskElem;
9346 }
9347break;
9348 }
9349SmallVector<int> ShuffleMask(SV->getShuffleMask());
9350 combineMasks(LocalVF, ShuffleMask, Mask);
9351Mask.swap(ShuffleMask);
9352if (IsOp2Undef)
9353Op = SV->getOperand(0);
9354else
9355Op = SV->getOperand(1);
9356 }
9357if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
9358 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9359ShuffleVectorInst::isZeroEltSplatMask(Mask,Mask.size())) {
9360if (IdentityOp) {
9361V = IdentityOp;
9362assert(Mask.size() == IdentityMask.size() &&
9363"Expected masks of same sizes.");
9364// Clear known poison elements.
9365for (auto [I,Idx] :enumerate(Mask))
9366if (Idx ==PoisonMaskElem)
9367 IdentityMask[I] =PoisonMaskElem;
9368Mask.swap(IdentityMask);
9369auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9370return SinglePermute &&
9371 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
9372/*IsStrict=*/true) ||
9373 (Shuffle &&Mask.size() == Shuffle->getShuffleMask().size() &&
9374 Shuffle->isZeroEltSplat() &&
9375ShuffleVectorInst::isZeroEltSplatMask(Mask,Mask.size())));
9376 }
9377V =Op;
9378returnfalse;
9379 }
9380V =Op;
9381returntrue;
9382 }
9383
9384 /// Smart shuffle instruction emission, walks through shuffles trees and
9385 /// tries to find the best matching vector for the actual shuffle
9386 /// instruction.
9387template <typename T,typename ShuffleBuilderTy>
9388staticT createShuffle(Value *V1,Value *V2,ArrayRef<int> Mask,
9389 ShuffleBuilderTy &Builder,Type *ScalarTy) {
9390assert(V1 &&"Expected at least one vector value.");
9391unsigned ScalarTyNumElements =getNumElements(ScalarTy);
9392SmallVector<int> NewMask(Mask);
9393if (ScalarTyNumElements != 1) {
9394assert(SLPReVec &&"FixedVectorType is not expected.");
9395transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
9396Mask = NewMask;
9397 }
9398if (V2)
9399 Builder.resizeToMatch(V1, V2);
9400int VF =Mask.size();
9401if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
9402 VF = FTy->getNumElements();
9403if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9404 V2,buildUseMask(VF, Mask, UseMask::SecondArg))
9405 .all()) {
9406// Peek through shuffles.
9407Value *Op1 = V1;
9408Value *Op2 =V2;
9409int VF =
9410 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9411SmallVector<int> CombinedMask1(Mask.size(),PoisonMaskElem);
9412SmallVector<int> CombinedMask2(Mask.size(),PoisonMaskElem);
9413for (intI = 0,E =Mask.size();I <E; ++I) {
9414if (Mask[I] < VF)
9415 CombinedMask1[I] =Mask[I];
9416else
9417 CombinedMask2[I] =Mask[I] - VF;
9418 }
9419Value *PrevOp1;
9420Value *PrevOp2;
9421do {
9422 PrevOp1 = Op1;
9423 PrevOp2 = Op2;
9424 (void)peekThroughShuffles(Op1, CombinedMask1,/*SinglePermute=*/false);
9425 (void)peekThroughShuffles(Op2, CombinedMask2,/*SinglePermute=*/false);
9426// Check if we have 2 resizing shuffles - need to peek through operands
9427// again.
9428if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9429if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9430SmallVector<int> ExtMask1(Mask.size(),PoisonMaskElem);
9431for (auto [Idx,I] :enumerate(CombinedMask1)) {
9432if (I ==PoisonMaskElem)
9433continue;
9434 ExtMask1[Idx] = SV1->getMaskValue(I);
9435 }
9436SmallBitVector UseMask1 =buildUseMask(
9437 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9438 ->getNumElements(),
9439 ExtMask1, UseMask::SecondArg);
9440SmallVector<int> ExtMask2(CombinedMask2.size(),PoisonMaskElem);
9441for (auto [Idx,I] :enumerate(CombinedMask2)) {
9442if (I ==PoisonMaskElem)
9443continue;
9444 ExtMask2[Idx] = SV2->getMaskValue(I);
9445 }
9446SmallBitVector UseMask2 =buildUseMask(
9447 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9448 ->getNumElements(),
9449 ExtMask2, UseMask::SecondArg);
9450if (SV1->getOperand(0)->getType() ==
9451 SV2->getOperand(0)->getType() &&
9452 SV1->getOperand(0)->getType() != SV1->getType() &&
9453isUndefVector(SV1->getOperand(1), UseMask1).all() &&
9454isUndefVector(SV2->getOperand(1), UseMask2).all()) {
9455 Op1 = SV1->getOperand(0);
9456 Op2 = SV2->getOperand(0);
9457SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
9458int LocalVF = ShuffleMask1.size();
9459if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
9460 LocalVF = FTy->getNumElements();
9461 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9462 CombinedMask1.swap(ShuffleMask1);
9463SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
9464 LocalVF = ShuffleMask2.size();
9465if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
9466 LocalVF = FTy->getNumElements();
9467 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9468 CombinedMask2.swap(ShuffleMask2);
9469 }
9470 }
9471 }while (PrevOp1 != Op1 || PrevOp2 != Op2);
9472 Builder.resizeToMatch(Op1, Op2);
9473 VF = std::max(cast<VectorType>(Op1->getType())
9474 ->getElementCount()
9475 .getKnownMinValue(),
9476 cast<VectorType>(Op2->getType())
9477 ->getElementCount()
9478 .getKnownMinValue());
9479for (intI = 0,E =Mask.size();I <E; ++I) {
9480if (CombinedMask2[I] !=PoisonMaskElem) {
9481assert(CombinedMask1[I] ==PoisonMaskElem &&
9482"Expected undefined mask element");
9483 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
9484 }
9485 }
9486if (Op1 == Op2 &&
9487 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
9488 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
9489 isa<ShuffleVectorInst>(Op1) &&
9490 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9491ArrayRef(CombinedMask1))))
9492return Builder.createIdentity(Op1);
9493return Builder.createShuffleVector(
9494 Op1, Op1 == Op2 ?PoisonValue::get(Op1->getType()) : Op2,
9495 CombinedMask1);
9496 }
9497if (isa<PoisonValue>(V1))
9498return Builder.createPoison(
9499 cast<VectorType>(V1->getType())->getElementType(),Mask.size());
9500bool IsIdentity = peekThroughShuffles(V1, NewMask,/*SinglePermute=*/true);
9501assert(V1 &&"Expected non-null value after looking through shuffles.");
9502
9503if (!IsIdentity)
9504return Builder.createShuffleVector(V1, NewMask);
9505return Builder.createIdentity(V1);
9506 }
9507
9508 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
9509 /// shuffle emission.
9510staticvoid transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
9511ArrayRef<int> Mask) {
9512for (unsignedI : seq<unsigned>(CommonMask.size()))
9513if (Mask[I] !=PoisonMaskElem)
9514 CommonMask[I] =I;
9515 }
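// A small worked example (hypothetical masks): after a shuffle realizing
// Mask = <5, 1, poison, 7> has been emitted, the already-produced lanes of
// CommonMask become an identity reference into the just-created vector:
// \code
//   CommonMask = <5, 1, poison, 7>, Mask = <5, 1, poison, 7>
//   => CommonMask = <0, 1, poison, 3>
// \endcode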
9516};
9517}// namespace
9518
9519/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
9520static std::pair<InstructionCost, InstructionCost>
9521getGEPCosts(constTargetTransformInfo &TTI,ArrayRef<Value *> Ptrs,
9522Value *BasePtr,unsigned Opcode,TTI::TargetCostKindCostKind,
9523Type *ScalarTy,VectorType *VecTy) {
9524InstructionCost ScalarCost = 0;
9525InstructionCost VecCost = 0;
9526// Here we differentiate two cases: (1) when Ptrs represent a regular
9527// vectorization tree node (as they are pointer arguments of scattered
9528// loads) or (2) when Ptrs are the arguments of loads or stores being
9529// vectorized as a plain wide unit-stride load/store since all the
9530// loads/stores are known to be from/to adjacent locations.
9531if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9532// Case 2: estimate the pointer-related costs when vectorizing to
9533// a wide load/store.
9534// The scalar cost is estimated as a set of pointers with a known
9535// relationship between them.
9536// For vector code we will use BasePtr as the argument for the wide
9537// load/store, but we also need to account for all the instructions which
9538// are going to stay in the vectorized code due to uses outside of these
9539// scalar loads/stores.
9540 ScalarCost =TTI.getPointersChainCost(
9541 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9542CostKind);
9543
9544SmallVector<const Value *> PtrsRetainedInVecCode;
9545for (Value *V : Ptrs) {
9546if (V == BasePtr) {
9547 PtrsRetainedInVecCode.push_back(V);
9548continue;
9549 }
9550auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9551// For simplicity assume Ptr stays in the vectorized code if it's not a
9552// GEP instruction. We don't care since its cost is considered free.
9553// TODO: We should check for any uses outside of the vectorizable tree
9554// rather than just a single use.
9555if (!Ptr || !Ptr->hasOneUse())
9556 PtrsRetainedInVecCode.push_back(V);
9557 }
9558
9559if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
9560// If all pointers stay in vectorized code then we don't have
9561// any savings on that.
9562return std::make_pair(TTI::TCC_Free,TTI::TCC_Free);
9563 }
9564 VecCost =TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
9565 TTI::PointersChainInfo::getKnownStride(),
9566 VecTy,CostKind);
9567 }else {
9568// Case 1: Ptrs are the arguments of loads that we are going to transform
9569// into a masked gather load intrinsic.
9570// All the scalar GEPs will be removed as a result of vectorization.
9571// For any external uses of some lanes, extractelement instructions will
9572// be generated (whose cost is estimated separately).
9573TTI::PointersChainInfo PtrsInfo =
9574all_of(Ptrs,
9575 [](constValue *V) {
9576auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9577returnPtr && !Ptr->hasAllConstantIndices();
9578 })
9579 ? TTI::PointersChainInfo::getUnknownStride()
9580 : TTI::PointersChainInfo::getKnownStride();
9581
9582 ScalarCost =
9583TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,CostKind);
9584auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9585if (!BaseGEP) {
9586auto *It =find_if(Ptrs, IsaPred<GEPOperator>);
9587if (It != Ptrs.end())
9588 BaseGEP = cast<GEPOperator>(*It);
9589 }
9590if (BaseGEP) {
9591SmallVector<const Value *> Indices(BaseGEP->indices());
9592 VecCost =TTI.getGEPCost(BaseGEP->getSourceElementType(),
9593 BaseGEP->getPointerOperand(), Indices, VecTy,
9594CostKind);
9595 }
9596 }
9597
9598return std::make_pair(ScalarCost, VecCost);
9599}
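// A sketch of a typical use of the returned scalar/vector GEP cost pair
// (illustrative only; the names below are placeholders, not the exact call
// sites). The difference between the two values reflects the savings from
// removing the scalar address computations:
// \code
//   auto [ScalarGEPCost, VectorGEPCost] =
//       getGEPCosts(*TTI, PointerOps, PointerOps.front(), Instruction::Load,
//                   CostKind, ScalarTy, VecTy);
// \endcode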
9600
9601void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9602assert(TE.isGather() &&TE.ReorderIndices.empty() &&
9603"Expected gather node without reordering.");
9604DenseMap<std::pair<size_t, Value *>,SmallVector<LoadInst *>> LoadsMap;
9605SmallSet<size_t, 2> LoadKeyUsed;
9606
9607// Do not reorder the node if it is small (just 2 elements), all-constant,
9608// or all instructions already have the same opcode.
9609if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
9610all_of(TE.Scalars,isConstant))
9611return;
9612
9613if (any_of(seq<unsigned>(TE.Idx), [&](unsignedIdx) {
9614 return VectorizableTree[Idx]->isSame(TE.Scalars);
9615 }))
9616return;
9617
9618auto GenerateLoadsSubkey = [&](size_tKey,LoadInst *LI) {
9619Key =hash_combine(hash_value(LI->getParent()), Key);
9620Value *Ptr =
9621getUnderlyingObject(LI->getPointerOperand(),RecursionMaxDepth);
9622if (LoadKeyUsed.contains(Key)) {
9623auto LIt = LoadsMap.find(std::make_pair(Key,Ptr));
9624if (LIt != LoadsMap.end()) {
9625for (LoadInst *RLI : LIt->second) {
9626if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
9627 LI->getType(), LI->getPointerOperand(), *DL, *SE,
9628/*StrictCheck=*/true))
9629returnhash_value(RLI->getPointerOperand());
9630 }
9631for (LoadInst *RLI : LIt->second) {
9632if (arePointersCompatible(RLI->getPointerOperand(),
9633 LI->getPointerOperand(), *TLI)) {
9634hash_code SubKey =hash_value(RLI->getPointerOperand());
9635return SubKey;
9636 }
9637 }
9638if (LIt->second.size() > 2) {
9639hash_code SubKey =
9640hash_value(LIt->second.back()->getPointerOperand());
9641return SubKey;
9642 }
9643 }
9644 }
9645 LoadKeyUsed.insert(Key);
9646 LoadsMap.try_emplace(std::make_pair(Key,Ptr)).first->second.push_back(LI);
9647returnhash_value(LI->getPointerOperand());
9648 };
9649MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
9650SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
9651bool IsOrdered =true;
9652unsigned NumInstructions = 0;
9653// Try to "cluster" scalar instructions, to be able to build extra vectorized
9654// nodes.
9655for (auto [I, V] :enumerate(TE.Scalars)) {
9656size_tKey = 1,Idx = 1;
9657if (auto *Inst = dyn_cast<Instruction>(V);
9658 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9659 !isDeleted(Inst) && !isVectorized(V)) {
9660 std::tie(Key,Idx) =generateKeySubkey(V, TLI, GenerateLoadsSubkey,
9661/*AllowAlternate=*/false);
9662 ++NumInstructions;
9663 }
9664auto &Container = SortedValues[Key];
9665if (IsOrdered && !KeyToIndex.contains(V) &&
9666 !(isa<Constant, ExtractElementInst>(V) ||
9667isVectorLikeInstWithConstOps(V)) &&
9668 ((Container.contains(Idx) &&
9669 KeyToIndex.at(Container[Idx].back()).back() !=I - 1) ||
9670 (!Container.empty() && !Container.contains(Idx) &&
9671 KeyToIndex.at(Container.back().second.back()).back() !=I - 1)))
9672 IsOrdered =false;
9673auto &KTI = KeyToIndex[V];
9674if (KTI.empty())
9675 Container[Idx].push_back(V);
9676 KTI.push_back(I);
9677 }
9678SmallVector<std::pair<unsigned, unsigned>> SubVectors;
9679APInt DemandedElts =APInt::getAllOnes(TE.Scalars.size());
9680if (!IsOrdered && NumInstructions > 1) {
9681unsigned Cnt = 0;
9682TE.ReorderIndices.resize(TE.Scalars.size(),TE.Scalars.size());
9683for (constauto &D : SortedValues) {
9684for (constauto &P :D.second) {
9685unsigned Sz = 0;
9686for (Value *V :P.second) {
9687ArrayRef<unsigned> Indices = KeyToIndex.at(V);
9688for (auto [K,Idx] :enumerate(Indices)) {
9689TE.ReorderIndices[Cnt +K] =Idx;
9690TE.Scalars[Cnt +K] =V;
9691 }
9692 Sz += Indices.size();
9693 Cnt += Indices.size();
9694 }
9695if (Sz > 1 && isa<Instruction>(P.second.front())) {
9696constunsigned SubVF =getFloorFullVectorNumberOfElements(
9697 *TTI,TE.Scalars.front()->getType(), Sz);
9698 SubVectors.emplace_back(Cnt - Sz, SubVF);
9699for (unsignedI : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9700 DemandedElts.clearBit(I);
9701 }elseif (!P.second.empty() &&isConstant(P.second.front())) {
9702for (unsignedI : seq<unsigned>(Cnt - Sz, Cnt))
9703 DemandedElts.clearBit(I);
9704 }
9705 }
9706 }
9707 }
9708// Reuses always require shuffles, so consider it as profitable.
9709if (!TE.ReuseShuffleIndices.empty() ||TE.ReorderIndices.empty())
9710return;
9711// Do simple cost estimation.
9712constexprTTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
9713InstructionCostCost = 0;
9714auto *ScalarTy =TE.Scalars.front()->getType();
9715auto *VecTy =getWidenedType(ScalarTy,TE.Scalars.size());
9716for (auto [Idx, Sz] : SubVectors) {
9717Cost +=::getShuffleCost(*TTI,TTI::SK_InsertSubvector, VecTy, {},CostKind,
9718Idx,getWidenedType(ScalarTy, Sz));
9719 }
9720if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9721assert(SLPReVec &&"Only supported by REVEC.");
9722// If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
9723// of CreateInsertElement.
9724unsigned ScalarTyNumElements =getNumElements(ScalarTy);
9725for (unsignedI : seq<unsigned>(TE.Scalars.size()))
9726if (DemandedElts[I])
9727Cost +=
9728TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
9729CostKind,I * ScalarTyNumElements, FTy);
9730 }else {
9731Cost +=TTI->getScalarizationOverhead(VecTy, DemandedElts,/*Insert=*/true,
9732/*Extract=*/false,CostKind);
9733 }
9734int Sz =TE.Scalars.size();
9735SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
9736TE.ReorderIndices.end());
9737for (unsignedI : seq<unsigned>(Sz)) {
9738Value *V =TE.getOrdered(I);
9739if (isa<PoisonValue>(V)) {
9740 ReorderMask[I] =PoisonMaskElem;
9741 }elseif (isConstant(V) || DemandedElts[I]) {
9742 ReorderMask[I] =I +TE.ReorderIndices.size();
9743 }
9744 }
9745Cost +=::getShuffleCost(*TTI,
9746any_of(ReorderMask, [&](intI) {returnI >= Sz; })
9747 ?TTI::SK_PermuteTwoSrc
9748 :TTI::SK_PermuteSingleSrc,
9749 VecTy, ReorderMask);
9750 DemandedElts =APInt::getAllOnes(VecTy->getNumElements());
9751 ReorderMask.assign(Sz,PoisonMaskElem);
9752for (unsignedI : seq<unsigned>(Sz)) {
9753Value *V =TE.getOrdered(I);
9754if (isConstant(V)) {
9755 DemandedElts.clearBit(I);
9756if (!isa<PoisonValue>(V))
9757 ReorderMask[I] =I;
9758 }else {
9759 ReorderMask[I] =I + Sz;
9760 }
9761 }
9762InstructionCost BVCost =TTI->getScalarizationOverhead(
9763 VecTy, DemandedElts,/*Insert=*/true,/*Extract=*/false,CostKind);
9764if (!DemandedElts.isAllOnes())
9765 BVCost +=::getShuffleCost(*TTI,TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
9766if (Cost >= BVCost) {
9767SmallVector<int>Mask(TE.ReorderIndices.begin(),TE.ReorderIndices.end());
9768reorderScalars(TE.Scalars, Mask);
9769TE.ReorderIndices.clear();
9770 }
9771}
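// A compact illustration of the clustering above (hypothetical gather node,
// assuming the two groups end up in this order): scalars are regrouped by
// subkey, ReorderIndices records their original positions, and the reordering
// is kept only if the estimated cost of the subvector inserts plus the final
// permutation is lower than the cost of building the original vector.
// \code
//   before: {load A, add X, load B, add Y}
//   after : {load A, load B, add X, add Y}, ReorderIndices = {0, 2, 1, 3}
// \endcode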
9772
9773voidBoUpSLP::transformNodes() {
9774constexprTTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
9775 BaseGraphSize = VectorizableTree.size();
9776// Turn graph transforming mode on and off, when done.
9777classGraphTransformModeRAAI {
9778bool &SavedIsGraphTransformMode;
9779
9780public:
9781 GraphTransformModeRAAI(bool &IsGraphTransformMode)
9782 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9783 IsGraphTransformMode =true;
9784 }
9785 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode =false; }
9786 } TransformContext(IsGraphTransformMode);
9787// Operands are profitable if they are:
9788// 1. At least one constant
9789// or
9790// 2. Splats
9791// or
9792// 3. Result in a good vectorization opportunity, i.e. may generate vector
9793// nodes and reduce the cost of the graph.
9794auto CheckOperandsProfitability = [this](Instruction *I1,Instruction *I2,
9795const InstructionsState &S) {
9796SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
9797for (unsignedOp : seq<unsigned>(S.getMainOp()->getNumOperands()))
9798 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9799 I2->getOperand(Op));
9800returnall_of(
9801 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9802returnall_of(Cand,
9803 [](const std::pair<Value *, Value *> &P) {
9804return isa<Constant>(P.first) ||
9805 isa<Constant>(P.second) ||P.first ==P.second;
9806 }) ||
9807findBestRootPair(Cand,LookAheadHeuristics::ScoreSplatLoads);
9808 });
9809 };
9810
9811// Try to reorder gather nodes for better vectorization opportunities.
9812for (unsignedIdx : seq<unsigned>(BaseGraphSize)) {
9813 TreeEntry &E = *VectorizableTree[Idx];
9814if (E.isGather())
9815 reorderGatherNode(E);
9816 }
9817
9818// The tree may grow here, so iterate only over the nodes built before.
9819for (unsignedIdx : seq<unsigned>(BaseGraphSize)) {
9820 TreeEntry &E = *VectorizableTree[Idx];
9821if (E.isGather()) {
9822ArrayRef<Value *> VL = E.Scalars;
9823constunsigned Sz =getVectorElementSize(VL.front());
9824unsigned MinVF =getMinVF(2 * Sz);
9825// Do not try partial vectorization for small nodes (<= 2 elements), nodes
9826// with the same opcode and same parent block, or all-constant nodes.
9827if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
9828 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
9829 E.isAltShuffle() || !allSameBlock(VL)) ||
9830allConstant(VL) ||isSplat(VL))
9831continue;
9832// Try to find vectorizable sequences and transform them into a series of
9833// insertvector instructions.
9834unsigned StartIdx = 0;
9835unsignedEnd = VL.size();
9836for (unsigned VF =getFloorFullVectorNumberOfElements(
9837 *TTI, VL.front()->getType(), VL.size() - 1);
9838 VF >= MinVF; VF =getFloorFullVectorNumberOfElements(
9839 *TTI, VL.front()->getType(), VF - 1)) {
9840if (StartIdx + VF >End)
9841continue;
9842SmallVector<std::pair<unsigned, unsigned>> Slices;
9843for (unsigned Cnt = StartIdx; Cnt + VF <=End; Cnt += VF) {
9844ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9845// If any instruction is vectorized already - do not try again.
9846// Reuse the existing node, if it fully matches the slice.
9847if (const TreeEntry *SE = getTreeEntry(Slice.front());
9848 SE || getTreeEntry(Slice.back())) {
9849if (!SE)
9850continue;
9851if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9852continue;
9853 }
9854// Constant already handled effectively - skip.
9855if (allConstant(Slice))
9856continue;
9857// Do not try to vectorize small splats (smaller than a vector register
9858// and with only a single non-undef element).
9859bool IsSplat =isSplat(Slice);
9860bool IsTwoRegisterSplat =true;
9861if (IsSplat && VF == 2) {
9862unsigned NumRegs2VF =::getNumberOfParts(
9863 *TTI,getWidenedType(Slice.front()->getType(), 2 * VF));
9864 IsTwoRegisterSplat = NumRegs2VF == 2;
9865 }
9866if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
9867count(Slice, Slice.front()) ==
9868static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
9869 : 1)) {
9870if (IsSplat)
9871continue;
9872 InstructionsState S =getSameOpcode(Slice, *TLI);
9873if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
9874 (S.getOpcode() == Instruction::Load &&
9875areKnownNonVectorizableLoads(Slice)) ||
9876 (S.getOpcode() != Instruction::Load &&
9877 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
9878continue;
9879if (VF == 2) {
9880// Try to vectorize reduced values, or vectorize if all users are vectorized.
9881// For expensive instructions, extra extracts might be profitable.
9882if ((!UserIgnoreList || E.Idx != 0) &&
9883TTI->getInstructionCost(S.getMainOp(),CostKind) <
9884TTI::TCC_Expensive &&
9885 !all_of(Slice, [&](Value *V) {
9886if (isa<PoisonValue>(V))
9887returntrue;
9888return areAllUsersVectorized(cast<Instruction>(V),
9889 UserIgnoreList);
9890 }))
9891continue;
9892if (S.getOpcode() == Instruction::Load) {
9893OrdersType Order;
9894SmallVector<Value *> PointerOps;
9895LoadsState Res =
9896canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9897// Do not vectorize gathers.
9898if (Res ==LoadsState::ScatterVectorize ||
9899 Res ==LoadsState::Gather) {
9900if (Res ==LoadsState::Gather) {
9901registerNonVectorizableLoads(Slice);
9902// If a reduction is being analyzed and the scalars come from the root
9903// node - mark them as a non-vectorizable reduction.
9904if (UserIgnoreList && E.Idx == 0)
9905analyzedReductionVals(Slice);
9906 }
9907continue;
9908 }
9909 }elseif (S.getOpcode() == Instruction::ExtractElement ||
9910 (TTI->getInstructionCost(S.getMainOp(),CostKind) <
9911TTI::TCC_Expensive &&
9912 !CheckOperandsProfitability(
9913 S.getMainOp(),
9914 cast<Instruction>(*find_if(reverse(Slice),
9915 IsaPred<Instruction>)),
9916 S))) {
9917// Do not vectorize extractelements (handled effectively
9918// already). Do not vectorize non-profitable instructions (with
9919// low cost and non-vectorizable operands).
9920continue;
9921 }
9922 }
9923 }
9924 Slices.emplace_back(Cnt, Slice.size());
9925 }
9926auto AddCombinedNode = [&](unsignedIdx,unsigned Cnt,unsigned Sz) {
9927 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9928if (StartIdx == Cnt)
9929 StartIdx = Cnt + Sz;
9930if (End == Cnt + Sz)
9931End = Cnt;
9932 };
9933for (auto [Cnt, Sz] : Slices) {
9934ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
9935// If any instruction is vectorized already - do not try again.
9936if (TreeEntry *SE = getTreeEntry(Slice.front());
9937 SE || getTreeEntry(Slice.back())) {
9938if (!SE)
9939continue;
9940if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9941continue;
9942 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9943 AddCombinedNode(SE->Idx, Cnt, Sz);
9944continue;
9945 }
9946unsigned PrevSize = VectorizableTree.size();
9947 [[maybe_unused]]unsigned PrevEntriesSize =
9948 LoadEntriesToVectorize.size();
9949 buildTree_rec(Slice, 0,EdgeInfo(&E, UINT_MAX));
9950if (PrevSize + 1 == VectorizableTree.size() &&
9951 VectorizableTree[PrevSize]->isGather() &&
9952 VectorizableTree[PrevSize]->hasState() &&
9953 VectorizableTree[PrevSize]->getOpcode() !=
9954 Instruction::ExtractElement &&
9955 !isSplat(Slice)) {
9956if (UserIgnoreList && E.Idx == 0 && VF == 2)
9957analyzedReductionVals(Slice);
9958 VectorizableTree.pop_back();
9959assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9960"LoadEntriesToVectorize expected to remain the same");
9961continue;
9962 }
9963 AddCombinedNode(PrevSize, Cnt, Sz);
9964 }
9965 }
9966// Restore ordering, if no extra vectorization happened.
9967if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9968SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9969reorderScalars(E.Scalars, Mask);
9970 E.ReorderIndices.clear();
9971 }
9972 }
9973if (!E.hasState())
9974continue;
9975switch (E.getOpcode()) {
9976case Instruction::Load: {
9977// No need to reorder masked gather loads, just reorder the scalar
9978// operands.
9979if (E.State != TreeEntry::Vectorize)
9980break;
9981Type *ScalarTy = E.getMainOp()->getType();
9982auto *VecTy =getWidenedType(ScalarTy, E.Scalars.size());
9983Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9984// Check if profitable to represent consecutive load + reverse as strided
9985// load with stride -1.
9986if (!E.ReorderIndices.empty() &&isReverseOrder(E.ReorderIndices) &&
9987TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9988SmallVector<int> Mask;
9989inversePermutation(E.ReorderIndices, Mask);
9990auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9991InstructionCost OriginalVecCost =
9992TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
9993 BaseLI->getPointerAddressSpace(),CostKind,
9994TTI::OperandValueInfo()) +
9995::getShuffleCost(*TTI,TTI::SK_Reverse, VecTy, Mask,CostKind);
9996InstructionCost StridedCost =TTI->getStridedMemoryOpCost(
9997 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9998/*VariableMask=*/false, CommonAlignment,CostKind, BaseLI);
9999if (StridedCost < OriginalVecCost)
10000// Strided load is more profitable than consecutive load + reverse -
10001// transform the node to strided load.
10002 E.State = TreeEntry::StridedVectorize;
10003 }
10004break;
10005 }
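// An illustrative case for the transform above (hypothetical scalars): loads
// whose lanes are requested in reverse order may be emitted as a single
// strided access instead of a wide load plus a reverse shuffle, when TTI
// reports the strided form as legal and cheaper:
// \code
//   lanes: { p[3], p[2], p[1], p[0] }   // ReorderIndices is a reverse order
//   wide load of p[0..3] + reverse shuffle
//     vs. strided load with a stride of -1 element
// \endcode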
10006case Instruction::Store: {
10007Type *ScalarTy =
10008 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
10009auto *VecTy =getWidenedType(ScalarTy, E.Scalars.size());
10010Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
10011// Check if profitable to represent consecutive store + reverse as strided
10012// store with stride -1.
10013if (!E.ReorderIndices.empty() &&isReverseOrder(E.ReorderIndices) &&
10014TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
10015SmallVector<int> Mask;
10016inversePermutation(E.ReorderIndices, Mask);
10017auto *BaseSI = cast<StoreInst>(E.Scalars.back());
10018InstructionCost OriginalVecCost =
10019TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
10020 BaseSI->getPointerAddressSpace(),CostKind,
10021TTI::OperandValueInfo()) +
10022::getShuffleCost(*TTI,TTI::SK_Reverse, VecTy, Mask,CostKind);
10023InstructionCost StridedCost =TTI->getStridedMemoryOpCost(
10024 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
10025/*VariableMask=*/false, CommonAlignment,CostKind, BaseSI);
10026if (StridedCost < OriginalVecCost)
10027// Strided store is more profitable than reverse + consecutive store -
10028// transform the node to strided store.
10029 E.State = TreeEntry::StridedVectorize;
10030 }elseif (!E.ReorderIndices.empty()) {
10031// Check for interleaved stores.
10032auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
10033auto *BaseSI = cast<StoreInst>(E.Scalars.front());
10034assert(Mask.size() > 1 &&"Expected mask greater than 1 element.");
10035if (Mask.size() < 4)
10036return 0u;
10037for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
10038if (ShuffleVectorInst::isInterleaveMask(
10039 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
10040TTI.isLegalInterleavedAccessType(
10041 VecTy, Factor, BaseSI->getAlign(),
10042 BaseSI->getPointerAddressSpace()))
10043return Factor;
10044 }
10045
10046return 0u;
10047 };
10048SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
10049unsigned InterleaveFactor = IsInterleaveMask(Mask);
10050if (InterleaveFactor != 0)
10051 E.setInterleave(InterleaveFactor);
10052 }
10053break;
10054 }
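// An example of a reorder mask recognized as interleaved by the check above
// (hypothetical, factor 2 over 8 stored elements):
// \code
//   Mask = <0, 4, 1, 5, 2, 6, 3, 7>   // two interleaved groups of 4
// \endcode
// If the target reports this as a legal interleaved store for the given
// alignment and address space, the node is marked with that factor.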
10055case Instruction::Select: {
10056if (E.State != TreeEntry::Vectorize)
10057break;
10058auto [MinMaxID, SelectOnly] =canConvertToMinOrMaxIntrinsic(E.Scalars);
10059if (MinMaxID ==Intrinsic::not_intrinsic)
10060break;
10061// This node is a minmax node.
10062 E.CombinedOp = TreeEntry::MinMax;
10063 TreeEntry *CondEntry =const_cast<TreeEntry *>(getOperandEntry(&E, 0));
10064if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
10065 CondEntry->State == TreeEntry::Vectorize) {
10066// The condition node is part of the combined minmax node.
10067 CondEntry->State = TreeEntry::CombinedVectorize;
10068 }
10069break;
10070 }
10071default:
10072break;
10073 }
10074 }
10075
10076if (LoadEntriesToVectorize.empty()) {
10077// Single load node - exit.
10078if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
10079 VectorizableTree.front()->getOpcode() == Instruction::Load)
10080return;
10081// Small graph with small VF - exit.
10082constexprunsigned SmallTree = 3;
10083constexprunsigned SmallVF = 2;
10084if ((VectorizableTree.size() <= SmallTree &&
10085 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10086 (VectorizableTree.size() <= 2 && UserIgnoreList))
10087return;
10088
10089if (VectorizableTree.front()->isNonPowOf2Vec() &&
10090getCanonicalGraphSize() !=getTreeSize() && UserIgnoreList &&
10091getCanonicalGraphSize() <= SmallTree &&
10092count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
10093 [](const std::unique_ptr<TreeEntry> &TE) {
10094return TE->isGather() && TE->hasState() &&
10095 TE->getOpcode() == Instruction::Load &&
10096 !allSameBlock(TE->Scalars);
10097 }) == 1)
10098return;
10099 }
10100
10101// A list of loads to be gathered during the vectorization process. We can
10102// try to vectorize them at the end, if profitable.
10103SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
10104SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
10105 GatheredLoads;
10106
10107for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10108 TreeEntry &E = *TE;
10109if (E.isGather() &&
10110 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
10111 (!E.hasState() &&any_of(E.Scalars,
10112 [&](Value *V) {
10113 return isa<LoadInst>(V) &&
10114 !isVectorized(V) &&
10115 !isDeleted(cast<Instruction>(V));
10116 }))) &&
10117 !isSplat(E.Scalars)) {
10118for (Value *V : E.Scalars) {
10119auto *LI = dyn_cast<LoadInst>(V);
10120if (!LI)
10121continue;
10122if (isDeleted(LI) ||isVectorized(LI) || !LI->isSimple())
10123continue;
10124gatherPossiblyVectorizableLoads(
10125 *this, V, *DL, *SE, *TTI,
10126 GatheredLoads[std::make_tuple(
10127 LI->getParent(),
10128getUnderlyingObject(LI->getPointerOperand(),RecursionMaxDepth),
10129 LI->getType())]);
10130 }
10131 }
10132 }
10133// Try to vectorize gathered loads if this is not just a gather of loads.
10134if (!GatheredLoads.empty())
10135 tryToVectorizeGatheredLoads(GatheredLoads);
10136}
10137
10138/// Merges shuffle masks and emits the final shuffle instruction, if required.
10139/// It supports shuffling of 2 input vectors. It implements lazy shuffle
10140/// emission: the actual shuffle instruction is generated only if it is
10141/// actually required. Otherwise, the shuffle instruction emission is delayed
10142/// till the end of the process, to reduce the number of emitted instructions
10143/// and simplify further analysis/transformations.
10144classBoUpSLP::ShuffleCostEstimator :public BaseShuffleAnalysis {
10145bool IsFinalized =false;
10146SmallVector<int> CommonMask;
10147SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
10148constTargetTransformInfo &TTI;
10149InstructionCostCost = 0;
10150SmallDenseSet<Value *> VectorizedVals;
10151BoUpSLP &R;
10152SmallPtrSetImpl<Value *> &CheckedExtracts;
10153constexprstaticTTI::TargetCostKind CostKind =TTI::TCK_RecipThroughput;
 10154 /// While set, we are still trying to estimate the cost for the same nodes
 10155 /// and can delay the actual cost estimation (virtual shuffle instruction
 10156 /// emission). May help better estimate the cost if the same nodes must be
 10157 /// permuted + allows moving most of the long-shuffle cost estimation to TTI.
10158bool SameNodesEstimated =true;
10159
10160staticConstant *getAllOnesValue(constDataLayout &DL,Type *Ty) {
10161if (Ty->getScalarType()->isPointerTy()) {
10162Constant *Res =ConstantExpr::getIntToPtr(
10163ConstantInt::getAllOnesValue(
10164IntegerType::get(Ty->getContext(),
10165DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
10166 Ty->getScalarType());
10167if (auto *VTy = dyn_cast<VectorType>(Ty))
10168 Res =ConstantVector::getSplat(VTy->getElementCount(), Res);
10169return Res;
10170 }
10171returnConstant::getAllOnesValue(Ty);
10172 }
10173
10174InstructionCost getBuildVectorCost(ArrayRef<Value *> VL,Value *Root) {
10175if ((!Root &&allConstant(VL)) ||all_of(VL, IsaPred<UndefValue>))
10176returnTTI::TCC_Free;
10177auto *VecTy =getWidenedType(ScalarTy, VL.size());
10178InstructionCost GatherCost = 0;
10179SmallVector<Value *> Gathers(VL);
10180if (!Root &&isSplat(VL)) {
10181// Found a broadcast of a single scalar, calculate the cost as
10182// the broadcast.
10183constauto *It =find_if_not(VL, IsaPred<UndefValue>);
10184assert(It != VL.end() &&"Expected at least one non-undef value.");
10185// Add broadcast for non-identity shuffle only.
10186bool NeedShuffle =
10187count(VL, *It) > 1 &&
10188 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
10189if (!NeedShuffle) {
10190if (isa<FixedVectorType>(ScalarTy)) {
10191assert(SLPReVec &&"FixedVectorType is not expected.");
10192returnTTI.getShuffleCost(
10193TTI::SK_InsertSubvector, VecTy, {}, CostKind,
10194 std::distance(VL.begin(), It) *getNumElements(ScalarTy),
10195 cast<FixedVectorType>(ScalarTy));
10196 }
10197returnTTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
10198 CostKind, std::distance(VL.begin(), It),
10199PoisonValue::get(VecTy), *It);
10200 }
10201
10202SmallVector<int> ShuffleMask(VL.size(),PoisonMaskElem);
10203transform(VL, ShuffleMask.begin(), [](Value *V) {
10204 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10205 });
10206InstructionCost InsertCost =
10207TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
10208PoisonValue::get(VecTy), *It);
10209return InsertCost +::getShuffleCost(TTI,
10210TargetTransformInfo::SK_Broadcast,
10211 VecTy, ShuffleMask, CostKind,
10212/*Index=*/0,/*SubTp=*/nullptr,
10213/*Args=*/*It);
10214 }
10215return GatherCost +
10216 (all_of(Gathers, IsaPred<UndefValue>)
10217 ?TTI::TCC_Free
10218 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
10219 ScalarTy));
10220 };
10221
10222 /// Compute the cost of creating a vector containing the extracted values from
10223 /// \p VL.
10224InstructionCost
10225 computeExtractCost(ArrayRef<Value *> VL,ArrayRef<int> Mask,
10226ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10227unsigned NumParts) {
10228assert(VL.size() > NumParts &&"Unexpected scalarized shuffle.");
10229unsigned NumElts =
10230 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz,Value *V) {
10231 auto *EE = dyn_cast<ExtractElementInst>(V);
10232 if (!EE)
10233 return Sz;
10234 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10235 if (!VecTy)
10236 return Sz;
10237 return std::max(Sz, VecTy->getNumElements());
10238 });
10239// FIXME: this must be moved to TTI for better estimation.
10240unsigned EltsPerVector =getPartNumElems(VL.size(), NumParts);
10241auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
10242SmallVectorImpl<unsigned> &Indices)
10243 -> std::optional<TTI::ShuffleKind> {
10244if (NumElts <= EltsPerVector)
10245return std::nullopt;
10246int OffsetReg0 =
10247alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10248 [](int S,intI) {
10249 if (I == PoisonMaskElem)
10250 return S;
10251 return std::min(S, I);
10252 }),
10253 EltsPerVector);
10254int OffsetReg1 = OffsetReg0;
10255DenseSet<int> RegIndices;
10256// Check whether we are trying to permute the same single or 2 input vectors.
10257TTI::ShuffleKind ShuffleKind =TTI::SK_PermuteSingleSrc;
10258int FirstRegId = -1;
10259 Indices.assign(1, OffsetReg0);
10260for (auto [Pos,I] :enumerate(Mask)) {
10261if (I ==PoisonMaskElem)
10262continue;
10263intIdx =I - OffsetReg0;
10264int RegId =
10265 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
10266if (FirstRegId < 0)
10267 FirstRegId = RegId;
10268 RegIndices.insert(RegId);
10269if (RegIndices.size() > 2)
10270return std::nullopt;
10271if (RegIndices.size() == 2) {
10272 ShuffleKind =TTI::SK_PermuteTwoSrc;
10273if (Indices.size() == 1) {
10274 OffsetReg1 =alignDown(
10275 std::accumulate(
10276 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10277 [&](int S,intI) {
10278 if (I == PoisonMaskElem)
10279 return S;
10280 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10281 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10282 if (RegId == FirstRegId)
10283 return S;
10284 return std::min(S, I);
10285 }),
10286 EltsPerVector);
10287 Indices.push_back(OffsetReg1 % NumElts);
10288 }
10289Idx =I - OffsetReg1;
10290 }
10291I = (Idx % NumElts) % EltsPerVector +
10292 (RegId == FirstRegId ? 0 : EltsPerVector);
10293 }
10294return ShuffleKind;
10295 };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source
    // vector operand can be re-used directly. If not, add the cost of creating
    // a shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices);
      if (!RegShuffleKind) {
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
        Cost +=
            ::getShuffleCost(TTI, *RegShuffleKind,
                             getWidenedType(ScalarTy, EltsPerVector), SubMask);
      }
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (unsigned Idx : Indices) {
        assert((Idx + EltsPerVector) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
                                 Idx, getWidenedType(ScalarTy, EltsPerVector));
      }
      // Second attempt to check if just a permute is better estimated than a
      // subvector extract.
      SubMask.assign(NumElts, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
  /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the
  /// given mask \p Mask for register number \p Part, which includes
  /// \p SliceSize elements.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are being reshuffled.
      // If we already requested the cost of reshuffling E1 and E2 before,
      // there is no need to estimate another cost with the sub-Mask; instead,
      // include this sub-Mask into the CommonMask to estimate it later and
      // avoid double cost estimation.
      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // nodes and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF,
                      cast<FixedVectorType>(V1->getType())->getNumElements());
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getNumElements(V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }

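  /// Helper passed to BaseShuffleAnalysis::createShuffle that models shuffle
  /// creation but returns TTI shuffle costs instead of emitting instructions;
  /// empty and identity masks are treated as free.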
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *Ty, unsigned VF) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };

  /// Smart shuffle instruction emission; walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
10458InstructionCost
10459 createShuffle(constPointerUnion<Value *, const TreeEntry *> &P1,
10460constPointerUnion<Value *, const TreeEntry *> &P2,
10461ArrayRef<int> Mask) {
10462 ShuffleCostBuilder Builder(TTI);
10463SmallVector<int> CommonMask(Mask);
10464Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
10465unsigned CommonVF = Mask.size();
10466InstructionCost ExtraCost = 0;
10467auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
10468unsigned VF) ->InstructionCost {
10469if (E.isGather() &&allConstant(E.Scalars))
10470returnTTI::TCC_Free;
10471Type *EScalarTy = E.Scalars.front()->getType();
10472bool IsSigned =true;
10473if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10474 EScalarTy =IntegerType::get(EScalarTy->getContext(), It->second.first);
10475 IsSigned = It->second.second;
10476 }
10477if (EScalarTy != ScalarTy) {
10478unsigned CastOpcode = Instruction::Trunc;
10479unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10480unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10481if (DstSz > SrcSz)
10482 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10483returnTTI.getCastInstrCost(CastOpcode,getWidenedType(ScalarTy, VF),
10484getWidenedType(EScalarTy, VF),
10485TTI::CastContextHint::None, CostKind);
10486 }
10487returnTTI::TCC_Free;
10488 };
10489auto GetValueMinBWAffectedCost = [&](constValue *V) ->InstructionCost {
10490if (isa<Constant>(V))
10491returnTTI::TCC_Free;
10492auto *VecTy = cast<VectorType>(V->getType());
10493Type *EScalarTy = VecTy->getElementType();
10494if (EScalarTy != ScalarTy) {
10495bool IsSigned = !isKnownNonNegative(V,SimplifyQuery(*R.DL));
10496unsigned CastOpcode = Instruction::Trunc;
10497unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10498unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10499if (DstSz > SrcSz)
10500 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10501returnTTI.getCastInstrCost(
10502 CastOpcode,VectorType::get(ScalarTy, VecTy->getElementCount()),
10503 VecTy,TTI::CastContextHint::None, CostKind);
10504 }
10505returnTTI::TCC_Free;
10506 };
10507if (!V1 && !V2 && !P2.isNull()) {
10508// Shuffle 2 entry nodes.
10509const TreeEntry *E = cast<const TreeEntry *>(P1);
10510unsigned VF = E->getVectorFactor();
10511const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10512 CommonVF = std::max(VF, E2->getVectorFactor());
10513assert(all_of(Mask,
10514 [=](intIdx) {
10515return Idx < 2 * static_cast<int>(CommonVF);
10516 }) &&
10517"All elements in mask must be less than 2 * CommonVF.");
10518if (E->Scalars.size() == E2->Scalars.size()) {
10519SmallVector<int> EMask = E->getCommonMask();
10520SmallVector<int> E2Mask = E2->getCommonMask();
10521if (!EMask.empty() || !E2Mask.empty()) {
10522for (int &Idx : CommonMask) {
10523if (Idx ==PoisonMaskElem)
10524continue;
10525if (Idx <static_cast<int>(CommonVF) && !EMask.empty())
10526Idx = EMask[Idx];
10527elseif (Idx >=static_cast<int>(CommonVF))
10528Idx = (E2Mask.empty() ?Idx - CommonVF : E2Mask[Idx - CommonVF]) +
10529 E->Scalars.size();
10530 }
10531 }
10532 CommonVF = E->Scalars.size();
10533 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10534 GetNodeMinBWAffectedCost(*E2, CommonVF);
10535 }else {
10536 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10537 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10538 }
10539 V1 =Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10540 V2 = getAllOnesValue(*R.DL,getWidenedType(ScalarTy, CommonVF));
10541 }elseif (!V1 && P2.isNull()) {
10542// Shuffle single entry node.
10543const TreeEntry *E = cast<const TreeEntry *>(P1);
10544unsigned VF = E->getVectorFactor();
10545 CommonVF = VF;
10546assert(
10547all_of(Mask,
10548 [=](intIdx) {return Idx < static_cast<int>(CommonVF); }) &&
10549"All elements in mask must be less than CommonVF.");
10550if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10551SmallVector<int> EMask = E->getCommonMask();
10552assert(!EMask.empty() &&"Expected non-empty common mask.");
10553for (int &Idx : CommonMask) {
10554if (Idx !=PoisonMaskElem)
10555Idx = EMask[Idx];
10556 }
10557 CommonVF = E->Scalars.size();
10558 }elseif (unsigned Factor = E->getInterleaveFactor();
10559 Factor > 0 && E->Scalars.size() != Mask.size() &&
10560ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
10561 Factor)) {
10562// Deinterleaved nodes are free.
10563 std::iota(CommonMask.begin(), CommonMask.end(), 0);
10564 }
10565 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10566 V1 =Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10567// Not identity/broadcast? Try to see if the original vector is better.
10568if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10569 CommonVF == CommonMask.size() &&
10570any_of(enumerate(CommonMask),
10571 [](constauto &&P) {
10572returnP.value() !=PoisonMaskElem &&
10573static_cast<unsigned>(P.value()) !=P.index();
10574 }) &&
10575any_of(CommonMask,
10576 [](intIdx) {returnIdx !=PoisonMaskElem &&Idx != 0; })) {
10577SmallVector<int> ReorderMask;
10578inversePermutation(E->ReorderIndices, ReorderMask);
10579::addMask(CommonMask, ReorderMask);
10580 }
10581 }elseif (V1 && P2.isNull()) {
10582// Shuffle single vector.
10583 ExtraCost += GetValueMinBWAffectedCost(V1);
10584 CommonVF = getVF(V1);
10585assert(
10586all_of(Mask,
10587 [=](intIdx) {return Idx < static_cast<int>(CommonVF); }) &&
10588"All elements in mask must be less than CommonVF.");
10589 }elseif (V1 && !V2) {
10590// Shuffle vector and tree node.
10591unsigned VF = getVF(V1);
10592const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10593 CommonVF = std::max(VF, E2->getVectorFactor());
10594assert(all_of(Mask,
10595 [=](intIdx) {
10596return Idx < 2 * static_cast<int>(CommonVF);
10597 }) &&
10598"All elements in mask must be less than 2 * CommonVF.");
10599if (E2->Scalars.size() == VF && VF != CommonVF) {
10600SmallVector<int> E2Mask = E2->getCommonMask();
10601assert(!E2Mask.empty() &&"Expected non-empty common mask.");
10602for (int &Idx : CommonMask) {
10603if (Idx ==PoisonMaskElem)
10604continue;
10605if (Idx >=static_cast<int>(CommonVF))
10606Idx = E2Mask[Idx - CommonVF] + VF;
10607 }
10608 CommonVF = VF;
10609 }
10610 ExtraCost += GetValueMinBWAffectedCost(V1);
10611 V1 =Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10612 ExtraCost += GetNodeMinBWAffectedCost(
10613 *E2, std::min(CommonVF, E2->getVectorFactor()));
10614 V2 = getAllOnesValue(*R.DL,getWidenedType(ScalarTy, CommonVF));
10615 }elseif (!V1 && V2) {
10616// Shuffle vector and tree node.
10617unsigned VF = getVF(V2);
10618const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10619 CommonVF = std::max(VF, E1->getVectorFactor());
10620assert(all_of(Mask,
10621 [=](intIdx) {
10622return Idx < 2 * static_cast<int>(CommonVF);
10623 }) &&
10624"All elements in mask must be less than 2 * CommonVF.");
10625if (E1->Scalars.size() == VF && VF != CommonVF) {
10626SmallVector<int> E1Mask = E1->getCommonMask();
10627assert(!E1Mask.empty() &&"Expected non-empty common mask.");
10628for (int &Idx : CommonMask) {
10629if (Idx ==PoisonMaskElem)
10630continue;
10631if (Idx >=static_cast<int>(CommonVF))
10632Idx = E1Mask[Idx - CommonVF] + VF;
10633else
10634Idx = E1Mask[Idx];
10635 }
10636 CommonVF = VF;
10637 }
10638 ExtraCost += GetNodeMinBWAffectedCost(
10639 *E1, std::min(CommonVF, E1->getVectorFactor()));
10640 V1 =Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10641 ExtraCost += GetValueMinBWAffectedCost(V2);
10642 V2 = getAllOnesValue(*R.DL,getWidenedType(ScalarTy, CommonVF));
10643 }else {
10644assert(V1 && V2 &&"Expected both vectors.");
10645unsigned VF = getVF(V1);
10646 CommonVF = std::max(VF, getVF(V2));
10647assert(all_of(Mask,
10648 [=](intIdx) {
10649return Idx < 2 * static_cast<int>(CommonVF);
10650 }) &&
10651"All elements in mask must be less than 2 * CommonVF.");
10652 ExtraCost +=
10653 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10654if (V1->getType() != V2->getType()) {
10655 V1 =Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10656 V2 = getAllOnesValue(*R.DL,getWidenedType(ScalarTy, CommonVF));
10657 }else {
10658if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
10659 V1 =Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10660if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10661 V2 = getAllOnesValue(*R.DL,getWidenedType(ScalarTy, CommonVF));
10662 }
10663 }
10664 InVectors.front() =
10665Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10666if (InVectors.size() == 2)
10667 InVectors.pop_back();
10668return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10669 V1, V2, CommonMask, Builder, ScalarTy);
10670 }
10671
10672public:
10673ShuffleCostEstimator(Type *ScalarTy,TargetTransformInfo &TTI,
10674ArrayRef<Value *> VectorizedVals,BoUpSLP &R,
10675SmallPtrSetImpl<Value *> &CheckedExtracts)
10676 : BaseShuffleAnalysis(ScalarTy),TTI(TTI),
10677 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10678 CheckedExtracts(CheckedExtracts) {}
10679Value *adjustExtracts(const TreeEntry *E,MutableArrayRef<int> Mask,
10680ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10681unsigned NumParts,bool &UseVecBaseAsInput) {
10682 UseVecBaseAsInput =false;
10683if (Mask.empty())
10684returnnullptr;
10685Value *VecBase =nullptr;
10686SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
10687if (!E->ReorderIndices.empty()) {
10688SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
10689 E->ReorderIndices.end());
10690reorderScalars(VL, ReorderMask);
10691 }
10692// Check if it can be considered reused if same extractelements were
10693// vectorized already.
10694bool PrevNodeFound =any_of(
10695ArrayRef(R.VectorizableTree).take_front(E->Idx),
10696 [&](const std::unique_ptr<TreeEntry> &TE) {
10697 return ((TE->hasState() && !TE->isAltShuffle() &&
10698 TE->getOpcode() == Instruction::ExtractElement) ||
10699 TE->isGather()) &&
10700 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10701 return VL.size() > Data.index() &&
10702 (Mask[Data.index()] == PoisonMaskElem ||
10703 isa<UndefValue>(VL[Data.index()]) ||
10704 Data.value() == VL[Data.index()]);
10705 });
10706 });
10707SmallPtrSet<Value *, 4> UniqueBases;
10708unsigned SliceSize =getPartNumElems(VL.size(), NumParts);
10709for (unsigned Part : seq<unsigned>(NumParts)) {
10710unsigned Limit =getNumElems(VL.size(), SliceSize, Part);
10711ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10712for (auto [I, V] :
10713enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
10714// Ignore non-extractelement scalars.
10715if (isa<UndefValue>(V) ||
10716 (!SubMask.empty() && SubMask[I] ==PoisonMaskElem))
10717continue;
10718// If all users of instruction are going to be vectorized and this
10719// instruction itself is not going to be vectorized, consider this
10720// instruction as dead and remove its cost from the final cost of the
10721// vectorized tree.
10722// Also, avoid adjusting the cost for extractelements with multiple uses
10723// in different graph entries.
10724auto *EE = cast<ExtractElementInst>(V);
10725 VecBase = EE->getVectorOperand();
10726 UniqueBases.insert(VecBase);
10727const TreeEntry *VE = R.getTreeEntry(V);
10728if (!CheckedExtracts.insert(V).second ||
10729 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10730any_of(EE->users(),
10731 [&](User *U) {
10732 return isa<GetElementPtrInst>(U) &&
10733 !R.areAllUsersVectorized(cast<Instruction>(U),
10734 &VectorizedVals);
10735 }) ||
10736 (VE && VE != E))
10737continue;
10738 std::optional<unsigned> EEIdx =getExtractIndex(EE);
10739if (!EEIdx)
10740continue;
10741unsignedIdx = *EEIdx;
10742// Take credit for instruction that will become dead.
10743if (EE->hasOneUse() || !PrevNodeFound) {
10744Instruction *Ext = EE->user_back();
10745if (isa<SExtInst, ZExtInst>(Ext) &&
10746all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10747// Use getExtractWithExtendCost() to calculate the cost of
10748// extractelement/ext pair.
10749Cost -=
10750TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
10751 EE->getVectorOperandType(),Idx);
10752// Add back the cost of s|zext which is subtracted separately.
10753Cost +=TTI.getCastInstrCost(
10754 Ext->getOpcode(), Ext->getType(), EE->getType(),
10755TTI::getCastContextHint(Ext), CostKind, Ext);
10756continue;
10757 }
10758 }
10759Cost -=TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10760 CostKind,Idx);
10761 }
10762 }
10763// Check that gather of extractelements can be represented as just a
10764// shuffle of a single/two vectors the scalars are extracted from.
10765// Found the bunch of extractelement instructions that must be gathered
10766// into a vector and can be represented as a permutation elements in a
10767// single input vector or of 2 input vectors.
10768// Done for reused if same extractelements were vectorized already.
10769if (!PrevNodeFound)
10770Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10771 InVectors.assign(1, E);
10772 CommonMask.assign(Mask.begin(),Mask.end());
10773 transformMaskAfterShuffle(CommonMask, CommonMask);
10774 SameNodesEstimated =false;
10775if (NumParts != 1 && UniqueBases.size() != 1) {
10776 UseVecBaseAsInput =true;
10777 VecBase =
10778Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10779 }
10780return VecBase;
10781 }
10782 /// Checks if the specified entry \p E needs to be delayed because of its
10783 /// dependency nodes.
10784 std::optional<InstructionCost>
10785needToDelay(const TreeEntry *,
10786ArrayRef<SmallVector<const TreeEntry *>>) const{
10787// No need to delay the cost estimation during analysis.
10788return std::nullopt;
10789 }
10790voidadd(const TreeEntry &E1,const TreeEntry &E2,ArrayRef<int> Mask) {
10791if (&E1 == &E2) {
10792assert(all_of(Mask,
10793 [&](intIdx) {
10794return Idx < static_cast<int>(E1.getVectorFactor());
10795 }) &&
10796"Expected single vector shuffle mask.");
10797 add(E1, Mask);
10798return;
10799 }
10800if (InVectors.empty()) {
10801 CommonMask.assign(Mask.begin(), Mask.end());
10802 InVectors.assign({&E1, &E2});
10803return;
10804 }
10805assert(!CommonMask.empty() &&"Expected non-empty common mask.");
10806auto *MaskVecTy =getWidenedType(ScalarTy, Mask.size());
10807unsigned NumParts =::getNumberOfParts(TTI, MaskVecTy, Mask.size());
10808unsigned SliceSize =getPartNumElems(Mask.size(), NumParts);
10809constauto *It =
10810find_if(Mask, [](intIdx) {returnIdx !=PoisonMaskElem; });
10811unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10812 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10813 }
10814voidadd(const TreeEntry &E1,ArrayRef<int> Mask) {
10815if (InVectors.empty()) {
10816 CommonMask.assign(Mask.begin(), Mask.end());
10817 InVectors.assign(1, &E1);
10818return;
10819 }
10820assert(!CommonMask.empty() &&"Expected non-empty common mask.");
10821auto *MaskVecTy =getWidenedType(ScalarTy, Mask.size());
10822unsigned NumParts =::getNumberOfParts(TTI, MaskVecTy, Mask.size());
10823unsigned SliceSize =getPartNumElems(Mask.size(), NumParts);
10824constauto *It =
10825find_if(Mask, [](intIdx) {returnIdx !=PoisonMaskElem; });
10826unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10827 estimateNodesPermuteCost(E1,nullptr, Mask, Part, SliceSize);
10828if (!SameNodesEstimated && InVectors.size() == 1)
10829 InVectors.emplace_back(&E1);
10830 }
10831 /// Adds 2 input vectors and the mask for their shuffling.
10832voidadd(Value *V1,Value *V2,ArrayRef<int> Mask) {
10833// May come only for shuffling of 2 vectors with extractelements, already
10834// handled in adjustExtracts.
10835assert(InVectors.size() == 1 &&
10836all_of(enumerate(CommonMask),
10837 [&](autoP) {
10838if (P.value() ==PoisonMaskElem)
10839return Mask[P.index()] ==PoisonMaskElem;
10840auto *EI = cast<ExtractElementInst>(
10841 cast<const TreeEntry *>(InVectors.front())
10842 ->getOrdered(P.index()));
10843return EI->getVectorOperand() == V1 ||
10844 EI->getVectorOperand() == V2;
10845 }) &&
10846"Expected extractelement vectors.");
10847 }
10848 /// Adds another one input vector and the mask for the shuffling.
10849voidadd(Value *V1,ArrayRef<int> Mask,bool ForExtracts =false) {
10850if (InVectors.empty()) {
10851assert(CommonMask.empty() && !ForExtracts &&
10852"Expected empty input mask/vectors.");
10853 CommonMask.assign(Mask.begin(), Mask.end());
10854 InVectors.assign(1, V1);
10855return;
10856 }
10857if (ForExtracts) {
10858// No need to add vectors here, already handled them in adjustExtracts.
10859assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10860 !CommonMask.empty() &&
10861all_of(enumerate(CommonMask),
10862 [&](autoP) {
10863Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10864 ->getOrdered(P.index());
10865if (P.value() ==PoisonMaskElem)
10866returnP.value() == Mask[P.index()] ||
10867 isa<UndefValue>(Scalar);
10868if (isa<Constant>(V1))
10869returntrue;
10870auto *EI = cast<ExtractElementInst>(Scalar);
10871return EI->getVectorOperand() == V1;
10872 }) &&
10873"Expected only tree entry for extractelement vectors.");
10874return;
10875 }
10876assert(!InVectors.empty() && !CommonMask.empty() &&
10877"Expected only tree entries from extracts/reused buildvectors.");
10878unsigned VF = getVF(V1);
10879if (InVectors.size() == 2) {
10880Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10881 transformMaskAfterShuffle(CommonMask, CommonMask);
10882 VF = std::max<unsigned>(VF, CommonMask.size());
10883 }elseif (constauto *InTE =
10884 InVectors.front().dyn_cast<const TreeEntry *>()) {
10885 VF = std::max(VF, InTE->getVectorFactor());
10886 }else {
10887 VF = std::max(
10888 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
10889 ->getNumElements());
10890 }
10891 InVectors.push_back(V1);
10892for (unsignedIdx = 0, Sz = CommonMask.size();Idx < Sz; ++Idx)
10893if (Mask[Idx] !=PoisonMaskElem && CommonMask[Idx] ==PoisonMaskElem)
10894 CommonMask[Idx] = Mask[Idx] + VF;
10895 }
10896Value *gather(ArrayRef<Value *> VL,unsigned MaskVF = 0,
10897Value *Root =nullptr) {
10898Cost += getBuildVectorCost(VL, Root);
10899if (!Root) {
10900// FIXME: Need to find a way to avoid use of getNullValue here.
10901SmallVector<Constant *> Vals;
10902unsigned VF = VL.size();
10903if (MaskVF != 0)
10904 VF = std::min(VF, MaskVF);
10905for (Value *V : VL.take_front(VF)) {
10906if (isa<UndefValue>(V)) {
10907 Vals.push_back(cast<Constant>(V));
10908continue;
10909 }
10910 Vals.push_back(Constant::getNullValue(V->getType()));
10911 }
10912if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
10913assert(SLPReVec &&"FixedVectorType is not expected.");
10914// When REVEC is enabled, we need to expand vector types into scalar
10915// types.
10916unsigned VecTyNumElements = VecTy->getNumElements();
10917SmallVector<Constant *> NewVals(VF * VecTyNumElements,nullptr);
10918for (auto [I, V] :enumerate(Vals)) {
10919Type *ScalarTy = V->getType()->getScalarType();
10920Constant *NewVal;
10921if (isa<PoisonValue>(V))
10922 NewVal =PoisonValue::get(ScalarTy);
10923elseif (isa<UndefValue>(V))
10924 NewVal =UndefValue::get(ScalarTy);
10925else
10926 NewVal =Constant::getNullValue(ScalarTy);
10927 std::fill_n(NewVals.begin() +I * VecTyNumElements, VecTyNumElements,
10928 NewVal);
10929 }
10930 Vals.swap(NewVals);
10931 }
10932returnConstantVector::get(Vals);
10933 }
10934returnConstantVector::getSplat(
10935ElementCount::getFixed(
10936 cast<FixedVectorType>(Root->getType())->getNumElements()),
10937 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
10938 }
10939InstructionCostcreateFreeze(InstructionCostCost) {returnCost; }
10940 /// Finalize emission of the shuffles.
10941InstructionCost
10942finalize(ArrayRef<int> ExtMask,
10943ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10944ArrayRef<int> SubVectorsMask,unsigned VF = 0,
10945function_ref<void(Value *&,SmallVectorImpl<int> &)> Action = {}) {
10946 IsFinalized =true;
10947if (Action) {
10948constPointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10949if (InVectors.size() == 2)
10950Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10951else
10952Cost += createShuffle(Vec,nullptr, CommonMask);
10953 transformMaskAfterShuffle(CommonMask, CommonMask);
10954assert(VF > 0 &&
10955"Expected vector length for the final value before action.");
10956Value *V = cast<Value *>(Vec);
10957 Action(V, CommonMask);
10958 InVectors.front() = V;
10959 }
10960if (!SubVectors.empty()) {
10961constPointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10962if (InVectors.size() == 2)
10963Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10964else
10965Cost += createShuffle(Vec,nullptr, CommonMask);
10966 transformMaskAfterShuffle(CommonMask, CommonMask);
10967// Add subvectors permutation cost.
10968if (!SubVectorsMask.empty()) {
10969assert(SubVectorsMask.size() <= CommonMask.size() &&
10970"Expected same size of masks for subvectors and common mask.");
10971SmallVector<int> SVMask(CommonMask.size(),PoisonMaskElem);
10972copy(SubVectorsMask, SVMask.begin());
10973for (auto [I1, I2] :zip(SVMask, CommonMask)) {
10974if (I2 !=PoisonMaskElem) {
10975assert(I1 ==PoisonMaskElem &&"Expected unused subvectors mask");
10976 I1 = I2 + CommonMask.size();
10977 }
10978 }
10979Cost +=::getShuffleCost(TTI,TTI::SK_PermuteTwoSrc,
10980getWidenedType(ScalarTy, CommonMask.size()),
10981 SVMask,CostKind);
10982 }
10983for (auto [E,Idx] : SubVectors) {
10984Type *EScalarTy = E->Scalars.front()->getType();
10985bool IsSigned =true;
10986if (auto It =R.MinBWs.find(E); It !=R.MinBWs.end()) {
10987 EScalarTy =
10988IntegerType::get(EScalarTy->getContext(), It->second.first);
10989 IsSigned = It->second.second;
10990 }
10991if (ScalarTy != EScalarTy) {
10992unsigned CastOpcode = Instruction::Trunc;
10993unsigned DstSz =R.DL->getTypeSizeInBits(ScalarTy);
10994unsigned SrcSz =R.DL->getTypeSizeInBits(EScalarTy);
10995if (DstSz > SrcSz)
10996 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10997Cost +=TTI.getCastInstrCost(
10998 CastOpcode,getWidenedType(ScalarTy, E->getVectorFactor()),
10999getWidenedType(EScalarTy, E->getVectorFactor()),
11000TTI::CastContextHint::Normal,CostKind);
11001 }
11002Cost +=::getShuffleCost(
11003TTI,TTI::SK_InsertSubvector,
11004getWidenedType(ScalarTy, CommonMask.size()), {},CostKind,Idx,
11005getWidenedType(ScalarTy, E->getVectorFactor()));
11006if (!CommonMask.empty()) {
11007 std::iota(std::next(CommonMask.begin(),Idx),
11008 std::next(CommonMask.begin(),Idx + E->getVectorFactor()),
11009Idx);
11010 }
11011 }
11012 }
11013
11014if (!ExtMask.empty()) {
11015if (CommonMask.empty()) {
11016 CommonMask.assign(ExtMask.begin(), ExtMask.end());
11017 }else {
11018SmallVector<int> NewMask(ExtMask.size(),PoisonMaskElem);
11019for (intI = 0, Sz = ExtMask.size();I < Sz; ++I) {
11020if (ExtMask[I] ==PoisonMaskElem)
11021continue;
11022 NewMask[I] = CommonMask[ExtMask[I]];
11023 }
11024 CommonMask.swap(NewMask);
11025 }
11026 }
11027if (CommonMask.empty()) {
11028assert(InVectors.size() == 1 &&"Expected only one vector with no mask");
11029returnCost;
11030 }
11031returnCost +
11032 createShuffle(InVectors.front(),
11033 InVectors.size() == 2 ? InVectors.back() :nullptr,
11034 CommonMask);
11035 }
11036
11037~ShuffleCostEstimator() {
11038assert((IsFinalized || CommonMask.empty()) &&
11039"Shuffle construction must be finalized.");
11040 }
11041};
11042
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
    return VE;
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}

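/// Returns the cast context hint for the tree entry \p TE, based on how its
/// memory accesses are vectorized: gather/scatter and strided accesses map to
/// GatherScatter, reversed consecutive loads to Reversed, plain consecutive
/// loads to Normal, and everything else to None.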
TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}

/// Builds the argument types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
static SmallVector<Type *>
buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
                       const unsigned VF, unsigned MinBW,
                       const TargetTransformInfo *TTI) {
  SmallVector<Type *> ArgTys;
  for (auto [Idx, Arg] : enumerate(CI->args())) {
    if (ID != Intrinsic::not_intrinsic) {
      if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
        ArgTys.push_back(Arg->getType());
        continue;
      }
      if (MinBW > 0) {
        ArgTys.push_back(
            getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
        continue;
      }
    }
    ArgTys.push_back(getWidenedType(Arg->getType(), VF));
  }
  return ArgTys;
}
11097
11098InstructionCost
11099BoUpSLP::getEntryCost(const TreeEntry *E,ArrayRef<Value *> VectorizedVals,
11100SmallPtrSetImpl<Value *> &CheckedExtracts) {
11101ArrayRef<Value *> VL = E->Scalars;
11102
11103Type *ScalarTy =getValueType(VL[0]);
11104if (!isValidElementType(ScalarTy))
11105returnInstructionCost::getInvalid();
11106TTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
11107
11108// If we have computed a smaller type for the expression, update VecTy so
11109// that the costs will be accurate.
11110auto It = MinBWs.find(E);
11111Type *OrigScalarTy = ScalarTy;
11112if (It != MinBWs.end()) {
11113auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11114 ScalarTy =IntegerType::get(F->getContext(), It->second.first);
11115if (VecTy)
11116 ScalarTy =getWidenedType(ScalarTy, VecTy->getNumElements());
11117 }
11118auto *VecTy =getWidenedType(ScalarTy, VL.size());
11119unsigned EntryVF = E->getVectorFactor();
11120auto *FinalVecTy =getWidenedType(ScalarTy, EntryVF);
11121
11122if (E->isGather()) {
11123if (allConstant(VL))
11124return 0;
11125if (isa<InsertElementInst>(VL[0]))
11126returnInstructionCost::getInvalid();
11127if (isa<CmpInst>(VL.front()))
11128 ScalarTy = VL.front()->getType();
11129return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11130 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
11131 }
11132InstructionCost CommonCost = 0;
11133SmallVector<int>Mask;
11134if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11135 !isReverseOrder(E->ReorderIndices))) {
11136SmallVector<int> NewMask;
11137if (E->getOpcode() == Instruction::Store) {
11138// For stores the order is actually a mask.
11139 NewMask.resize(E->ReorderIndices.size());
11140copy(E->ReorderIndices, NewMask.begin());
11141 }else {
11142inversePermutation(E->ReorderIndices, NewMask);
11143 }
11144::addMask(Mask, NewMask);
11145 }
11146if (!E->ReuseShuffleIndices.empty())
11147::addMask(Mask, E->ReuseShuffleIndices);
11148if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask,Mask.size()))
11149 CommonCost =
11150::getShuffleCost(*TTI,TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
11151assert((E->State == TreeEntry::Vectorize ||
11152 E->State == TreeEntry::ScatterVectorize ||
11153 E->State == TreeEntry::StridedVectorize) &&
11154"Unhandled state");
11155assert(E->getOpcode() &&
11156 ((allSameType(VL) &&allSameBlock(VL)) ||
11157 (E->getOpcode() == Instruction::GetElementPtr &&
11158 E->getMainOp()->getType()->isPointerTy())) &&
11159"Invalid VL");
11160Instruction *VL0 = E->getMainOp();
11161unsigned ShuffleOrOp =
11162 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11163if (E->CombinedOp != TreeEntry::NotCombinedOp)
11164 ShuffleOrOp = E->CombinedOp;
11165SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
11166constunsigned Sz = UniqueValues.size();
11167SmallBitVector UsedScalars(Sz,false);
11168for (unsignedI = 0;I < Sz; ++I) {
11169if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
11170continue;
11171 UsedScalars.set(I);
11172 }
11173auto GetCastContextHint = [&](Value *V) {
11174if (const TreeEntry *OpTE = getTreeEntry(V))
11175return getCastContextHint(*OpTE);
11176 InstructionsState SrcState =getSameOpcode(E->getOperand(0), *TLI);
11177if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11178 !SrcState.isAltShuffle())
11179returnTTI::CastContextHint::GatherScatter;
11180returnTTI::CastContextHint::None;
11181 };
11182auto GetCostDiff =
11183 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
11184function_ref<InstructionCost(InstructionCost)> VectorCost) {
11185// Calculate the cost of this instruction.
11186InstructionCost ScalarCost = 0;
11187if (isa<CastInst, CallInst>(VL0)) {
11188// For some of the instructions no need to calculate cost for each
11189// particular instruction, we can use the cost of the single
11190// instruction x total number of scalar instructions.
11191 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11192 }else {
11193for (unsignedI = 0;I < Sz; ++I) {
11194if (UsedScalars.test(I))
11195continue;
11196 ScalarCost += ScalarEltCost(I);
11197 }
11198 }
11199
11200InstructionCost VecCost = VectorCost(CommonCost);
11201// Check if the current node must be resized, if the parent node is not
11202// resized.
11203if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
11204 E->Idx != 0 &&
11205 (E->getOpcode() != Instruction::Load ||
11206 !E->UserTreeIndices.empty())) {
11207const EdgeInfo &EI =
11208 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
11209 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11210 });
11211if (EI.UserTE->getOpcode() != Instruction::Select ||
11212 EI.EdgeIdx != 0) {
11213auto UserBWIt = MinBWs.find(EI.UserTE);
11214Type *UserScalarTy =
11215 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11216if (UserBWIt != MinBWs.end())
11217 UserScalarTy =IntegerType::get(ScalarTy->getContext(),
11218 UserBWIt->second.first);
11219if (ScalarTy != UserScalarTy) {
11220unsigned BWSz =DL->getTypeSizeInBits(ScalarTy);
11221unsigned SrcBWSz =DL->getTypeSizeInBits(UserScalarTy);
11222unsigned VecOpcode;
11223auto *UserVecTy =getWidenedType(UserScalarTy, E->Scalars.size());
11224if (BWSz > SrcBWSz)
11225 VecOpcode = Instruction::Trunc;
11226else
11227 VecOpcode =
11228 It->second.second ? Instruction::SExt : Instruction::ZExt;
11229TTI::CastContextHint CCH = GetCastContextHint(VL0);
11230 VecCost +=TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
11231CostKind);
11232 }
11233 }
11234 }
11235LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11236 ScalarCost,"Calculated costs for Tree"));
11237return VecCost - ScalarCost;
11238 };
11239// Calculate cost difference from vectorizing set of GEPs.
11240// Negative value means vectorizing is profitable.
11241auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs,Value *BasePtr) {
11242assert((E->State == TreeEntry::Vectorize ||
11243 E->State == TreeEntry::StridedVectorize) &&
11244"Entry state expected to be Vectorize or StridedVectorize here.");
11245InstructionCost ScalarCost = 0;
11246InstructionCost VecCost = 0;
11247 std::tie(ScalarCost, VecCost) =getGEPCosts(
11248 *TTI, Ptrs, BasePtr, E->getOpcode(),CostKind, OrigScalarTy, VecTy);
11249LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11250"Calculated GEPs cost for Tree"));
11251
11252return VecCost - ScalarCost;
11253 };
11254
11255auto GetMinMaxCost = [&](Type *Ty,Instruction *VI =nullptr) {
11256auto [MinMaxID, SelectOnly] =canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
11257if (MinMaxID ==Intrinsic::not_intrinsic)
11258returnInstructionCost::getInvalid();
11259Type *CanonicalType = Ty;
11260if (CanonicalType->isPtrOrPtrVectorTy())
11261 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
11262 CanonicalType->getContext(),
11263DL->getTypeSizeInBits(CanonicalType->getScalarType())));
11264
11265IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
11266 {CanonicalType, CanonicalType});
11267InstructionCost IntrinsicCost =
11268TTI->getIntrinsicInstrCost(CostAttrs,CostKind);
11269// If the selects are the only uses of the compares, they will be
11270// dead and we can adjust the cost by removing their cost.
11271if (VI && SelectOnly) {
11272assert((!Ty->isVectorTy() ||SLPReVec) &&
11273"Expected only for scalar type.");
11274auto *CI = cast<CmpInst>(VI->getOperand(0));
11275 IntrinsicCost -=TTI->getCmpSelInstrCost(
11276 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11277CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11278 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11279 }
11280return IntrinsicCost;
11281 };
11282switch (ShuffleOrOp) {
11283case Instruction::PHI: {
11284// Count reused scalars.
11285InstructionCost ScalarCost = 0;
11286SmallPtrSet<const TreeEntry *, 4> CountedOps;
11287for (Value *V : UniqueValues) {
11288auto *PHI = dyn_cast<PHINode>(V);
11289if (!PHI)
11290continue;
11291
11292ValueListOperands(PHI->getNumIncomingValues(),nullptr);
11293for (unsignedI = 0,N =PHI->getNumIncomingValues();I <N; ++I) {
11294Value *Op =PHI->getIncomingValue(I);
11295Operands[I] =Op;
11296 }
11297if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
11298if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
11299if (!OpTE->ReuseShuffleIndices.empty())
11300 ScalarCost +=TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11301 OpTE->Scalars.size());
11302 }
11303
11304return CommonCost - ScalarCost;
11305 }
11306case Instruction::ExtractValue:
11307case Instruction::ExtractElement: {
11308auto GetScalarCost = [&](unsignedIdx) {
11309if (isa<PoisonValue>(UniqueValues[Idx]))
11310returnInstructionCost(TTI::TCC_Free);
11311
11312auto *I = cast<Instruction>(UniqueValues[Idx]);
11313VectorType *SrcVecTy;
11314if (ShuffleOrOp == Instruction::ExtractElement) {
11315auto *EE = cast<ExtractElementInst>(I);
11316 SrcVecTy = EE->getVectorOperandType();
11317 }else {
11318auto *EV = cast<ExtractValueInst>(I);
11319Type *AggregateTy = EV->getAggregateOperand()->getType();
11320unsigned NumElts;
11321if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11322 NumElts = ATy->getNumElements();
11323else
11324 NumElts = AggregateTy->getStructNumElements();
11325 SrcVecTy =getWidenedType(OrigScalarTy, NumElts);
11326 }
11327if (I->hasOneUse()) {
11328Instruction *Ext =I->user_back();
11329if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11330all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
11331// Use getExtractWithExtendCost() to calculate the cost of
11332// extractelement/ext pair.
11333InstructionCostCost =TTI->getExtractWithExtendCost(
11334Ext->getOpcode(),Ext->getType(), SrcVecTy, *getExtractIndex(I));
11335// Subtract the cost of s|zext which is subtracted separately.
11336Cost -=TTI->getCastInstrCost(
11337Ext->getOpcode(),Ext->getType(),I->getType(),
11338TTI::getCastContextHint(Ext),CostKind, Ext);
11339returnCost;
11340 }
11341 }
11342returnTTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
11343CostKind, *getExtractIndex(I));
11344 };
11345auto GetVectorCost = [](InstructionCost CommonCost) {return CommonCost; };
11346return GetCostDiff(GetScalarCost, GetVectorCost);
11347 }
11348case Instruction::InsertElement: {
11349assert(E->ReuseShuffleIndices.empty() &&
11350"Unique insertelements only are expected.");
11351auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
11352unsignedconst NumElts = SrcVecTy->getNumElements();
11353unsignedconst NumScalars = VL.size();
11354
11355unsigned NumOfParts =::getNumberOfParts(*TTI, SrcVecTy);
11356
11357SmallVector<int> InsertMask(NumElts,PoisonMaskElem);
11358unsigned OffsetBeg = *getElementIndex(VL.front());
11359unsigned OffsetEnd = OffsetBeg;
11360 InsertMask[OffsetBeg] = 0;
11361for (auto [I, V] :enumerate(VL.drop_front())) {
11362unsignedIdx = *getElementIndex(V);
11363if (OffsetBeg >Idx)
11364 OffsetBeg =Idx;
11365elseif (OffsetEnd <Idx)
11366 OffsetEnd =Idx;
11367 InsertMask[Idx] =I + 1;
11368 }
11369unsigned VecScalarsSz =PowerOf2Ceil(NumElts);
11370if (NumOfParts > 0 && NumOfParts < NumElts)
11371 VecScalarsSz =PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11372unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11373 VecScalarsSz;
11374unsignedOffset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11375unsigned InsertVecSz = std::min<unsigned>(
11376PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
11377 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11378bool IsWholeSubvector =
11379 OffsetBeg ==Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11380// Check if we can safely insert a subvector. If it is not possible, just
11381// generate a whole-sized vector and shuffle the source vector and the new
11382// subvector.
11383if (OffsetBeg + InsertVecSz > VecSz) {
11384// Align OffsetBeg to generate correct mask.
11385 OffsetBeg =alignDown(OffsetBeg, VecSz,Offset);
11386 InsertVecSz = VecSz;
11387 }
11388
11389APInt DemandedElts =APInt::getZero(NumElts);
11390// TODO: Add support for Instruction::InsertValue.
11391SmallVector<int>Mask;
11392if (!E->ReorderIndices.empty()) {
11393inversePermutation(E->ReorderIndices, Mask);
11394Mask.append(InsertVecSz -Mask.size(),PoisonMaskElem);
11395 }else {
11396Mask.assign(VecSz,PoisonMaskElem);
11397 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
11398 }
11399bool IsIdentity =true;
11400SmallVector<int> PrevMask(InsertVecSz,PoisonMaskElem);
11401Mask.swap(PrevMask);
11402for (unsignedI = 0;I < NumScalars; ++I) {
11403unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
11404 DemandedElts.setBit(InsertIdx);
11405 IsIdentity &= InsertIdx - OffsetBeg ==I;
11406Mask[InsertIdx - OffsetBeg] =I;
11407 }
11408assert(Offset < NumElts &&"Failed to find vector index offset");
11409
11410InstructionCostCost = 0;
11411Cost -=TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
11412/*Insert*/true,/*Extract*/false,
11413CostKind);
11414
11415// First cost - resize to actual vector size if not identity shuffle or
11416// need to shift the vector.
11417// Do not calculate the cost if the actual size is the register size and
11418// we can merge this shuffle with the following SK_Select.
11419auto *InsertVecTy =getWidenedType(ScalarTy, InsertVecSz);
11420if (!IsIdentity)
11421Cost +=::getShuffleCost(*TTI,TargetTransformInfo::SK_PermuteSingleSrc,
11422 InsertVecTy, Mask);
11423auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11424 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11425 }));
11426// Second cost - permutation with subvector, if some elements are from the
11427// initial vector or inserting a subvector.
11428// TODO: Implement the analysis of the FirstInsert->getOperand(0)
11429// subvector of ActualVecTy.
11430SmallBitVector InMask =
11431isUndefVector(FirstInsert->getOperand(0),
11432buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11433if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
11434if (InsertVecSz != VecSz) {
11435auto *ActualVecTy =getWidenedType(ScalarTy, VecSz);
11436Cost +=::getShuffleCost(*TTI,TTI::SK_InsertSubvector, ActualVecTy, {},
11437CostKind, OffsetBeg -Offset, InsertVecTy);
11438 }else {
11439for (unsignedI = 0,End = OffsetBeg -Offset;I <End; ++I)
11440 Mask[I] = InMask.test(I) ?PoisonMaskElem :I;
11441for (unsignedI = OffsetBeg -Offset,End = OffsetEnd -Offset;
11442I <=End; ++I)
11443if (Mask[I] !=PoisonMaskElem)
11444Mask[I] =I + VecSz;
11445for (unsignedI = OffsetEnd + 1 -Offset;I < VecSz; ++I)
11446 Mask[I] =
11447 ((I >= InMask.size()) || InMask.test(I)) ?PoisonMaskElem :I;
11448Cost +=
11449::getShuffleCost(*TTI,TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
11450 }
11451 }
11452returnCost;
11453 }
11454case Instruction::ZExt:
11455case Instruction::SExt:
11456case Instruction::FPToUI:
11457case Instruction::FPToSI:
11458case Instruction::FPExt:
11459case Instruction::PtrToInt:
11460case Instruction::IntToPtr:
11461case Instruction::SIToFP:
11462case Instruction::UIToFP:
11463case Instruction::Trunc:
11464case Instruction::FPTrunc:
11465case Instruction::BitCast: {
11466auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11467Type *SrcScalarTy = VL0->getOperand(0)->getType();
11468auto *SrcVecTy =getWidenedType(SrcScalarTy, VL.size());
11469unsigned Opcode = ShuffleOrOp;
11470unsigned VecOpcode = Opcode;
11471if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
11472 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
11473// Check if the values are candidates to demote.
11474unsigned SrcBWSz =DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
11475if (SrcIt != MinBWs.end()) {
11476 SrcBWSz = SrcIt->second.first;
11477unsigned SrcScalarTyNumElements =getNumElements(SrcScalarTy);
11478 SrcScalarTy =IntegerType::get(F->getContext(), SrcBWSz);
11479 SrcVecTy =
11480getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
11481 }
11482unsigned BWSz =DL->getTypeSizeInBits(ScalarTy->getScalarType());
11483if (BWSz == SrcBWSz) {
11484 VecOpcode = Instruction::BitCast;
11485 }elseif (BWSz < SrcBWSz) {
11486 VecOpcode = Instruction::Trunc;
11487 }elseif (It != MinBWs.end()) {
11488assert(BWSz > SrcBWSz &&"Invalid cast!");
11489 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11490 }elseif (SrcIt != MinBWs.end()) {
11491assert(BWSz > SrcBWSz &&"Invalid cast!");
11492 VecOpcode =
11493 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11494 }
11495 }elseif (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
11496 !SrcIt->second.second) {
11497 VecOpcode = Instruction::UIToFP;
11498 }
11499auto GetScalarCost = [&](unsignedIdx) ->InstructionCost {
11500assert(Idx == 0 &&"Expected 0 index only");
11501returnTTI->getCastInstrCost(Opcode, VL0->getType(),
11502 VL0->getOperand(0)->getType(),
11503TTI::getCastContextHint(VL0),CostKind, VL0);
11504 };
11505auto GetVectorCost = [=](InstructionCost CommonCost) {
11506// Do not count cost here if minimum bitwidth is in effect and it is just
11507// a bitcast (here it is just a noop).
11508if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11509return CommonCost;
11510auto *VI = VL0->getOpcode() == Opcode ? VL0 :nullptr;
11511TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
11512
11513bool IsArithmeticExtendedReduction =
11514 E->Idx == 0 && UserIgnoreList &&
11515all_of(*UserIgnoreList, [](Value *V) {
11516auto *I = cast<Instruction>(V);
11517returnis_contained({Instruction::Add, Instruction::FAdd,
11518 Instruction::Mul, Instruction::FMul,
11519 Instruction::And, Instruction::Or,
11520 Instruction::Xor},
11521I->getOpcode());
11522 });
11523if (IsArithmeticExtendedReduction &&
11524 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11525return CommonCost;
11526return CommonCost +
11527TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH,CostKind,
11528 VecOpcode == Opcode ? VI :nullptr);
11529 };
11530return GetCostDiff(GetScalarCost, GetVectorCost);
11531 }
11532case Instruction::FCmp:
11533case Instruction::ICmp:
11534case Instruction::Select: {
11535CmpPredicate VecPred, SwappedVecPred;
11536auto MatchCmp =m_Cmp(VecPred,m_Value(),m_Value());
11537if (match(VL0,m_Select(MatchCmp,m_Value(),m_Value())) ||
11538match(VL0, MatchCmp))
11539 SwappedVecPred =CmpInst::getSwappedPredicate(VecPred);
11540else
11541 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
11542 ?CmpInst::BAD_FCMP_PREDICATE
11543 :CmpInst::BAD_ICMP_PREDICATE;
11544auto GetScalarCost = [&](unsignedIdx) {
11545if (isa<PoisonValue>(UniqueValues[Idx]))
11546returnInstructionCost(TTI::TCC_Free);
11547
11548auto *VI = cast<Instruction>(UniqueValues[Idx]);
11549CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
11550 ?CmpInst::BAD_FCMP_PREDICATE
11551 :CmpInst::BAD_ICMP_PREDICATE;
11552auto MatchCmp =m_Cmp(CurrentPred,m_Value(),m_Value());
11553if ((!match(VI,m_Select(MatchCmp,m_Value(),m_Value())) &&
11554 !match(VI, MatchCmp)) ||
11555 (CurrentPred !=static_cast<CmpInst::Predicate>(VecPred) &&
11556 CurrentPred !=static_cast<CmpInst::Predicate>(SwappedVecPred)))
11557 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
11558 ?CmpInst::BAD_FCMP_PREDICATE
11559 :CmpInst::BAD_ICMP_PREDICATE;
11560
11561InstructionCost ScalarCost =TTI->getCmpSelInstrCost(
11562 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11563CostKind, getOperandInfo(VI->getOperand(0)),
11564 getOperandInfo(VI->getOperand(1)), VI);
11565InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
11566if (IntrinsicCost.isValid())
11567 ScalarCost = IntrinsicCost;
11568
11569return ScalarCost;
11570 };
11571auto GetVectorCost = [&](InstructionCost CommonCost) {
11572auto *MaskTy =getWidenedType(Builder.getInt1Ty(), VL.size());
11573
11574InstructionCost VecCost =
11575TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
11576CostKind, getOperandInfo(E->getOperand(0)),
11577 getOperandInfo(E->getOperand(1)), VL0);
11578if (auto *SI = dyn_cast<SelectInst>(VL0)) {
11579auto *CondType =
11580getWidenedType(SI->getCondition()->getType(), VL.size());
11581unsigned CondNumElements = CondType->getNumElements();
11582unsigned VecTyNumElements =getNumElements(VecTy);
11583assert(VecTyNumElements >= CondNumElements &&
11584 VecTyNumElements % CondNumElements == 0 &&
11585"Cannot vectorize Instruction::Select");
11586if (CondNumElements != VecTyNumElements) {
11587// When the return type is i1 but the source is fixed vector type, we
11588// need to duplicate the condition value.
11589 VecCost +=::getShuffleCost(
11590 *TTI,TTI::SK_PermuteSingleSrc, CondType,
11591createReplicatedMask(VecTyNumElements / CondNumElements,
11592 CondNumElements));
11593 }
11594 }
11595return VecCost + CommonCost;
11596 };
11597return GetCostDiff(GetScalarCost, GetVectorCost);
11598 }
11599case TreeEntry::MinMax: {
11600auto GetScalarCost = [&](unsignedIdx) {
11601return GetMinMaxCost(OrigScalarTy);
11602 };
11603auto GetVectorCost = [&](InstructionCost CommonCost) {
11604InstructionCost VecCost = GetMinMaxCost(VecTy);
11605return VecCost + CommonCost;
11606 };
11607return GetCostDiff(GetScalarCost, GetVectorCost);
11608 }
11609case Instruction::FNeg:
11610case Instruction::Add:
11611case Instruction::FAdd:
11612case Instruction::Sub:
11613case Instruction::FSub:
11614case Instruction::Mul:
11615case Instruction::FMul:
11616case Instruction::UDiv:
11617case Instruction::SDiv:
11618case Instruction::FDiv:
11619case Instruction::URem:
11620case Instruction::SRem:
11621case Instruction::FRem:
11622case Instruction::Shl:
11623case Instruction::LShr:
11624case Instruction::AShr:
11625case Instruction::And:
11626case Instruction::Or:
11627case Instruction::Xor: {
11628auto GetScalarCost = [&](unsignedIdx) {
11629if (isa<PoisonValue>(UniqueValues[Idx]))
11630returnInstructionCost(TTI::TCC_Free);
11631
11632auto *VI = cast<Instruction>(UniqueValues[Idx]);
11633unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11634TTI::OperandValueInfo Op1Info =TTI::getOperandInfo(VI->getOperand(0));
11635TTI::OperandValueInfo Op2Info =
11636TTI::getOperandInfo(VI->getOperand(OpIdx));
11637SmallVector<const Value *>Operands(VI->operand_values());
11638returnTTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy,CostKind,
11639 Op1Info, Op2Info,Operands, VI);
11640 };
11641auto GetVectorCost = [=](InstructionCost CommonCost) {
11642if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
11643for (unsignedI : seq<unsigned>(0, E->getNumOperands())) {
11644ArrayRef<Value *> Ops = E->getOperand(I);
11645if (all_of(Ops, [&](Value *Op) {
11646auto *CI = dyn_cast<ConstantInt>(Op);
11647return CI && CI->getValue().countr_one() >= It->second.first;
11648 }))
11649return CommonCost;
11650 }
11651 }
11652unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11653TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
11654TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
11655returnTTI->getArithmeticInstrCost(ShuffleOrOp, VecTy,CostKind, Op1Info,
11656 Op2Info, {},nullptr, TLI) +
11657 CommonCost;
11658 };
11659return GetCostDiff(GetScalarCost, GetVectorCost);
11660 }
11661case Instruction::GetElementPtr: {
11662return CommonCost + GetGEPCostDiff(VL, VL0);
11663 }
11664case Instruction::Load: {
11665auto GetScalarCost = [&](unsignedIdx) {
11666auto *VI = cast<LoadInst>(UniqueValues[Idx]);
11667returnTTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
11668VI->getAlign(),VI->getPointerAddressSpace(),
11669CostKind,TTI::OperandValueInfo(), VI);
11670 };
11671auto *LI0 = cast<LoadInst>(VL0);
11672auto GetVectorCost = [&](InstructionCost CommonCost) {
11673InstructionCost VecLdCost;
11674switch (E->State) {
11675case TreeEntry::Vectorize:
11676if (unsigned Factor = E->getInterleaveFactor()) {
11677 VecLdCost =TTI->getInterleavedMemoryOpCost(
11678 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11679 LI0->getPointerAddressSpace(),CostKind);
11680
11681 }else {
11682 VecLdCost =TTI->getMemoryOpCost(
11683 Instruction::Load, VecTy, LI0->getAlign(),
11684 LI0->getPointerAddressSpace(),CostKind,TTI::OperandValueInfo());
11685 }
11686break;
11687case TreeEntry::StridedVectorize: {
11688Align CommonAlignment =
11689 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11690 VecLdCost =TTI->getStridedMemoryOpCost(
11691 Instruction::Load, VecTy, LI0->getPointerOperand(),
11692/*VariableMask=*/false, CommonAlignment,CostKind);
11693break;
11694 }
11695case TreeEntry::ScatterVectorize: {
11696Align CommonAlignment =
11697 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11698 VecLdCost =TTI->getGatherScatterOpCost(
11699 Instruction::Load, VecTy, LI0->getPointerOperand(),
11700/*VariableMask=*/false, CommonAlignment,CostKind);
11701break;
11702 }
11703case TreeEntry::CombinedVectorize:
11704case TreeEntry::NeedToGather:
11705llvm_unreachable("Unexpected vectorization state.");
11706 }
11707return VecLdCost + CommonCost;
11708 };
11709
11710InstructionCostCost = GetCostDiff(GetScalarCost, GetVectorCost);
11711// If this node generates a masked gather load, it is not a terminal node.
11712// Hence the address operand cost is estimated separately.
11713if (E->State == TreeEntry::ScatterVectorize)
11714returnCost;
11715
11716// Estimate the cost of the GEPs, since this tree node is a terminal node.
11717SmallVector<Value *> PointerOps(VL.size());
11718for (auto [I, V] :enumerate(VL))
11719 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
11720returnCost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11721 }
11722case Instruction::Store: {
11723bool IsReorder = !E->ReorderIndices.empty();
11724auto GetScalarCost = [=](unsignedIdx) {
11725auto *VI = cast<StoreInst>(VL[Idx]);
11726TTI::OperandValueInfo OpInfo =TTI::getOperandInfo(VI->getValueOperand());
11727returnTTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
11728VI->getAlign(),VI->getPointerAddressSpace(),
11729CostKind, OpInfo, VI);
11730 };
11731auto *BaseSI =
11732 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11733auto GetVectorCost = [=](InstructionCost CommonCost) {
11734// We know that we can merge the stores. Calculate the cost.
11735InstructionCost VecStCost;
11736if (E->State == TreeEntry::StridedVectorize) {
11737Align CommonAlignment =
11738 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11739 VecStCost =TTI->getStridedMemoryOpCost(
11740 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11741/*VariableMask=*/false, CommonAlignment,CostKind);
11742 }else {
11743assert(E->State == TreeEntry::Vectorize &&
11744"Expected either strided or consecutive stores.");
11745if (unsigned Factor = E->getInterleaveFactor()) {
11746assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11747"No reused shuffles expected");
11748 CommonCost = 0;
11749 VecStCost =TTI->getInterleavedMemoryOpCost(
11750 Instruction::Store, VecTy, Factor, std::nullopt,
11751 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(),CostKind);
11752 }else {
11753TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
11754 VecStCost =TTI->getMemoryOpCost(
11755 Instruction::Store, VecTy, BaseSI->getAlign(),
11756 BaseSI->getPointerAddressSpace(),CostKind, OpInfo);
11757 }
11758 }
11759return VecStCost + CommonCost;
11760 };
11761SmallVector<Value *> PointerOps(VL.size());
11762for (auto [I, V] :enumerate(VL)) {
11763unsignedIdx = IsReorder ? E->ReorderIndices[I] :I;
11764 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
11765 }
11766
11767return GetCostDiff(GetScalarCost, GetVectorCost) +
11768 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11769 }
11770case Instruction::Call: {
11771auto GetScalarCost = [&](unsignedIdx) {
11772auto *CI = cast<CallInst>(UniqueValues[Idx]);
11773Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
11774if (ID !=Intrinsic::not_intrinsic) {
11775IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
11776returnTTI->getIntrinsicInstrCost(CostAttrs,CostKind);
11777 }
11778returnTTI->getCallInstrCost(CI->getCalledFunction(),
11779 CI->getFunctionType()->getReturnType(),
11780 CI->getFunctionType()->params(),CostKind);
11781 };
11782auto GetVectorCost = [=](InstructionCost CommonCost) {
11783auto *CI = cast<CallInst>(VL0);
11784Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
11785SmallVector<Type *> ArgTys =buildIntrinsicArgTypes(
11786 CI,ID, VecTy->getNumElements(),
11787 It != MinBWs.end() ? It->second.first : 0,TTI);
11788auto VecCallCosts =getVectorCallCosts(CI, VecTy,TTI, TLI, ArgTys);
11789return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11790 };
11791return GetCostDiff(GetScalarCost, GetVectorCost);
11792 }
11793case Instruction::ShuffleVector: {
11794if (!SLPReVec || E->isAltShuffle())
11795assert(E->isAltShuffle() &&
11796 ((Instruction::isBinaryOp(E->getOpcode()) &&
11797Instruction::isBinaryOp(E->getAltOpcode())) ||
11798 (Instruction::isCast(E->getOpcode()) &&
11799Instruction::isCast(E->getAltOpcode())) ||
11800 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11801"Invalid Shuffle Vector Operand");
11802// Try to find the previous shuffle node with the same operands and same
11803// main/alternate ops.
11804auto TryFindNodeWithEqualOperands = [=]() {
11805for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11806if (TE.get() == E)
11807break;
11808if (TE->hasState() &&TE->isAltShuffle() &&
11809 ((TE->getOpcode() == E->getOpcode() &&
11810TE->getAltOpcode() == E->getAltOpcode()) ||
11811 (TE->getOpcode() == E->getAltOpcode() &&
11812TE->getAltOpcode() == E->getOpcode())) &&
11813TE->hasEqualOperands(*E))
11814returntrue;
11815 }
11816returnfalse;
11817 };
11818auto GetScalarCost = [&](unsignedIdx) {
11819if (isa<PoisonValue>(UniqueValues[Idx]))
11820returnInstructionCost(TTI::TCC_Free);
11821
11822auto *VI = cast<Instruction>(UniqueValues[Idx]);
11823assert(E->isOpcodeOrAlt(VI) &&"Unexpected main/alternate opcode");
11824 (void)E;
11825returnTTI->getInstructionCost(VI,CostKind);
11826 };
11827// Need to clear CommonCost since the final shuffle cost is already included
11828// in the vector cost.
11829auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
11830// VecCost is equal to sum of the cost of creating 2 vectors
11831// and the cost of creating shuffle.
11832InstructionCost VecCost = 0;
11833if (TryFindNodeWithEqualOperands()) {
11834LLVM_DEBUG({
11835dbgs() <<"SLP: diamond match for alternate node found.\n";
11836 E->dump();
11837 });
11838// No need to add new vector costs here since we're going to reuse the
11839// same main/alternate vector ops, just do different shuffling.
11840 }elseif (Instruction::isBinaryOp(E->getOpcode())) {
11841 VecCost =
11842 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy,CostKind);
11843 VecCost +=
11844 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy,CostKind);
11845 }elseif (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11846auto *MaskTy =getWidenedType(Builder.getInt1Ty(), VL.size());
11847 VecCost = TTIRef.getCmpSelInstrCost(
11848 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(),CostKind,
11849 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11850 VL0);
11851 VecCost += TTIRef.getCmpSelInstrCost(
11852 E->getOpcode(), VecTy, MaskTy,
11853 cast<CmpInst>(E->getAltOp())->getPredicate(),CostKind,
11854 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11855 E->getAltOp());
11856 }else {
11857Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11858auto *SrcTy =getWidenedType(SrcSclTy, VL.size());
11859if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
11860auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11861unsigned BWSz =DL->getTypeSizeInBits(ScalarTy);
11862unsigned SrcBWSz =
11863DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11864if (SrcIt != MinBWs.end()) {
11865 SrcBWSz = SrcIt->second.first;
11866 SrcSclTy =IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
11867 SrcTy =getWidenedType(SrcSclTy, VL.size());
11868 }
11869if (BWSz <= SrcBWSz) {
11870if (BWSz < SrcBWSz)
11871 VecCost =
11872 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11873TTI::CastContextHint::None,CostKind);
11874LLVM_DEBUG({
11875dbgs()
11876 <<"SLP: alternate extension, which should be truncated.\n";
11877 E->dump();
11878 });
11879return VecCost;
11880 }
11881 }
11882 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11883TTI::CastContextHint::None,CostKind);
11884 VecCost +=
11885 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11886TTI::CastContextHint::None,CostKind);
11887 }
11888SmallVector<int>Mask;
11889 E->buildAltOpShuffleMask(
11890 [&](Instruction *I) {
11891assert(E->isOpcodeOrAlt(I) &&"Unexpected main/alternate opcode");
11892returnisAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
11893 *TLI);
11894 },
11895Mask);
11896 VecCost +=::getShuffleCost(TTIRef,TargetTransformInfo::SK_PermuteTwoSrc,
11897 FinalVecTy, Mask,CostKind);
11898// Patterns like [fadd,fsub] can be combined into a single instruction
11899// on x86. Reordering them into [fsub,fadd] blocks this pattern, so we
11900// need to take their order into account when looking for the most used
11901// order.
11902unsigned Opcode0 = E->getOpcode();
11903unsigned Opcode1 = E->getAltOpcode();
11904SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
11905// If this pattern is supported by the target then we consider the
11906// order.
11907if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11908InstructionCost AltVecCost = TTIRef.getAltInstrCost(
11909 VecTy, Opcode0, Opcode1, OpcodeMask,CostKind);
11910return AltVecCost < VecCost ? AltVecCost : VecCost;
11911 }
11912// TODO: Check the reverse order too.
11913return VecCost;
11914 };
11915if (SLPReVec && !E->isAltShuffle())
11916return GetCostDiff(
11917 GetScalarCost, [&](InstructionCost) ->InstructionCost {
11918// If a group uses its mask in order, the shufflevector can be
11919// eliminated by instcombine; then the cost is 0.
11920assert(isa<ShuffleVectorInst>(VL.front()) &&
11921"Not supported shufflevector usage.");
11922auto *SV = cast<ShuffleVectorInst>(VL.front());
11923unsigned SVNumElements =
11924 cast<FixedVectorType>(SV->getOperand(0)->getType())
11925 ->getNumElements();
11926unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11927for (size_tI = 0,End = VL.size();I !=End;I += GroupSize) {
11928ArrayRef<Value *> Group = VL.slice(I, GroupSize);
11929int NextIndex = 0;
11930if (!all_of(Group, [&](Value *V) {
11931assert(isa<ShuffleVectorInst>(V) &&
11932"Not supported shufflevector usage.");
11933auto *SV = cast<ShuffleVectorInst>(V);
11934intIndex;
11935 [[maybe_unused]]bool IsExtractSubvectorMask =
11936 SV->isExtractSubvectorMask(Index);
11937assert(IsExtractSubvectorMask &&
11938"Not supported shufflevector usage.");
11939if (NextIndex != Index)
11940returnfalse;
11941 NextIndex += SV->getShuffleMask().size();
11942returntrue;
11943 }))
11944 return ::getShuffleCost(
11945 *TTI,TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
11946calculateShufflevectorMask(E->Scalars));
11947 }
11948returnTTI::TCC_Free;
11949 });
11950return GetCostDiff(GetScalarCost, GetVectorCost);
11951 }
11952case Instruction::Freeze:
11953return CommonCost;
11954default:
11955llvm_unreachable("Unknown instruction");
11956 }
11957}
11958
11959bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const{
11960LLVM_DEBUG(dbgs() <<"SLP: Check whether the tree with height "
11961 << VectorizableTree.size() << " is fully vectorizable.\n");
11962
11963auto &&AreVectorizableGathers = [this](const TreeEntry *TE,unsigned Limit) {
11964SmallVector<int>Mask;
11965returnTE->isGather() &&
11966 !any_of(TE->Scalars,
11967 [this](Value *V) { return EphValues.contains(V); }) &&
11968 (allConstant(TE->Scalars) ||isSplat(TE->Scalars) ||
11969TE->Scalars.size() < Limit ||
11970 (((TE->hasState() &&
11971TE->getOpcode() == Instruction::ExtractElement) ||
11972all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11973isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
11974 (TE->hasState() &&TE->getOpcode() == Instruction::Load &&
11975 !TE->isAltShuffle()) ||
11976any_of(TE->Scalars, IsaPred<LoadInst>));
11977 };
11978
11979// We only handle trees of heights 1 and 2.
11980if (VectorizableTree.size() == 1 &&
11981 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11982 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11983 (ForReduction &&
11984 AreVectorizableGathers(VectorizableTree[0].get(),
11985 VectorizableTree[0]->Scalars.size()) &&
11986 VectorizableTree[0]->getVectorFactor() > 2)))
11987returntrue;
11988
11989if (VectorizableTree.size() != 2)
11990returnfalse;
11991
11992// Handle splat and all-constant stores. Also try to vectorize tiny trees
11993// whose second node is a gather with fewer scalar operands than the initial
11994// tree element (it may be profitable to shuffle the second gather), or whose
11995// scalars are extractelements that form a shuffle.
11996SmallVector<int>Mask;
11997if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11998 AreVectorizableGathers(VectorizableTree[1].get(),
11999 VectorizableTree[0]->Scalars.size()))
12000returntrue;
12001
12002// Gathering cost would be too much for tiny trees.
12003if (VectorizableTree[0]->isGather() ||
12004 (VectorizableTree[1]->isGather() &&
12005 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
12006 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
12007returnfalse;
12008
12009returntrue;
12010}
12011
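/// Checks whether \p Root is the head of an or/shl-by-multiple-of-8 chain over
/// a zero-extended load whose combined width (element width times \p NumElts)
/// is a legal integer type, i.e. a pattern the backend is expected to fold
/// into a single wide load.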
12012staticboolisLoadCombineCandidateImpl(Value *Root,unsigned NumElts,
12013TargetTransformInfo *TTI,
12014bool MustMatchOrInst) {
12015// Look past the root to find a source value. Arbitrarily follow the
12016// path through operand 0 of any 'or'. Also, peek through optional
12017// shift-left-by-multiple-of-8-bits.
12018Value *ZextLoad = Root;
12019constAPInt *ShAmtC;
12020bool FoundOr =false;
12021while (!isa<ConstantExpr>(ZextLoad) &&
12022 (match(ZextLoad,m_Or(m_Value(),m_Value())) ||
12023 (match(ZextLoad,m_Shl(m_Value(),m_APInt(ShAmtC))) &&
12024 ShAmtC->urem(8) == 0))) {
12025auto *BinOp = cast<BinaryOperator>(ZextLoad);
12026 ZextLoad = BinOp->getOperand(0);
12027if (BinOp->getOpcode() == Instruction::Or)
12028 FoundOr =true;
12029 }
12030// Check if the input is an extended load of the required or/shift expression.
12031Value *Load;
12032if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
12033 !match(ZextLoad,m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
12034returnfalse;
12035
12036// Require that the total load bit width is a legal integer type.
12037// For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
12038// But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
12039Type *SrcTy = Load->getType();
12040unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
12041if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
12042returnfalse;
12043
12044// Everything matched - assume that we can fold the whole sequence using
12045// load combining.
12046LLVM_DEBUG(dbgs() <<"SLP: Assume load combining for tree starting at "
12047 << *(cast<Instruction>(Root)) <<"\n");
12048
12049returntrue;
12050}
12051
12052boolBoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const{
12053if (RdxKind !=RecurKind::Or)
12054returnfalse;
12055
12056unsigned NumElts = VectorizableTree[0]->Scalars.size();
12057Value *FirstReduced = VectorizableTree[0]->Scalars[0];
12058returnisLoadCombineCandidateImpl(FirstReduced, NumElts,TTI,
12059/* MatchOr */false);
12060}
12061
12062boolBoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const{
12063// Peek through a final sequence of stores and check if all operations are
12064// likely to be load-combined.
12065unsigned NumElts = Stores.size();
12066for (Value *Scalar : Stores) {
12067Value *X;
12068if (!match(Scalar,m_Store(m_Value(X),m_Value())) ||
12069 !isLoadCombineCandidateImpl(X, NumElts,TTI,/* MatchOr */true))
12070returnfalse;
12071 }
12072returntrue;
12073}
12074
12075boolBoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const{
12076if (!DebugCounter::shouldExecute(VectorizedGraphs))
12077returntrue;
12078
12079// Graph is empty - do nothing.
12080if (VectorizableTree.empty()) {
12081assert(ExternalUses.empty() &&"We shouldn't have any external users");
12082
12083returntrue;
12084 }
12085
12086// No need to vectorize inserts of gathered values.
12087if (VectorizableTree.size() == 2 &&
12088 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
12089 VectorizableTree[1]->isGather() &&
12090 (VectorizableTree[1]->getVectorFactor() <= 2 ||
12091 !(isSplat(VectorizableTree[1]->Scalars) ||
12092allConstant(VectorizableTree[1]->Scalars))))
12093returntrue;
12094
12095// If the graph includes only PHI nodes and gathers, it is definitely not
12096// profitable for vectorization; we can skip it if the cost threshold is the
12097// default. The cost of vectorized PHI nodes is almost always 0 plus the cost
12098// of the gathers/buildvectors.
12099constexprint Limit = 4;
12100if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
12101 !VectorizableTree.empty() &&
12102all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12103return (TE->isGather() &&
12104 (!TE->hasState() ||
12105 TE->getOpcode() != Instruction::ExtractElement) &&
12106count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
12107 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
12108 }))
12109returntrue;
12110
12111// We can vectorize the tree if its size is greater than or equal to the
12112// minimum size specified by the MinTreeSize command line option.
12113if (VectorizableTree.size() >=MinTreeSize)
12114returnfalse;
12115
12116// If we have a tiny tree (a tree whose size is less than MinTreeSize), we
12117// can still vectorize it if we can prove it is fully vectorizable.
12118if (isFullyVectorizableTinyTree(ForReduction))
12119returnfalse;
12120
12121// Check if any of the gather nodes forms an insertelement buildvector
12122// somewhere.
12123bool IsAllowedSingleBVNode =
12124 VectorizableTree.size() > 1 ||
12125 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
12126 !VectorizableTree.front()->isAltShuffle() &&
12127 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12128 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12129allSameBlock(VectorizableTree.front()->Scalars));
12130if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12131return TE->isGather() &&all_of(TE->Scalars, [&](Value *V) {
12132 return isa<ExtractElementInst, UndefValue>(V) ||
12133 (IsAllowedSingleBVNode &&
12134 !V->hasNUsesOrMore(UsesLimit) &&
12135 any_of(V->users(), IsaPred<InsertElementInst>));
12136 });
12137 }))
12138returnfalse;
12139
12140if (VectorizableTree.back()->isGather() &&
12141 VectorizableTree.back()->hasState() &&
12142 VectorizableTree.back()->isAltShuffle() &&
12143 VectorizableTree.back()->getVectorFactor() > 2 &&
12144allSameBlock(VectorizableTree.back()->Scalars) &&
12145 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12146TTI->getScalarizationOverhead(
12147getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12148 VectorizableTree.back()->getVectorFactor()),
12149APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
12150/*Insert=*/true,/*Extract=*/false,
12151TTI::TCK_RecipThroughput) > -SLPCostThreshold)
12152returnfalse;
12153
12154// Otherwise, we can't vectorize the tree. It is both tiny and not fully
12155// vectorizable.
12156returntrue;
12157}
12158
12159boolBoUpSLP::isTreeNotExtendable() const{
12160if (getCanonicalGraphSize() !=getTreeSize()) {
12161constexprunsigned SmallTree = 3;
12162if (VectorizableTree.front()->isNonPowOf2Vec() &&
12163getCanonicalGraphSize() <= SmallTree &&
12164count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
12165 [](const std::unique_ptr<TreeEntry> &TE) {
12166return TE->isGather() && TE->hasState() &&
12167 TE->getOpcode() == Instruction::Load &&
12168 !allSameBlock(TE->Scalars);
12169 }) == 1)
12170returntrue;
12171returnfalse;
12172 }
12173bool Res =false;
12174for (unsignedIdx : seq<unsigned>(getTreeSize())) {
12175 TreeEntry &E = *VectorizableTree[Idx];
12176if (!E.isGather())
12177continue;
12178if (E.hasState() && E.getOpcode() != Instruction::Load)
12179returnfalse;
12180if (isSplat(E.Scalars) ||allConstant(E.Scalars))
12181continue;
12182 Res =true;
12183 }
12184return Res;
12185}
12186
12187InstructionCostBoUpSLP::getSpillCost() const{
12188// Walk from the bottom of the tree to the top, tracking which values are
12189// live. When we see a call instruction that is not part of our tree,
12190// query TTI to see if there is a cost to keeping values live over it
12191// (for example, if spills and fills are required).
12192unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12193InstructionCostCost = 0;
12194
12195SmallPtrSet<Instruction *, 4> LiveValues;
12196Instruction *PrevInst =nullptr;
12197
12198// The entries in VectorizableTree are not necessarily ordered by their
12199// position in basic blocks. Collect them and order them by dominance so later
12200// instructions are guaranteed to be visited first. For instructions in
12201// different basic blocks, we only scan to the beginning of the block, so
12202// their order does not matter, as long as all instructions in a basic block
12203// are grouped together. Using dominance ensures a deterministic order.
12204SmallVector<Instruction *, 16> OrderedScalars;
12205for (constauto &TEPtr : VectorizableTree) {
12206if (TEPtr->State != TreeEntry::Vectorize)
12207continue;
12208Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12209if (!Inst)
12210continue;
12211 OrderedScalars.push_back(Inst);
12212 }
12213llvm::sort(OrderedScalars, [&](Instruction *A,Instruction *B) {
12214auto *NodeA = DT->getNode(A->getParent());
12215auto *NodeB = DT->getNode(B->getParent());
12216assert(NodeA &&"Should only process reachable instructions");
12217assert(NodeB &&"Should only process reachable instructions");
12218assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12219"Different nodes should have different DFS numbers");
12220if (NodeA != NodeB)
12221return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12222returnB->comesBefore(A);
12223 });
12224
12225for (Instruction *Inst : OrderedScalars) {
12226if (!PrevInst) {
12227 PrevInst = Inst;
12228continue;
12229 }
12230
12231// Update LiveValues.
12232 LiveValues.erase(PrevInst);
12233for (auto &J : PrevInst->operands()) {
12234if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12235 LiveValues.insert(cast<Instruction>(&*J));
12236 }
12237
12238LLVM_DEBUG({
12239dbgs() <<"SLP: #LV: " << LiveValues.size();
12240for (auto *X : LiveValues)
12241dbgs() <<" " <<X->getName();
12242dbgs() <<", Looking at ";
12243 Inst->dump();
12244 });
12245
12246// Now find the sequence of instructions between PrevInst and Inst.
12247unsigned NumCalls = 0;
12248BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12249 PrevInstIt =
12250 PrevInst->getIterator().getReverse();
12251while (InstIt != PrevInstIt) {
12252if (PrevInstIt == PrevInst->getParent()->rend()) {
12253 PrevInstIt = Inst->getParent()->rbegin();
12254continue;
12255 }
12256
12257auto NoCallIntrinsic = [this](Instruction *I) {
12258if (auto *II = dyn_cast<IntrinsicInst>(I)) {
12259if (II->isAssumeLikeIntrinsic())
12260returntrue;
12261IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12262InstructionCost IntrCost =
12263TTI->getIntrinsicInstrCost(ICA,TTI::TCK_RecipThroughput);
12264InstructionCost CallCost =
12265TTI->getCallInstrCost(nullptr,II->getType(), ICA.getArgTypes(),
12266TTI::TCK_RecipThroughput);
12267if (IntrCost < CallCost)
12268returntrue;
12269 }
12270returnfalse;
12271 };
12272
12273// Debug information does not impact spill cost.
12274if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12275 &*PrevInstIt != PrevInst)
12276 NumCalls++;
12277
12278 ++PrevInstIt;
12279 }
12280
12281if (NumCalls) {
12282SmallVector<Type *, 4> V;
12283for (auto *II : LiveValues) {
12284auto *ScalarTy =II->getType();
12285if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12286 ScalarTy = VectorTy->getElementType();
12287 V.push_back(getWidenedType(ScalarTy, BundleWidth));
12288 }
12289Cost += NumCalls *TTI->getCostOfKeepingLiveOverCall(V);
12290 }
12291
12292 PrevInst = Inst;
12293 }
12294
12295returnCost;
12296}
12297
12298/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
12299/// the buildvector sequence.
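/// E.g. for %ie0 = insertelement poison, ..., i32 0 followed by
/// %ie1 = insertelement %ie0, ..., i32 1, isFirstInsertElement(%ie0, %ie1)
/// returns true (the value names here are illustrative only).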
12300staticboolisFirstInsertElement(constInsertElementInst *IE1,
12301constInsertElementInst *IE2) {
12302if (IE1 == IE2)
12303returnfalse;
12304constauto *I1 = IE1;
12305constauto *I2 = IE2;
12306constInsertElementInst *PrevI1;
12307constInsertElementInst *PrevI2;
12308unsigned Idx1 = *getElementIndex(IE1);
12309unsigned Idx2 = *getElementIndex(IE2);
12310do {
12311if (I2 == IE1)
12312returntrue;
12313if (I1 == IE2)
12314returnfalse;
12315 PrevI1 = I1;
12316 PrevI2 = I2;
12317if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12318getElementIndex(I1).value_or(Idx2) != Idx2)
12319 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12320if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
12321getElementIndex(I2).value_or(Idx1) != Idx1)
12322 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12323 }while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12324llvm_unreachable("Two different buildvectors not expected.");
12325}
12326
12327namespace{
12328/// Returns the incoming Value * if the requested type is Value * too, or a
12329/// default-constructed value otherwise.
12330structValueSelect {
12331template <typename U>
12332static std::enable_if_t<std::is_same_v<Value *, U>,Value *>get(Value *V) {
12333returnV;
12334 }
12335template <typename U>
12336static std::enable_if_t<!std::is_same_v<Value *, U>,U>get(Value *) {
12337returnU();
12338 }
12339};
12340}// namespace
12341
12342/// Does the analysis of the provided shuffle masks and performs the requested
12343/// actions on the vectors with the given shuffle masks. It tries to do it in
12344/// several steps.
12345/// 1. If the Base vector is not an undef vector, resize the very first mask to
12346/// have a common VF and perform the action for 2 input vectors (including the
12347/// non-undef Base). Other shuffle masks are combined with the result of the
12348/// first stage and processed as a shuffle of 2 elements.
12349/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
12350/// the action only for 1 vector with the given mask, if it is not the identity
12351/// mask.
12352/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
12353/// vectors, combining the masks properly between the steps.
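/// For example, with an undef Base and two inputs of VF 4 carrying the masks
/// <0,1,poison,poison> and <poison,poison,2,3>, the masks are combined into
/// the two-source mask <0,1,6,7> that is passed to the Action callback.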
12354template <typename T>
12355staticT *performExtractsShuffleAction(
12356MutableArrayRef<std::pair<T *,SmallVector<int>>> ShuffleMask,Value *Base,
12357function_ref<unsigned(T *)> GetVF,
12358function_ref<std::pair<T *, bool>(T *,ArrayRef<int>,bool)> ResizeAction,
12359function_ref<T *(ArrayRef<int>,ArrayRef<T *>)> Action) {
12360assert(!ShuffleMask.empty() &&"Empty list of shuffles for inserts.");
12361SmallVector<int> Mask(ShuffleMask.begin()->second);
12362auto VMIt = std::next(ShuffleMask.begin());
12363T *Prev =nullptr;
12364SmallBitVector UseMask =
12365buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12366SmallBitVector IsBaseUndef =isUndefVector(Base, UseMask);
12367if (!IsBaseUndef.all()) {
12368// Base is not undef, need to combine it with the next subvectors.
12369 std::pair<T *, bool> Res =
12370 ResizeAction(ShuffleMask.begin()->first, Mask,/*ForSingleMask=*/false);
12371SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
12372for (unsignedIdx = 0, VF = Mask.size();Idx < VF; ++Idx) {
12373if (Mask[Idx] ==PoisonMaskElem)
12374 Mask[Idx] = IsBasePoison.test(Idx) ?PoisonMaskElem :Idx;
12375else
12376 Mask[Idx] = (Res.second ?Idx : Mask[Idx]) + VF;
12377 }
12378 [[maybe_unused]]auto *V = ValueSelect::get<T *>(Base);
12379assert((!V || GetVF(V) == Mask.size()) &&
12380"Expected base vector of VF number of elements.");
12381 Prev = Action(Mask, {nullptr, Res.first});
12382 }elseif (ShuffleMask.size() == 1) {
12383// Base is undef and only 1 vector is shuffled - perform the action only for
12384// a single vector, if the mask is not the identity mask.
12385 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12386/*ForSingleMask=*/true);
12387if (Res.second)
12388// Identity mask is found.
12389 Prev = Res.first;
12390else
12391 Prev = Action(Mask, {ShuffleMask.begin()->first});
12392 }else {
12393// Base is undef and at least 2 input vectors are shuffled - perform
12394// two-vector shuffles step by step, combining the shuffles between the steps.
12395unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12396unsigned Vec2VF = GetVF(VMIt->first);
12397if (Vec1VF == Vec2VF) {
12398// No need to resize the input vectors since they are of the same size; we
12399// can shuffle them directly.
12400ArrayRef<int> SecMask = VMIt->second;
12401for (unsignedI = 0, VF = Mask.size();I < VF; ++I) {
12402if (SecMask[I] !=PoisonMaskElem) {
12403assert(Mask[I] ==PoisonMaskElem &&"Multiple uses of scalars.");
12404 Mask[I] = SecMask[I] + Vec1VF;
12405 }
12406 }
12407 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12408 }else {
12409// Vectors of different sizes - resize and reshuffle.
12410 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12411/*ForSingleMask=*/false);
12412 std::pair<T *, bool> Res2 =
12413 ResizeAction(VMIt->first, VMIt->second,/*ForSingleMask=*/false);
12414ArrayRef<int> SecMask = VMIt->second;
12415for (unsignedI = 0, VF = Mask.size();I < VF; ++I) {
12416if (Mask[I] !=PoisonMaskElem) {
12417assert(SecMask[I] ==PoisonMaskElem &&"Multiple uses of scalars.");
12418if (Res1.second)
12419 Mask[I] =I;
12420 }elseif (SecMask[I] !=PoisonMaskElem) {
12421assert(Mask[I] ==PoisonMaskElem &&"Multiple uses of scalars.");
12422 Mask[I] = (Res2.second ?I : SecMask[I]) + VF;
12423 }
12424 }
12425 Prev = Action(Mask, {Res1.first, Res2.first});
12426 }
12427 VMIt = std::next(VMIt);
12428 }
12429 [[maybe_unused]]bool IsBaseNotUndef = !IsBaseUndef.all();
12430// Perform requested actions for the remaining masks/vectors.
12431for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12432// Shuffle other input vectors, if any.
12433 std::pair<T *, bool> Res =
12434 ResizeAction(VMIt->first, VMIt->second,/*ForSingleMask=*/false);
12435ArrayRef<int> SecMask = VMIt->second;
12436for (unsignedI = 0, VF = Mask.size();I < VF; ++I) {
12437if (SecMask[I] !=PoisonMaskElem) {
12438assert((Mask[I] ==PoisonMaskElem || IsBaseNotUndef) &&
12439"Multiple uses of scalars.");
12440 Mask[I] = (Res.second ?I : SecMask[I]) + VF;
12441 }elseif (Mask[I] !=PoisonMaskElem) {
12442 Mask[I] =I;
12443 }
12444 }
12445 Prev = Action(Mask, {Prev, Res.first});
12446 }
12447return Prev;
12448}
12449
12450namespace{
12451/// Data type for handling buildvector sequences with scalars reused from
12452/// other tree entries.
12453template <typename T>structShuffledInsertData {
12454 /// List of insertelements to be replaced by shuffles.
12455SmallVector<InsertElementInst *> InsertElements;
12456 /// The parent vectors and shuffle mask for the given list of inserts.
12457MapVector<T, SmallVector<int>> ValueMasks;
12458};
12459}// namespace
12460
12461InstructionCostBoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
12462InstructionCostCost = 0;
12463LLVM_DEBUG(dbgs() <<"SLP: Calculating cost for tree of size "
12464 << VectorizableTree.size() <<".\n");
12465
12466unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12467
12468SmallPtrSet<Value *, 4> CheckedExtracts;
12469for (unsignedI = 0, E = VectorizableTree.size();I < E; ++I) {
12470 TreeEntry &TE = *VectorizableTree[I];
12471// No need to count the cost for combined entries; they are handled as part
12472// of the combined node, so just skip their cost.
12473if (TE.State == TreeEntry::CombinedVectorize) {
12474LLVM_DEBUG(
12475dbgs() <<"SLP: Skipping cost for combined node that starts with "
12476 << *TE.Scalars[0] <<".\n";
12477 TE.dump();dbgs() <<"SLP: Current total cost = " <<Cost <<"\n");
12478continue;
12479 }
12480if (TE.isGather() && TE.hasState()) {
12481if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
12482 E && E->getVectorFactor() == TE.getVectorFactor() &&
12483 E->isSame(TE.Scalars)) {
12484// Some gather nodes might be exactly the same as some vectorizable
12485// nodes after reordering; handle that case here.
12486LLVM_DEBUG(dbgs() <<"SLP: Adding cost 0 for bundle "
12487 <<shortBundleName(TE.Scalars, TE.Idx) <<".\n"
12488 <<"SLP: Current total cost = " <<Cost <<"\n");
12489continue;
12490 }
12491 }
12492
12493// Exclude the cost of gathered-load nodes which are not used. These nodes
12494// were built as part of the final attempt to vectorize gathered loads.
12495assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12496"Expected gather nodes with users only.");
12497
12498InstructionCostC = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
12499Cost +=C;
12500LLVM_DEBUG(dbgs() <<"SLP: Adding cost " <<C <<" for bundle "
12501 <<shortBundleName(TE.Scalars, TE.Idx) <<".\n"
12502 <<"SLP: Current total cost = " <<Cost <<"\n");
12503 }
12504
12505SmallPtrSet<Value *, 16> ExtractCostCalculated;
12506InstructionCost ExtractCost = 0;
12507SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
12508SmallVector<APInt> DemandedElts;
12509SmallDenseSet<Value *, 4> UsedInserts;
12510DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
12511 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12512DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
12513SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
12514// Keep track of each {Scalar, Index, User} tuple.
12515// On AArch64, this helps fuse the mov instruction associated with an
12516// extractelement with an fmul in the backend so that the extractelement is free.
12517SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
12518for (ExternalUser &EU : ExternalUses) {
12519 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
12520 }
12521for (ExternalUser &EU : ExternalUses) {
12522// Uses by ephemeral values are free (because the ephemeral value will be
12523// removed prior to code generation, and so the extraction will be
12524// removed as well).
12525if (EphValues.count(EU.User))
12526continue;
12527
12528// Skip users in unreachable blocks or in EH pads (rarely executed) or in
12529// blocks terminated by an unreachable instruction.
12530if (BasicBlock *UserParent =
12531 EU.User ? cast<Instruction>(EU.User)->getParent() :nullptr;
12532 UserParent &&
12533 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
12534 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12535continue;
12536
12537// We only add extract cost once for the same scalar.
12538if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12539 !ExtractCostCalculated.insert(EU.Scalar).second)
12540continue;
12541
12542// No extract cost for vector "scalar"
12543if (isa<FixedVectorType>(EU.Scalar->getType()))
12544continue;
12545
12546// If the found user is an insertelement, do not calculate the extract cost
12547// but try to detect it as a final shuffled/identity match.
12548if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12549 VU && VU->getOperand(1) == EU.Scalar) {
12550if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
12551if (!UsedInserts.insert(VU).second)
12552continue;
12553 std::optional<unsigned> InsertIdx =getElementIndex(VU);
12554if (InsertIdx) {
12555const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12556auto *It =find_if(
12557 ShuffledInserts,
12558 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
12559// Checks if 2 insertelements are from the same buildvector.
12560InsertElementInst *VecInsert =Data.InsertElements.front();
12561returnareTwoInsertFromSameBuildVector(
12562 VU, VecInsert, [this](InsertElementInst *II) ->Value * {
12563Value *Op0 =II->getOperand(0);
12564if (getTreeEntry(II) && !getTreeEntry(Op0))
12565returnnullptr;
12566return Op0;
12567 });
12568 });
12569int VecId = -1;
12570if (It == ShuffledInserts.end()) {
12571auto &Data = ShuffledInserts.emplace_back();
12572Data.InsertElements.emplace_back(VU);
12573 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
12574 VecId = ShuffledInserts.size() - 1;
12575auto It = MinBWs.find(ScalarTE);
12576if (It != MinBWs.end() &&
12577 VectorCasts
12578 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
12579 .second) {
12580unsigned BWSz = It->second.first;
12581unsigned DstBWSz =DL->getTypeSizeInBits(FTy->getElementType());
12582unsigned VecOpcode;
12583if (DstBWSz < BWSz)
12584 VecOpcode = Instruction::Trunc;
12585else
12586 VecOpcode =
12587 It->second.second ? Instruction::SExt : Instruction::ZExt;
12588TTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
12589InstructionCostC =TTI->getCastInstrCost(
12590 VecOpcode, FTy,
12591getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
12592 FTy->getNumElements()),
12593TTI::CastContextHint::None,CostKind);
12594LLVM_DEBUG(dbgs() <<"SLP: Adding cost " <<C
12595 <<" for extending externally used vector with "
12596"non-equal minimum bitwidth.\n");
12597Cost +=C;
12598 }
12599 }else {
12600if (isFirstInsertElement(VU, It->InsertElements.front()))
12601 It->InsertElements.front() = VU;
12602 VecId = std::distance(ShuffledInserts.begin(), It);
12603 }
12604int InIdx = *InsertIdx;
12605SmallVectorImpl<int> &Mask =
12606 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12607if (Mask.empty())
12608 Mask.assign(FTy->getNumElements(),PoisonMaskElem);
12609 Mask[InIdx] = EU.Lane;
12610 DemandedElts[VecId].setBit(InIdx);
12611continue;
12612 }
12613 }
12614 }
12615
12616TTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
12617// If we plan to rewrite the tree in a smaller type, we will need to sign
12618// extend the extracted value back to the original type. Here, we account
12619// for the extract and the added cost of the sign extend if needed.
12620InstructionCost ExtraCost =TTI::TCC_Free;
12621auto *VecTy =getWidenedType(EU.Scalar->getType(), BundleWidth);
12622const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12623auto It = MinBWs.find(Entry);
12624if (It != MinBWs.end()) {
12625auto *MinTy =IntegerType::get(F->getContext(), It->second.first);
12626unsigned Extend =isKnownNonNegative(EU.Scalar,SimplifyQuery(*DL))
12627 ? Instruction::ZExt
12628 : Instruction::SExt;
12629 VecTy =getWidenedType(MinTy, BundleWidth);
12630 ExtraCost =TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
12631 VecTy, EU.Lane);
12632 }else {
12633 ExtraCost =
12634TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,CostKind,
12635 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12636 }
12637// Leave the scalar instructions as is if they are cheaper than extracts.
12638if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12639 Entry->getOpcode() == Instruction::Load) {
12640// Checks if the user of the external scalar is phi in loop body.
12641auto IsPhiInLoop = [&](const ExternalUser &U) {
12642if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12643auto *I = cast<Instruction>(U.Scalar);
12644constLoop *L = LI->getLoopFor(Phi->getParent());
12645return L && (Phi->getParent() ==I->getParent() ||
12646 L == LI->getLoopFor(I->getParent()));
12647 }
12648returnfalse;
12649 };
12650if (!ValueToExtUses) {
12651 ValueToExtUses.emplace();
12652for_each(enumerate(ExternalUses), [&](constauto &P) {
12653// Ignore phis in loops.
12654if (IsPhiInLoop(P.value()))
12655return;
12656
12657 ValueToExtUses->try_emplace(P.value().Scalar,P.index());
12658 });
12659 }
12660// We can use the original instruction if none of its operands are
12661// vectorized or they are already marked as externally used.
12662auto *Inst = cast<Instruction>(EU.Scalar);
12663InstructionCost ScalarCost =TTI->getInstructionCost(Inst,CostKind);
12664auto OperandIsScalar = [&](Value *V) {
12665if (!getTreeEntry(V)) {
12666// Some extractelements might be not vectorized, but
12667// transformed into shuffle and removed from the function,
12668// consider it here.
12669if (auto *EE = dyn_cast<ExtractElementInst>(V))
12670return !EE->hasOneUse() || !MustGather.contains(EE);
12671returntrue;
12672 }
12673return ValueToExtUses->contains(V);
12674 };
12675bool CanBeUsedAsScalar =all_of(Inst->operands(), OperandIsScalar);
12676bool CanBeUsedAsScalarCast =false;
12677if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12678if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
12679Op &&all_of(Op->operands(), OperandIsScalar)) {
12680InstructionCost OpCost =
12681 (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
12682 ?TTI->getInstructionCost(Op,CostKind)
12683 : 0;
12684if (ScalarCost + OpCost <= ExtraCost) {
12685 CanBeUsedAsScalar = CanBeUsedAsScalarCast =true;
12686 ScalarCost += OpCost;
12687 }
12688 }
12689 }
12690if (CanBeUsedAsScalar) {
12691bool KeepScalar = ScalarCost <= ExtraCost;
12692// Try to keep the original scalar if the user is a PHI node from the same
12693// block as the root PHIs currently being vectorized. This preserves better
12694// ordering info for the PHIs being vectorized.
12695bool IsProfitablePHIUser =
12696 (KeepScalar || (ScalarCost - ExtraCost <=TTI::TCC_Basic &&
12697 VectorizableTree.front()->Scalars.size() > 2)) &&
12698 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12699 !Inst->hasNUsesOrMore(UsesLimit) &&
12700none_of(Inst->users(),
12701 [&](User *U) {
12702 auto *PHIUser = dyn_cast<PHINode>(U);
12703 return (!PHIUser ||
12704 PHIUser->getParent() !=
12705 cast<Instruction>(
12706 VectorizableTree.front()->getMainOp())
12707 ->getParent()) &&
12708 !getTreeEntry(U);
12709 }) &&
12710count_if(Entry->Scalars, [&](Value *V) {
12711 return ValueToExtUses->contains(V);
12712 }) <= 2;
12713if (IsProfitablePHIUser) {
12714 KeepScalar =true;
12715 }elseif (KeepScalar && ScalarCost !=TTI::TCC_Free &&
12716 ExtraCost - ScalarCost <=TTI::TCC_Basic &&
12717 (!GatheredLoadsEntriesFirst.has_value() ||
12718 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12719unsigned ScalarUsesCount =count_if(Entry->Scalars, [&](Value *V) {
12720 return ValueToExtUses->contains(V);
12721 });
12722auto It = ExtractsCount.find(Entry);
12723if (It != ExtractsCount.end()) {
12724assert(ScalarUsesCount >= It->getSecond().size() &&
12725"Expected total number of external uses not less than "
12726"number of scalar uses.");
12727 ScalarUsesCount -= It->getSecond().size();
12728 }
12729// Keep the original scalar if the number of externally used instructions
12730// in the same entry is not a power of 2. It may help to do some extra
12731// vectorization for now.
12732 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
12733 }
12734if (KeepScalar) {
12735 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
12736for_each(Inst->operands(), [&](Value *V) {
12737 auto It = ValueToExtUses->find(V);
12738 if (It != ValueToExtUses->end()) {
12739// Replace all uses to avoid compiler crash.
12740 ExternalUses[It->second].User = nullptr;
12741 }
12742 });
12743 ExtraCost = ScalarCost;
12744if (!IsPhiInLoop(EU))
12745 ExtractsCount[Entry].insert(Inst);
12746if (CanBeUsedAsScalarCast) {
12747 ScalarOpsFromCasts.insert(Inst->getOperand(0));
12748// Update the users of the operands of the cast operand to avoid
12749// compiler crash.
12750if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12751for_each(IOp->operands(), [&](Value *V) {
12752 auto It = ValueToExtUses->find(V);
12753 if (It != ValueToExtUses->end()) {
12754// Replace all uses to avoid compiler crash.
12755 ExternalUses[It->second].User = nullptr;
12756 }
12757 });
12758 }
12759 }
12760 }
12761 }
12762 }
12763
12764 ExtractCost += ExtraCost;
12765 }
12766// Insert external uses for the operands of casts that will be emitted as
12767// scalars instead of extractelements.
12768for (Value *V : ScalarOpsFromCasts) {
12769 ExternalUsesAsOriginalScalar.insert(V);
12770if (const TreeEntry *E = getTreeEntry(V)) {
12771 ExternalUses.emplace_back(V,nullptr, E->findLaneForValue(V));
12772 }
12773 }
12774// Add reduced value cost, if resized.
12775if (!VectorizedVals.empty()) {
12776const TreeEntry &Root = *VectorizableTree.front();
12777auto BWIt = MinBWs.find(&Root);
12778if (BWIt != MinBWs.end()) {
12779Type *DstTy = Root.Scalars.front()->getType();
12780unsigned OriginalSz =DL->getTypeSizeInBits(DstTy->getScalarType());
12781unsigned SrcSz =
12782 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12783if (OriginalSz != SrcSz) {
12784unsigned Opcode = Instruction::Trunc;
12785if (OriginalSz > SrcSz)
12786 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12787Type *SrcTy =IntegerType::get(DstTy->getContext(), SrcSz);
12788if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12789assert(SLPReVec &&"Only supported by REVEC.");
12790 SrcTy =getWidenedType(SrcTy, VecTy->getNumElements());
12791 }
12792Cost +=TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
12793TTI::CastContextHint::None,
12794TTI::TCK_RecipThroughput);
12795 }
12796 }
12797 }
12798
12799InstructionCost SpillCost = getSpillCost();
12800Cost += SpillCost + ExtractCost;
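// ResizeToVF checks whether an external-insert shuffle mask matches the
// vector factor of the producing tree entry; if it does not (and the mask is
// not an identity), it charges the extra single-source shuffle needed to
// resize the vector.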
12801auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE,ArrayRef<int>Mask,
12802bool) {
12803InstructionCostC = 0;
12804unsigned VF =Mask.size();
12805unsigned VecVF =TE->getVectorFactor();
12806if (VF != VecVF &&
12807 (any_of(Mask, [VF](intIdx) {returnIdx >=static_cast<int>(VF); }) ||
12808 !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
12809SmallVector<int> OrigMask(VecVF,PoisonMaskElem);
12810 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12811 OrigMask.begin());
12812C =::getShuffleCost(*TTI,TTI::SK_PermuteSingleSrc,
12813getWidenedType(TE->getMainOp()->getType(), VecVF),
12814 OrigMask);
12815LLVM_DEBUG(
12816dbgs() <<"SLP: Adding cost " <<C
12817 <<" for final shuffle of insertelement external users.\n";
12818TE->dump();dbgs() <<"SLP: Current total cost = " <<Cost <<"\n");
12819Cost +=C;
12820return std::make_pair(TE,true);
12821 }
12822return std::make_pair(TE,false);
12823 };
12824// Calculate the cost of the reshuffled vectors, if any.
12825for (intI = 0, E = ShuffledInserts.size();I < E; ++I) {
12826Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
12827autoVector = ShuffledInserts[I].ValueMasks.takeVector();
12828unsigned VF = 0;
12829auto EstimateShufflesCost = [&](ArrayRef<int>Mask,
12830ArrayRef<const TreeEntry *> TEs) {
12831assert((TEs.size() == 1 || TEs.size() == 2) &&
12832"Expected exactly 1 or 2 tree entries.");
12833if (TEs.size() == 1) {
12834if (VF == 0)
12835 VF = TEs.front()->getVectorFactor();
12836auto *FTy =getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12837if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
12838 !all_of(enumerate(Mask), [=](constauto &Data) {
12839returnData.value() ==PoisonMaskElem ||
12840 (Data.index() < VF &&
12841static_cast<int>(Data.index()) ==Data.value());
12842 })) {
12843InstructionCostC =
12844::getShuffleCost(*TTI,TTI::SK_PermuteSingleSrc, FTy, Mask);
12845LLVM_DEBUG(dbgs() <<"SLP: Adding cost " <<C
12846 <<" for final shuffle of insertelement "
12847"external users.\n";
12848 TEs.front()->dump();
12849dbgs() <<"SLP: Current total cost = " <<Cost <<"\n");
12850Cost +=C;
12851 }
12852 }else {
12853if (VF == 0) {
12854if (TEs.front() &&
12855 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12856 VF = TEs.front()->getVectorFactor();
12857else
12858 VF =Mask.size();
12859 }
12860auto *FTy =getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12861InstructionCostC =
12862::getShuffleCost(*TTI,TTI::SK_PermuteTwoSrc, FTy, Mask);
12863LLVM_DEBUG(dbgs() <<"SLP: Adding cost " <<C
12864 <<" for final shuffle of vector node and external "
12865"insertelement users.\n";
12866if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12867dbgs() <<"SLP: Current total cost = " <<Cost <<"\n");
12868Cost +=C;
12869 }
12870 VF =Mask.size();
12871return TEs.back();
12872 };
12873 (void)performExtractsShuffleAction<const TreeEntry>(
12874MutableArrayRef(Vector.data(),Vector.size()),Base,
12875 [](const TreeEntry *E) {return E->getVectorFactor(); }, ResizeToVF,
12876 EstimateShufflesCost);
12877InstructionCost InsertCost =TTI->getScalarizationOverhead(
12878 cast<FixedVectorType>(
12879 ShuffledInserts[I].InsertElements.front()->getType()),
12880 DemandedElts[I],
12881/*Insert*/true,/*Extract*/false,TTI::TCK_RecipThroughput);
12882Cost -= InsertCost;
12883 }
12884
12885// Add the cost for reduced value resize (if required).
12886if (ReductionBitWidth != 0) {
12887assert(UserIgnoreList &&"Expected reduction tree.");
12888const TreeEntry &E = *VectorizableTree.front();
12889auto It = MinBWs.find(&E);
12890if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12891unsigned SrcSize = It->second.first;
12892unsigned DstSize = ReductionBitWidth;
12893unsigned Opcode = Instruction::Trunc;
12894if (SrcSize < DstSize) {
12895bool IsArithmeticExtendedReduction =
12896all_of(*UserIgnoreList, [](Value *V) {
12897auto *I = cast<Instruction>(V);
12898returnis_contained({Instruction::Add, Instruction::FAdd,
12899 Instruction::Mul, Instruction::FMul,
12900 Instruction::And, Instruction::Or,
12901 Instruction::Xor},
12902I->getOpcode());
12903 });
12904if (IsArithmeticExtendedReduction)
12905 Opcode =
12906 Instruction::BitCast;// Handle it by getExtendedReductionCost
12907else
12908 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12909 }
12910if (Opcode != Instruction::BitCast) {
12911auto *SrcVecTy =
12912getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12913auto *DstVecTy =
12914getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12915TTI::CastContextHint CCH = getCastContextHint(E);
12916InstructionCost CastCost;
12917switch (E.getOpcode()) {
12918case Instruction::SExt:
12919case Instruction::ZExt:
12920case Instruction::Trunc: {
12921const TreeEntry *OpTE = getOperandEntry(&E, 0);
12922 CCH = getCastContextHint(*OpTE);
12923break;
12924 }
12925default:
12926break;
12927 }
12928 CastCost +=TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
12929TTI::TCK_RecipThroughput);
12930Cost += CastCost;
12931LLVM_DEBUG(dbgs() <<"SLP: Adding cost " << CastCost
12932 <<" for final resize for reduction from " << SrcVecTy
12933 <<" to " << DstVecTy <<"\n";
12934dbgs() <<"SLP: Current total cost = " <<Cost <<"\n");
12935 }
12936 }
12937 }
12938
12939#ifndef NDEBUG
12940SmallString<256> Str;
12941 {
12942raw_svector_ostreamOS(Str);
12943OS <<"SLP: Spill Cost = " << SpillCost <<".\n"
12944 <<"SLP: Extract Cost = " << ExtractCost <<".\n"
12945 <<"SLP: Total Cost = " <<Cost <<".\n";
12946 }
12947LLVM_DEBUG(dbgs() << Str);
12948if (ViewSLPTree)
12949ViewGraph(this,"SLP" +F->getName(),false, Str);
12950#endif
12951
12952returnCost;
12953}
12954
12955/// Tries to find extractelement instructions with constant indices from a
12956/// fixed vector type and gather such instructions into a group that can
12957/// likely be represented as a shuffle of 1 or 2 input vectors. If this
12958/// attempt is successful, the matched scalars are replaced by poison values
12959/// in \p VL for future analysis.
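/// E.g. a gather of extractelements with indices 0..3 from a single
/// <4 x i32> vector is recognized as a single-source shuffle of that vector.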
12960std::optional<TTI::ShuffleKind>
12961BoUpSLP::tryToGatherSingleRegisterExtractElements(
12962MutableArrayRef<Value *> VL,SmallVectorImpl<int> &Mask) const{
12963// Scan list of gathered scalars for extractelements that can be represented
12964// as shuffles.
12965MapVector<Value *, SmallVector<int>> VectorOpToIdx;
12966SmallVector<int> UndefVectorExtracts;
12967for (intI = 0, E = VL.size();I < E; ++I) {
12968auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12969if (!EI) {
12970if (isa<UndefValue>(VL[I]))
12971 UndefVectorExtracts.push_back(I);
12972continue;
12973 }
12974auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12975if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12976continue;
12977 std::optional<unsigned>Idx =getExtractIndex(EI);
12978// Undefined index.
12979if (!Idx) {
12980 UndefVectorExtracts.push_back(I);
12981continue;
12982 }
12983if (Idx >= VecTy->getNumElements()) {
12984 UndefVectorExtracts.push_back(I);
12985continue;
12986 }
12987SmallBitVector ExtractMask(VecTy->getNumElements(),true);
12988 ExtractMask.reset(*Idx);
12989if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
12990 UndefVectorExtracts.push_back(I);
12991continue;
12992 }
12993 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
12994 }
12995// Sort the vector operands by the maximum number of uses in extractelements.
12996SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
12997 VectorOpToIdx.takeVector();
12998stable_sort(Vectors, [](constauto &P1,constauto &P2) {
12999returnP1.second.size() > P2.second.size();
13000 });
13001// Find the best pair of the vectors or a single vector.
13002constint UndefSz = UndefVectorExtracts.size();
13003unsigned SingleMax = 0;
13004unsigned PairMax = 0;
13005if (!Vectors.empty()) {
13006 SingleMax = Vectors.front().second.size() + UndefSz;
13007if (Vectors.size() > 1) {
13008auto *ItNext = std::next(Vectors.begin());
13009 PairMax = SingleMax + ItNext->second.size();
13010 }
13011 }
13012if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
13013return std::nullopt;
13014// Check if better to perform a shuffle of 2 vectors or just of a single
13015// vector.
13016SmallVector<Value *> SavedVL(VL.begin(), VL.end());
13017SmallVector<Value *> GatheredExtracts(
13018 VL.size(),PoisonValue::get(VL.front()->getType()));
13019if (SingleMax >= PairMax && SingleMax) {
13020for (intIdx : Vectors.front().second)
13021std::swap(GatheredExtracts[Idx], VL[Idx]);
13022 }elseif (!Vectors.empty()) {
13023for (unsignedIdx : {0, 1})
13024for (intIdx : Vectors[Idx].second)
13025std::swap(GatheredExtracts[Idx], VL[Idx]);
13026 }
13027// Add extracts from undefs too.
13028for (intIdx : UndefVectorExtracts)
13029std::swap(GatheredExtracts[Idx], VL[Idx]);
13030// Check that the gather of extractelements can be represented as just a
13031// shuffle of one or two vectors from which the scalars are extracted.
13032 std::optional<TTI::ShuffleKind> Res =
13033isFixedVectorShuffle(GatheredExtracts, Mask, AC);
13034if (!Res ||all_of(Mask, [](intIdx) {returnIdx ==PoisonMaskElem; })) {
13035// TODO: try to check other subsets if possible.
13036// Restore the original VL if attempt was not successful.
13037copy(SavedVL, VL.begin());
13038return std::nullopt;
13039 }
13040// Restore unused scalars from mask, if some of the extractelements were not
13041// selected for shuffle.
13042for (intI = 0, E = GatheredExtracts.size();I < E; ++I) {
13043if (Mask[I] ==PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
13044 isa<UndefValue>(GatheredExtracts[I])) {
13045std::swap(VL[I], GatheredExtracts[I]);
13046continue;
13047 }
13048auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
13049if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
13050 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
13051is_contained(UndefVectorExtracts,I))
13052continue;
13053 }
13054return Res;
13055}
13056
13057/// Tries to find extractelement instructions with constant indices from a
13058/// fixed vector type and gather such instructions into a group that can
13059/// likely be represented as a shuffle of 1 or 2 input vectors. If this
13060/// attempt is successful, the matched scalars are replaced by poison values
13061/// in \p VL for future analysis.
13062SmallVector<std::optional<TTI::ShuffleKind>>
13063BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
13064SmallVectorImpl<int> &Mask,
13065unsigned NumParts) const{
13066assert(NumParts > 0 &&"NumParts expected be greater than or equal to 1.");
13067SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
13068Mask.assign(VL.size(),PoisonMaskElem);
13069unsigned SliceSize =getPartNumElems(VL.size(), NumParts);
13070for (unsigned Part : seq<unsigned>(NumParts)) {
13071// Scan list of gathered scalars for extractelements that can be represented
13072// as shuffles.
13073MutableArrayRef<Value *> SubVL =MutableArrayRef(VL).slice(
13074 Part * SliceSize,getNumElems(VL.size(), SliceSize, Part));
13075SmallVector<int> SubMask;
13076 std::optional<TTI::ShuffleKind> Res =
13077 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
13078 ShufflesRes[Part] = Res;
13079copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
13080 }
13081if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
13082return Res.has_value();
13083 }))
13084 ShufflesRes.clear();
13085return ShufflesRes;
13086}
13087
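// Illustrative sketch (plain std:: types; matchOneRegister is a hypothetical
// stand-in for the single-register analysis above): the per-register driver
// boils down to slicing the scalar list into NumParts chunks, matching each
// chunk independently and copying every sub-mask into its window of the
// combined mask.
// \code
// #include <algorithm>
// #include <optional>
// #include <vector>
//
// constexpr int kPoison = -1;
//
// std::optional<int> matchOneRegister(unsigned Begin, unsigned Size,
//                                     std::vector<int> &Scalars,
//                                     std::vector<int> &SubMask); // stand-in
//
// std::vector<std::optional<int>>
// matchPerRegister(std::vector<int> &Scalars, std::vector<int> &Mask,
//                  unsigned NumParts) {
//   unsigned SliceSize = Scalars.size() / NumParts; // whole registers assumed
//   Mask.assign(Scalars.size(), kPoison);
//   std::vector<std::optional<int>> Res(NumParts);
//   for (unsigned Part = 0; Part < NumParts; ++Part) {
//     std::vector<int> SubMask(SliceSize, kPoison);
//     Res[Part] = matchOneRegister(Part * SliceSize, SliceSize, Scalars,
//                                  SubMask);
//     std::copy(SubMask.begin(), SubMask.end(),
//               Mask.begin() + Part * SliceSize);
//   }
//   return Res;
// }
// \endcode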
13088std::optional<TargetTransformInfo::ShuffleKind>
13089BoUpSLP::isGatherShuffledSingleRegisterEntry(
13090const TreeEntry *TE,ArrayRef<Value *> VL,MutableArrayRef<int> Mask,
13091SmallVectorImpl<const TreeEntry *> &Entries,unsigned Part,bool ForOrder) {
13092 Entries.clear();
13093// TODO: currently checking only for Scalars in the tree entry, need to count
13094// reused elements too for better cost estimation.
13095const EdgeInfo &TEUseEI =TE == VectorizableTree.front().get()
13096 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
13097 :TE->UserTreeIndices.front();
13098constInstruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13099constBasicBlock *TEInsertBlock =nullptr;
13100// Main node of PHI entries keeps the correct order of operands/incoming
13101// blocks.
13102if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13103 TEInsertBlock =PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13104 TEInsertPt = TEInsertBlock->getTerminator();
13105 }else {
13106 TEInsertBlock = TEInsertPt->getParent();
13107 }
13108if (!DT->isReachableFromEntry(TEInsertBlock))
13109return std::nullopt;
13110auto *NodeUI = DT->getNode(TEInsertBlock);
13111assert(NodeUI &&"Should only process reachable instructions");
13112SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
13113auto CheckOrdering = [&](constInstruction *InsertPt) {
13114// Argument InsertPt is an instruction where vector code for some other
13115// tree entry (one that shares one or more scalars with TE) is going to be
13116// generated. This lambda returns true if insertion point of vector code
13117// for the TE dominates that point (otherwise dependency is the other way
13118// around). The other node is not limited to be of a gather kind. Gather
13119// nodes are not scheduled and their vector code is inserted before their
13120// first user. If user is PHI, that is supposed to be at the end of a
13121// predecessor block. Otherwise it is the last instruction among scalars of
13122// the user node. So, instead of checking dependency between instructions
13123// themselves, we check dependency between their insertion points for vector
13124// code (since each scalar instruction ends up as a lane of a vector
13125// instruction).
13126constBasicBlock *InsertBlock = InsertPt->getParent();
13127auto *NodeEUI = DT->getNode(InsertBlock);
13128if (!NodeEUI)
13129returnfalse;
13130assert((NodeUI == NodeEUI) ==
13131 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13132"Different nodes should have different DFS numbers");
13133// Check the order of the gather nodes users.
13134if (TEInsertPt->getParent() != InsertBlock &&
13135 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
13136returnfalse;
13137if (TEInsertPt->getParent() == InsertBlock &&
13138 TEInsertPt->comesBefore(InsertPt))
13139returnfalse;
13140returntrue;
13141 };
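// Illustrative sketch: the ordering test above effectively asks "would the
// vector value produced at InsertPt already be available at the point where
// this entry's vector code is emitted?". With positions modelled as a pair of
// (dominator-tree DFS-in number of the block, index within the block), a
// hypothetical stand-alone version reads (dominance is only approximated here
// by the DFS-in ordering):
// \code
// struct Pos {
//   unsigned BlockDFSIn; // DFS-in number of the basic block in the dom tree
//   unsigned InstIndex;  // position of the instruction inside its block
// };
//
// // True if a value produced at Producer is available at Consumer.
// bool availableAt(const Pos &Producer, const Pos &Consumer) {
//   if (Producer.BlockDFSIn != Consumer.BlockDFSIn)
//     return Producer.BlockDFSIn < Consumer.BlockDFSIn;
//   // Same block: the producer must come first.
//   return Producer.InstIndex < Consumer.InstIndex;
// }
// \endcode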
13142// Find all tree entries used by the gathered values. If no common entries
13143// are found, this is not a shuffle.
13144// Here we build a set of tree nodes for each gathered value and try to
13145// find the intersection between these sets. If we have at least one common
13146// tree node for each gathered value, we have just a permutation of a
13147// single vector. If we end up with 2 different sets, we are in a situation
13148// where we have a permutation of 2 input vectors.
13149SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
13150DenseMap<Value *, int> UsedValuesEntry;
13151for (Value *V : VL) {
13152if (isConstant(V))
13153continue;
13154// Build a list of tree entries where V is used.
13155SmallPtrSet<const TreeEntry *, 4> VToTEs;
13156for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13157if (TEPtr == TE || TEPtr->Idx == 0)
13158continue;
13159assert(any_of(TEPtr->Scalars,
13160 [&](Value *V) { return GatheredScalars.contains(V); }) &&
13161"Must contain at least single gathered value.");
13162assert(TEPtr->UserTreeIndices.size() == 1 &&
13163"Expected only single user of a gather node.");
13164const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13165
13166PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13167constInstruction *InsertPt =
13168 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
13169 : &getLastInstructionInBundle(UseEI.UserTE);
13170if (TEInsertPt == InsertPt) {
13171// If 2 gathers are operands of the same entry (regardless of whether
13172// the user is a PHI or anything else), compare the operand indices and
13173// use the earlier one as the base.
13174if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13175continue;
13176// If the user instruction is used for some reason in different
13177// vectorized nodes - make it depend on index.
13178if (TEUseEI.UserTE != UseEI.UserTE &&
13179 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13180continue;
13181 }
13182
13183// Check if the user node of the TE comes after user node of TEPtr,
13184// otherwise TEPtr depends on TE.
13185if ((TEInsertBlock != InsertPt->getParent() ||
13186 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13187 !CheckOrdering(InsertPt))
13188continue;
13189 VToTEs.insert(TEPtr);
13190 }
13191if (const TreeEntry *VTE = getTreeEntry(V)) {
13192if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13193if (VTE->State != TreeEntry::Vectorize) {
13194auto It = MultiNodeScalars.find(V);
13195if (It == MultiNodeScalars.end())
13196continue;
13197 VTE = *It->getSecond().begin();
13198// Iterate through all vectorized nodes.
13199auto *MIt =find_if(It->getSecond(), [](const TreeEntry *MTE) {
13200 return MTE->State == TreeEntry::Vectorize;
13201 });
13202if (MIt == It->getSecond().end())
13203continue;
13204 VTE = *MIt;
13205 }
13206 }
13207if (none_of(TE->CombinedEntriesWithIndices,
13208 [&](constauto &P) { return P.first == VTE->Idx; })) {
13209Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13210if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13211continue;
13212 }
13213 VToTEs.insert(VTE);
13214 }
13215if (VToTEs.empty())
13216continue;
13217if (UsedTEs.empty()) {
13218// The first iteration, just insert the list of nodes to vector.
13219 UsedTEs.push_back(VToTEs);
13220 UsedValuesEntry.try_emplace(V, 0);
13221 }else {
13222// Need to check if there are any previously used tree nodes which use V.
13223// If there are no such nodes, consider that we have one more input
13224// vector.
13225SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
13226unsignedIdx = 0;
13227for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
13228// Do we have a non-empty intersection of previously listed tree entries
13229// and tree entries using current V?
13230set_intersect(VToTEs, Set);
13231if (!VToTEs.empty()) {
13232// Yes, write the new subset and continue analysis for the next
13233// scalar.
13234Set.swap(VToTEs);
13235break;
13236 }
13237 VToTEs = SavedVToTEs;
13238 ++Idx;
13239 }
13240// No non-empty intersection found - need to add a second set of possible
13241// source vectors.
13242if (Idx == UsedTEs.size()) {
13243// If the number of input vectors is greater than 2 - not a permutation,
13244// fallback to the regular gather.
13245// TODO: support multiple reshuffled nodes.
13246if (UsedTEs.size() == 2)
13247continue;
13248 UsedTEs.push_back(SavedVToTEs);
13249Idx = UsedTEs.size() - 1;
13250 }
13251 UsedValuesEntry.try_emplace(V,Idx);
13252 }
13253 }
13254
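// Illustrative sketch (plain std:: types, hypothetical names): the loop above
// implements a running set intersection. Each non-constant scalar contributes
// the set of tree entries it appears in; if every scalar intersects with one
// common set we have a single-source permute, a second disjoint set means a
// two-source permute, and anything beyond that falls back to a gather.
// \code
// #include <algorithm>
// #include <iterator>
// #include <set>
// #include <vector>
//
// // Returns 0 (gather), 1 (single source) or 2 (two sources).
// unsigned classifySources(const std::vector<std::set<int>> &PerScalarEntries) {
//   std::vector<std::set<int>> Used; // at most two candidate source sets
//   for (const std::set<int> &S : PerScalarEntries) {
//     if (S.empty())
//       continue;
//     bool Intersected = false;
//     for (std::set<int> &Candidate : Used) {
//       std::set<int> Common;
//       std::set_intersection(Candidate.begin(), Candidate.end(), S.begin(),
//                             S.end(), std::inserter(Common, Common.begin()));
//       if (!Common.empty()) {
//         Candidate.swap(Common); // narrow the candidate set and move on
//         Intersected = true;
//         break;
//       }
//     }
//     if (!Intersected) {
//       if (Used.size() == 2)
//         continue; // more than two sources: this scalar stays gathered
//       Used.push_back(S);
//     }
//   }
//   return Used.size();
// }
// \endcode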
13255if (UsedTEs.empty()) {
13256 Entries.clear();
13257return std::nullopt;
13258 }
13259
13260unsigned VF = 0;
13261if (UsedTEs.size() == 1) {
13262// Keep the order to avoid non-determinism.
13263SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
13264 UsedTEs.front().end());
13265sort(FirstEntries, [](const TreeEntry *TE1,const TreeEntry *TE2) {
13266return TE1->Idx < TE2->Idx;
13267 });
13268// Try to find the perfect match in another gather node at first.
13269auto *It =find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
13270return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
13271 });
13272if (It != FirstEntries.end() &&
13273 ((*It)->getVectorFactor() == VL.size() ||
13274 ((*It)->getVectorFactor() ==TE->Scalars.size() &&
13275TE->ReuseShuffleIndices.size() == VL.size() &&
13276 (*It)->isSame(TE->Scalars)))) {
13277 Entries.push_back(*It);
13278if ((*It)->getVectorFactor() == VL.size()) {
13279 std::iota(std::next(Mask.begin(), Part * VL.size()),
13280 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
13281 }else {
13282SmallVector<int> CommonMask =TE->getCommonMask();
13283copy(CommonMask,Mask.begin());
13284 }
13285// Clear undef scalars.
13286for (unsignedI : seq<unsigned>(VL.size()))
13287if (isa<PoisonValue>(VL[I]))
13288Mask[Part * VL.size() +I] =PoisonMaskElem;
13289returnTargetTransformInfo::SK_PermuteSingleSrc;
13290 }
13291// No perfect match, just shuffle, so choose the first tree node from the
13292// tree.
13293 Entries.push_back(FirstEntries.front());
13294 VF = FirstEntries.front()->getVectorFactor();
13295 }else {
13296// Try to find nodes with the same vector factor.
13297assert(UsedTEs.size() == 2 &&"Expected at max 2 permuted entries.");
13298// Keep the order of tree nodes to avoid non-determinism.
13299DenseMap<int, const TreeEntry *> VFToTE;
13300for (const TreeEntry *TE : UsedTEs.front()) {
13301unsigned VF =TE->getVectorFactor();
13302auto It = VFToTE.find(VF);
13303if (It != VFToTE.end()) {
13304if (It->second->Idx >TE->Idx)
13305 It->getSecond() =TE;
13306continue;
13307 }
13308 VFToTE.try_emplace(VF, TE);
13309 }
13310// Same, keep the order to avoid non-determinism.
13311SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
13312 UsedTEs.back().end());
13313sort(SecondEntries, [](const TreeEntry *TE1,const TreeEntry *TE2) {
13314return TE1->Idx < TE2->Idx;
13315 });
13316for (const TreeEntry *TE : SecondEntries) {
13317auto It = VFToTE.find(TE->getVectorFactor());
13318if (It != VFToTE.end()) {
13319 VF = It->first;
13320 Entries.push_back(It->second);
13321 Entries.push_back(TE);
13322break;
13323 }
13324 }
13325// No 2 source vectors with the same vector factor - just choose 2 with max
13326// index.
13327if (Entries.empty()) {
13328 Entries.push_back(*llvm::max_element(
13329 UsedTEs.front(), [](const TreeEntry *TE1,const TreeEntry *TE2) {
13330 return TE1->Idx < TE2->Idx;
13331 }));
13332 Entries.push_back(SecondEntries.front());
13333 VF = std::max(Entries.front()->getVectorFactor(),
13334 Entries.back()->getVectorFactor());
13335 }else {
13336 VF = Entries.front()->getVectorFactor();
13337 }
13338 }
13339
13340bool IsSplatOrUndefs =isSplat(VL) ||all_of(VL, IsaPred<UndefValue>);
13341// Checks if the 2 PHIs are compatible, i.e. whether they are highly likely
13342// to be vectorized together.
13343auto AreCompatiblePHIs = [&](Value *V,Value *V1) {
13344auto *PHI = cast<PHINode>(V);
13345auto *PHI1 = cast<PHINode>(V1);
13346// Check that all incoming values are compatible/from the same parent (if
13347// they are instructions).
13348// The incoming values are compatible if they are all constants, or
13349// instructions with the same/alternate opcodes from the same basic block.
13350for (intI = 0, E =PHI->getNumIncomingValues();I < E; ++I) {
13351Value *In =PHI->getIncomingValue(I);
13352Value *In1 = PHI1->getIncomingValue(I);
13353if (isConstant(In) &&isConstant(In1))
13354continue;
13355if (!getSameOpcode({In, In1}, *TLI))
13356returnfalse;
13357if (cast<Instruction>(In)->getParent() !=
13358 cast<Instruction>(In1)->getParent())
13359returnfalse;
13360 }
13361returntrue;
13362 };
13363// Check if the value can be ignored during analysis for shuffled gathers.
13364// We suppose it is better to ignore instructions which do not form splats,
13365// are not vectorized/not extractelements (those will be handled by the
13366// extractelements processing) or may form a vector node in the future.
13367auto MightBeIgnored = [=](Value *V) {
13368auto *I = dyn_cast<Instruction>(V);
13369returnI && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
13370 !isVectorLikeInstWithConstOps(I) &&
13371 !areAllUsersVectorized(I, UserIgnoreList) &&isSimple(I);
13372 };
13373// Check that the neighbor instruction may form a full vector node with the
13374// current instruction V. It is possible, if they have same/alternate opcode
13375// and same parent basic block.
13376auto NeighborMightBeIgnored = [&](Value *V,intIdx) {
13377Value *V1 = VL[Idx];
13378bool UsedInSameVTE =false;
13379auto It = UsedValuesEntry.find(V1);
13380if (It != UsedValuesEntry.end())
13381 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
13382returnV != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13383getSameOpcode({V, V1}, *TLI) &&
13384 cast<Instruction>(V)->getParent() ==
13385 cast<Instruction>(V1)->getParent() &&
13386 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13387 };
13388// Build a shuffle mask for better cost estimation and vector emission.
13389SmallBitVector UsedIdxs(Entries.size());
13390SmallVector<std::pair<unsigned, int>> EntryLanes;
13391for (intI = 0, E = VL.size();I < E; ++I) {
13392Value *V = VL[I];
13393auto It = UsedValuesEntry.find(V);
13394if (It == UsedValuesEntry.end())
13395continue;
13396// Do not try to shuffle scalars if they are constants, or if they are
13397// instructions that may be vectorized as part of the subsequent
13398// buildvector vectorization.
13399if (isConstant(V) || (MightBeIgnored(V) &&
13400 ((I > 0 && NeighborMightBeIgnored(V,I - 1)) ||
13401 (I != E - 1 && NeighborMightBeIgnored(V,I + 1)))))
13402continue;
13403unsignedIdx = It->second;
13404 EntryLanes.emplace_back(Idx,I);
13405 UsedIdxs.set(Idx);
13406 }
13407// Iterate through all shuffled scalars and select entries, which can be used
13408// for final shuffle.
13409SmallVector<const TreeEntry *> TempEntries;
13410for (unsignedI = 0, Sz = Entries.size();I < Sz; ++I) {
13411if (!UsedIdxs.test(I))
13412continue;
13413// Fix the entry number for the given scalar. If it is the first entry, set
13414// Pair.first to 0, otherwise to 1 (at most 2 nodes are currently selected).
13415// These indices are used as the vector offset when calculating the final
13416// shuffle mask.
13417for (std::pair<unsigned, int> &Pair : EntryLanes)
13418if (Pair.first ==I)
13419 Pair.first = TempEntries.size();
13420 TempEntries.push_back(Entries[I]);
13421 }
13422 Entries.swap(TempEntries);
13423if (EntryLanes.size() == Entries.size() &&
13424 !VL.equals(ArrayRef(TE->Scalars)
13425 .slice(Part * VL.size(),
13426 std::min<int>(VL.size(),TE->Scalars.size())))) {
13427// We may have only 1 or 2 entries here. If the number of scalars is equal
13428// to the number of entries, there is no need to do the analysis, it is not
13429// very profitable. Since VL is not the same as TE->Scalars, it means we
13430// already have some shuffles before. Cut off this unprofitable case.
13431 Entries.clear();
13432return std::nullopt;
13433 }
13434// Build the final mask, check for the identity shuffle, if possible.
13435bool IsIdentity = Entries.size() == 1;
13436// Pair.first is the offset to the vector, while Pair.second is the index of
13437// scalar in the list.
13438for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13439unsignedIdx = Part * VL.size() + Pair.second;
13440Mask[Idx] =
13441 Pair.first * VF +
13442 (ForOrder ? std::distance(
13443 Entries[Pair.first]->Scalars.begin(),
13444find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13445 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13446 IsIdentity &=Mask[Idx] == Pair.second;
13447 }
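// Illustrative sketch: the mask built above encodes, for every scalar, which
// of the (at most two) selected entries it comes from and at which lane, with
// the second entry's lanes biased by VF. A minimal stand-alone model with
// hypothetical names:
// \code
// #include <utility>
// #include <vector>
//
// constexpr int kPoison = -1;
//
// // Each element: (entry index 0 or 1, lane of the scalar in that entry).
// bool buildFinalMask(const std::vector<std::pair<unsigned, unsigned>> &Lanes,
//                     unsigned VF, std::vector<int> &Mask) {
//   Mask.assign(Lanes.size(), kPoison);
//   bool IsIdentity = true;
//   for (unsigned I = 0; I < Lanes.size(); ++I) {
//     Mask[I] = Lanes[I].first * VF + Lanes[I].second;
//     IsIdentity &= Mask[I] == static_cast<int>(I);
//   }
//   return IsIdentity; // identity => the shuffle can be dropped entirely
// }
// \endcode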
13448if (ForOrder || IsIdentity || Entries.empty()) {
13449switch (Entries.size()) {
13450case 1:
13451if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13452returnTargetTransformInfo::SK_PermuteSingleSrc;
13453break;
13454case 2:
13455if (EntryLanes.size() > 2 || VL.size() <= 2)
13456returnTargetTransformInfo::SK_PermuteTwoSrc;
13457break;
13458default:
13459break;
13460 }
13461  } else if (!isa<VectorType>(VL.front()->getType()) &&
13462             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13463// Do the cost estimation to check whether a shuffle is more beneficial than a buildvector.
13464SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13465 std::next(Mask.begin(), (Part + 1) * VL.size()));
13466int MinElement = SubMask.front(), MaxElement = SubMask.front();
13467for (intIdx : SubMask) {
13468if (Idx ==PoisonMaskElem)
13469continue;
13470if (MinElement ==PoisonMaskElem || MinElement % VF >Idx % VF)
13471 MinElement =Idx;
13472if (MaxElement ==PoisonMaskElem || MaxElement % VF <Idx % VF)
13473 MaxElement =Idx;
13474 }
13475assert(MaxElement >= 0 && MinElement >= 0 &&
13476 MaxElement % VF >= MinElement % VF &&
13477"Expected at least single element.");
13478unsigned NewVF = std::max<unsigned>(
13479 VL.size(),getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13480 (MaxElement % VF) -
13481 (MinElement % VF) + 1));
13482if (NewVF < VF) {
13483for_each(SubMask, [&](int &Idx) {
13484if (Idx ==PoisonMaskElem)
13485return;
13486Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
13487 (Idx >=static_cast<int>(VF) ? NewVF : 0);
13488 });
13489 }else {
13490 NewVF = VF;
13491 }
13492
13493constexprTTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
13494auto *VecTy =getWidenedType(VL.front()->getType(), NewVF);
13495auto *MaskVecTy =getWidenedType(VL.front()->getType(), SubMask.size());
13496auto GetShuffleCost = [&,
13497 &TTI = *TTI](ArrayRef<int>Mask,
13498ArrayRef<const TreeEntry *> Entries,
13499VectorType *VecTy) ->InstructionCost {
13500if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13501ShuffleVectorInst::isDeInterleaveMaskOfFactor(
13502 Mask, Entries.front()->getInterleaveFactor()))
13503returnTTI::TCC_Free;
13504 return ::getShuffleCost(TTI,
13505 Entries.size() > 1 ?TTI::SK_PermuteTwoSrc
13506 :TTI::SK_PermuteSingleSrc,
13507 VecTy, Mask,CostKind);
13508 };
13509InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13510InstructionCost FirstShuffleCost = 0;
13511SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13512if (Entries.size() == 1 || !Entries[0]->isGather()) {
13513 FirstShuffleCost = ShuffleCost;
13514 }else {
13515// Transform the mask to include only the first entry.
13516APInt DemandedElts =APInt::getAllOnes(SubMask.size());
13517bool IsIdentity =true;
13518for (auto [I,Idx] :enumerate(FirstMask)) {
13519if (Idx >=static_cast<int>(NewVF)) {
13520Idx =PoisonMaskElem;
13521 }else {
13522 DemandedElts.clearBit(I);
13523if (Idx !=PoisonMaskElem)
13524 IsIdentity &=static_cast<int>(I) ==Idx;
13525 }
13526 }
13527if (!IsIdentity)
13528 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13529 FirstShuffleCost +=TTI->getScalarizationOverhead(
13530 MaskVecTy, DemandedElts,/*Insert=*/true,
13531/*Extract=*/false,CostKind);
13532 }
13533InstructionCost SecondShuffleCost = 0;
13534SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13535if (Entries.size() == 1 || !Entries[1]->isGather()) {
13536 SecondShuffleCost = ShuffleCost;
13537 }else {
13538// Transform the mask to include only the second entry.
13539APInt DemandedElts =APInt::getAllOnes(SubMask.size());
13540bool IsIdentity =true;
13541for (auto [I,Idx] :enumerate(SecondMask)) {
13542if (Idx <static_cast<int>(NewVF) &&Idx >= 0) {
13543Idx =PoisonMaskElem;
13544 }else {
13545 DemandedElts.clearBit(I);
13546if (Idx !=PoisonMaskElem) {
13547Idx -= NewVF;
13548 IsIdentity &=static_cast<int>(I) ==Idx;
13549 }
13550 }
13551 }
13552if (!IsIdentity)
13553 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13554 SecondShuffleCost +=TTI->getScalarizationOverhead(
13555 MaskVecTy, DemandedElts,/*Insert=*/true,
13556/*Extract=*/false,CostKind);
13557 }
13558APInt DemandedElts =APInt::getAllOnes(SubMask.size());
13559for (auto [I,Idx] :enumerate(SubMask))
13560if (Idx ==PoisonMaskElem)
13561 DemandedElts.clearBit(I);
13562InstructionCost BuildVectorCost =
13563TTI->getScalarizationOverhead(MaskVecTy, DemandedElts,/*Insert=*/true,
13564/*Extract=*/false,CostKind);
13565const TreeEntry *BestEntry =nullptr;
13566if (FirstShuffleCost < ShuffleCost) {
13567 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13568 std::next(Mask.begin(), (Part + 1) * VL.size()),
13569 [&](int &Idx) {
13570 if (Idx >= static_cast<int>(VF))
13571 Idx = PoisonMaskElem;
13572 });
13573 BestEntry = Entries.front();
13574 ShuffleCost = FirstShuffleCost;
13575 }
13576if (SecondShuffleCost < ShuffleCost) {
13577 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13578 std::next(Mask.begin(), (Part + 1) * VL.size()),
13579 [&](int &Idx) {
13580 if (Idx < static_cast<int>(VF))
13581 Idx = PoisonMaskElem;
13582 else
13583 Idx -= VF;
13584 });
13585 BestEntry = Entries[1];
13586 ShuffleCost = SecondShuffleCost;
13587 }
13588if (BuildVectorCost >= ShuffleCost) {
13589if (BestEntry) {
13590 Entries.clear();
13591 Entries.push_back(BestEntry);
13592 }
13593return Entries.size() > 1 ?TargetTransformInfo::SK_PermuteTwoSrc
13594 :TargetTransformInfo::SK_PermuteSingleSrc;
13595 }
13596 }
13597 Entries.clear();
13598// Clear the corresponding mask elements.
13599 std::fill(std::next(Mask.begin(), Part * VL.size()),
13600 std::next(Mask.begin(), (Part + 1) * VL.size()),PoisonMaskElem);
13601return std::nullopt;
13602}
13603
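// Illustrative sketch: the tail of the function above compares three ways of
// materializing the gathered part: a shuffle of both candidate entries, a
// shuffle of only one entry plus scalar inserts for the remaining lanes, and
// a plain buildvector. A compressed model of that decision, with the costs
// assumed to be precomputed by the caller:
// \code
// enum class GatherPlan { TwoSourceShuffle, OneSourceShuffle, BuildVector };
//
// GatherPlan pickGatherPlan(int TwoSrcShuffleCost, int FirstOnlyCost,
//                           int SecondOnlyCost, int BuildVectorCost) {
//   int Best = TwoSrcShuffleCost;
//   GatherPlan Plan = GatherPlan::TwoSourceShuffle;
//   if (FirstOnlyCost < Best) {
//     Best = FirstOnlyCost;
//     Plan = GatherPlan::OneSourceShuffle;
//   }
//   if (SecondOnlyCost < Best) {
//     Best = SecondOnlyCost;
//     Plan = GatherPlan::OneSourceShuffle;
//   }
//   // Only keep a shuffle-based plan if it is no worse than rebuilding the
//   // vector from scalars.
//   return BuildVectorCost >= Best ? Plan : GatherPlan::BuildVector;
// }
// \endcode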
13604SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
13605BoUpSLP::isGatherShuffledEntry(
13606const TreeEntry *TE,ArrayRef<Value *> VL,SmallVectorImpl<int> &Mask,
13607SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,unsigned NumParts,
13608bool ForOrder) {
13609assert(NumParts > 0 && NumParts < VL.size() &&
13610"Expected positive number of registers.");
13611 Entries.clear();
13612// No need to check for the topmost gather node.
13613if (TE == VectorizableTree.front().get() &&
13614 (!GatheredLoadsEntriesFirst.has_value() ||
13615none_of(ArrayRef(VectorizableTree).drop_front(),
13616 [](const std::unique_ptr<TreeEntry> &TE) {
13617return !TE->isGather();
13618 })))
13619return {};
13620// FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
13621// implemented yet.
13622if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
13623return {};
13624Mask.assign(VL.size(),PoisonMaskElem);
13625assert((TE->UserTreeIndices.size() == 1 ||
13626 TE == VectorizableTree.front().get()) &&
13627"Expected only single user of the gather node.");
13628assert(VL.size() % NumParts == 0 &&
13629"Number of scalars must be divisible by NumParts.");
13630if (!TE->UserTreeIndices.empty() &&
13631TE->UserTreeIndices.front().UserTE->isGather() &&
13632TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13633assert(
13634 (TE->Idx == 0 ||
13635 (TE->hasState() &&TE->getOpcode() == Instruction::ExtractElement) ||
13636isSplat(TE->Scalars)) &&
13637"Expected splat or extractelements only node.");
13638return {};
13639 }
13640unsigned SliceSize =getPartNumElems(VL.size(), NumParts);
13641SmallVector<std::optional<TTI::ShuffleKind>> Res;
13642for (unsigned Part : seq<unsigned>(NumParts)) {
13643ArrayRef<Value *> SubVL =
13644 VL.slice(Part * SliceSize,getNumElems(VL.size(), SliceSize, Part));
13645SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
13646 std::optional<TTI::ShuffleKind> SubRes =
13647 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13648 ForOrder);
13649if (!SubRes)
13650 SubEntries.clear();
13651 Res.push_back(SubRes);
13652if (SubEntries.size() == 1 && *SubRes ==TTI::SK_PermuteSingleSrc &&
13653 SubEntries.front()->getVectorFactor() == VL.size() &&
13654 (SubEntries.front()->isSame(TE->Scalars) ||
13655 SubEntries.front()->isSame(VL))) {
13656SmallVector<const TreeEntry *> LocalSubEntries;
13657 LocalSubEntries.swap(SubEntries);
13658 Entries.clear();
13659 Res.clear();
13660 std::iota(Mask.begin(),Mask.end(), 0);
13661// Clear undef scalars.
13662for (intI = 0, Sz = VL.size();I < Sz; ++I)
13663if (isa<PoisonValue>(VL[I]))
13664Mask[I] =PoisonMaskElem;
13665 Entries.emplace_back(1, LocalSubEntries.front());
13666 Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
13667return Res;
13668 }
13669 }
13670if (all_of(Res,
13671 [](const std::optional<TTI::ShuffleKind> &SK) {return !SK; })) {
13672 Entries.clear();
13673return {};
13674 }
13675return Res;
13676}
13677
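// Illustrative sketch: the early exit above says "if any slice already maps
// onto a single tree entry of exactly the gather's width, drop the per-part
// results and treat the whole gather as one identity permute of that entry".
// A tiny stand-alone model of the resulting mask rewrite, with -1 for poison:
// \code
// #include <numeric>
// #include <vector>
//
// constexpr int kPoison = -1;
//
// void makeIdentityMask(std::vector<int> &Mask,
//                       const std::vector<bool> &LaneIsPoison) {
//   std::iota(Mask.begin(), Mask.end(), 0); // 0, 1, 2, ...
//   for (unsigned I = 0; I < Mask.size(); ++I)
//     if (LaneIsPoison[I])
//       Mask[I] = kPoison; // undef scalars stay undefined in the mask
// }
// \endcode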
13678InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,bool ForPoisonSrc,
13679Type *ScalarTy) const{
13680auto *VecTy =getWidenedType(ScalarTy, VL.size());
13681bool DuplicateNonConst =false;
13682// Find the cost of inserting/extracting values from the vector.
13683// Check if the same elements are inserted several times and count them as
13684// shuffle candidates.
13685APInt ShuffledElements =APInt::getZero(VL.size());
13686DenseMap<Value *, unsigned> UniqueElements;
13687constexprTTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
13688InstructionCostCost;
13689auto EstimateInsertCost = [&](unsignedI,Value *V) {
13690if (V->getType() != ScalarTy) {
13691Cost +=TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,V->getType(),
13692TTI::CastContextHint::None,CostKind);
13693V =nullptr;
13694 }
13695if (!ForPoisonSrc)
13696Cost +=
13697TTI->getVectorInstrCost(Instruction::InsertElement, VecTy,CostKind,
13698I,Constant::getNullValue(VecTy),V);
13699 };
13700SmallVector<int> ShuffleMask(VL.size(),PoisonMaskElem);
13701for (unsignedI = 0, E = VL.size();I < E; ++I) {
13702Value *V = VL[I];
13703// No need to shuffle duplicates for constants.
13704if ((ForPoisonSrc &&isConstant(V)) || isa<UndefValue>(V)) {
13705 ShuffledElements.setBit(I);
13706 ShuffleMask[I] = isa<PoisonValue>(V) ?PoisonMaskElem :I;
13707continue;
13708 }
13709
13710auto Res = UniqueElements.try_emplace(V,I);
13711if (Res.second) {
13712 EstimateInsertCost(I, V);
13713 ShuffleMask[I] =I;
13714continue;
13715 }
13716
13717 DuplicateNonConst =true;
13718 ShuffledElements.setBit(I);
13719 ShuffleMask[I] = Res.first->second;
13720 }
13721if (ForPoisonSrc) {
13722if (isa<FixedVectorType>(ScalarTy)) {
13723assert(SLPReVec &&"Only supported by REVEC.");
13724// We don't need to insert elements one by one. Instead, we can insert the
13725// entire vector into the destination.
13726Cost = 0;
13727unsigned ScalarTyNumElements =getNumElements(ScalarTy);
13728for (unsignedI : seq<unsigned>(VL.size()))
13729if (!ShuffledElements[I])
13730Cost +=TTI->getShuffleCost(
13731TTI::SK_InsertSubvector, VecTy, std::nullopt,CostKind,
13732I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13733 }else {
13734Cost =TTI->getScalarizationOverhead(VecTy,
13735/*DemandedElts*/ ~ShuffledElements,
13736/*Insert*/true,
13737/*Extract*/false,CostKind, VL);
13738 }
13739 }
13740if (DuplicateNonConst)
13741Cost +=::getShuffleCost(*TTI,TargetTransformInfo::SK_PermuteSingleSrc,
13742 VecTy, ShuffleMask);
13743returnCost;
13744}
13745
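// Illustrative sketch (abstract per-lane costs, hypothetical names): the
// gather cost computed above is essentially "one insert per unique
// non-constant scalar, plus one single-source permute if any non-constant
// scalar repeats".
// \code
// #include <unordered_map>
// #include <vector>
//
// int estimateGatherCost(const std::vector<int> &ScalarIds,
//                        const std::vector<bool> &IsConstant,
//                        int InsertCost, int PermuteCost) {
//   std::unordered_map<int, unsigned> FirstLane;
//   bool HasDuplicates = false;
//   int Cost = 0;
//   for (unsigned I = 0; I < ScalarIds.size(); ++I) {
//     if (IsConstant[I])
//       continue; // constants are folded into the initial vector
//     auto Res = FirstLane.try_emplace(ScalarIds[I], I);
//     if (Res.second)
//       Cost += InsertCost; // first occurrence is inserted
//     else
//       HasDuplicates = true; // repeats are produced by the final permute
//   }
//   if (HasDuplicates)
//     Cost += PermuteCost;
//   return Cost;
// }
// \endcode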
13746Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
13747auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
13748if (Res)
13749return *Res;
13750// Get the basic block this bundle is in. All instructions in the bundle
13751// should be in this block (except for extractelement-like instructions with
13752// constant indices or gathered loads).
13753auto *Front = E->getMainOp();
13754auto *BB = Front->getParent();
13755assert(((GatheredLoadsEntriesFirst.has_value() &&
13756 E->getOpcode() == Instruction::Load && E->isGather() &&
13757 E->Idx < *GatheredLoadsEntriesFirst) ||
13758all_of(E->Scalars,
13759 [=](Value *V) ->bool {
13760 if (E->getOpcode() == Instruction::GetElementPtr &&
13761 !isa<GetElementPtrInst>(V))
13762 return true;
13763 auto *I = dyn_cast<Instruction>(V);
13764 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13765 isVectorLikeInstWithConstOps(I);
13766 })) &&
13767"Expected gathered loads or GEPs or instructions from same basic "
13768"block.");
13769
13770auto FindLastInst = [&]() {
13771Instruction *LastInst = Front;
13772for (Value *V : E->Scalars) {
13773auto *I = dyn_cast<Instruction>(V);
13774if (!I)
13775continue;
13776if (LastInst->getParent() ==I->getParent()) {
13777if (LastInst->comesBefore(I))
13778 LastInst =I;
13779continue;
13780 }
13781assert(((E->getOpcode() == Instruction::GetElementPtr &&
13782 !isa<GetElementPtrInst>(I)) ||
13783 (isVectorLikeInstWithConstOps(LastInst) &&
13784isVectorLikeInstWithConstOps(I)) ||
13785 (GatheredLoadsEntriesFirst.has_value() &&
13786 E->getOpcode() == Instruction::Load && E->isGather() &&
13787 E->Idx < *GatheredLoadsEntriesFirst)) &&
13788"Expected vector-like or non-GEP in GEP node insts only.");
13789if (!DT->isReachableFromEntry(LastInst->getParent())) {
13790 LastInst =I;
13791continue;
13792 }
13793if (!DT->isReachableFromEntry(I->getParent()))
13794continue;
13795auto *NodeA = DT->getNode(LastInst->getParent());
13796auto *NodeB = DT->getNode(I->getParent());
13797assert(NodeA &&"Should only process reachable instructions");
13798assert(NodeB &&"Should only process reachable instructions");
13799assert((NodeA == NodeB) ==
13800 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13801"Different nodes should have different DFS numbers");
13802if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13803 LastInst =I;
13804 }
13805 BB = LastInst->getParent();
13806return LastInst;
13807 };
13808
13809auto FindFirstInst = [&]() {
13810Instruction *FirstInst = Front;
13811for (Value *V : E->Scalars) {
13812auto *I = dyn_cast<Instruction>(V);
13813if (!I)
13814continue;
13815if (FirstInst->getParent() ==I->getParent()) {
13816if (I->comesBefore(FirstInst))
13817 FirstInst =I;
13818continue;
13819 }
13820assert(((E->getOpcode() == Instruction::GetElementPtr &&
13821 !isa<GetElementPtrInst>(I)) ||
13822 (isVectorLikeInstWithConstOps(FirstInst) &&
13823isVectorLikeInstWithConstOps(I))) &&
13824"Expected vector-like or non-GEP in GEP node insts only.");
13825if (!DT->isReachableFromEntry(FirstInst->getParent())) {
13826 FirstInst =I;
13827continue;
13828 }
13829if (!DT->isReachableFromEntry(I->getParent()))
13830continue;
13831auto *NodeA = DT->getNode(FirstInst->getParent());
13832auto *NodeB = DT->getNode(I->getParent());
13833assert(NodeA &&"Should only process reachable instructions");
13834assert(NodeB &&"Should only process reachable instructions");
13835assert((NodeA == NodeB) ==
13836 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13837"Different nodes should have different DFS numbers");
13838if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13839 FirstInst =I;
13840 }
13841return FirstInst;
13842 };
13843
13844// Set insertpoint for gathered loads to the very first load.
13845if (GatheredLoadsEntriesFirst.has_value() &&
13846 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13847 E->getOpcode() == Instruction::Load) {
13848 Res = FindFirstInst();
13849return *Res;
13850 }
13851
13852// Set the insert point to the beginning of the basic block if the entry
13853// should not be scheduled.
13854if (doesNotNeedToSchedule(E->Scalars) ||
13855 (!E->isGather() &&all_of(E->Scalars,isVectorLikeInstWithConstOps))) {
13856if ((E->getOpcode() == Instruction::GetElementPtr &&
13857any_of(E->Scalars,
13858 [](Value *V) {
13859 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13860 })) ||
13861all_of(E->Scalars,
13862 [](Value *V) {
13863 return isa<PoisonValue>(V) ||
13864 (!isVectorLikeInstWithConstOps(V) &&
13865 isUsedOutsideBlock(V));
13866 }) ||
13867 (E->isGather() && E->Idx == 0 &&all_of(E->Scalars, [](Value *V) {
13868 return isa<ExtractElementInst, UndefValue>(V) ||
13869 areAllOperandsNonInsts(V);
13870 })))
13871 Res = FindLastInst();
13872else
13873 Res = FindFirstInst();
13874return *Res;
13875 }
13876
13877// Find the last instruction. The common case should be that BB has been
13878// scheduled, and the last instruction is VL.back(). So we start with
13879// VL.back() and iterate over schedule data until we reach the end of the
13880// bundle. The end of the bundle is marked by null ScheduleData.
13881if (BlocksSchedules.count(BB) && !E->isGather()) {
13882Value *V = E->isOneOf(E->Scalars.back());
13883if (doesNotNeedToBeScheduled(V))
13884V = *find_if_not(E->Scalars,doesNotNeedToBeScheduled);
13885auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13886if (Bundle && Bundle->isPartOfBundle())
13887for (; Bundle; Bundle = Bundle->NextInBundle)
13888 Res = Bundle->Inst;
13889 }
13890
13891// LastInst can still be null at this point if there's either not an entry
13892// for BB in BlocksSchedules or there's no ScheduleData available for
13893// VL.back(). This can be the case if buildTree_rec aborts for various
13894// reasons (e.g., the maximum recursion depth is reached, the maximum region
13895// size is reached, etc.). ScheduleData is initialized in the scheduling
13896// "dry-run".
13897//
13898// If this happens, we can still find the last instruction by brute force. We
13899// iterate forwards from Front (inclusive) until we either see all
13900// instructions in the bundle or reach the end of the block. If Front is the
13901// last instruction in program order, LastInst will be set to Front, and we
13902// will visit all the remaining instructions in the block.
13903//
13904// One of the reasons we exit early from buildTree_rec is to place an upper
13905// bound on compile-time. Thus, taking an additional compile-time hit here is
13906// not ideal. However, this should be exceedingly rare since it requires that
13907// we both exit early from buildTree_rec and that the bundle be out-of-order
13908// (causing us to iterate all the way to the end of the block).
13909if (!Res)
13910 Res = FindLastInst();
13911assert(Res &&"Failed to find last instruction in bundle");
13912return *Res;
13913}
13914
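// Illustrative sketch: FindLastInst above reduces to "take the instruction
// that is latest in program order, comparing by the dominator-tree DFS-in
// number of the parent block first and by position within the block second".
// Modelled with plain structs and hypothetical names (Bundle assumed
// non-empty):
// \code
// #include <vector>
//
// struct InstPos {
//   unsigned BlockDFSIn; // DFS-in number of the parent block
//   unsigned InstIndex;  // position inside the block
// };
//
// InstPos findLast(const std::vector<InstPos> &Bundle) {
//   InstPos Last = Bundle.front();
//   for (const InstPos &P : Bundle)
//     if (P.BlockDFSIn > Last.BlockDFSIn ||
//         (P.BlockDFSIn == Last.BlockDFSIn && P.InstIndex > Last.InstIndex))
//       Last = P; // later block, or later within the same block
//   return Last;
// }
// \endcode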
13915void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
13916auto *Front = E->getMainOp();
13917Instruction *LastInst = &getLastInstructionInBundle(E);
13918assert(LastInst &&"Failed to find last instruction in bundle");
13919BasicBlock::iterator LastInstIt = LastInst->getIterator();
13920// If the instruction is PHI, set the insert point after all the PHIs.
13921bool IsPHI = isa<PHINode>(LastInst);
13922if (IsPHI)
13923 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
13924if (IsPHI || (!E->isGather() &&doesNotNeedToSchedule(E->Scalars))) {
13925 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
13926 }else {
13927// Set the insertion point after the last instruction in the bundle. Set the
13928// debug location to Front.
13929 Builder.SetInsertPoint(
13930 LastInst->getParent(),
13931 LastInst->getNextNonDebugInstruction()->getIterator());
13932 }
13933 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13934}
13935
13936Value *BoUpSLP::gather(
13937ArrayRef<Value *> VL,Value *Root,Type *ScalarTy,
13938function_ref<Value *(Value *,Value *,ArrayRef<int>)> CreateShuffle) {
13939// List of instructions/lanes from the current block and/or the blocks which
13940// are part of the current loop. These instructions will be inserted at the
13941// end to make it possible to optimize loops and hoist invariant instructions
13942// out of the loop body with better chances for success.
13943SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
13944SmallSet<int, 4> PostponedIndices;
13945Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
13946auto &&CheckPredecessor = [](BasicBlock *InstBB,BasicBlock *InsertBB) {
13947SmallPtrSet<BasicBlock *, 4> Visited;
13948while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
13949 InsertBB = InsertBB->getSinglePredecessor();
13950return InsertBB && InsertBB == InstBB;
13951 };
13952for (intI = 0, E = VL.size();I < E; ++I) {
13953if (auto *Inst = dyn_cast<Instruction>(VL[I]))
13954if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13955 getTreeEntry(Inst) ||
13956 (L && (!Root ||L->isLoopInvariant(Root)) &&L->contains(Inst))) &&
13957 PostponedIndices.insert(I).second)
13958 PostponedInsts.emplace_back(Inst,I);
13959 }
13960
13961auto &&CreateInsertElement = [this](Value *Vec,Value *V,unsigned Pos,
13962Type *Ty) {
13963Value *Scalar =V;
13964if (Scalar->getType() != Ty) {
13965assert(Scalar->getType()->isIntOrIntVectorTy() &&
13966 Ty->isIntOrIntVectorTy() &&"Expected integer types only.");
13967Value *V =Scalar;
13968if (auto *CI = dyn_cast<CastInst>(Scalar);
13969 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13970Value *Op = CI->getOperand(0);
13971if (auto *IOp = dyn_cast<Instruction>(Op);
13972 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
13973V =Op;
13974 }
13975Scalar = Builder.CreateIntCast(
13976 V, Ty, !isKnownNonNegative(Scalar,SimplifyQuery(*DL)));
13977 }
13978
13979Instruction *InsElt;
13980if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
13981assert(SLPReVec &&"FixedVectorType is not expected.");
13982 Vec =
13983createInsertVector(Builder, Vec, Scalar, Pos *getNumElements(VecTy));
13984auto *II = dyn_cast<IntrinsicInst>(Vec);
13985if (!II ||II->getIntrinsicID() != Intrinsic::vector_insert)
13986return Vec;
13987 InsElt =II;
13988 }else {
13989 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13990 InsElt = dyn_cast<InsertElementInst>(Vec);
13991if (!InsElt)
13992return Vec;
13993 }
13994 GatherShuffleExtractSeq.insert(InsElt);
13995 CSEBlocks.insert(InsElt->getParent());
13996// Add to our 'need-to-extract' list.
13997if (isa<Instruction>(V)) {
13998if (TreeEntry *Entry = getTreeEntry(V)) {
13999// Find which lane we need to extract.
14000User *UserOp =nullptr;
14001if (Scalar != V) {
14002if (auto *SI = dyn_cast<Instruction>(Scalar))
14003 UserOp =SI;
14004 }else {
14005 UserOp = InsElt;
14006 }
14007if (UserOp) {
14008unsigned FoundLane =Entry->findLaneForValue(V);
14009 ExternalUses.emplace_back(V, UserOp, FoundLane);
14010 }
14011 }
14012 }
14013return Vec;
14014 };
14015auto *VecTy =getWidenedType(ScalarTy, VL.size());
14016Value *Vec =PoisonValue::get(VecTy);
14017SmallVector<int> NonConsts;
14018SmallVector<int>Mask(VL.size());
14019 std::iota(Mask.begin(),Mask.end(), 0);
14020Value *OriginalRoot = Root;
14021if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
14022 SV && isa<PoisonValue>(SV->getOperand(1)) &&
14023 SV->getOperand(0)->getType() == VecTy) {
14024 Root = SV->getOperand(0);
14025Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
14026 }
14027// Insert constant values at first.
14028for (intI = 0, E = VL.size();I < E; ++I) {
14029if (PostponedIndices.contains(I))
14030continue;
14031if (!isConstant(VL[I])) {
14032 NonConsts.push_back(I);
14033continue;
14034 }
14035if (isa<PoisonValue>(VL[I]))
14036continue;
14037 Vec = CreateInsertElement(Vec, VL[I],I, ScalarTy);
14038Mask[I] =I + E;
14039 }
14040if (Root) {
14041if (isa<PoisonValue>(Vec)) {
14042 Vec = OriginalRoot;
14043 }else {
14044 Vec = CreateShuffle(Root, Vec, Mask);
14045if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
14046 OI && OI->hasNUses(0) &&
14047none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14048returnTE->VectorizedValue == OI;
14049 }))
14050eraseInstruction(OI);
14051 }
14052 }
14053// Insert non-constant values.
14054for (intI : NonConsts)
14055 Vec = CreateInsertElement(Vec, VL[I],I, ScalarTy);
14056// Append instructions, which are/may be part of the loop, in the end to make
14057// it possible to hoist non-loop-based instructions.
14058for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
14059 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
14060
14061return Vec;
14062}
14063
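// Illustrative sketch: the emission order in gather() above is "constants
// first (they can be folded into the initial vector), then the remaining
// scalars, and finally values living in the current loop, so that
// loop-invariant code still has a chance to be hoisted". A compressed model
// with hypothetical names:
// \code
// #include <vector>
//
// struct Scalar {
//   int Value;
//   bool IsConstant;
//   bool InCurrentLoop;
// };
//
// std::vector<int> emissionOrder(const std::vector<Scalar> &VL) {
//   std::vector<int> Order, NonConst, Postponed;
//   for (unsigned I = 0; I < VL.size(); ++I) {
//     if (VL[I].InCurrentLoop)
//       Postponed.push_back(I);      // emitted last
//     else if (VL[I].IsConstant)
//       Order.push_back(I);          // emitted first
//     else
//       NonConst.push_back(I);       // emitted second
//   }
//   Order.insert(Order.end(), NonConst.begin(), NonConst.end());
//   Order.insert(Order.end(), Postponed.begin(), Postponed.end());
//   return Order;
// }
// \endcode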
14064/// Merges shuffle masks and emits the final shuffle instruction, if required.
14065/// It supports shuffling of 2 input vectors. It implements lazy shuffle
14066/// emission: the actual shuffle instruction is generated only if it is really
14067/// required. Otherwise, the shuffle instruction emission is delayed until the
14068/// end of the process, to reduce the number of emitted instructions and the
14069/// amount of further analysis/transformations.
14070/// The class will also look through the previously emitted shuffle
14071/// instructions and properly mark indices in the mask as undef.
14072/// For example, given the code
14073/// \code
14074/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
14075/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
14076/// \endcode
14077/// and, if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
14078/// look through %s1 and %s2 and emit
14079/// \code
14080/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14081/// \endcode
14082/// instead.
14083/// If 2 operands are of different size, the smallest one will be resized and
14084/// the mask recalculated properly.
14085/// For example, given the code
14086/// \code
14087/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
14088/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
14089/// \endcode
14090/// and, if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
14091/// look through %s1 and %s2 and emit
14092/// \code
14093/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14094/// \endcode
14095/// instead.
14096classBoUpSLP::ShuffleInstructionBuilder final :public BaseShuffleAnalysis {
14097bool IsFinalized =false;
14098 /// Combined mask for all applied operands and masks. It is built during
14099 /// analysis and actual emission of shuffle vector instructions.
14100SmallVector<int> CommonMask;
14101 /// List of operands for the shuffle vector instruction. It holds at most 2
14102 /// operands. If a 3rd one is going to be added, the first 2 are combined into
14103 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
14104 /// resulting shuffle and the second operand is set to be the newly added
14105 /// operand. The \p CommonMask is transformed in the proper way after that.
14106SmallVector<Value *, 2> InVectors;
14107IRBuilderBase &Builder;
14108BoUpSLP &R;
14109
14110classShuffleIRBuilder {
14111IRBuilderBase &Builder;
14112 /// Holds all of the instructions that we gathered.
14113SetVector<Instruction *> &GatherShuffleExtractSeq;
14114 /// A list of blocks that we are going to CSE.
14115DenseSet<BasicBlock *> &CSEBlocks;
14116 /// Data layout.
14117constDataLayout &DL;
14118
14119public:
14120 ShuffleIRBuilder(IRBuilderBase &Builder,
14121SetVector<Instruction *> &GatherShuffleExtractSeq,
14122DenseSet<BasicBlock *> &CSEBlocks,constDataLayout &DL)
14123 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14124 CSEBlocks(CSEBlocks),DL(DL) {}
14125 ~ShuffleIRBuilder() =default;
14126 /// Creates shufflevector for the 2 operands with the given mask.
14127Value *createShuffleVector(Value *V1,Value *V2,ArrayRef<int> Mask) {
14128if (V1->getType() != V2->getType()) {
14129assert(V1->getType()->isIntOrIntVectorTy() &&
14130 V1->getType()->isIntOrIntVectorTy() &&
14131"Expected integer vector types only.");
14132if (V1->getType() != V2->getType()) {
14133if (cast<VectorType>(V2->getType())
14134 ->getElementType()
14135 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
14136 ->getElementType()
14137 ->getIntegerBitWidth())
14138 V2 = Builder.CreateIntCast(
14139 V2, V1->getType(), !isKnownNonNegative(V2,SimplifyQuery(DL)));
14140else
14141 V1 = Builder.CreateIntCast(
14142 V1, V2->getType(), !isKnownNonNegative(V1,SimplifyQuery(DL)));
14143 }
14144 }
14145Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
14146if (auto *I = dyn_cast<Instruction>(Vec)) {
14147 GatherShuffleExtractSeq.insert(I);
14148 CSEBlocks.insert(I->getParent());
14149 }
14150return Vec;
14151 }
14152 /// Creates permutation of the single vector operand with the given mask, if
14153 /// it is not identity mask.
14154Value *createShuffleVector(Value *V1,ArrayRef<int> Mask) {
14155if (Mask.empty())
14156return V1;
14157unsigned VF = Mask.size();
14158unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
14159if (VF == LocalVF &&ShuffleVectorInst::isIdentityMask(Mask, VF))
14160return V1;
14161Value *Vec = Builder.CreateShuffleVector(V1, Mask);
14162if (auto *I = dyn_cast<Instruction>(Vec)) {
14163 GatherShuffleExtractSeq.insert(I);
14164 CSEBlocks.insert(I->getParent());
14165 }
14166return Vec;
14167 }
14168Value *createIdentity(Value *V) {return V; }
14169Value *createPoison(Type *Ty,unsigned VF) {
14170returnPoisonValue::get(getWidenedType(Ty, VF));
14171 }
14172 /// Resizes the 2 input vectors to match in size, if they are not equal
14173 /// yet. The smaller vector is resized to the size of the larger vector.
14174void resizeToMatch(Value *&V1,Value *&V2) {
14175if (V1->getType() == V2->getType())
14176return;
14177int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14178int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14179int VF = std::max(V1VF, V2VF);
14180int MinVF = std::min(V1VF, V2VF);
14181SmallVector<int> IdentityMask(VF,PoisonMaskElem);
14182 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
14183 0);
14184Value *&Op = MinVF == V1VF ? V1 : V2;
14185Op = Builder.CreateShuffleVector(Op, IdentityMask);
14186if (auto *I = dyn_cast<Instruction>(Op)) {
14187 GatherShuffleExtractSeq.insert(I);
14188 CSEBlocks.insert(I->getParent());
14189 }
14190if (MinVF == V1VF)
14191 V1 =Op;
14192else
14193 V2 =Op;
14194 }
14195 };
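// Illustrative sketch: resizeToMatch above pads the narrower operand with an
// identity mask extended by poison lanes, so that both shuffle operands end
// up with the same width. The mask construction alone, with -1 standing for
// poison (assumes SmallVF <= LargeVF):
// \code
// #include <numeric>
// #include <vector>
//
// std::vector<int> makeResizeMask(unsigned SmallVF, unsigned LargeVF) {
//   std::vector<int> Mask(LargeVF, /*poison*/ -1);
//   std::iota(Mask.begin(), Mask.begin() + SmallVF, 0); // 0 .. SmallVF-1
//   return Mask; // shuffle(SmallVec, poison, Mask) widens to LargeVF lanes
// }
// \endcode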
14196
14197 /// Smart shuffle instruction emission, walks through shuffles trees and
14198 /// tries to find the best matching vector for the actual shuffle
14199 /// instruction.
14200Value *createShuffle(Value *V1,Value *V2,ArrayRef<int> Mask) {
14201assert(V1 &&"Expected at least one vector value.");
14202 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14203 R.CSEBlocks, *R.DL);
14204return BaseShuffleAnalysis::createShuffle<Value *>(
14205 V1, V2, Mask, ShuffleBuilder, ScalarTy);
14206 }
14207
14208 /// Cast value \p V to the vector type with the same number of elements, but
14209 /// the base type \p ScalarTy.
14210Value *castToScalarTyElem(Value *V,
14211 std::optional<bool> IsSigned = std::nullopt) {
14212auto *VecTy = cast<VectorType>(V->getType());
14213assert(getNumElements(VecTy) %getNumElements(ScalarTy) == 0);
14214if (VecTy->getElementType() == ScalarTy->getScalarType())
14215return V;
14216return Builder.CreateIntCast(
14217 V,VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
14218 IsSigned.value_or(!isKnownNonNegative(V,SimplifyQuery(*R.DL))));
14219 }
14220
14221public:
14222ShuffleInstructionBuilder(Type *ScalarTy,IRBuilderBase &Builder,BoUpSLP &R)
14223 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14224
14225 /// Adjusts extractelements after reusing them.
14226Value *adjustExtracts(const TreeEntry *E,MutableArrayRef<int> Mask,
14227ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14228unsigned NumParts,bool &UseVecBaseAsInput) {
14229 UseVecBaseAsInput =false;
14230SmallPtrSet<Value *, 4> UniqueBases;
14231Value *VecBase =nullptr;
14232SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14233if (!E->ReorderIndices.empty()) {
14234SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14235 E->ReorderIndices.end());
14236reorderScalars(VL, ReorderMask);
14237 }
14238for (intI = 0, Sz = Mask.size();I < Sz; ++I) {
14239intIdx = Mask[I];
14240if (Idx ==PoisonMaskElem)
14241continue;
14242auto *EI = cast<ExtractElementInst>(VL[I]);
14243 VecBase = EI->getVectorOperand();
14244if (const TreeEntry *TE = R.getTreeEntry(VecBase))
14245 VecBase = TE->VectorizedValue;
14246assert(VecBase &&"Expected vectorized value.");
14247 UniqueBases.insert(VecBase);
14248// If the only use is vectorized, the extractelement itself can be
14249// deleted.
14250if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14251 (NumParts != 1 &&count(VL, EI) > 1) ||
14252any_of(EI->users(), [&](User *U) {
14253 const TreeEntry *UTE = R.getTreeEntry(U);
14254 return !UTE || R.MultiNodeScalars.contains(U) ||
14255 (isa<GetElementPtrInst>(U) &&
14256 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14257 count_if(R.VectorizableTree,
14258 [&](const std::unique_ptr<TreeEntry> &TE) {
14259 return any_of(TE->UserTreeIndices,
14260 [&](const EdgeInfo &Edge) {
14261 return Edge.UserTE == UTE;
14262 }) &&
14263 is_contained(VL, EI);
14264 }) != 1;
14265 }))
14266continue;
14267 R.eraseInstruction(EI);
14268 }
14269if (NumParts == 1 || UniqueBases.size() == 1) {
14270assert(VecBase &&"Expected vectorized value.");
14271return castToScalarTyElem(VecBase);
14272 }
14273 UseVecBaseAsInput =true;
14274auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
14275for (auto [I,Idx] :enumerate(Mask))
14276if (Idx !=PoisonMaskElem)
14277Idx =I;
14278 };
14279// Perform a multi-register vector shuffle, joining the parts into a single
14280// virtual long vector.
14281// Each part is shuffled independently and then all these parts are inserted
14282// into a long virtual vector register, forming the original vector.
14283Value *Vec =nullptr;
14284SmallVector<int> VecMask(Mask.size(),PoisonMaskElem);
14285unsigned SliceSize =getPartNumElems(VL.size(), NumParts);
14286for (unsigned Part : seq<unsigned>(NumParts)) {
14287unsigned Limit =getNumElems(VL.size(), SliceSize, Part);
14288ArrayRef<Value *> SubVL =ArrayRef(VL).slice(Part * SliceSize, Limit);
14289MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14290constexprint MaxBases = 2;
14291SmallVector<Value *, MaxBases> Bases(MaxBases);
14292auto VLMask =zip(SubVL, SubMask);
14293constunsigned VF = std::accumulate(
14294 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S,constauto &D) {
14295 if (std::get<1>(D) == PoisonMaskElem)
14296 return S;
14297 Value *VecOp =
14298 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14299 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14300 VecOp = TE->VectorizedValue;
14301 assert(VecOp &&"Expected vectorized value.");
14302 const unsigned Size =
14303 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14304 return std::max(S, Size);
14305 });
14306for (constauto [V,I] : VLMask) {
14307if (I ==PoisonMaskElem)
14308continue;
14309Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14310if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14311 VecOp = TE->VectorizedValue;
14312assert(VecOp &&"Expected vectorized value.");
14313 VecOp = castToScalarTyElem(VecOp);
14314 Bases[I / VF] = VecOp;
14315 }
14316if (!Bases.front())
14317continue;
14318Value *SubVec;
14319if (Bases.back()) {
14320 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14321 TransformToIdentity(SubMask);
14322 }else {
14323 SubVec = Bases.front();
14324 }
14325if (!Vec) {
14326 Vec = SubVec;
14327assert((Part == 0 ||all_of(seq<unsigned>(0, Part),
14328 [&](unsignedP) {
14329ArrayRef<int> SubMask =
14330Mask.slice(P * SliceSize,
14331getNumElems(Mask.size(),
14332 SliceSize,P));
14333returnall_of(SubMask, [](intIdx) {
14334returnIdx ==PoisonMaskElem;
14335 });
14336 })) &&
14337"Expected first part or all previous parts masked.");
14338copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14339 }else {
14340unsigned NewVF =
14341 cast<FixedVectorType>(Vec->getType())->getNumElements();
14342if (Vec->getType() != SubVec->getType()) {
14343unsigned SubVecVF =
14344 cast<FixedVectorType>(SubVec->getType())->getNumElements();
14345 NewVF = std::max(NewVF, SubVecVF);
14346 }
14347// Adjust SubMask.
14348for (int &Idx : SubMask)
14349if (Idx !=PoisonMaskElem)
14350Idx += NewVF;
14351copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14352 Vec = createShuffle(Vec, SubVec, VecMask);
14353 TransformToIdentity(VecMask);
14354 }
14355 }
14356copy(VecMask,Mask.begin());
14357return Vec;
14358 }
14359 /// Checks if the specified entry \p E needs to be delayed because of its
14360 /// dependency nodes.
14361 std::optional<Value *>
14362needToDelay(const TreeEntry *E,
14363ArrayRef<SmallVector<const TreeEntry *>> Deps) const{
14364// No need to delay emission if all deps are ready.
14365if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
14366returnall_of(
14367 TEs, [](const TreeEntry *TE) {return TE->VectorizedValue; });
14368 }))
14369return std::nullopt;
14370// Postpone gather emission, will be emitted after the end of the
14371// process to keep correct order.
14372auto *ResVecTy =getWidenedType(ScalarTy, E->getVectorFactor());
14373return Builder.CreateAlignedLoad(
14374 ResVecTy,
14375PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
14376MaybeAlign());
14377 }
14378 /// Adds 2 input vectors (in form of tree entries) and the mask for their
14379 /// shuffling.
14380voidadd(const TreeEntry &E1,const TreeEntry &E2,ArrayRef<int> Mask) {
14381Value *V1 = E1.VectorizedValue;
14382if (V1->getType()->isIntOrIntVectorTy())
14383 V1 = castToScalarTyElem(V1,any_of(E1.Scalars, [&](Value *V) {
14384 if (isa<PoisonValue>(V))
14385 return false;
14386 return !isKnownNonNegative(
14387 V, SimplifyQuery(*R.DL));
14388 }));
14389Value *V2 = E2.VectorizedValue;
14390if (V2->getType()->isIntOrIntVectorTy())
14391 V2 = castToScalarTyElem(V2,any_of(E2.Scalars, [&](Value *V) {
14392 if (isa<PoisonValue>(V))
14393 return false;
14394 return !isKnownNonNegative(
14395 V, SimplifyQuery(*R.DL));
14396 }));
14397 add(V1, V2, Mask);
14398 }
14399 /// Adds single input vector (in form of tree entry) and the mask for its
14400 /// shuffling.
14401voidadd(const TreeEntry &E1,ArrayRef<int> Mask) {
14402Value *V1 = E1.VectorizedValue;
14403if (V1->getType()->isIntOrIntVectorTy())
14404 V1 = castToScalarTyElem(V1,any_of(E1.Scalars, [&](Value *V) {
14405 if (isa<PoisonValue>(V))
14406 return false;
14407 return !isKnownNonNegative(
14408 V, SimplifyQuery(*R.DL));
14409 }));
14410 add(V1, Mask);
14411 }
14412 /// Adds 2 input vectors and the mask for their shuffling.
14413voidadd(Value *V1,Value *V2,ArrayRef<int> Mask) {
14414assert(V1 && V2 && !Mask.empty() &&"Expected non-empty input vectors.");
14415assert(isa<FixedVectorType>(V1->getType()) &&
14416 isa<FixedVectorType>(V2->getType()) &&
14417"castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14418 V1 = castToScalarTyElem(V1);
14419 V2 = castToScalarTyElem(V2);
14420if (InVectors.empty()) {
14421 InVectors.push_back(V1);
14422 InVectors.push_back(V2);
14423 CommonMask.assign(Mask.begin(), Mask.end());
14424return;
14425 }
14426Value *Vec = InVectors.front();
14427if (InVectors.size() == 2) {
14428 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14429 transformMaskAfterShuffle(CommonMask, CommonMask);
14430 }elseif (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
14431 Mask.size()) {
14432 Vec = createShuffle(Vec,nullptr, CommonMask);
14433 transformMaskAfterShuffle(CommonMask, CommonMask);
14434 }
14435 V1 = createShuffle(V1, V2, Mask);
14436unsigned VF = std::max(getVF(V1), getVF(Vec));
14437for (unsignedIdx = 0, Sz = CommonMask.size();Idx < Sz; ++Idx)
14438if (Mask[Idx] !=PoisonMaskElem)
14439 CommonMask[Idx] =Idx + VF;
14440 InVectors.front() = Vec;
14441if (InVectors.size() == 2)
14442 InVectors.back() = V1;
14443else
14444 InVectors.push_back(V1);
14445 }
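// Illustrative sketch: the accumulation above keeps at most two pending
// operands. When a new pair arrives while two are already buffered, the
// buffered pair is folded into one vector via the common mask, and the lanes
// taken from the new operand are re-based past the folded vector's width.
// The re-basing step alone (masks assumed to have equal size):
// \code
// #include <vector>
//
// constexpr int kPoison = -1;
//
// void rebaseNewOperandLanes(std::vector<int> &CommonMask,
//                            const std::vector<int> &NewMask, unsigned VF) {
//   for (unsigned I = 0; I < CommonMask.size(); ++I)
//     if (NewMask[I] != kPoison)
//       CommonMask[I] = I + VF; // lane I now comes from the new operand
// }
// \endcode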
14446  /// Adds another one input vector and the mask for the shuffling.
14447  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
14448    assert(isa<FixedVectorType>(V1->getType()) &&
14449           "castToScalarTyElem expects V1 to be FixedVectorType");
14450    V1 = castToScalarTyElem(V1);
14451    if (InVectors.empty()) {
14452      InVectors.push_back(V1);
14453      CommonMask.assign(Mask.begin(), Mask.end());
14454      return;
14455    }
14456    const auto *It = find(InVectors, V1);
14457    if (It == InVectors.end()) {
14458      if (InVectors.size() == 2 ||
14459          InVectors.front()->getType() != V1->getType()) {
14460        Value *V = InVectors.front();
14461        if (InVectors.size() == 2) {
14462          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14463          transformMaskAfterShuffle(CommonMask, CommonMask);
14464        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14465                   CommonMask.size()) {
14466          V = createShuffle(InVectors.front(), nullptr, CommonMask);
14467          transformMaskAfterShuffle(CommonMask, CommonMask);
14468        }
14469        unsigned VF = std::max(CommonMask.size(), Mask.size());
14470        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14471          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
14472            CommonMask[Idx] =
14473                V->getType() != V1->getType()
14474                    ? Idx + VF
14475                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
14476                                      ->getNumElements();
14477        if (V->getType() != V1->getType())
14478          V1 = createShuffle(V1, nullptr, Mask);
14479        InVectors.front() = V;
14480        if (InVectors.size() == 2)
14481          InVectors.back() = V1;
14482        else
14483          InVectors.push_back(V1);
14484        return;
14485      }
14486      // Check if second vector is required if the used elements are already
14487      // used from the first one.
14488      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14489        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
14490          InVectors.push_back(V1);
14491          break;
14492        }
14493    }
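    // V1 is either already one of the inputs or was appended above; fold the
    // new mask into CommonMask, offsetting by the running vector factor when
    // a lane refers to the second input.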
14494    unsigned VF = 0;
14495    for (Value *V : InVectors)
14496      VF = std::max(VF, getVF(V));
14497    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14498      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14499        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
14500  }
14501  /// Adds another one input vector and the mask for the shuffling.
14502  void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
14503    SmallVector<int> NewMask;
14504    inversePermutation(Order, NewMask);
14505    add(V1, NewMask);
14506  }
14507  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14508                Value *Root = nullptr) {
14509    return R.gather(VL, Root, ScalarTy,
14510                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14511                      return createShuffle(V1, V2, Mask);
14512                    });
14513  }
14514  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
14515  /// Finalize emission of the shuffles.
14516  /// \param Action the action (if any) to be performed before final applying of
14517  /// the \p ExtMask mask.
14518  Value *
14519  finalize(ArrayRef<int> ExtMask,
14520           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14521           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14522           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
14523    IsFinalized = true;
14524    if (Action) {
14525      Value *Vec = InVectors.front();
14526      if (InVectors.size() == 2) {
14527        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14528        InVectors.pop_back();
14529      } else {
14530        Vec = createShuffle(Vec, nullptr, CommonMask);
14531      }
14532      transformMaskAfterShuffle(CommonMask, CommonMask);
14533      assert(VF > 0 &&
14534             "Expected vector length for the final value before action.");
14535      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14536      if (VecVF < VF) {
14537        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14538        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14539        Vec = createShuffle(Vec, nullptr, ResizeMask);
14540      }
14541      Action(Vec, CommonMask);
14542      InVectors.front() = Vec;
14543    }
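    // Combined sub-nodes are inserted at their element offsets into the
    // already shuffled vector; if SubVectorsMask is provided, it blends the
    // inserted lanes with the previously shuffled ones.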
14544if (!SubVectors.empty()) {
14545Value *Vec = InVectors.front();
14546if (InVectors.size() == 2) {
14547 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14548 InVectors.pop_back();
14549 }else {
14550 Vec = createShuffle(Vec,nullptr, CommonMask);
14551 }
14552 transformMaskAfterShuffle(CommonMask, CommonMask);
14553auto CreateSubVectors = [&](Value *Vec,
14554SmallVectorImpl<int> &CommonMask) {
14555for (auto [E,Idx] : SubVectors) {
14556Value *V = E->VectorizedValue;
14557if (V->getType()->isIntOrIntVectorTy())
14558 V = castToScalarTyElem(V,any_of(E->Scalars, [&](Value *V) {
14559 if (isa<PoisonValue>(V))
14560 return false;
14561 return !isKnownNonNegative(
14562 V, SimplifyQuery(*R.DL));
14563 }));
14564unsigned InsertionIndex =Idx *getNumElements(ScalarTy);
14565 Vec =createInsertVector(
14566 Builder, Vec, V, InsertionIndex,
14567 std::bind(&ShuffleInstructionBuilder::createShuffle,this, _1, _2,
14568 _3));
14569if (!CommonMask.empty()) {
14570 std::iota(std::next(CommonMask.begin(),Idx),
14571 std::next(CommonMask.begin(),Idx + E->getVectorFactor()),
14572Idx);
14573 }
14574 }
14575return Vec;
14576 };
14577if (SubVectorsMask.empty()) {
14578 Vec = CreateSubVectors(Vec, CommonMask);
14579 }else {
14580SmallVector<int> SVMask(CommonMask.size(),PoisonMaskElem);
14581copy(SubVectorsMask, SVMask.begin());
14582for (auto [I1, I2] :zip(SVMask, CommonMask)) {
14583if (I2 !=PoisonMaskElem) {
14584assert(I1 ==PoisonMaskElem &&"Expected unused subvectors mask");
14585I1 = I2 + CommonMask.size();
14586 }
14587 }
14588Value *InsertVec =
14589 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
14590 Vec = createShuffle(InsertVec, Vec, SVMask);
14591 transformMaskAfterShuffle(CommonMask, SVMask);
14592 }
14593 InVectors.front() = Vec;
14594 }
14595
14596if (!ExtMask.empty()) {
14597if (CommonMask.empty()) {
14598 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14599 }else {
14600SmallVector<int> NewMask(ExtMask.size(),PoisonMaskElem);
14601for (intI = 0, Sz = ExtMask.size();I < Sz; ++I) {
14602if (ExtMask[I] ==PoisonMaskElem)
14603continue;
14604 NewMask[I] = CommonMask[ExtMask[I]];
14605 }
14606 CommonMask.swap(NewMask);
14607 }
14608 }
14609if (CommonMask.empty()) {
14610assert(InVectors.size() == 1 &&"Expected only one vector with no mask");
14611return InVectors.front();
14612 }
14613if (InVectors.size() == 2)
14614return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14615return createShuffle(InVectors.front(),nullptr, CommonMask);
14616 }
14617
14618  ~ShuffleInstructionBuilder() {
14619    assert((IsFinalized || CommonMask.empty()) &&
14620           "Shuffle construction must be finalized.");
14621  }
14622 };
14623 
14624 BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
14625                                                          unsigned NodeIdx) {
14626   ArrayRef<Value *> VL = E->getOperand(NodeIdx);
14627   InstructionsState S = getSameOpcode(VL, *TLI);
14628   // Special processing for GEPs bundle, which may include non-gep values.
14629   if (!S && VL.front()->getType()->isPointerTy()) {
14630     const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
14631     if (It != VL.end())
14632       S = getSameOpcode(*It, *TLI);
14633   }
14634   if (!S)
14635     return nullptr;
14636   auto CheckSameVE = [&](const TreeEntry *VE) {
14637     return VE->isSame(VL) &&
14638            (any_of(VE->UserTreeIndices,
14639                    [E, NodeIdx](const EdgeInfo &EI) {
14640                      return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14641                    }) ||
14642             any_of(VectorizableTree,
14643                    [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
14644                      return TE->isOperandGatherNode(
14645                                 {const_cast<TreeEntry *>(E), NodeIdx}) &&
14646                             VE->isSame(TE->Scalars);
14647                    }));
14648   };
14649   TreeEntry *VE = getTreeEntry(S.getMainOp());
14650   if (VE && CheckSameVE(VE))
14651     return VE;
14652   auto It = MultiNodeScalars.find(S.getMainOp());
14653   if (It != MultiNodeScalars.end()) {
14654     auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
14655       return TE != VE && CheckSameVE(TE);
14656     });
14657     if (I != It->getSecond().end())
14658       return *I;
14659   }
14660   return nullptr;
14661 }
14662
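// vectorizeOperand: if the requested operand already matches a vectorized
// tree entry, reuse that value (reshuffling it to the vector factor this user
// expects); otherwise vectorize the dedicated operand gather node.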
14663Value *BoUpSLP::vectorizeOperand(TreeEntry *E,unsigned NodeIdx,
14664bool PostponedPHIs) {
14665ValueList &VL = E->getOperand(NodeIdx);
14666constunsigned VF = VL.size();
14667if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14668auto FinalShuffle = [&](Value *V,ArrayRef<int>Mask) {
14669// V may be affected by MinBWs.
14670// We want ShuffleInstructionBuilder to correctly support REVEC. The key
14671// factor is the number of elements, not their type.
14672Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
14673unsigned NumElements =getNumElements(VL.front()->getType());
14674 ShuffleInstructionBuilder ShuffleBuilder(
14675 NumElements != 1 ?FixedVectorType::get(ScalarTy, NumElements)
14676 : ScalarTy,
14677 Builder, *this);
14678 ShuffleBuilder.add(V, Mask);
14679SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14680 E->CombinedEntriesWithIndices.size());
14681transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14682 [&](constauto &P) {
14683 return std::make_pair(VectorizableTree[P.first].get(),
14684 P.second);
14685 });
14686assert((E->CombinedEntriesWithIndices.empty() ||
14687 E->ReorderIndices.empty()) &&
14688"Expected either combined subnodes or reordering");
14689return ShuffleBuilder.finalize({}, SubVectors, {});
14690 };
14691Value *V =vectorizeTree(VE, PostponedPHIs);
14692if (VF *getNumElements(VL[0]->getType()) !=
14693 cast<FixedVectorType>(V->getType())->getNumElements()) {
14694if (!VE->ReuseShuffleIndices.empty()) {
14695// Reshuffle to get only unique values.
14696// If some of the scalars are duplicated in the vectorization
14697// tree entry, we do not vectorize them but instead generate a
14698// mask for the reuses. But if there are several users of the
14699// same entry, they may have different vectorization factors.
14700// This is especially important for PHI nodes. In this case, we
14701// need to adapt the resulting instruction for the user
14702// vectorization factor and have to reshuffle it again to take
14703// only unique elements of the vector. Without this code the
14704// function incorrectly returns reduced vector instruction with
14705// the same elements, not with the unique ones.
14706
14707// block:
14708// %phi = phi <2 x > { .., %entry} {%shuffle, %block}
14709// %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
14710// ... (use %2)
14711// %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
14712// br %block
14713SmallVector<int>Mask(VF,PoisonMaskElem);
14714for (auto [I, V] :enumerate(VL)) {
14715if (isa<PoisonValue>(V))
14716continue;
14717Mask[I] = VE->findLaneForValue(V);
14718 }
14719V = FinalShuffle(V, Mask);
14720 }else {
14721assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
14722"Expected vectorization factor less "
14723"than original vector size.");
14724SmallVector<int> UniformMask(VF, 0);
14725 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14726V = FinalShuffle(V, UniformMask);
14727 }
14728 }
14729// Need to update the operand gather node, if actually the operand is not a
14730// vectorized node, but the buildvector/gather node, which matches one of
14731// the vectorized nodes.
14732if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
14733 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14734 }) == VE->UserTreeIndices.end()) {
14735auto *It =
14736find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14737returnTE->isGather() &&TE->UserTreeIndices.front().UserTE == E &&
14738TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14739 });
14740assert(It != VectorizableTree.end() &&"Expected gather node operand.");
14741 (*It)->VectorizedValue =V;
14742 }
14743returnV;
14744 }
14745
14746// Find the corresponding gather entry and vectorize it.
14747// Allows to be more accurate with tree/graph transformations, checks for the
14748// correctness of the transformations in many cases.
14749auto *I =find_if(VectorizableTree,
14750 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
14751returnTE->isOperandGatherNode({E, NodeIdx});
14752 });
14753assert(I != VectorizableTree.end() &&"Gather node is not in the graph.");
14754assert(I->get()->UserTreeIndices.size() == 1 &&
14755"Expected only single user for the gather node.");
14756assert(I->get()->isSame(VL) &&"Expected same list of scalars.");
14757returnvectorizeTree(I->get(), PostponedPHIs);
14758}
14759
14760template <typename BVTy,typename ResTy,typename...Args>
14761ResTy BoUpSLP::processBuildVector(const TreeEntry *E,Type *ScalarTy,
14762 Args &...Params) {
14763assert(E->isGather() &&"Expected gather node.");
14764unsigned VF = E->getVectorFactor();
14765
14766bool NeedFreeze =false;
14767SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
14768 E->ReuseShuffleIndices.end());
14769SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
14770// Clear values, to be replaced by insertvector instructions.
14771for (auto [EIdx,Idx] : E->CombinedEntriesWithIndices)
14772for_each(MutableArrayRef(GatheredScalars)
14773 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
14774 [&](Value *&V) {V =PoisonValue::get(V->getType()); });
14775SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14776 E->CombinedEntriesWithIndices.size());
14777transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14778 [&](constauto &P) {
14779 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14780 });
14781// Build a mask out of the reorder indices and reorder scalars per this
14782// mask.
14783SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14784 E->ReorderIndices.end());
14785if (!ReorderMask.empty())
14786reorderScalars(GatheredScalars, ReorderMask);
14787SmallVector<int> SubVectorsMask;
14788inversePermutation(E->ReorderIndices, SubVectorsMask);
14789// Transform non-clustered elements in the mask to poison (-1).
14790// "Clustered" operations will be reordered using this mask later.
14791if (!SubVectors.empty() && !SubVectorsMask.empty()) {
14792for (unsignedI : seq<unsigned>(GatheredScalars.size()))
14793if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
14794 SubVectorsMask[ReorderMask[I]] =PoisonMaskElem;
14795 }else {
14796 SubVectorsMask.clear();
14797 }
14798SmallVector<Value *> StoredGS(GatheredScalars);
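  // FindReusedSplat: for splat-like gathers, try to rewrite the given mask
  // slice so it broadcasts a lane that is already available in an input
  // vector of width InputVF instead of emitting a separate gather.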
14799auto FindReusedSplat = [&](MutableArrayRef<int>Mask,unsigned InputVF,
14800unsignedI,unsigned SliceSize,
14801bool IsNotPoisonous) {
14802if (!isSplat(E->Scalars) ||none_of(E->Scalars, [](Value *V) {
14803 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14804 }))
14805returnfalse;
14806 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14807unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14808if (UserTE->getNumOperands() != 2)
14809returnfalse;
14810if (!IsNotPoisonous) {
14811auto *It =
14812find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
14813returnfind_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
14814 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14815 }) !=TE->UserTreeIndices.end();
14816 });
14817if (It == VectorizableTree.end())
14818returnfalse;
14819SmallVector<Value *>GS((*It)->Scalars.begin(), (*It)->Scalars.end());
14820if (!(*It)->ReorderIndices.empty()) {
14821inversePermutation((*It)->ReorderIndices, ReorderMask);
14822reorderScalars(GS, ReorderMask);
14823 }
14824if (!all_of(zip(GatheredScalars, GS), [&](constauto &P) {
14825Value *V0 = std::get<0>(P);
14826Value *V1 = std::get<1>(P);
14827return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14828 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14829is_contained(E->Scalars, V1));
14830 }))
14831returnfalse;
14832 }
14833intIdx;
14834if ((Mask.size() < InputVF &&
14835ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF,Idx) &&
14836Idx == 0) ||
14837 (Mask.size() == InputVF &&
14838ShuffleVectorInst::isIdentityMask(Mask,Mask.size()))) {
14839 std::iota(
14840 std::next(Mask.begin(),I * SliceSize),
14841 std::next(Mask.begin(),
14842I * SliceSize +getNumElems(Mask.size(), SliceSize,I)),
14843 0);
14844 }else {
14845unsigned IVal =
14846 *find_if_not(Mask, [](intIdx) {returnIdx ==PoisonMaskElem; });
14847 std::fill(
14848 std::next(Mask.begin(),I * SliceSize),
14849 std::next(Mask.begin(),
14850I * SliceSize +getNumElems(Mask.size(), SliceSize,I)),
14851 IVal);
14852 }
14853returntrue;
14854 };
14855 BVTy ShuffleBuilder(ScalarTy, Params...);
14856 ResTy Res = ResTy();
14857SmallVector<int>Mask;
14858SmallVector<int> ExtractMask(GatheredScalars.size(),PoisonMaskElem);
14859SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
14860Value *ExtractVecBase =nullptr;
14861bool UseVecBaseAsInput =false;
14862SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
14863SmallVector<SmallVector<const TreeEntry *>> Entries;
14864Type *OrigScalarTy = GatheredScalars.front()->getType();
14865auto *VecTy =getWidenedType(ScalarTy, GatheredScalars.size());
14866unsigned NumParts =::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
14867if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
14868// Check for gathered extracts.
14869bool Resized =false;
14870 ExtractShuffles =
14871 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14872if (!ExtractShuffles.empty()) {
14873SmallVector<const TreeEntry *> ExtractEntries;
14874for (auto [Idx,I] :enumerate(ExtractMask)) {
14875if (I ==PoisonMaskElem)
14876continue;
14877if (constauto *TE = getTreeEntry(
14878 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
14879 ExtractEntries.push_back(TE);
14880 }
14881if (std::optional<ResTy> Delayed =
14882 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14883// Delay emission of gathers which are not ready yet.
14884 PostponedGathers.insert(E);
14885// Postpone gather emission, will be emitted after the end of the
14886// process to keep correct order.
14887return *Delayed;
14888 }
14889if (Value *VecBase = ShuffleBuilder.adjustExtracts(
14890 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14891 ExtractVecBase = VecBase;
14892if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14893if (VF == VecBaseTy->getNumElements() &&
14894 GatheredScalars.size() != VF) {
14895 Resized =true;
14896 GatheredScalars.append(VF - GatheredScalars.size(),
14897PoisonValue::get(OrigScalarTy));
14898 NumParts =
14899::getNumberOfParts(*TTI,getWidenedType(OrigScalarTy, VF), VF);
14900 }
14901 }
14902 }
14903// Gather extracts after we check for full matched gathers only.
14904if (!ExtractShuffles.empty() || !E->hasState() ||
14905 E->getOpcode() != Instruction::Load ||
14906 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
14907any_of(E->Scalars, IsaPred<LoadInst>)) &&
14908any_of(E->Scalars,
14909 [this](Value *V) {
14910 return isa<LoadInst>(V) && getTreeEntry(V);
14911 })) ||
14912 (E->hasState() && E->isAltShuffle()) ||
14913all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
14914isSplat(E->Scalars) ||
14915 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14916 GatherShuffles =
14917 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14918 }
14919if (!GatherShuffles.empty()) {
14920if (std::optional<ResTy> Delayed =
14921 ShuffleBuilder.needToDelay(E, Entries)) {
14922// Delay emission of gathers which are not ready yet.
14923 PostponedGathers.insert(E);
14924// Postpone gather emission, will be emitted after the end of the
14925// process to keep correct order.
14926return *Delayed;
14927 }
14928if (GatherShuffles.size() == 1 &&
14929 *GatherShuffles.front() ==TTI::SK_PermuteSingleSrc &&
14930 Entries.front().front()->isSame(E->Scalars)) {
14931// Perfect match in the graph, will reuse the previously vectorized
14932// node. Cost is 0.
14933LLVM_DEBUG(dbgs() <<"SLP: perfect diamond match for gather bundle "
14934 <<shortBundleName(E->Scalars, E->Idx) <<".\n");
14935// Restore the mask for previous partially matched values.
14936Mask.resize(E->Scalars.size());
14937const TreeEntry *FrontTE = Entries.front().front();
14938if (FrontTE->ReorderIndices.empty() &&
14939 ((FrontTE->ReuseShuffleIndices.empty() &&
14940 E->Scalars.size() == FrontTE->Scalars.size()) ||
14941 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14942 std::iota(Mask.begin(),Mask.end(), 0);
14943 }else {
14944for (auto [I, V] :enumerate(E->Scalars)) {
14945if (isa<PoisonValue>(V)) {
14946Mask[I] =PoisonMaskElem;
14947continue;
14948 }
14949Mask[I] = FrontTE->findLaneForValue(V);
14950 }
14951 }
14952 ShuffleBuilder.add(*FrontTE, Mask);
14953// Full matched entry found, no need to insert subvectors.
14954 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
14955return Res;
14956 }
14957if (!Resized) {
14958if (GatheredScalars.size() != VF &&
14959any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
14960returnany_of(TEs, [&](const TreeEntry *TE) {
14961returnTE->getVectorFactor() == VF;
14962 });
14963 }))
14964 GatheredScalars.append(VF - GatheredScalars.size(),
14965PoisonValue::get(OrigScalarTy));
14966 }
14967// Remove shuffled elements from list of gathers.
14968for (intI = 0, Sz =Mask.size();I < Sz; ++I) {
14969if (Mask[I] !=PoisonMaskElem)
14970 GatheredScalars[I] =PoisonValue::get(OrigScalarTy);
14971 }
14972 }
14973 }
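  // TryPackScalars: compact the remaining scalars for a build-vector and fill
  // ReuseMask accordingly: duplicates are shuffled from a single slot, splats
  // become broadcasts, and undef lanes are either redirected to a
  // non-poisonous scalar or left to be frozen afterwards.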
14974auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
14975SmallVectorImpl<int> &ReuseMask,
14976bool IsRootPoison) {
14977     // For splats we can emit broadcasts instead of gathers, so try to find
14978     // such sequences.
14979bool IsSplat = IsRootPoison &&isSplat(Scalars) &&
14980 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
14981 Scalars.append(VF - Scalars.size(),PoisonValue::get(OrigScalarTy));
14982SmallVector<int> UndefPos;
14983DenseMap<Value *, unsigned> UniquePositions;
14984// Gather unique non-const values and all constant values.
14985// For repeated values, just shuffle them.
14986int NumNonConsts = 0;
14987int SinglePos = 0;
14988for (auto [I, V] :enumerate(Scalars)) {
14989if (isa<UndefValue>(V)) {
14990if (!isa<PoisonValue>(V)) {
14991 ReuseMask[I] =I;
14992 UndefPos.push_back(I);
14993 }
14994continue;
14995 }
14996if (isConstant(V)) {
14997 ReuseMask[I] =I;
14998continue;
14999 }
15000 ++NumNonConsts;
15001 SinglePos =I;
15002Value *OrigV =V;
15003 Scalars[I] =PoisonValue::get(OrigScalarTy);
15004if (IsSplat) {
15005 Scalars.front() = OrigV;
15006 ReuseMask[I] = 0;
15007 }else {
15008constauto Res = UniquePositions.try_emplace(OrigV,I);
15009 Scalars[Res.first->second] = OrigV;
15010 ReuseMask[I] = Res.first->second;
15011 }
15012 }
15013if (NumNonConsts == 1) {
15014// Restore single insert element.
15015if (IsSplat) {
15016 ReuseMask.assign(VF,PoisonMaskElem);
15017std::swap(Scalars.front(), Scalars[SinglePos]);
15018if (!UndefPos.empty() && UndefPos.front() == 0)
15019 Scalars.front() =UndefValue::get(OrigScalarTy);
15020 }
15021 ReuseMask[SinglePos] = SinglePos;
15022 }elseif (!UndefPos.empty() && IsSplat) {
15023// For undef values, try to replace them with the simple broadcast.
15024// We can do it if the broadcasted value is guaranteed to be
15025// non-poisonous, or by freezing the incoming scalar value first.
15026auto *It =find_if(Scalars, [this, E](Value *V) {
15027return !isa<UndefValue>(V) &&
15028 (getTreeEntry(V) ||isGuaranteedNotToBePoison(V, AC) ||
15029 (E->UserTreeIndices.size() == 1 &&
15030any_of(V->uses(), [E](constUse &U) {
15031// Check if the value already used in the same operation in
15032// one of the nodes already.
15033 return E->UserTreeIndices.front().EdgeIdx !=
15034 U.getOperandNo() &&
15035 is_contained(
15036 E->UserTreeIndices.front().UserTE->Scalars,
15037 U.getUser());
15038 })));
15039 });
15040if (It != Scalars.end()) {
15041// Replace undefs by the non-poisoned scalars and emit broadcast.
15042int Pos = std::distance(Scalars.begin(), It);
15043for (intI : UndefPos) {
15044// Set the undef position to the non-poisoned scalar.
15045 ReuseMask[I] = Pos;
15046// Replace the undef by the poison, in the mask it is replaced by
15047// non-poisoned scalar already.
15048if (I != Pos)
15049 Scalars[I] =PoisonValue::get(OrigScalarTy);
15050 }
15051 }else {
15052// Replace undefs by the poisons, emit broadcast and then emit
15053// freeze.
15054for (intI : UndefPos) {
15055 ReuseMask[I] =PoisonMaskElem;
15056if (isa<UndefValue>(Scalars[I]))
15057 Scalars[I] =PoisonValue::get(OrigScalarTy);
15058 }
15059 NeedFreeze =true;
15060 }
15061 }
15062 };
15063if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
15064bool IsNonPoisoned =true;
15065bool IsUsedInExpr =true;
15066Value *Vec1 =nullptr;
15067if (!ExtractShuffles.empty()) {
15068// Gather of extractelements can be represented as just a shuffle of
15069// a single/two vectors the scalars are extracted from.
15070// Find input vectors.
15071Value *Vec2 =nullptr;
15072for (unsignedI = 0, Sz = ExtractMask.size();I < Sz; ++I) {
15073if (!Mask.empty() && Mask[I] !=PoisonMaskElem)
15074 ExtractMask[I] =PoisonMaskElem;
15075 }
15076if (UseVecBaseAsInput) {
15077 Vec1 = ExtractVecBase;
15078 }else {
15079for (unsignedI = 0, Sz = ExtractMask.size();I < Sz; ++I) {
15080if (ExtractMask[I] ==PoisonMaskElem)
15081continue;
15082if (isa<UndefValue>(E->Scalars[I]))
15083continue;
15084auto *EI = cast<ExtractElementInst>(StoredGS[I]);
15085Value *VecOp = EI->getVectorOperand();
15086if (constauto *TE = getTreeEntry(VecOp))
15087if (TE->VectorizedValue)
15088 VecOp =TE->VectorizedValue;
15089if (!Vec1) {
15090 Vec1 = VecOp;
15091 }elseif (Vec1 != VecOp) {
15092assert((!Vec2 || Vec2 == VecOp) &&
15093"Expected only 1 or 2 vectors shuffle.");
15094 Vec2 = VecOp;
15095 }
15096 }
15097 }
15098if (Vec2) {
15099 IsUsedInExpr =false;
15100 IsNonPoisoned &=isGuaranteedNotToBePoison(Vec1, AC) &&
15101isGuaranteedNotToBePoison(Vec2, AC);
15102 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15103 }elseif (Vec1) {
15104bool IsNotPoisonedVec =isGuaranteedNotToBePoison(Vec1, AC);
15105 IsUsedInExpr &= FindReusedSplat(
15106 ExtractMask,
15107 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
15108 ExtractMask.size(), IsNotPoisonedVec);
15109 ShuffleBuilder.add(Vec1, ExtractMask,/*ForExtracts=*/true);
15110 IsNonPoisoned &= IsNotPoisonedVec;
15111 }else {
15112 IsUsedInExpr =false;
15113 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
15114/*ForExtracts=*/true);
15115 }
15116 }
15117if (!GatherShuffles.empty()) {
15118unsigned SliceSize =getPartNumElems(E->Scalars.size(), NumParts);
15119SmallVector<int> VecMask(Mask.size(),PoisonMaskElem);
15120for (constauto [I, TEs] :enumerate(Entries)) {
15121if (TEs.empty()) {
15122assert(!GatherShuffles[I] &&
15123"No shuffles with empty entries list expected.");
15124continue;
15125 }
15126assert((TEs.size() == 1 || TEs.size() == 2) &&
15127"Expected shuffle of 1 or 2 entries.");
15128unsigned Limit =getNumElems(Mask.size(), SliceSize,I);
15129auto SubMask =ArrayRef(Mask).slice(I * SliceSize, Limit);
15130 VecMask.assign(VecMask.size(),PoisonMaskElem);
15131copy(SubMask, std::next(VecMask.begin(),I * SliceSize));
15132if (TEs.size() == 1) {
15133bool IsNotPoisonedVec =
15134 TEs.front()->VectorizedValue
15135 ?isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
15136 :true;
15137 IsUsedInExpr &=
15138 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(),I,
15139 SliceSize, IsNotPoisonedVec);
15140 ShuffleBuilder.add(*TEs.front(), VecMask);
15141 IsNonPoisoned &= IsNotPoisonedVec;
15142 }else {
15143 IsUsedInExpr =false;
15144 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
15145if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
15146 IsNonPoisoned &=
15147isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
15148isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
15149 }
15150 }
15151 }
15152// Try to figure out best way to combine values: build a shuffle and insert
15153// elements or just build several shuffles.
15154// Insert non-constant scalars.
15155SmallVector<Value *> NonConstants(GatheredScalars);
15156int EMSz = ExtractMask.size();
15157int MSz =Mask.size();
15158// Try to build constant vector and shuffle with it only if currently we
15159// have a single permutation and more than 1 scalar constants.
15160bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
15161bool IsIdentityShuffle =
15162 ((UseVecBaseAsInput ||
15163all_of(ExtractShuffles,
15164 [](const std::optional<TTI::ShuffleKind> &SK) {
15165return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15166TTI::SK_PermuteSingleSrc;
15167 })) &&
15168none_of(ExtractMask, [&](intI) {returnI >= EMSz; }) &&
15169ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
15170 (!GatherShuffles.empty() &&
15171all_of(GatherShuffles,
15172 [](const std::optional<TTI::ShuffleKind> &SK) {
15173return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15174TTI::SK_PermuteSingleSrc;
15175 }) &&
15176none_of(Mask, [&](intI) {returnI >= MSz; }) &&
15177ShuffleVectorInst::isIdentityMask(Mask, MSz));
15178bool EnoughConstsForShuffle =
15179 IsSingleShuffle &&
15180 (none_of(GatheredScalars,
15181 [](Value *V) {
15182return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15183 }) ||
15184any_of(GatheredScalars,
15185 [](Value *V) {
15186return isa<Constant>(V) && !isa<UndefValue>(V);
15187 })) &&
15188 (!IsIdentityShuffle ||
15189 (GatheredScalars.size() == 2 &&
15190any_of(GatheredScalars,
15191 [](Value *V) {return !isa<UndefValue>(V); })) ||
15192count_if(GatheredScalars, [](Value *V) {
15193return isa<Constant>(V) && !isa<PoisonValue>(V);
15194 }) > 1);
15195// NonConstants array contains just non-constant values, GatheredScalars
15196// contains only constant to build final vector and then shuffle.
15197for (intI = 0, Sz = GatheredScalars.size();I < Sz; ++I) {
15198if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
15199 NonConstants[I] =PoisonValue::get(OrigScalarTy);
15200else
15201 GatheredScalars[I] =PoisonValue::get(OrigScalarTy);
15202 }
15203// Generate constants for final shuffle and build a mask for them.
15204if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15205SmallVector<int> BVMask(GatheredScalars.size(),PoisonMaskElem);
15206 TryPackScalars(GatheredScalars, BVMask,/*IsRootPoison=*/true);
15207Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15208 ShuffleBuilder.add(BV, BVMask);
15209 }
15210if (all_of(NonConstants, [=](Value *V) {
15211return isa<PoisonValue>(V) ||
15212 (IsSingleShuffle && ((IsIdentityShuffle &&
15213 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15214 }))
15215 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15216 SubVectorsMask);
15217else
15218 Res = ShuffleBuilder.finalize(
15219 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15220 [&](Value *&Vec,SmallVectorImpl<int> &Mask) {
15221 TryPackScalars(NonConstants, Mask,/*IsRootPoison=*/false);
15222 Vec = ShuffleBuilder.gather(NonConstants,Mask.size(), Vec);
15223 });
15224 }elseif (!allConstant(GatheredScalars)) {
15225// Gather unique scalars and all constants.
15226SmallVector<int> ReuseMask(GatheredScalars.size(),PoisonMaskElem);
15227 TryPackScalars(GatheredScalars, ReuseMask,/*IsRootPoison=*/true);
15228Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
15229 ShuffleBuilder.add(BV, ReuseMask);
15230 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15231 SubVectorsMask);
15232 }else {
15233// Gather all constants.
15234SmallVector<int>Mask(GatheredScalars.size(),PoisonMaskElem);
15235for (auto [I, V] :enumerate(GatheredScalars)) {
15236if (!isa<PoisonValue>(V))
15237Mask[I] =I;
15238 }
15239Value *BV = ShuffleBuilder.gather(GatheredScalars);
15240 ShuffleBuilder.add(BV, Mask);
15241 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15242 SubVectorsMask);
15243 }
15244
15245if (NeedFreeze)
15246 Res = ShuffleBuilder.createFreeze(Res);
15247return Res;
15248}
15249
15250 Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
15251                                   bool PostponedPHIs) {
15252   for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
15253     (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
15254   return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15255                                                                 Builder, *this);
15256 }
15257 
15258 /// \returns \p I after propagating metadata from \p VL only for instructions in
15259 /// \p VL.
15260 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
15261   SmallVector<Value *> Insts;
15262   for (Value *V : VL)
15263     if (isa<Instruction>(V))
15264       Insts.push_back(V);
15265   return llvm::propagateMetadata(Inst, Insts);
15266 }
15267
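// vectorizeTree(TreeEntry *): emit IR for a single tree entry. Gather nodes
// are handled via createBuildVector; vectorized nodes are dispatched on their
// (alternate) opcode in the switch below.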
15268 Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
15269   IRBuilderBase::InsertPointGuard Guard(Builder);
15270 
15271   if (E->VectorizedValue &&
15272       (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15273        E->isAltShuffle())) {
15274     LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
15275     return E->VectorizedValue;
15276   }
15277 
15278   Value *V = E->Scalars.front();
15279   Type *ScalarTy = V->getType();
15280   if (!isa<CmpInst>(V))
15281     ScalarTy = getValueType(V);
15282   auto It = MinBWs.find(E);
15283   if (It != MinBWs.end()) {
15284     auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15285     ScalarTy = IntegerType::get(F->getContext(), It->second.first);
15286     if (VecTy)
15287       ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
15288   }
15289   auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
15290   if (E->isGather()) {
15291     // Set insert point for non-reduction initial nodes.
15292     if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
15293       setInsertPointAfterBundle(E);
15294     Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15295     E->VectorizedValue = Vec;
15296     return Vec;
15297   }
15298
15299bool IsReverseOrder =
15300 !E->ReorderIndices.empty() &&isReverseOrder(E->ReorderIndices);
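  // FinalShuffle applies the entry's reorder/reuse shuffle indices (and any
  // combined sub-nodes) to the freshly created vector value.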
15301auto FinalShuffle = [&](Value *V,const TreeEntry *E) {
15302 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
15303if (E->getOpcode() == Instruction::Store &&
15304 E->State == TreeEntry::Vectorize) {
15305ArrayRef<int>Mask =
15306ArrayRef(reinterpret_cast<constint *>(E->ReorderIndices.begin()),
15307 E->ReorderIndices.size());
15308 ShuffleBuilder.add(V, Mask);
15309 }elseif (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15310 ShuffleBuilder.addOrdered(V, {});
15311 }else {
15312 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15313 }
15314SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
15315 E->CombinedEntriesWithIndices.size());
15316transform(
15317 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](constauto &P) {
15318 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15319 });
15320assert(
15321 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15322"Expected either combined subnodes or reordering");
15323return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15324 };
15325
15326assert(!E->isGather() &&"Unhandled state");
15327unsigned ShuffleOrOp =
15328 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15329Instruction *VL0 = E->getMainOp();
15330auto GetOperandSignedness = [&](unsignedIdx) {
15331const TreeEntry *OpE = getOperandEntry(E,Idx);
15332bool IsSigned =false;
15333auto It = MinBWs.find(OpE);
15334if (It != MinBWs.end())
15335 IsSigned = It->second.second;
15336else
15337 IsSigned =any_of(OpE->Scalars, [&](Value *R) {
15338 if (isa<PoisonValue>(V))
15339 return false;
15340 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15341 });
15342return IsSigned;
15343 };
15344switch (ShuffleOrOp) {
15345case Instruction::PHI: {
15346assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15347 E != VectorizableTree.front().get() ||
15348 !E->UserTreeIndices.empty()) &&
15349"PHI reordering is free.");
15350if (PostponedPHIs && E->VectorizedValue)
15351return E->VectorizedValue;
15352auto *PH = cast<PHINode>(VL0);
15353 Builder.SetInsertPoint(PH->getParent(),
15354 PH->getParent()->getFirstNonPHIIt());
15355 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15356if (PostponedPHIs || !E->VectorizedValue) {
15357PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
15358 E->PHI = NewPhi;
15359Value *V = NewPhi;
15360
15361// Adjust insertion point once all PHI's have been generated.
15362 Builder.SetInsertPoint(PH->getParent(),
15363 PH->getParent()->getFirstInsertionPt());
15364 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15365
15366V = FinalShuffle(V, E);
15367
15368 E->VectorizedValue =V;
15369if (PostponedPHIs)
15370returnV;
15371 }
15372PHINode *NewPhi = cast<PHINode>(E->PHI);
15373// If phi node is fully emitted - exit.
15374if (NewPhi->getNumIncomingValues() != 0)
15375return NewPhi;
15376
15377// PHINodes may have multiple entries from the same block. We want to
15378// visit every block once.
15379SmallPtrSet<BasicBlock *, 4> VisitedBBs;
15380
15381for (unsignedI : seq<unsigned>(0, PH->getNumIncomingValues())) {
15382ValueListOperands;
15383BasicBlock *IBB = PH->getIncomingBlock(I);
15384
15385// Stop emission if all incoming values are generated.
15386if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
15387LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15388return NewPhi;
15389 }
15390
15391if (!VisitedBBs.insert(IBB).second) {
15392 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
15393continue;
15394 }
15395
15396 Builder.SetInsertPoint(IBB->getTerminator());
15397 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15398Value *Vec = vectorizeOperand(E,I,/*PostponedPHIs=*/true);
15399if (VecTy != Vec->getType()) {
15400assert((It != MinBWs.end() || getOperandEntry(E,I)->isGather() ||
15401 MinBWs.contains(getOperandEntry(E,I))) &&
15402"Expected item in MinBWs.");
15403 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
15404 }
15405 NewPhi->addIncoming(Vec, IBB);
15406 }
15407
15408assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
15409"Invalid number of incoming values");
15410assert(E->VectorizedValue &&"Expected vectorized value.");
15411return E->VectorizedValue;
15412 }
15413
15414case Instruction::ExtractElement: {
15415Value *V = E->getSingleOperand(0);
15416if (const TreeEntry *TE = getTreeEntry(V))
15417V =TE->VectorizedValue;
15418 setInsertPointAfterBundle(E);
15419V = FinalShuffle(V, E);
15420 E->VectorizedValue =V;
15421returnV;
15422 }
15423case Instruction::ExtractValue: {
15424auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15425 Builder.SetInsertPoint(LI);
15426Value *Ptr = LI->getPointerOperand();
15427LoadInst *V = Builder.CreateAlignedLoad(VecTy,Ptr, LI->getAlign());
15428Value *NewV =::propagateMetadata(V, E->Scalars);
15429 NewV = FinalShuffle(NewV, E);
15430 E->VectorizedValue = NewV;
15431return NewV;
15432 }
15433case Instruction::InsertElement: {
15434assert(E->ReuseShuffleIndices.empty() &&"All inserts should be unique");
15435 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
15436Value *V = vectorizeOperand(E, 1, PostponedPHIs);
15437ArrayRef<Value *>Op = E->getOperand(1);
15438Type *ScalarTy =Op.front()->getType();
15439if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
15440assert(ScalarTy->isIntegerTy() &&"Expected item in MinBWs.");
15441 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
15442assert(Res.first > 0 &&"Expected item in MinBWs.");
15443V = Builder.CreateIntCast(
15444 V,
15445getWidenedType(
15446 ScalarTy,
15447 cast<FixedVectorType>(V->getType())->getNumElements()),
15448 Res.second);
15449 }
15450
15451// Create InsertVector shuffle if necessary
15452auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15453 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15454 }));
15455constunsigned NumElts =
15456 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15457constunsigned NumScalars = E->Scalars.size();
15458
15459unsignedOffset = *getElementIndex(VL0);
15460assert(Offset < NumElts &&"Failed to find vector index offset");
15461
15462// Create shuffle to resize vector
15463SmallVector<int>Mask;
15464if (!E->ReorderIndices.empty()) {
15465inversePermutation(E->ReorderIndices, Mask);
15466Mask.append(NumElts - NumScalars,PoisonMaskElem);
15467 }else {
15468Mask.assign(NumElts,PoisonMaskElem);
15469 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
15470 }
15471// Create InsertVector shuffle if necessary
15472bool IsIdentity =true;
15473SmallVector<int> PrevMask(NumElts,PoisonMaskElem);
15474Mask.swap(PrevMask);
15475for (unsignedI = 0;I < NumScalars; ++I) {
15476Value *Scalar = E->Scalars[PrevMask[I]];
15477unsigned InsertIdx = *getElementIndex(Scalar);
15478 IsIdentity &= InsertIdx -Offset ==I;
15479Mask[InsertIdx -Offset] =I;
15480 }
15481if (!IsIdentity || NumElts != NumScalars) {
15482Value *V2 =nullptr;
15483bool IsVNonPoisonous =
15484 !isConstant(V) &&isGuaranteedNotToBePoison(V, AC);
15485SmallVector<int> InsertMask(Mask);
15486if (NumElts != NumScalars &&Offset == 0) {
15487// Follow all insert element instructions from the current buildvector
15488// sequence.
15489InsertElementInst *Ins = cast<InsertElementInst>(VL0);
15490do {
15491 std::optional<unsigned> InsertIdx =getElementIndex(Ins);
15492if (!InsertIdx)
15493break;
15494if (InsertMask[*InsertIdx] ==PoisonMaskElem)
15495 InsertMask[*InsertIdx] = *InsertIdx;
15496if (!Ins->hasOneUse())
15497break;
15498Ins = dyn_cast_or_null<InsertElementInst>(
15499Ins->getUniqueUndroppableUser());
15500 }while (Ins);
15501SmallBitVector UseMask =
15502buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15503SmallBitVector IsFirstPoison =
15504 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15505SmallBitVector IsFirstUndef =
15506isUndefVector(FirstInsert->getOperand(0), UseMask);
15507if (!IsFirstPoison.all()) {
15508unsignedIdx = 0;
15509for (unsignedI = 0;I < NumElts;I++) {
15510if (InsertMask[I] ==PoisonMaskElem && !IsFirstPoison.test(I) &&
15511 IsFirstUndef.test(I)) {
15512if (IsVNonPoisonous) {
15513 InsertMask[I] =I < NumScalars ?I : 0;
15514continue;
15515 }
15516if (!V2)
15517V2 =UndefValue::get(V->getType());
15518if (Idx >= NumScalars)
15519Idx = NumScalars - 1;
15520 InsertMask[I] = NumScalars +Idx;
15521 ++Idx;
15522 }elseif (InsertMask[I] !=PoisonMaskElem &&
15523 Mask[I] ==PoisonMaskElem) {
15524 InsertMask[I] =PoisonMaskElem;
15525 }
15526 }
15527 }else {
15528 InsertMask =Mask;
15529 }
15530 }
15531if (!V2)
15532V2 =PoisonValue::get(V->getType());
15533V = Builder.CreateShuffleVector(V, V2, InsertMask);
15534if (auto *I = dyn_cast<Instruction>(V)) {
15535 GatherShuffleExtractSeq.insert(I);
15536 CSEBlocks.insert(I->getParent());
15537 }
15538 }
15539
15540SmallVector<int> InsertMask(NumElts,PoisonMaskElem);
15541for (unsignedI = 0;I < NumElts;I++) {
15542if (Mask[I] !=PoisonMaskElem)
15543 InsertMask[Offset +I] =I;
15544 }
15545SmallBitVector UseMask =
15546buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15547SmallBitVector IsFirstUndef =
15548isUndefVector(FirstInsert->getOperand(0), UseMask);
15549if ((!IsIdentity ||Offset != 0 || !IsFirstUndef.all()) &&
15550 NumElts != NumScalars) {
15551if (IsFirstUndef.all()) {
15552if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
15553SmallBitVector IsFirstPoison =
15554 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15555if (!IsFirstPoison.all()) {
15556for (unsignedI = 0;I < NumElts;I++) {
15557if (InsertMask[I] ==PoisonMaskElem && !IsFirstPoison.test(I))
15558 InsertMask[I] =I + NumElts;
15559 }
15560 }
15561V = Builder.CreateShuffleVector(
15562 V,
15563 IsFirstPoison.all() ?PoisonValue::get(V->getType())
15564 : FirstInsert->getOperand(0),
15565 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
15566if (auto *I = dyn_cast<Instruction>(V)) {
15567 GatherShuffleExtractSeq.insert(I);
15568 CSEBlocks.insert(I->getParent());
15569 }
15570 }
15571 }else {
15572SmallBitVector IsFirstPoison =
15573 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15574for (unsignedI = 0;I < NumElts;I++) {
15575if (InsertMask[I] ==PoisonMaskElem)
15576 InsertMask[I] = IsFirstPoison.test(I) ?PoisonMaskElem :I;
15577else
15578 InsertMask[I] += NumElts;
15579 }
15580V = Builder.CreateShuffleVector(
15581 FirstInsert->getOperand(0), V, InsertMask,
15582 cast<Instruction>(E->Scalars.back())->getName());
15583if (auto *I = dyn_cast<Instruction>(V)) {
15584 GatherShuffleExtractSeq.insert(I);
15585 CSEBlocks.insert(I->getParent());
15586 }
15587 }
15588 }
15589
15590 ++NumVectorInstructions;
15591 E->VectorizedValue =V;
15592returnV;
15593 }
15594case Instruction::ZExt:
15595case Instruction::SExt:
15596case Instruction::FPToUI:
15597case Instruction::FPToSI:
15598case Instruction::FPExt:
15599case Instruction::PtrToInt:
15600case Instruction::IntToPtr:
15601case Instruction::SIToFP:
15602case Instruction::UIToFP:
15603case Instruction::Trunc:
15604case Instruction::FPTrunc:
15605case Instruction::BitCast: {
15606 setInsertPointAfterBundle(E);
15607
15608Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15609if (E->VectorizedValue) {
15610LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15611return E->VectorizedValue;
15612 }
15613
15614auto *CI = cast<CastInst>(VL0);
15615Instruction::CastOps VecOpcode = CI->getOpcode();
15616Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
15617auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15618if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15619 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
15620 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
15621// Check if the values are candidates to demote.
15622unsigned SrcBWSz =DL->getTypeSizeInBits(SrcScalarTy);
15623if (SrcIt != MinBWs.end())
15624 SrcBWSz = SrcIt->second.first;
15625unsigned BWSz =DL->getTypeSizeInBits(ScalarTy->getScalarType());
15626if (BWSz == SrcBWSz) {
15627 VecOpcode = Instruction::BitCast;
15628 }elseif (BWSz < SrcBWSz) {
15629 VecOpcode = Instruction::Trunc;
15630 }elseif (It != MinBWs.end()) {
15631assert(BWSz > SrcBWSz &&"Invalid cast!");
15632 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15633 }elseif (SrcIt != MinBWs.end()) {
15634assert(BWSz > SrcBWSz &&"Invalid cast!");
15635 VecOpcode =
15636 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15637 }
15638 }elseif (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15639 !SrcIt->second.second) {
15640 VecOpcode = Instruction::UIToFP;
15641 }
15642Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15643 ? InVec
15644 : Builder.CreateCast(VecOpcode, InVec, VecTy);
15645V = FinalShuffle(V, E);
15646
15647 E->VectorizedValue =V;
15648 ++NumVectorInstructions;
15649returnV;
15650 }
15651case Instruction::FCmp:
15652case Instruction::ICmp: {
15653 setInsertPointAfterBundle(E);
15654
15655Value *L = vectorizeOperand(E, 0, PostponedPHIs);
15656if (E->VectorizedValue) {
15657LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15658return E->VectorizedValue;
15659 }
15660Value *R = vectorizeOperand(E, 1, PostponedPHIs);
15661if (E->VectorizedValue) {
15662LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15663return E->VectorizedValue;
15664 }
15665if (L->getType() !=R->getType()) {
15666assert((getOperandEntry(E, 0)->isGather() ||
15667 getOperandEntry(E, 1)->isGather() ||
15668 MinBWs.contains(getOperandEntry(E, 0)) ||
15669 MinBWs.contains(getOperandEntry(E, 1))) &&
15670"Expected item in MinBWs.");
15671if (cast<VectorType>(L->getType())
15672 ->getElementType()
15673 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
15674 ->getElementType()
15675 ->getIntegerBitWidth()) {
15676Type *CastTy =R->getType();
15677L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
15678 }else {
15679Type *CastTy =L->getType();
15680R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
15681 }
15682 }
15683
15684CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
15685Value *V = Builder.CreateCmp(P0, L, R);
15686propagateIRFlags(V, E->Scalars, VL0);
15687if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
15688 ICmp->setSameSign(/*B=*/false);
15689// Do not cast for cmps.
15690 VecTy = cast<FixedVectorType>(V->getType());
15691V = FinalShuffle(V, E);
15692
15693 E->VectorizedValue =V;
15694 ++NumVectorInstructions;
15695returnV;
15696 }
15697case Instruction::Select: {
15698 setInsertPointAfterBundle(E);
15699
15700Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
15701if (E->VectorizedValue) {
15702LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15703return E->VectorizedValue;
15704 }
15705Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15706if (E->VectorizedValue) {
15707LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15708return E->VectorizedValue;
15709 }
15710Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15711if (E->VectorizedValue) {
15712LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15713return E->VectorizedValue;
15714 }
15715if (True->getType() != VecTy || False->getType() != VecTy) {
15716assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
15717 getOperandEntry(E, 2)->isGather() ||
15718 MinBWs.contains(getOperandEntry(E, 1)) ||
15719 MinBWs.contains(getOperandEntry(E, 2))) &&
15720"Expected item in MinBWs.");
15721if (True->getType() != VecTy)
15722 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
15723if (False->getType() != VecTy)
15724 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
15725 }
15726
15727unsigned CondNumElements =getNumElements(Cond->getType());
15728unsigned TrueNumElements =getNumElements(True->getType());
15729assert(TrueNumElements >= CondNumElements &&
15730 TrueNumElements % CondNumElements == 0 &&
15731"Cannot vectorize Instruction::Select");
15732assert(TrueNumElements ==getNumElements(False->getType()) &&
15733"Cannot vectorize Instruction::Select");
15734if (CondNumElements != TrueNumElements) {
15735// When the return type is i1 but the source is fixed vector type, we
15736// need to duplicate the condition value.
15737Cond = Builder.CreateShuffleVector(
15738Cond,createReplicatedMask(TrueNumElements / CondNumElements,
15739 CondNumElements));
15740 }
15741assert(getNumElements(Cond->getType()) == TrueNumElements &&
15742"Cannot vectorize Instruction::Select");
15743Value *V = Builder.CreateSelect(Cond, True, False);
15744V = FinalShuffle(V, E);
15745
15746 E->VectorizedValue =V;
15747 ++NumVectorInstructions;
15748returnV;
15749 }
15750case Instruction::FNeg: {
15751 setInsertPointAfterBundle(E);
15752
15753Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15754
15755if (E->VectorizedValue) {
15756LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15757return E->VectorizedValue;
15758 }
15759
15760Value *V = Builder.CreateUnOp(
15761static_cast<Instruction::UnaryOps>(E->getOpcode()),Op);
15762propagateIRFlags(V, E->Scalars, VL0);
15763if (auto *I = dyn_cast<Instruction>(V))
15764V =::propagateMetadata(I, E->Scalars);
15765
15766V = FinalShuffle(V, E);
15767
15768 E->VectorizedValue =V;
15769 ++NumVectorInstructions;
15770
15771returnV;
15772 }
15773case Instruction::Freeze: {
15774 setInsertPointAfterBundle(E);
15775
15776Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15777
15778if (E->VectorizedValue) {
15779LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15780return E->VectorizedValue;
15781 }
15782
15783if (Op->getType() != VecTy) {
15784assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15785 MinBWs.contains(getOperandEntry(E, 0))) &&
15786"Expected item in MinBWs.");
15787Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
15788 }
15789Value *V = Builder.CreateFreeze(Op);
15790V = FinalShuffle(V, E);
15791
15792 E->VectorizedValue =V;
15793 ++NumVectorInstructions;
15794
15795returnV;
15796 }
15797case Instruction::Add:
15798case Instruction::FAdd:
15799case Instruction::Sub:
15800case Instruction::FSub:
15801case Instruction::Mul:
15802case Instruction::FMul:
15803case Instruction::UDiv:
15804case Instruction::SDiv:
15805case Instruction::FDiv:
15806case Instruction::URem:
15807case Instruction::SRem:
15808case Instruction::FRem:
15809case Instruction::Shl:
15810case Instruction::LShr:
15811case Instruction::AShr:
15812case Instruction::And:
15813case Instruction::Or:
15814case Instruction::Xor: {
15815 setInsertPointAfterBundle(E);
15816
15817Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
15818if (E->VectorizedValue) {
15819LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15820return E->VectorizedValue;
15821 }
15822Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
15823if (E->VectorizedValue) {
15824LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15825return E->VectorizedValue;
15826 }
15827if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15828for (unsignedI : seq<unsigned>(0, E->getNumOperands())) {
15829ArrayRef<Value *> Ops = E->getOperand(I);
15830if (all_of(Ops, [&](Value *Op) {
15831auto *CI = dyn_cast<ConstantInt>(Op);
15832return CI && CI->getValue().countr_one() >= It->second.first;
15833 })) {
15834V = FinalShuffle(I == 0 ? RHS : LHS, E);
15835 E->VectorizedValue =V;
15836 ++NumVectorInstructions;
15837returnV;
15838 }
15839 }
15840 }
15841if (LHS->getType() != VecTy ||RHS->getType() != VecTy) {
15842assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15843 getOperandEntry(E, 1)->isGather() ||
15844 MinBWs.contains(getOperandEntry(E, 0)) ||
15845 MinBWs.contains(getOperandEntry(E, 1))) &&
15846"Expected item in MinBWs.");
15847if (LHS->getType() != VecTy)
15848LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
15849if (RHS->getType() != VecTy)
15850RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
15851 }
15852
15853Value *V = Builder.CreateBinOp(
15854static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
15855 RHS);
15856propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
15857if (auto *I = dyn_cast<Instruction>(V)) {
15858V =::propagateMetadata(I, E->Scalars);
15859// Drop nuw flags for abs(sub(commutative), true).
15860if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
15861any_of(E->Scalars, [](Value *V) {
15862 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15863 }))
15864I->setHasNoUnsignedWrap(/*b=*/false);
15865 }
15866
15867V = FinalShuffle(V, E);
15868
15869 E->VectorizedValue =V;
15870 ++NumVectorInstructions;
15871
15872returnV;
15873 }
15874case Instruction::Load: {
15875// Loads are inserted at the head of the tree because we don't want to
15876// sink them all the way down past store instructions.
15877 setInsertPointAfterBundle(E);
15878
15879LoadInst *LI = cast<LoadInst>(VL0);
15880Instruction *NewLI;
15881Value *PO = LI->getPointerOperand();
15882if (E->State == TreeEntry::Vectorize) {
15883 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
15884 }elseif (E->State == TreeEntry::StridedVectorize) {
15885Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
15886Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
15887 PO = IsReverseOrder ? PtrN : Ptr0;
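      // Strided load: the stride is either a constant derived from the
      // pointer distance between the first and last element, or a runtime
      // value from calculateRtStride; either way it is scaled by the scalar
      // allocation size and negated for reversed order.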
15888 std::optional<int> Diff =getPointersDiff(
15889 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
15890Type *StrideTy =DL->getIndexType(PO->getType());
15891Value *StrideVal;
15892if (Diff) {
15893int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
15894 StrideVal =
15895 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
15896DL->getTypeAllocSize(ScalarTy));
15897 }else {
15898SmallVector<Value *> PointerOps(E->Scalars.size(),nullptr);
15899transform(E->Scalars, PointerOps.begin(), [](Value *V) {
15900 return cast<LoadInst>(V)->getPointerOperand();
15901 });
15902OrdersType Order;
15903 std::optional<Value *> Stride =
15904calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
15905 &*Builder.GetInsertPoint());
15906Value *NewStride =
15907 Builder.CreateIntCast(*Stride, StrideTy,/*isSigned=*/true);
15908 StrideVal = Builder.CreateMul(
15909 NewStride,
15910 ConstantInt::get(
15911 StrideTy,
15912 (IsReverseOrder ? -1 : 1) *
15913static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
15914 }
15915Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15916auto *Inst = Builder.CreateIntrinsic(
15917 Intrinsic::experimental_vp_strided_load,
15918 {VecTy, PO->getType(), StrideTy},
15919 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
15920 Builder.getInt32(E->Scalars.size())});
15921 Inst->addParamAttr(
15922/*ArgNo=*/0,
15923Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15924 NewLI = Inst;
15925 }else {
15926assert(E->State == TreeEntry::ScatterVectorize &&"Unhandled state");
15927Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
15928if (E->VectorizedValue) {
15929LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15930return E->VectorizedValue;
15931 }
15932if (isa<FixedVectorType>(ScalarTy)) {
15933assert(SLPReVec &&"FixedVectorType is not expected.");
15934// CreateMaskedGather expects VecTy and VecPtr have same size. We need
15935// to expand VecPtr if ScalarTy is a vector type.
15936unsigned ScalarTyNumElements =
15937 cast<FixedVectorType>(ScalarTy)->getNumElements();
15938unsigned VecTyNumElements =
15939 cast<FixedVectorType>(VecTy)->getNumElements();
15940assert(VecTyNumElements % ScalarTyNumElements == 0 &&
15941"Cannot expand getelementptr.");
15942unsigned VF = VecTyNumElements / ScalarTyNumElements;
15943SmallVector<Constant *> Indices(VecTyNumElements);
15944transform(seq(VecTyNumElements), Indices.begin(), [=](unsignedI) {
15945 return Builder.getInt64(I % ScalarTyNumElements);
15946 });
15947 VecPtr = Builder.CreateGEP(
15948 VecTy->getElementType(),
15949 Builder.CreateShuffleVector(
15950 VecPtr,createReplicatedMask(ScalarTyNumElements, VF)),
15951ConstantVector::get(Indices));
15952 }
15953// Use the minimum alignment of the gathered loads.
15954Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15955 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
15956 }
15957Value *V =::propagateMetadata(NewLI, E->Scalars);
15958
15959V = FinalShuffle(V, E);
15960 E->VectorizedValue =V;
15961 ++NumVectorInstructions;
15962returnV;
15963 }
15964case Instruction::Store: {
15965auto *SI = cast<StoreInst>(VL0);
15966
15967 setInsertPointAfterBundle(E);
15968
15969Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
15970if (VecValue->getType() != VecTy)
15971 VecValue =
15972 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
15973 VecValue = FinalShuffle(VecValue, E);
15974
15975Value *Ptr =SI->getPointerOperand();
15976Instruction *ST;
15977if (E->State == TreeEntry::Vectorize) {
15978ST = Builder.CreateAlignedStore(VecValue,Ptr,SI->getAlign());
15979 }else {
15980assert(E->State == TreeEntry::StridedVectorize &&
15981"Expected either strided or consecutive stores.");
15982if (!E->ReorderIndices.empty()) {
15983SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
15984Ptr =SI->getPointerOperand();
15985 }
15986Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
15987Type *StrideTy =DL->getIndexType(SI->getPointerOperandType());
15988auto *Inst = Builder.CreateIntrinsic(
15989 Intrinsic::experimental_vp_strided_store,
15990 {VecTy,Ptr->getType(), StrideTy},
15991 {VecValue,Ptr,
15992 ConstantInt::get(
15993 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
15994 Builder.getAllOnesMask(VecTy->getElementCount()),
15995 Builder.getInt32(E->Scalars.size())});
15996 Inst->addParamAttr(
15997/*ArgNo=*/1,
15998Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15999ST = Inst;
16000 }
16001
16002Value *V =::propagateMetadata(ST, E->Scalars);
16003
16004 E->VectorizedValue =V;
16005 ++NumVectorInstructions;
16006returnV;
16007 }
16008case Instruction::GetElementPtr: {
16009auto *GEP0 = cast<GetElementPtrInst>(VL0);
16010 setInsertPointAfterBundle(E);
16011
16012Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
16013if (E->VectorizedValue) {
16014LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
16015return E->VectorizedValue;
16016 }
16017
16018SmallVector<Value *> OpVecs;
16019for (int J = 1,N = GEP0->getNumOperands(); J <N; ++J) {
16020Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
16021if (E->VectorizedValue) {
16022LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
16023return E->VectorizedValue;
16024 }
16025 OpVecs.push_back(OpVec);
16026 }
16027
16028Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
16029if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
16030SmallVector<Value *> GEPs;
16031for (Value *V : E->Scalars) {
16032if (isa<GetElementPtrInst>(V))
16033 GEPs.push_back(V);
16034 }
16035V =::propagateMetadata(I, GEPs);
16036 }
16037
16038V = FinalShuffle(V, E);
16039
16040 E->VectorizedValue =V;
16041 ++NumVectorInstructions;
16042
16043returnV;
16044 }
16045case Instruction::Call: {
16046CallInst *CI = cast<CallInst>(VL0);
16047 setInsertPointAfterBundle(E);
16048
16049Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
16050
16051SmallVector<Type *> ArgTys =buildIntrinsicArgTypes(
16052 CI,ID, VecTy->getNumElements(),
16053 It != MinBWs.end() ? It->second.first : 0,TTI);
16054auto VecCallCosts =getVectorCallCosts(CI, VecTy,TTI, TLI, ArgTys);
16055bool UseIntrinsic =ID !=Intrinsic::not_intrinsic &&
16056 VecCallCosts.first <= VecCallCosts.second;
16057
16058Value *ScalarArg =nullptr;
16059SmallVector<Value *> OpVecs;
16060SmallVector<Type *, 2> TysForDecl;
16061// Add return type if intrinsic is overloaded on it.
16062if (UseIntrinsic &&isVectorIntrinsicWithOverloadTypeAtArg(ID, -1,TTI))
16063 TysForDecl.push_back(VecTy);
16064auto *CEI = cast<CallInst>(VL0);
16065for (unsignedI : seq<unsigned>(0, CI->arg_size())) {
16066ValueList OpVL;
16067// Some intrinsics have scalar arguments. This argument should not be
16068// vectorized.
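      // For example (illustrative), the is_int_min_poison flag of llvm.abs or
      // the exponent operand of llvm.powi stays scalar.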
16069if (UseIntrinsic &&isVectorIntrinsicWithScalarOpAtArg(ID,I,TTI)) {
16070 ScalarArg = CEI->getArgOperand(I);
        // If it was decided to reduce the bitwidth of the abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is the signed minimum).
16073if (ID == Intrinsic::abs && It != MinBWs.end() &&
16074 It->second.first <DL->getTypeSizeInBits(CEI->getType()))
16075 ScalarArg = Builder.getFalse();
16076 OpVecs.push_back(ScalarArg);
16077if (isVectorIntrinsicWithOverloadTypeAtArg(ID,I,TTI))
16078 TysForDecl.push_back(ScalarArg->getType());
16079continue;
16080 }
16081
16082Value *OpVec = vectorizeOperand(E,I, PostponedPHIs);
16083if (E->VectorizedValue) {
16084LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
16085return E->VectorizedValue;
16086 }
16087 ScalarArg = CEI->getArgOperand(I);
16088if (cast<VectorType>(OpVec->getType())->getElementType() !=
16089 ScalarArg->getType()->getScalarType() &&
16090 It == MinBWs.end()) {
16091auto *CastTy =
16092getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
16093 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
16094 }elseif (It != MinBWs.end()) {
16095 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
16096 }
16097LLVM_DEBUG(dbgs() <<"SLP: OpVec[" <<I <<"]: " << *OpVec <<"\n");
16098 OpVecs.push_back(OpVec);
16099if (UseIntrinsic &&isVectorIntrinsicWithOverloadTypeAtArg(ID,I,TTI))
16100 TysForDecl.push_back(OpVec->getType());
16101 }
16102
16103Function *CF;
16104if (!UseIntrinsic) {
16105VFShape Shape =
16106VFShape::get(CI->getFunctionType(),
16107ElementCount::getFixed(
16108static_cast<unsigned>(VecTy->getNumElements())),
16109false/*HasGlobalPred*/);
16110 CF =VFDatabase(*CI).getVectorizedFunction(Shape);
16111 }else {
16112 CF =Intrinsic::getOrInsertDeclaration(F->getParent(),ID, TysForDecl);
16113 }
16114
16115SmallVector<OperandBundleDef, 1> OpBundles;
16116 CI->getOperandBundlesAsDefs(OpBundles);
16117Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
16118
16119propagateIRFlags(V, E->Scalars, VL0);
16120V = FinalShuffle(V, E);
16121
16122 E->VectorizedValue =V;
16123 ++NumVectorInstructions;
16124returnV;
16125 }
16126case Instruction::ShuffleVector: {
16127Value *V;
16128if (SLPReVec && !E->isAltShuffle()) {
16129 setInsertPointAfterBundle(E);
16130Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
16131if (E->VectorizedValue) {
16132LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
16133return E->VectorizedValue;
16134 }
16135SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
16136if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
16137assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
16138"Not supported shufflevector usage.");
16139SmallVector<int> NewMask(ThisMask.size());
16140transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
16141 return SVSrc->getShuffleMask()[Mask];
16142 });
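        // Illustrative example: if SVSrc shuffles its first operand with mask
        // <3, 2, 1, 0> and ThisMask is <0, 2>, NewMask becomes <3, 1>, so the
        // two shuffles are folded into one over SVSrc's first operand.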
16143V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
16144 }else {
16145V = Builder.CreateShuffleVector(Src, ThisMask);
16146 }
16147propagateIRFlags(V, E->Scalars, VL0);
16148if (auto *I = dyn_cast<Instruction>(V))
16149V =::propagateMetadata(I, E->Scalars);
16150V = FinalShuffle(V, E);
16151 }else {
16152assert(E->isAltShuffle() &&
16153 ((Instruction::isBinaryOp(E->getOpcode()) &&
16154Instruction::isBinaryOp(E->getAltOpcode())) ||
16155 (Instruction::isCast(E->getOpcode()) &&
16156Instruction::isCast(E->getAltOpcode())) ||
16157 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16158"Invalid Shuffle Vector Operand");
16159
16160Value *LHS =nullptr, *RHS =nullptr;
16161if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
16162 setInsertPointAfterBundle(E);
16163LHS = vectorizeOperand(E, 0, PostponedPHIs);
16164if (E->VectorizedValue) {
16165LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
16166return E->VectorizedValue;
16167 }
16168RHS = vectorizeOperand(E, 1, PostponedPHIs);
16169 }else {
16170 setInsertPointAfterBundle(E);
16171LHS = vectorizeOperand(E, 0, PostponedPHIs);
16172 }
16173if (E->VectorizedValue) {
16174LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
16175return E->VectorizedValue;
16176 }
16177if (LHS && RHS &&
16178 ((Instruction::isBinaryOp(E->getOpcode()) &&
16179 (LHS->getType() != VecTy ||RHS->getType() != VecTy)) ||
16180 (isa<CmpInst>(VL0) &&LHS->getType() !=RHS->getType()))) {
16181assert((It != MinBWs.end() ||
16182 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
16183 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
16184 MinBWs.contains(getOperandEntry(E, 0)) ||
16185 MinBWs.contains(getOperandEntry(E, 1))) &&
16186"Expected item in MinBWs.");
16187Type *CastTy = VecTy;
16188if (isa<CmpInst>(VL0) &&LHS->getType() !=RHS->getType()) {
16189if (cast<VectorType>(LHS->getType())
16190 ->getElementType()
16191 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
16192 ->getElementType()
16193 ->getIntegerBitWidth())
16194 CastTy =RHS->getType();
16195else
16196 CastTy =LHS->getType();
16197 }
16198if (LHS->getType() != CastTy)
16199LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
16200if (RHS->getType() != CastTy)
16201RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
16202 }
16203
16204Value *V0, *V1;
16205if (Instruction::isBinaryOp(E->getOpcode())) {
16206 V0 = Builder.CreateBinOp(
16207static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
16208 V1 = Builder.CreateBinOp(
16209static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
16210 }elseif (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
16211 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
16212auto *AltCI = cast<CmpInst>(E->getAltOp());
16213CmpInst::Predicate AltPred = AltCI->getPredicate();
16214 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
16215 }else {
16216if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
16217unsigned SrcBWSz =DL->getTypeSizeInBits(
16218 cast<VectorType>(LHS->getType())->getElementType());
16219unsigned BWSz =DL->getTypeSizeInBits(ScalarTy);
16220if (BWSz <= SrcBWSz) {
16221if (BWSz < SrcBWSz)
16222LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
16223assert(LHS->getType() == VecTy &&
16224"Expected same type as operand.");
16225if (auto *I = dyn_cast<Instruction>(LHS))
16226LHS =::propagateMetadata(I, E->Scalars);
16227LHS = FinalShuffle(LHS, E);
16228 E->VectorizedValue =LHS;
16229 ++NumVectorInstructions;
16230returnLHS;
16231 }
16232 }
16233 V0 = Builder.CreateCast(
16234static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
16235 V1 = Builder.CreateCast(
16236static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
16237 }
16238// Add V0 and V1 to later analysis to try to find and remove matching
16239// instruction, if any.
16240for (Value *V : {V0, V1}) {
16241if (auto *I = dyn_cast<Instruction>(V)) {
16242 GatherShuffleExtractSeq.insert(I);
16243 CSEBlocks.insert(I->getParent());
16244 }
16245 }
16246
16247// Create shuffle to take alternate operations from the vector.
16248// Also, gather up main and alt scalar ops to propagate IR flags to
16249// each vector operation.
16250ValueList OpScalars, AltScalars;
16251SmallVector<int>Mask;
16252 E->buildAltOpShuffleMask(
16253 [E,this](Instruction *I) {
16254assert(E->isOpcodeOrAlt(I) &&"Unexpected main/alternate opcode");
16255returnisAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
16256 *TLI);
16257 },
16258Mask, &OpScalars, &AltScalars);
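      // Illustrative example: for a 4-lane add/sub alternating node where
      // lanes 0 and 2 use the main opcode and lanes 1 and 3 use the alternate
      // one, the resulting mask is <0, 5, 2, 7>: even lanes come from V0 and
      // odd lanes from V1 (offset by the vector width).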
16259
16260propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
16261propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
16262auto DropNuwFlag = [&](Value *Vec,unsigned Opcode) {
16263// Drop nuw flags for abs(sub(commutative), true).
16264if (auto *I = dyn_cast<Instruction>(Vec);
16265I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
16266any_of(E->Scalars, [](Value *V) {
16267 if (isa<PoisonValue>(V))
16268 return false;
16269 auto *IV = cast<Instruction>(V);
16270 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
16271 }))
16272I->setHasNoUnsignedWrap(/*b=*/false);
16273 };
16274 DropNuwFlag(V0, E->getOpcode());
16275 DropNuwFlag(V1, E->getAltOpcode());
16276
16277if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
16278assert(SLPReVec &&"FixedVectorType is not expected.");
16279transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
16280 }
16281V = Builder.CreateShuffleVector(V0, V1, Mask);
16282if (auto *I = dyn_cast<Instruction>(V)) {
16283V =::propagateMetadata(I, E->Scalars);
16284 GatherShuffleExtractSeq.insert(I);
16285 CSEBlocks.insert(I->getParent());
16286 }
16287 }
16288
16289 E->VectorizedValue =V;
16290 ++NumVectorInstructions;
16291
16292returnV;
16293 }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
16299
Value *BoUpSLP::vectorizeTree() {
  ExtraValueToDebugLocsMap ExternallyUsedValues;
  return vectorizeTree(ExternallyUsedValues);
}

Value *
BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
                       Instruction *ReductionRoot) {
16308// All blocks must be scheduled before any instructions are inserted.
16309for (auto &BSIter : BlocksSchedules) {
16310 scheduleBlock(BSIter.second.get());
16311 }
16312// Clean Entry-to-LastInstruction table. It can be affected after scheduling,
16313// need to rebuild it.
16314 EntryToLastInstruction.clear();
16315
16316if (ReductionRoot)
16317 Builder.SetInsertPoint(ReductionRoot->getParent(),
16318 ReductionRoot->getIterator());
16319else
16320 Builder.SetInsertPoint(&F->getEntryBlock(),F->getEntryBlock().begin());
16321
  // Emit gathered loads first to generate better code for their users.
16324for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16325if (GatheredLoadsEntriesFirst.has_value() &&
16326 TE->Idx >= *GatheredLoadsEntriesFirst &&
16327 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16328assert((!TE->UserTreeIndices.empty() ||
16329 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16330"Expected gathered load node.");
16331 (void)vectorizeTree(TE.get(),/*PostponedPHIs=*/false);
16332 }
16333 }
16334// Postpone emission of PHIs operands to avoid cyclic dependencies issues.
16335 (void)vectorizeTree(VectorizableTree[0].get(),/*PostponedPHIs=*/true);
16336for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16337if (TE->State == TreeEntry::Vectorize &&
16338 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16339 TE->VectorizedValue)
16340 (void)vectorizeTree(TE.get(),/*PostponedPHIs=*/false);
16341// Run through the list of postponed gathers and emit them, replacing the temp
16342// emitted allocas with actual vector instructions.
16343ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
16344DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
16345for (const TreeEntry *E : PostponedNodes) {
16346auto *TE =const_cast<TreeEntry *>(E);
16347if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
16348if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
16349 TE->UserTreeIndices.front().EdgeIdx)) &&
16350 VecTE->isSame(TE->Scalars))
16351// Found gather node which is absolutely the same as one of the
16352// vectorized nodes. It may happen after reordering.
16353continue;
16354auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16355 TE->VectorizedValue =nullptr;
16356auto *UserI =
16357 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were some
    // unresolved dependencies at the moment its stub instruction was emitted.
    // If any of these dependencies turn out to be an operand of another PHI
    // coming from this same block, the position of the stub instruction
    // becomes invalid. This is because the source vector that is supposed to
    // feed this gather node was inserted at the end of the block [after the
    // stub instruction], so we need to adjust the insertion point to the end
    // of the block again.
16366if (isa<PHINode>(UserI)) {
16367// Insert before all users.
16368Instruction *InsertPt = PrevVec->getParent()->getTerminator();
16369for (User *U : PrevVec->users()) {
16370if (U == UserI)
16371continue;
16372auto *UI = dyn_cast<Instruction>(U);
16373if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
16374continue;
16375if (UI->comesBefore(InsertPt))
16376 InsertPt = UI;
16377 }
16378 Builder.SetInsertPoint(InsertPt);
16379 }else {
16380 Builder.SetInsertPoint(PrevVec);
16381 }
16382 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
16383Value *Vec =vectorizeTree(TE,/*PostponedPHIs=*/false);
16384if (auto *VecI = dyn_cast<Instruction>(Vec);
16385 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
16386 Builder.GetInsertPoint()->comesBefore(VecI))
16387 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
16388 Builder.GetInsertPoint());
16389if (Vec->getType() != PrevVec->getType()) {
16390assert(Vec->getType()->isIntOrIntVectorTy() &&
16391 PrevVec->getType()->isIntOrIntVectorTy() &&
16392"Expected integer vector types only.");
16393 std::optional<bool> IsSigned;
16394for (Value *V : TE->Scalars) {
16395if (const TreeEntry *BaseTE = getTreeEntry(V)) {
16396auto It = MinBWs.find(BaseTE);
16397if (It != MinBWs.end()) {
16398 IsSigned = IsSigned.value_or(false) || It->second.second;
16399if (*IsSigned)
16400break;
16401 }
16402for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
16403auto It = MinBWs.find(MNTE);
16404if (It != MinBWs.end()) {
16405 IsSigned = IsSigned.value_or(false) || It->second.second;
16406if (*IsSigned)
16407break;
16408 }
16409 }
16410if (IsSigned.value_or(false))
16411break;
16412// Scan through gather nodes.
16413for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16414auto It = MinBWs.find(BVE);
16415if (It != MinBWs.end()) {
16416 IsSigned = IsSigned.value_or(false) || It->second.second;
16417if (*IsSigned)
16418break;
16419 }
16420 }
16421if (IsSigned.value_or(false))
16422break;
16423if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
16424 IsSigned =
16425 IsSigned.value_or(false) ||
16426 !isKnownNonNegative(EE->getVectorOperand(),SimplifyQuery(*DL));
16427continue;
16428 }
16429if (IsSigned.value_or(false))
16430break;
16431 }
16432 }
16433if (IsSigned.value_or(false)) {
16434// Final attempt - check user node.
16435auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
16436if (It != MinBWs.end())
16437 IsSigned = It->second.second;
16438 }
16439assert(IsSigned &&
16440"Expected user node or perfect diamond match in MinBWs.");
16441 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
16442 }
16443 PrevVec->replaceAllUsesWith(Vec);
16444 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node if it was already used for one of the
    // buildvector nodes.
16447auto It = PostponedValues.find(PrevVec);
16448if (It != PostponedValues.end()) {
16449for (TreeEntry *VTE : It->getSecond())
16450 VTE->VectorizedValue = Vec;
16451 }
16452eraseInstruction(PrevVec);
16453 }
16454
16455LLVM_DEBUG(dbgs() <<"SLP: Extracting " << ExternalUses.size()
16456 <<" values .\n");
16457
16458SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
16459// Maps vector instruction to original insertelement instruction
16460DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
16461// Maps extract Scalar to the corresponding extractelement instruction in the
16462// basic block. Only one extractelement per block should be emitted.
16463DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
16464 ScalarToEEs;
16465SmallDenseSet<Value *, 4> UsedInserts;
16466DenseMap<std::pair<Value *, Type *>,Value *> VectorCasts;
16467SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
16468SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
16469// Extract all of the elements with the external uses.
16470for (constauto &ExternalUse : ExternalUses) {
16471Value *Scalar = ExternalUse.Scalar;
16472llvm::User *User = ExternalUse.User;
16473
    // Skip users that we have already RAUW'ed. This happens when one
    // instruction has multiple uses of the same value.
16476if (User && !is_contained(Scalar->users(),User))
16477continue;
16478 TreeEntry *E = getTreeEntry(Scalar);
16479assert(E &&"Invalid scalar");
16480assert(!E->isGather() &&"Extracting from a gather list");
16481// Non-instruction pointers are not deleted, just skip them.
16482if (E->getOpcode() == Instruction::GetElementPtr &&
16483 !isa<GetElementPtrInst>(Scalar))
16484continue;
16485
16486Value *Vec = E->VectorizedValue;
16487assert(Vec &&"Can't find vectorizable value");
16488
16489Value *Lane = Builder.getInt32(ExternalUse.Lane);
16490auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
16491if (Scalar->getType() != Vec->getType()) {
16492Value *Ex =nullptr;
16493Value *ExV =nullptr;
16494auto *Inst = dyn_cast<Instruction>(Scalar);
16495bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
16496auto It = ScalarToEEs.find(Scalar);
16497if (It != ScalarToEEs.end()) {
16498// No need to emit many extracts, just move the only one in the
16499// current block.
16500auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16501 : Builder.GetInsertBlock());
16502if (EEIt != It->second.end()) {
16503Value *PrevV = EEIt->second.first;
16504if (auto *I = dyn_cast<Instruction>(PrevV);
16505I && !ReplaceInst &&
16506 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
16507 Builder.GetInsertPoint()->comesBefore(I)) {
16508I->moveBefore(*Builder.GetInsertPoint()->getParent(),
16509 Builder.GetInsertPoint());
16510if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16511 CI->moveAfter(I);
16512 }
16513 Ex = PrevV;
16514 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16515 }
16516 }
16517if (!Ex) {
16518// "Reuse" the existing extract to improve final codegen.
16519if (ReplaceInst) {
          // Leave the instruction as is if keeping the existing extract is
          // cheaper and all of its operands are scalar.
16522if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16523 IgnoredExtracts.insert(EE);
16524 Ex = EE;
16525 }else {
16526auto *CloneInst = Inst->clone();
16527 CloneInst->insertBefore(Inst->getIterator());
16528if (Inst->hasName())
16529 CloneInst->takeName(Inst);
16530 Ex = CloneInst;
16531 }
16532 }elseif (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16533 ES && isa<Instruction>(Vec)) {
16534Value *V = ES->getVectorOperand();
16535auto *IVec = cast<Instruction>(Vec);
16536if (const TreeEntry *ETE = getTreeEntry(V))
16537 V = ETE->VectorizedValue;
16538if (auto *IV = dyn_cast<Instruction>(V);
16539 !IV ||IV == Vec ||IV->getParent() != IVec->getParent() ||
16540IV->comesBefore(IVec))
16541 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
16542else
16543 Ex = Builder.CreateExtractElement(Vec, Lane);
16544 }elseif (auto *VecTy =
16545 dyn_cast<FixedVectorType>(Scalar->getType())) {
16546assert(SLPReVec &&"FixedVectorType is not expected.");
16547unsigned VecTyNumElements = VecTy->getNumElements();
16548// When REVEC is enabled, we need to extract a vector.
16549// Note: The element size of Scalar may be different from the
16550// element size of Vec.
16551 Ex =createExtractVector(Builder, Vec, VecTyNumElements,
16552 ExternalUse.Lane * VecTyNumElements);
16553 }else {
16554 Ex = Builder.CreateExtractElement(Vec, Lane);
16555 }
16556// If necessary, sign-extend or zero-extend ScalarRoot
16557// to the larger type.
16558 ExV = Ex;
16559if (Scalar->getType() != Ex->getType())
16560 ExV = Builder.CreateIntCast(
16561 Ex, Scalar->getType(),
16562 !isKnownNonNegative(Scalar,SimplifyQuery(*DL)));
16563auto *I = dyn_cast<Instruction>(Ex);
16564 ScalarToEEs[Scalar].try_emplace(I ?I->getParent()
16565 : &F->getEntryBlock(),
16566 std::make_pair(Ex, ExV));
16567 }
16568// The then branch of the previous if may produce constants, since 0
16569// operand might be a constant.
16570if (auto *ExI = dyn_cast<Instruction>(Ex);
16571 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
16572 GatherShuffleExtractSeq.insert(ExI);
16573 CSEBlocks.insert(ExI->getParent());
16574 }
16575return ExV;
16576 }
16577assert(isa<FixedVectorType>(Scalar->getType()) &&
16578 isa<InsertElementInst>(Scalar) &&
16579"In-tree scalar of vector type is not insertelement?");
16580auto *IE = cast<InsertElementInst>(Scalar);
16581 VectorToInsertElement.try_emplace(Vec, IE);
16582return Vec;
16583 };
16584// If User == nullptr, the Scalar remains as scalar in vectorized
16585// instructions or is used as extra arg. Generate ExtractElement instruction
16586// and update the record for this scalar in ExternallyUsedValues.
16587if (!User) {
16588if (!ScalarsWithNullptrUser.insert(Scalar).second)
16589continue;
16590assert((ExternallyUsedValues.count(Scalar) ||
16591 Scalar->hasNUsesOrMore(UsesLimit) ||
16592 ExternalUsesAsOriginalScalar.contains(Scalar) ||
16593any_of(Scalar->users(),
16594 [&](llvm::User *U) {
16595 if (ExternalUsesAsOriginalScalar.contains(U))
16596 return true;
16597 TreeEntry *UseEntry = getTreeEntry(U);
16598 return UseEntry &&
16599 (UseEntry->State == TreeEntry::Vectorize ||
16600 UseEntry->State ==
16601 TreeEntry::StridedVectorize) &&
16602 (E->State == TreeEntry::Vectorize ||
16603 E->State == TreeEntry::StridedVectorize) &&
16604 doesInTreeUserNeedToExtract(
16605 Scalar, getRootEntryInstruction(*UseEntry),
16606 TLI, TTI);
16607 })) &&
16608"Scalar with nullptr User must be registered in "
16609"ExternallyUsedValues map or remain as scalar in vectorized "
16610"instructions");
16611if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16612if (auto *PHI = dyn_cast<PHINode>(VecI)) {
16613if (PHI->getParent()->isLandingPad())
16614 Builder.SetInsertPoint(
16615PHI->getParent(),
16616 std::next(
16617PHI->getParent()->getLandingPadInst()->getIterator()));
16618else
16619 Builder.SetInsertPoint(PHI->getParent(),
16620PHI->getParent()->getFirstNonPHIIt());
16621 }else {
16622 Builder.SetInsertPoint(VecI->getParent(),
16623 std::next(VecI->getIterator()));
16624 }
16625 }else {
16626 Builder.SetInsertPoint(&F->getEntryBlock(),F->getEntryBlock().begin());
16627 }
16628Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16629// Required to update internally referenced instructions.
16630if (Scalar != NewInst) {
16631assert((!isa<ExtractElementInst>(Scalar) ||
16632 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
16633"Extractelements should not be replaced.");
16634 Scalar->replaceAllUsesWith(NewInst);
16635 }
16636continue;
16637 }
16638
16639if (auto *VU = dyn_cast<InsertElementInst>(User);
16640 VU && VU->getOperand(1) == Scalar) {
16641// Skip if the scalar is another vector op or Vec is not an instruction.
16642if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16643if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
16644if (!UsedInserts.insert(VU).second)
16645continue;
16646// Need to use original vector, if the root is truncated.
16647auto BWIt = MinBWs.find(E);
16648if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
16649auto *ScalarTy = FTy->getElementType();
16650auto Key = std::make_pair(Vec, ScalarTy);
16651auto VecIt = VectorCasts.find(Key);
16652if (VecIt == VectorCasts.end()) {
16653IRBuilderBase::InsertPointGuard Guard(Builder);
16654if (auto *IVec = dyn_cast<PHINode>(Vec)) {
16655if (IVec->getParent()->isLandingPad())
16656 Builder.SetInsertPoint(IVec->getParent(),
16657 std::next(IVec->getParent()
16658 ->getLandingPadInst()
16659 ->getIterator()));
16660else
16661 Builder.SetInsertPoint(
16662 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16663 }elseif (auto *IVec = dyn_cast<Instruction>(Vec)) {
16664 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
16665 }
16666 Vec = Builder.CreateIntCast(
16667 Vec,
16668getWidenedType(
16669 ScalarTy,
16670 cast<FixedVectorType>(Vec->getType())->getNumElements()),
16671 BWIt->second.second);
16672 VectorCasts.try_emplace(Key, Vec);
16673 }else {
16674 Vec = VecIt->second;
16675 }
16676 }
16677
16678 std::optional<unsigned> InsertIdx =getElementIndex(VU);
16679if (InsertIdx) {
16680auto *It =find_if(
16681 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
16682// Checks if 2 insertelements are from the same buildvector.
16683InsertElementInst *VecInsert =Data.InsertElements.front();
16684returnareTwoInsertFromSameBuildVector(
16685 VU, VecInsert,
16686 [](InsertElementInst *II) {returnII->getOperand(0); });
16687 });
16688unsignedIdx = *InsertIdx;
16689if (It == ShuffledInserts.end()) {
16690 (void)ShuffledInserts.emplace_back();
16691 It = std::next(ShuffledInserts.begin(),
16692 ShuffledInserts.size() - 1);
16693 }
16694SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
16695if (Mask.empty())
16696 Mask.assign(FTy->getNumElements(),PoisonMaskElem);
16697 Mask[Idx] = ExternalUse.Lane;
16698 It->InsertElements.push_back(cast<InsertElementInst>(User));
16699continue;
16700 }
16701 }
16702 }
16703 }
16704
16705// Generate extracts for out-of-tree users.
16706// Find the insertion point for the extractelement lane.
16707if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16708if (PHINode *PH = dyn_cast<PHINode>(User)) {
16709for (unsignedI : seq<unsigned>(0, PH->getNumIncomingValues())) {
16710if (PH->getIncomingValue(I) == Scalar) {
16711Instruction *IncomingTerminator =
16712 PH->getIncomingBlock(I)->getTerminator();
16713if (isa<CatchSwitchInst>(IncomingTerminator)) {
16714 Builder.SetInsertPoint(VecI->getParent(),
16715 std::next(VecI->getIterator()));
16716 }else {
16717 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
16718 }
16719Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16720 PH->setOperand(I, NewInst);
16721 }
16722 }
16723 }else {
16724 Builder.SetInsertPoint(cast<Instruction>(User));
16725Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16726User->replaceUsesOfWith(Scalar, NewInst);
16727 }
16728 }else {
16729 Builder.SetInsertPoint(&F->getEntryBlock(),F->getEntryBlock().begin());
16730Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16731User->replaceUsesOfWith(Scalar, NewInst);
16732 }
16733
16734LLVM_DEBUG(dbgs() <<"SLP: Replaced:" << *User <<".\n");
16735 }
16736
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };
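  // Illustrative example for CreateShuffle: with VF == 4 and Mask ==
  // <0, 5, 2, 7>, CombinedMask1 becomes <0, poison, 2, poison> (lanes taken
  // from V1) and CombinedMask2 becomes <poison, 1, poison, 3> (lanes taken
  // from V2), which the ShuffleInstructionBuilder then folds into the final
  // shuffle sequence.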
16754
16755auto &&ResizeToVF = [&CreateShuffle](Value *Vec,ArrayRef<int> Mask,
16756bool ForSingleMask) {
16757unsigned VF = Mask.size();
16758unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
16759if (VF != VecVF) {
16760if (any_of(Mask, [VF](intIdx) {returnIdx >=static_cast<int>(VF); })) {
16761 Vec = CreateShuffle(Vec,nullptr, Mask);
16762return std::make_pair(Vec,true);
16763 }
16764if (!ForSingleMask) {
16765SmallVector<int> ResizeMask(VF,PoisonMaskElem);
16766for (unsignedI = 0;I < VF; ++I) {
16767if (Mask[I] !=PoisonMaskElem)
16768 ResizeMask[Mask[I]] = Mask[I];
16769 }
16770 Vec = CreateShuffle(Vec,nullptr, ResizeMask);
16771 }
16772 }
16773
16774return std::make_pair(Vec,false);
16775 };
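  // Illustrative example for ResizeToVF: if Vec has 8 elements but the mask
  // size is 4, either the mask is applied directly (when it references lanes
  // >= 4, returning the pair (Vec, true)), or Vec is first narrowed with a
  // resizing shuffle that keeps the referenced lanes in place, e.g.
  // <0, 1, 2, 3> when all four low lanes are used.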
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
16778for (intI = 0, E = ShuffledInserts.size();I < E; ++I) {
16779// Find the first and the last instruction in the list of insertelements.
16780sort(ShuffledInserts[I].InsertElements,isFirstInsertElement);
16781InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
16782InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
16783 Builder.SetInsertPoint(LastInsert);
16784autoVector = ShuffledInserts[I].ValueMasks.takeVector();
16785Value *NewInst = performExtractsShuffleAction<Value>(
16786MutableArrayRef(Vector.data(),Vector.size()),
16787 FirstInsert->getOperand(0),
16788 [](Value *Vec) {
16789 return cast<VectorType>(Vec->getType())
16790 ->getElementCount()
16791 .getKnownMinValue();
16792 },
16793 ResizeToVF,
16794 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
16795ArrayRef<Value *> Vals) {
16796 assert((Vals.size() == 1 || Vals.size() == 2) &&
16797"Expected exactly 1 or 2 input values.");
16798 if (Vals.size() == 1) {
16799// Do not create shuffle if the mask is a simple identity
16800// non-resizing mask.
16801 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16802 ->getNumElements() ||
16803 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16804 return CreateShuffle(Vals.front(), nullptr, Mask);
16805 return Vals.front();
16806 }
16807return CreateShuffle(Vals.front() ? Vals.front()
16808 : FirstInsert->getOperand(0),
16809 Vals.back(), Mask);
16810 });
16811auto It = ShuffledInserts[I].InsertElements.rbegin();
16812// Rebuild buildvector chain.
16813InsertElementInst *II =nullptr;
16814if (It != ShuffledInserts[I].InsertElements.rend())
16815II = *It;
16816SmallVector<Instruction *> Inserts;
16817while (It != ShuffledInserts[I].InsertElements.rend()) {
16818assert(II &&"Must be an insertelement instruction.");
16819if (*It ==II)
16820 ++It;
16821else
16822 Inserts.push_back(cast<Instruction>(II));
16823II = dyn_cast<InsertElementInst>(II->getOperand(0));
16824 }
16825for (Instruction *II :reverse(Inserts)) {
16826II->replaceUsesOfWith(II->getOperand(0), NewInst);
16827if (auto *NewI = dyn_cast<Instruction>(NewInst))
16828if (II->getParent() == NewI->getParent() &&II->comesBefore(NewI))
16829II->moveAfter(NewI);
16830 NewInst =II;
16831 }
16832 LastInsert->replaceAllUsesWith(NewInst);
16833for (InsertElementInst *IE :reverse(ShuffledInserts[I].InsertElements)) {
16834 IE->replaceUsesOfWith(IE->getOperand(0),
16835PoisonValue::get(IE->getOperand(0)->getType()));
16836 IE->replaceUsesOfWith(IE->getOperand(1),
16837PoisonValue::get(IE->getOperand(1)->getType()));
16838eraseInstruction(IE);
16839 }
16840 CSEBlocks.insert(LastInsert->getParent());
16841 }
16842
16843SmallVector<Instruction *> RemovedInsts;
16844// For each vectorized value:
16845for (auto &TEPtr : VectorizableTree) {
16846 TreeEntry *Entry = TEPtr.get();
16847
16848// No need to handle users of gathered values.
16849if (Entry->isGather())
16850continue;
16851
16852assert(Entry->VectorizedValue &&"Can't find vectorizable value");
16853
16854// For each lane:
16855for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16856Value *Scalar = Entry->Scalars[Lane];
16857
16858if (Entry->getOpcode() == Instruction::GetElementPtr &&
16859 !isa<GetElementPtrInst>(Scalar))
16860continue;
16861if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16862 EE && IgnoredExtracts.contains(EE))
16863continue;
16864if (isa<PoisonValue>(Scalar))
16865continue;
16866#ifndef NDEBUG
16867Type *Ty = Scalar->getType();
16868if (!Ty->isVoidTy()) {
16869for (User *U : Scalar->users()) {
16870LLVM_DEBUG(dbgs() <<"SLP: \tvalidating user:" << *U <<".\n");
16871
16872// It is legal to delete users in the ignorelist.
16873assert((getTreeEntry(U) ||
16874 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16875 (isa_and_nonnull<Instruction>(U) &&
16876 isDeleted(cast<Instruction>(U)))) &&
16877"Deleting out-of-tree value");
16878 }
16879 }
16880#endif
16881LLVM_DEBUG(dbgs() <<"SLP: \tErasing scalar:" << *Scalar <<".\n");
16882auto *I = cast<Instruction>(Scalar);
16883 RemovedInsts.push_back(I);
16884 }
16885 }
16886
16887// Merge the DIAssignIDs from the about-to-be-deleted instructions into the
16888// new vector instruction.
16889if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16890V->mergeDIAssignID(RemovedInsts);
16891
16892// Clear up reduction references, if any.
16893if (UserIgnoreList) {
16894for (Instruction *I : RemovedInsts) {
16895const TreeEntry *IE = getTreeEntry(I);
16896if (IE->Idx != 0 &&
16897 !(VectorizableTree.front()->isGather() &&
16898 !IE->UserTreeIndices.empty() &&
16899 (ValueToGatherNodes.lookup(I).contains(
16900 VectorizableTree.front().get()) ||
16901any_of(IE->UserTreeIndices,
16902 [&](const EdgeInfo &EI) {
16903 return EI.UserTE == VectorizableTree.front().get() &&
16904 EI.EdgeIdx == UINT_MAX;
16905 }))) &&
16906 !(GatheredLoadsEntriesFirst.has_value() &&
16907IE->Idx >= *GatheredLoadsEntriesFirst &&
16908 VectorizableTree.front()->isGather() &&
16909is_contained(VectorizableTree.front()->Scalars,I)))
16910continue;
16911SmallVector<SelectInst *> LogicalOpSelects;
16912I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace the condition operand of a logical op in the form
        // select <cond>, X, false / select <cond>, true, Y.
16914 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16915 (match(U.getUser(), m_LogicalAnd()) ||
16916 match(U.getUser(), m_LogicalOr())) &&
16917 U.getOperandNo() == 0;
16918 if (IsPoisoningLogicalOp) {
16919 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16920 return false;
16921 }
16922return UserIgnoreList->contains(U.getUser());
16923 });
16924// Replace conditions of the poisoning logical ops with the non-poison
16925// constant value.
16926for (SelectInst *SI : LogicalOpSelects)
16927SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
16928 }
16929 }
16930// Retain to-be-deleted instructions for some debug-info bookkeeping and alias
16931// cache correctness.
  // NOTE: removeInstructionsAndOperands only marks the instructions for
  // deletion - instructions are not deleted until later.
16934 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
16935
16936 Builder.ClearInsertionPoint();
16937 InstrElementSize.clear();
16938
16939const TreeEntry &RootTE = *VectorizableTree.front();
16940Value *Vec = RootTE.VectorizedValue;
16941if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16942 It != MinBWs.end() &&
16943 ReductionBitWidth != It->second.first) {
16944IRBuilder<>::InsertPointGuard Guard(Builder);
16945 Builder.SetInsertPoint(ReductionRoot->getParent(),
16946 ReductionRoot->getIterator());
16947 Vec = Builder.CreateIntCast(
16948 Vec,
16949VectorType::get(Builder.getIntNTy(ReductionBitWidth),
16950 cast<VectorType>(Vec->getType())->getElementCount()),
16951 It->second.second);
16952 }
16953return Vec;
16954}
16955
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequence instructions.\n");
16959// LICM InsertElementInst sequences.
16960for (Instruction *I : GatherShuffleExtractSeq) {
16961if (isDeleted(I))
16962continue;
16963
16964// Check if this block is inside a loop.
16965Loop *L = LI->getLoopFor(I->getParent());
16966if (!L)
16967continue;
16968
16969// Check if it has a preheader.
16970BasicBlock *PreHeader = L->getLoopPreheader();
16971if (!PreHeader)
16972continue;
16973
16974// If the vector or the element that we insert into it are
16975// instructions that are defined in this basic block then we can't
16976// hoist this instruction.
16977if (any_of(I->operands(), [L](Value *V) {
16978 auto *OpI = dyn_cast<Instruction>(V);
16979 return OpI && L->contains(OpI);
16980 }))
16981continue;
16982
16983// We can hoist this instruction. Move it to the pre-header.
16984I->moveBefore(PreHeader->getTerminator()->getIterator());
16985 CSEBlocks.insert(PreHeader);
16986 }
16987
16988// Make a list of all reachable blocks in our CSE queue.
16989SmallVector<const DomTreeNode *, 8> CSEWorkList;
16990 CSEWorkList.reserve(CSEBlocks.size());
16991for (BasicBlock *BB : CSEBlocks)
16992if (DomTreeNode *N = DT->getNode(BB)) {
16993assert(DT->isReachableFromEntry(N));
16994 CSEWorkList.push_back(N);
16995 }
16996
16997// Sort blocks by domination. This ensures we visit a block after all blocks
16998// dominating it are visited.
16999llvm::sort(CSEWorkList, [](constDomTreeNode *A,constDomTreeNode *B) {
17000assert((A ==B) == (A->getDFSNumIn() ==B->getDFSNumIn()) &&
17001"Different nodes should have different DFS numbers");
17002returnA->getDFSNumIn() <B->getDFSNumIn();
17003 });
17004
17005// Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles, one is less defined than the other if it has the
  // same vector operands and its mask indices are either the same as in the
  // other one or undefs. E.g.
17008// shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
17009// poison, <0, 0, 0, 0>.
17010auto &&IsIdenticalOrLessDefined = [TTI =TTI](Instruction *I1,
17011Instruction *I2,
17012SmallVectorImpl<int> &NewMask) {
17013if (I1->getType() != I2->getType())
17014returnfalse;
17015auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
17016auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
17017if (!SI1 || !SI2)
17018return I1->isIdenticalTo(I2);
17019if (SI1->isIdenticalTo(SI2))
17020returntrue;
17021for (intI = 0, E = SI1->getNumOperands();I < E; ++I)
17022if (SI1->getOperand(I) != SI2->getOperand(I))
17023returnfalse;
17024// Check if the second instruction is more defined than the first one.
17025 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
17026ArrayRef<int> SM1 = SI1->getShuffleMask();
17027// Count trailing undefs in the mask to check the final number of used
17028// registers.
17029unsigned LastUndefsCnt = 0;
17030for (intI = 0, E = NewMask.size();I < E; ++I) {
17031if (SM1[I] ==PoisonMaskElem)
17032 ++LastUndefsCnt;
17033else
17034 LastUndefsCnt = 0;
17035if (NewMask[I] !=PoisonMaskElem && SM1[I] !=PoisonMaskElem &&
17036 NewMask[I] != SM1[I])
17037returnfalse;
17038if (NewMask[I] ==PoisonMaskElem)
17039 NewMask[I] = SM1[I];
17040 }
17041// Check if the last undefs actually change the final number of used vector
17042// registers.
17043return SM1.size() - LastUndefsCnt > 1 &&
17044::getNumberOfParts(*TTI, SI1->getType()) ==
17045::getNumberOfParts(
17046 *TTI,getWidenedType(SI1->getType()->getElementType(),
17047 SM1.size() - LastUndefsCnt));
17048 };
17049// Perform O(N^2) search over the gather/shuffle sequences and merge identical
17050// instructions. TODO: We can further optimize this scan if we split the
17051// instructions into different buckets based on the insert lane.
17052SmallVector<Instruction *, 16> Visited;
17053for (autoI = CSEWorkList.begin(), E = CSEWorkList.end();I != E; ++I) {
17054assert(*I &&
17055 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
17056"Worklist not sorted properly!");
17057BasicBlock *BB = (*I)->getBlock();
17058// For all instructions in blocks containing gather sequences:
17059for (Instruction &In :llvm::make_early_inc_range(*BB)) {
17060if (isDeleted(&In))
17061continue;
17062if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17063 !GatherShuffleExtractSeq.contains(&In))
17064continue;
17065
17066// Check if we can replace this instruction with any of the
17067// visited instructions.
17068bool Replaced =false;
17069for (Instruction *&V : Visited) {
17070SmallVector<int> NewMask;
17071if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17072 DT->dominates(V->getParent(), In.getParent())) {
17073 In.replaceAllUsesWith(V);
17074eraseInstruction(&In);
17075if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
17076if (!NewMask.empty())
17077 SI->setShuffleMask(NewMask);
17078 Replaced =true;
17079break;
17080 }
17081if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17082 GatherShuffleExtractSeq.contains(V) &&
17083 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17084 DT->dominates(In.getParent(), V->getParent())) {
17085 In.moveAfter(V);
17086 V->replaceAllUsesWith(&In);
17087eraseInstruction(V);
17088if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17089if (!NewMask.empty())
17090 SI->setShuffleMask(NewMask);
17091 V = &In;
17092 Replaced =true;
17093break;
17094 }
17095 }
17096if (!Replaced) {
17097assert(!is_contained(Visited, &In));
17098 Visited.push_back(&In);
17099 }
17100 }
17101 }
17102 CSEBlocks.clear();
17103 GatherShuffleExtractSeq.clear();
17104}
17105
17106BoUpSLP::ScheduleData *
17107BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
17108 ScheduleData *Bundle =nullptr;
17109 ScheduleData *PrevInBundle =nullptr;
17110for (Value *V : VL) {
17111if (doesNotNeedToBeScheduled(V))
17112continue;
17113 ScheduleData *BundleMember = getScheduleData(V);
17114assert(BundleMember &&
17115"no ScheduleData for bundle member "
17116"(maybe not in same basic block)");
17117assert(BundleMember->isSchedulingEntity() &&
17118"bundle member already part of other bundle");
17119if (PrevInBundle) {
17120 PrevInBundle->NextInBundle = BundleMember;
17121 }else {
17122 Bundle = BundleMember;
17123 }
17124
17125// Group the instructions to a bundle.
17126 BundleMember->FirstInBundle = Bundle;
17127 PrevInBundle = BundleMember;
17128 }
17129assert(Bundle &&"Failed to find schedule bundle");
17130return Bundle;
17131}
17132
// Groups the instructions into a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
17135std::optional<BoUpSLP::ScheduleData *>
17136BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,BoUpSLP *SLP,
17137const InstructionsState &S) {
17138// No need to schedule PHIs, insertelement, extractelement and extractvalue
17139// instructions.
17140if (isa<PHINode>(S.getMainOp()) ||
17141isVectorLikeInstWithConstOps(S.getMainOp()) ||doesNotNeedToSchedule(VL))
17142returnnullptr;
17143
17144// Initialize the instruction bundle.
17145Instruction *OldScheduleEnd = ScheduleEnd;
17146LLVM_DEBUG(dbgs() <<"SLP: bundle: " << *S.getMainOp() <<"\n");
17147
17148auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
17149 ScheduleData *Bundle) {
17150// The scheduling region got new instructions at the lower end (or it is a
17151// new region for the first bundle). This makes it necessary to
17152// recalculate all dependencies.
17153// It is seldom that this needs to be done a second time after adding the
17154// initial bundle to the region.
17155if (ScheduleEnd != OldScheduleEnd) {
17156for (auto *I = ScheduleStart;I != ScheduleEnd;I =I->getNextNode())
17157if (ScheduleData *SD = getScheduleData(I))
17158 SD->clearDependencies();
17159 ReSchedule =true;
17160 }
17161if (Bundle) {
17162LLVM_DEBUG(dbgs() <<"SLP: try schedule bundle " << *Bundle
17163 <<" in block " << BB->getName() <<"\n");
17164 calculateDependencies(Bundle,/*InsertInReadyList=*/true, SLP);
17165 }
17166
17167if (ReSchedule) {
17168 resetSchedule();
17169 initialFillReadyList(ReadyInsts);
17170 }
17171
    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that it is important
    // that we don't "schedule" the bundle yet (see cancelScheduling).
17176while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17177 !ReadyInsts.empty()) {
17178 ScheduleData *Picked = ReadyInsts.pop_back_val();
17179assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17180"must be ready to schedule");
17181 schedule(Picked, ReadyInsts);
17182 }
17183 };
17184
17185// Make sure that the scheduling region contains all
17186// instructions of the bundle.
17187for (Value *V : VL) {
17188if (doesNotNeedToBeScheduled(V))
17189continue;
17190if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or it
      // is a new region for the first bundle), it is necessary to recalculate
      // all dependencies. Otherwise the compiler may crash trying to
      // incorrectly calculate dependencies and emit instructions in the wrong
      // order at the actual scheduling.
17197 TryScheduleBundleImpl(/*ReSchedule=*/false,nullptr);
17198return std::nullopt;
17199 }
17200 }
17201
17202bool ReSchedule =false;
17203for (Value *V : VL) {
17204if (doesNotNeedToBeScheduled(V))
17205continue;
17206 ScheduleData *BundleMember = getScheduleData(V);
17207assert(BundleMember &&
17208"no ScheduleData for bundle member (maybe not in same basic block)");
17209
    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
17212 ReadyInsts.remove(BundleMember);
17213
17214if (!BundleMember->IsScheduled)
17215continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
17219LLVM_DEBUG(dbgs() <<"SLP: reset schedule because " << *BundleMember
17220 <<" was already scheduled\n");
17221 ReSchedule =true;
17222 }
17223
17224auto *Bundle = buildBundle(VL);
17225 TryScheduleBundleImpl(ReSchedule, Bundle);
17226if (!Bundle->isReady()) {
17227 cancelScheduling(VL, S.getMainOp());
17228return std::nullopt;
17229 }
17230return Bundle;
17231}
17232
17233void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
17234Value *OpValue) {
17235if (isa<PHINode>(OpValue) ||isVectorLikeInstWithConstOps(OpValue) ||
17236doesNotNeedToSchedule(VL))
17237return;
17238
17239if (doesNotNeedToBeScheduled(OpValue))
17240 OpValue = *find_if_not(VL,doesNotNeedToBeScheduled);
17241 ScheduleData *Bundle = getScheduleData(OpValue);
17242LLVM_DEBUG(dbgs() <<"SLP: cancel scheduling of " << *Bundle <<"\n");
17243assert(!Bundle->IsScheduled &&
17244"Can't cancel bundle which is already scheduled");
17245assert(Bundle->isSchedulingEntity() &&
17246 (Bundle->isPartOfBundle() ||needToScheduleSingleInstruction(VL)) &&
17247"tried to unbundle something which is not a bundle");
17248
17249// Remove the bundle from the ready list.
17250if (Bundle->isReady())
17251 ReadyInsts.remove(Bundle);
17252
17253// Un-bundle: make single instructions out of the bundle.
17254 ScheduleData *BundleMember = Bundle;
17255while (BundleMember) {
17256assert(BundleMember->FirstInBundle == Bundle &&"corrupt bundle links");
17257 BundleMember->FirstInBundle = BundleMember;
17258 ScheduleData *Next = BundleMember->NextInBundle;
17259 BundleMember->NextInBundle =nullptr;
17260 BundleMember->TE =nullptr;
17261if (BundleMember->unscheduledDepsInBundle() == 0) {
17262 ReadyInsts.insert(BundleMember);
17263 }
17264 BundleMember = Next;
17265 }
17266}
17267
17268BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
17269// Allocate a new ScheduleData for the instruction.
17270if (ChunkPos >= ChunkSize) {
17271 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
17272 ChunkPos = 0;
17273 }
17274return &(ScheduleDataChunks.back()[ChunkPos++]);
17275}
17276
17277bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17278Value *V,const InstructionsState &S) {
17279Instruction *I = dyn_cast<Instruction>(V);
17280assert(I &&"bundle member must be an instruction");
17281assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
17282 !doesNotNeedToBeScheduled(I) &&
17283"phi nodes/insertelements/extractelements/extractvalues don't need to "
17284"be scheduled");
17285if (getScheduleData(I))
17286returntrue;
17287if (!ScheduleStart) {
17288// It's the first instruction in the new region.
17289 initScheduleData(I,I->getNextNode(),nullptr,nullptr);
17290 ScheduleStart =I;
17291 ScheduleEnd =I->getNextNode();
17292assert(ScheduleEnd &&"tried to vectorize a terminator?");
17293LLVM_DEBUG(dbgs() <<"SLP: initialize schedule region to " << *I <<"\n");
17294returntrue;
17295 }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that it is not
  // counted against the budget. Otherwise debug info could affect codegen.
17300BasicBlock::reverse_iterator UpIter =
17301 ++ScheduleStart->getIterator().getReverse();
17302BasicBlock::reverse_iterator UpperEnd = BB->rend();
17303BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
17304BasicBlock::iterator LowerEnd = BB->end();
17305auto IsAssumeLikeIntr = [](constInstruction &I) {
17306if (auto *II = dyn_cast<IntrinsicInst>(&I))
17307returnII->isAssumeLikeIntrinsic();
17308returnfalse;
17309 };
17310 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17311 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17312while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=I &&
17313 &*DownIter !=I) {
17314if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17315LLVM_DEBUG(dbgs() <<"SLP: exceeded schedule region size limit\n");
17316returnfalse;
17317 }
17318
17319 ++UpIter;
17320 ++DownIter;
17321
17322 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17323 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17324 }
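  // Illustrative example: if the new instruction I sits three instructions
  // above the current ScheduleStart, the upward scan reaches I after three
  // steps while the downward scan has not hit it yet, so the region start is
  // extended up to I below; the symmetric case extends ScheduleEnd downwards.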
17325if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==I)) {
17326assert(I->getParent() == ScheduleStart->getParent() &&
17327"Instruction is in wrong basic block.");
17328 initScheduleData(I, ScheduleStart,nullptr, FirstLoadStoreInRegion);
17329 ScheduleStart =I;
17330LLVM_DEBUG(dbgs() <<"SLP: extend schedule region start to " << *I
17331 <<"\n");
17332returntrue;
17333 }
17334assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==I)) &&
17335"Expected to reach top of the basic block or instruction down the "
17336"lower end.");
17337assert(I->getParent() == ScheduleEnd->getParent() &&
17338"Instruction is in wrong basic block.");
17339 initScheduleData(ScheduleEnd,I->getNextNode(), LastLoadStoreInRegion,
17340nullptr);
17341 ScheduleEnd =I->getNextNode();
17342assert(ScheduleEnd &&"tried to vectorize a terminator?");
17343LLVM_DEBUG(dbgs() <<"SLP: extend schedule region end to " << *I <<"\n");
17344returntrue;
17345}
17346
17347void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
17348Instruction *ToI,
17349 ScheduleData *PrevLoadStore,
17350 ScheduleData *NextLoadStore) {
17351 ScheduleData *CurrentLoadStore = PrevLoadStore;
17352for (Instruction *I = FromI;I != ToI;I =I->getNextNode()) {
17353// No need to allocate data for non-schedulable instructions.
17354if (doesNotNeedToBeScheduled(I))
17355continue;
17356 ScheduleData *SD = ScheduleDataMap.lookup(I);
17357if (!SD) {
17358 SD = allocateScheduleDataChunks();
17359 ScheduleDataMap[I] = SD;
17360 }
17361assert(!isInSchedulingRegion(SD) &&
17362"new ScheduleData already in scheduling region");
17363 SD->init(SchedulingRegionID,I);
17364
17365if (I->mayReadOrWriteMemory() &&
17366 (!isa<IntrinsicInst>(I) ||
17367 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
17368 cast<IntrinsicInst>(I)->getIntrinsicID() !=
17369 Intrinsic::pseudoprobe))) {
17370// Update the linked list of memory accessing instructions.
17371if (CurrentLoadStore) {
17372 CurrentLoadStore->NextLoadStore = SD;
17373 }else {
17374 FirstLoadStoreInRegion = SD;
17375 }
17376 CurrentLoadStore = SD;
17377 }
17378
17379if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17380match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17381 RegionHasStackSave =true;
17382 }
17383if (NextLoadStore) {
17384if (CurrentLoadStore)
17385 CurrentLoadStore->NextLoadStore = NextLoadStore;
17386 }else {
17387 LastLoadStoreInRegion = CurrentLoadStore;
17388 }
17389}
17390
17391void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17392bool InsertInReadyList,
17393BoUpSLP *SLP) {
17394assert(SD->isSchedulingEntity());
17395
17396SmallVector<ScheduleData *, 10> WorkList;
17397 WorkList.push_back(SD);
17398
17399while (!WorkList.empty()) {
17400 ScheduleData *SD = WorkList.pop_back_val();
17401for (ScheduleData *BundleMember = SD; BundleMember;
17402 BundleMember = BundleMember->NextInBundle) {
17403assert(isInSchedulingRegion(BundleMember));
17404if (BundleMember->hasValidDependencies())
17405continue;
17406
17407LLVM_DEBUG(dbgs() <<"SLP: update deps of " << *BundleMember
17408 <<"\n");
17409 BundleMember->Dependencies = 0;
17410 BundleMember->resetUnscheduledDeps();
17411
17412// Handle def-use chain dependencies.
17413for (User *U : BundleMember->Inst->users()) {
17414if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17415 BundleMember->Dependencies++;
17416 ScheduleData *DestBundle = UseSD->FirstInBundle;
17417if (!DestBundle->IsScheduled)
17418 BundleMember->incrementUnscheduledDeps(1);
17419if (!DestBundle->hasValidDependencies())
17420 WorkList.push_back(DestBundle);
17421 }
17422 }
17423
17424auto MakeControlDependent = [&](Instruction *I) {
17425auto *DepDest = getScheduleData(I);
17426assert(DepDest &&"must be in schedule window");
17427 DepDest->ControlDependencies.push_back(BundleMember);
17428 BundleMember->Dependencies++;
17429 ScheduleData *DestBundle = DepDest->FirstInBundle;
17430if (!DestBundle->IsScheduled)
17431 BundleMember->incrementUnscheduledDeps(1);
17432if (!DestBundle->hasValidDependencies())
17433 WorkList.push_back(DestBundle);
17434 };
17435
17436// Any instruction which isn't safe to speculate at the beginning of the
17437// block is control dependent on any early exit or non-willreturn call
17438// which precedes it.
17439if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
17440for (Instruction *I = BundleMember->Inst->getNextNode();
17441I != ScheduleEnd;I =I->getNextNode()) {
17442if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
17443continue;
17444
17445// Add the dependency
17446 MakeControlDependent(I);
17447
17448if (!isGuaranteedToTransferExecutionToSuccessor(I))
17449// Everything past here must be control dependent on I.
17450break;
17451 }
17452 }
17453
17454if (RegionHasStackSave) {
17455// If we have an inalloca alloca instruction, it needs to be scheduled
17456// after any preceding stacksave. We also need to prevent any alloca
17457// from reordering above a preceding stackrestore.
17458if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17459match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17460for (Instruction *I = BundleMember->Inst->getNextNode();
17461I != ScheduleEnd;I =I->getNextNode()) {
17462if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17463match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17464// Any allocas past here must be control dependent on I, and I
17465// must be memory dependent on BundleMember->Inst.
17466break;
17467
17468if (!isa<AllocaInst>(I))
17469continue;
17470
17471// Add the dependency
17472 MakeControlDependent(I);
17473 }
17474 }
17475
17476// In addition to the cases handled just above, we need to prevent
17477// allocas and loads/stores from moving below a stacksave or a
17478// stackrestore. Preventing allocas from moving below a stackrestore is
17479// currently believed to be conservative. Moving loads/stores below a
17480// stackrestore can lead to incorrect code.
17481if (isa<AllocaInst>(BundleMember->Inst) ||
17482 BundleMember->Inst->mayReadOrWriteMemory()) {
17483for (Instruction *I = BundleMember->Inst->getNextNode();
17484I != ScheduleEnd;I =I->getNextNode()) {
17485if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
17486 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17487continue;
17488
17489// Add the dependency
17490 MakeControlDependent(I);
17491break;
17492 }
17493 }
17494 }
17495
17496// Handle the memory dependencies (if any).
17497 ScheduleData *DepDest = BundleMember->NextLoadStore;
17498if (!DepDest)
17499continue;
17500Instruction *SrcInst = BundleMember->Inst;
17501assert(SrcInst->mayReadOrWriteMemory() &&
17502"NextLoadStore list for non memory effecting bundle?");
17503MemoryLocation SrcLoc =getLocation(SrcInst);
17504bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17505unsigned NumAliased = 0;
17506unsigned DistToSrc = 1;
17507
17508for (; DepDest; DepDest = DepDest->NextLoadStore) {
17509assert(isInSchedulingRegion(DepDest));
17510
17511// We have two limits to reduce the complexity:
17512// 1) AliasedCheckLimit: It's a small limit to reduce calls to
17513// SLP->isAliased (which is the expensive part in this loop).
17514// 2) MaxMemDepDistance: It's for very large blocks and it aborts
17515// the whole loop (even if the loop is fast, it's quadratic).
17516// It's important for the loop break condition (see below) to
17517// check this limit even between two read-only instructions.
17518if (DistToSrc >=MaxMemDepDistance ||
17519 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17520 (NumAliased >=AliasedCheckLimit ||
17521 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17522
17523// We increment the counter only if the locations are aliased
17524// (instead of counting all alias checks). This gives a better
17525// balance between reduced runtime and accurate dependencies.
17526 NumAliased++;
17527
17528 DepDest->MemoryDependencies.push_back(BundleMember);
17529 BundleMember->Dependencies++;
17530 ScheduleData *DestBundle = DepDest->FirstInBundle;
17531if (!DestBundle->IsScheduled) {
17532 BundleMember->incrementUnscheduledDeps(1);
17533 }
17534if (!DestBundle->hasValidDependencies()) {
17535 WorkList.push_back(DestBundle);
17536 }
17537 }
17538
17539// Example, explaining the loop break condition: Let's assume our
17540// starting instruction is i0 and MaxMemDepDistance = 3.
17541//
17542// +--------v--v--v
17543// i0,i1,i2,i3,i4,i5,i6,i7,i8
17544// +--------^--^--^
17545//
17546// MaxMemDepDistance lets us stop alias-checking at i3 and we add
17547// dependencies from i0 to i3,i4,.. (even if they are not aliased).
17548// Previously we already added dependencies from i3 to i6,i7,i8
17549// (because of MaxMemDepDistance). As we added a dependency from
17550// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
17551// and we can abort this loop at i6.
17552if (DistToSrc >= 2 *MaxMemDepDistance)
17553break;
17554 DistToSrc++;
17555 }
17556 }
17557if (InsertInReadyList && SD->isReady()) {
17558 ReadyInsts.insert(SD);
17559LLVM_DEBUG(dbgs() <<"SLP: gets ready on update: " << *SD->Inst
17560 <<"\n");
17561 }
17562 }
17563}
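
// Illustrative sketch (not part of SLPVectorizer.cpp): the memory-dependency
// walk above is bounded by AliasedCheckLimit and MaxMemDepDistance. This
// simplified standalone model (hypothetical MayAlias oracle, write checks
// omitted) shows how the counters interact: a dependency edge is recorded
// either when the locations may alias or once a budget is exhausted, so later
// accesses are ordered transitively, as in the i0..i8 example above.
#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

static std::vector<std::pair<size_t, size_t>> // (source, dependent) index pairs
collectMemDeps(size_t NumAccesses, size_t Src,
               const std::function<bool(size_t, size_t)> &MayAlias,
               unsigned MaxMemDepDistance = 160,
               unsigned AliasedCheckLimit = 10) {
  std::vector<std::pair<size_t, size_t>> Deps;
  unsigned NumAliased = 0, DistToSrc = 1;
  for (size_t I = Src + 1; I < NumAccesses; ++I) {
    if (DistToSrc >= MaxMemDepDistance || NumAliased >= AliasedCheckLimit ||
        MayAlias(Src, I)) {
      ++NumAliased;
      Deps.emplace_back(Src, I); // conservative dependency edge
    }
    if (DistToSrc >= 2 * MaxMemDepDistance)
      break; // transitive dependencies already cover everything further down
    ++DistToSrc;
  }
  return Deps;
}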
17564
17565void BoUpSLP::BlockScheduling::resetSchedule() {
17566assert(ScheduleStart &&
17567"tried to reset schedule on block which has not been scheduled");
17568for (Instruction *I = ScheduleStart;I != ScheduleEnd;I =I->getNextNode()) {
17569if (ScheduleData *SD = getScheduleData(I)) {
17570assert(isInSchedulingRegion(SD) &&
17571"ScheduleData not in scheduling region");
17572 SD->IsScheduled =false;
17573 SD->resetUnscheduledDeps();
17574 }
17575 }
17576 ReadyInsts.clear();
17577}
17578
17579void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17580if (!BS->ScheduleStart)
17581return;
17582
17583LLVM_DEBUG(dbgs() <<"SLP: schedule block " << BS->BB->getName() <<"\n");
17584
17585// A key point - if we got here, pre-scheduling was able to find a valid
17586// scheduling of the sub-graph of the scheduling window which consists
17587// of all vector bundles and their transitive users. As such, we do not
17588// need to reschedule anything *outside of* that subgraph.
17589
17590 BS->resetSchedule();
17591
17592// For the real scheduling we use a more sophisticated ready-list: it is
17593// sorted by the original instruction location. This lets the final schedule
17594// be as close as possible to the original instruction order.
17595// WARNING: If changing this order causes a correctness issue, that means
17596// there is some missing dependence edge in the schedule data graph.
17597structScheduleDataCompare {
17598bool operator()(ScheduleData *SD1, ScheduleData *SD2) const{
17599return SD2->SchedulingPriority < SD1->SchedulingPriority;
17600 }
17601 };
17602 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
17603
17604// Ensure that all dependency data is updated (for nodes in the sub-graph)
17605// and fill the ready-list with initial instructions.
17606intIdx = 0;
17607for (auto *I = BS->ScheduleStart;I != BS->ScheduleEnd;
17608I =I->getNextNode()) {
17609if (ScheduleData *SD = BS->getScheduleData(I)) {
17610 [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
17611assert((isVectorLikeInstWithConstOps(SD->Inst) ||
17612 SD->isPartOfBundle() ==
17613 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
17614"scheduler and vectorizer bundle mismatch");
17615 SD->FirstInBundle->SchedulingPriority =Idx++;
17616
17617if (SD->isSchedulingEntity() && SD->isPartOfBundle())
17618 BS->calculateDependencies(SD,false,this);
17619 }
17620 }
17621 BS->initialFillReadyList(ReadyInsts);
17622
17623Instruction *LastScheduledInst = BS->ScheduleEnd;
17624
17625// Do the "real" scheduling.
17626while (!ReadyInsts.empty()) {
17627 ScheduleData *Picked = *ReadyInsts.begin();
17628 ReadyInsts.erase(ReadyInsts.begin());
17629
17630// Move the scheduled instruction(s) to their dedicated places, if not
17631// there yet.
17632for (ScheduleData *BundleMember = Picked; BundleMember;
17633 BundleMember = BundleMember->NextInBundle) {
17634Instruction *PickedInst = BundleMember->Inst;
17635if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
17636 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
17637 LastScheduledInst = PickedInst;
17638 }
17639
17640 BS->schedule(Picked, ReadyInsts);
17641 }
17642
17643// Check that we didn't break any of our invariants.
17644#ifdef EXPENSIVE_CHECKS
17645 BS->verify();
17646#endif
17647
17648#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
17649// Check that all schedulable entities got scheduled
17650for (auto *I = BS->ScheduleStart;I != BS->ScheduleEnd;I =I->getNextNode()) {
17651 ScheduleData *SD = BS->getScheduleData(I);
17652if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
17653assert(SD->IsScheduled &&"must be scheduled at this point");
17654 }
17655#endif
17656
17657// Avoid duplicate scheduling of the block.
17658 BS->ScheduleStart =nullptr;
17659}
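
// Illustrative sketch (not part of SLPVectorizer.cpp): the scheduling loop
// above is a bottom-up list scheduler. Dependencies point from a definition to
// its users, so a node becomes ready once everything that uses it has been
// placed, and *ReadyInsts.begin() is the ready node that appeared latest in
// the original order. A standalone version over a hypothetical Node type
// (priorities assumed unique) captures the mechanics:
#include <deque>
#include <set>
#include <vector>

struct Node {
  unsigned Priority = 0;        // original program-order position
  unsigned UnscheduledDeps = 0; // users not yet placed
  std::vector<Node *> Operands; // nodes whose values this node uses
};

static std::vector<Node *> scheduleBottomUp(std::vector<Node> &Nodes) {
  // A node is ready once every node that uses it has been placed.
  for (Node &N : Nodes)
    for (Node *Op : N.Operands)
      ++Op->UnscheduledDeps;

  struct ByPriority {
    bool operator()(const Node *A, const Node *B) const {
      return A->Priority > B->Priority; // latest original position first
    }
  };
  std::set<Node *, ByPriority> Ready;
  for (Node &N : Nodes)
    if (N.UnscheduledDeps == 0)
      Ready.insert(&N);

  std::deque<Node *> Order; // built from the bottom of the region upwards
  while (!Ready.empty()) {
    Node *Picked = *Ready.begin();
    Ready.erase(Ready.begin());
    Order.push_front(Picked);
    for (Node *Op : Picked->Operands)
      if (--Op->UnscheduledDeps == 0)
        Ready.insert(Op);
  }
  return std::vector<Node *>(Order.begin(), Order.end());
}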
17660
17661unsignedBoUpSLP::getVectorElementSize(Value *V) {
17662// If V is a store, just return the width of the stored value (or value
17663// truncated just before storing) without traversing the expression tree.
17664// This is the common case.
17665if (auto *Store = dyn_cast<StoreInst>(V))
17666returnDL->getTypeSizeInBits(Store->getValueOperand()->getType());
17667
17668if (auto *IEI = dyn_cast<InsertElementInst>(V))
17669returngetVectorElementSize(IEI->getOperand(1));
17670
17671auto E = InstrElementSize.find(V);
17672if (E != InstrElementSize.end())
17673return E->second;
17674
17675// If V is not a store, we can traverse the expression tree to find loads
17676// that feed it. The type of the loaded value may indicate a more suitable
17677// width than V's type. We want to base the vector element size on the width
17678// of memory operations where possible.
17679SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
17680SmallPtrSet<Instruction *, 16> Visited;
17681if (auto *I = dyn_cast<Instruction>(V)) {
17682 Worklist.emplace_back(I,I->getParent(), 0);
17683 Visited.insert(I);
17684 }
17685
17686// Traverse the expression tree in bottom-up order looking for loads. If we
17687// encounter an instruction we don't yet handle, we give up.
17688auto Width = 0u;
17689Value *FirstNonBool =nullptr;
17690while (!Worklist.empty()) {
17691auto [I, Parent, Level] = Worklist.pop_back_val();
17692
17693// We should only be looking at scalar instructions here. If the current
17694// instruction has a vector type, skip.
17695auto *Ty =I->getType();
17696if (isa<VectorType>(Ty))
17697continue;
17698if (Ty != Builder.getInt1Ty() && !FirstNonBool)
17699 FirstNonBool =I;
17700if (Level >RecursionMaxDepth)
17701continue;
17702
17703// If the current instruction is a load (or an extractelement/extractvalue),
17704// update Width to reflect the width of the loaded/extracted value.
17705if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
17706 Width = std::max<unsigned>(Width,DL->getTypeSizeInBits(Ty));
17707
17708// Otherwise, we need to visit the operands of the instruction. We only
17709// handle the interesting cases from buildTree here. If an operand is an
17710// instruction we haven't yet visited and from the same basic block as the
17711// user or the use is a PHI node, we add it to the worklist.
17712elseif (isa<PHINode,CastInst,GetElementPtrInst,CmpInst,SelectInst,
17713BinaryOperator,UnaryOperator>(I)) {
17714for (Use &U :I->operands()) {
17715if (auto *J = dyn_cast<Instruction>(U.get()))
17716if (Visited.insert(J).second &&
17717 (isa<PHINode>(I) || J->getParent() == Parent)) {
17718 Worklist.emplace_back(J, J->getParent(), Level + 1);
17719continue;
17720 }
17721if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
17722 FirstNonBool = U.get();
17723 }
17724 }else {
17725break;
17726 }
17727 }
17728
17729// If we didn't encounter a memory access in the expression tree, or if we
17730// gave up for some reason, just return the width of V. Otherwise, return the
17731// maximum width we found.
17732if (!Width) {
17733if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
17734 V = FirstNonBool;
17735 Width =DL->getTypeSizeInBits(V->getType());
17736 }
17737
17738for (Instruction *I : Visited)
17739 InstrElementSize[I] = Width;
17740
17741return Width;
17742}
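
// Illustrative sketch (not part of SLPVectorizer.cpp): the walk above prefers
// the width of the memory operations feeding a value over the width of the
// value itself. A toy expression node (hypothetical Expr type) makes the
// bottom-up worklist traversal explicit:
#include <algorithm>
#include <unordered_set>
#include <vector>

struct Expr {
  bool IsLoad = false;
  unsigned WidthInBits = 0;     // type width of this node
  std::vector<Expr *> Operands; // operands feeding this node
};

// Returns the widest load feeding Root, or Root's own width if none is found.
static unsigned vectorElementWidth(Expr *Root) {
  unsigned Width = 0;
  std::vector<Expr *> Worklist{Root};
  std::unordered_set<Expr *> Visited{Root};
  while (!Worklist.empty()) {
    Expr *E = Worklist.back();
    Worklist.pop_back();
    if (E->IsLoad)
      Width = std::max(Width, E->WidthInBits);
    else
      for (Expr *Op : E->Operands)
        if (Visited.insert(Op).second)
          Worklist.push_back(Op);
  }
  return Width ? Width : Root->WidthInBits;
}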
17743
17744bool BoUpSLP::collectValuesToDemote(
17745const TreeEntry &E,bool IsProfitableToDemoteRoot,unsigned &BitWidth,
17746SmallVectorImpl<unsigned> &ToDemote,DenseSet<const TreeEntry *> &Visited,
17747constSmallDenseSet<unsigned, 8> &NodesToKeepBWs,unsigned &MaxDepthLevel,
17748bool &IsProfitableToDemote,bool IsTruncRoot) const{
17749// We can always demote constants.
17750if (all_of(E.Scalars, IsaPred<Constant>))
17751returntrue;
17752
17753unsigned OrigBitWidth =
17754DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17755if (OrigBitWidth ==BitWidth) {
17756 MaxDepthLevel = 1;
17757returntrue;
17758 }
17759
17760// Check if the node was analyzed already and must keep its original bitwidth.
17761if (NodesToKeepBWs.contains(E.Idx))
17762returnfalse;
17763
17764// If the value is not a vectorized instruction in the expression and not used
17765// by the insertelement instruction and not used in multiple vector nodes, it
17766// cannot be demoted.
17767bool IsSignedNode =any_of(E.Scalars, [&](Value *R) {
17768 if (isa<PoisonValue>(R))
17769 return false;
17770 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17771 });
17772auto IsPotentiallyTruncated = [&](Value *V,unsigned &BitWidth) ->bool {
17773if (isa<PoisonValue>(V))
17774returntrue;
17775if (MultiNodeScalars.contains(V))
17776returnfalse;
17777// For a late shuffle of sext/zext with many uses, we need to check the extra
17778// bit for unsigned values; otherwise we may end up with incorrect casts for
17779// reused scalars.
17780bool IsSignedVal = !isKnownNonNegative(V,SimplifyQuery(*DL));
17781if ((!IsSignedNode || IsSignedVal) && OrigBitWidth >BitWidth) {
17782APInt Mask =APInt::getBitsSetFrom(OrigBitWidth,BitWidth);
17783if (MaskedValueIsZero(V, Mask,SimplifyQuery(*DL)))
17784returntrue;
17785 }
17786unsigned NumSignBits =ComputeNumSignBits(V, *DL, 0, AC,nullptr, DT);
17787unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17788if (IsSignedNode)
17789 ++BitWidth1;
17790if (auto *I = dyn_cast<Instruction>(V)) {
17791APInt Mask = DB->getDemandedBits(I);
17792unsigned BitWidth2 =
17793 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17794while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17795APInt Mask =APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
17796if (MaskedValueIsZero(V, Mask,SimplifyQuery(*DL)))
17797break;
17798 BitWidth2 *= 2;
17799 }
17800 BitWidth1 = std::min(BitWidth1, BitWidth2);
17801 }
17802BitWidth = std::max(BitWidth, BitWidth1);
17803returnBitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
17804 };
17805auto FinalAnalysis = [&,TTI =TTI]() {
17806if (!IsProfitableToDemote)
17807returnfalse;
17808bool Res =all_of(
17809 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
17810// Demote gathers.
17811if (Res && E.isGather()) {
17812// Check possible extractelement instructions bases and final vector
17813// length.
17814SmallPtrSet<Value *, 4> UniqueBases;
17815for (Value *V : E.Scalars) {
17816auto *EE = dyn_cast<ExtractElementInst>(V);
17817if (!EE)
17818continue;
17819 UniqueBases.insert(EE->getVectorOperand());
17820 }
17821constunsigned VF = E.Scalars.size();
17822Type *OrigScalarTy = E.Scalars.front()->getType();
17823if (UniqueBases.size() <= 2 ||
17824::getNumberOfParts(*TTI,getWidenedType(OrigScalarTy, VF)) ==
17825::getNumberOfParts(
17826 *TTI,
17827getWidenedType(
17828IntegerType::get(OrigScalarTy->getContext(),BitWidth),
17829 VF)))
17830 ToDemote.push_back(E.Idx);
17831 }
17832return Res;
17833 };
17834if (E.isGather() || !Visited.insert(&E).second ||
17835any_of(E.Scalars, [&](Value *V) {
17836 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
17837 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17838 });
17839 }))
17840return FinalAnalysis();
17841
17842if (any_of(E.Scalars, [&](Value *V) {
17843 return !all_of(V->users(), [=](User *U) {
17844 return getTreeEntry(U) ||
17845 (E.Idx == 0 && UserIgnoreList &&
17846 UserIgnoreList->contains(U)) ||
17847 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17848 !U->getType()->isScalableTy() &&
17849 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17850 }) && !IsPotentiallyTruncated(V,BitWidth);
17851 }))
17852returnfalse;
17853
17854auto ProcessOperands = [&](ArrayRef<const TreeEntry *>Operands,
17855bool &NeedToExit) {
17856 NeedToExit =false;
17857unsigned InitLevel = MaxDepthLevel;
17858for (const TreeEntry *Op :Operands) {
17859unsigned Level = InitLevel;
17860if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot,BitWidth,
17861 ToDemote, Visited, NodesToKeepBWs, Level,
17862 IsProfitableToDemote, IsTruncRoot)) {
17863if (!IsProfitableToDemote)
17864returnfalse;
17865 NeedToExit =true;
17866if (!FinalAnalysis())
17867returnfalse;
17868continue;
17869 }
17870 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17871 }
17872returntrue;
17873 };
17874auto AttemptCheckBitwidth =
17875 [&](function_ref<bool(unsigned,unsigned)> Checker,bool &NeedToExit) {
17876// Try all bitwidths < OrigBitWidth.
17877 NeedToExit =false;
17878unsigned BestFailBitwidth = 0;
17879for (;BitWidth < OrigBitWidth;BitWidth *= 2) {
17880if (Checker(BitWidth, OrigBitWidth))
17881returntrue;
17882if (BestFailBitwidth == 0 && FinalAnalysis())
17883 BestFailBitwidth =BitWidth;
17884 }
17885if (BitWidth >= OrigBitWidth) {
17886if (BestFailBitwidth == 0) {
17887BitWidth = OrigBitWidth;
17888returnfalse;
17889 }
17890 MaxDepthLevel = 1;
17891BitWidth = BestFailBitwidth;
17892 NeedToExit =true;
17893returntrue;
17894 }
17895returnfalse;
17896 };
17897auto TryProcessInstruction =
17898 [&](unsigned &BitWidth,ArrayRef<const TreeEntry *>Operands = {},
17899function_ref<bool(unsigned,unsigned)> Checker = {}) {
17900if (Operands.empty()) {
17901if (!IsTruncRoot)
17902 MaxDepthLevel = 1;
17903 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17904 std::ref(BitWidth)));
17905 }else {
17906// If there are several vectorized uses, check whether we can truncate;
17907// otherwise exit.
17908if (E.UserTreeIndices.size() > 1 &&
17909 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17910 std::ref(BitWidth))))
17911returnfalse;
17912bool NeedToExit =false;
17913if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17914returnfalse;
17915if (NeedToExit)
17916returntrue;
17917if (!ProcessOperands(Operands, NeedToExit))
17918returnfalse;
17919if (NeedToExit)
17920returntrue;
17921 }
17922
17923 ++MaxDepthLevel;
17924// Record the entry that we can demote.
17925 ToDemote.push_back(E.Idx);
17926return IsProfitableToDemote;
17927 };
17928switch (E.getOpcode()) {
17929
17930// We can always demote truncations and extensions. Since truncations can
17931// seed additional demotion, we save the truncated value.
17932case Instruction::Trunc:
17933if (IsProfitableToDemoteRoot)
17934 IsProfitableToDemote =true;
17935return TryProcessInstruction(BitWidth);
17936case Instruction::ZExt:
17937case Instruction::SExt:
17938 IsProfitableToDemote =true;
17939return TryProcessInstruction(BitWidth);
17940
17941// We can demote certain binary operations if we can demote both of their
17942// operands.
17943case Instruction::Add:
17944case Instruction::Sub:
17945case Instruction::Mul:
17946case Instruction::And:
17947case Instruction::Or:
17948case Instruction::Xor: {
17949return TryProcessInstruction(
17950BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17951 }
17952case Instruction::Freeze:
17953return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
17954case Instruction::Shl: {
17955// If we are truncating the result of this SHL, and if it's a shift of an
17956// in-range amount, we can always perform a SHL in a smaller type.
17957auto ShlChecker = [&](unsignedBitWidth,unsigned) {
17958returnall_of(E.Scalars, [&](Value *V) {
17959 if (isa<PoisonValue>(V))
17960 return true;
17961 auto *I = cast<Instruction>(V);
17962 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17963 return AmtKnownBits.getMaxValue().ult(BitWidth);
17964 });
17965 };
17966return TryProcessInstruction(
17967BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
17968 }
17969case Instruction::LShr: {
17970// If this is a truncate of a logical shr, we can truncate it to a smaller
17971// lshr iff we know that the bits we would otherwise be shifting in are
17972// already zeros.
17973auto LShrChecker = [&](unsignedBitWidth,unsigned OrigBitWidth) {
17974returnall_of(E.Scalars, [&](Value *V) {
17975 if (isa<PoisonValue>(V))
17976 return true;
17977 auto *I = cast<Instruction>(V);
17978 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17979 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17980 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17981 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
17982 SimplifyQuery(*DL));
17983 });
17984 };
17985return TryProcessInstruction(
17986BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17987 LShrChecker);
17988 }
17989case Instruction::AShr: {
17990// If this is a truncate of an arithmetic shr, we can truncate it to a
17991// smaller ashr iff we know that all the bits between the sign bit of the
17992// original type and the sign bit of the truncated type are the same.
17993auto AShrChecker = [&](unsignedBitWidth,unsigned OrigBitWidth) {
17994returnall_of(E.Scalars, [&](Value *V) {
17995 if (isa<PoisonValue>(V))
17996 return true;
17997 auto *I = cast<Instruction>(V);
17998 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17999 unsigned ShiftedBits = OrigBitWidth - BitWidth;
18000 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
18001 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18002 nullptr, DT);
18003 });
18004 };
18005return TryProcessInstruction(
18006BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
18007 AShrChecker);
18008 }
18009case Instruction::UDiv:
18010case Instruction::URem: {
18011// UDiv and URem can be truncated if all the truncated bits are zero.
18012auto Checker = [&](unsignedBitWidth,unsigned OrigBitWidth) {
18013assert(BitWidth <= OrigBitWidth &&"Unexpected bitwidths!");
18014returnall_of(E.Scalars, [&](Value *V) {
18015 auto *I = cast<Instruction>(V);
18016 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18017 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
18018 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18019 });
18020 };
18021return TryProcessInstruction(
18022BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
18023 }
18024
18025// We can demote selects if we can demote their true and false values.
18026case Instruction::Select: {
18027return TryProcessInstruction(
18028BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18029 }
18030
18031// We can demote phis if we can demote all their incoming operands. Note that
18032// we don't need to worry about cycles since we ensure single use above.
18033case Instruction::PHI: {
18034constunsigned NumOps = E.getNumOperands();
18035SmallVector<const TreeEntry *> Ops(NumOps);
18036transform(seq<unsigned>(0, NumOps), Ops.begin(),
18037 std::bind(&BoUpSLP::getOperandEntry,this, &E, _1));
18038
18039return TryProcessInstruction(BitWidth, Ops);
18040 }
18041
18042case Instruction::Call: {
18043auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
18044if (!IC)
18045break;
18046Intrinsic::IDID =getVectorIntrinsicIDForCall(IC, TLI);
18047if (ID != Intrinsic::abs &&ID != Intrinsic::smin &&
18048ID != Intrinsic::smax &&ID != Intrinsic::umin &&ID != Intrinsic::umax)
18049break;
18050SmallVector<const TreeEntry *, 2>Operands(1, getOperandEntry(&E, 0));
18051function_ref<bool(unsigned,unsigned)> CallChecker;
18052auto CompChecker = [&](unsignedBitWidth,unsigned OrigBitWidth) {
18053assert(BitWidth <= OrigBitWidth &&"Unexpected bitwidths!");
18054returnall_of(E.Scalars, [&](Value *V) {
18055 auto *I = cast<Instruction>(V);
18056 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
18057 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18058 return MaskedValueIsZero(I->getOperand(0), Mask,
18059 SimplifyQuery(*DL)) &&
18060 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18061 }
18062assert((ID == Intrinsic::smin ||ID == Intrinsic::smax) &&
18063"Expected min/max intrinsics only.");
18064unsigned SignBits = OrigBitWidth -BitWidth;
18065APIntMask =APInt::getBitsSetFrom(OrigBitWidth,BitWidth - 1);
18066unsigned Op0SignBits =ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18067nullptr, DT);
18068unsigned Op1SignBits =ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
18069nullptr, DT);
18070return SignBits <= Op0SignBits &&
18071 ((SignBits != Op0SignBits &&
18072 !isKnownNonNegative(I->getOperand(0),SimplifyQuery(*DL))) ||
18073MaskedValueIsZero(I->getOperand(0),Mask,
18074SimplifyQuery(*DL))) &&
18075 SignBits <= Op1SignBits &&
18076 ((SignBits != Op1SignBits &&
18077 !isKnownNonNegative(I->getOperand(1),SimplifyQuery(*DL))) ||
18078MaskedValueIsZero(I->getOperand(1),Mask,SimplifyQuery(*DL)));
18079 });
18080 };
18081auto AbsChecker = [&](unsignedBitWidth,unsigned OrigBitWidth) {
18082assert(BitWidth <= OrigBitWidth &&"Unexpected bitwidths!");
18083returnall_of(E.Scalars, [&](Value *V) {
18084 auto *I = cast<Instruction>(V);
18085 unsigned SignBits = OrigBitWidth - BitWidth;
18086 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18087 unsigned Op0SignBits =
18088 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
18089 return SignBits <= Op0SignBits &&
18090 ((SignBits != Op0SignBits &&
18091 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18092 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
18093 });
18094 };
18095if (ID != Intrinsic::abs) {
18096Operands.push_back(getOperandEntry(&E, 1));
18097 CallChecker = CompChecker;
18098 }else {
18099 CallChecker = AbsChecker;
18100 }
18101InstructionCost BestCost =
18102 std::numeric_limits<InstructionCost::CostType>::max();
18103unsigned BestBitWidth =BitWidth;
18104unsigned VF = E.Scalars.size();
18105// Choose the best bitwidth based on cost estimations.
18106auto Checker = [&](unsignedBitWidth,unsigned) {
18107unsigned MinBW =PowerOf2Ceil(BitWidth);
18108SmallVector<Type *> ArgTys =
18109buildIntrinsicArgTypes(IC,ID, VF, MinBW,TTI);
18110auto VecCallCosts =getVectorCallCosts(
18111 IC,getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
18112TTI, TLI, ArgTys);
18113InstructionCostCost = std::min(VecCallCosts.first, VecCallCosts.second);
18114if (Cost < BestCost) {
18115 BestCost =Cost;
18116 BestBitWidth =BitWidth;
18117 }
18118returnfalse;
18119 };
18120 [[maybe_unused]]bool NeedToExit;
18121 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18122BitWidth = BestBitWidth;
18123return TryProcessInstruction(BitWidth,Operands, CallChecker);
18124 }
18125
18126// Otherwise, conservatively give up.
18127default:
18128break;
18129 }
18130 MaxDepthLevel = 1;
18131return FinalAnalysis();
18132}
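
// Illustrative sketch (not part of SLPVectorizer.cpp): the per-value checks
// above (MaskedValueIsZero / ComputeNumSignBits) decide whether narrowing a
// value to BitWidth bits is lossless. For concrete integers the two cases look
// like this; the helper names are hypothetical and this is not LLVM API:
#include <cstdint>

// True if truncating V to BitWidth bits and zero-extending back is lossless,
// i.e. all bits above BitWidth are already zero.
static bool fitsZExt(uint64_t V, unsigned BitWidth) {
  if (BitWidth >= 64)
    return true;
  return (V >> BitWidth) == 0; // the analogue of MaskedValueIsZero
}

// True if truncating V to BitWidth bits and sign-extending back is lossless,
// i.e. V lies in the signed range of a BitWidth-bit integer.
static bool fitsSExt(int64_t V, unsigned BitWidth) {
  if (BitWidth == 0 || BitWidth > 63)
    return BitWidth > 63;
  int64_t Lo = -(int64_t(1) << (BitWidth - 1));
  int64_t Hi = (int64_t(1) << (BitWidth - 1)) - 1;
  return V >= Lo && V <= Hi;
}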
18133
18134staticRecurKindgetRdxKind(Value *V);
18135
18136voidBoUpSLP::computeMinimumValueSizes() {
18137// We only attempt to truncate integer expressions.
18138bool IsStoreOrInsertElt =
18139 VectorizableTree.front()->hasState() &&
18140 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
18141 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
18142if ((IsStoreOrInsertElt || UserIgnoreList) &&
18143 ExtraBitWidthNodes.size() <= 1 &&
18144 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
18145 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
18146return;
18147
18148unsigned NodeIdx = 0;
18149if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18150 NodeIdx = 1;
18151
18152// Ensure the roots of the vectorizable tree don't form a cycle.
18153if (VectorizableTree[NodeIdx]->isGather() ||
18154 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18155 (NodeIdx != 0 &&any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18156 [NodeIdx](constEdgeInfo &EI) {
18157return EI.UserTE->Idx > NodeIdx;
18158 })))
18159return;
18160
18161// If the first value node for a store/insertelement is a sext/zext/trunc,
18162// skip it and resize to the final type.
18163bool IsTruncRoot =false;
18164bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18165SmallVector<unsigned> RootDemotes;
18166SmallDenseSet<unsigned, 8> NodesToKeepBWs;
18167if (NodeIdx != 0 &&
18168 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18169 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18170assert(IsStoreOrInsertElt &&"Expected store/insertelement seeded graph.");
18171 IsTruncRoot =true;
18172 RootDemotes.push_back(NodeIdx);
18173 IsProfitableToDemoteRoot =true;
18174 ++NodeIdx;
18175 }
18176
18177// If the reduction was already analyzed and found not profitable, exit.
18178if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
18179return;
18180
18181SmallVector<unsigned> ToDemote;
18182auto ComputeMaxBitWidth =
18183 [&](const TreeEntry &E,bool IsTopRoot,bool IsProfitableToDemoteRoot,
18184unsigned Limit,bool IsTruncRoot,bool IsSignedCmp) ->unsigned {
18185 ToDemote.clear();
18186// If the root is a trunc and the next node is a gather/buildvector, keep the
18187// trunc in scalars, which is free in most cases.
18188if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18189 !NodesToKeepBWs.contains(E.Idx) &&
18190 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18191all_of(E.Scalars, [&](Value *V) {
18192return V->hasOneUse() || isa<Constant>(V) ||
18193 (!V->hasNUsesOrMore(UsesLimit) &&
18194none_of(V->users(), [&](User *U) {
18195 const TreeEntry *TE = getTreeEntry(U);
18196 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18197 if (TE == UserTE || !TE)
18198 return false;
18199 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18200 SelectInst>(U) ||
18201 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18202 SelectInst>(UserTE->getMainOp()))
18203 return true;
18204 unsigned UserTESz = DL->getTypeSizeInBits(
18205 UserTE->Scalars.front()->getType());
18206 auto It = MinBWs.find(TE);
18207 if (It != MinBWs.end() && It->second.first > UserTESz)
18208 return true;
18209 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18210 }));
18211 })) {
18212 ToDemote.push_back(E.Idx);
18213const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18214auto It = MinBWs.find(UserTE);
18215if (It != MinBWs.end())
18216return It->second.first;
18217unsigned MaxBitWidth =
18218DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18219 MaxBitWidth =bit_ceil(MaxBitWidth);
18220if (MaxBitWidth < 8 && MaxBitWidth > 1)
18221 MaxBitWidth = 8;
18222return MaxBitWidth;
18223 }
18224
18225if (!E.hasState())
18226return 0u;
18227
18228unsigned VF = E.getVectorFactor();
18229Type *ScalarTy = E.Scalars.front()->getType();
18230unsigned ScalarTyNumElements =getNumElements(ScalarTy);
18231auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
18232if (!TreeRootIT)
18233return 0u;
18234
18235if (any_of(E.Scalars,
18236 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
18237return 0u;
18238
18239unsigned NumParts =::getNumberOfParts(
18240 *TTI,getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18241
18242// The maximum bit width required to represent all the values that can be
18243// demoted without loss of precision. It would be safe to truncate the roots
18244// of the expression to this width.
18245unsigned MaxBitWidth = 1u;
18246
18247// True if the roots can be zero-extended back to their original type,
18248// rather than sign-extended. We know that if the leading bits are not
18249// demanded, we can safely zero-extend. So we initialize IsKnownPositive to
18250// True.
18251// Determine if the sign bit of all the roots is known to be zero. If not,
18252// IsKnownPositive is set to False.
18253bool IsKnownPositive = !IsSignedCmp &&all_of(E.Scalars, [&](Value *R) {
18254 if (isa<PoisonValue>(R))
18255 return true;
18256 KnownBits Known = computeKnownBits(R, *DL);
18257 return Known.isNonNegative();
18258 });
18259
18260// We first check if all the bits of the roots are demanded. If they're not,
18261// we can truncate the roots to this narrower type.
18262for (Value *Root : E.Scalars) {
18263if (isa<PoisonValue>(Root))
18264continue;
18265unsigned NumSignBits =ComputeNumSignBits(Root, *DL, 0, AC,nullptr, DT);
18266TypeSize NumTypeBits =
18267DL->getTypeSizeInBits(Root->getType()->getScalarType());
18268unsigned BitWidth1 = NumTypeBits - NumSignBits;
18269// If we can't prove that the sign bit is zero, we must add one to the
18270// maximum bit width to account for the unknown sign bit. This preserves
18271// the existing sign bit so we can safely sign-extend the root back to the
18272// original type. Otherwise, if we know the sign bit is zero, we will
18273// zero-extend the root instead.
18274//
18275// FIXME: This is somewhat suboptimal, as there will be cases where adding
18276// one to the maximum bit width will yield a larger-than-necessary
18277// type. In general, we need to add an extra bit only if we can't
18278// prove that the upper bit of the original type is equal to the
18279// upper bit of the proposed smaller type. If these two bits are
18280// the same (either zero or one) we know that sign-extending from
18281// the smaller type will result in the same value. Here, since we
18282// can't yet prove this, we are just making the proposed smaller
18283// type larger to ensure correctness.
18284if (!IsKnownPositive)
18285 ++BitWidth1;
18286
18287APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
18288unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18289 MaxBitWidth =
18290 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
18291 }
18292
18293if (MaxBitWidth < 8 && MaxBitWidth > 1)
18294 MaxBitWidth = 8;
18295
18296// If the original type is large but the reduced type does not improve
18297// register usage, ignore it.
18298if (NumParts > 1 &&
18299 NumParts ==
18300::getNumberOfParts(
18301 *TTI,getWidenedType(IntegerType::get(F->getContext(),
18302bit_ceil(MaxBitWidth)),
18303 VF)))
18304return 0u;
18305
18306unsigned Opcode = E.getOpcode();
18307bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18308 Opcode == Instruction::SExt ||
18309 Opcode == Instruction::ZExt || NumParts > 1;
18310// Conservatively determine if we can actually truncate the roots of the
18311// expression. Collect the values that can be demoted in ToDemote and
18312// additional roots that require investigating in Roots.
18313DenseSet<const TreeEntry *> Visited;
18314unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18315bool NeedToDemote = IsProfitableToDemote;
18316
18317if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18318 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18319 NeedToDemote, IsTruncRoot) ||
18320 (MaxDepthLevel <= Limit &&
18321 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18322 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18323DL->getTypeSizeInBits(TreeRootIT) /
18324DL->getTypeSizeInBits(
18325 E.getMainOp()->getOperand(0)->getType()) >
18326 2)))))
18327return 0u;
18328// Round MaxBitWidth up to the next power-of-two.
18329 MaxBitWidth =bit_ceil(MaxBitWidth);
18330
18331return MaxBitWidth;
18332 };
18333
18334// If we can truncate the root, we must collect additional values that might
18335// be demoted as a result. That is, those seeded by truncations we will
18336// modify.
18337// Add reduction ops sizes, if any.
18338if (UserIgnoreList &&
18339 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18340// Convert vector_reduce_add(ZExt(<n x i1>)) to
18341// ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
18342if (all_of(*UserIgnoreList,
18343 [](Value *V) {
18344return isa<PoisonValue>(V) ||
18345 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18346 }) &&
18347 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18348 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18349 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18350 Builder.getInt1Ty()) {
18351 ReductionBitWidth = 1;
18352 }else {
18353for (Value *V : *UserIgnoreList) {
18354if (isa<PoisonValue>(V))
18355continue;
18356unsigned NumSignBits =ComputeNumSignBits(V, *DL, 0, AC,nullptr, DT);
18357TypeSize NumTypeBits =DL->getTypeSizeInBits(V->getType());
18358unsigned BitWidth1 = NumTypeBits - NumSignBits;
18359if (!isKnownNonNegative(V,SimplifyQuery(*DL)))
18360 ++BitWidth1;
18361unsigned BitWidth2 = BitWidth1;
18362if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
18363APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
18364 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18365 }
18366 ReductionBitWidth =
18367 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18368 }
18369if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18370 ReductionBitWidth = 8;
18371
18372 ReductionBitWidth =bit_ceil(ReductionBitWidth);
18373 }
18374 }
18375bool IsTopRoot = NodeIdx == 0;
18376while (NodeIdx < VectorizableTree.size() &&
18377 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18378 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18379 RootDemotes.push_back(NodeIdx);
18380 ++NodeIdx;
18381 IsTruncRoot =true;
18382 }
18383bool IsSignedCmp =false;
18384while (NodeIdx < VectorizableTree.size()) {
18385ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
18386unsigned Limit = 2;
18387if (IsTopRoot &&
18388 ReductionBitWidth ==
18389DL->getTypeSizeInBits(
18390 VectorizableTree.front()->Scalars.front()->getType()))
18391 Limit = 3;
18392unsigned MaxBitWidth = ComputeMaxBitWidth(
18393 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
18394 IsTruncRoot, IsSignedCmp);
18395if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
18396if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18397 ReductionBitWidth =bit_ceil(MaxBitWidth);
18398elseif (MaxBitWidth == 0)
18399 ReductionBitWidth = 0;
18400 }
18401
18402for (unsignedIdx : RootDemotes) {
18403if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
18404uint32_t OrigBitWidth =
18405DL->getTypeSizeInBits(V->getType()->getScalarType());
18406if (OrigBitWidth > MaxBitWidth) {
18407APInt Mask =APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
18408returnMaskedValueIsZero(V, Mask,SimplifyQuery(*DL));
18409 }
18410returnfalse;
18411 }))
18412 ToDemote.push_back(Idx);
18413 }
18414 RootDemotes.clear();
18415 IsTopRoot =false;
18416 IsProfitableToDemoteRoot =true;
18417
18418if (ExtraBitWidthNodes.empty()) {
18419 NodeIdx = VectorizableTree.size();
18420 }else {
18421unsigned NewIdx = 0;
18422do {
18423 NewIdx = *ExtraBitWidthNodes.begin();
18424 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
18425 }while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
18426 NodeIdx = NewIdx;
18427 IsTruncRoot =
18428 NodeIdx < VectorizableTree.size() &&
18429any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18430 [](constEdgeInfo &EI) {
18431return EI.EdgeIdx == 0 &&
18432 EI.UserTE->getOpcode() == Instruction::Trunc &&
18433 !EI.UserTE->isAltShuffle();
18434 });
18435 IsSignedCmp =
18436 NodeIdx < VectorizableTree.size() &&
18437any_of(
18438 VectorizableTree[NodeIdx]->UserTreeIndices,
18439 [&](constEdgeInfo &EI) {
18440return (EI.UserTE->hasState() &&
18441 EI.UserTE->getOpcode() == Instruction::ICmp) &&
18442any_of(EI.UserTE->Scalars, [&](Value *V) {
18443 auto *IC = dyn_cast<ICmpInst>(V);
18444 return IC &&
18445 (IC->isSigned() ||
18446 !isKnownNonNegative(IC->getOperand(0),
18447 SimplifyQuery(*DL)) ||
18448 !isKnownNonNegative(IC->getOperand(1),
18449 SimplifyQuery(*DL)));
18450 });
18451 });
18452 }
18453
18454// If the maximum bit width we compute is less than the width of the roots'
18455// type, we can proceed with the narrowing. Otherwise, do nothing.
18456if (MaxBitWidth == 0 ||
18457 MaxBitWidth >=
18458 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
18459 ->getBitWidth()) {
18460if (UserIgnoreList)
18461 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
18462 NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
18463continue;
18464 }
18465
18466// Finally, map the values we can demote to the maximum bit width we
18467// computed.
18468for (unsignedIdx : ToDemote) {
18469 TreeEntry *TE = VectorizableTree[Idx].get();
18470if (MinBWs.contains(TE))
18471continue;
18472bool IsSigned =any_of(TE->Scalars, [&](Value *R) {
18473 if (isa<PoisonValue>(R))
18474 return false;
18475 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18476 });
18477 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
18478 }
18479 }
18480}
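
// Illustrative sketch (not part of SLPVectorizer.cpp): both ComputeMaxBitWidth
// and the reduction handling above round the chosen width up to a power of two
// and avoid widths strictly between 1 and 8 bits. A standalone helper (the
// name normalizeBitWidth is hypothetical) shows that normalization:
#include <bit>

static unsigned normalizeBitWidth(unsigned MaxBitWidth) {
  if (MaxBitWidth > 1 && MaxBitWidth < 8)
    MaxBitWidth = 8;                 // sub-byte widths other than i1 are not used
  return std::bit_ceil(MaxBitWidth); // round up to the next power of two
}
// e.g. normalizeBitWidth(1) == 1, normalizeBitWidth(5) == 8,
//      normalizeBitWidth(13) == 16.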
18481
18482PreservedAnalysesSLPVectorizerPass::run(Function &F,FunctionAnalysisManager &AM) {
18483auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
18484auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
18485auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
18486auto *AA = &AM.getResult<AAManager>(F);
18487auto *LI = &AM.getResult<LoopAnalysis>(F);
18488auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
18489auto *AC = &AM.getResult<AssumptionAnalysis>(F);
18490auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
18491auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
18492
18493bool Changed =runImpl(F, SE,TTI, TLI, AA, LI, DT, AC, DB, ORE);
18494if (!Changed)
18495returnPreservedAnalyses::all();
18496
18497PreservedAnalyses PA;
18498 PA.preserveSet<CFGAnalyses>();
18499return PA;
18500}
18501
18502boolSLPVectorizerPass::runImpl(Function &F,ScalarEvolution *SE_,
18503TargetTransformInfo *TTI_,
18504TargetLibraryInfo *TLI_,AAResults *AA_,
18505LoopInfo *LI_,DominatorTree *DT_,
18506AssumptionCache *AC_,DemandedBits *DB_,
18507OptimizationRemarkEmitter *ORE_) {
18508if (!RunSLPVectorization)
18509returnfalse;
18510 SE = SE_;
18511TTI = TTI_;
18512 TLI = TLI_;
18513 AA = AA_;
18514 LI = LI_;
18515 DT = DT_;
18516 AC = AC_;
18517 DB = DB_;
18518DL = &F.getDataLayout();
18519
18520 Stores.clear();
18521 GEPs.clear();
18522bool Changed =false;
18523
18524// If the target claims to have no vector registers don't attempt
18525// vectorization.
18526if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
18527LLVM_DEBUG(
18528dbgs() <<"SLP: Didn't find any vector registers for target, abort.\n");
18529returnfalse;
18530 }
18531
18532// Don't vectorize when the attribute NoImplicitFloat is used.
18533if (F.hasFnAttribute(Attribute::NoImplicitFloat))
18534returnfalse;
18535
18536LLVM_DEBUG(dbgs() <<"SLP: Analyzing blocks in " <<F.getName() <<".\n");
18537
18538// Use the bottom up slp vectorizer to construct chains that start with
18539// store instructions.
18540BoUpSLP R(&F, SE,TTI, TLI, AA, LI, DT, AC, DB,DL, ORE_);
18541
18542// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
18543// delete instructions.
18544
18545// Update DFS numbers now so that we can use them for ordering.
18546 DT->updateDFSNumbers();
18547
18548// Scan the blocks in the function in post order.
18549for (auto *BB :post_order(&F.getEntryBlock())) {
18550if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
18551continue;
18552
18553// Start new block - clear the list of reduction roots.
18554 R.clearReductionData();
18555 collectSeedInstructions(BB);
18556
18557// Vectorize trees that end at stores.
18558if (!Stores.empty()) {
18559LLVM_DEBUG(dbgs() <<"SLP: Found stores for " << Stores.size()
18560 <<" underlying objects.\n");
18561 Changed |= vectorizeStoreChains(R);
18562 }
18563
18564// Vectorize trees that end at reductions.
18565 Changed |= vectorizeChainsInBlock(BB, R);
18566
18567// Vectorize the index computations of getelementptr instructions. This
18568// is primarily intended to catch gather-like idioms ending at
18569// non-consecutive loads.
18570if (!GEPs.empty()) {
18571LLVM_DEBUG(dbgs() <<"SLP: Found GEPs for " << GEPs.size()
18572 <<" underlying objects.\n");
18573 Changed |= vectorizeGEPIndices(BB, R);
18574 }
18575 }
18576
18577if (Changed) {
18578 R.optimizeGatherSequence();
18579LLVM_DEBUG(dbgs() <<"SLP: vectorized \"" <<F.getName() <<"\"\n");
18580 }
18581return Changed;
18582}
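
// Illustrative sketch (not part of SLPVectorizer.cpp): the driver above visits
// basic blocks in post order (post_order(&F.getEntryBlock())), so a block is
// processed after the blocks it branches to have been visited. A standalone
// DFS post-order over a toy adjacency list (indices instead of BasicBlocks)
// shows that visiting order:
#include <cstddef>
#include <vector>

static void postOrderFrom(size_t Block,
                          const std::vector<std::vector<size_t>> &Succs,
                          std::vector<bool> &Visited,
                          std::vector<size_t> &Order) {
  Visited[Block] = true;
  for (size_t S : Succs[Block])
    if (!Visited[S])
      postOrderFrom(S, Succs, Visited, Order);
  Order.push_back(Block); // emitted after all reachable successors
}

static std::vector<size_t>
postOrder(const std::vector<std::vector<size_t>> &Succs, size_t Entry) {
  std::vector<bool> Visited(Succs.size(), false);
  std::vector<size_t> Order;
  postOrderFrom(Entry, Succs, Visited, Order);
  return Order;
}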
18583
18584std::optional<bool>
18585SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain,BoUpSLP &R,
18586unsignedIdx,unsigned MinVF,
18587unsigned &Size) {
18588Size = 0;
18589LLVM_DEBUG(dbgs() <<"SLP: Analyzing a store chain of length " << Chain.size()
18590 <<"\n");
18591constunsigned Sz = R.getVectorElementSize(Chain[0]);
18592unsigned VF = Chain.size();
18593
18594if (!has_single_bit(Sz) ||
18595 !hasFullVectorsOrPowerOf2(
18596 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
18597 VF) ||
18598 VF < 2 || VF < MinVF) {
18599// Check if vectorizing with a non-power-of-2 VF should be considered. At
18600// the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
18601// all vector lanes are used.
18602if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
18603returnfalse;
18604 }
18605
18606LLVM_DEBUG(dbgs() <<"SLP: Analyzing " << VF <<" stores at offset " <<Idx
18607 <<"\n");
18608
18609SetVector<Value *> ValOps;
18610for (Value *V : Chain)
18611 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
18612// Exit if the operands do not share same/alternate opcodes, or the number of unique values is not a power of 2.
18613 InstructionsState S =getSameOpcode(ValOps.getArrayRef(), *TLI);
18614if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
18615DenseSet<Value *> Stores(Chain.begin(), Chain.end());
18616bool IsAllowedSize =
18617hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
18618 ValOps.size()) ||
18619 (VectorizeNonPowerOf2 &&has_single_bit(ValOps.size() + 1));
18620if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
18621 (!S.getMainOp()->isSafeToRemove() ||
18622any_of(ValOps.getArrayRef(),
18623 [&](Value *V) {
18624 return !isa<ExtractElementInst>(V) &&
18625 (V->getNumUses() > Chain.size() ||
18626 any_of(V->users(), [&](User *U) {
18627 return !Stores.contains(U);
18628 }));
18629 }))) ||
18630 (ValOps.size() > Chain.size() / 2 && !S)) {
18631Size = (!IsAllowedSize && S) ? 1 : 2;
18632returnfalse;
18633 }
18634 }
18635if (R.isLoadCombineCandidate(Chain))
18636returntrue;
18637R.buildTree(Chain);
18638// Check if the tree is tiny and the store itself or its value is not vectorized.
18639if (R.isTreeTinyAndNotFullyVectorizable()) {
18640if (R.isGathered(Chain.front()) ||
18641R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18642return std::nullopt;
18643Size =R.getCanonicalGraphSize();
18644returnfalse;
18645 }
18646R.reorderTopToBottom();
18647R.reorderBottomToTop();
18648R.transformNodes();
18649R.buildExternalUses();
18650
18651R.computeMinimumValueSizes();
18652
18653Size =R.getCanonicalGraphSize();
18654if (S && S.getOpcode() == Instruction::Load)
18655Size = 2;// cut off masked gather small trees
18656InstructionCostCost =R.getTreeCost();
18657
18658LLVM_DEBUG(dbgs() <<"SLP: Found cost = " <<Cost <<" for VF=" << VF <<"\n");
18659if (Cost < -SLPCostThreshold) {
18660LLVM_DEBUG(dbgs() <<"SLP: Decided to vectorize cost = " <<Cost <<"\n");
18661
18662using namespaceore;
18663
18664R.getORE()->emit(OptimizationRemark(SV_NAME,"StoresVectorized",
18665 cast<StoreInst>(Chain[0]))
18666 <<"Stores SLP vectorized with cost " <<NV("Cost",Cost)
18667 <<" and with tree size "
18668 <<NV("TreeSize",R.getTreeSize()));
18669
18670R.vectorizeTree();
18671returntrue;
18672 }
18673
18674returnfalse;
18675}
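
// Illustrative sketch (not part of SLPVectorizer.cpp): the non-power-of-2 path
// above only considers vectorization factors where VF + 1 is a power of two
// (e.g. 3, 7, 15), i.e. at most one lane of the next power-of-two vector is
// unused. The predicate is a one-liner (the helper name is hypothetical):
#include <bit>

static bool isAlmostFullVF(unsigned VF) {
  return VF >= 3 && std::has_single_bit(VF + 1); // 3, 7, 15, 31, ...
}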
18676
18677/// Checks if the quadratic mean deviation of the tree sizes is less than one
18678/// ninth of the mean size (i.e. Dev * 81 < Mean * Mean), so the sizes are
18679/// tightly clustered around the mean.
18678staticboolcheckTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
18679boolFirst) {
18680unsigned Num = 0;
18681uint64_t Sum = std::accumulate(
18682 Sizes.begin(), Sizes.end(),static_cast<uint64_t>(0),
18683 [&](uint64_t V,const std::pair<unsigned, unsigned> &Val) {
18684 unsigned Size = First ? Val.first : Val.second;
18685 if (Size == 1)
18686 return V;
18687 ++Num;
18688 return V + Size;
18689 });
18690if (Num == 0)
18691returntrue;
18692uint64_t Mean = Sum / Num;
18693if (Mean == 0)
18694returntrue;
18695uint64_t Dev = std::accumulate(
18696 Sizes.begin(), Sizes.end(),static_cast<uint64_t>(0),
18697 [&](uint64_t V,const std::pair<unsigned, unsigned> &Val) {
18698 unsigned P = First ? Val.first : Val.second;
18699 if (P == 1)
18700 return V;
18701 return V + (P - Mean) * (P - Mean);
18702 }) /
18703 Num;
18704return Dev * 81 / (Mean * Mean) == 0;
18705}
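
// Illustrative sketch (not part of SLPVectorizer.cpp): a standalone version of
// the homogeneity test above over plain unsigned sizes. Entries equal to 1 are
// ignored, and the range is accepted only when 81 * variance < mean^2, exactly
// what the integer expression Dev * 81 / (Mean * Mean) == 0 enforces.
#include <cstdint>
#include <vector>

static bool sizesAreHomogeneous(const std::vector<unsigned> &Sizes) {
  uint64_t Sum = 0, Num = 0;
  for (unsigned S : Sizes)
    if (S != 1) {
      Sum += S;
      ++Num;
    }
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = 0; // mean of squared deviations (variance)
  for (unsigned S : Sizes)
    if (S != 1) {
      int64_t D = int64_t(S) - int64_t(Mean);
      Dev += uint64_t(D * D);
    }
  Dev /= Num;
  return Dev * 81 / (Mean * Mean) == 0;
}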
18706
18707bool SLPVectorizerPass::vectorizeStores(
18708ArrayRef<StoreInst *> Stores,BoUpSLP &R,
18709DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
18710 &Visited) {
18711// We may run into multiple chains that merge into a single chain. We mark the
18712// stores that we vectorized so that we don't visit the same store twice.
18713BoUpSLP::ValueSet VectorizedStores;
18714bool Changed =false;
18715
18716structStoreDistCompare {
18717bool operator()(const std::pair<unsigned, int> &Op1,
18718const std::pair<unsigned, int> &Op2) const{
18719return Op1.second < Op2.second;
18720 }
18721 };
18722// A set of pairs (index of store in Stores array ref, Distance of the store
18723// address relative to base store address in units).
18724usingStoreIndexToDistSet =
18725 std::set<std::pair<unsigned, int>, StoreDistCompare>;
18726auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18727int PrevDist = -1;
18728BoUpSLP::ValueListOperands;
18729// Collect the chain into a list.
18730for (auto [Idx,Data] :enumerate(Set)) {
18731if (Operands.empty() ||Data.second - PrevDist == 1) {
18732Operands.push_back(Stores[Data.first]);
18733 PrevDist =Data.second;
18734if (Idx !=Set.size() - 1)
18735continue;
18736 }
18737auto E =make_scope_exit([&, &DataVar =Data]() {
18738Operands.clear();
18739Operands.push_back(Stores[DataVar.first]);
18740 PrevDist = DataVar.second;
18741 });
18742
18743if (Operands.size() <= 1 ||
18744 !Visited
18745 .insert({Operands.front(),
18746 cast<StoreInst>(Operands.front())->getValueOperand(),
18747 Operands.back(),
18748 cast<StoreInst>(Operands.back())->getValueOperand(),
18749 Operands.size()})
18750 .second)
18751continue;
18752
18753unsigned MaxVecRegSize =R.getMaxVecRegSize();
18754unsigned EltSize =R.getVectorElementSize(Operands[0]);
18755unsigned MaxElts =llvm::bit_floor(MaxVecRegSize / EltSize);
18756
18757unsigned MaxVF =
18758 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18759auto *Store = cast<StoreInst>(Operands[0]);
18760Type *StoreTy =Store->getValueOperand()->getType();
18761Type *ValueTy = StoreTy;
18762if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
18763 ValueTy = Trunc->getSrcTy();
18764unsigned MinVF = std::max<unsigned>(
18765 2,PowerOf2Ceil(TTI->getStoreMinimumVF(
18766R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18767 ValueTy)));
18768
18769if (MaxVF < MinVF) {
18770LLVM_DEBUG(dbgs() <<"SLP: Vectorization infeasible as MaxVF (" << MaxVF
18771 <<") < "
18772 <<"MinVF (" << MinVF <<")\n");
18773continue;
18774 }
18775
18776unsigned NonPowerOf2VF = 0;
18777if (VectorizeNonPowerOf2) {
18778// First try vectorizing with a non-power-of-2 VF. At the moment, only
18779// consider cases where VF + 1 is a power-of-2, i.e. almost all vector
18780// lanes are used.
18781unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
18782if (has_single_bit(CandVF + 1)) {
18783 NonPowerOf2VF = CandVF;
18784assert(NonPowerOf2VF != MaxVF &&
18785"Non-power-of-2 VF should not be equal to MaxVF");
18786 }
18787 }
18788
18789unsigned MaxRegVF = MaxVF;
18790 MaxVF = std::min<unsigned>(MaxVF,bit_floor(Operands.size()));
18791if (MaxVF < MinVF) {
18792LLVM_DEBUG(dbgs() <<"SLP: Vectorization infeasible as MaxVF (" << MaxVF
18793 <<") < "
18794 <<"MinVF (" << MinVF <<")\n");
18795continue;
18796 }
18797
18798unsigned Sz = 1 +Log2_32(MaxVF) -Log2_32(MinVF);
18799SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
18800unsignedSize = MinVF;
18801for_each(reverse(CandidateVFs), [&](unsigned &VF) {
18802 VF =Size > MaxVF ? NonPowerOf2VF :Size;
18803Size *= 2;
18804 });
18805unsignedEnd =Operands.size();
18806unsigned Repeat = 0;
18807constexprunsigned MaxAttempts = 4;
18808OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
18809for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
18810P.first =P.second = 1;
18811 });
18812DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
18813auto IsNotVectorized = [](boolFirst,
18814const std::pair<unsigned, unsigned> &P) {
18815returnFirst ?P.first > 0 :P.second > 0;
18816 };
18817auto IsVectorized = [](boolFirst,
18818const std::pair<unsigned, unsigned> &P) {
18819returnFirst ?P.first == 0 :P.second == 0;
18820 };
18821auto VFIsProfitable = [](boolFirst,unsignedSize,
18822const std::pair<unsigned, unsigned> &P) {
18823returnFirst ?Size >=P.first :Size >=P.second;
18824 };
18825auto FirstSizeSame = [](unsignedSize,
18826const std::pair<unsigned, unsigned> &P) {
18827returnSize ==P.first;
18828 };
18829while (true) {
18830 ++Repeat;
18831bool RepeatChanged =false;
18832bool AnyProfitableGraph =false;
18833for (unsignedSize : CandidateVFs) {
18834 AnyProfitableGraph =false;
18835unsigned StartIdx = std::distance(
18836 RangeSizes.begin(),
18837find_if(RangeSizes, std::bind(IsNotVectorized,Size >= MaxRegVF,
18838 std::placeholders::_1)));
18839while (StartIdx <End) {
18840unsigned EndIdx =
18841 std::distance(RangeSizes.begin(),
18842find_if(RangeSizes.drop_front(StartIdx),
18843 std::bind(IsVectorized,Size >= MaxRegVF,
18844 std::placeholders::_1)));
18845unsigned Sz = EndIdx >=End ?End : EndIdx;
18846for (unsigned Cnt = StartIdx; Cnt +Size <= Sz;) {
18847if (!checkTreeSizes(RangeSizes.slice(Cnt,Size),
18848Size >= MaxRegVF)) {
18849 ++Cnt;
18850continue;
18851 }
18852ArrayRef<Value *> Slice =ArrayRef(Operands).slice(Cnt,Size);
18853assert(all_of(Slice,
18854 [&](Value *V) {
18855return cast<StoreInst>(V)
18856 ->getValueOperand()
18857 ->getType() ==
18858 cast<StoreInst>(Slice.front())
18859 ->getValueOperand()
18860 ->getType();
18861 }) &&
18862"Expected all operands of same type.");
18863if (!NonSchedulable.empty()) {
18864auto [NonSchedSizeMax, NonSchedSizeMin] =
18865 NonSchedulable.lookup(Slice.front());
18866if (NonSchedSizeMax > 0 && NonSchedSizeMin <=Size) {
18867 Cnt += NonSchedSizeMax;
18868continue;
18869 }
18870 }
18871unsigned TreeSize;
18872 std::optional<bool> Res =
18873 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18874if (!Res) {
18875 NonSchedulable
18876 .try_emplace(Slice.front(), std::make_pair(Size,Size))
18877 .first->getSecond()
18878 .second =Size;
18879 }elseif (*Res) {
18880// Mark the vectorized stores so that we don't vectorize them
18881// again.
18882 VectorizedStores.insert(Slice.begin(), Slice.end());
18885 AnyProfitableGraph = RepeatChanged = Changed =true;
18886// If we vectorized initial block, no need to try to vectorize
18887// it again.
18888for_each(RangeSizes.slice(Cnt,Size),
18889 [](std::pair<unsigned, unsigned> &P) {
18890 P.first = P.second = 0;
18891 });
18892if (Cnt < StartIdx + MinVF) {
18893for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18894 [](std::pair<unsigned, unsigned> &P) {
18895 P.first = P.second = 0;
18896 });
18897 StartIdx = Cnt +Size;
18898 }
18899if (Cnt > Sz -Size - MinVF) {
18900for_each(RangeSizes.slice(Cnt +Size, Sz - (Cnt +Size)),
18901 [](std::pair<unsigned, unsigned> &P) {
18902 P.first = P.second = 0;
18903 });
18904if (Sz ==End)
18905End = Cnt;
18906 Sz = Cnt;
18907 }
18908 Cnt +=Size;
18909continue;
18910 }
18911if (Size > 2 && Res &&
18912 !all_of(RangeSizes.slice(Cnt,Size),
18913 std::bind(VFIsProfitable,Size >= MaxRegVF, TreeSize,
18914 std::placeholders::_1))) {
18915 Cnt +=Size;
18916continue;
18917 }
18918// For very large VFs, check that we are not rebuilding the same
18919// trees, just with a larger number of elements.
18920if (Size > MaxRegVF && TreeSize > 1 &&
18921all_of(RangeSizes.slice(Cnt,Size),
18922 std::bind(FirstSizeSame, TreeSize,
18923 std::placeholders::_1))) {
18924 Cnt +=Size;
18925while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18926 ++Cnt;
18927continue;
18928 }
18929if (TreeSize > 1)
18930for_each(RangeSizes.slice(Cnt,Size),
18931 [&](std::pair<unsigned, unsigned> &P) {
18932 if (Size >= MaxRegVF)
18933 P.second = std::max(P.second, TreeSize);
18934 else
18935 P.first = std::max(P.first, TreeSize);
18936 });
18937 ++Cnt;
18938 AnyProfitableGraph =true;
18939 }
18940if (StartIdx >=End)
18941break;
18942if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18943 AnyProfitableGraph =true;
18944 StartIdx = std::distance(
18945 RangeSizes.begin(),
18946find_if(RangeSizes.drop_front(Sz),
18947 std::bind(IsNotVectorized,Size >= MaxRegVF,
18948 std::placeholders::_1)));
18949 }
18950if (!AnyProfitableGraph &&Size >= MaxRegVF &&has_single_bit(Size))
18951break;
18952 }
18953// All values vectorized - exit.
18954if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
18955returnP.first == 0 &&P.second == 0;
18956 }))
18957break;
      // Check if we have tried all attempts or if the remaining attempts are
      // not needed at all.
18959if (Repeat >= MaxAttempts ||
18960 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18961break;
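      // Prepare the final attempt: roughly double the first candidate VF
      // (rounded up to a power of two), limited by the number of remaining
      // non-vectorized stores.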
      constexpr unsigned StoresLimit = 64;
      const unsigned MaxTotalNum = std::min<unsigned>(
          Operands.size(),
          static_cast<unsigned>(
              End -
              std::distance(
                  RangeSizes.begin(),
                  find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                std::placeholders::_1))) +
              1));
      unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
18973unsigned Limit =
18974getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
18975 CandidateVFs.clear();
18976if (bit_floor(Limit) == VF)
18977 CandidateVFs.push_back(Limit);
18978if (VF > MaxTotalNum || VF >= StoresLimit)
18979break;
18980for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
18981if (P.first != 0)
18982P.first = std::max(P.second,P.first);
18983 });
18984// Last attempt to vectorize max number of elements, if all previous
18985// attempts were unsuccessful because of the cost issues.
18986 CandidateVFs.push_back(VF);
18987 }
18988 }
18989 };
18990
  // Stores a pair (first: index of the store into the Stores array ref, whose
  // address is taken as the base; second: sorted set of pairs {index, dist},
  // which are indices of stores in the set and their store location distances
  // relative to the base address).

  // We need to store the index of the very first store separately, since the
  // set may be reordered after the insertion and the first store may be moved.
  // This container allows us to reduce the number of calls to the
  // getPointersDiff() function.
  SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
  // Inserts the specified store SI with the given index Idx into the set of
  // stores. If a store with the same distance is already found - stop the
  // insertion and try to vectorize the already found stores. If some stores
  // from this sequence were not vectorized - try to vectorize them together
  // with the new store later. But this logic is applied only to the stores
  // that come before the previous store with the same distance.
  // Example:
  // 1. store x, %p
  // 2. store y, %p+1
  // 3. store z, %p+2
  // 4. store a, %p
  // 5. store b, %p+3
  // - Scan this from the last to the first store. The very first bunch of
  // stores is {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in the
  // SortedStores vector).
  // - The next store in the list - #1 - has the same distance from store #5 as
  // store #4.
  // - Try to vectorize the sequence of stores 4,2,3,5.
  // - If all these stores are vectorized - just drop them.
  // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
  // - Start a new stores sequence.
  // The new bunch of stores is {1, {1, 0}}.
  // - Add the stores from the previous sequence that were not vectorized.
  // Here we consider the stores in reversed order, rather than in the order
  // they appear in the IR (the Stores are reversed already, see the
  // vectorizeStoreChains() function).
  // Store #3 can be added -> it comes after store #4 with the same distance as
  // store #1.
  // Store #5 cannot be added - it comes before store #4.
  // This logic allows us to improve compile time: we assume that the stores
  // after the previous store with the same distance most likely have memory
  // dependencies, so there is no need to waste compile time trying to
  // vectorize them.
  // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
19033for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19034 std::optional<int> Diff =getPointersDiff(
19035 Stores[Set.first]->getValueOperand()->getType(),
19036 Stores[Set.first]->getPointerOperand(),
19037SI->getValueOperand()->getType(),SI->getPointerOperand(), *DL, *SE,
19038/*StrictCheck=*/true);
19039if (!Diff)
19040continue;
19041auto It =Set.second.find(std::make_pair(Idx, *Diff));
19042if (It ==Set.second.end()) {
19043Set.second.emplace(Idx, *Diff);
19044return;
19045 }
19046// Try to vectorize the first found set to avoid duplicate analysis.
19047 TryToVectorize(Set.second);
19048unsigned ItIdx = It->first;
19049int ItDist = It->second;
19050 StoreIndexToDistSet PrevSet;
19051copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
19052 [&](const std::pair<unsigned, int> &Pair) {
19053 return Pair.first > ItIdx;
19054 });
19055Set.second.clear();
19056Set.first =Idx;
19057Set.second.emplace(Idx, 0);
19058// Insert stores that followed previous match to try to vectorize them
19059// with this store.
19060unsigned StartIdx = ItIdx + 1;
19061SmallBitVector UsedStores(Idx - StartIdx);
19062// Distances to previously found dup store (or this store, since they
19063// store to the same addresses).
19064SmallVector<int> Dists(Idx - StartIdx, 0);
19065for (const std::pair<unsigned, int> &Pair :reverse(PrevSet)) {
19066// Do not try to vectorize sequences, we already tried.
19067if (VectorizedStores.contains(Stores[Pair.first]))
19068break;
19069unsigned BI = Pair.first - StartIdx;
19070 UsedStores.set(BI);
19071 Dists[BI] = Pair.second - ItDist;
19072 }
19073for (unsignedI = StartIdx;I <Idx; ++I) {
19074unsigned BI =I - StartIdx;
19075if (UsedStores.test(BI))
19076Set.second.emplace(I, Dists[BI]);
19077 }
19078return;
19079 }
19080auto &Res = SortedStores.emplace_back();
19081 Res.first =Idx;
19082 Res.second.emplace(Idx, 0);
19083 };
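  // Walk the collected stores and group them into SortedStores. Whenever the
  // stored value type changes, flush the accumulated sets first, since stores
  // of different value types are not vectorized together.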
19084Type *PrevValTy =nullptr;
19085for (auto [I, SI] :enumerate(Stores)) {
19086if (R.isDeleted(SI))
19087continue;
19088if (!PrevValTy)
19089 PrevValTy =SI->getValueOperand()->getType();
19090// Check that we do not try to vectorize stores of different types.
19091if (PrevValTy !=SI->getValueOperand()->getType()) {
19092for (auto &Set : SortedStores)
19093 TryToVectorize(Set.second);
19094 SortedStores.clear();
19095 PrevValTy =SI->getValueOperand()->getType();
19096 }
19097 FillStoresSet(I, SI);
19098 }
19099
19100// Final vectorization attempt.
19101for (auto &Set : SortedStores)
19102 TryToVectorize(Set.second);
19103
19104return Changed;
19105}
19106
19107void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
19108// Initialize the collections. We will make a single pass over the block.
19109 Stores.clear();
19110 GEPs.clear();
19111
19112// Visit the store and getelementptr instructions in BB and organize them in
19113// Stores and GEPs according to the underlying objects of their pointer
19114// operands.
19115for (Instruction &I : *BB) {
19116// Ignore store instructions that are volatile or have a pointer operand
19117// that doesn't point to a scalar type.
19118if (auto *SI = dyn_cast<StoreInst>(&I)) {
19119if (!SI->isSimple())
19120continue;
19121if (!isValidElementType(SI->getValueOperand()->getType()))
19122continue;
19123 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
19124 }
19125
19126// Ignore getelementptr instructions that have more than one index, a
19127// constant index, or a pointer operand that doesn't point to a scalar
19128// type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
19140 }
19141 }
19142}
19143
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type,
  // we permit an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S)
    return false;
19157
19158Instruction *I0 = S.getMainOp();
19159// Make sure invalid types (including vector type) are rejected before
19160// determining vectorization factor for scalar instructions.
19161for (Value *V : VL) {
19162Type *Ty =V->getType();
19163if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give the user an internal LLVM type name,
      // which may not be useful.
19166R.getORE()->emit([&]() {
19167 std::string TypeStr;
19168llvm::raw_string_ostream rso(TypeStr);
19169 Ty->print(rso);
19170returnOptimizationRemarkMissed(SV_NAME,"UnsupportedType", I0)
19171 <<"Cannot SLP vectorize list: type "
19172 << TypeStr +" is unsupported by vectorizer";
19173 });
19174returnfalse;
19175 }
19176 }
19177
19178Type *ScalarTy =getValueType(VL[0]);
19179unsigned Sz =R.getVectorElementSize(I0);
19180unsigned MinVF =R.getMinVF(Sz);
19181unsigned MaxVF = std::max<unsigned>(
19182getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
19183 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19184if (MaxVF < 2) {
19185R.getORE()->emit([&]() {
19186returnOptimizationRemarkMissed(SV_NAME,"SmallVF", I0)
19187 <<"Cannot SLP vectorize list: vectorization factor "
19188 <<"less than 2 is not supported";
19189 });
19190returnfalse;
19191 }
19192
19193bool Changed =false;
19194bool CandidateFound =false;
19195InstructionCost MinCost =SLPCostThreshold.getValue();
19196
19197unsigned NextInst = 0, MaxInst = VL.size();
19198for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19199 VF =getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
    // No actual vectorization should happen if the number of parts is the
    // same as the provided vectorization factor (i.e. the scalar type is used
    // for vector code during codegen).
19203auto *VecTy =getWidenedType(ScalarTy, VF);
19204if (TTI->getNumberOfParts(VecTy) == VF)
19205continue;
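    // Slide a window of at most VF operations over the list and try to build a
    // vectorizable tree for each window.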
19206for (unsignedI = NextInst;I < MaxInst; ++I) {
19207unsigned ActualVF = std::min(MaxInst -I, VF);
19208
19209if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
19210continue;
19211
19212if (MaxVFOnly && ActualVF < MaxVF)
19213break;
19214if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
19215break;
19216
19217SmallVector<Value *> Ops(ActualVF,nullptr);
19218unsignedIdx = 0;
19219for (Value *V : VL.drop_front(I)) {
19220// Check that a previous iteration of this loop did not delete the
19221// Value.
19222if (auto *Inst = dyn_cast<Instruction>(V);
19223 !Inst || !R.isDeleted(Inst)) {
19224 Ops[Idx] =V;
19225 ++Idx;
19226if (Idx == ActualVF)
19227break;
19228 }
19229 }
19230// Not enough vectorizable instructions - exit.
19231if (Idx != ActualVF)
19232break;
19233
19234LLVM_DEBUG(dbgs() <<"SLP: Analyzing " << ActualVF <<" operations "
19235 <<"\n");
19236
19237R.buildTree(Ops);
19238if (R.isTreeTinyAndNotFullyVectorizable())
19239continue;
19240R.reorderTopToBottom();
19241R.reorderBottomToTop(
19242/*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
19243 !R.doesRootHaveInTreeUses());
19244R.transformNodes();
19245R.buildExternalUses();
19246
19247R.computeMinimumValueSizes();
19248InstructionCostCost =R.getTreeCost();
19249 CandidateFound =true;
19250 MinCost = std::min(MinCost,Cost);
19251
19252LLVM_DEBUG(dbgs() <<"SLP: Found cost = " <<Cost
19253 <<" for VF=" << ActualVF <<"\n");
19254if (Cost < -SLPCostThreshold) {
19255LLVM_DEBUG(dbgs() <<"SLP: Vectorizing list at cost:" <<Cost <<".\n");
19256R.getORE()->emit(OptimizationRemark(SV_NAME,"VectorizedList",
19257 cast<Instruction>(Ops[0]))
19258 <<"SLP vectorized with cost " <<ore::NV("Cost",Cost)
19259 <<" and with tree size "
19260 <<ore::NV("TreeSize",R.getTreeSize()));
19261
19262R.vectorizeTree();
19263// Move to the next bundle.
19264I += VF - 1;
19265 NextInst =I + 1;
19266 Changed =true;
19267 }
19268 }
19269 }
19270
19271if (!Changed && CandidateFound) {
19272R.getORE()->emit([&]() {
19273returnOptimizationRemarkMissed(SV_NAME,"NotBeneficial", I0)
19274 <<"List vectorization was possible but not beneficial with cost "
19275 <<ore::NV("Cost", MinCost) <<" >= "
19276 <<ore::NV("Treshold", -SLPCostThreshold);
19277 });
19278 }elseif (!Changed) {
19279R.getORE()->emit([&]() {
19280returnOptimizationRemarkMissed(SV_NAME,"NotPossible", I0)
19281 <<"Cannot SLP vectorize list: vectorization was impossible"
19282 <<" with available vectorization factors";
19283 });
19284 }
19285return Changed;
19286}
19287
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;
19303
19304// First collect all possible candidates
19305SmallVector<std::pair<Value *, Value *>, 4> Candidates;
19306 Candidates.emplace_back(Op0, Op1);
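  // Widen the set of candidate pairs by looking through single-use binary
  // operands of Op0/Op1, so that the best root pair can be selected below.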
19307
19308auto *A = dyn_cast<BinaryOperator>(Op0);
19309auto *B = dyn_cast<BinaryOperator>(Op1);
19310// Try to skip B.
19311if (A &&B &&B->hasOneUse()) {
19312auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
19313auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
19314if (B0 && B0->getParent() ==P && !R.isDeleted(B0))
19315 Candidates.emplace_back(A, B0);
19316if (B1 && B1->getParent() ==P && !R.isDeleted(B1))
19317 Candidates.emplace_back(A, B1);
19318 }
19319// Try to skip A.
19320if (B &&A &&A->hasOneUse()) {
19321auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
19322auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
19323if (A0 && A0->getParent() ==P && !R.isDeleted(A0))
19324 Candidates.emplace_back(A0,B);
19325if (A1 && A1->getParent() ==P && !R.isDeleted(A1))
19326 Candidates.emplace_back(A1,B);
19327 }
19328
19329if (Candidates.size() == 1)
19330return tryToVectorizeList({Op0, Op1},R);
19331
19332// We have multiple options. Try to pick the single best.
19333 std::optional<int> BestCandidate =R.findBestRootPair(Candidates);
19334if (!BestCandidate)
19335returnfalse;
19336return tryToVectorizeList(
19337 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},R);
19338}
19339
19340namespace{
19341
19342/// Model horizontal reductions.
19343///
19344/// A horizontal reduction is a tree of reduction instructions that has values
19345/// that can be put into a vector as its leaves. For example:
19346///
19347/// mul mul mul mul
19348/// \ / \ /
19349/// + +
19350/// \ /
19351/// +
19352/// This tree has "mul" as its leaf values and "+" as its reduction
19353/// instructions. A reduction can feed into a store or a binary operation
19354/// feeding a phi.
19355/// ...
19356/// \ /
19357/// +
19358/// |
19359/// phi +=
19360///
19361/// Or:
19362/// ...
19363/// \ /
19364/// +
19365/// |
19366/// *p =
19367///
19368classHorizontalReduction {
19369usingReductionOpsType =SmallVector<Value *, 16>;
19370usingReductionOpsListType =SmallVector<ReductionOpsType, 2>;
19371 ReductionOpsListType ReductionOps;
19372 /// List of possibly reduced values.
19373SmallVector<SmallVector<Value *>> ReducedVals;
19374 /// Maps reduced value to the corresponding reduction operation.
19375SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
19376WeakTrackingVH ReductionRoot;
19377 /// The type of reduction operation.
19378RecurKind RdxKind;
19379 /// Checks if the optimization of original scalar identity operations on
19380 /// matched horizontal reductions is enabled and allowed.
19381bool IsSupportedHorRdxIdentityOp =false;
19382
  static bool isCmpSelMinMax(Instruction *I) {
    return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  // And/or are potentially poison-safe logical patterns like:
  // select x, y, false
  // select x, true, y
  static bool isBoolLogicOp(Instruction *I) {
    return isa<SelectInst>(I) &&
           (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
  }
19395
  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;

    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not
      // have to rule out -0.0 here because the intrinsic semantics do not
      // specify a fixed result for it.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }
19418
19419staticValue *getRdxOperand(Instruction *I,unsignedIndex) {
19420// Poison-safe 'or' takes the form: select X, true, Y
19421// To make that work with the normal operand processing, we skip the
19422// true value operand.
19423// TODO: Change the code and data structures to handle this without a hack.
19424if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) &&Index == 1)
19425returnI->getOperand(2);
19426returnI->getOperand(Index);
19427 }
19428
19429 /// Creates reduction operation with the current opcode.
19430staticValue *createOp(IRBuilderBase &Builder,RecurKind Kind,Value *LHS,
19431Value *RHS,constTwine &Name,bool UseSelect) {
19432switch (Kind) {
19433case RecurKind::Or: {
19434if (UseSelect &&
19435LHS->getType() ==CmpInst::makeCmpResultType(LHS->getType()))
19436return Builder.CreateSelect(LHS, Builder.getTrue(),RHS,Name);
19437unsigned RdxOpcode =RecurrenceDescriptor::getOpcode(Kind);
19438return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode,LHS,RHS,
19439Name);
19440 }
19441case RecurKind::And: {
19442if (UseSelect &&
19443LHS->getType() ==CmpInst::makeCmpResultType(LHS->getType()))
19444return Builder.CreateSelect(LHS,RHS, Builder.getFalse(),Name);
19445unsigned RdxOpcode =RecurrenceDescriptor::getOpcode(Kind);
19446return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode,LHS,RHS,
19447Name);
19448 }
19449case RecurKind::Add:
19450case RecurKind::Mul:
19451case RecurKind::Xor:
19452case RecurKind::FAdd:
19453case RecurKind::FMul: {
19454unsigned RdxOpcode =RecurrenceDescriptor::getOpcode(Kind);
19455return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode,LHS,RHS,
19456Name);
19457 }
19458case RecurKind::SMax:
19459case RecurKind::SMin:
19460case RecurKind::UMax:
19461case RecurKind::UMin:
19462if (UseSelect) {
19463CmpInst::Predicate Pred =llvm::getMinMaxReductionPredicate(Kind);
19464Value *Cmp = Builder.CreateICmp(Pred,LHS,RHS,Name);
19465return Builder.CreateSelect(Cmp,LHS,RHS,Name);
19466 }
19467 [[fallthrough]];
19468case RecurKind::FMax:
19469case RecurKind::FMin:
19470case RecurKind::FMaximum:
19471case RecurKind::FMinimum: {
19472Intrinsic::IDId =llvm::getMinMaxReductionIntrinsicOp(Kind);
19473return Builder.CreateBinaryIntrinsic(Id,LHS,RHS);
19474 }
19475default:
19476llvm_unreachable("Unknown reduction operation.");
19477 }
19478 }
19479
19480 /// Creates reduction operation with the current opcode with the IR flags
19481 /// from \p ReductionOps, dropping nuw/nsw flags.
19482staticValue *createOp(IRBuilderBase &Builder,RecurKind RdxKind,Value *LHS,
19483Value *RHS,constTwine &Name,
19484const ReductionOpsListType &ReductionOps) {
19485bool UseSelect = ReductionOps.size() == 2 ||
19486// Logical or/and.
19487 (ReductionOps.size() == 1 &&
19488any_of(ReductionOps.front(), IsaPred<SelectInst>));
19489assert((!UseSelect || ReductionOps.size() != 2 ||
19490 isa<SelectInst>(ReductionOps[1][0])) &&
19491"Expected cmp + select pairs for reduction");
19492Value *Op = createOp(Builder, RdxKind,LHS,RHS,Name, UseSelect);
19493if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
19494if (auto *Sel = dyn_cast<SelectInst>(Op)) {
19495propagateIRFlags(Sel->getCondition(), ReductionOps[0],nullptr,
19496/*IncludeWrapFlags=*/false);
19497propagateIRFlags(Op, ReductionOps[1],nullptr,
19498/*IncludeWrapFlags=*/false);
19499returnOp;
19500 }
19501 }
19502propagateIRFlags(Op, ReductionOps[0],nullptr,/*IncludeWrapFlags=*/false);
19503returnOp;
19504 }
19505
19506public:
19507staticRecurKindgetRdxKind(Value *V) {
19508auto *I = dyn_cast<Instruction>(V);
19509if (!I)
19510return RecurKind::None;
19511if (match(I,m_Add(m_Value(),m_Value())))
19512return RecurKind::Add;
19513if (match(I,m_Mul(m_Value(),m_Value())))
19514return RecurKind::Mul;
19515if (match(I,m_And(m_Value(),m_Value())) ||
19516match(I,m_LogicalAnd(m_Value(),m_Value())))
19517return RecurKind::And;
19518if (match(I,m_Or(m_Value(),m_Value())) ||
19519match(I,m_LogicalOr(m_Value(),m_Value())))
19520return RecurKind::Or;
19521if (match(I,m_Xor(m_Value(),m_Value())))
19522return RecurKind::Xor;
19523if (match(I,m_FAdd(m_Value(),m_Value())))
19524return RecurKind::FAdd;
19525if (match(I,m_FMul(m_Value(),m_Value())))
19526return RecurKind::FMul;
19527
19528if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(),m_Value())))
19529return RecurKind::FMax;
19530if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(),m_Value())))
19531return RecurKind::FMin;
19532
19533if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(),m_Value())))
19534return RecurKind::FMaximum;
19535if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(),m_Value())))
19536return RecurKind::FMinimum;
19537// This matches either cmp+select or intrinsics. SLP is expected to handle
19538// either form.
19539// TODO: If we are canonicalizing to intrinsics, we can remove several
19540// special-case paths that deal with selects.
19541if (match(I,m_SMax(m_Value(),m_Value())))
19542return RecurKind::SMax;
19543if (match(I,m_SMin(m_Value(),m_Value())))
19544return RecurKind::SMin;
19545if (match(I,m_UMax(m_Value(),m_Value())))
19546return RecurKind::UMax;
19547if (match(I,m_UMin(m_Value(),m_Value())))
19548return RecurKind::UMin;
19549
19550if (auto *Select = dyn_cast<SelectInst>(I)) {
19551// Try harder: look for min/max pattern based on instructions producing
19552// same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
19553// During the intermediate stages of SLP, it's very common to have
19554// pattern like this (since optimizeGatherSequence is run only once
19555// at the end):
19556// %1 = extractelement <2 x i32> %a, i32 0
19557// %2 = extractelement <2 x i32> %a, i32 1
19558// %cond = icmp sgt i32 %1, %2
19559// %3 = extractelement <2 x i32> %a, i32 0
19560// %4 = extractelement <2 x i32> %a, i32 1
19561// %select = select i1 %cond, i32 %3, i32 %4
19562CmpPredicate Pred;
19563Instruction *L1;
19564Instruction *L2;
19565
19566Value *LHS =Select->getTrueValue();
19567Value *RHS =Select->getFalseValue();
19568Value *Cond =Select->getCondition();
19569
19570// TODO: Support inverse predicates.
19571if (match(Cond,m_Cmp(Pred,m_Specific(LHS),m_Instruction(L2)))) {
19572if (!isa<ExtractElementInst>(RHS) ||
19573 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19574return RecurKind::None;
19575 }elseif (match(Cond,m_Cmp(Pred,m_Instruction(L1),m_Specific(RHS)))) {
19576if (!isa<ExtractElementInst>(LHS) ||
19577 !L1->isIdenticalTo(cast<Instruction>(LHS)))
19578return RecurKind::None;
19579 }else {
19580if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
19581return RecurKind::None;
19582if (!match(Cond,m_Cmp(Pred,m_Instruction(L1),m_Instruction(L2))) ||
19583 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
19584 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19585return RecurKind::None;
19586 }
19587
19588switch (Pred) {
19589default:
19590return RecurKind::None;
19591caseCmpInst::ICMP_SGT:
19592caseCmpInst::ICMP_SGE:
19593return RecurKind::SMax;
19594caseCmpInst::ICMP_SLT:
19595caseCmpInst::ICMP_SLE:
19596return RecurKind::SMin;
19597caseCmpInst::ICMP_UGT:
19598caseCmpInst::ICMP_UGE:
19599return RecurKind::UMax;
19600caseCmpInst::ICMP_ULT:
19601caseCmpInst::ICMP_ULE:
19602return RecurKind::UMin;
19603 }
19604 }
19605return RecurKind::None;
19606 }
19607
19608 /// Get the index of the first operand.
19609staticunsigned getFirstOperandIndex(Instruction *I) {
19610return isCmpSelMinMax(I) ? 1 : 0;
19611 }
19612
19613private:
19614 /// Total number of operands in the reduction operation.
19615staticunsigned getNumberOfOperands(Instruction *I) {
19616return isCmpSelMinMax(I) ? 3 : 2;
19617 }
19618
19619 /// Checks if the instruction is in basic block \p BB.
19620 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
19621staticbool hasSameParent(Instruction *I,BasicBlock *BB) {
19622if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
19623auto *Sel = cast<SelectInst>(I);
19624auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
19625return Sel->getParent() == BB &&Cmp &&Cmp->getParent() == BB;
19626 }
19627returnI->getParent() == BB;
19628 }
19629
19630 /// Expected number of uses for reduction operations/reduced values.
19631staticbool hasRequiredNumberOfUses(bool IsCmpSelMinMax,Instruction *I) {
19632if (IsCmpSelMinMax) {
19633// SelectInst must be used twice while the condition op must have single
19634// use only.
19635if (auto *Sel = dyn_cast<SelectInst>(I))
19636return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
19637returnI->hasNUses(2);
19638 }
19639
19640// Arithmetic reduction operation must be used once only.
19641returnI->hasOneUse();
19642 }
19643
19644 /// Initializes the list of reduction operations.
19645void initReductionOps(Instruction *I) {
19646if (isCmpSelMinMax(I))
19647 ReductionOps.assign(2, ReductionOpsType());
19648else
19649 ReductionOps.assign(1, ReductionOpsType());
19650 }
19651
19652 /// Add all reduction operations for the reduction instruction \p I.
19653void addReductionOps(Instruction *I) {
19654if (isCmpSelMinMax(I)) {
19655 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
19656 ReductionOps[1].emplace_back(I);
19657 }else {
19658 ReductionOps[0].emplace_back(I);
19659 }
19660 }
19661
19662staticbool isGoodForReduction(ArrayRef<Value *> Data) {
19663int Sz = Data.size();
19664auto *I = dyn_cast<Instruction>(Data.front());
19665return Sz > 1 ||isConstant(Data.front()) ||
19666 (I && !isa<LoadInst>(I) &&isValidForAlternation(I->getOpcode()));
19667 }
19668
19669public:
19670HorizontalReduction() =default;
19671
19672 /// Try to find a reduction tree.
19673bool matchAssociativeReduction(BoUpSLP &R,Instruction *Root,
19674ScalarEvolution &SE,constDataLayout &DL,
19675constTargetLibraryInfo &TLI) {
19676 RdxKind = HorizontalReduction::getRdxKind(Root);
19677if (!isVectorizable(RdxKind, Root))
19678returnfalse;
19679
19680// Analyze "regular" integer/FP types for reductions - no target-specific
19681// types or pointers.
19682Type *Ty = Root->getType();
19683if (!isValidElementType(Ty) || Ty->isPointerTy())
19684returnfalse;
19685
19686// Though the ultimate reduction may have multiple uses, its condition must
19687// have only single use.
19688if (auto *Sel = dyn_cast<SelectInst>(Root))
19689if (!Sel->getCondition()->hasOneUse())
19690returnfalse;
19691
19692 ReductionRoot = Root;
19693
19694// Iterate through all the operands of the possible reduction tree and
19695// gather all the reduced values, sorting them by their value id.
19696BasicBlock *BB = Root->getParent();
19697bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19698SmallVector<std::pair<Instruction *, unsigned>> Worklist(
19699 1, std::make_pair(Root, 0));
19700// Checks if the operands of the \p TreeN instruction are also reduction
19701// operations or should be treated as reduced values or an extra argument,
19702// which is not part of the reduction.
19703auto CheckOperands = [&](Instruction *TreeN,
19704SmallVectorImpl<Value *> &PossibleReducedVals,
19705SmallVectorImpl<Instruction *> &ReductionOps,
19706unsigned Level) {
19707for (intI :reverse(seq<int>(getFirstOperandIndex(TreeN),
19708 getNumberOfOperands(TreeN)))) {
19709Value *EdgeVal = getRdxOperand(TreeN,I);
19710 ReducedValsToOps[EdgeVal].push_back(TreeN);
19711auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19712// If the edge is not an instruction, or it is different from the main
19713// reduction opcode or has too many uses - possible reduced value.
19714// Also, do not try to reduce const values, if the operation is not
19715// foldable.
19716if (!EdgeInst || Level >RecursionMaxDepth ||
19717getRdxKind(EdgeInst) != RdxKind ||
19718 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19719 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19720 !isVectorizable(RdxKind, EdgeInst) ||
19721 (R.isAnalyzedReductionRoot(EdgeInst) &&
19722all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19723 PossibleReducedVals.push_back(EdgeVal);
19724continue;
19725 }
19726 ReductionOps.push_back(EdgeInst);
19727 }
19728 };
    // Try to regroup the reduced values so that it becomes more profitable to
    // reduce them. Values are grouped by their value ids, instructions by
    // their opcode and/or alternate opcode, plus extra analysis is done for
    // loads (grouping them by the distance between pointers) and cmp
    // instructions (grouping them by the predicate).
19734SmallMapVector<
19735 size_t,SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
19736 8>
19737 PossibleReducedVals;
19738 initReductionOps(Root);
19739DenseMap<std::pair<size_t, Value *>,SmallVector<LoadInst *>> LoadsMap;
19740SmallSet<size_t, 2> LoadKeyUsed;
19741
19742auto GenerateLoadsSubkey = [&](size_tKey,LoadInst *LI) {
19743Key =hash_combine(hash_value(LI->getParent()), Key);
19744Value *Ptr =
19745getUnderlyingObject(LI->getPointerOperand(),RecursionMaxDepth);
19746if (!LoadKeyUsed.insert(Key).second) {
19747auto LIt = LoadsMap.find(std::make_pair(Key,Ptr));
19748if (LIt != LoadsMap.end()) {
19749for (LoadInst *RLI : LIt->second) {
19750if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
19751 LI->getType(), LI->getPointerOperand(),DL, SE,
19752/*StrictCheck=*/true))
19753returnhash_value(RLI->getPointerOperand());
19754 }
19755for (LoadInst *RLI : LIt->second) {
19756if (arePointersCompatible(RLI->getPointerOperand(),
19757 LI->getPointerOperand(), TLI)) {
19758hash_code SubKey =hash_value(RLI->getPointerOperand());
19759return SubKey;
19760 }
19761 }
19762if (LIt->second.size() > 2) {
19763hash_code SubKey =
19764hash_value(LIt->second.back()->getPointerOperand());
19765return SubKey;
19766 }
19767 }
19768 }
19769 LoadsMap.try_emplace(std::make_pair(Key,Ptr))
19770 .first->second.push_back(LI);
19771returnhash_value(LI->getPointerOperand());
19772 };
19773
19774while (!Worklist.empty()) {
19775auto [TreeN, Level] = Worklist.pop_back_val();
19776SmallVector<Value *> PossibleRedVals;
19777SmallVector<Instruction *> PossibleReductionOps;
19778 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19779 addReductionOps(TreeN);
19780// Add reduction values. The values are sorted for better vectorization
19781// results.
19782for (Value *V : PossibleRedVals) {
19783size_tKey,Idx;
19784 std::tie(Key,Idx) =generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
19785/*AllowAlternate=*/false);
19786 ++PossibleReducedVals[Key][Idx]
19787 .insert(std::make_pair(V, 0))
19788 .first->second;
19789 }
19790for (Instruction *I :reverse(PossibleReductionOps))
19791 Worklist.emplace_back(I,I->getParent() == BB ? 0 : Level + 1);
19792 }
19793auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible sequences of reduced values.
19796for (auto &PossibleReducedVals : PossibleReducedValsVect) {
19797auto PossibleRedVals = PossibleReducedVals.second.takeVector();
19798SmallVector<SmallVector<Value *>> PossibleRedValsVect;
19799for (auto It = PossibleRedVals.begin(),E = PossibleRedVals.end();
19800 It !=E; ++It) {
19801 PossibleRedValsVect.emplace_back();
19802auto RedValsVect = It->second.takeVector();
19803stable_sort(RedValsVect,llvm::less_second());
19804for (const std::pair<Value *, unsigned> &Data : RedValsVect)
19805 PossibleRedValsVect.back().append(Data.second, Data.first);
19806 }
19807stable_sort(PossibleRedValsVect, [](constauto &P1,constauto &P2) {
19808returnP1.size() > P2.size();
19809 });
19810int NewIdx = -1;
19811for (ArrayRef<Value *> Data : PossibleRedValsVect) {
19812if (NewIdx < 0 ||
19813 (!isGoodForReduction(Data) &&
19814 (!isa<LoadInst>(Data.front()) ||
19815 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19816getUnderlyingObject(
19817 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19818getUnderlyingObject(
19819 cast<LoadInst>(ReducedVals[NewIdx].front())
19820 ->getPointerOperand())))) {
19821 NewIdx = ReducedVals.size();
19822 ReducedVals.emplace_back();
19823 }
19824 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
19825 }
19826 }
19827// Sort the reduced values by number of same/alternate opcode and/or pointer
19828// operand.
19829stable_sort(ReducedVals, [](ArrayRef<Value *> P1,ArrayRef<Value *> P2) {
19830returnP1.size() > P2.size();
19831 });
19832returntrue;
19833 }
19834
19835 /// Attempt to vectorize the tree found by matchAssociativeReduction.
19836Value *tryToReduce(BoUpSLP &V,constDataLayout &DL,TargetTransformInfo *TTI,
19837constTargetLibraryInfo &TLI,AssumptionCache *AC) {
    const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
19841// If there are a sufficient number of reduction values, reduce
19842// to a nearby power-of-2. We can safely generate oversized
19843// vectors and rely on the backend to split them to legal sizes.
19844if (unsigned NumReducedVals = std::accumulate(
19845 ReducedVals.begin(), ReducedVals.end(), 0,
19846 [](unsigned Num,ArrayRef<Value *> Vals) ->unsigned {
19847 if (!isGoodForReduction(Vals))
19848 return Num;
19849 return Num + Vals.size();
19850 });
19851 NumReducedVals < ReductionLimit &&
19852all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
19853return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
19854 })) {
19855for (ReductionOpsType &RdxOps : ReductionOps)
19856for (Value *RdxOp : RdxOps)
19857V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19858returnnullptr;
19859 }
19860
19861IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
19862TargetFolder(DL));
19863 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
19864
    // Track the reduced values in case they are replaced by extractelement
    // instructions because of the vectorization.
19867DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
19868 ReducedVals.front().size());
19869
19870// The compare instruction of a min/max is the insertion point for new
19871// instructions and may be replaced with a new compare instruction.
19872auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
19873assert(isa<SelectInst>(RdxRootInst) &&
19874"Expected min/max reduction to have select root instruction");
19875Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19876assert(isa<Instruction>(ScalarCond) &&
19877"Expected min/max reduction to have compare condition");
19878return cast<Instruction>(ScalarCond);
19879 };
19880
19881bool AnyBoolLogicOp =any_of(ReductionOps.back(), [](Value *V) {
19882 return isBoolLogicOp(cast<Instruction>(V));
19883 });
19884// Return new VectorizedTree, based on previous value.
19885auto GetNewVectorizedTree = [&](Value *VectorizedTree,Value *Res) {
19886if (VectorizedTree) {
19887// Update the final value in the reduction.
19888 Builder.SetCurrentDebugLocation(
19889 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19890if (AnyBoolLogicOp) {
19891auto It = ReducedValsToOps.find(VectorizedTree);
19892auto It1 = ReducedValsToOps.find(Res);
19893if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
19894isGuaranteedNotToBePoison(VectorizedTree, AC) ||
19895 (It != ReducedValsToOps.end() &&
19896any_of(It->getSecond(), [&](Instruction *I) {
19897 return isBoolLogicOp(I) &&
19898 getRdxOperand(I, 0) == VectorizedTree;
19899 }))) {
19900 ;
19901 }elseif (isGuaranteedNotToBePoison(Res, AC) ||
19902 (It1 != ReducedValsToOps.end() &&
19903any_of(It1->getSecond(), [&](Instruction *I) {
19904 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19905 }))) {
19906std::swap(VectorizedTree, Res);
19907 }else {
19908 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
19909 }
19910 }
19911
19912return createOp(Builder, RdxKind, VectorizedTree, Res,"op.rdx",
19913 ReductionOps);
19914 }
19915// Initialize the final value in the reduction.
19916return Res;
19917 };
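    // Collect all reduction operations: they are internal to the reduction and
    // must be ignored while building the vectorization trees for the reduced
    // values.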
19918SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
19919 ReductionOps.front().size());
19920for (ReductionOpsType &RdxOps : ReductionOps)
19921for (Value *RdxOp : RdxOps) {
19922if (!RdxOp)
19923continue;
19924 IgnoreList.insert(RdxOp);
19925 }
19926// Intersect the fast-math-flags from all reduction operations.
19927FastMathFlags RdxFMF;
19928 RdxFMF.set();
19929for (Value *U : IgnoreList)
19930if (auto *FPMO = dyn_cast<FPMathOperator>(U))
19931 RdxFMF &= FPMO->getFastMathFlags();
19932bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19933
19934// Need to track reduced vals, they may be changed during vectorization of
19935// subvectors.
19936for (ArrayRef<Value *> Candidates : ReducedVals)
19937for (Value *V : Candidates)
19938 TrackedVals.try_emplace(V, V);
19939
19940auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
19941Value *V) ->unsigned & {
19942auto *It = MV.find(V);
19943assert(It != MV.end() &&"Unable to find given key.");
19944return It->second;
19945 };
19946
19947DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
19948// List of the values that were reduced in other trees as part of gather
19949// nodes and thus requiring extract if fully vectorized in other trees.
19950SmallPtrSet<Value *, 4> RequiredExtract;
19951WeakTrackingVH VectorizedTree =nullptr;
19952bool CheckForReusedReductionOps =false;
19953// Try to vectorize elements based on their type.
19954SmallVector<InstructionsState> States;
19955for (ArrayRef<Value *> RV : ReducedVals)
19956 States.push_back(getSameOpcode(RV, TLI));
19957for (unsignedI = 0,E = ReducedVals.size();I <E; ++I) {
19958ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
19959 InstructionsState S = States[I];
19960SmallVector<Value *> Candidates;
19961 Candidates.reserve(2 * OrigReducedVals.size());
19962DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
19963for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19964Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
        // Check whether the reduction value was overridden by an
        // extractelement instruction because of the vectorization; exclude it
        // if it is not compatible with the other values.
        // Also check if the instruction was folded to a constant/other value.
19969auto *Inst = dyn_cast<Instruction>(RdxVal);
19970if ((Inst &&isVectorLikeInstWithConstOps(Inst) &&
19971 (!S || !S.isOpcodeOrAlt(Inst))) ||
19972 (S && !Inst))
19973continue;
19974 Candidates.push_back(RdxVal);
19975 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19976 }
19977bool ShuffledExtracts =false;
19978// Try to handle shuffled extractelements.
19979if (S && S.getOpcode() == Instruction::ExtractElement &&
19980 !S.isAltShuffle() &&I + 1 <E) {
19981SmallVector<Value *> CommonCandidates(Candidates);
19982for (Value *RV : ReducedVals[I + 1]) {
19983Value *RdxVal = TrackedVals.at(RV);
          // Check whether the reduction value was overridden by an
          // extractelement instruction because of the vectorization; exclude
          // it if it is not compatible with the other values.
19987auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19988if (!Inst)
19989continue;
19990 CommonCandidates.push_back(RdxVal);
19991 TrackedToOrig.try_emplace(RdxVal, RV);
19992 }
19993SmallVector<int>Mask;
19994if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
19995 ++I;
19996 Candidates.swap(CommonCandidates);
19997 ShuffledExtracts =true;
19998 }
19999 }
20000
20001// Emit code for constant values.
20002if (Candidates.size() > 1 &&allConstant(Candidates)) {
20003Value *Res = Candidates.front();
20004Value *OrigV = TrackedToOrig.at(Candidates.front());
20005 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20006for (Value *VC :ArrayRef(Candidates).drop_front()) {
20007 Res = createOp(Builder, RdxKind, Res, VC,"const.rdx", ReductionOps);
20008Value *OrigV = TrackedToOrig.at(VC);
20009 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20010if (auto *ResI = dyn_cast<Instruction>(Res))
20011V.analyzedReductionRoot(ResI);
20012 }
20013 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
20014continue;
20015 }
20016
20017unsigned NumReducedVals = Candidates.size();
20018if (NumReducedVals < ReductionLimit &&
20019 (NumReducedVals < 2 || !isSplat(Candidates)))
20020continue;
20021
20022// Check if we support repeated scalar values processing (optimization of
20023// original scalar identity operations on matched horizontal reductions).
20024 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20025 RdxKind != RecurKind::FMul &&
20026 RdxKind != RecurKind::FMulAdd;
20027// Gather same values.
20028SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
20029if (IsSupportedHorRdxIdentityOp)
20030for (Value *V : Candidates) {
20031Value *OrigV = TrackedToOrig.at(V);
20032 ++SameValuesCounter.try_emplace(OrigV).first->second;
20033 }
      // Used to check if the reduced values are used the same number of
      // times. In this case the compiler may produce better code. E.g. if the
      // reduced values are aabbccdd (8 x values), then the first node of the
      // tree will have a node for 4 x abcd + shuffle <4 x abcd>,
      // <0, 0, 1, 1, 2, 2, 3, 3>.
      // Plus, the final reduction will be performed on <8 x aabbccdd>.
      // Instead, the compiler may build a <4 x abcd> tree immediately, +
      // reduction (4 x abcd) * 2.
      // Currently it only handles add/fadd/xor. and/or/min/max do not require
      // this analysis; other operations may require an extra estimation of
      // the profitability.
20044bool SameScaleFactor =false;
20045bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20046 SameValuesCounter.size() != Candidates.size();
20047BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
20048if (OptReusedScalars) {
20049 SameScaleFactor =
20050 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20051 RdxKind == RecurKind::Xor) &&
20052all_of(drop_begin(SameValuesCounter),
20053 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
20054returnP.second == SameValuesCounter.front().second;
20055 });
20056 Candidates.resize(SameValuesCounter.size());
20057transform(SameValuesCounter, Candidates.begin(),
20058 [&](constauto &P) { return TrackedVals.at(P.first); });
20059 NumReducedVals = Candidates.size();
20060// Have a reduction of the same element.
20061if (NumReducedVals == 1) {
20062Value *OrigV = TrackedToOrig.at(Candidates.front());
20063unsigned Cnt = At(SameValuesCounter, OrigV);
20064Value *RedVal =
20065 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20066 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20067 VectorizedVals.try_emplace(OrigV, Cnt);
20068 ExternallyUsedValues.insert(OrigV);
20069continue;
20070 }
20071 }
20072
20073unsigned MaxVecRegSize =V.getMaxVecRegSize();
20074unsigned EltSize =V.getVectorElementSize(Candidates[0]);
20075constunsigned MaxElts = std::clamp<unsigned>(
20076llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
20077 RegMaxNumber * RedValsMaxNumber);
20078
20079unsigned ReduxWidth = NumReducedVals;
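      // Clamps the vectorization factor so that the widened vector type still
      // fits into the available vector registers (based on the number of
      // register parts it is split into).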
20080auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
20081unsigned NumParts, NumRegs;
20082Type *ScalarTy = Candidates.front()->getType();
20083 ReduxWidth =
20084getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
20085VectorType *Tp =getWidenedType(ScalarTy, ReduxWidth);
20086 NumParts =::getNumberOfParts(TTI, Tp);
20087 NumRegs =
20088TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20089while (NumParts > NumRegs) {
20090assert(ReduxWidth > 0 &&"ReduxWidth is unexpectedly 0.");
20091 ReduxWidth =bit_floor(ReduxWidth - 1);
20092VectorType *Tp =getWidenedType(ScalarTy, ReduxWidth);
20093 NumParts =::getNumberOfParts(TTI, Tp);
20094 NumRegs =
20095TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20096 }
20097if (NumParts > NumRegs / 2)
20098 ReduxWidth =bit_floor(ReduxWidth);
20099return ReduxWidth;
20100 };
20101if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
20102 ReduxWidth = GetVectorFactor(ReduxWidth);
20103 ReduxWidth = std::min(ReduxWidth, MaxElts);
20104
20105unsigned Start = 0;
20106unsigned Pos = Start;
20107// Restarts vectorization attempt with lower vector factor.
20108unsigned PrevReduxWidth = ReduxWidth;
20109bool CheckForReusedReductionOpsLocal =false;
20110auto AdjustReducedVals = [&](bool IgnoreVL =false) {
20111bool IsAnyRedOpGathered = !IgnoreVL &&V.isAnyGathered(IgnoreList);
20112if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, it is worth
          // trying again with a smaller number of reduction ops.
20115 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20116 }
20117 ++Pos;
20118if (Pos < NumReducedVals - ReduxWidth + 1)
20119return IsAnyRedOpGathered;
20120 Pos = Start;
20121 --ReduxWidth;
20122if (ReduxWidth > 1)
20123 ReduxWidth = GetVectorFactor(ReduxWidth);
20124return IsAnyRedOpGathered;
20125 };
20126bool AnyVectorized =false;
20127SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
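      // Try to vectorize windows of ReduxWidth reduced values, shifting the
      // start position and/or lowering the width after each failed attempt.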
20128while (Pos < NumReducedVals - ReduxWidth + 1 &&
20129 ReduxWidth >= ReductionLimit) {
20130// Dependency in tree of the reduction ops - drop this attempt, try
20131// later.
20132if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20133 Start == 0) {
20134 CheckForReusedReductionOps =true;
20135break;
20136 }
20137 PrevReduxWidth = ReduxWidth;
20138ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
20139// Been analyzed already - skip.
20140if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
20141 (!has_single_bit(ReduxWidth) &&
20142 (IgnoredCandidates.contains(
20143 std::make_pair(Pos,bit_floor(ReduxWidth))) ||
20144 IgnoredCandidates.contains(
20145 std::make_pair(Pos + (ReduxWidth -bit_floor(ReduxWidth)),
20146bit_floor(ReduxWidth))))) ||
20147V.areAnalyzedReductionVals(VL)) {
20148 (void)AdjustReducedVals(/*IgnoreVL=*/true);
20149continue;
20150 }
20151// Early exit if any of the reduction values were deleted during
20152// previous vectorization attempts.
20153if (any_of(VL, [&V](Value *RedVal) {
20154auto *RedValI = dyn_cast<Instruction>(RedVal);
20155if (!RedValI)
20156returnfalse;
20157returnV.isDeleted(RedValI);
20158 }))
20159break;
20160V.buildTree(VL, IgnoreList);
20161if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
20162if (!AdjustReducedVals())
20163V.analyzedReductionVals(VL);
20164continue;
20165 }
20166if (V.isLoadCombineReductionCandidate(RdxKind)) {
20167if (!AdjustReducedVals())
20168V.analyzedReductionVals(VL);
20169continue;
20170 }
20171V.reorderTopToBottom();
20172// No need to reorder the root node at all.
20173V.reorderBottomToTop(/*IgnoreReorder=*/true);
20174// Keep extracted other reduction values, if they are used in the
20175// vectorization trees.
20176BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
20177 ExternallyUsedValues);
20178// The reduction root is used as the insertion point for new
20179// instructions, so set it as externally used to prevent it from being
20180// deleted.
20181 LocalExternallyUsedValues.insert(ReductionRoot);
20182for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
20183if (Cnt ==I || (ShuffledExtracts && Cnt ==I - 1))
20184continue;
20185for (Value *V : ReducedVals[Cnt])
20186if (isa<Instruction>(V))
20187 LocalExternallyUsedValues.insert(TrackedVals[V]);
20188 }
20189if (!IsSupportedHorRdxIdentityOp) {
20190// Number of uses of the candidates in the vector of values.
20191assert(SameValuesCounter.empty() &&
20192"Reused values counter map is not empty");
20193for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20194if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20195continue;
20196Value *V = Candidates[Cnt];
20197Value *OrigV = TrackedToOrig.at(V);
20198 ++SameValuesCounter.try_emplace(OrigV).first->second;
20199 }
20200 }
20201V.transformNodes();
20202SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
20203// Gather externally used values.
20204SmallPtrSet<Value *, 4> Visited;
20205for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20206if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20207continue;
20208Value *RdxVal = Candidates[Cnt];
20209if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20210 RdxVal = It->second;
20211if (!Visited.insert(RdxVal).second)
20212continue;
20213// Check if the scalar was vectorized as part of the vectorization
20214// tree but not the top node.
20215if (!VLScalars.contains(RdxVal) &&V.isVectorized(RdxVal)) {
20216 LocalExternallyUsedValues.insert(RdxVal);
20217continue;
20218 }
20219Value *OrigV = TrackedToOrig.at(RdxVal);
20220unsigned NumOps =
20221 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20222if (NumOps != ReducedValsToOps.at(OrigV).size())
20223 LocalExternallyUsedValues.insert(RdxVal);
20224 }
20225// Do not need the list of reused scalars in regular mode anymore.
20226if (!IsSupportedHorRdxIdentityOp)
20227 SameValuesCounter.clear();
20228for (Value *RdxVal : VL)
20229if (RequiredExtract.contains(RdxVal))
20230 LocalExternallyUsedValues.insert(RdxVal);
20231V.buildExternalUses(LocalExternallyUsedValues);
20232
20233V.computeMinimumValueSizes();
20234
20235// Estimate cost.
20236InstructionCost TreeCost =V.getTreeCost(VL);
20237InstructionCost ReductionCost =
20238 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20239InstructionCostCost = TreeCost + ReductionCost;
20240LLVM_DEBUG(dbgs() <<"SLP: Found cost = " <<Cost
20241 <<" for reduction\n");
20242if (!Cost.isValid())
20243break;
20244if (Cost >= -SLPCostThreshold) {
20245V.getORE()->emit([&]() {
20246returnOptimizationRemarkMissed(SV_NAME,"HorSLPNotBeneficial",
20247 ReducedValsToOps.at(VL[0]).front())
20248 <<"Vectorizing horizontal reduction is possible "
20249 <<"but not beneficial with cost " <<ore::NV("Cost",Cost)
20250 <<" and threshold "
20251 <<ore::NV("Threshold", -SLPCostThreshold);
20252 });
20253if (!AdjustReducedVals()) {
20254V.analyzedReductionVals(VL);
20255unsignedOffset = Pos == Start ? Pos : Pos - 1;
20256if (ReduxWidth > ReductionLimit &&V.isTreeNotExtendable()) {
20257// Add subvectors of VL to the list of the analyzed values.
20258for (unsigned VF =getFloorFullVectorNumberOfElements(
20259 *TTI, VL.front()->getType(), ReduxWidth - 1);
20260 VF >= ReductionLimit;
20261 VF =getFloorFullVectorNumberOfElements(
20262 *TTI, VL.front()->getType(), VF - 1)) {
20263if (has_single_bit(VF) &&
20264V.getCanonicalGraphSize() !=V.getTreeSize())
20265continue;
20266for (unsignedIdx : seq<unsigned>(ReduxWidth - VF))
20267 IgnoredCandidates.insert(std::make_pair(Offset +Idx, VF));
20268 }
20269 }
20270 }
20271continue;
20272 }
20273
20274LLVM_DEBUG(dbgs() <<"SLP: Vectorizing horizontal reduction at cost:"
20275 <<Cost <<". (HorRdx)\n");
20276V.getORE()->emit([&]() {
20277returnOptimizationRemark(SV_NAME,"VectorizedHorizontalReduction",
20278 ReducedValsToOps.at(VL[0]).front())
20279 <<"Vectorized horizontal reduction with cost "
20280 <<ore::NV("Cost",Cost) <<" and with tree size "
20281 <<ore::NV("TreeSize",V.getTreeSize());
20282 });
20283
20284 Builder.setFastMathFlags(RdxFMF);
20285
20286// Emit a reduction. If the root is a select (min/max idiom), the insert
20287// point is the compare condition of that select.
20288Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20289Instruction *InsertPt = RdxRootInst;
20290if (IsCmpSelMinMax)
20291 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20292
20293// Vectorize a tree.
20294Value *VectorizedRoot =
20295V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20296// Update TrackedToOrig mapping, since the tracked values might be
20297// updated.
20298for (Value *RdxVal : Candidates) {
20299Value *OrigVal = TrackedToOrig.at(RdxVal);
20300Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20301if (TransformedRdxVal != RdxVal)
20302 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20303 }
20304
20305 Builder.SetInsertPoint(InsertPt);
20306
20307// To prevent poison from leaking across what used to be sequential,
20308// safe, scalar boolean logic operations, the reduction operand must be
20309// frozen.
20310if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
20311 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20312
20313// Emit code to correctly handle reused reduced values, if required.
20314if (OptReusedScalars && !SameScaleFactor) {
20315 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20316 SameValuesCounter, TrackedToOrig);
20317 }
20318
20319Value *ReducedSubTree;
20320Type *ScalarTy = VL.front()->getType();
20321if (isa<FixedVectorType>(ScalarTy)) {
20322assert(SLPReVec &&"FixedVectorType is not expected.");
20323unsigned ScalarTyNumElements =getNumElements(ScalarTy);
20324 ReducedSubTree =PoisonValue::get(FixedVectorType::get(
20325 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
20326for (unsignedI : seq<unsigned>(ScalarTyNumElements)) {
20327// Do reduction for each lane.
20328// e.g., do reduce add for
20329// VL[0] = <4 x Ty> <a, b, c, d>
20330// VL[1] = <4 x Ty> <e, f, g, h>
20331// Lane[0] = <2 x Ty> <a, e>
20332// Lane[1] = <2 x Ty> <b, f>
20333// Lane[2] = <2 x Ty> <c, g>
20334// Lane[3] = <2 x Ty> <d, h>
20335// result[0] = reduce add Lane[0]
20336// result[1] = reduce add Lane[1]
20337// result[2] = reduce add Lane[2]
20338// result[3] = reduce add Lane[3]
20339SmallVector<int, 16>Mask =
20340createStrideMask(I, ScalarTyNumElements, VL.size());
20341Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
20342 ReducedSubTree = Builder.CreateInsertElement(
20343 ReducedSubTree,
20344 emitReduction(Lane, Builder,TTI, RdxRootInst->getType()),I);
20345 }
20346 }else {
20347 ReducedSubTree = emitReduction(VectorizedRoot, Builder,TTI,
20348 RdxRootInst->getType());
20349 }
20350if (ReducedSubTree->getType() != VL.front()->getType()) {
20351assert(ReducedSubTree->getType() != VL.front()->getType() &&
20352"Expected different reduction type.");
20353 ReducedSubTree =
20354 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20355V.isSignedMinBitwidthRootNode());
20356 }
20357
20358// Improved analysis for add/fadd/xor reductions with same scale factor
20359// for all operands of reductions. We can emit scalar ops for them
20360// instead.
20361if (OptReusedScalars && SameScaleFactor)
20362 ReducedSubTree = emitScaleForReusedOps(
20363 ReducedSubTree, Builder, SameValuesCounter.front().second);
20364
20365 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20366// Count vectorized reduced values to exclude them from final reduction.
20367for (Value *RdxVal : VL) {
20368Value *OrigV = TrackedToOrig.at(RdxVal);
20369if (IsSupportedHorRdxIdentityOp) {
20370 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20371continue;
20372 }
20373 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20374if (!V.isVectorized(RdxVal))
20375 RequiredExtract.insert(RdxVal);
20376 }
20377 Pos += ReduxWidth;
20378 Start = Pos;
20379 ReduxWidth = NumReducedVals - Pos;
20380if (ReduxWidth > 1)
20381 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20382 AnyVectorized =true;
20383 }
20384if (OptReusedScalars && !AnyVectorized) {
20385for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20386Value *RdxVal = TrackedVals.at(P.first);
20387Value *RedVal = emitScaleForReusedOps(RdxVal, Builder,P.second);
20388 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20389 VectorizedVals.try_emplace(P.first,P.second);
20390 }
20391continue;
20392 }
20393 }
20394if (VectorizedTree) {
20395// Reorder operands of a bool logical op into the natural order to avoid
20396// possible problems with poison propagation. If it is not possible to
20397// reorder (both operands are originally RHS), emit an extra freeze
20398// instruction for the LHS operand.
20399// I.e., if we have original code like this:
20400// RedOp1 = select i1 ?, i1 LHS, i1 false
20401// RedOp2 = select i1 RHS, i1 ?, i1 false
20402
20403// Then, we swap LHS/RHS to create a new op that matches the poison
20404// semantics of the original code.
20405
20406// If we have original code like this and both values could be poison:
20407// RedOp1 = select i1 ?, i1 LHS, i1 false
20408// RedOp2 = select i1 ?, i1 RHS, i1 false
20409
20410// Then, we must freeze LHS in the new op.
20411auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS,Value *&RHS,
20412Instruction *RedOp1,
20413Instruction *RedOp2,
20414bool InitStep) {
20415if (!AnyBoolLogicOp)
20416return;
20417if (isBoolLogicOp(RedOp1) && ((!InitStep &&LHS == VectorizedTree) ||
20418 getRdxOperand(RedOp1, 0) ==LHS ||
20419isGuaranteedNotToBePoison(LHS, AC)))
20420return;
20421if (isBoolLogicOp(RedOp2) && ((!InitStep &&RHS == VectorizedTree) ||
20422 getRdxOperand(RedOp2, 0) ==RHS ||
20423isGuaranteedNotToBePoison(RHS, AC))) {
20424std::swap(LHS,RHS);
20425return;
20426 }
20427if (LHS != VectorizedTree)
20428LHS = Builder.CreateFreeze(LHS);
20429 };
20430// Finish the reduction.
20431// Need to add extra arguments and possibly non-vectorized reduction
20432// values.
20433// Try to avoid dependencies between the scalar remainders after
20434// reductions.
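// E.g. (illustrative) leftover scalars a, b, c, d are combined pairwise as
// (a op b) and (c op d) on the first step and then reduced again, rather
// than as the serial chain ((a op b) op c) op d.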
20435auto FinalGen =
20436 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
20437bool InitStep) {
20438unsigned Sz = InstVals.size();
20439SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
20440 Sz % 2);
20441for (unsignedI = 0,E = (Sz / 2) * 2;I <E;I += 2) {
20442Instruction *RedOp = InstVals[I + 1].first;
20443 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
20444Value *RdxVal1 = InstVals[I].second;
20445Value *StableRdxVal1 = RdxVal1;
20446auto It1 = TrackedVals.find(RdxVal1);
20447if (It1 != TrackedVals.end())
20448 StableRdxVal1 = It1->second;
20449Value *RdxVal2 = InstVals[I + 1].second;
20450Value *StableRdxVal2 = RdxVal2;
20451auto It2 = TrackedVals.find(RdxVal2);
20452if (It2 != TrackedVals.end())
20453 StableRdxVal2 = It2->second;
20454// To prevent poison from leaking across what used to be
20455// sequential, safe, scalar boolean logic operations, the
20456// reduction operand must be frozen.
20457 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20458 RedOp, InitStep);
20459Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20460 StableRdxVal2,"op.rdx", ReductionOps);
20461 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20462 }
20463if (Sz % 2 == 1)
20464 ExtraReds[Sz / 2] = InstVals.back();
20465return ExtraReds;
20466 };
20467SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
20468 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20469 VectorizedTree);
20470SmallPtrSet<Value *, 8> Visited;
20471for (ArrayRef<Value *> Candidates : ReducedVals) {
20472for (Value *RdxVal : Candidates) {
20473if (!Visited.insert(RdxVal).second)
20474continue;
20475unsigned NumOps = VectorizedVals.lookup(RdxVal);
20476for (Instruction *RedOp :
20477ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
20478 ExtraReductions.emplace_back(RedOp, RdxVal);
20479 }
20480 }
20481// Iterate through all non-vectorized reduction values/extra arguments.
20482bool InitStep =true;
20483while (ExtraReductions.size() > 1) {
20484SmallVector<std::pair<Instruction *, Value *>> NewReds =
20485 FinalGen(ExtraReductions, InitStep);
20486 ExtraReductions.swap(NewReds);
20487 InitStep =false;
20488 }
20489 VectorizedTree = ExtraReductions.front().second;
20490
20491 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20492
20493// The original scalar reduction is expected to have no remaining
20494// uses outside the reduction tree itself. Assert that we got this
20495// correct, replace internal uses with poison, and mark for eventual
20496// deletion.
20497#ifndef NDEBUG
20498SmallSet<Value *, 4> IgnoreSet;
20499for (ArrayRef<Value *> RdxOps : ReductionOps)
20500 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20501#endif
20502for (ArrayRef<Value *> RdxOps : ReductionOps) {
20503for (Value *Ignore : RdxOps) {
20504if (!Ignore)
20505continue;
20506#ifndef NDEBUG
20507for (auto *U :Ignore->users()) {
20508assert(IgnoreSet.count(U) &&
20509"All users must be either in the reduction ops list.");
20510 }
20511#endif
20512if (!Ignore->use_empty()) {
20513Value *P =PoisonValue::get(Ignore->getType());
20514Ignore->replaceAllUsesWith(P);
20515 }
20516 }
20517V.removeInstructionsAndOperands(RdxOps);
20518 }
20519 }elseif (!CheckForReusedReductionOps) {
20520for (ReductionOpsType &RdxOps : ReductionOps)
20521for (Value *RdxOp : RdxOps)
20522V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20523 }
20524return VectorizedTree;
20525 }
20526
20527private:
20528 /// Calculate the cost of a reduction.
20529InstructionCost getReductionCost(TargetTransformInfo *TTI,
20530ArrayRef<Value *> ReducedVals,
20531bool IsCmpSelMinMax,FastMathFlags FMF,
20532constBoUpSLP &R) {
20533TTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
20534Type *ScalarTy = ReducedVals.front()->getType();
20535unsigned ReduxWidth = ReducedVals.size();
20536FixedVectorType *VectorTy =R.getReductionType();
20537InstructionCost VectorCost = 0, ScalarCost;
20538// If all of the reduced values are constant, the vector cost is 0, since
20539// the reduction value can be calculated at compile time.
20540bool AllConsts =allConstant(ReducedVals);
20541auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
20542InstructionCostCost = 0;
20543// Scalar cost is repeated for N-1 elements.
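// E.g. reducing four values a + b + c + d takes three scalar adds, hence
// the last reduced value is skipped below.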
20544int Cnt = ReducedVals.size();
20545for (Value *RdxVal : ReducedVals) {
20546if (Cnt == 1)
20547break;
20548 --Cnt;
20549if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
20550Cost += GenCostFn();
20551continue;
20552 }
20553InstructionCost ScalarCost = 0;
20554for (User *U : RdxVal->users()) {
20555auto *RdxOp = cast<Instruction>(U);
20556if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20557 ScalarCost +=TTI->getInstructionCost(RdxOp,CostKind);
20558continue;
20559 }
20560 ScalarCost =InstructionCost::getInvalid();
20561break;
20562 }
20563if (ScalarCost.isValid())
20564Cost += ScalarCost;
20565else
20566Cost += GenCostFn();
20567 }
20568returnCost;
20569 };
20570switch (RdxKind) {
20571case RecurKind::Add:
20572case RecurKind::Mul:
20573case RecurKind::Or:
20574case RecurKind::And:
20575case RecurKind::Xor:
20576case RecurKind::FAdd:
20577case RecurKind::FMul: {
20578unsigned RdxOpcode =RecurrenceDescriptor::getOpcode(RdxKind);
20579if (!AllConsts) {
20580if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20581assert(SLPReVec &&"FixedVectorType is not expected.");
20582unsigned ScalarTyNumElements = VecTy->getNumElements();
20583for (unsignedI : seq<unsigned>(ReducedVals.size())) {
20584 VectorCost +=TTI->getShuffleCost(
20585TTI::SK_PermuteSingleSrc, VectorTy,
20586createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
20587 VectorCost +=TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
20588CostKind);
20589 }
20590 VectorCost +=TTI->getScalarizationOverhead(
20591 VecTy,APInt::getAllOnes(ScalarTyNumElements),/*Insert*/true,
20592/*Extract*/false,TTI::TCK_RecipThroughput);
20593 }else {
20594Type *RedTy = VectorTy->getElementType();
20595auto [RType, IsSigned] =R.getRootNodeTypeWithNoCast().value_or(
20596 std::make_pair(RedTy,true));
20597if (RType == RedTy) {
20598 VectorCost =TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
20599 FMF,CostKind);
20600 }else {
20601 VectorCost =TTI->getExtendedReductionCost(
20602 RdxOpcode, !IsSigned, RedTy,getWidenedType(RType, ReduxWidth),
20603 FMF,CostKind);
20604 }
20605 }
20606 }
20607 ScalarCost = EvaluateScalarCost([&]() {
20608returnTTI->getArithmeticInstrCost(RdxOpcode, ScalarTy,CostKind);
20609 });
20610break;
20611 }
20612case RecurKind::FMax:
20613case RecurKind::FMin:
20614case RecurKind::FMaximum:
20615case RecurKind::FMinimum:
20616case RecurKind::SMax:
20617case RecurKind::SMin:
20618case RecurKind::UMax:
20619case RecurKind::UMin: {
20620Intrinsic::IDId =getMinMaxReductionIntrinsicOp(RdxKind);
20621if (!AllConsts)
20622 VectorCost =TTI->getMinMaxReductionCost(Id, VectorTy, FMF,CostKind);
20623 ScalarCost = EvaluateScalarCost([&]() {
20624IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
20625returnTTI->getIntrinsicInstrCost(ICA,CostKind);
20626 });
20627break;
20628 }
20629default:
20630llvm_unreachable("Expected arithmetic or min/max reduction operation");
20631 }
20632
20633LLVM_DEBUG(dbgs() <<"SLP: Adding cost " << VectorCost - ScalarCost
20634 <<" for reduction of " <<shortBundleName(ReducedVals)
20635 <<" (It is a splitting reduction)\n");
20636return VectorCost - ScalarCost;
20637 }
20638
20639 /// Emit a horizontal reduction of the vectorized value.
20640Value *emitReduction(Value *VectorizedValue,IRBuilderBase &Builder,
20641constTargetTransformInfo *TTI,Type *DestTy) {
20642assert(VectorizedValue &&"Need to have a vectorized tree node");
20643assert(RdxKind != RecurKind::FMulAdd &&
20644"A call to the llvm.fmuladd intrinsic is not handled yet");
20645
20646auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20647if (FTy->getScalarType() == Builder.getInt1Ty() &&
20648 RdxKind == RecurKind::Add &&
20649 DestTy->getScalarType() != FTy->getScalarType()) {
20650// Convert vector_reduce_add(ZExt(<n x i1>)) to
20651// ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
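// E.g. (illustrative) for an <8 x i1> mask %m this emits
//   %b = bitcast <8 x i1> %m to i8
//   %c = call i8 @llvm.ctpop.i8(i8 %b)
// and the result is then extended/truncated to the destination type.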
20652Value *V = Builder.CreateBitCast(
20653 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20654 ++NumVectorInstructions;
20655return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
20656 }
20657 ++NumVectorInstructions;
20658returncreateSimpleReduction(Builder, VectorizedValue, RdxKind);
20659 }
20660
20661 /// Emits optimized code for unique scalar value reused \p Cnt times.
20662Value *emitScaleForReusedOps(Value *VectorizedValue,IRBuilderBase &Builder,
20663unsigned Cnt) {
20664assert(IsSupportedHorRdxIdentityOp &&
20665"The optimization of matched scalar identity horizontal reductions "
20666"must be supported.");
20667if (Cnt == 1)
20668return VectorizedValue;
20669switch (RdxKind) {
20670case RecurKind::Add: {
20671// res = mul vv, n
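// E.g. x + x + x + x is emitted as x * 4.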
20672Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20673LLVM_DEBUG(dbgs() <<"SLP: Add (to-mul) " << Cnt <<"of "
20674 << VectorizedValue <<". (HorRdx)\n");
20675return Builder.CreateMul(VectorizedValue, Scale);
20676 }
20677case RecurKind::Xor: {
20678// res = n % 2 ? 0 : vv
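// E.g. x ^ x ^ x == x, while x ^ x ^ x ^ x == 0.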
20679LLVM_DEBUG(dbgs() <<"SLP: Xor " << Cnt <<"of " << VectorizedValue
20680 <<". (HorRdx)\n");
20681if (Cnt % 2 == 0)
20682returnConstant::getNullValue(VectorizedValue->getType());
20683return VectorizedValue;
20684 }
20685case RecurKind::FAdd: {
20686// res = fmul v, n
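// E.g. x + x + x becomes x * 3.0 (valid under the reassociation
// assumptions already required for this reduction).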
20687Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20688LLVM_DEBUG(dbgs() <<"SLP: FAdd (to-fmul) " << Cnt <<"of "
20689 << VectorizedValue <<". (HorRdx)\n");
20690return Builder.CreateFMul(VectorizedValue, Scale);
20691 }
20692case RecurKind::And:
20693case RecurKind::Or:
20694case RecurKind::SMax:
20695case RecurKind::SMin:
20696case RecurKind::UMax:
20697case RecurKind::UMin:
20698case RecurKind::FMax:
20699case RecurKind::FMin:
20700case RecurKind::FMaximum:
20701case RecurKind::FMinimum:
20702// res = vv
20703return VectorizedValue;
20704case RecurKind::Mul:
20705case RecurKind::FMul:
20706case RecurKind::FMulAdd:
20707case RecurKind::IAnyOf:
20708case RecurKind::FAnyOf:
20709case RecurKind::IFindLastIV:
20710case RecurKind::FFindLastIV:
20711case RecurKind::None:
20712llvm_unreachable("Unexpected reduction kind for repeated scalar.");
20713 }
20714returnnullptr;
20715 }
20716
20717 /// Emits actual operation for the scalar identity values, found during
20718 /// horizontal reduction analysis.
20719Value *
20720 emitReusedOps(Value *VectorizedValue,IRBuilderBase &Builder,BoUpSLP &R,
20721constSmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
20722constDenseMap<Value *, Value *> &TrackedToOrig) {
20723assert(IsSupportedHorRdxIdentityOp &&
20724"The optimization of matched scalar identity horizontal reductions "
20725"must be supported.");
20726ArrayRef<Value *> VL =R.getRootNodeScalars();
20727auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20728if (VTy->getElementType() != VL.front()->getType()) {
20729 VectorizedValue = Builder.CreateIntCast(
20730 VectorizedValue,
20731getWidenedType(VL.front()->getType(), VTy->getNumElements()),
20732R.isSignedMinBitwidthRootNode());
20733 }
20734switch (RdxKind) {
20735case RecurKind::Add: {
20736// root = mul prev_root, <1, 1, n, 1>
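// E.g. (illustrative) if the 3rd scalar occurs twice among the reduced
// values, the vector is multiplied by <1, 1, 2, 1> before the final
// reduction.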
20737SmallVector<Constant *> Vals;
20738for (Value *V : VL) {
20739unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20740 Vals.push_back(ConstantInt::get(V->getType(), Cnt,/*IsSigned=*/false));
20741 }
20742auto *Scale =ConstantVector::get(Vals);
20743LLVM_DEBUG(dbgs() <<"SLP: Add (to-mul) " << Scale <<"of "
20744 << VectorizedValue <<". (HorRdx)\n");
20745return Builder.CreateMul(VectorizedValue, Scale);
20746 }
20747case RecurKind::And:
20748case RecurKind::Or:
20749// No need for multiple or/and(s).
20750LLVM_DEBUG(dbgs() <<"SLP: And/or of same " << VectorizedValue
20751 <<". (HorRdx)\n");
20752return VectorizedValue;
20753case RecurKind::SMax:
20754case RecurKind::SMin:
20755case RecurKind::UMax:
20756case RecurKind::UMin:
20757case RecurKind::FMax:
20758case RecurKind::FMin:
20759case RecurKind::FMaximum:
20760case RecurKind::FMinimum:
20761// No need for multiple min/max(s) of the same value.
20762LLVM_DEBUG(dbgs() <<"SLP: Max/min of same " << VectorizedValue
20763 <<". (HorRdx)\n");
20764return VectorizedValue;
20765case RecurKind::Xor: {
20766// Replace values with even number of repeats with 0, since
20767// x xor x = 0.
20768// root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
20769// 7>, if the 4th and 6th elements have an even number of repeats.
20770SmallVector<int>Mask(
20771 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
20772PoisonMaskElem);
20773 std::iota(Mask.begin(),Mask.end(), 0);
20774bool NeedShuffle =false;
20775for (unsignedI = 0, VF = VL.size();I < VF; ++I) {
20776Value *V = VL[I];
20777unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20778if (Cnt % 2 == 0) {
20779Mask[I] = VF;
20780 NeedShuffle =true;
20781 }
20782 }
20783LLVM_DEBUG(dbgs() <<"SLP: Xor <";for (intI
20784 : Mask)dbgs()
20785 <<I <<" ";
20786dbgs() <<"> of " << VectorizedValue <<". (HorRdx)\n");
20787if (NeedShuffle)
20788 VectorizedValue = Builder.CreateShuffleVector(
20789 VectorizedValue,
20790 ConstantVector::getNullValue(VectorizedValue->getType()),Mask);
20791return VectorizedValue;
20792 }
20793case RecurKind::FAdd: {
20794// root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
20795SmallVector<Constant *> Vals;
20796for (Value *V : VL) {
20797unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20798 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20799 }
20800auto *Scale =ConstantVector::get(Vals);
20801return Builder.CreateFMul(VectorizedValue, Scale);
20802 }
20803case RecurKind::Mul:
20804case RecurKind::FMul:
20805case RecurKind::FMulAdd:
20806case RecurKind::IAnyOf:
20807case RecurKind::FAnyOf:
20808case RecurKind::IFindLastIV:
20809case RecurKind::FFindLastIV:
20810case RecurKind::None:
20811llvm_unreachable("Unexpected reduction kind for reused scalars.");
20812 }
20813returnnullptr;
20814 }
20815};
20816}// end anonymous namespace
20817
20818/// Gets recurrence kind from the specified value.
20819staticRecurKindgetRdxKind(Value *V) {
20820return HorizontalReduction::getRdxKind(V);
20821}
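/// Returns the total number of scalar elements in the homogeneous aggregate
/// built by \p InsertInst, e.g. 4 for {<2 x float>, <2 x float>} or
/// [2 x {float, float}]; returns std::nullopt if the aggregate is not
/// homogeneous.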
20822static std::optional<unsigned>getAggregateSize(Instruction *InsertInst) {
20823if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20824return cast<FixedVectorType>(IE->getType())->getNumElements();
20825
20826unsigned AggregateSize = 1;
20827auto *IV = cast<InsertValueInst>(InsertInst);
20828Type *CurrentType =IV->getType();
20829do {
20830if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20831for (auto *Elt : ST->elements())
20832if (Elt != ST->getElementType(0))// check homogeneity
20833return std::nullopt;
20834 AggregateSize *= ST->getNumElements();
20835 CurrentType = ST->getElementType(0);
20836 }elseif (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20837 AggregateSize *= AT->getNumElements();
20838 CurrentType = AT->getElementType();
20839 }elseif (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20840 AggregateSize *= VT->getNumElements();
20841return AggregateSize;
20842 }elseif (CurrentType->isSingleValueType()) {
20843return AggregateSize;
20844 }else {
20845return std::nullopt;
20846 }
20847 }while (true);
20848}
20849
20850staticvoidfindBuildAggregate_rec(Instruction *LastInsertInst,
20851TargetTransformInfo *TTI,
20852SmallVectorImpl<Value *> &BuildVectorOpds,
20853SmallVectorImpl<Value *> &InsertElts,
20854unsigned OperandOffset,constBoUpSLP &R) {
20855do {
20856Value *InsertedOperand = LastInsertInst->getOperand(1);
20857 std::optional<unsigned> OperandIndex =
20858getElementIndex(LastInsertInst, OperandOffset);
20859if (!OperandIndex || R.isDeleted(LastInsertInst))
20860return;
20861if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20862findBuildAggregate_rec(cast<Instruction>(InsertedOperand),TTI,
20863 BuildVectorOpds, InsertElts, *OperandIndex, R);
20864
20865 }else {
20866 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20867 InsertElts[*OperandIndex] = LastInsertInst;
20868 }
20869 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20870 }while (LastInsertInst !=nullptr &&
20871 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20872 LastInsertInst->hasOneUse());
20873}
20874
20875/// Recognize construction of vectors like
20876/// %ra = insertelement <4 x float> poison, float %s0, i32 0
20877/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
20878/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
20879/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
20880/// starting from the last insertelement or insertvalue instruction.
20881///
20882/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
20883/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
20884/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
20885///
20886/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
20887///
20888/// \return true if it matches.
20889staticboolfindBuildAggregate(Instruction *LastInsertInst,
20890TargetTransformInfo *TTI,
20891SmallVectorImpl<Value *> &BuildVectorOpds,
20892SmallVectorImpl<Value *> &InsertElts,
20893constBoUpSLP &R) {
20894
20895assert((isa<InsertElementInst>(LastInsertInst) ||
20896 isa<InsertValueInst>(LastInsertInst)) &&
20897"Expected insertelement or insertvalue instruction!");
20898
20899assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
20900"Expected empty result vectors!");
20901
20902 std::optional<unsigned> AggregateSize =getAggregateSize(LastInsertInst);
20903if (!AggregateSize)
20904returnfalse;
20905 BuildVectorOpds.resize(*AggregateSize);
20906 InsertElts.resize(*AggregateSize);
20907
20908findBuildAggregate_rec(LastInsertInst,TTI, BuildVectorOpds, InsertElts, 0,
20909 R);
20910llvm::erase(BuildVectorOpds,nullptr);
20911llvm::erase(InsertElts,nullptr);
20912if (BuildVectorOpds.size() >= 2)
20913returntrue;
20914
20915returnfalse;
20916}
20917
20918/// Try and get a reduction instruction from a phi node.
20919///
20920/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
20921/// if they come from either \p ParentBB or a containing loop latch.
20922///
20923/// \returns A candidate reduction value if possible, or \code nullptr \endcode
20924/// if not possible.
20925staticInstruction *getReductionInstr(constDominatorTree *DT,PHINode *P,
20926BasicBlock *ParentBB,LoopInfo *LI) {
20927// There are situations where the reduction value is not dominated by the
20928// reduction phi. Vectorizing such cases has been reported to cause
20929// miscompiles. See PR25787.
20930auto DominatedReduxValue = [&](Value *R) {
20931return isa<Instruction>(R) &&
20932 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20933 };
20934
20935Instruction *Rdx =nullptr;
20936
20937// Return the incoming value if it comes from the same BB as the phi node.
20938if (P->getIncomingBlock(0) == ParentBB) {
20939 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20940 }elseif (P->getIncomingBlock(1) == ParentBB) {
20941 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20942 }
20943
20944if (Rdx && DominatedReduxValue(Rdx))
20945return Rdx;
20946
20947// Otherwise, check whether we have a loop latch to look at.
20948Loop *BBL = LI->getLoopFor(ParentBB);
20949if (!BBL)
20950returnnullptr;
20951BasicBlock *BBLatch = BBL->getLoopLatch();
20952if (!BBLatch)
20953returnnullptr;
20954
20955// There is a loop latch, return the incoming value if it comes from
20956// that. This reduction pattern occasionally turns up.
20957if (P->getIncomingBlock(0) == BBLatch) {
20958 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20959 }elseif (P->getIncomingBlock(1) == BBLatch) {
20960 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20961 }
20962
20963if (Rdx && DominatedReduxValue(Rdx))
20964return Rdx;
20965
20966returnnullptr;
20967}
20968
20969staticboolmatchRdxBop(Instruction *I,Value *&V0,Value *&V1) {
20970if (match(I,m_BinOp(m_Value(V0),m_Value(V1))))
20971returntrue;
20972if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0),m_Value(V1))))
20973returntrue;
20974if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0),m_Value(V1))))
20975returntrue;
20976if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0),m_Value(V1))))
20977returntrue;
20978if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0),m_Value(V1))))
20979returntrue;
20980if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0),m_Value(V1))))
20981returntrue;
20982if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0),m_Value(V1))))
20983returntrue;
20984if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0),m_Value(V1))))
20985returntrue;
20986if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0),m_Value(V1))))
20987returntrue;
20988returnfalse;
20989}
20990
20991/// We could have an initial reduction that is not an add.
20992/// r *= v1 + v2 + v3 + v4
20993/// In such a case start looking for a tree rooted in the first '+'.
20994/// \Returns the new root if found, which may be nullptr if not an instruction.
20995staticInstruction *tryGetSecondaryReductionRoot(PHINode *Phi,
20996Instruction *Root) {
20997assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
20998 isa<IntrinsicInst>(Root)) &&
20999"Expected binop, select, or intrinsic for reduction matching");
21000Value *LHS =
21001 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
21002Value *RHS =
21003 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
21004if (LHS == Phi)
21005return dyn_cast<Instruction>(RHS);
21006if (RHS == Phi)
21007return dyn_cast<Instruction>(LHS);
21008returnnullptr;
21009}
21010
21011/// \returns the first operand of \p I that does not match \p Phi. If the
21012/// operand is not an instruction, it returns nullptr.
21013staticInstruction *getNonPhiOperand(Instruction *I,PHINode *Phi) {
21014Value *Op0 =nullptr;
21015Value *Op1 =nullptr;
21016if (!matchRdxBop(I, Op0, Op1))
21017returnnullptr;
21018return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
21019}
21020
21021/// \Returns true if \p I is a candidate instruction for reduction vectorization.
21022staticboolisReductionCandidate(Instruction *I) {
21023boolIsSelect =match(I,m_Select(m_Value(),m_Value(),m_Value()));
21024Value *B0 =nullptr, *B1 =nullptr;
21025bool IsBinop =matchRdxBop(I, B0, B1);
21026return IsBinop ||IsSelect;
21027}
21028
21029bool SLPVectorizerPass::vectorizeHorReduction(
21030PHINode *P,Instruction *Root,BasicBlock *BB,BoUpSLP &R,
21031SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
21032if (!ShouldVectorizeHor)
21033returnfalse;
21034bool TryOperandsAsNewSeeds =P && isa<BinaryOperator>(Root);
21035
21036if (Root->getParent() != BB || isa<PHINode>(Root))
21037returnfalse;
21038
21039// If we can find a secondary reduction root, use that instead.
21040auto SelectRoot = [&]() {
21041if (TryOperandsAsNewSeeds &&isReductionCandidate(Root) &&
21042 HorizontalReduction::getRdxKind(Root) !=RecurKind::None)
21043if (Instruction *NewRoot =tryGetSecondaryReductionRoot(P, Root))
21044return NewRoot;
21045return Root;
21046 };
21047
21048// Start analysis starting from Root instruction. If horizontal reduction is
21049// found, try to vectorize it. If it is not a horizontal reduction or
21050// vectorization is not possible or not effective, and currently analyzed
21051// instruction is a binary operation, try to vectorize the operands, using
21052// pre-order DFS traversal order. If the operands were not vectorized, repeat
21053// the same procedure considering each operand as a possible root of the
21054// horizontal reduction.
21055// Interrupt the process if the Root instruction itself was vectorized or all
21056// sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
21057// If a horizontal reduction was not matched or vectorized, we collect
21058// instructions for possible later attempts at vectorization.
21059 std::queue<std::pair<Instruction *, unsigned>>Stack;
21060Stack.emplace(SelectRoot(), 0);
21061SmallPtrSet<Value *, 8> VisitedInstrs;
21062bool Res =false;
21063auto &&TryToReduce = [this, &R](Instruction *Inst) ->Value * {
21064if (R.isAnalyzedReductionRoot(Inst))
21065returnnullptr;
21066if (!isReductionCandidate(Inst))
21067returnnullptr;
21068HorizontalReduction HorRdx;
21069if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
21070returnnullptr;
21071return HorRdx.tryToReduce(R, *DL,TTI, *TLI, AC);
21072 };
21073auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
21074if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21075 FutureSeed =getNonPhiOperand(Root,P);
21076if (!FutureSeed)
21077returnfalse;
21078 }
21079// Do not collect CmpInst or InsertElementInst/InsertValueInst as their
21080// analysis is done separately.
21081if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21082 PostponedInsts.push_back(FutureSeed);
21083returntrue;
21084 };
21085
21086while (!Stack.empty()) {
21087Instruction *Inst;
21088unsigned Level;
21089 std::tie(Inst, Level) =Stack.front();
21090Stack.pop();
21091// Do not try to analyze an instruction that has already been vectorized.
21092// This may happen when we vectorize instruction operands on a previous
21093// iteration while the stack was populated before that happened.
21094if (R.isDeleted(Inst))
21095continue;
21096if (Value *VectorizedV = TryToReduce(Inst)) {
21097 Res =true;
21098if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
21099// Try to find another reduction.
21100Stack.emplace(I, Level);
21101continue;
21102 }
21103if (R.isDeleted(Inst))
21104continue;
21105 }else {
21106// We could not vectorize `Inst` so try to use it as a future seed.
21107if (!TryAppendToPostponedInsts(Inst)) {
21108assert(Stack.empty() &&"Expected empty stack");
21109break;
21110 }
21111 }
21112
21113// Try to vectorize operands.
21114// Continue analysis only for instructions from the same basic block, to
21115// save compile time.
21116if (++Level <RecursionMaxDepth)
21117for (auto *Op : Inst->operand_values())
21118if (VisitedInstrs.insert(Op).second)
21119if (auto *I = dyn_cast<Instruction>(Op))
21120// Do not try to vectorize CmpInst operands, this is done
21121// separately.
21122if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
21123 !R.isDeleted(I) &&I->getParent() == BB)
21124Stack.emplace(I, Level);
21125 }
21126return Res;
21127}
21128
21129bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P,Instruction *Root,
21130BasicBlock *BB,BoUpSLP &R) {
21131SmallVector<WeakTrackingVH> PostponedInsts;
21132bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
21133 Res |= tryToVectorize(PostponedInsts, R);
21134return Res;
21135}
21136
21137bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
21138BoUpSLP &R) {
21139bool Res =false;
21140for (Value *V : Insts)
21141if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
21142 Res |= tryToVectorize(Inst, R);
21143return Res;
21144}
21145
21146bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
21147BasicBlock *BB,BoUpSLP &R,
21148bool MaxVFOnly) {
21149if (!R.canMapToVector(IVI->getType()))
21150returnfalse;
21151
21152SmallVector<Value *, 16> BuildVectorOpds;
21153SmallVector<Value *, 16> BuildVectorInsts;
21154if (!findBuildAggregate(IVI,TTI, BuildVectorOpds, BuildVectorInsts, R))
21155returnfalse;
21156
21157if (MaxVFOnly && BuildVectorOpds.size() == 2) {
21158R.getORE()->emit([&]() {
21159returnOptimizationRemarkMissed(SV_NAME,"NotPossible", IVI)
21160 <<"Cannot SLP vectorize list: only 2 elements of buildvalue, "
21161"trying reduction first.";
21162 });
21163returnfalse;
21164 }
21165LLVM_DEBUG(dbgs() <<"SLP: array mappable to vector: " << *IVI <<"\n");
21166// Aggregate value is unlikely to be processed in a vector register.
21167return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21168}
21169
21170bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
21171BasicBlock *BB,BoUpSLP &R,
21172bool MaxVFOnly) {
21173SmallVector<Value *, 16> BuildVectorInsts;
21174SmallVector<Value *, 16> BuildVectorOpds;
21175SmallVector<int>Mask;
21176if (!findBuildAggregate(IEI,TTI, BuildVectorOpds, BuildVectorInsts, R) ||
21177 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21178isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
21179returnfalse;
21180
21181if (MaxVFOnly && BuildVectorInsts.size() == 2) {
21182R.getORE()->emit([&]() {
21183returnOptimizationRemarkMissed(SV_NAME,"NotPossible", IEI)
21184 <<"Cannot SLP vectorize list: only 2 elements of buildvector, "
21185"trying reduction first.";
21186 });
21187returnfalse;
21188 }
21189LLVM_DEBUG(dbgs() <<"SLP: array mappable to vector: " << *IEI <<"\n");
21190return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21191}
21192
21193template <typename T>
21194staticbooltryToVectorizeSequence(
21195SmallVectorImpl<T *> &Incoming,function_ref<bool(T *,T *)> Comparator,
21196function_ref<bool(T *,T *)> AreCompatible,
21197function_ref<bool(ArrayRef<T *>,bool)> TryToVectorizeHelper,
21198bool MaxVFOnly,BoUpSLP &R) {
21199bool Changed =false;
21200// Sort by type, parent, operands.
21201stable_sort(Incoming, Comparator);
21202
21203// Try to vectorize elements based on their type.
21204SmallVector<T *> Candidates;
21205SmallVector<T *> VL;
21206for (auto *IncIt =Incoming.begin(), *E =Incoming.end(); IncIt != E;
21207 VL.clear()) {
21208// Look for the next elements with the same type, parent and operand
21209// kinds.
21210auto *I = dyn_cast<Instruction>(*IncIt);
21211if (!I || R.isDeleted(I)) {
21212 ++IncIt;
21213continue;
21214 }
21215auto *SameTypeIt = IncIt;
21216while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21217 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21218 AreCompatible(*SameTypeIt, *IncIt))) {
21219auto *I = dyn_cast<Instruction>(*SameTypeIt);
21220 ++SameTypeIt;
21221if (I && !R.isDeleted(I))
21222 VL.push_back(cast<T>(I));
21223 }
21224
21225// Try to vectorize them.
21226unsigned NumElts = VL.size();
21227LLVM_DEBUG(dbgs() <<"SLP: Trying to vectorize starting at nodes ("
21228 << NumElts <<")\n");
21229// The vectorization is a 3-stage attempt:
21230// 1. First, try to vectorize instructions with the same/alternate opcodes
21231// at the size of the maximal register.
21232// 2. Try to vectorize the remaining instructions with the same type, if
21233// possible. This may give better vectorization results than trying to
21234// vectorize only instructions with the same/alternate opcodes.
21235// 3. As a final attempt, try to vectorize all instructions with the
21236// same/alternate ops only; this may result in some extra final
21237// vectorization.
21238if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
21239// Success start over because instructions might have been changed.
21240 Changed =true;
21241 VL.swap(Candidates);
21242 Candidates.clear();
21243for (T *V : VL) {
21244if (auto *I = dyn_cast<Instruction>(V);I && !R.isDeleted(I))
21245 Candidates.push_back(V);
21246 }
21247 }else {
21248 /// \Returns the minimum number of elements that we will attempt to
21249 /// vectorize.
21250auto GetMinNumElements = [&R](Value *V) {
21251unsigned EltSize = R.getVectorElementSize(V);
21252return std::max(2U, R.getMaxVecRegSize() / EltSize);
21253 };
21254if (NumElts < GetMinNumElements(*IncIt) &&
21255 (Candidates.empty() ||
21256 Candidates.front()->getType() == (*IncIt)->getType())) {
21257for (T *V : VL) {
21258if (auto *I = dyn_cast<Instruction>(V);I && !R.isDeleted(I))
21259 Candidates.push_back(V);
21260 }
21261 }
21262 }
21263// Final attempt to vectorize instructions with the same types.
21264if (Candidates.size() > 1 &&
21265 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21266if (TryToVectorizeHelper(Candidates,/*MaxVFOnly=*/false)) {
21267// Success start over because instructions might have been changed.
21268 Changed =true;
21269 }elseif (MaxVFOnly) {
21270// Try to vectorize using small vectors.
21271SmallVector<T *> VL;
21272for (auto *It = Candidates.begin(), *End = Candidates.end(); It !=End;
21273 VL.clear()) {
21274auto *I = dyn_cast<Instruction>(*It);
21275if (!I || R.isDeleted(I)) {
21276 ++It;
21277continue;
21278 }
21279auto *SameTypeIt = It;
21280while (SameTypeIt !=End &&
21281 (!isa<Instruction>(*SameTypeIt) ||
21282 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21283 AreCompatible(*SameTypeIt, *It))) {
21284auto *I = dyn_cast<Instruction>(*SameTypeIt);
21285 ++SameTypeIt;
21286if (I && !R.isDeleted(I))
21287 VL.push_back(cast<T>(I));
21288 }
21289unsigned NumElts = VL.size();
21290if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21291/*MaxVFOnly=*/false))
21292 Changed =true;
21293 It = SameTypeIt;
21294 }
21295 }
21296 Candidates.clear();
21297 }
21298
21299// Start over at the next instruction of a different type (or the end).
21300 IncIt = SameTypeIt;
21301 }
21302return Changed;
21303}
21304
21305/// Compare two cmp instructions. If IsCompatibility is true, the function
21306/// returns true if the 2 cmps have the same or swapped predicates and
21307/// compatible corresponding operands. If IsCompatibility is false, the
21308/// function implements a strict weak ordering relation between two cmp
21309/// instructions, returning true if the first instruction is "less" than the
21310/// second, i.e. its predicate is less than the predicate of the second or the
21311/// operand IDs are less than the operand IDs of the second cmp instruction.
21312template <bool IsCompatibility>
21313staticboolcompareCmp(Value *V,Value *V2,TargetLibraryInfo &TLI,
21314constDominatorTree &DT) {
21315assert(isValidElementType(V->getType()) &&
21316isValidElementType(V2->getType()) &&
21317"Expected valid element types only.");
21318if (V == V2)
21319return IsCompatibility;
21320auto *CI1 = cast<CmpInst>(V);
21321auto *CI2 = cast<CmpInst>(V2);
21322if (CI1->getOperand(0)->getType()->getTypeID() <
21323 CI2->getOperand(0)->getType()->getTypeID())
21324return !IsCompatibility;
21325if (CI1->getOperand(0)->getType()->getTypeID() >
21326 CI2->getOperand(0)->getType()->getTypeID())
21327returnfalse;
21328if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21329 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21330return !IsCompatibility;
21331if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21332 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21333returnfalse;
21334CmpInst::Predicate Pred1 = CI1->getPredicate();
21335CmpInst::Predicate Pred2 = CI2->getPredicate();
21336CmpInst::Predicate SwapPred1 =CmpInst::getSwappedPredicate(Pred1);
21337CmpInst::Predicate SwapPred2 =CmpInst::getSwappedPredicate(Pred2);
21338CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
21339CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
21340if (BasePred1 < BasePred2)
21341return !IsCompatibility;
21342if (BasePred1 > BasePred2)
21343returnfalse;
21344// Compare operands.
21345bool CI1Preds = Pred1 == BasePred1;
21346bool CI2Preds = Pred2 == BasePred1;
21347for (intI = 0, E = CI1->getNumOperands();I < E; ++I) {
21348auto *Op1 = CI1->getOperand(CI1Preds ?I : E -I - 1);
21349auto *Op2 = CI2->getOperand(CI2Preds ?I : E -I - 1);
21350if (Op1 == Op2)
21351continue;
21352if (Op1->getValueID() < Op2->getValueID())
21353return !IsCompatibility;
21354if (Op1->getValueID() > Op2->getValueID())
21355returnfalse;
21356if (auto *I1 = dyn_cast<Instruction>(Op1))
21357if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21358if (IsCompatibility) {
21359if (I1->getParent() != I2->getParent())
21360returnfalse;
21361 }else {
21362// Try to compare nodes with same parent.
21363DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
21364DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
21365if (!NodeI1)
21366return NodeI2 !=nullptr;
21367if (!NodeI2)
21368returnfalse;
21369assert((NodeI1 == NodeI2) ==
21370 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21371"Different nodes should have different DFS numbers");
21372if (NodeI1 != NodeI2)
21373return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21374 }
21375 InstructionsState S =getSameOpcode({I1, I2}, TLI);
21376if (S && (IsCompatibility || !S.isAltShuffle()))
21377continue;
21378if (IsCompatibility)
21379returnfalse;
21380if (I1->getOpcode() != I2->getOpcode())
21381return I1->getOpcode() < I2->getOpcode();
21382 }
21383 }
21384return IsCompatibility;
21385}
21386
21387template <typename ItT>
21388bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
21389BasicBlock *BB,BoUpSLP &R) {
21390bool Changed =false;
21391// Try to find reductions first.
21392for (CmpInst *I : CmpInsts) {
21393if (R.isDeleted(I))
21394continue;
21395for (Value *Op :I->operands())
21396if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21397 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21398if (R.isDeleted(I))
21399break;
21400 }
21401 }
21402// Try to vectorize operands as vector bundles.
21403for (CmpInst *I : CmpInsts) {
21404if (R.isDeleted(I))
21405continue;
21406 Changed |= tryToVectorize(I, R);
21407 }
21408// Try to vectorize list of compares.
21409// Sort by type, compare predicate, etc.
21410auto CompareSorter = [&](Value *V,Value *V2) {
21411if (V == V2)
21412returnfalse;
21413return compareCmp<false>(V, V2, *TLI, *DT);
21414 };
21415
21416auto AreCompatibleCompares = [&](Value *V1,Value *V2) {
21417if (V1 == V2)
21418returntrue;
21419return compareCmp<true>(V1, V2, *TLI, *DT);
21420 };
21421
21422SmallVector<Value *> Vals;
21423for (Instruction *V : CmpInsts)
21424if (!R.isDeleted(V) &&isValidElementType(getValueType(V)))
21425 Vals.push_back(V);
21426if (Vals.size() <= 1)
21427return Changed;
21428 Changed |= tryToVectorizeSequence<Value>(
21429 Vals, CompareSorter, AreCompatibleCompares,
21430 [this, &R](ArrayRef<Value *> Candidates,bool MaxVFOnly) {
21431// Exclude possible reductions from other blocks.
21432bool ArePossiblyReducedInOtherBlock =any_of(Candidates, [](Value *V) {
21433returnany_of(V->users(), [V](User *U) {
21434 auto *Select = dyn_cast<SelectInst>(U);
21435 return Select &&
21436 Select->getParent() != cast<Instruction>(V)->getParent();
21437 });
21438 });
21439if (ArePossiblyReducedInOtherBlock)
21440returnfalse;
21441return tryToVectorizeList(Candidates, R, MaxVFOnly);
21442 },
21443/*MaxVFOnly=*/true,R);
21444return Changed;
21445}
21446
21447bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21448BasicBlock *BB,BoUpSLP &R) {
21449assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21450"This function only accepts Insert instructions");
21451bool OpsChanged =false;
21452SmallVector<WeakTrackingVH> PostponedInsts;
21453for (auto *I :reverse(Instructions)) {
21454// pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
21455if (R.isDeleted(I) || isa<CmpInst>(I))
21456continue;
21457if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21458 OpsChanged |=
21459 vectorizeInsertValueInst(LastInsertValue, BB, R,/*MaxVFOnly=*/true);
21460 }elseif (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21461 OpsChanged |=
21462 vectorizeInsertElementInst(LastInsertElem, BB, R,/*MaxVFOnly=*/true);
21463 }
21464// pass2 - try to vectorize reductions only
21465if (R.isDeleted(I))
21466continue;
21467 OpsChanged |= vectorizeHorReduction(nullptr,I, BB, R, PostponedInsts);
21468if (R.isDeleted(I) || isa<CmpInst>(I))
21469continue;
21470// pass3 - try to match and vectorize a buildvector sequence.
21471if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21472 OpsChanged |=
21473 vectorizeInsertValueInst(LastInsertValue, BB, R,/*MaxVFOnly=*/false);
21474 }elseif (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21475 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21476/*MaxVFOnly=*/false);
21477 }
21478 }
21479// Now try to vectorize postponed instructions.
21480 OpsChanged |= tryToVectorize(PostponedInsts, R);
21481
21482Instructions.clear();
21483return OpsChanged;
21484}
21485
21486bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB,BoUpSLP &R) {
21487bool Changed =false;
21488SmallVector<Value *, 4>Incoming;
21489SmallPtrSet<Value *, 16> VisitedInstrs;
21490// Maps phi nodes to the non-phi nodes found in the use tree for each phi
21491// node. This makes it easier to identify the chains that can be vectorized
21492// in the best way.
21493DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
21494auto PHICompare = [this, &PHIToOpcodes](Value *V1,Value *V2) {
21495assert(isValidElementType(V1->getType()) &&
21496isValidElementType(V2->getType()) &&
21497"Expected vectorizable types only.");
21498// It is fine to compare type IDs here, since we expect only vectorizable
21499// types, like ints, floats and pointers; we don't care about other types.
21500if (V1->getType()->getTypeID() <V2->getType()->getTypeID())
21501returntrue;
21502if (V1->getType()->getTypeID() >V2->getType()->getTypeID())
21503returnfalse;
21504if (V1->getType()->getScalarSizeInBits() <
21505V2->getType()->getScalarSizeInBits())
21506returntrue;
21507if (V1->getType()->getScalarSizeInBits() >
21508V2->getType()->getScalarSizeInBits())
21509returnfalse;
21510ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21511ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21512if (Opcodes1.size() < Opcodes2.size())
21513returntrue;
21514if (Opcodes1.size() > Opcodes2.size())
21515returnfalse;
21516for (intI = 0, E = Opcodes1.size();I < E; ++I) {
21517 {
21518// Instructions come first.
21519auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21520auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21521if (I1 && I2) {
21522DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
21523DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
21524if (!NodeI1)
21525return NodeI2 !=nullptr;
21526if (!NodeI2)
21527returnfalse;
21528assert((NodeI1 == NodeI2) ==
21529 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21530"Different nodes should have different DFS numbers");
21531if (NodeI1 != NodeI2)
21532return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21533 InstructionsState S =getSameOpcode({I1, I2}, *TLI);
21534if (S && !S.isAltShuffle())
21535continue;
21536returnI1->getOpcode() < I2->getOpcode();
21537 }
21538if (I1)
21539returntrue;
21540if (I2)
21541returnfalse;
21542 }
21543 {
21544// Non-undef constants come next.
21545bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21546bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21547if (C1 && C2)
21548continue;
21549if (C1)
21550returntrue;
21551if (C2)
21552returnfalse;
21553 }
21554bool U1 = isa<UndefValue>(Opcodes1[I]);
21555bool U2 = isa<UndefValue>(Opcodes2[I]);
21556 {
21557// Non-constant non-instructions come next.
21558if (!U1 && !U2) {
21559auto ValID1 = Opcodes1[I]->getValueID();
21560auto ValID2 = Opcodes2[I]->getValueID();
21561if (ValID1 == ValID2)
21562continue;
21563if (ValID1 < ValID2)
21564returntrue;
21565if (ValID1 > ValID2)
21566returnfalse;
21567 }
21568if (!U1)
21569returntrue;
21570if (!U2)
21571returnfalse;
21572 }
21573// Undefs come last.
21574assert(U1 && U2 &&"The only thing left should be undef & undef.");
21575 }
21576returnfalse;
21577 };
21578auto AreCompatiblePHIs = [&PHIToOpcodes,this, &R](Value *V1,Value *V2) {
21579if (V1 == V2)
21580returntrue;
21581if (V1->getType() !=V2->getType())
21582returnfalse;
21583ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21584ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21585if (Opcodes1.size() != Opcodes2.size())
21586returnfalse;
21587for (intI = 0, E = Opcodes1.size();I < E; ++I) {
21588// Undefs are compatible with any other value.
21589if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21590continue;
21591if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21592if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21593if (R.isDeleted(I1) ||R.isDeleted(I2))
21594returnfalse;
21595if (I1->getParent() != I2->getParent())
21596returnfalse;
21597if (getSameOpcode({I1, I2}, *TLI))
21598continue;
21599returnfalse;
21600 }
21601if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21602continue;
21603if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21604returnfalse;
21605 }
21606returntrue;
21607 };
21608
21609bool HaveVectorizedPhiNodes =false;
21610do {
21611// Collect the incoming values from the PHIs.
21612Incoming.clear();
21613for (Instruction &I : *BB) {
21614auto *P = dyn_cast<PHINode>(&I);
21615if (!P ||P->getNumIncomingValues() >MaxPHINumOperands)
21616break;
21617
21618// No need to analyze deleted, vectorized and non-vectorizable
21619// instructions.
21620if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21621isValidElementType(P->getType()))
21622Incoming.push_back(P);
21623 }
21624
21625if (Incoming.size() <= 1)
21626break;
21627
21628// Find the corresponding non-phi nodes for better matching when trying to
21629// build the tree.
21630for (Value *V :Incoming) {
21631SmallVectorImpl<Value *> &Opcodes =
21632 PHIToOpcodes.try_emplace(V).first->getSecond();
21633if (!Opcodes.empty())
21634continue;
21635SmallVector<Value *, 4> Nodes(1, V);
21636SmallPtrSet<Value *, 4> Visited;
21637while (!Nodes.empty()) {
21638auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21639if (!Visited.insert(PHI).second)
21640continue;
21641for (Value *V :PHI->incoming_values()) {
21642if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21643 Nodes.push_back(PHI1);
21644continue;
21645 }
21646 Opcodes.emplace_back(V);
21647 }
21648 }
21649 }
21650
21651 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21652Incoming, PHICompare, AreCompatiblePHIs,
21653 [this, &R](ArrayRef<Value *> Candidates,bool MaxVFOnly) {
21654return tryToVectorizeList(Candidates, R, MaxVFOnly);
21655 },
21656/*MaxVFOnly=*/true,R);
21657 Changed |= HaveVectorizedPhiNodes;
21658if (HaveVectorizedPhiNodes &&any_of(PHIToOpcodes, [&](constauto &P) {
21659auto *PHI = dyn_cast<PHINode>(P.first);
21660return !PHI ||R.isDeleted(PHI);
21661 }))
21662 PHIToOpcodes.clear();
21663 VisitedInstrs.insert(Incoming.begin(),Incoming.end());
21664 }while (HaveVectorizedPhiNodes);
21665
21666 VisitedInstrs.clear();
21667
21668 InstSetVector PostProcessInserts;
21669SmallSetVector<CmpInst *, 8> PostProcessCmps;
21670// Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
21671// true, also vectorizes `PostProcessCmps`.
21672auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21673bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21674if (VectorizeCmps) {
21675 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21676 PostProcessCmps.clear();
21677 }
21678 PostProcessInserts.clear();
21679return Changed;
21680 };
21681// Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
21682auto IsInPostProcessInstrs = [&](Instruction *I) {
21683if (auto *Cmp = dyn_cast<CmpInst>(I))
21684return PostProcessCmps.contains(Cmp);
21685return isa<InsertElementInst, InsertValueInst>(I) &&
21686 PostProcessInserts.contains(I);
21687 };
21688// Returns true if `I` is an instruction without users, like a terminator, a
21689// store, or a function call with an ignored return value. The check is based
21690// on the instruction type, except for CallInst and InvokeInst.
21691auto HasNoUsers = [](Instruction *I) {
21692returnI->use_empty() &&
21693 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21694 };
21695for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
21696// Skip instructions with scalable types. The number of elements is unknown
21697// at compile time for scalable types.
21698if (isa<ScalableVectorType>(It->getType()))
21699continue;
21700
21701// Skip instructions marked for deletion.
21702if (R.isDeleted(&*It))
21703continue;
21704// We may go through BB multiple times, so skip the ones we have already checked.
21705if (!VisitedInstrs.insert(&*It).second) {
21706if (HasNoUsers(&*It) &&
21707 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
21708// We would like to start over since some instructions are deleted
21709// and the iterator may become invalid.
21710 Changed =true;
21711 It = BB->begin();
21712 E = BB->end();
21713 }
21714continue;
21715 }
21716
21717if (isa<DbgInfoIntrinsic>(It))
21718continue;
21719
21720// Try to vectorize reductions that use PHINodes.
21721if (PHINode *P = dyn_cast<PHINode>(It)) {
21722// Check that the PHI is a reduction PHI.
21723if (P->getNumIncomingValues() == 2) {
21724// Try to match and vectorize a horizontal reduction.
21725Instruction *Root =getReductionInstr(DT,P, BB, LI);
21726if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21727 Changed =true;
21728 It = BB->begin();
21729 E = BB->end();
21730continue;
21731 }
21732 }
21733// Try to vectorize the incoming values of the PHI, to catch reductions
21734// that feed into PHIs.
21735for (unsignedI : seq<unsigned>(P->getNumIncomingValues())) {
21736// Skip if the incoming block is the current BB for now. Also, bypass
21737// unreachable IR for efficiency and to avoid crashing.
21738// TODO: Collect the skipped incoming values and try to vectorize them
21739// after processing BB.
21740if (BB ==P->getIncomingBlock(I) ||
21741 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
21742continue;
21743
21744// Postponed instructions should not be vectorized here, delay their
21745// vectorization.
21746if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21747 PI && !IsInPostProcessInstrs(PI)) {
21748bool Res =
21749 vectorizeRootInstruction(nullptr, PI,P->getIncomingBlock(I), R);
21750 Changed |= Res;
21751if (Res &&R.isDeleted(P)) {
21752 It = BB->begin();
21753 E = BB->end();
21754break;
21755 }
21756 }
21757 }
21758continue;
21759 }
21760
21761if (HasNoUsers(&*It)) {
21762bool OpsChanged =false;
21763auto *SI = dyn_cast<StoreInst>(It);
21764bool TryToVectorizeRoot =ShouldStartVectorizeHorAtStore || !SI;
21765if (SI) {
21766auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
21767// Try to vectorize chain in store, if this is the only store to the
21768// address in the block.
21769// TODO: This is just a temporary solution to save compile time. Need
21770// to investigate if we can safely turn on slp-vectorize-hor-store
21771// instead to allow lookup for reduction chains in all non-vectorized
21772// stores (need to check side effects and compile time).
21773 TryToVectorizeRoot |= (I == Stores.end() ||I->second.size() == 1) &&
21774SI->getValueOperand()->hasOneUse();
21775 }
21776if (TryToVectorizeRoot) {
21777for (auto *V : It->operand_values()) {
21778// Postponed instructions should not be vectorized here, delay their
21779// vectorization.
21780if (auto *VI = dyn_cast<Instruction>(V);
21781VI && !IsInPostProcessInstrs(VI))
21782// Try to match and vectorize a horizontal reduction.
21783 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21784 }
21785 }
21786// Start vectorization of post-process list of instructions from the
21787// top-tree instructions to try to vectorize as many instructions as
21788// possible.
21789 OpsChanged |=
21790 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
21791if (OpsChanged) {
21792// We would like to start over since some instructions are deleted
21793// and the iterator may become invalid.
21794 Changed =true;
21795 It = BB->begin();
21796 E = BB->end();
21797continue;
21798 }
21799 }
21800
21801if (isa<InsertElementInst, InsertValueInst>(It))
21802 PostProcessInserts.insert(&*It);
21803elseif (isa<CmpInst>(It))
21804 PostProcessCmps.insert(cast<CmpInst>(&*It));
21805 }
21806
21807return Changed;
21808}
21809
21810bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB,BoUpSLP &R) {
21811auto Changed =false;
21812for (auto &Entry : GEPs) {
21813// If the getelementptr list has fewer than two elements, there's nothing
21814// to do.
21815if (Entry.second.size() < 2)
21816continue;
21817
21818LLVM_DEBUG(dbgs() <<"SLP: Analyzing a getelementptr list of length "
21819 <<Entry.second.size() <<".\n");
21820
21821// Process the GEP list in chunks suitable for the target's supported
21822// vector size. If a vector register can't hold 1 element, we are done. We
21823// are trying to vectorize the index computations, so the maximum number of
21824// elements is based on the size of the index expression, rather than the
21825// size of the GEP itself (the target's pointer size).
21826auto *It =find_if(Entry.second, [&](GetElementPtrInst *GEP) {
21827 return !R.isDeleted(GEP);
21828 });
21829if (It ==Entry.second.end())
21830continue;
21831unsigned MaxVecRegSize =R.getMaxVecRegSize();
21832unsigned EltSize =R.getVectorElementSize(*(*It)->idx_begin());
21833if (MaxVecRegSize < EltSize)
21834continue;
21835
21836unsigned MaxElts = MaxVecRegSize / EltSize;
21837for (unsigned BI = 0, BE =Entry.second.size(); BI < BE; BI += MaxElts) {
21838autoLen = std::min<unsigned>(BE - BI, MaxElts);
21839ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
21840
21841// Initialize a set of candidate getelementptrs. Note that we use a
21842// SetVector here to preserve program order. If the index computations
21843// are vectorizable and begin with loads, we want to minimize the chance
21844// of having to reorder them later.
21845SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
21846
21847// Some of the candidates may have already been vectorized after we
21848// initially collected them, or their index was optimized to a constant value.
21849// If so, they are marked as deleted, so remove them from the set of
21850// candidates.
21851 Candidates.remove_if([&R](Value *I) {
21852returnR.isDeleted(cast<Instruction>(I)) ||
21853 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21854 });
21855
21856// Remove from the set of candidates all pairs of getelementptrs with
21857// constant differences. Such getelementptrs are likely not good
21858// candidates for vectorization in a bottom-up phase since one can be
21859// computed from the other. We also ensure all candidate getelementptr
21860// indices are unique.
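// E.g. (illustrative) two getelementptrs whose SCEVs differ by a constant,
// such as &A[i] and &A[i + 1], are removed from the candidate set because
// one address is trivially computable from the other.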
21861for (intI = 0, E = GEPList.size();I < E && Candidates.size() > 1; ++I) {
21862auto *GEPI = GEPList[I];
21863if (!Candidates.count(GEPI))
21864continue;
21865constSCEV *SCEVI = SE->getSCEV(GEPList[I]);
21866for (int J =I + 1; J < E && Candidates.size() > 1; ++J) {
21867auto *GEPJ = GEPList[J];
21868constSCEV *SCEVJ = SE->getSCEV(GEPList[J]);
21869if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21870 Candidates.remove(GEPI);
21871 Candidates.remove(GEPJ);
21872 }elseif (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21873 Candidates.remove(GEPJ);
21874 }
21875 }
21876 }
21877
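      // Editor's note (illustrative addition, not in the original source): the
      // SCEV check above prunes pairs whose pointer difference folds to a
      // constant. For hypothetical IR such as
      //   %g1 = getelementptr inbounds i32, ptr %p, i64 %i
      //   %g2 = getelementptr inbounds i32, ptr %p, i64 %j  ; %j = add i64 %i, 4
      // SE->getMinusSCEV(SCEVI, SCEVJ) folds to the constant -16 (four i32
      // elements), so one GEP is trivially derived from the other and both are
      // dropped as roots for index vectorization.
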
      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

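// Illustrative example (editor's addition, not part of the original file):
// C-level source of the gather-like shape that vectorizeGEPIndices() targets,
// matching the comment above the tryToVectorizeList call. The loads of a[],
// the loads of b[], and the subtractions feeding the g[] indices can be done
// in parallel even though the g[] accesses themselves remain a gather. The
// function and parameter names are made up.
static int sumOfGatheredElements(const int *g, const int *a, const int *b) {
  int Sum = 0;
  for (int I = 0; I < 4; ++I)
    Sum += g[a[I] - b[I]]; // the index computations a[I] - b[I] form the bundle
  return Sum;
}
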
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        return getSameOpcode({I1, I2}, *TLI).valid();
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

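  // Editor's note (illustrative, not in the original source): StoreSorter is
  // used as a strict weak ordering and AreCompatibleStores as an equivalence
  // test on neighbouring stores. tryToVectorizeSequence() below first sorts
  // the stores with StoreSorter, then walks runs of mutually compatible stores
  // and hands each run to the vectorization callback; e.g. stores of i32 'add'
  // results and stores of float constants land in different runs and are
  // attempted separately.
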
  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse the stores to do bottom-to-top analysis. This is important if
    // the same address is stored to several times; in that case we need to
    // follow the store order (reversed, to respect the memory dependencies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
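
// Illustrative example (editor's addition, not part of the original file): a
// store chain of the kind vectorizeStoreChains() can merge into a single
// vector store, assuming the target can store <4 x i32> and the cost model
// agrees. The four stores write consecutive addresses and their value operands
// share the same opcode, so they sort next to each other and satisfy
// AreCompatibleStores. The function and parameter names are made up.
static void storeChainExample(int *P, int A, int B, int C, int D) {
  P[0] = A + 1; // all four value operands are integer 'add' instructions
  P[1] = B + 2;
  P[2] = C + 3;
  P[3] = D + 4;
}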
isConstant
static bool isConstant(const MachineInstr &MI)
Definition:AMDGPUInstructionSelector.cpp:2862
Select
AMDGPU Register Bank Select
Definition:AMDGPURegBankSelect.cpp:71
PHI
Rewrite undef for PHI
Definition:AMDGPURewriteUndefForPHI.cpp:100
Ignore
ReachingDefAnalysis InstSet InstSet & Ignore
Definition:ARMLowOverheadLoops.cpp:531
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition:ARMSLSHardening.cpp:73
Results
Function Alias Analysis Results
Definition:AliasAnalysis.cpp:731
AliasAnalysis.h
AssumptionCache.h
Attributes.h
This file contains the simple types necessary to represent the attributes associated with functions a...
getParent
static const Function * getParent(const Value *V)
Definition:BasicAliasAnalysis.cpp:863
true
basic Basic Alias true
Definition:BasicAliasAnalysis.cpp:1981
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
D
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Casting.h
CodeMetrics.h
CommandLine.h
Compiler.h
LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition:Compiler.h:622
ConstantFolding.h
Constants.h
This file contains the declarations for the subclasses of Constant, which represent the different fla...
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
DOTGraphTraits.h
getElementIndex
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition:DataLayout.cpp:920
DataLayout.h
Idx
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
Definition:DeadArgumentElimination.cpp:353
DebugCounter.h
This file provides an implementation of debug counters.
DEBUG_COUNTER
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition:DebugCounter.h:190
Debug.h
LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition:Debug.h:106
DemandedBits.h
DenseMap.h
This file defines the DenseMap class.
DenseSet.h
This file defines the DenseSet and SmallDenseSet classes.
DerivedTypes.h
Dominators.h
Name
std::string Name
Definition:ELFObjHandler.cpp:77
Index
uint32_t Index
Definition:ELFObjHandler.cpp:83
Size
uint64_t Size
Definition:ELFObjHandler.cpp:81
End
bool End
Definition:ELF_riscv.cpp:480
Blocks
DenseMap< Block *, BlockRelaxAux > Blocks
Definition:ELF_riscv.cpp:507
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
runImpl
static bool runImpl(Function &F, const TargetLowering &TLI)
Definition:ExpandLargeDivRem.cpp:79
GlobalsModRef.h
This is the interface for a simple mod/ref and alias analysis over globals.
GraphWriter.h
Cleanup
static const HTTPClientCleanup Cleanup
Definition:HTTPClient.cpp:42
GEP
Hexagon Common GEP
Definition:HexagonCommonGEP.cpp:170
_
#define _
Definition:HexagonMCCodeEmitter.cpp:46
IRBuilder.h
MI
IRTranslator LLVM IR MI
Definition:IRTranslator.cpp:112
BasicBlock.h
Constant.h
Function.h
Instruction.h
IntrinsicInst.h
Module.h
Module.h This file contains the declarations for the Module class.
Operator.h
Type.h
Use.h
This defines the Use class.
User.h
Value.h
IVDescriptors.h
Users
iv Induction Variable Users
Definition:IVUsers.cpp:48
InjectTLIMappings.h
InstrTypes.h
InstructionCost.h
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
Instructions.h
Intrinsics.h
KnownBits.h
LoopAccessAnalysis.h
LoopInfo.h
LoopUtils.h
isSplat
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
Definition:LowerMatrixIntrinsics.cpp:102
F
#define F(x, y, z)
Definition:MD5.cpp:55
I
#define I(x, y, z)
Definition:MD5.cpp:58
Operands
mir Rename Register Operands
Definition:MIRNamerPass.cpp:74
MathExtras.h
MemoryLocation.h
This file provides utility analysis objects describing memory locations.
Unknown
@ Unknown
Definition:NVPTXISelLowering.cpp:4791
II
uint64_t IntrinsicInst * II
Definition:NVVMIntrRange.cpp:51
Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
OptimizationRemarkEmitter.h
P
#define P(N)
verify
ppc ctr loops verify
Definition:PPCCTRLoopsVerify.cpp:72
IsSelect
static bool IsSelect(MachineInstr &MI)
Definition:PPCISelLowering.cpp:13186
if
if(PassOpts->AAPipeline)
Definition:PassBuilderBindings.cpp:64
Pass.h
PatternMatch.h
PriorityQueue.h
This file defines the PriorityQueue class.
Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition:RISCVRedundantCopyElimination.cpp:75
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
isLoadCombineCandidateImpl
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
Definition:SLPVectorizer.cpp:12012
RunSLPVectorization
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
getWidenedType
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
Definition:SLPVectorizer.cpp:263
isVectorLikeInstWithConstOps
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
Definition:SLPVectorizer.cpp:417
calculateRtStride
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
Definition:SLPVectorizer.cpp:4834
isRepeatedNonIdentityClusteredMask
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
Definition:SLPVectorizer.cpp:5876
MaxPHINumOperands
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
Definition:SLPVectorizer.cpp:222
MaxVectorRegSizeOption
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
MaxProfitableLoadStride
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
findBuildAggregate
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
Definition:SLPVectorizer.cpp:20889
needToScheduleSingleInstruction
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:7398
clusterSortPtrAccesses
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Definition:SLPVectorizer.cpp:5369
getNumElements
static unsigned getNumElements(Type *Ty)
Definition:SLPVectorizer.cpp:254
buildUseMask
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
Definition:SLPVectorizer.cpp:616
areCompatibleCmpOps
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
Definition:SLPVectorizer.cpp:873
createInsertVector
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
Definition:SLPVectorizer.cpp:4979
getNumElems
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
Definition:SLPVectorizer.cpp:442
getShuffleCost
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
Definition:SLPVectorizer.cpp:4955
findBuildAggregate_rec
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
Definition:SLPVectorizer.cpp:20850
isSimple
static bool isSimple(Instruction *I)
Definition:SLPVectorizer.cpp:1138
MinScheduleRegionSize
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
Definition:SLPVectorizer.cpp:219
MinProfitableStridedLoads
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
isFirstInsertElement
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
Definition:SLPVectorizer.cpp:12300
getAltInstrMask
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
Definition:SLPVectorizer.cpp:1212
LookAheadMaxDepth
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
MaxVFOption
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
reorderReuses
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
Definition:SLPVectorizer.cpp:4568
combineOrders
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
Definition:SLPVectorizer.cpp:5916
MaxMemDepDistance
static const unsigned MaxMemDepDistance
Definition:SLPVectorizer.cpp:215
ViewSLPTree
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
doesInTreeUserNeedToExtract
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
Definition:SLPVectorizer.cpp:1100
VectorizeNonPowerOf2
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
MinTreeSize
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
reorderOrder
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
Definition:SLPVectorizer.cpp:4582
getFullVectorNumberOfElements
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
Definition:SLPVectorizer.cpp:271
performExtractsShuffleAction
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
Definition:SLPVectorizer.cpp:12355
ShouldVectorizeHor
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
isConstant
static bool isConstant(Value *V)
Definition:SLPVectorizer.cpp:410
isSplat
static bool isSplat(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:493
SLPCostThreshold
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
getPartNumElems
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
Definition:SLPVectorizer.cpp:435
allConstant
static bool allConstant(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:485
UsesLimit
static constexpr int UsesLimit
Definition:SLPVectorizer.cpp:210
getElementIndex
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
Definition:SLPVectorizer.cpp:568
isReductionCandidate
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
Definition:SLPVectorizer.cpp:21022
checkTreeSizes
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
Definition:SLPVectorizer.cpp:18678
getShufflevectorNumGroups
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:339
isCmpSameOrSwapped
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
Definition:SLPVectorizer.cpp:887
SLPSkipEarlyProfitabilityCheck
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
generateKeySubkey
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
Definition:SLPVectorizer.cpp:7417
ShouldStartVectorizeHorAtStore
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
getVectorCallCosts
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Definition:SLPVectorizer.cpp:9039
transformScalarShuffleIndiciesToVector
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
Definition:SLPVectorizer.cpp:300
SLPReVec
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
isValidForAlternation
static bool isValidForAlternation(unsigned Opcode)
Definition:SLPVectorizer.cpp:861
buildIntrinsicArgTypes
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
Definition:SLPVectorizer.cpp:11077
getExtractIndex
static std::optional< unsigned > getExtractIndex(Instruction *E)
Definition:SLPVectorizer.cpp:794
RootLookAheadMaxDepth
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
AliasedCheckLimit
static const unsigned AliasedCheckLimit
Definition:SLPVectorizer.cpp:206
getValueType
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
Definition:SLPVectorizer.cpp:243
gatherPossiblyVectorizableLoads
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
Definition:SLPVectorizer.cpp:6789
shortBundleName
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
Definition:SLPVectorizer.cpp:449
dumpOrder
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
Definition:SLPVectorizer.cpp:6734
isValidElementType
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
Definition:SLPVectorizer.cpp:231
getReductionInstr
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
Definition:SLPVectorizer.cpp:20925
calculateShufflevectorMask
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:391
allSameType
static bool allSameType(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:1093
getLocation
static MemoryLocation getLocation(Instruction *I)
Definition:SLPVectorizer.cpp:1129
isCommutative
static bool isCommutative(Instruction *I)
Definition:SLPVectorizer.cpp:509
allSameBlock
static bool allSameBlock(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:461
getFloorFullVectorNumberOfElements
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
Definition:SLPVectorizer.cpp:286
areTwoInsertFromSameBuildVector
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
Definition:SLPVectorizer.cpp:5500
arePointersCompatible
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
Definition:SLPVectorizer.cpp:4790
getGEPCosts
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
Definition:SLPVectorizer.cpp:9521
isUndefVector
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
Definition:SLPVectorizer.cpp:637
tryToVectorizeSequence
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
Definition:SLPVectorizer.cpp:21194
getSameOpcode
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
Definition:SLPVectorizer.cpp:909
ScheduleRegionSizeBudget
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
tryGetSecondaryReductionRoot
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
Definition:SLPVectorizer.cpp:20995
getRdxKind
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
Definition:SLPVectorizer.cpp:20819
matchRdxBop
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
Definition:SLPVectorizer.cpp:20969
MinVectorRegSizeOption
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
isFixedVectorShuffle
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
Definition:SLPVectorizer.cpp:706
getAggregateSize
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
Definition:SLPVectorizer.cpp:20822
getInsertExtractIndex
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
Definition:SLPVectorizer.cpp:543
RecursionMaxDepth
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
computeCommonAlignment
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
Definition:SLPVectorizer.cpp:4809
addMask
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
Definition:SLPVectorizer.cpp:1151
fixupOrderingIndices
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
Definition:SLPVectorizer.cpp:1186
createExtractVector
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is multiple of the subvectors length.
Definition:SLPVectorizer.cpp:5010
getNonPhiOperand
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
Definition:SLPVectorizer.cpp:21013
compareCmp
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
Definition:SLPVectorizer.cpp:21313
isReverseOrder
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
Definition:SLPVectorizer.cpp:4817
isAlternateInstruction
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
Definition:SLPVectorizer.cpp:9101
SLPVectorizer.h
STLExtras.h
This file contains some templates that are useful if you are working with the STL at all.
OS
raw_pwrite_stream & OS
Definition:SampleProfWriter.cpp:51
SV_NAME
#define SV_NAME
Definition:SandboxVectorizer.cpp:17
ScalarEvolutionExpander.h
ScalarEvolutionExpressions.h
ScalarEvolution.h
ScopeExit.h
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
SetOperations.h
This file defines generic set operations that may be used on set's of different types,...
SetVector.h
This file implements a set that has insertion order iteration characteristics.
SmallBitVector.h
This file implements the SmallBitVector class.
SmallPtrSet.h
This file defines the SmallPtrSet class.
SmallSet.h
This file defines the SmallSet class.
SmallString.h
This file defines the SmallString class.
Statistic.h
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
STATISTIC
#define STATISTIC(VARNAME, DESC)
Definition:Statistic.h:166
getType
static SymbolRef::Type getType(const Symbol *Sym)
Definition:TapiFile.cpp:39
Ptr
@ Ptr
Definition:TargetLibraryInfo.cpp:77
TargetLibraryInfo.h
TargetTransformInfo.h
This pass exposes codegen information to IR-level passes.
Local.h
getOpcode
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition:VPlanSLP.cpp:191
getOperands
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition:VPlanSLP.cpp:154
ValueHandle.h
ValueTracking.h
VectorUtils.h
Verifier.h
RHS
Value * RHS
Definition:X86PartialReduction.cpp:74
LHS
Value * LHS
Definition:X86PartialReduction.cpp:73
IV
static const uint32_t IV[8]
Definition:blake3_impl.h:78
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator
Merges shuffle masks and emits final shuffle instruction, if required.
Definition:SLPVectorizer.cpp:10144
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::ShuffleCostEstimator
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
Definition:SLPVectorizer.cpp:10673
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::add
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Definition:SLPVectorizer.cpp:10832
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::add
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Definition:SLPVectorizer.cpp:10790
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::needToDelay
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Definition:SLPVectorizer.cpp:10785
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::gather
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
Definition:SLPVectorizer.cpp:10896
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::~ShuffleCostEstimator
~ShuffleCostEstimator()
Definition:SLPVectorizer.cpp:11037
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::finalize
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Definition:SLPVectorizer.cpp:10942
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::createFreeze
InstructionCost createFreeze(InstructionCost Cost)
Definition:SLPVectorizer.cpp:10939
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::add
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Definition:SLPVectorizer.cpp:10814
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::adjustExtracts
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Definition:SLPVectorizer.cpp:10679
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::add
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Definition:SLPVectorizer.cpp:10849
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder
Merges shuffle masks and emits final shuffle instruction, if required.
Definition:SLPVectorizer.cpp:14096
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::add
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
Definition:SLPVectorizer.cpp:14447
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::addOrdered
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
Definition:SLPVectorizer.cpp:14502
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::needToDelay
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Definition:SLPVectorizer.cpp:14362
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::add
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Definition:SLPVectorizer.cpp:14401
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::gather
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
Definition:SLPVectorizer.cpp:14507
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::add
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Definition:SLPVectorizer.cpp:14413
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::createFreeze
Value * createFreeze(Value *V)
Definition:SLPVectorizer.cpp:14514
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::ShuffleInstructionBuilder
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
Definition:SLPVectorizer.cpp:14222
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::add
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Definition:SLPVectorizer.cpp:14380
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::adjustExtracts
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Definition:SLPVectorizer.cpp:14226
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::finalize
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Definition:SLPVectorizer.cpp:14519
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::~ShuffleInstructionBuilder
~ShuffleInstructionBuilder()
Definition:SLPVectorizer.cpp:14618
T
VectorType
Definition:ItaniumDemangle.h:1173
bool
llvm::AAManager
A manager for alias analyses.
Definition:AliasAnalysis.h:933
llvm::AAResults
Definition:AliasAnalysis.h:314
llvm::APInt
Class for arbitrary precision integers.
Definition:APInt.h:78
llvm::APInt::getAllOnes
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition:APInt.h:234
llvm::APInt::clearBit
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition:APInt.h:1407
llvm::APInt::setBit
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition:APInt.h:1330
llvm::APInt::isAllOnes
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition:APInt.h:371
llvm::APInt::isZero
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition:APInt.h:380
llvm::APInt::urem
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition:APInt.cpp:1640
llvm::APInt::clearAllBits
void clearAllBits()
Set every bit to 0.
Definition:APInt.h:1397
llvm::APInt::setAllBits
void setAllBits()
Set every bit to 1.
Definition:APInt.h:1319
llvm::APInt::setBits
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition:APInt.h:1367
llvm::APInt::getZero
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition:APInt.h:200
llvm::APInt::getBitsSetFrom
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition:APInt.h:286
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition:APInt.h:239
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition:PassManager.h:253
llvm::AnalysisManager::getCachedResult
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition:PassManager.h:429
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition:PassManager.h:410
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition:ArrayRef.h:41
llvm::ArrayRef::equals
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition:ArrayRef.h:190
llvm::ArrayRef::back
const T & back() const
back - Get the last element.
Definition:ArrayRef.h:177
llvm::ArrayRef::take_front
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition:ArrayRef.h:231
llvm::ArrayRef::drop_front
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition:ArrayRef.h:207
llvm::ArrayRef::front
const T & front() const
front - Get the first element.
Definition:ArrayRef.h:171
llvm::ArrayRef::end
iterator end() const
Definition:ArrayRef.h:157
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition:ArrayRef.h:168
llvm::ArrayRef::drop_back
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition:ArrayRef.h:213
llvm::ArrayRef::begin
iterator begin() const
Definition:ArrayRef.h:156
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition:ArrayRef.h:163
llvm::ArrayRef::slice
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition:ArrayRef.h:198
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition:AssumptionCache.h:173
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition:AssumptionCache.h:42
llvm::Attribute::getWithAlignment
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition:Attributes.cpp:234
llvm::BasicBlock
LLVM Basic Block Representation.
Definition:BasicBlock.h:61
llvm::BasicBlock::end
iterator end()
Definition:BasicBlock.h:474
llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition:BasicBlock.h:461
llvm::BasicBlock::reverse_iterator
InstListType::reverse_iterator reverse_iterator
Definition:BasicBlock.h:179
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition:BasicBlock.h:220
llvm::BasicBlock::rend
reverse_iterator rend()
Definition:BasicBlock.h:479
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition:BasicBlock.h:177
llvm::BasicBlock::isEHPad
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition:BasicBlock.h:688
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition:BasicBlock.h:240
llvm::BatchAAResults
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
Definition:AliasAnalysis.h:630
llvm::BatchAAResults::getModRefInfo
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Definition:AliasAnalysis.h:653
llvm::BinaryOperator
Definition:InstrTypes.h:170
llvm::CFGAnalyses
Represents analyses that only rely on functions' control flow.
Definition:Analysis.h:72
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition:InstrTypes.h:1112
llvm::CallBase::getBundleOperandsEndIndex
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition:InstrTypes.h:1980
llvm::CallBase::getOperandBundlesAsDefs
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
Definition:Instructions.cpp:483
llvm::CallBase::isNoBuiltin
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition:InstrTypes.h:1875
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition:InstrTypes.h:1341
llvm::CallBase::hasIdenticalOperandBundleSchema
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition:InstrTypes.h:2117
llvm::CallBase::getBundleOperandsStartIndex
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition:InstrTypes.h:1974
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition:InstrTypes.h:1286
llvm::CallBase::getFunctionType
FunctionType * getFunctionType() const
Definition:InstrTypes.h:1199
llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition:InstrTypes.h:1277
llvm::CallBase::arg_size
unsigned arg_size() const
Definition:InstrTypes.h:1284
llvm::CallBase::addParamAttr
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition:InstrTypes.h:1494
llvm::CallBase::hasOperandBundles
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition:InstrTypes.h:1971
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition:Instructions.h:1479
llvm::CastInst
This is the base class for all instructions that perform data casts.
Definition:InstrTypes.h:444
llvm::CmpInst
This class is the base class for the comparison instructions.
Definition:InstrTypes.h:661
llvm::CmpInst::makeCmpResultType
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition:InstrTypes.h:980
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition:InstrTypes.h:673
llvm::CmpInst::BAD_ICMP_PREDICATE
@ BAD_ICMP_PREDICATE
Definition:InstrTypes.h:706
llvm::CmpInst::ICMP_SLT
@ ICMP_SLT
signed less than
Definition:InstrTypes.h:702
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition:InstrTypes.h:703
llvm::CmpInst::ICMP_UGE
@ ICMP_UGE
unsigned greater or equal
Definition:InstrTypes.h:697
llvm::CmpInst::ICMP_UGT
@ ICMP_UGT
unsigned greater than
Definition:InstrTypes.h:696
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition:InstrTypes.h:700
llvm::CmpInst::ICMP_ULT
@ ICMP_ULT
unsigned less than
Definition:InstrTypes.h:698
llvm::CmpInst::ICMP_SGE
@ ICMP_SGE
signed greater or equal
Definition:InstrTypes.h:701
llvm::CmpInst::ICMP_ULE
@ ICMP_ULE
unsigned less or equal
Definition:InstrTypes.h:699
llvm::CmpInst::BAD_FCMP_PREDICATE
@ BAD_FCMP_PREDICATE
Definition:InstrTypes.h:693
llvm::CmpInst::getSwappedPredicate
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition:InstrTypes.h:825
llvm::CmpInst::getInversePredicate
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition:InstrTypes.h:787
llvm::CmpInst::getPredicate
Predicate getPredicate() const
Return the predicate for this instruction.
Definition:InstrTypes.h:763
llvm::CmpPredicate
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition:CmpPredicate.h:22
llvm::ConstantExpr::getIntToPtr
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition:Constants.cpp:2307
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition:Constants.h:83
llvm::ConstantInt::getZExtValue
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition:Constants.h:157
llvm::ConstantVector::getSplat
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition:Constants.cpp:1472
llvm::ConstantVector::get
static Constant * get(ArrayRef< Constant * > V)
Definition:Constants.cpp:1421
llvm::Constant
This is an important base class in LLVM.
Definition:Constant.h:42
llvm::Constant::getAllOnesValue
static Constant * getAllOnesValue(Type *Ty)
Definition:Constants.cpp:420
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition:Constants.cpp:373
llvm::DWARFExpression::Operation
This class represents an Operation in the Expression.
Definition:DWARFExpression.h:32
llvm::DWARFExpression::Operation::getNumOperands
uint64_t getNumOperands() const
Definition:DWARFExpression.h:90
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition:DataLayout.h:63
llvm::DataLayout::getTypeStoreSizeInBits
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition:DataLayout.h:434
llvm::DataLayout::getIndexType
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition:DataLayout.cpp:878
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition:DataLayout.h:617
llvm::DebugCounter::shouldExecute
static bool shouldExecute(unsigned CounterName)
Definition:DebugCounter.h:87
llvm::DemandedBitsAnalysis
An analysis that produces DemandedBits for a function.
Definition:DemandedBits.h:103
llvm::DemandedBits
Definition:DemandedBits.h:40
llvm::DemandedBits::getDemandedBits
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
Definition:DemandedBits.cpp:399
llvm::DenseMapBase::lookup
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition:DenseMap.h:194
llvm::DenseMapBase::find
iterator find(const_arg_type_t< KeyT > Val)
Definition:DenseMap.h:156
llvm::DenseMapBase::try_emplace
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition:DenseMap.h:226
llvm::DenseMapBase::erase
bool erase(const KeyT &Val)
Definition:DenseMap.h:321
llvm::DenseMapBase::size
unsigned size() const
Definition:DenseMap.h:99
llvm::DenseMapBase::empty
bool empty() const
Definition:DenseMap.h:98
llvm::DenseMapBase::count
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition:DenseMap.h:152
llvm::DenseMapBase::end
iterator end()
Definition:DenseMap.h:84
llvm::DenseMapBase::at
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition:DenseMap.h:202
llvm::DenseMapBase::contains
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition:DenseMap.h:147
llvm::DenseMapBase::insert
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition:DenseMap.h:211
llvm::DenseMapBase::clear
void clear()
Definition:DenseMap.h:110
llvm::DenseMap
Definition:DenseMap.h:727
llvm::DenseSet
Implements a dense probed hash-table based set.
Definition:DenseSet.h:278
llvm::DomTreeNodeBase< BasicBlock >
llvm::DomTreeNodeBase::getDFSNumIn
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Definition:GenericDomTree.h:140
llvm::DominatorTreeAnalysis
Analysis pass which computes a DominatorTree.
Definition:Dominators.h:279
llvm::DominatorTreeBase::updateDFSNumbers
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
Definition:GenericDomTree.h:805
llvm::DominatorTreeBase::getNode
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Definition:GenericDomTree.h:401
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition:Dominators.h:162
llvm::DominatorTree::isReachableFromEntry
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition:Dominators.cpp:321
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition:Dominators.cpp:122
llvm::ElementCount::getFixed
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition:TypeSize.h:311
llvm::ExtractElementInst
This instruction extracts a single (scalar) element from a VectorType value.
Definition:Instructions.h:1775
llvm::ExtractValueInst
This instruction extracts a struct member or array element value from an aggregate value.
Definition:Instructions.h:2397
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition:FMF.h:20
llvm::FastMathFlags::set
void set()
Definition:FMF.h:62
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition:DerivedTypes.h:563
llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition:DerivedTypes.h:606
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition:Type.cpp:791
llvm::FunctionType::params
ArrayRef< Type * > params() const
Definition:DerivedTypes.h:132
llvm::FunctionType::getReturnType
Type * getReturnType() const
Definition:DerivedTypes.h:126
llvm::Function
Definition:Function.h:63
llvm::Function::empty
bool empty() const
Definition:Function.h:859
llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition:Instructions.h:933
llvm::GraphTraits< BoUpSLP * >::nodes_iterator::operator++
nodes_iterator operator++()
Definition:SLPVectorizer.cpp:4474
llvm::GraphTraits< BoUpSLP * >::nodes_iterator::nodes_iterator
nodes_iterator(const ItTy &It2)
Definition:SLPVectorizer.cpp:4472
llvm::GraphTraits< BoUpSLP * >::nodes_iterator::operator*
NodeRef operator*()
Definition:SLPVectorizer.cpp:4473
llvm::GraphTraits< BoUpSLP * >::nodes_iterator::operator!=
bool operator!=(const nodes_iterator &N2) const
Definition:SLPVectorizer.cpp:4478
llvm::IRBuilderBase::InsertPointGuard
Definition:IRBuilder.h:394
llvm::IRBuilderBase
Common base class shared among various IRBuilders.
Definition:IRBuilder.h:113
llvm::IRBuilderBase::CreateExtractVector
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition:IRBuilder.h:1072
llvm::IRBuilderBase::CreateInsertElement
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition:IRBuilder.h:2511
llvm::IRBuilderBase::getInt1Ty
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition:IRBuilder.h:530
llvm::IRBuilderBase::CreateInsertVector
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition:IRBuilder.h:1080
llvm::IRBuilderBase::CreateExtractElement
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition:IRBuilder.h:2499
llvm::IRBuilderBase::getIntNTy
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition:IRBuilder.h:558
llvm::IRBuilderBase::CreateAlignedLoad
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition:IRBuilder.h:1815
llvm::IRBuilderBase::getTrue
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition:IRBuilder.h:485
llvm::IRBuilderBase::CreateSelect
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition:IRBuilder.cpp:1053
llvm::IRBuilderBase::GetInsertPoint
BasicBlock::iterator GetInsertPoint() const
Definition:IRBuilder.h:194
llvm::IRBuilderBase::CreateFreeze
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition:IRBuilder.h:2574
llvm::IRBuilderBase::CreateCast
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition:IRBuilder.h:2186
llvm::IRBuilderBase::GetInsertBlock
BasicBlock * GetInsertBlock() const
Definition:IRBuilder.h:193
llvm::IRBuilderBase::setFastMathFlags
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition:IRBuilder.h:330
llvm::IRBuilderBase::SetCurrentDebugLocation
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition:IRBuilder.h:239
llvm::IRBuilderBase::CreateGEP
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition:IRBuilder.h:1874
llvm::IRBuilderBase::getInt64
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition:IRBuilder.h:510
llvm::IRBuilderBase::getAllOnesMask
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition:IRBuilder.h:867
llvm::IRBuilderBase::CreateUnOp
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition:IRBuilder.h:1761
llvm::IRBuilderBase::CreateBinaryIntrinsic
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition:IRBuilder.cpp:889
llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition:IRBuilder.cpp:900
llvm::IRBuilderBase::getInt32
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition:IRBuilder.h:505
llvm::IRBuilderBase::CreateCmp
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition:IRBuilder.h:2404
llvm::IRBuilderBase::CreatePHI
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition:IRBuilder.h:2435
llvm::IRBuilderBase::CreateBitCast
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition:IRBuilder.h:2152
llvm::IRBuilderBase::CreateUnaryIntrinsic
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition:IRBuilder.cpp:881
llvm::IRBuilderBase::CreateShuffleVector
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition:IRBuilder.h:2533
llvm::IRBuilderBase::getFalse
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition:IRBuilder.h:490
llvm::IRBuilderBase::CreateCall
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition:IRBuilder.h:2449
llvm::IRBuilderBase::CreateBinOp
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition:IRBuilder.h:1671
llvm::IRBuilderBase::ClearInsertionPoint
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition:IRBuilder.h:188
llvm::IRBuilderBase::CreateIntCast
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition:IRBuilder.h:2225
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition:IRBuilder.h:199
llvm::IRBuilderBase::CreateAlignedStore
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition:IRBuilder.h:1834
llvm::IRBuilderBase::CreateICmp
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition:IRBuilder.h:2380
llvm::IRBuilderBase::CreateFMul
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition:IRBuilder.h:1614
llvm::IRBuilderBase::CreateMul
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition:IRBuilder.h:1404
llvm::IRBuilderBase::CreateMaskedGather
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition:IRBuilder.cpp:596
llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition:IRBuilder.h:2705
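Usage sketch (illustrative, not code from this file): splatting a scalar with the IRBuilder calls listed above. The helper name broadcastScalar and the fixed width of 4 are assumptions.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Hypothetical helper: broadcast scalar V into a 4-lane vector at the end of BB.
static Value *broadcastScalar(BasicBlock *BB, Value *V) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB);                        // append to the end of BB
  auto *VecTy = VectorType::get(V->getType(), ElementCount::getFixed(4));
  Value *Vec = PoisonValue::get(VecTy);              // start from a poison vector
  Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(0), "ins");
  SmallVector<int, 4> Mask(4, 0);                    // splat lane 0 into every lane
  return Builder.CreateShuffleVector(Vec, PoisonValue::get(VecTy), Mask, "splat");
}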
llvm::InsertElementInst
This instruction inserts a single (scalar) element into a VectorType value.
Definition:Instructions.h:1834
llvm::InsertElementInst::getType
VectorType * getType() const
Overload to return most specific vector type.
Definition:Instructions.h:1862
llvm::InsertValueInst
This instruction inserts a struct field of array element value into an aggregate value.
Definition:Instructions.h:2485
llvm::InstructionCost
Definition:InstructionCost.h:29
llvm::InstructionCost::getInvalid
static InstructionCost getInvalid(CostType Val=0)
Definition:InstructionCost.h:73
llvm::InstructionCost::isValid
bool isValid() const
Definition:InstructionCost.h:79
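A minimal sketch of how invalid costs propagate through cost accumulation; the helper name is hypothetical.
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

// Hypothetical helper: sum two costs, treating "invalid" as a hard failure.
static InstructionCost addCosts(InstructionCost A, InstructionCost B) {
  InstructionCost Total = A + B;           // arithmetic stays invalid if either side is invalid
  if (!Total.isValid())
    return InstructionCost::getInvalid();  // signal "cannot be costed" to the caller
  return Total;
}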
llvm::Instruction
Definition:Instruction.h:68
llvm::Instruction::isCast
bool isCast() const
Definition:Instruction.h:319
llvm::Instruction::mayReadOrWriteMemory
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition:Instruction.h:799
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition:Instruction.h:511
llvm::Instruction::moveAfter
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition:Instruction.cpp:191
llvm::Instruction::isBinaryOp
bool isBinaryOp() const
Definition:Instruction.h:315
llvm::Instruction::comesBefore
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
Definition:Instruction.cpp:334
llvm::Instruction::getNextNonDebugInstruction
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
Definition:Instruction.cpp:1226
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition:Instruction.h:310
llvm::Instruction::BinaryOps
BinaryOps
Definition:Instruction.h:1008
llvm::Instruction::isIdenticalTo
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
Definition:Instruction.cpp:914
llvm::Instruction::isIntDivRem
bool isIntDivRem() const
Definition:Instruction.h:316
llvm::Instruction::UnaryOps
UnaryOps
Definition:Instruction.h:1001
llvm::Instruction::CastOps
CastOps
Definition:Instruction.h:1022
llvm::IntegerType::get
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition:Type.cpp:311
llvm::IntrinsicCostAttributes
Definition:TargetTransformInfo.h:119
llvm::IntrinsicCostAttributes::getArgTypes
const SmallVectorImpl< Type * > & getArgTypes() const
Definition:TargetTransformInfo.h:156
llvm::LoadInst
An instruction for reading from memory.
Definition:Instructions.h:176
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition:Instructions.h:255
llvm::LoadInst::isSimple
bool isSimple() const
Definition:Instructions.h:247
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition:Instructions.h:211
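A small sketch of the typical LoadInst queries listed above; the helper name is made up.
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Hypothetical helper: return the pointer of a simple (non-volatile, non-atomic) load.
static const Value *getSimpleLoadPointer(const Value *V, Align &AlignOut) {
  const auto *LI = dyn_cast<LoadInst>(V);
  if (!LI || !LI->isSimple())      // skip volatile/atomic loads
    return nullptr;
  AlignOut = LI->getAlign();       // alignment of the access
  return LI->getPointerOperand();  // address being loaded from
}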
llvm::LoopAnalysis
Analysis pass that exposes the LoopInfo for a function.
Definition:LoopInfo.h:566
llvm::LoopBase::getLoopLatch
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition:GenericLoopInfoImpl.h:256
llvm::LoopInfoBase::getLoopFor
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition:GenericLoopInfo.h:606
llvm::LoopInfo
Definition:LoopInfo.h:407
llvm::Loop
Represents a single loop in the control flow graph.
Definition:LoopInfo.h:39
llvm::MapVector
This class implements a map that also provides access to all stored values in a deterministic order.
Definition:MapVector.h:36
llvm::MapVector::end
iterator end()
Definition:MapVector.h:71
llvm::MapVector::takeVector
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition:MapVector.h:55
llvm::MapVector::find
iterator find(const KeyT &Key)
Definition:MapVector.h:167
llvm::MapVector::empty
bool empty() const
Definition:MapVector.h:79
llvm::MapVector::try_emplace
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition:MapVector.h:118
llvm::MapVector::insert
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition:MapVector.h:141
llvm::MapVector::lookup
ValueT lookup(const KeyT &Key) const
Definition:MapVector.h:110
llvm::MapVector::size
size_type size() const
Definition:MapVector.h:60
llvm::MapVector::front
std::pair< KeyT, ValueT > & front()
Definition:MapVector.h:83
llvm::MapVector::clear
void clear()
Definition:MapVector.h:88
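An illustrative MapVector sketch showing try_emplace and the deterministic iteration order; the helper name is an assumption.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical helper: count instructions per opcode in first-seen opcode order.
static MapVector<unsigned, unsigned> countOpcodes(ArrayRef<Instruction *> Insts) {
  MapVector<unsigned, unsigned> Counts;
  for (Instruction *I : Insts)
    ++Counts.try_emplace(I->getOpcode(), 0).first->second; // insert-or-find, then bump
  return Counts;                                           // iteration order is deterministic
}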
llvm::MemIntrinsic
This is the common base class for memset/memcpy/memmove.
Definition:IntrinsicInst.h:1205
llvm::MemoryLocation
Representation for a specific memory location.
Definition:MemoryLocation.h:227
llvm::MemoryLocation::get
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
Definition:MemoryLocation.cpp:35
llvm::MemoryLocation::Ptr
const Value * Ptr
The address of the start of the location.
Definition:MemoryLocation.h:235
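A minimal sketch of using MemoryLocation::get together with alias analysis; the helper name is hypothetical.
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical helper: ask alias analysis whether two loads may touch the same memory.
static bool loadsMayAlias(AAResults &AA, const LoadInst *L1, const LoadInst *L2) {
  MemoryLocation Loc1 = MemoryLocation::get(L1);  // pointer + access size of L1
  MemoryLocation Loc2 = MemoryLocation::get(L2);
  return !AA.isNoAlias(Loc1, Loc2);
}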
llvm::MutableArrayRef
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition:ArrayRef.h:310
llvm::MutableArrayRef::front
T & front() const
front - Get the first element.
Definition:ArrayRef.h:366
llvm::MutableArrayRef::end
iterator end() const
Definition:ArrayRef.h:360
llvm::MutableArrayRef::begin
iterator begin() const
Definition:ArrayRef.h:359
llvm::MutableArrayRef::slice
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition:ArrayRef.h:379
llvm::OptimizationRemarkEmitterAnalysis
Definition:OptimizationRemarkEmitter.h:164
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition:OptimizationRemarkEmitter.h:32
llvm::OptimizationRemarkMissed
Diagnostic information for missed-optimization remarks.
Definition:DiagnosticInfo.h:807
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition:DiagnosticInfo.h:762
llvm::OwningArrayRef
This is a MutableArrayRef that owns its array.
Definition:ArrayRef.h:452
llvm::PHINode
Definition:Instructions.h:2600
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition:Instructions.h:2735
llvm::PHINode::getIncomingValueForBlock
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Definition:Instructions.h:2775
llvm::PHINode::getIncomingBlock
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Definition:Instructions.h:2695
llvm::PHINode::getNumIncomingValues
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Definition:Instructions.h:2671
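A short sketch of building a PHI with the addIncoming interface listed above; the helper name and block names are assumptions.
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Hypothetical helper: merge two incoming values in a join block with a PHI node.
static PHINode *createMergePHI(BasicBlock *JoinBB, Value *FromThen, BasicBlock *ThenBB,
                               Value *FromElse, BasicBlock *ElseBB) {
  IRBuilder<> Builder(JoinBB, JoinBB->begin());  // PHI nodes must lead the block
  PHINode *Phi = Builder.CreatePHI(FromThen->getType(), /*NumReservedValues=*/2, "merge");
  Phi->addIncoming(FromThen, ThenBB);
  Phi->addIncoming(FromElse, ElseBB);
  return Phi;
}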
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition:Pass.h:94
llvm::PointerType::getUnqual
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition:DerivedTypes.h:686
llvm::PointerUnion
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition:PointerUnion.h:118
llvm::PointerUnion::isNull
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition:PointerUnion.h:142
llvm::PointerUnion::dyn_cast
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition:PointerUnion.h:168
llvm::PoisonValue::get
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition:Constants.cpp:1878
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition:Analysis.h:111
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition:Analysis.h:117
llvm::PreservedAnalyses::preserveSet
void preserveSet()
Mark an analysis set as preserved.
Definition:Analysis.h:146
llvm::PriorityQueue
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition:PriorityQueue.h:28
llvm::RecurrenceDescriptor::getOpcode
unsigned getOpcode() const
Definition:IVDescriptors.h:212
llvm::RecurrenceDescriptor::isIntMinMaxRecurrenceKind
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
Definition:IVDescriptors.h:234
llvm::RecurrenceDescriptor::isMinMaxRecurrenceKind
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
Definition:IVDescriptors.h:246
llvm::SCEVExpander
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Definition:ScalarEvolutionExpander.h:63
llvm::SCEVExpander::expandCodeFor
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
Definition:ScalarEvolutionExpander.cpp:1443
llvm::SCEV
This class represents an analyzed expression in the program.
Definition:ScalarEvolution.h:71
llvm::SCEV::isZero
bool isZero() const
Return true if the expression is a constant zero.
Definition:ScalarEvolution.cpp:448
llvm::SCEV::isNonConstantNegative
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Definition:ScalarEvolution.cpp:454
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition:ScalarEvolution.cpp:386
llvm::ScalarEvolutionAnalysis
Analysis pass that exposes the ScalarEvolution for a function.
Definition:ScalarEvolution.h:2320
llvm::ScalarEvolution
The main scalar evolution driver.
Definition:ScalarEvolution.h:447
llvm::ScalarEvolution::getConstant
const SCEV * getConstant(ConstantInt *V)
Definition:ScalarEvolution.cpp:473
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition:ScalarEvolution.cpp:4547
llvm::ScalarEvolution::forgetValue
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
Definition:ScalarEvolution.cpp:8542
llvm::ScalarEvolution::getMinusSCEV
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
Definition:ScalarEvolution.cpp:4655
llvm::ScalarEvolution::getMulExpr
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
Definition:ScalarEvolution.cpp:3106
llvm::ScalarEvolution::getUDivExactExpr
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
Definition:ScalarEvolution.cpp:3587
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition:ScalarEvolution.cpp:2526
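An illustrative SCEV sketch: computing the symbolic distance between two pointers with getSCEV and getMinusSCEV. The helper name is hypothetical and this is not the pass's own pointer-difference logic.
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include <optional>

using namespace llvm;

// Hypothetical helper: symbolic distance PtrB - PtrA, if it folds to a constant.
static std::optional<int64_t> getConstantDistance(ScalarEvolution &SE, Value *PtrA,
                                                  Value *PtrB) {
  const SCEV *A = SE.getSCEV(PtrA);
  const SCEV *B = SE.getSCEV(PtrB);
  const SCEV *Dist = SE.getMinusSCEV(B, A);     // B - A
  if (const auto *C = dyn_cast<SCEVConstant>(Dist))
    return C->getAPInt().getSExtValue();        // distance folds to a constant
  return std::nullopt;                          // not a compile-time constant
}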
llvm::SelectInst
This class represents the LLVM 'select' instruction.
Definition:Instructions.h:1657
llvm::SetVector
A vector that has set insertion semantics.
Definition:SetVector.h:57
llvm::SetVector::getArrayRef
ArrayRef< value_type > getArrayRef() const
Definition:SetVector.h:84
llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition:SetVector.h:98
llvm::SetVector::front
const value_type & front() const
Return the first element of the SetVector.
Definition:SetVector.h:143
llvm::SetVector::clear
void clear()
Completely clear the SetVector.
Definition:SetVector.h:273
llvm::SetVector::empty
bool empty() const
Determine if the SetVector is empty or not.
Definition:SetVector.h:93
llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition:SetVector.h:162
llvm::SetVector::contains
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition:SetVector.h:254
llvm::ShuffleVectorInst
This instruction constructs a fixed permutation of two input vectors.
Definition:Instructions.h:1901
llvm::ShuffleVectorInst::isZeroEltSplatMask
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
Definition:Instructions.cpp:1911
llvm::ShuffleVectorInst::isOneUseSingleSourceMask
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
Definition:Instructions.cpp:2253
llvm::ShuffleVectorInst::isDeInterleaveMaskOfFactor
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
Definition:Instructions.cpp:2379
llvm::ShuffleVectorInst::isIdentityMask
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
Definition:Instructions.cpp:1883
llvm::ShuffleVectorInst::isExtractSubvectorMask
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
Definition:Instructions.cpp:2010
llvm::ShuffleVectorInst::isReverseMask
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
Definition:Instructions.cpp:1891
llvm::ShuffleVectorInst::isInsertSubvectorMask
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
Definition:Instructions.cpp:2039
llvm::ShuffleVectorInst::isInterleaveMask
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Definition:Instructions.cpp:2295
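An illustrative sketch that maps a constant shuffle mask to one of the TTI shuffle kinds listed further below; the mapping is a simplification, not the pass's exact classification logic.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical classification, roughly from most to least specific pattern.
static TargetTransformInfo::ShuffleKind classifyMask(ArrayRef<int> Mask, int NumSrcElts) {
  if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
    return TargetTransformInfo::SK_Broadcast;         // splat of element 0
  if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
    return TargetTransformInfo::SK_Reverse;
  int Index = 0;
  if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index))
    return TargetTransformInfo::SK_ExtractSubvector;  // contiguous slice starting at Index
  return TargetTransformInfo::SK_PermuteSingleSrc;    // generic single-source permute
}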
llvm::SmallBitVector
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
Definition:SmallBitVector.h:35
llvm::SmallBitVector::find_first
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
Definition:SmallBitVector.h:230
llvm::SmallBitVector::set
SmallBitVector & set()
Definition:SmallBitVector.h:366
llvm::SmallBitVector::test
bool test(unsigned Idx) const
Definition:SmallBitVector.h:472
llvm::SmallBitVector::find_next
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
Definition:SmallBitVector.h:277
llvm::SmallBitVector::all
bool all() const
Returns true if all bits are set.
Definition:SmallBitVector.h:216
llvm::SmallBitVector::size
size_type size() const
Returns the number of bits in this bitvector.
Definition:SmallBitVector.h:195
llvm::SmallBitVector::any
bool any() const
Returns true if any bit is set.
Definition:SmallBitVector.h:209
llvm::SmallBitVector::count
size_type count() const
Returns the number of bits which are set.
Definition:SmallBitVector.h:200
llvm::SmallBitVector::reset
SmallBitVector & reset()
Definition:SmallBitVector.h:401
llvm::SmallBitVector::none
bool none() const
Returns true if none of the bits are set.
Definition:SmallBitVector.h:223
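A minimal sketch of the find_first/find_next iteration idiom over a SmallBitVector; the helper name is made up.
#include "llvm/ADT/SmallBitVector.h"

using namespace llvm;

// Hypothetical helper: visit every set bit (e.g. every lane still marked as scalar).
static unsigned visitSetLanes(const SmallBitVector &Lanes) {
  unsigned Visited = 0;
  for (int I = Lanes.find_first(); I != -1; I = Lanes.find_next(I))
    ++Visited;          // bit I is set
  return Visited;       // equals Lanes.count()
}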
llvm::SmallDenseMap
Definition:DenseMap.h:883
llvm::SmallDenseSet
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition:DenseSet.h:298
llvm::SmallPtrSetImplBase::size
size_type size() const
Definition:SmallPtrSet.h:94
llvm::SmallPtrSetImplBase::clear
void clear()
Definition:SmallPtrSet.h:97
llvm::SmallPtrSetImplBase::empty
bool empty() const
Definition:SmallPtrSet.h:93
llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition:SmallPtrSet.h:363
llvm::SmallPtrSetImpl::erase
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition:SmallPtrSet.h:401
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition:SmallPtrSet.h:452
llvm::SmallPtrSetImpl::end
iterator end() const
Definition:SmallPtrSet.h:477
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition:SmallPtrSet.h:384
llvm::SmallPtrSetImpl::begin
iterator begin() const
Definition:SmallPtrSet.h:472
llvm::SmallPtrSetImpl::contains
bool contains(ConstPtrType Ptr) const
Definition:SmallPtrSet.h:458
llvm::SmallPtrSet< Value *, 16 >
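A minimal sketch of the insert().second deduplication idiom with the SmallPtrSet<Value *, 16> type shown above; the helper name is an assumption.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Hypothetical helper: visit each value once, counting distinct entries.
static unsigned countUniqueValues(ArrayRef<Value *> VL) {
  SmallPtrSet<Value *, 16> Seen;
  unsigned Unique = 0;
  for (Value *V : VL)
    if (Seen.insert(V).second)  // true only the first time V is inserted
      ++Unique;
  return Unique;                // same as Seen.size() afterwards
}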
llvm::SmallSetVector
A SetVector that performs no allocations if smaller than a certain size.
Definition:SetVector.h:370
llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition:SmallSet.h:132
llvm::SmallSet::count
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition:SmallSet.h:175
llvm::SmallSet::contains
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition:SmallSet.h:222
llvm::SmallSet::insert
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition:SmallSet.h:181
llvm::SmallSet::size
size_type size() const
Definition:SmallSet.h:170
llvm::SmallString
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition:SmallString.h:26
llvm::SmallVectorBase::empty
bool empty() const
Definition:SmallVector.h:81
llvm::SmallVectorBase::size
size_t size() const
Definition:SmallVector.h:78
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition:SmallVector.h:573
llvm::SmallVectorImpl::pop_back_val
T pop_back_val()
Definition:SmallVector.h:673
llvm::SmallVectorImpl::assign
void assign(size_type NumElts, ValueParamT Elt)
Definition:SmallVector.h:704
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition:SmallVector.h:937
llvm::SmallVectorImpl::reserve
void reserve(size_type N)
Definition:SmallVector.h:663
llvm::SmallVectorImpl::append
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition:SmallVector.h:683
llvm::SmallVectorImpl::clear
void clear()
Definition:SmallVector.h:610
llvm::SmallVectorImpl::swap
void swap(SmallVectorImpl &RHS)
Definition:SmallVector.h:968
llvm::SmallVectorImpl::resize
void resize(size_type N)
Definition:SmallVector.h:638
llvm::SmallVectorTemplateBase::pop_back
void pop_back()
Definition:SmallVector.h:425
llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition:SmallVector.h:413
llvm::SmallVectorTemplateCommon::end
iterator end()
Definition:SmallVector.h:269
llvm::SmallVectorTemplateCommon::rbegin
reverse_iterator rbegin()
Definition:SmallVector.h:273
llvm::SmallVectorTemplateCommon::front
reference front()
Definition:SmallVector.h:299
llvm::SmallVectorTemplateCommon::begin
iterator begin()
Definition:SmallVector.h:267
llvm::SmallVectorTemplateCommon::back
reference back()
Definition:SmallVector.h:308
llvm::SmallVectorTemplateCommon::rend
reverse_iterator rend()
Definition:SmallVector.h:275
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition:SmallVector.h:1196
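An illustrative SmallVector sketch: functions take SmallVectorImpl<T>& so callers may pick their own inline size. The helper name is hypothetical.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

// Hypothetical helper: collect the even values into a caller-provided vector.
static void collectEvens(ArrayRef<int> In, SmallVectorImpl<int> &Out) {
  Out.clear();
  Out.reserve(In.size());
  for (int V : In)
    if (V % 2 == 0)
      Out.push_back(V);
}
// Usage: SmallVector<int, 8> Evens; collectEvens(Values, Evens);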
llvm::StoreInst
An instruction for storing to memory.
Definition:Instructions.h:292
llvm::StoreInst::getPointerOperandType
Type * getPointerOperandType() const
Definition:Instructions.h:384
llvm::StoreInst::getValueOperand
Value * getValueOperand()
Definition:Instructions.h:378
llvm::StoreInst::getPointerOperand
Value * getPointerOperand()
Definition:Instructions.h:381
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition:StringRef.h:51
llvm::TargetFolder
TargetFolder - Create constants with target dependent folding.
Definition:TargetFolder.h:34
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition:TargetTransformInfo.h:3194
llvm::TargetLibraryAnalysis
Analysis pass providing the TargetLibraryInfo.
Definition:TargetLibraryInfo.h:614
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition:TargetLibraryInfo.h:280
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition:TargetTransformInfo.h:212
llvm::TargetTransformInfo::getCastContextHint
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
Definition:TargetTransformInfo.cpp:996
llvm::TargetTransformInfo::getStridedMemoryOpCost
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
Definition:TargetTransformInfo.cpp:1156
llvm::TargetTransformInfo::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
Definition:TargetTransformInfo.cpp:1067
llvm::TargetTransformInfo::getRegisterBitWidth
TypeSize getRegisterBitWidth(RegisterKind K) const
Definition:TargetTransformInfo.cpp:776
llvm::TargetTransformInfo::isLegalMaskedGather
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
Definition:TargetTransformInfo.cpp:490
llvm::TargetTransformInfo::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
Definition:TargetTransformInfo.cpp:1125
llvm::TargetTransformInfo::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
Definition:TargetTransformInfo.cpp:1165
llvm::TargetTransformInfo::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
Definition:TargetTransformInfo.cpp:1177
llvm::TargetTransformInfo::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
Definition:TargetTransformInfo.cpp:1215
llvm::TargetTransformInfo::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
Definition:TargetTransformInfo.cpp:1039
llvm::TargetTransformInfo::getGEPCost
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
Definition:TargetTransformInfo.cpp:248
llvm::TargetTransformInfo::isLegalInterleavedAccessType
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
Definition:TargetTransformInfo.cpp:531
llvm::TargetTransformInfo::isLegalBroadcastLoad
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
Definition:TargetTransformInfo.cpp:485
llvm::TargetTransformInfo::getExtendedReductionCost
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
Definition:TargetTransformInfo.cpp:1233
llvm::TargetTransformInfo::getOperandInfo
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
Definition:TargetTransformInfo.cpp:880
llvm::TargetTransformInfo::getRegisterClassForType
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
Definition:TargetTransformInfo.cpp:767
llvm::TargetTransformInfo::forceScalarizeMaskedGather
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
Definition:TargetTransformInfo.cpp:506
llvm::TargetTransformInfo::isLegalStridedLoadStore
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
Definition:TargetTransformInfo.cpp:526
llvm::TargetTransformInfo::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Definition:TargetTransformInfo.cpp:1224
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition:TargetTransformInfo.h:263
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition:TargetTransformInfo.h:264
llvm::TargetTransformInfo::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
Definition:TargetTransformInfo.cpp:940
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition:TargetTransformInfo.h:1126
llvm::TargetTransformInfo::OP_NegatedPowerOf2
@ OP_NegatedPowerOf2
Definition:TargetTransformInfo.h:1129
llvm::TargetTransformInfo::OP_None
@ OP_None
Definition:TargetTransformInfo.h:1127
llvm::TargetTransformInfo::OP_PowerOf2
@ OP_PowerOf2
Definition:TargetTransformInfo.h:1128
llvm::TargetTransformInfo::getPointersChainCost
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
Definition:TargetTransformInfo.cpp:254
llvm::TargetTransformInfo::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
Definition:TargetTransformInfo.cpp:807
llvm::TargetTransformInfo::isTypeLegal
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
Definition:TargetTransformInfo.cpp:583
llvm::TargetTransformInfo::getCostOfKeepingLiveOverCall
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
Definition:TargetTransformInfo.cpp:1247
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition:TargetTransformInfo.h:1180
llvm::TargetTransformInfo::getShuffleCost
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
Definition:TargetTransformInfo.cpp:976
llvm::TargetTransformInfo::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition:TargetTransformInfo.cpp:781
llvm::TargetTransformInfo::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
Definition:TargetTransformInfo.cpp:1146
llvm::TargetTransformInfo::getNumberOfRegisters
unsigned getNumberOfRegisters(unsigned ClassID) const
Definition:TargetTransformInfo.cpp:759
llvm::TargetTransformInfo::isLegalAltInstr
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
Definition:TargetTransformInfo.cpp:495
llvm::TargetTransformInfo::isFPVectorizationPotentiallyUnsafe
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
Definition:TargetTransformInfo.cpp:680
llvm::TargetTransformInfo::getStoreMinimumVF
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition:TargetTransformInfo.cpp:812
llvm::TargetTransformInfo::TCC_Expensive
@ TCC_Expensive
The cost of a 'div' instruction on x86.
Definition:TargetTransformInfo.h:291
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition:TargetTransformInfo.h:289
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition:TargetTransformInfo.h:290
llvm::TargetTransformInfo::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
Definition:TargetTransformInfo.cpp:628
llvm::TargetTransformInfo::getInstructionCost
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
Definition:TargetTransformInfo.cpp:270
llvm::TargetTransformInfo::getExtractWithExtendCost
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
Definition:TargetTransformInfo.cpp:1050
llvm::TargetTransformInfo::getNumberOfParts
unsigned getNumberOfParts(Type *Tp) const
Definition:TargetTransformInfo.cpp:1193
llvm::TargetTransformInfo::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
Definition:TargetTransformInfo.cpp:1079
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition:TargetTransformInfo.h:1098
llvm::TargetTransformInfo::SK_InsertSubvector
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
Definition:TargetTransformInfo.h:1105
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition:TargetTransformInfo.h:1101
llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition:TargetTransformInfo.h:1109
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition:TargetTransformInfo.h:1099
llvm::TargetTransformInfo::SK_PermuteTwoSrc
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
Definition:TargetTransformInfo.h:1107
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition:TargetTransformInfo.h:1100
llvm::TargetTransformInfo::SK_ExtractSubvector
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
Definition:TargetTransformInfo.h:1106
llvm::TargetTransformInfo::getCallInstrCost
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
Definition:TargetTransformInfo.cpp:1185
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition:TargetTransformInfo.h:1389
llvm::TargetTransformInfo::CastContextHint::Reversed
@ Reversed
The cast is used with a reversed load/store.
llvm::TargetTransformInfo::CastContextHint::None
@ None
The cast is not used with a load/store of any kind.
llvm::TargetTransformInfo::CastContextHint::Normal
@ Normal
The cast is used with a normal load/store.
llvm::TargetTransformInfo::CastContextHint::GatherScatter
@ GatherScatter
The cast is used with a gather/scatter.
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition:TargetTransformInfo.h:1118
llvm::TargetTransformInfo::OK_UniformConstantValue
@ OK_UniformConstantValue
Definition:TargetTransformInfo.h:1121
llvm::TargetTransformInfo::OK_UniformValue
@ OK_UniformValue
Definition:TargetTransformInfo.h:1120
llvm::TargetTransformInfo::OK_AnyValue
@ OK_AnyValue
Definition:TargetTransformInfo.h:1119
llvm::TargetTransformInfo::OK_NonUniformConstantValue
@ OK_NonUniformConstantValue
Definition:TargetTransformInfo.h:1122
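An illustrative cost-model query combining the TTI entries above: a vector add plus a reverse shuffle compared against the scalar adds it would replace, all at reciprocal-throughput cost kind. The helper name and the comparison itself are assumptions, not the pass's actual profitability logic.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical profitability check for a single widened add.
static bool vectorAddLooksProfitable(const TargetTransformInfo &TTI, FixedVectorType *VecTy) {
  const auto CostKind = TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost VecCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind) +
      TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VecTy, /*Mask=*/{}, CostKind);
  InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy->getElementType(), CostKind) *
      VecTy->getNumElements();
  return VecCost < ScalarCost;
}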
llvm::Twine
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition:Twine.h:81
llvm::TypeSize
Definition:TypeSize.h:334
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition:Type.h:45
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition:Type.h:270
llvm::Type::isX86_FP80Ty
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition:Type.h:159
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition:Type.h:243
llvm::Type::isPointerTy
bool isPointerTy() const
True if this is an instance of PointerType.
Definition:Type.h:264
llvm::Type::isEmptyTy
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
llvm::Type::getStructNumElements
unsigned getStructNumElements() const
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
llvm::Type::isSingleValueType
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition:Type.h:295
llvm::Type::print
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
llvm::Type::isPPC_FP128Ty
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition:Type.h:165
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
llvm::Type::getWithNewType
Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type, whilst keeping the old number of elements.
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition:Type.h:128
llvm::Type::isFloatingPointTy
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition:Type.h:184
llvm::Type::isPtrOrPtrVectorTy
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition:Type.h:267
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition:Type.h:237
llvm::Type::getTypeID
TypeID getTypeID() const
Return the type id for the type.
Definition:Type.h:136
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition:Type.h:225
llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition:Type.h:139
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition:Type.h:355
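A small sketch of the Type predicates listed above; the helper name and MaxBits parameter are assumptions.
#include "llvm/IR/Type.h"

using namespace llvm;

// Hypothetical predicate: integer scalars, or integer vectors whose element width
// does not exceed MaxBits.
static bool isSmallIntOrIntVector(Type *Ty, unsigned MaxBits) {
  if (!Ty->isIntOrIntVectorTy())
    return false;
  return Ty->getScalarSizeInBits() <= MaxBits;  // element width for vectors, own width otherwise
}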
llvm::UnaryOperator
Definition:InstrTypes.h:100
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition:Constants.cpp:1859
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition:Use.h:43
llvm::User
Definition:User.h:44
llvm::User::operands
op_range operands()
Definition:User.h:288
llvm::User::replaceUsesOfWith
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition:User.cpp:21
llvm::User::User
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition:User.h:115
llvm::User::op_begin
op_iterator op_begin()
Definition:User.h:280
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition:User.h:228
llvm::User::getNumOperands
unsigned getNumOperands() const
Definition:User.h:250
llvm::User::operand_values
iterator_range< value_op_iterator > operand_values()
Definition:User.h:312
llvm::VFDatabase
The Vector Function Database.
Definition:VectorUtils.h:31
llvm::VFDatabase::getMappings
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition:VectorUtils.h:72
llvm::Value
LLVM Value Representation.
Definition:Value.h:74
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition:Value.h:255
llvm::Value::user_begin
user_iterator user_begin()
Definition:Value.h:397
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition:Value.h:434
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition:Value.cpp:534
llvm::Value::users
iterator_range< user_iterator > users()
Definition:Value.h:421
llvm::Value::getValueID
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition:Value.h:532
llvm::Value::hasNUsesOrMore
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition:Value.cpp:153
llvm::Value::hasNUses
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition:Value.cpp:149
llvm::Value::use_empty
bool use_empty() const
Definition:Value.h:344
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition:Value.cpp:1075
llvm::Value::getNumUses
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition:Value.cpp:255
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition:Value.cpp:309
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition:Value.cpp:383
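A minimal sketch combining replaceAllUsesWith and takeName; the helper name is hypothetical.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Hypothetical helper: replace an instruction with a new value, keeping its IR name.
static void replaceAndName(Instruction *Old, Value *New) {
  New->takeName(Old);            // transfer the name from Old to New
  Old->replaceAllUsesWith(New);  // rewrite every use of Old
  // Old is now dead (use_empty()); the caller decides whether to erase it.
}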
llvm::VectorType
Base class of all SIMD vector types.
Definition:DerivedTypes.h:427
llvm::VectorType::getElementCount
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition:DerivedTypes.h:665
llvm::VectorType::get
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
llvm::VectorType::getElementType
Type * getElementType() const
Definition:DerivedTypes.h:460
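A short sketch of VectorType::get with an ElementCount taken from another vector type; the helper name is an assumption.
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

// Hypothetical helper: build a vector of EltTy with the same lane count as Reference.
static VectorType *getSameCountVector(Type *EltTy, VectorType *Reference) {
  ElementCount EC = Reference->getElementCount();  // fixed or scalable lane count
  return VectorType::get(EltTy, EC);
}
// e.g. VectorType::get(IntegerType::get(Ctx, 1), ElementCount::getFixed(8)) yields <8 x i1>.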
llvm::WeakTrackingVH
Value handle that is nullable, but tries to track the Value.
Definition:ValueHandle.h:204
llvm::cl::opt
Definition:CommandLine.h:1423
llvm::detail::DenseSetImpl::insert
std::pair< iterator, bool > insert(const ValueT &V)
Definition:DenseSet.h:213
llvm::detail::DenseSetImpl::clear
void clear()
Definition:DenseSet.h:92
llvm::detail::DenseSetImpl::find
iterator find(const_arg_type_t< ValueT > V)
Definition:DenseSet.h:187
llvm::detail::DenseSetImpl::end
iterator end()
Definition:DenseSet.h:182
llvm::detail::DenseSetImpl::size
size_type size() const
Definition:DenseSet.h:81
llvm::detail::DenseSetImpl::empty
bool empty() const
Definition:DenseSet.h:80
llvm::detail::DenseSetImpl::contains
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition:DenseSet.h:193
llvm::detail::DenseSetImpl::begin
iterator begin()
Definition:DenseSet.h:181
llvm::detail::DenseSetImpl::erase
bool erase(const ValueT &V)
Definition:DenseSet.h:97
llvm::detail::DenseSetImpl::count
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition:DenseSet.h:95
llvm::details::FixedOrScalableQuantity::getFixedValue
constexpr ScalarTy getFixedValue() const
Definition:TypeSize.h:202
llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition:STLFunctionalExtras.h:37
llvm::hash_code
An opaque object representing a hash code.
Definition:Hashing.h:75
llvm::ilist_detail::node_parent_access::getParent
const ParentTy * getParent() const
Definition:ilist_node.h:32
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition:ilist_node.h:132
llvm::ilist_node_with_parent::getPrevNode
NodeTy * getPrevNode()
Definition:ilist_node.h:339
llvm::iterator_adaptor_base
CRTP base class for adapting an iterator to a different type.
Definition:iterator.h:237
llvm::iterator_range
A range adaptor for a pair of iterators.
Definition:iterator_range.h:42
llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition:raw_ostream.h:52
llvm::raw_ostream::indent
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
Definition:raw_ostream.cpp:495
llvm::raw_string_ostream
A raw_ostream that writes to an std::string.
Definition:raw_ostream.h:661
llvm::raw_svector_ostream
A raw_ostream that writes to an SmallVector or SmallString.
Definition:raw_ostream.h:691
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics
A helper class used for scoring candidates for two consecutive lanes.
Definition:SLPVectorizer.cpp:1687
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreConsecutiveExtracts
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
Definition:SLPVectorizer.cpp:1725
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::getShallowScore
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
Definition:SLPVectorizer.cpp:1747
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreAllUserVectorized
static const int ScoreAllUserVectorized
Score if all users are vectorized.
Definition:SLPVectorizer.cpp:1741
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreSameOpcode
static const int ScoreSameOpcode
Instructions with the same opcode.
Definition:SLPVectorizer.cpp:1731
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreUndef
static const int ScoreUndef
Matching with an undef is preferable to failing.
Definition:SLPVectorizer.cpp:1737
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::getScoreAtLevelRec
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
Definition:SLPVectorizer.cpp:1919
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreFail
static const int ScoreFail
Score for failing to find a decent match.
Definition:SLPVectorizer.cpp:1739
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreMaskedGatherCandidate
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
Definition:SLPVectorizer.cpp:1723
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreSplat
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
Definition:SLPVectorizer.cpp:1735
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::LookAheadHeuristics
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
Definition:SLPVectorizer.cpp:1696
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreSplatLoads
static const int ScoreSplatLoads
The same load multiple times.
Definition:SLPVectorizer.cpp:1719
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreReversedLoads
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
Definition:SLPVectorizer.cpp:1721
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreConstants
static const int ScoreConstants
Constants.
Definition:SLPVectorizer.cpp:1729
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreAltOpcodes
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
Definition:SLPVectorizer.cpp:1733
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreConsecutiveLoads
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
Definition:SLPVectorizer.cpp:1714
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreReversedExtracts
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
Definition:SLPVectorizer.cpp:1727
llvm::slpvectorizer::BoUpSLP::VLOperands
A helper data structure to hold the operands of a vector of instructions.
Definition:SLPVectorizer.cpp:1988
llvm::slpvectorizer::BoUpSLP::VLOperands::getVL
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
Definition:SLPVectorizer.cpp:2598
llvm::slpvectorizer::BoUpSLP::VLOperands::getModeStr
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
Definition:SLPVectorizer.cpp:2751
llvm::slpvectorizer::BoUpSLP::VLOperands::VLOperands
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Definition:SLPVectorizer.cpp:2588
llvm::slpvectorizer::BoUpSLP::VLOperands::dumpMode
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
Definition:SLPVectorizer.cpp:2773
llvm::slpvectorizer::BoUpSLP::VLOperands::dump
LLVM_DUMP_METHOD void dump() const
Debug print.
Definition:SLPVectorizer.cpp:2800
llvm::slpvectorizer::BoUpSLP::VLOperands::operator<<
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
Definition:SLPVectorizer.cpp:2777
llvm::slpvectorizer::BoUpSLP::VLOperands::reorder
void reorder()
Definition:SLPVectorizer.cpp:2610
llvm::slpvectorizer::BoUpSLP::VLOperands::printMode
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
Definition:SLPVectorizer.cpp:2767
llvm::slpvectorizer::BoUpSLP::VLOperands::print
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Definition:SLPVectorizer.cpp:2781
llvm::slpvectorizer::BoUpSLP
Bottom Up SLP Vectorizer.
Definition:SLPVectorizer.cpp:1336
llvm::slpvectorizer::BoUpSLP::OrdersType
SmallVector< unsigned, 4 > OrdersType
Definition:SLPVectorizer.cpp:1356
llvm::slpvectorizer::BoUpSLP::getRootNodeTypeWithNoCast
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
Definition:SLPVectorizer.cpp:1426
llvm::slpvectorizer::BoUpSLP::findPartiallyOrderedLoads
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
Definition:SLPVectorizer.cpp:5476
llvm::slpvectorizer::BoUpSLP::LoadsState
LoadsState
Tracks how the loads in the given sequence can be represented (vectorized, strided, scattered, or gathered).
Definition:SLPVectorizer.cpp:1344
llvm::slpvectorizer::BoUpSLP::LoadsState::ScatterVectorize
@ ScatterVectorize
llvm::slpvectorizer::BoUpSLP::LoadsState::Gather
@ Gather
llvm::slpvectorizer::BoUpSLP::LoadsState::Vectorize
@ Vectorize
llvm::slpvectorizer::BoUpSLP::LoadsState::StridedVectorize
@ StridedVectorize
llvm::slpvectorizer::BoUpSLP::operator<<
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
Definition:SLPVectorizer.cpp:4062
llvm::slpvectorizer::BoUpSLP::reorderTopToBottom
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
Definition:SLPVectorizer.cpp:5938
llvm::slpvectorizer::BoUpSLP::reorderBottomToTop
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
Definition:SLPVectorizer.cpp:6270
llvm::slpvectorizer::BoUpSLP::registerNonVectorizableLoads
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
Definition:SLPVectorizer.cpp:1644
llvm::slpvectorizer::BoUpSLP::getTreeSize
unsigned getTreeSize() const
Definition:SLPVectorizer.cpp:1504
llvm::slpvectorizer::BoUpSLP::~BoUpSLP
~BoUpSLP()
Definition:SLPVectorizer.cpp:4527
llvm::slpvectorizer::BoUpSLP::areKnownNonVectorizableLoads
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given sequence of loads is already known to be non-vectorizable.
Definition:SLPVectorizer.cpp:1650
llvm::slpvectorizer::BoUpSLP::getCanonicalGraphSize
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
Definition:SLPVectorizer.cpp:1507
llvm::slpvectorizer::BoUpSLP::areAnalyzedReductionVals
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
Definition:SLPVectorizer.cpp:2936
llvm::slpvectorizer::BoUpSLP::canVectorizeLoads
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
Definition:SLPVectorizer.cpp:5025
llvm::slpvectorizer::BoUpSLP::isLoadCombineCandidate
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
Definition:SLPVectorizer.cpp:12062
llvm::slpvectorizer::BoUpSLP::analyzedReductionVals
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
Definition:SLPVectorizer.cpp:2941
llvm::slpvectorizer::BoUpSLP::isLoadCombineReductionCandidate
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
Definition:SLPVectorizer.cpp:12052
llvm::slpvectorizer::BoUpSLP::getVectorElementSize
unsigned getVectorElementSize(Value *V)
Definition:SLPVectorizer.cpp:17661
llvm::slpvectorizer::BoUpSLP::isSignedMinBitwidthRootNode
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedn...
Definition:SLPVectorizer.cpp:1445
llvm::slpvectorizer::BoUpSLP::analyzedReductionRoot
void analyzedReductionRoot(Instruction *I)
Register the given instruction as already analyzed as a possible reduction root.
Definition:SLPVectorizer.cpp:2931
llvm::slpvectorizer::BoUpSLP::getRootNodeScalars
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
Definition:SLPVectorizer.cpp:1419
llvm::slpvectorizer::BoUpSLP::computeMinimumValueSizes
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
Definition:SLPVectorizer.cpp:18136
llvm::slpvectorizer::BoUpSLP::deleteTree
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
Definition:SLPVectorizer.cpp:1477
llvm::slpvectorizer::BoUpSLP::getTreeCost
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
Definition:SLPVectorizer.cpp:12461
llvm::slpvectorizer::BoUpSLP::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
Definition:SLPVectorizer.cpp:1585
llvm::slpvectorizer::BoUpSLP::ValueSet
SmallPtrSet< Value *, 16 > ValueSet
Definition:SLPVectorizer.cpp:1353
llvm::slpvectorizer::BoUpSLP::ExtraValueToDebugLocsMap
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
Definition:SLPVectorizer.cpp:1355
llvm::slpvectorizer::BoUpSLP::BoUpSLP
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
Definition:SLPVectorizer.cpp:1358
llvm::slpvectorizer::BoUpSLP::isNotScheduled
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Definition:SLPVectorizer.cpp:2959
llvm::slpvectorizer::BoUpSLP::transformNodes
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
Definition:SLPVectorizer.cpp:9773
llvm::slpvectorizer::BoUpSLP::isDeleted
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
Definition:SLPVectorizer.cpp:2830
llvm::slpvectorizer::BoUpSLP::buildExternalUses
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
Definition:SLPVectorizer.cpp:6547
llvm::slpvectorizer::BoUpSLP::isTreeTinyAndNotFullyVectorizable
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
Definition:SLPVectorizer.cpp:12075
llvm::slpvectorizer::BoUpSLP::removeInstructionsAndOperands
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
Definition:SLPVectorizer.cpp:2842
llvm::slpvectorizer::BoUpSLP::canMapToVector
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
Definition:SLPVectorizer.cpp:8910
llvm::slpvectorizer::BoUpSLP::getMinVF
unsigned getMinVF(unsigned Sz) const
Definition:SLPVectorizer.cpp:1581
llvm::slpvectorizer::BoUpSLP::isAnalyzedReductionRoot
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being a possible reduction root.
Definition:SLPVectorizer.cpp:2926
llvm::slpvectorizer::BoUpSLP::getReorderingData
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
Definition:SLPVectorizer.cpp:5552
llvm::slpvectorizer::BoUpSLP::eraseInstruction
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
Definition:SLPVectorizer.cpp:2835
llvm::slpvectorizer::BoUpSLP::doesRootHaveInTreeUses
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
Definition:SLPVectorizer.cpp:1413
llvm::slpvectorizer::BoUpSLP::getORE
OptimizationRemarkEmitter * getORE()
Definition:SLPVectorizer.cpp:1654
llvm::slpvectorizer::BoUpSLP::isAnyGathered
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
Definition:SLPVectorizer.cpp:2951
llvm::slpvectorizer::BoUpSLP::ValueList
SmallVector< Value *, 8 > ValueList
Definition:SLPVectorizer.cpp:1351
llvm::slpvectorizer::BoUpSLP::buildTree
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
Definition:SLPVectorizer.cpp:6771
llvm::slpvectorizer::BoUpSLP::isTreeNotExtendable
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
Definition:SLPVectorizer.cpp:12159
llvm::slpvectorizer::BoUpSLP::getReductionType
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
Definition:SLPVectorizer.cpp:1450
llvm::slpvectorizer::BoUpSLP::getMaxVecRegSize
unsigned getMaxVecRegSize() const
Definition:SLPVectorizer.cpp:1572
llvm::slpvectorizer::BoUpSLP::isVectorized
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
Definition:SLPVectorizer.cpp:2964
llvm::slpvectorizer::BoUpSLP::isIdentityOrder
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
Definition:SLPVectorizer.cpp:1516
llvm::slpvectorizer::BoUpSLP::isGathered
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
Definition:SLPVectorizer.cpp:2955
llvm::slpvectorizer::BoUpSLP::getSpillCost
InstructionCost getSpillCost() const
Definition:SLPVectorizer.cpp:12187
llvm::slpvectorizer::BoUpSLP::getMinVecRegSize
unsigned getMinVecRegSize() const
Definition:SLPVectorizer.cpp:1577
llvm::slpvectorizer::BoUpSLP::vectorizeTree
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
Definition:SLPVectorizer.cpp:16300
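A minimal sketch of how a caller typically drives these BoUpSLP entry points (buildTree, buildExternalUses, computeMinimumValueSizes, getTreeCost, vectorizeTree); R, VL, and CostThreshold are placeholders for illustration, not the exact control flow of SLPVectorizerPass:
  // Hypothetical driver: R is a configured BoUpSLP, VL holds isomorphic
  // scalars (e.g. the value operands of consecutive stores), and
  // CostThreshold is an assumed tuning knob.
  SmallDenseSet<Value *> UserIgnore;
  R.buildTree(VL, UserIgnore);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                          // not worth vectorizing
  R.buildExternalUses();
  R.computeMinimumValueSizes();            // min-bitwidth analysis
  InstructionCost Cost = R.getTreeCost();
  if (Cost.isValid() && Cost < -CostThreshold) {
    R.vectorizeTree();                     // emit vector IR for the tree
    return true;
  }
  return false;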
llvm::slpvectorizer::BoUpSLP::findBestRootPair
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
Definition:SLPVectorizer.cpp:2810
llvm::slpvectorizer::BoUpSLP::findReusedOrderedScalars
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
Definition:SLPVectorizer.cpp:4627
llvm::slpvectorizer::BoUpSLP::clearReductionData
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
Definition:SLPVectorizer.cpp:2945
llvm::slpvectorizer::BoUpSLP::optimizeGatherSequence
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Definition:SLPVectorizer.cpp:16956
uint32_t
uint64_t
unsigned
llvm::VFDatabase::getVectorizedFunction
Function * getVectorizedFunction(const VFShape &Shape) const
Definition:VectorUtils.h:106
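A hedged sketch of looking up a vector variant for a call via VFDatabase and VFShape (the VFShape::get entry appears further down in this index); CI and VF are placeholders:
  // Assumes CI is a CallInst* and VF the desired number of lanes.
  VFShape Shape = VFShape::get(CI->getFunctionType(),
                               ElementCount::getFixed(VF),
                               /*HasGlobalPred=*/false);
  VFDatabase DB(*CI);
  if (Function *VecFunc = DB.getVectorizedFunction(Shape)) {
    // A vector variant with this shape is available (e.g. via TLI mappings).
    (void)VecFunc;
  }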
iterator.h
iterator_range.h
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
ErrorHandling.h
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition:ErrorHandling.h:143
llvm::AArch64CC::LS
@ LS
Definition:AArch64BaseInfo.h:264
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition:AMDGPUMetadata.h:395
llvm::AMDGPU::PALMD::Key
Key
PAL metadata keys.
Definition:AMDGPUMetadata.h:487
llvm::AMDGPU::VGPRIndexMode::Id
Id
Definition:SIDefines.h:310
llvm::AMDGPU::P1
@ P1
Definition:AMDGPURegBankLegalizeRules.h:53
llvm::ARMII::HorizontalReduction
@ HorizontalReduction
Definition:ARMBaseInfo.h:425
llvm::ARM_MB::ST
@ ST
Definition:ARMBaseInfo.h:73
llvm::ARM_PROC::IE
@ IE
Definition:ARMBaseInfo.h:27
llvm::ARM::PredBlockMask::TE
@ TE
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition:BitmaskEnum.h:125
llvm::COFF::Entry
@ Entry
Definition:COFF.h:844
llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition:CallingConv.h:34
llvm::Intrinsic::getOrInsertDeclaration
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition:Intrinsics.cpp:732
llvm::Intrinsic::not_intrinsic
@ not_intrinsic
Definition:Intrinsics.h:44
llvm::LegalityPredicates::all
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
Definition:LegalizerInfo.h:234
llvm::M68kBeads::Term
@ Term
Definition:M68kBaseInfo.h:116
llvm::M68k::MemAddrModeKind::U
@ U
llvm::M68k::MemAddrModeKind::V
@ V
llvm::M68k::MemAddrModeKind::u
@ u
llvm::M68k::MemAddrModeKind::K
@ K
llvm::M68k::MemAddrModeKind::L
@ L
llvm::MipsISD::Ext
@ Ext
Definition:MipsISelLowering.h:157
llvm::MipsISD::Ins
@ Ins
Definition:MipsISelLowering.h:158
llvm::NVPTX::PTXLdStInstCode::Scalar
@ Scalar
Definition:NVPTX.h:162
llvm::NVPTX::PTXLdStInstCode::V2
@ V2
Definition:NVPTX.h:163
llvm::PatternMatch
Definition:PatternMatch.h:47
llvm::PatternMatch::m_Store
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
Definition:PatternMatch.h:1930
llvm::PatternMatch::m_And
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1216
llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1102
llvm::PatternMatch::m_BinOp
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition:PatternMatch.h:100
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1228
llvm::PatternMatch::m_FMul
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1174
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition:PatternMatch.h:49
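These PatternMatch helpers compose into declarative IR matchers; a small usage sketch (the value V is a placeholder):
  using namespace llvm::PatternMatch;
  Value *A, *B;
  // Match a single-use add feeding V and capture both operands.
  if (match(V, m_OneUse(m_Add(m_Value(A), m_Value(B))))) {
    // A and B now point at the add's operands.
  }
  // Match a store whose stored value is a zext of a load, ignoring operands.
  if (match(V, m_Store(m_ZExt(m_Load(m_Value())), m_Value()))) {
    // ...
  }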
llvm::PatternMatch::m_Instruction
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition:PatternMatch.h:826
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition:PatternMatch.h:885
llvm::PatternMatch::m_ExtractElt
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
Definition:PatternMatch.h:1837
llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition:PatternMatch.h:168
llvm::PatternMatch::m_Select
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
Definition:PatternMatch.h:1799
llvm::PatternMatch::m_SMin
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
Definition:PatternMatch.h:2348
llvm::PatternMatch::m_FAdd
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1108
llvm::PatternMatch::m_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1168
llvm::PatternMatch::m_OneUse
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition:PatternMatch.h:67
llvm::PatternMatch::m_LogicalOr
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
Definition:PatternMatch.h:3099
llvm::PatternMatch::m_Load
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
Definition:PatternMatch.h:1923
llvm::PatternMatch::m_ZExt
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
Definition:PatternMatch.h:2107
llvm::PatternMatch::m_UMax
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
Definition:PatternMatch.h:2354
llvm::PatternMatch::m_Cmp
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition:PatternMatch.h:105
llvm::PatternMatch::m_SMax
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
Definition:PatternMatch.h:2342
llvm::PatternMatch::m_APInt
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition:PatternMatch.h:299
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition:PatternMatch.h:92
llvm::PatternMatch::m_ZExtOrSExt
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
Definition:PatternMatch.h:2138
llvm::PatternMatch::m_Shl
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1234
llvm::PatternMatch::m_LogicalAnd
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
Definition:PatternMatch.h:3081
llvm::PatternMatch::m_Undef
auto m_Undef()
Match an arbitrary undef constant.
Definition:PatternMatch.h:152
llvm::PatternMatch::m_Or
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1222
llvm::PatternMatch::m_UMin
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
Definition:PatternMatch.h:2360
llvm::PatternMatch::m_CombineOr
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition:PatternMatch.h:239
llvm::RISCVFenceField::R
@ R
Definition:RISCVBaseInfo.h:373
llvm::SIEncodingFamily::VI
@ VI
Definition:SIDefines.h:37
llvm::SIEncodingFamily::SI
@ SI
Definition:SIDefines.h:36
llvm::SPII::Store
@ Store
Definition:SparcInstrInfo.h:33
llvm::X86AS::GS
@ GS
Definition:X86.h:210
llvm::X86::FirstMacroFusionInstKind::Cmp
@ Cmp
llvm::ZeroCallUsedRegs::ZeroCallUsedRegsKind::Used
@ Used
llvm::cl::Hidden
@ Hidden
Definition:CommandLine.h:137
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition:CommandLine.h:443
llvm::codeview::EncodedFramePtrReg::BasePtr
@ BasePtr
llvm::codeview::ExportFlags::IsConstant
@ IsConstant
llvm::dwarf::Index
Index
Definition:Dwarf.h:882
llvm::dxil::ElementType::I1
@ I1
llvm::logicalview::LVPrintKind::Instructions
@ Instructions
llvm::objcopy::AdjustKind::Set
@ Set
llvm::omp::RTLDependInfoFields::Len
@ Len
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition:OptimizationRemarkEmitter.h:135
llvm::pdb::PDB_MemoryType::Stack
@ Stack
llvm::sampleprof::Base
@ Base
Definition:Discriminator.h:58
llvm::sys::path::begin
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition:Path.cpp:226
llvm::tgtok::In
@ In
Definition:TGLexer.h:84
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition:AddressRanges.h:18
llvm::drop_begin
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition:STLExtras.h:329
llvm::getPointersDiff
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
Definition:LoopAccessAnalysis.cpp:1535
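A sketch of the typical way this is used to test whether two loads are consecutive in memory (LA, LB, DL, and SE are assumed to be in scope):
  // Distance is in elements of the given element types; LA/LB are LoadInsts.
  std::optional<int> Diff =
      getPointersDiff(LA->getType(), LA->getPointerOperand(),
                      LB->getType(), LB->getPointerOperand(), DL, SE,
                      /*StrictCheck=*/true);
  bool Consecutive = Diff && *Diff == 1; // LB reads the element right after LA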
llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition:SparseBitVector.h:877
llvm::createSimpleReduction
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition:LoopUtils.cpp:1278
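For example, reducing a vector value to a scalar with the signature shown above (InsertPt and VecVal are placeholders):
  IRBuilder<> Builder(InsertPt);   // InsertPt: an assumed insertion point
  Value *Sum = createSimpleReduction(Builder, VecVal, RecurKind::Add);
  Value *Max = createSimpleReduction(Builder, VecVal, RecurKind::SMax);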
llvm::doesNotNeedToBeScheduled
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
Definition:SLPVectorizer.cpp:1288
llvm::Offset
@ Offset
Definition:DWP.cpp:480
llvm::zip
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition:STLExtras.h:854
llvm::stable_sort
void stable_sort(R &&Range)
Definition:STLExtras.h:2037
llvm::find
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1759
llvm::for_each
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1732
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1739
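These STLExtras wrappers take whole ranges instead of begin/end pairs; a brief example over a placeholder scalar list VL, also using count_if and enumerate from the same header (both listed further down in this index):
  // True only when every candidate scalar is a load.
  bool AllLoads = all_of(VL, [](Value *V) { return isa<LoadInst>(V); });
  // Count the candidates that have more than one use.
  auto NumMultiUse = count_if(VL, [](Value *V) { return !V->hasOneUse(); });
  // Iterate values together with their lane index.
  for (auto [Lane, V] : enumerate(VL)) {
    (void)Lane; (void)V;
  }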
llvm::hash_value
hash_code hash_value(const FixedPointSemantics &Val)
Definition:APFixedPoint.h:136
llvm::getMinMaxReductionIntrinsicOp
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition:LoopUtils.cpp:989
llvm::isEqual
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
Definition:GCNRegPressure.cpp:22
llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition:STLExtras.h:1697
llvm::RecursivelyDeleteTriviallyDeadInstructions
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition:Local.cpp:546
llvm::getVectorIntrinsicIDForCall
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
Definition:VectorUtils.cpp:209
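A short, hedged example of classifying a call before attempting to widen it (CI and TLI are assumed to be in scope):
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID != Intrinsic::not_intrinsic && isTriviallyVectorizable(ID)) {
    // The call maps to an intrinsic whose vector form can be built lane-wise,
    // so it is a candidate for widening.
  }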
llvm::reorderScalars
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
Definition:SLPVectorizer.cpp:1239
llvm::make_scope_exit
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition:ScopeExit.h:59
llvm::Depth
@ Depth
Definition:SIMachineScheduler.h:36
llvm::enumerate
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition:STLExtras.h:2448
llvm::set_intersect
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
Definition:SetOperations.h:58
llvm::AlignStyle::Right
@ Right
llvm::AlignStyle::Left
@ Left
llvm::verifyFunction
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition:Verifier.cpp:7301
llvm::salvageDebugInfo
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition:Utils.cpp:1683
llvm::Failed
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition:Error.h:198
llvm::isUsedOutsideBlock
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
Definition:SLPVectorizer.cpp:1271
llvm::canConvertToMinOrMaxIntrinsic
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
Definition:ValueTracking.cpp:9174
llvm::set_is_subset
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
Definition:SetOperations.h:151
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
Definition:ValueTracking.cpp:6775
llvm::interleaveComma
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition:STLExtras.h:2207
llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition:STLExtras.h:657
llvm::alignDown
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition:MathExtras.h:556
llvm::post_order
iterator_range< po_iterator< T > > post_order(const T &G)
Definition:PostOrderIterator.h:197
llvm::getAlign
MaybeAlign getAlign(const Function &F, unsigned Index)
Definition:NVPTXUtilities.cpp:323
llvm::propagateMetadata
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
Definition:VectorUtils.cpp:942
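A sketch of how metadata is typically carried over from the replaced scalars onto a freshly built vector instruction (VecLd and VL are placeholders):
  // VecLd: the freshly built vector load; VL: the scalar loads it replaces.
  if (auto *VecInst = dyn_cast<Instruction>(VecLd))
    propagateMetadata(VecInst, VL); // intersects tbaa, alias.scope, fpmath, ...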
llvm::bit_ceil
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition:bit.h:342
llvm::copy_if
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1785
llvm::isGather
bool isGather(IntrinsicInst *IntInst)
Definition:ARMBaseInstrInfo.h:937
llvm::getPointerOperand
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
Definition:Instructions.h:4998
llvm::PowerOf2Ceil
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition:MathExtras.h:395
llvm::MaskedValueIsZero
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
Definition:ValueTracking.cpp:333
llvm::erase
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition:STLExtras.h:2107
llvm::transform
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition:STLExtras.h:1952
llvm::has_single_bit
constexpr bool has_single_bit(T Value) noexcept
Definition:bit.h:146
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1746
llvm::isInstructionTriviallyDead
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition:Local.cpp:406
llvm::Log2_32
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition:MathExtras.h:341
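A few concrete values for the integer helpers in this index (PowerOf2Ceil, bit_ceil, Log2_32, and divideCeil further down), written as plain assertions:
  assert(PowerOf2Ceil(5) == 8);
  assert(bit_ceil(5u) == 8u);    // smallest power of two >= 5
  assert(Log2_32(32) == 5);      // floor log base 2
  assert(divideCeil(7, 3) == 3); // integer ceiling division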
llvm::createStrideMask
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
Definition:VectorUtils.cpp:1032
llvm::reverse
auto reverse(ContainerTy &&C)
Definition:STLExtras.h:420
llvm::inversePermutation
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
Definition:SLPVectorizer.cpp:1229
llvm::get
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
Definition:PointerIntPair.h:270
llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition:STLExtras.h:1664
llvm::createReplicatedMask
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
Definition:VectorUtils.cpp:1012
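The two mask builders above produce constant shuffle masks; small examples of their output, per their documented semantics:
  // One de-interleaved lane: Start, Start+Stride, Start+2*Stride, ...
  SmallVector<int, 16> StrideMask =
      createStrideMask(/*Start=*/1, /*Stride=*/2, /*VF=*/4);
  // StrideMask == {1, 3, 5, 7}
  // Each source element repeated ReplicationFactor times.
  SmallVector<int, 16> RepMask =
      createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/2);
  // RepMask == {0, 0, 0, 1, 1, 1}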
llvm::ComplexDeinterleavingOperation::Splat
@ Splat
llvm::find_if_not
auto find_if_not(R &&Range, UnaryPredicate P)
Definition:STLExtras.h:1771
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition:Debug.cpp:163
llvm::hasFullVectorsOrPowerOf2
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
Definition:SLPVectorizer.cpp:1304
llvm::isPointerTy
bool isPointerTy(const Type *T)
Definition:SPIRVUtils.h:256
llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1753
llvm::wouldInstructionBeTriviallyDead
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition:Local.cpp:425
llvm::isModOrRefSet
bool isModOrRefSet(const ModRefInfo MRI)
Definition:ModRef.h:42
llvm::isSafeToSpeculativelyExecute
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
Definition:ValueTracking.cpp:7050
llvm::sortPtrAccesses
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
Definition:LoopAccessAnalysis.cpp:1600
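A hedged sketch of using it to recover a consecutive order for a gathered set of pointers (PointerOps, ScalarTy, DL, and SE are placeholders):
  SmallVector<unsigned> SortedIndices;
  if (sortPtrAccesses(PointerOps, ScalarTy, DL, SE, SortedIndices)) {
    // Success: when non-empty, SortedIndices is the permutation that puts the
    // pointers into increasing address order.
  }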
llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition:Casting.h:548
llvm::propagateIRFlags
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition:LoopUtils.cpp:1368
llvm::PoisonMaskElem
constexpr int PoisonMaskElem
Definition:Instructions.h:1889
llvm::ModRefInfo::Ref
@ Ref
The access may reference the value stored in memory.
llvm::divideCeil
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition:MathExtras.h:404
llvm::IRMemLocation::Other
@ Other
Any other memory.
llvm::IRMemLocation::First
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
llvm::TTI
TargetTransformInfo TTI
Definition:TargetTransformInfo.h:208
llvm::getMinMaxReductionPredicate
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
Definition:LoopUtils.cpp:1054
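For instance, the mappings for a signed-max reduction (as I understand them):
  // Expanding a signed-max reduction:
  Intrinsic::ID MinMaxOp =
      getMinMaxReductionIntrinsicOp(Intrinsic::vector_reduce_smax);
  // MinMaxOp == Intrinsic::smax
  CmpInst::Predicate Pred = getMinMaxReductionPredicate(RecurKind::SMax);
  // Pred == CmpInst::ICMP_SGT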
llvm::RecurKind
RecurKind
These are the kinds of recurrences that we support.
Definition:IVDescriptors.h:33
llvm::RecurKind::Or
@ Or
Bitwise or logical OR of integers.
llvm::RecurKind::None
@ None
Not a recurrence.
llvm::isVectorIntrinsicWithScalarOpAtArg
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
Definition:VectorUtils.cpp:134
llvm::areAllOperandsNonInsts
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
Definition:SLPVectorizer.cpp:1254
llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition:Alignment.h:155
llvm::count
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition:STLExtras.h:1938
llvm::Op
DWARFExpression::Operation Op
Definition:DWARFExpression.cpp:22
llvm::max_element
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition:STLExtras.h:2014
llvm::ViewGraph
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition:GraphWriter.h:427
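Because GraphTraits and DOTGraphTraits are specialized for BoUpSLP * further down in this index, the SLP graph can be rendered for debugging; a hedged one-liner, assuming R is a BoUpSLP instance:
  ViewGraph(&R, "slp-vectorizable-tree"); // relies on the traits specializations below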
llvm::copy
OutputIt copy(R &&Range, OutputIt Out)
Definition:STLExtras.h:1841
llvm::doesNotNeedToSchedule
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
Definition:SLPVectorizer.cpp:1296
llvm::BitWidth
constexpr unsigned BitWidth
Definition:BitmaskEnum.h:217
llvm::isGuaranteedToTransferExecutionToSuccessor
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
Definition:ValueTracking.cpp:7927
llvm::PseudoProbeReservedId::Last
@ Last
llvm::count_if
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition:STLExtras.h:1945
llvm::getNumberOfParts
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
Definition:SLPVectorizer.cpp:1321
llvm::find_if
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1766
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition:STLExtras.h:1903
llvm::ComputeNumSignBits
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
Definition:ValueTracking.cpp:351
llvm::Cost
InstructionCost Cost
Definition:FunctionSpecialization.h:102
llvm::seq
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition:Sequence.h:305
llvm::VFParamKind::Vector
@ Vector
llvm::hash_combine
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition:Hashing.h:590
llvm::isGuaranteedNotToBePoison
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
Definition:ValueTracking.cpp:7856
llvm::bit_floor
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition:bit.h:327
llvm::ConstantFoldIntegerCast
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
Definition:ConstantFolding.cpp:1549
llvm::Data
@ Data
Definition:SIMachineScheduler.h:55
llvm::isKnownNonNegative
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
Definition:ValueTracking.cpp:292
llvm::mayHaveNonDefUseDependency
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
Definition:ValueTracking.cpp:7163
llvm::isTriviallyVectorizable
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition:VectorUtils.cpp:46
llvm::isVectorIntrinsicWithOverloadTypeAtArg
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
Definition:VectorUtils.cpp:162
llvm::hash_combine_range
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition:Hashing.h:468
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition:BitVector.h:860
raw_ostream.h
N
#define N
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition:Alignment.h:39
llvm::CallBase::BundleOpInfo
Used to keep track of an operand bundle.
Definition:InstrTypes.h:2138
llvm::CodeMetrics::collectEphemeralValues
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition:CodeMetrics.cpp:71
llvm::DOTGraphTraits< BoUpSLP * >::TreeEntry
BoUpSLP::TreeEntry TreeEntry
Definition:SLPVectorizer.cpp:4493
llvm::DOTGraphTraits< BoUpSLP * >::getNodeLabel
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
Definition:SLPVectorizer.cpp:4497
llvm::DOTGraphTraits< BoUpSLP * >::getNodeAttributes
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
Definition:SLPVectorizer.cpp:4514
llvm::DOTGraphTraits< BoUpSLP * >::DOTGraphTraits
DOTGraphTraits(bool IsSimple=false)
Definition:SLPVectorizer.cpp:4495
llvm::DOTGraphTraits
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
Definition:DOTGraphTraits.h:166
llvm::DefaultDOTGraphTraits
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Definition:DOTGraphTraits.h:28
llvm::DiagnosticInfoOptimizationBase::Argument
Used in the streaming interface as the general argument type.
Definition:DiagnosticInfo.h:499
llvm::GraphTraits< BoUpSLP * >::ChildIteratorType::operator*
NodeRef operator*()
Definition:SLPVectorizer.cpp:4450
llvm::GraphTraits< BoUpSLP * >::ChildIteratorType::ChildIteratorType
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
Definition:SLPVectorizer.cpp:4446
llvm::GraphTraits< BoUpSLP * >::ChildIteratorType::VectorizableTree
ContainerTy & VectorizableTree
Definition:SLPVectorizer.cpp:4444
llvm::GraphTraits< BoUpSLP * >::child_end
static ChildIteratorType child_end(NodeRef N)
Definition:SLPVectorizer.cpp:4461
llvm::GraphTraits< BoUpSLP * >::getEntryNode
static NodeRef getEntryNode(BoUpSLP &R)
Definition:SLPVectorizer.cpp:4453
llvm::GraphTraits< BoUpSLP * >::child_begin
static ChildIteratorType child_begin(NodeRef N)
Definition:SLPVectorizer.cpp:4457
llvm::GraphTraits< BoUpSLP * >::nodes_begin
static nodes_iterator nodes_begin(BoUpSLP *R)
Definition:SLPVectorizer.cpp:4481
llvm::GraphTraits< BoUpSLP * >::NodeRef
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
Definition:SLPVectorizer.cpp:4435
llvm::GraphTraits< BoUpSLP * >::size
static unsigned size(BoUpSLP *R)
Definition:SLPVectorizer.cpp:4489
llvm::GraphTraits< BoUpSLP * >::TreeEntry
BoUpSLP::TreeEntry TreeEntry
Definition:SLPVectorizer.cpp:4432
llvm::GraphTraits< BoUpSLP * >::nodes_end
static nodes_iterator nodes_end(BoUpSLP *R)
Definition:SLPVectorizer.cpp:4485
llvm::GraphTraits
Definition:GraphTraits.h:38
llvm::Incoming
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
Definition:SILowerI1Copies.h:25
llvm::Loop::LoopBounds::Direction
Direction
An enum for the direction of the loop.
Definition:LoopInfo.h:215
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition:Alignment.h:117
llvm::MinMax
Definition:AssumeBundleQueries.h:70
llvm::SLPVectorizerPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition:SLPVectorizer.cpp:18482
llvm::SLPVectorizerPass::runImpl
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Definition:SLPVectorizer.cpp:18502
llvm::SimplifyQuery
Definition:SimplifyQuery.h:70
llvm::SmallMapVector
A MapVector that performs no allocations if smaller than a certain size.
Definition:MapVector.h:254
llvm::TargetTransformInfo::OperandValueInfo
Definition:TargetTransformInfo.h:1135
llvm::TargetTransformInfo::PointersChainInfo
Describe known properties for a set of pointers.
Definition:TargetTransformInfo.h:311
llvm::VFShape
Contains the information about the kind of vectorization available.
Definition:VFABIDemangler.h:83
llvm::VFShape::get
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Definition:VFABIDemangler.h:108
llvm::cl::desc
Definition:CommandLine.h:409
llvm::less_first
Function object to check whether the first component of a container supported by std::get (like std::...
Definition:STLExtras.h:1467
llvm::less_second
Function object to check whether the second component of a container supported by std::get (like std:...
Definition:STLExtras.h:1476
llvm::slpvectorizer::BoUpSLP::EdgeInfo
This structure holds any data we need about the edges being traversed during buildTree_rec().
Definition:SLPVectorizer.cpp:1660
llvm::slpvectorizer::BoUpSLP::EdgeInfo::EdgeIdx
unsigned EdgeIdx
The operand index of the use.
Definition:SLPVectorizer.cpp:1667
llvm::slpvectorizer::BoUpSLP::EdgeInfo::EdgeInfo
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
Definition:SLPVectorizer.cpp:1662
llvm::slpvectorizer::BoUpSLP::EdgeInfo::dump
LLVM_DUMP_METHOD void dump() const
Definition:SLPVectorizer.cpp:1679
llvm::slpvectorizer::BoUpSLP::EdgeInfo::UserTE
TreeEntry * UserTE
The user TreeEntry.
Definition:SLPVectorizer.cpp:1665
llvm::slpvectorizer::BoUpSLP::EdgeInfo::EdgeInfo
EdgeInfo()=default
llvm::slpvectorizer::BoUpSLP::EdgeInfo::operator<<
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
Definition:SLPVectorizer.cpp:1669
llvm::slpvectorizer::BoUpSLP::EdgeInfo::dump
void dump(raw_ostream &OS) const
Debug print.
Definition:SLPVectorizer.cpp:1675
