LLVM 20.0.0git
SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
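//
// For illustration only, a minimal sketch of the kind of IR this pass targets
// (hypothetical values; the real pass also handles arithmetic trees feeding
// the stores, reductions, and more):
//
//   store i32 %a0, ptr %p
//   store i32 %a1, ptr %p1   ; %p1 = getelementptr i32, ptr %p, i64 1
//   store i32 %a2, ptr %p2   ; %p2 = getelementptr i32, ptr %p, i64 2
//   store i32 %a3, ptr %p3   ; %p3 = getelementptr i32, ptr %p, i64 3
//
// may become, if the cost model considers it profitable:
//
//   %vec = ... build <4 x i32> from %a0..%a3 (or reuse already-vector values)
//   store <4 x i32> %vec, ptr %p
//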
18
19#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
22#include "llvm/ADT/PriorityQueue.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
25#include "llvm/ADT/SetOperations.h"
26#include "llvm/ADT/SetVector.h"
27#include "llvm/ADT/SmallBitVector.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallString.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
33#include "llvm/ADT/iterator_range.h"
34#include "llvm/Analysis/AliasAnalysis.h"
35#include "llvm/Analysis/AssumptionCache.h"
36#include "llvm/Analysis/CodeMetrics.h"
37#include "llvm/Analysis/ConstantFolding.h"
38#include "llvm/Analysis/DemandedBits.h"
39#include "llvm/Analysis/GlobalsModRef.h"
40#include "llvm/Analysis/IVDescriptors.h"
41#include "llvm/Analysis/LoopAccessAnalysis.h"
42#include "llvm/Analysis/LoopInfo.h"
43#include "llvm/Analysis/MemoryLocation.h"
44#include "llvm/Analysis/OptimizationRemarkEmitter.h"
45#include "llvm/Analysis/ScalarEvolution.h"
46#include "llvm/Analysis/ScalarEvolutionExpressions.h"
47#include "llvm/Analysis/TargetLibraryInfo.h"
48#include "llvm/Analysis/TargetTransformInfo.h"
49#include "llvm/Analysis/ValueTracking.h"
50#include "llvm/Analysis/VectorUtils.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
56#include "llvm/IR/DerivedTypes.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
62#include "llvm/IR/Instructions.h"
63#include "llvm/IR/IntrinsicInst.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
67#include "llvm/IR/PatternMatch.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
77#include "llvm/Support/Casting.h"
78#include "llvm/Support/CommandLine.h"
79#include "llvm/Support/Compiler.h"
80#include "llvm/Support/DOTGraphTraits.h"
81#include "llvm/Support/Debug.h"
82#include "llvm/Support/DebugCounter.h"
83#include "llvm/Support/ErrorHandling.h"
84#include "llvm/Support/GraphWriter.h"
85#include "llvm/Support/InstructionCost.h"
86#include "llvm/Support/KnownBits.h"
87#include "llvm/Support/MathExtras.h"
88#include "llvm/Support/raw_ostream.h"
89#include "llvm/Transforms/Utils/InjectTLIMappings.h"
90#include "llvm/Transforms/Utils/Local.h"
91#include "llvm/Transforms/Utils/LoopUtils.h"
92#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
93#include <algorithm>
94#include <cassert>
95#include <cstdint>
96#include <iterator>
97#include <memory>
98#include <optional>
99#include <set>
100#include <string>
101#include <tuple>
102#include <utility>
103
104using namespace llvm;
105using namespace llvm::PatternMatch;
106using namespace slpvectorizer;
107using namespace std::placeholders;
108
109#define SV_NAME "slp-vectorizer"
110#define DEBUG_TYPE "SLP"
111
112STATISTIC(NumVectorInstructions,"Number of vector instructions generated");
113
114DEBUG_COUNTER(VectorizedGraphs,"slp-vectorized",
115"Controls which SLP graphs should be vectorized.");
116
117staticcl::opt<bool>
118RunSLPVectorization("vectorize-slp",cl::init(true),cl::Hidden,
119cl::desc("Run the SLP vectorization passes"));
120
121staticcl::opt<bool>
122SLPReVec("slp-revec",cl::init(false),cl::Hidden,
123cl::desc("Enable vectorization for wider vector utilization"));
124
125staticcl::opt<int>
126SLPCostThreshold("slp-threshold",cl::init(0),cl::Hidden,
127cl::desc("Only vectorize if you gain more than this "
128"number "));
129
130staticcl::opt<bool>SLPSkipEarlyProfitabilityCheck(
131"slp-skip-early-profitability-check",cl::init(false),cl::Hidden,
132cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
133"heuristics and makes vectorization decision via cost modeling."));
134
135staticcl::opt<bool>
136ShouldVectorizeHor("slp-vectorize-hor",cl::init(true),cl::Hidden,
137cl::desc("Attempt to vectorize horizontal reductions"));
138
139staticcl::opt<bool>ShouldStartVectorizeHorAtStore(
140"slp-vectorize-hor-store",cl::init(false),cl::Hidden,
141cl::desc(
142"Attempt to vectorize horizontal reductions feeding into a store"));
143
144staticcl::opt<int>
145MaxVectorRegSizeOption("slp-max-reg-size",cl::init(128),cl::Hidden,
146cl::desc("Attempt to vectorize for this register size in bits"));
147
148staticcl::opt<unsigned>
149MaxVFOption("slp-max-vf",cl::init(0),cl::Hidden,
150cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
151
152/// Limits the size of scheduling regions in a block.
153/// It avoids long compile times for _very_ large blocks where vector
154/// instructions are spread over a wide range.
155/// This limit is way higher than needed by real-world functions.
156staticcl::opt<int>
157ScheduleRegionSizeBudget("slp-schedule-budget",cl::init(100000),cl::Hidden,
158cl::desc("Limit the size of the SLP scheduling region per block"));
159
160staticcl::opt<int>MinVectorRegSizeOption(
161"slp-min-reg-size",cl::init(128),cl::Hidden,
162cl::desc("Attempt to vectorize for this register size in bits"));
163
164staticcl::opt<unsigned>RecursionMaxDepth(
165"slp-recursion-max-depth",cl::init(12),cl::Hidden,
166cl::desc("Limit the recursion depth when building a vectorizable tree"));
167
168staticcl::opt<unsigned>MinTreeSize(
169"slp-min-tree-size",cl::init(3),cl::Hidden,
170cl::desc("Only vectorize small trees if they are fully vectorizable"));
171
172// The maximum depth that the look-ahead score heuristic will explore.
173// The higher this value, the higher the compilation time overhead.
174staticcl::opt<int>LookAheadMaxDepth(
175"slp-max-look-ahead-depth",cl::init(2),cl::Hidden,
176cl::desc("The maximum look-ahead depth for operand reordering scores"));
177
178// The maximum depth that the look-ahead score heuristic will explore
179// when it is probing among candidates for vectorization tree roots.
180// The higher this value, the higher the compilation time overhead, but unlike
181// the similar limit for operand ordering this is used less frequently, so the
182// impact of a higher value is less noticeable.
183staticcl::opt<int>RootLookAheadMaxDepth(
184"slp-max-root-look-ahead-depth",cl::init(2),cl::Hidden,
185cl::desc("The maximum look-ahead depth for searching best rooting option"));
186
187staticcl::opt<unsigned>MinProfitableStridedLoads(
188"slp-min-strided-loads",cl::init(2),cl::Hidden,
189cl::desc("The minimum number of loads, which should be considered strided, "
190"if the stride is > 1 or is runtime value"));
191
192staticcl::opt<unsigned>MaxProfitableLoadStride(
193"slp-max-stride",cl::init(8),cl::Hidden,
194cl::desc("The maximum stride, considered to be profitable."));
195
196staticcl::opt<bool>
197ViewSLPTree("view-slp-tree",cl::Hidden,
198cl::desc("Display the SLP trees with Graphviz"));
199
200staticcl::opt<bool>VectorizeNonPowerOf2(
201"slp-vectorize-non-power-of-2",cl::init(false),cl::Hidden,
202cl::desc("Try to vectorize with non-power-of-2 number of elements."));
203
204// Limit the number of alias checks. The limit is chosen so that
205// it has no negative effect on the llvm benchmarks.
206staticconstunsignedAliasedCheckLimit = 10;
207
208// Limit of the number of uses for potentially transformed instructions/values,
209// used in checks to avoid compile-time explosion.
210staticconstexprintUsesLimit = 64;
211
212// Another limit for the alias checks: The maximum distance between load/store
213// instructions where alias checks are done.
214// This limit is useful for very large basic blocks.
215staticconstunsignedMaxMemDepDistance = 160;
216
217/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
218/// regions to be handled.
219staticconstintMinScheduleRegionSize = 16;
220
221/// Maximum allowed number of operands in the PHI nodes.
222staticconstunsignedMaxPHINumOperands = 128;
223
224/// Predicate for the element types that the SLP vectorizer supports.
225///
226/// The most important things to filter here are types which are invalid in LLVM
227/// vectors. We also filter target specific types which have absolutely no
228/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
229/// avoids spending time checking the cost model and realizing that they will
230/// be inevitably scalarized.
231staticboolisValidElementType(Type *Ty) {
232// TODO: Support ScalableVectorType.
233if (SLPReVec && isa<FixedVectorType>(Ty))
234 Ty = Ty->getScalarType();
235return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
236 !Ty->isPPC_FP128Ty();
237}
238
239/// Returns the type of the given value/instruction \p V. If it is a store,
240/// returns the type of its value operand; for a Cmp - the type of the compare
241/// operands; and for an insertelement - the type of the inserted operand.
242/// Otherwise, just the type of the value is returned.
243staticType *getValueType(Value *V) {
244if (auto *SI = dyn_cast<StoreInst>(V))
245return SI->getValueOperand()->getType();
246if (auto *CI = dyn_cast<CmpInst>(V))
247return CI->getOperand(0)->getType();
248if (auto *IE = dyn_cast<InsertElementInst>(V))
249return IE->getOperand(1)->getType();
250return V->getType();
251}
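// A few illustrative cases (hypothetical IR values):
//   store i32 %x, ptr %p                          -> i32   (value operand)
//   icmp eq i64 %a, %b                            -> i64   (compare operands)
//   insertelement <4 x float> %v, float %s, i32 0 -> float (inserted operand)
//   add i32 %a, %b                                -> i32   (the value itself)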
252
253/// \returns the number of elements for Ty.
254staticunsignedgetNumElements(Type *Ty) {
255assert(!isa<ScalableVectorType>(Ty) &&
256"ScalableVectorType is not supported.");
257if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
258return VecTy->getNumElements();
259return 1;
260}
261
262/// \returns the vector type of ScalarTy based on vectorization factor.
263staticFixedVectorType *getWidenedType(Type *ScalarTy,unsigned VF) {
264returnFixedVectorType::get(ScalarTy->getScalarType(),
265 VF *getNumElements(ScalarTy));
266}
267
268/// Returns the number of elements of the given type \p Ty, not less than \p
269/// Sz, which forms a type that \p TTI splits into whole vector types during
270/// legalization.
271staticunsignedgetFullVectorNumberOfElements(constTargetTransformInfo &TTI,
272Type *Ty,unsigned Sz) {
273if (!isValidElementType(Ty))
274returnbit_ceil(Sz);
275// Find the number of elements, which forms full vectors.
276constunsigned NumParts =TTI.getNumberOfParts(getWidenedType(Ty, Sz));
277if (NumParts == 0 || NumParts >= Sz)
278returnbit_ceil(Sz);
279returnbit_ceil(divideCeil(Sz, NumParts)) * NumParts;
280}
281
282/// Returns the number of elements of the given type \p Ty, not greater than
283/// \p Sz, which forms a type that \p TTI splits into whole vector types during
284/// legalization.
285staticunsigned
286getFloorFullVectorNumberOfElements(constTargetTransformInfo &TTI,Type *Ty,
287unsigned Sz) {
288if (!isValidElementType(Ty))
289returnbit_floor(Sz);
290// Find the number of elements, which forms full vectors.
291unsigned NumParts =TTI.getNumberOfParts(getWidenedType(Ty, Sz));
292if (NumParts == 0 || NumParts >= Sz)
293returnbit_floor(Sz);
294unsigned RegVF =bit_ceil(divideCeil(Sz, NumParts));
295if (RegVF > Sz)
296returnbit_floor(Sz);
297return (Sz / RegVF) * RegVF;
298}
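// Worked example for the two helpers above, assuming a target for which
// TTI.getNumberOfParts(<7 x i32>) == 2 (purely hypothetical numbers):
//   getFullVectorNumberOfElements(TTI, i32, 7)
//     -> bit_ceil(divideCeil(7, 2)) * 2 = 4 * 2 = 8
//   getFloorFullVectorNumberOfElements(TTI, i32, 7)
//     -> RegVF = bit_ceil(divideCeil(7, 2)) = 4, so (7 / 4) * 4 = 4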
299
300staticvoidtransformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
301SmallVectorImpl<int> &Mask) {
302// The ShuffleBuilder implementation uses shufflevector to splat an "element".
303// But an element has a different meaning for SLP (scalar) and REVEC
304// (vector). We need to expand Mask into a mask which shufflevector can use
305// directly.
306SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
307for (unsignedI : seq<unsigned>(Mask.size()))
308for (auto [J, MaskV] :enumerate(MutableArrayRef(NewMask).slice(
309I * VecTyNumElements, VecTyNumElements)))
310 MaskV = Mask[I] ==PoisonMaskElem ?PoisonMaskElem
311 : Mask[I] * VecTyNumElements + J;
312 Mask.swap(NewMask);
313}
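// Worked example (hypothetical values): with VecTyNumElements == 2 and
// Mask == {1, 0}, each scalar index is expanded into a run of vector-element
// indices, giving NewMask == {2, 3, 0, 1}; a poison element expands into a run
// of poison elements.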
314
315/// \returns the number of groups of shufflevectors.
316/// A group has the following features:
317/// 1. All values in a group are shufflevectors.
318/// 2. The mask of every shufflevector is an isExtractSubvectorMask.
319/// 3. Together, the masks of the shufflevectors use all elements of the source.
320/// e.g., it is 1 group (%0)
321/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
322/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
323/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
324/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
325/// it is 2 groups (%3 and %4)
326/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
327/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
328/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
329/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
330/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
331/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
332/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
333/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
334/// it is 0 groups
335/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
336/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
337/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
338/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
339staticunsignedgetShufflevectorNumGroups(ArrayRef<Value *> VL) {
340if (VL.empty())
341return 0;
342if (!all_of(VL, IsaPred<ShuffleVectorInst>))
343return 0;
344auto *SV = cast<ShuffleVectorInst>(VL.front());
345unsigned SVNumElements =
346 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
347unsigned ShuffleMaskSize = SV->getShuffleMask().size();
348if (SVNumElements % ShuffleMaskSize != 0)
349return 0;
350unsigned GroupSize = SVNumElements / ShuffleMaskSize;
351if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
352return 0;
353unsigned NumGroup = 0;
354for (size_tI = 0, E = VL.size();I != E;I += GroupSize) {
355auto *SV = cast<ShuffleVectorInst>(VL[I]);
356Value *Src = SV->getOperand(0);
357ArrayRef<Value *> Group = VL.slice(I, GroupSize);
358SmallBitVector ExpectedIndex(GroupSize);
359if (!all_of(Group, [&](Value *V) {
360auto *SV = cast<ShuffleVectorInst>(V);
361// From the same source.
362if (SV->getOperand(0) != Src)
363returnfalse;
364int Index;
365if (!SV->isExtractSubvectorMask(Index))
366returnfalse;
367 ExpectedIndex.set(Index / ShuffleMaskSize);
368returntrue;
369 }))
370return 0;
371if (!ExpectedIndex.all())
372return 0;
373 ++NumGroup;
374 }
375assert(NumGroup == (VL.size() / GroupSize) &&"Unexpected number of groups");
376return NumGroup;
377}
378
379/// \returns a shufflevector mask which is used to vectorize shufflevectors
380/// e.g.,
381/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
382/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
383/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
384/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
385/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
386/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
387/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
388/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
389/// the result is
390/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
391staticSmallVector<int>calculateShufflevectorMask(ArrayRef<Value *> VL) {
392assert(getShufflevectorNumGroups(VL) &&"Not supported shufflevector usage.");
393auto *SV = cast<ShuffleVectorInst>(VL.front());
394unsigned SVNumElements =
395 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
396SmallVector<int> Mask;
397unsigned AccumulateLength = 0;
398for (Value *V : VL) {
399auto *SV = cast<ShuffleVectorInst>(V);
400for (int M : SV->getShuffleMask())
401 Mask.push_back(M ==PoisonMaskElem ?PoisonMaskElem
402 : AccumulateLength + M);
403 AccumulateLength += SVNumElements;
404 }
405return Mask;
406}
407
408/// \returns True if the value is a constant (but not globals/constant
409/// expressions).
410staticboolisConstant(Value *V) {
411return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
412}
413
414/// Checks if \p V is one of vector-like instructions, i.e. undef,
415/// insertelement/extractelement with constant indices for fixed vector type or
416/// extractvalue instruction.
417staticboolisVectorLikeInstWithConstOps(Value *V) {
418if (!isa<InsertElementInst, ExtractElementInst>(V) &&
419 !isa<ExtractValueInst, UndefValue>(V))
420returnfalse;
421auto *I = dyn_cast<Instruction>(V);
422if (!I || isa<ExtractValueInst>(I))
423returntrue;
424if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
425returnfalse;
426if (isa<ExtractElementInst>(I))
427returnisConstant(I->getOperand(1));
428assert(isa<InsertElementInst>(V) &&"Expected only insertelement.");
429returnisConstant(I->getOperand(2));
430}
431
432/// Returns power-of-2 number of elements in a single register (part), given the
433/// total number of elements \p Size and number of registers (parts) \p
434/// NumParts.
435staticunsignedgetPartNumElems(unsignedSize,unsigned NumParts) {
436return std::min<unsigned>(Size,bit_ceil(divideCeil(Size, NumParts)));
437}
438
439/// Returns correct remaining number of elements, considering total amount \p
440/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
441/// and current register (part) \p Part.
442staticunsignedgetNumElems(unsignedSize,unsigned PartNumElems,
443unsigned Part) {
444return std::min<unsigned>(PartNumElems,Size - Part * PartNumElems);
445}
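// Worked example (hypothetical values): for Size == 7 and NumParts == 2,
// getPartNumElems(7, 2) == min(7, bit_ceil(divideCeil(7, 2))) == 4, and the
// per-part counts are getNumElems(7, 4, 0) == 4 and getNumElems(7, 4, 1) == 3.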
446
447#if !defined(NDEBUG)
448/// Print a short descriptor of the instruction bundle suitable for debug output.
449static std::stringshortBundleName(ArrayRef<Value *> VL,intIdx = -1) {
450 std::string Result;
451raw_string_ostreamOS(Result);
452if (Idx >= 0)
453OS <<"Idx: " <<Idx <<", ";
454OS <<"n=" << VL.size() <<" [" << *VL.front() <<", ..]";
455return Result;
456}
457#endif
458
459/// \returns true if all of the instructions in \p VL are in the same block or
460/// false otherwise.
461staticboolallSameBlock(ArrayRef<Value *> VL) {
462auto *It =find_if(VL, IsaPred<Instruction>);
463if (It == VL.end())
464returnfalse;
465Instruction *I0 = cast<Instruction>(*It);
466if (all_of(VL,isVectorLikeInstWithConstOps))
467returntrue;
468
469BasicBlock *BB = I0->getParent();
470for (Value *V :iterator_range(It, VL.end())) {
471if (isa<PoisonValue>(V))
472continue;
473auto *II = dyn_cast<Instruction>(V);
474if (!II)
475returnfalse;
476
477if (BB !=II->getParent())
478returnfalse;
479 }
480returntrue;
481}
482
483/// \returns True if all of the values in \p VL are constants (but not
484/// globals/constant expressions).
485staticboolallConstant(ArrayRef<Value *> VL) {
486// Constant expressions and globals can't be vectorized like normal integer/FP
487// constants.
488returnall_of(VL,isConstant);
489}
490
491/// \returns True if all of the values in \p VL are identical or some of them
492/// are UndefValue.
493staticboolisSplat(ArrayRef<Value *> VL) {
494Value *FirstNonUndef =nullptr;
495for (Value *V : VL) {
496if (isa<UndefValue>(V))
497continue;
498if (!FirstNonUndef) {
499 FirstNonUndef = V;
500continue;
501 }
502if (V != FirstNonUndef)
503returnfalse;
504 }
505return FirstNonUndef !=nullptr;
506}
507
508/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
509staticboolisCommutative(Instruction *I) {
510if (auto *Cmp = dyn_cast<CmpInst>(I))
511return Cmp->isCommutative();
512if (auto *BO = dyn_cast<BinaryOperator>(I))
513return BO->isCommutative() ||
514 (BO->getOpcode() == Instruction::Sub &&
515 !BO->hasNUsesOrMore(UsesLimit) &&
516all_of(
517 BO->uses(),
518 [](constUse &U) {
519// Commutative, if icmp eq/ne sub, 0
520 CmpPredicate Pred;
521 if (match(U.getUser(),
522 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
523 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
524 return true;
525// Commutative, if abs(sub nsw, true) or abs(sub, false).
526 ConstantInt *Flag;
527 return match(U.getUser(),
528 m_Intrinsic<Intrinsic::abs>(
529 m_Specific(U.get()), m_ConstantInt(Flag))) &&
530 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
531 Flag->isOne());
532 })) ||
533 (BO->getOpcode() == Instruction::FSub &&
534 !BO->hasNUsesOrMore(UsesLimit) &&
535all_of(BO->uses(), [](constUse &U) {
536 return match(U.getUser(),
537 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
538 }));
539returnI->isCommutative();
540}
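// Illustrative example of the sub special case above (hypothetical IR): in
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0
// the sub is treated as commutative, because %a - %b == 0 iff %b - %a == 0;
// similar reasoning applies when the only users are matching abs intrinsics.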
541
542template <typename T>
543static std::optional<unsigned>getInsertExtractIndex(constValue *Inst,
544unsignedOffset) {
545static_assert(std::is_same_v<T, InsertElementInst> ||
546 std::is_same_v<T, ExtractElementInst>,
547"unsupported T");
548int Index =Offset;
549if (constauto *IE = dyn_cast<T>(Inst)) {
550constauto *VT = dyn_cast<FixedVectorType>(IE->getType());
551if (!VT)
552return std::nullopt;
553constauto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
554if (!CI)
555return std::nullopt;
556if (CI->getValue().uge(VT->getNumElements()))
557return std::nullopt;
558 Index *= VT->getNumElements();
559 Index += CI->getZExtValue();
560return Index;
561 }
562return std::nullopt;
563}
564
565/// \returns inserting or extracting index of InsertElement, ExtractElement or
566/// InsertValue instruction, using Offset as base offset for index.
567/// \returns std::nullopt if the index is not an immediate.
568static std::optional<unsigned>getElementIndex(constValue *Inst,
569unsignedOffset = 0) {
570if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst,Offset))
571return Index;
572if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst,Offset))
573return Index;
574
575int Index =Offset;
576
577constauto *IV = dyn_cast<InsertValueInst>(Inst);
578if (!IV)
579return std::nullopt;
580
581Type *CurrentType =IV->getType();
582for (unsignedI :IV->indices()) {
583if (constauto *ST = dyn_cast<StructType>(CurrentType)) {
584 Index *= ST->getNumElements();
585 CurrentType = ST->getElementType(I);
586 }elseif (constauto *AT = dyn_cast<ArrayType>(CurrentType)) {
587 Index *= AT->getNumElements();
588 CurrentType = AT->getElementType();
589 }else {
590return std::nullopt;
591 }
592 Index +=I;
593 }
594return Index;
595}
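// Worked example (hypothetical IR): for
//   %r = insertvalue {[2 x i32], i32} %agg, i32 %v, 0, 1
// the loop above computes Index = ((0 * 2) + 0) * 2 + 1 = 1, i.e. the second
// scalar slot of the flattened aggregate. An index into a type that is neither
// a struct nor an array yields std::nullopt.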
596
597namespace{
598/// Specifies the way the mask should be analyzed for undefs/poisonous elements
599/// in the shuffle mask.
600enum class UseMask {
601 FirstArg,///< The mask is expected to be for permutation of 1-2 vectors,
602 ///< check for the mask elements for the first argument (mask
603 ///< indices are in range [0:VF)).
604 SecondArg,///< The mask is expected to be for permutation of 2 vectors, check
605 ///< for the mask elements for the second argument (mask indices
606 ///< are in range [VF:2*VF))
607 UndefsAsMask///< Consider undef mask elements (-1) as placeholders for
608 ///< future shuffle elements and mark them as ones as being used
609 ///< in future. Non-undef elements are considered as unused since
610 ///< they're already marked as used in the mask.
611};
612}// namespace
613
614/// Prepares a use bitset for the given mask either for the first argument or
615/// for the second.
616staticSmallBitVectorbuildUseMask(int VF,ArrayRef<int> Mask,
617 UseMask MaskArg) {
618SmallBitVector UseMask(VF,true);
619for (auto [Idx,Value] :enumerate(Mask)) {
620if (Value ==PoisonMaskElem) {
621if (MaskArg == UseMask::UndefsAsMask)
622 UseMask.reset(Idx);
623continue;
624 }
625if (MaskArg == UseMask::FirstArg &&Value < VF)
626 UseMask.reset(Value);
627elseif (MaskArg == UseMask::SecondArg &&Value >= VF)
628 UseMask.reset(Value - VF);
629 }
630return UseMask;
631}
632
633/// Checks if the given value is actually an undefined constant vector.
634/// Also, if the \p UseMask is not empty, tries to check if the non-masked
635/// elements actually mask the insertelement buildvector, if any.
636template <bool IsPoisonOnly = false>
637staticSmallBitVectorisUndefVector(constValue *V,
638constSmallBitVector &UseMask = {}) {
639SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(),true);
640usingT = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
641if (isa<T>(V))
642return Res;
643auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
644if (!VecTy)
645return Res.reset();
646auto *C = dyn_cast<Constant>(V);
647if (!C) {
648if (!UseMask.empty()) {
649constValue *Base =V;
650while (auto *II = dyn_cast<InsertElementInst>(Base)) {
651Base =II->getOperand(0);
652if (isa<T>(II->getOperand(1)))
653continue;
654 std::optional<unsigned>Idx =getElementIndex(II);
655if (!Idx) {
656 Res.reset();
657return Res;
658 }
659if (*Idx < UseMask.size() && !UseMask.test(*Idx))
660 Res.reset(*Idx);
661 }
662// TODO: Add analysis for shuffles here too.
663if (V ==Base) {
664 Res.reset();
665 }else {
666SmallBitVector SubMask(UseMask.size(),false);
667 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
668 }
669 }else {
670 Res.reset();
671 }
672return Res;
673 }
674for (unsignedI = 0, E = VecTy->getNumElements();I != E; ++I) {
675if (Constant *Elem =C->getAggregateElement(I))
676if (!isa<T>(Elem) &&
677 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
678 Res.reset(I);
679 }
680return Res;
681}
682
683/// Checks if the vector of instructions can be represented as a shuffle, like:
684/// %x0 = extractelement <4 x i8> %x, i32 0
685/// %x3 = extractelement <4 x i8> %x, i32 3
686/// %y1 = extractelement <4 x i8> %y, i32 1
687/// %y2 = extractelement <4 x i8> %y, i32 2
688/// %x0x0 = mul i8 %x0, %x0
689/// %x3x3 = mul i8 %x3, %x3
690/// %y1y1 = mul i8 %y1, %y1
691/// %y2y2 = mul i8 %y2, %y2
692/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
693/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
694/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
695/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
696/// ret <4 x i8> %ins4
697/// can be transformed into:
698/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
699/// i32 6>
700/// %2 = mul <4 x i8> %1, %1
701/// ret <4 x i8> %2
702/// Mask will return the Shuffle Mask equivalent to the extracted elements.
703/// TODO: Can we split off and reuse the shuffle mask detection from
704/// ShuffleVectorInst/getShuffleCost?
705static std::optional<TargetTransformInfo::ShuffleKind>
706isFixedVectorShuffle(ArrayRef<Value *> VL,SmallVectorImpl<int> &Mask,
707AssumptionCache *AC) {
708constauto *It =find_if(VL, IsaPred<ExtractElementInst>);
709if (It == VL.end())
710return std::nullopt;
711unsignedSize =
712 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S,Value *V) {
713 auto *EI = dyn_cast<ExtractElementInst>(V);
714 if (!EI)
715 return S;
716 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
717 if (!VTy)
718 return S;
719 return std::max(S, VTy->getNumElements());
720 });
721
722Value *Vec1 =nullptr;
723Value *Vec2 =nullptr;
724bool HasNonUndefVec =any_of(VL, [&](Value *V) {
725auto *EE = dyn_cast<ExtractElementInst>(V);
726if (!EE)
727returnfalse;
728Value *Vec = EE->getVectorOperand();
729if (isa<UndefValue>(Vec))
730returnfalse;
731returnisGuaranteedNotToBePoison(Vec, AC);
732 });
733enum ShuffleMode {Unknown,Select, Permute };
734 ShuffleMode CommonShuffleMode =Unknown;
735 Mask.assign(VL.size(),PoisonMaskElem);
736for (unsignedI = 0, E = VL.size();I < E; ++I) {
737// Undef can be represented as an undef element in a vector.
738if (isa<UndefValue>(VL[I]))
739continue;
740auto *EI = cast<ExtractElementInst>(VL[I]);
741if (isa<ScalableVectorType>(EI->getVectorOperandType()))
742return std::nullopt;
743auto *Vec = EI->getVectorOperand();
744// We can extractelement from undef or poison vector.
745if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
746continue;
747// All vector operands must have the same number of vector elements.
748if (isa<UndefValue>(Vec)) {
749 Mask[I] =I;
750 }else {
751if (isa<UndefValue>(EI->getIndexOperand()))
752continue;
753auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
754if (!Idx)
755return std::nullopt;
756// Undefined behavior if Idx is negative or >= Size.
757if (Idx->getValue().uge(Size))
758continue;
759unsigned IntIdx =Idx->getValue().getZExtValue();
760 Mask[I] = IntIdx;
761 }
762if (isUndefVector(Vec).all() && HasNonUndefVec)
763continue;
764// For correct shuffling we have to have at most 2 different vector operands
765// in all extractelement instructions.
766if (!Vec1 || Vec1 == Vec) {
767 Vec1 = Vec;
768 }elseif (!Vec2 || Vec2 == Vec) {
769 Vec2 = Vec;
770 Mask[I] +=Size;
771 }else {
772return std::nullopt;
773 }
774if (CommonShuffleMode == Permute)
775continue;
776// If the extract index is not the same as the operation number, it is a
777// permutation.
778if (Mask[I] %Size !=I) {
779 CommonShuffleMode = Permute;
780continue;
781 }
782 CommonShuffleMode =Select;
783 }
784// If we're not crossing lanes in different vectors, consider it as blending.
785if (CommonShuffleMode ==Select && Vec2)
786returnTargetTransformInfo::SK_Select;
787// If Vec2 was never used, we have a permutation of a single vector, otherwise
788// we have permutation of 2 vectors.
789return Vec2 ?TargetTransformInfo::SK_PermuteTwoSrc
790 :TargetTransformInfo::SK_PermuteSingleSrc;
791}
792
793/// \returns True if Extract{Value,Element} instruction extracts element Idx.
794static std::optional<unsigned>getExtractIndex(Instruction *E) {
795unsigned Opcode = E->getOpcode();
796assert((Opcode == Instruction::ExtractElement ||
797 Opcode == Instruction::ExtractValue) &&
798"Expected extractelement or extractvalue instruction.");
799if (Opcode == Instruction::ExtractElement) {
800auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
801if (!CI)
802return std::nullopt;
803return CI->getZExtValue();
804 }
805auto *EI = cast<ExtractValueInst>(E);
806if (EI->getNumIndices() != 1)
807return std::nullopt;
808return *EI->idx_begin();
809}
810
811namespace{
812
813/// Main data required for vectorization of instructions.
814classInstructionsState {
815 /// The main/alternate instruction. MainOp is also VL0.
816Instruction *MainOp =nullptr;
817Instruction *AltOp =nullptr;
818
819public:
820Instruction *getMainOp() const{
821assert(valid() &&"InstructionsState is invalid.");
822return MainOp;
823 }
824
825Instruction *getAltOp() const{
826assert(valid() &&"InstructionsState is invalid.");
827return AltOp;
828 }
829
830 /// The main/alternate opcodes for the list of instructions.
831unsignedgetOpcode() const{return getMainOp()->getOpcode(); }
832
833unsigned getAltOpcode() const{return getAltOp()->getOpcode(); }
834
835 /// Some of the instructions in the list have alternate opcodes.
836bool isAltShuffle() const{return getMainOp() != getAltOp(); }
837
838bool isOpcodeOrAlt(Instruction *I) const{
839unsigned CheckedOpcode =I->getOpcode();
840returngetOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
841 }
842
843 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
844bool valid() const{return MainOp && AltOp; }
845
846explicitoperatorbool() const{return valid(); }
847
848 InstructionsState() =delete;
849 InstructionsState(Instruction *MainOp,Instruction *AltOp)
850 : MainOp(MainOp), AltOp(AltOp) {}
851static InstructionsState invalid() {return {nullptr,nullptr}; }
852};
853
854}// end anonymous namespace
855
856/// \returns true if \p Opcode is allowed as part of the main/alternate
857/// instruction for SLP vectorization.
858///
859/// Example of unsupported opcode is SDIV that can potentially cause UB if the
860/// "shuffled out" lane would result in division by zero.
861staticboolisValidForAlternation(unsigned Opcode) {
862if (Instruction::isIntDivRem(Opcode))
863returnfalse;
864
865returntrue;
866}
867
868static InstructionsStategetSameOpcode(ArrayRef<Value *> VL,
869constTargetLibraryInfo &TLI);
870
871/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
872/// compatible instructions or constants, or just some other regular values.
873staticboolareCompatibleCmpOps(Value *BaseOp0,Value *BaseOp1,Value *Op0,
874Value *Op1,constTargetLibraryInfo &TLI) {
875return (isConstant(BaseOp0) &&isConstant(Op0)) ||
876 (isConstant(BaseOp1) &&isConstant(Op1)) ||
877 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
878 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
879 BaseOp0 == Op0 || BaseOp1 == Op1 ||
880getSameOpcode({BaseOp0, Op0}, TLI) ||
881getSameOpcode({BaseOp1, Op1}, TLI);
882}
883
884/// \returns true if a compare instruction \p CI has similar "look" and
885/// same predicate as \p BaseCI, "as is" or with its operands and predicate
886/// swapped, false otherwise.
887staticboolisCmpSameOrSwapped(constCmpInst *BaseCI,constCmpInst *CI,
888constTargetLibraryInfo &TLI) {
889assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
890"Assessing comparisons of different types?");
891CmpInst::Predicate BasePred = BaseCI->getPredicate();
892CmpInst::Predicate Pred = CI->getPredicate();
893CmpInst::Predicate SwappedPred =CmpInst::getSwappedPredicate(Pred);
894
895Value *BaseOp0 = BaseCI->getOperand(0);
896Value *BaseOp1 = BaseCI->getOperand(1);
897Value *Op0 = CI->getOperand(0);
898Value *Op1 = CI->getOperand(1);
899
900return (BasePred == Pred &&
901areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
902 (BasePred == SwappedPred &&
903areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
904}
905
906/// \returns an analysis of the instructions in \p VL, described as an
907/// InstructionsState: the opcode(s) with which we suppose the whole list
908/// could be vectorized, even if its structure is diverse.
909static InstructionsStategetSameOpcode(ArrayRef<Value *> VL,
910constTargetLibraryInfo &TLI) {
911// Make sure these are all Instructions.
912if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
913return InstructionsState::invalid();
914
915auto *It =find_if(VL, IsaPred<Instruction>);
916if (It == VL.end())
917return InstructionsState::invalid();
918
919Instruction *MainOp = cast<Instruction>(*It);
920unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
921if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
922 (VL.size() == 2 && InstCnt < 2))
923return InstructionsState::invalid();
924
925bool IsCastOp = isa<CastInst>(MainOp);
926bool IsBinOp = isa<BinaryOperator>(MainOp);
927bool IsCmpOp = isa<CmpInst>(MainOp);
928CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
929 :CmpInst::BAD_ICMP_PREDICATE;
930Instruction *AltOp = MainOp;
931unsigned Opcode = MainOp->getOpcode();
932unsigned AltOpcode = Opcode;
933
934bool SwappedPredsCompatible = IsCmpOp && [&]() {
935SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
936 UniquePreds.insert(BasePred);
937 UniqueNonSwappedPreds.insert(BasePred);
938for (Value *V : VL) {
939auto *I = dyn_cast<CmpInst>(V);
940if (!I)
941returnfalse;
942CmpInst::Predicate CurrentPred =I->getPredicate();
943CmpInst::Predicate SwappedCurrentPred =
944CmpInst::getSwappedPredicate(CurrentPred);
945 UniqueNonSwappedPreds.insert(CurrentPred);
946if (!UniquePreds.contains(CurrentPred) &&
947 !UniquePreds.contains(SwappedCurrentPred))
948 UniquePreds.insert(CurrentPred);
949 }
950// If the total number of predicates is > 2, but only 2 remain once swapped
951// predicates are considered compatible, treat swappable predicates as
952// compatible opcodes, not as alternates.
953return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
954 }();
955// Check for one alternate opcode from another BinaryOperator.
956// TODO - generalize to support all operators (types, calls etc.).
957Intrinsic::ID BaseID = 0;
958SmallVector<VFInfo> BaseMappings;
959if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
960 BaseID =getVectorIntrinsicIDForCall(CallBase, &TLI);
961 BaseMappings =VFDatabase(*CallBase).getMappings(*CallBase);
962if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
963return InstructionsState::invalid();
964 }
965bool AnyPoison = InstCnt != VL.size();
966// Skip MainOp.
967for (Value *V :iterator_range(It + 1, VL.end())) {
968auto *I = dyn_cast<Instruction>(V);
969if (!I)
970continue;
971
972// Cannot combine poison and divisions.
973// TODO: do some smart analysis of the CallInsts to exclude divide-like
974// intrinsics/functions only.
975if (AnyPoison && (I->isIntDivRem() ||I->isFPDivRem() || isa<CallInst>(I)))
976return InstructionsState::invalid();
977unsigned InstOpcode =I->getOpcode();
978if (IsBinOp && isa<BinaryOperator>(I)) {
979if (InstOpcode == Opcode || InstOpcode == AltOpcode)
980continue;
981if (Opcode == AltOpcode &&isValidForAlternation(InstOpcode) &&
982isValidForAlternation(Opcode)) {
983 AltOpcode = InstOpcode;
984 AltOp =I;
985continue;
986 }
987 }elseif (IsCastOp && isa<CastInst>(I)) {
988Value *Op0 = MainOp->getOperand(0);
989Type *Ty0 = Op0->getType();
990Value *Op1 =I->getOperand(0);
991Type *Ty1 = Op1->getType();
992if (Ty0 == Ty1) {
993if (InstOpcode == Opcode || InstOpcode == AltOpcode)
994continue;
995if (Opcode == AltOpcode) {
996assert(isValidForAlternation(Opcode) &&
997isValidForAlternation(InstOpcode) &&
998"Cast isn't safe for alternation, logic needs to be updated!");
999 AltOpcode = InstOpcode;
1000 AltOp =I;
1001continue;
1002 }
1003 }
1004 }elseif (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1005auto *BaseInst = cast<CmpInst>(MainOp);
1006Type *Ty0 = BaseInst->getOperand(0)->getType();
1007Type *Ty1 = Inst->getOperand(0)->getType();
1008if (Ty0 == Ty1) {
1009assert(InstOpcode == Opcode &&"Expected same CmpInst opcode.");
1010assert(InstOpcode == AltOpcode &&
1011"Alternate instructions are only supported by BinaryOperator "
1012"and CastInst.");
1013// Check for compatible operands. If the corresponding operands are not
1014// compatible - need to perform alternate vectorization.
1015CmpInst::Predicate CurrentPred = Inst->getPredicate();
1016CmpInst::Predicate SwappedCurrentPred =
1017CmpInst::getSwappedPredicate(CurrentPred);
1018
1019if ((VL.size() == 2 || SwappedPredsCompatible) &&
1020 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1021continue;
1022
1023if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1024continue;
1025auto *AltInst = cast<CmpInst>(AltOp);
1026if (MainOp != AltOp) {
1027if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1028continue;
1029 }elseif (BasePred != CurrentPred) {
1030assert(
1031isValidForAlternation(InstOpcode) &&
1032"CmpInst isn't safe for alternation, logic needs to be updated!");
1033 AltOp =I;
1034continue;
1035 }
1036CmpInst::Predicate AltPred = AltInst->getPredicate();
1037if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1038 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1039continue;
1040 }
1041 }elseif (InstOpcode == Opcode) {
1042assert(InstOpcode == AltOpcode &&
1043"Alternate instructions are only supported by BinaryOperator and "
1044"CastInst.");
1045if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1046if (Gep->getNumOperands() != 2 ||
1047 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1048return InstructionsState::invalid();
1049 }elseif (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1050if (!isVectorLikeInstWithConstOps(EI))
1051return InstructionsState::invalid();
1052 }elseif (auto *LI = dyn_cast<LoadInst>(I)) {
1053auto *BaseLI = cast<LoadInst>(MainOp);
1054if (!LI->isSimple() || !BaseLI->isSimple())
1055return InstructionsState::invalid();
1056 }elseif (auto *Call = dyn_cast<CallInst>(I)) {
1057auto *CallBase = cast<CallInst>(MainOp);
1058if (Call->getCalledFunction() !=CallBase->getCalledFunction())
1059return InstructionsState::invalid();
1060if (Call->hasOperandBundles() &&
1061 (!CallBase->hasOperandBundles() ||
1062 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1063 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1064CallBase->op_begin() +
1065CallBase->getBundleOperandsStartIndex())))
1066return InstructionsState::invalid();
1067Intrinsic::IDID =getVectorIntrinsicIDForCall(Call, &TLI);
1068if (ID != BaseID)
1069return InstructionsState::invalid();
1070if (!ID) {
1071SmallVector<VFInfo> Mappings =VFDatabase(*Call).getMappings(*Call);
1072if (Mappings.size() != BaseMappings.size() ||
1073 Mappings.front().ISA != BaseMappings.front().ISA ||
1074 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1075 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1076 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1077 Mappings.front().Shape.Parameters !=
1078 BaseMappings.front().Shape.Parameters)
1079return InstructionsState::invalid();
1080 }
1081 }
1082continue;
1083 }
1084return InstructionsState::invalid();
1085 }
1086
1087return InstructionsState(MainOp, AltOp);
1088}
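// Illustrative example (hypothetical IR): for VL = {add, sub, add, sub} over
// i32 operands, the returned state has MainOp == the first add and
// AltOp == the first sub, so isAltShuffle() is true and the bundle can be
// treated as an add/sub alternate-opcode node.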
1089
1090/// \returns true if all of the values in \p VL have the same type or false
1091/// otherwise.
1092staticboolallSameType(ArrayRef<Value *> VL) {
1093Type *Ty = VL.front()->getType();
1094returnall_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1095}
1096
1097/// \returns True if an in-tree use also needs an extract. This refers to a
1098/// possible scalar operand in a vectorized instruction.
1099staticbooldoesInTreeUserNeedToExtract(Value *Scalar,Instruction *UserInst,
1100TargetLibraryInfo *TLI,
1101constTargetTransformInfo *TTI) {
1102if (!UserInst)
1103returnfalse;
1104unsigned Opcode = UserInst->getOpcode();
1105switch (Opcode) {
1106case Instruction::Load: {
1107LoadInst *LI = cast<LoadInst>(UserInst);
1108return (LI->getPointerOperand() == Scalar);
1109 }
1110case Instruction::Store: {
1111StoreInst *SI = cast<StoreInst>(UserInst);
1112return (SI->getPointerOperand() == Scalar);
1113 }
1114case Instruction::Call: {
1115CallInst *CI = cast<CallInst>(UserInst);
1116Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
1117returnany_of(enumerate(CI->args()), [&](auto &&Arg) {
1118 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1119 Arg.value().get() == Scalar;
1120 });
1121 }
1122default:
1123returnfalse;
1124 }
1125}
1126
1127/// \returns the AA location that is being accessed by the instruction.
1128staticMemoryLocationgetLocation(Instruction *I) {
1129if (StoreInst *SI = dyn_cast<StoreInst>(I))
1130returnMemoryLocation::get(SI);
1131if (LoadInst *LI = dyn_cast<LoadInst>(I))
1132returnMemoryLocation::get(LI);
1133returnMemoryLocation();
1134}
1135
1136/// \returns True if the instruction is not a volatile or atomic load/store.
1137staticboolisSimple(Instruction *I) {
1138if (LoadInst *LI = dyn_cast<LoadInst>(I))
1139return LI->isSimple();
1140if (StoreInst *SI = dyn_cast<StoreInst>(I))
1141return SI->isSimple();
1142if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1143return !MI->isVolatile();
1144returntrue;
1145}
1146
1147/// Shuffles \p Mask in accordance with the given \p SubMask.
1148/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1149/// one but two input vectors.
1150staticvoidaddMask(SmallVectorImpl<int> &Mask,ArrayRef<int> SubMask,
1151bool ExtendingManyInputs =false) {
1152if (SubMask.empty())
1153return;
1154assert(
1155 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1156// Check if input scalars were extended to match the size of other node.
1157 (SubMask.size() == Mask.size() && Mask.back() ==PoisonMaskElem)) &&
1158"SubMask with many inputs support must be larger than the mask.");
1159if (Mask.empty()) {
1160 Mask.append(SubMask.begin(), SubMask.end());
1161return;
1162 }
1163SmallVector<int> NewMask(SubMask.size(),PoisonMaskElem);
1164int TermValue = std::min(Mask.size(), SubMask.size());
1165for (intI = 0, E = SubMask.size();I < E; ++I) {
1166if (SubMask[I] ==PoisonMaskElem ||
1167 (!ExtendingManyInputs &&
1168 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1169continue;
1170 NewMask[I] = Mask[SubMask[I]];
1171 }
1172 Mask.swap(NewMask);
1173}
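// Worked example (hypothetical values): with Mask == {2, 0, 1} and
// SubMask == {1, 2, 0}, the loop computes NewMask[I] = Mask[SubMask[I]],
// giving {0, 1, 2}; i.e. the masks compose as permutations.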
1174
1175/// Order may have elements assigned the special value (size) which is out of
1176/// bounds. Such indices only appear at positions which correspond to undef
1177/// values (see canReuseExtract for details) and are used to keep undef values
1178/// from affecting the operand ordering.
1179/// The first loop below simply finds all unused indices and then the next loop
1180/// nest assigns these indices to the undef value positions.
1181/// As an example below Order has two undef positions and they have assigned
1182/// values 3 and 7 respectively:
1183/// before: 6 9 5 4 9 2 1 0
1184/// after: 6 3 5 4 7 2 1 0
1185staticvoidfixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1186constunsigned Sz = Order.size();
1187SmallBitVector UnusedIndices(Sz,/*t=*/true);
1188SmallBitVector MaskedIndices(Sz);
1189for (unsignedI = 0;I < Sz; ++I) {
1190if (Order[I] < Sz)
1191 UnusedIndices.reset(Order[I]);
1192else
1193 MaskedIndices.set(I);
1194 }
1195if (MaskedIndices.none())
1196return;
1197assert(UnusedIndices.count() == MaskedIndices.count() &&
1198"Non-synced masked/available indices.");
1199intIdx = UnusedIndices.find_first();
1200int MIdx = MaskedIndices.find_first();
1201while (MIdx >= 0) {
1202assert(Idx >= 0 &&"Indices must be synced.");
1203 Order[MIdx] =Idx;
1204Idx = UnusedIndices.find_next(Idx);
1205 MIdx = MaskedIndices.find_next(MIdx);
1206 }
1207}
1208
1209/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1210/// Opcode1.
1211staticSmallBitVectorgetAltInstrMask(ArrayRef<Value *> VL,unsigned Opcode0,
1212unsigned Opcode1) {
1213Type *ScalarTy = VL[0]->getType();
1214unsigned ScalarTyNumElements =getNumElements(ScalarTy);
1215SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements,false);
1216for (unsigned Lane : seq<unsigned>(VL.size())) {
1217if (isa<PoisonValue>(VL[Lane]))
1218continue;
1219if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1220 OpcodeMask.set(Lane * ScalarTyNumElements,
1221 Lane * ScalarTyNumElements + ScalarTyNumElements);
1222 }
1223return OpcodeMask;
1224}
1225
1226namespace llvm {
1227
1228staticvoidinversePermutation(ArrayRef<unsigned> Indices,
1229SmallVectorImpl<int> &Mask) {
1230 Mask.clear();
1231constunsigned E = Indices.size();
1232 Mask.resize(E,PoisonMaskElem);
1233for (unsignedI = 0;I < E; ++I)
1234 Mask[Indices[I]] =I;
1235}
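// Worked example (hypothetical values): Indices == {2, 0, 1} produces
// Mask == {1, 2, 0}, since Mask[Indices[I]] = I for each I.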
1236
1237/// Reorders the list of scalars in accordance with the given \p Mask.
1238staticvoidreorderScalars(SmallVectorImpl<Value *> &Scalars,
1239ArrayRef<int> Mask) {
1240assert(!Mask.empty() &&"Expected non-empty mask.");
1241SmallVector<Value *> Prev(Scalars.size(),
1242PoisonValue::get(Scalars.front()->getType()));
1243 Prev.swap(Scalars);
1244for (unsignedI = 0, E = Prev.size();I < E; ++I)
1245if (Mask[I] !=PoisonMaskElem)
1246 Scalars[Mask[I]] = Prev[I];
1247}
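// Worked example (hypothetical values): Scalars == {a, b, c} with
// Mask == {2, 0, 1} becomes {b, c, a}, because each old element Prev[I] is
// moved to position Mask[I].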
1248
1249/// Checks if the provided value does not require scheduling. It does not
1250/// require scheduling if this is not an instruction or it is an instruction
1251/// that does not read/write memory and all operands are either not instructions
1252/// or phi nodes or instructions from different blocks.
1253staticboolareAllOperandsNonInsts(Value *V) {
1254auto *I = dyn_cast<Instruction>(V);
1255if (!I)
1256returntrue;
1257return !mayHaveNonDefUseDependency(*I) &&
1258all_of(I->operands(), [I](Value *V) {
1259 auto *IO = dyn_cast<Instruction>(V);
1260 if (!IO)
1261 return true;
1262 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1263 });
1264}
1265
1266/// Checks if the provided value does not require scheduling. It does not
1267/// require scheduling if this is not an instruction or it is an instruction
1268/// that does not read/write memory and all users are phi nodes or instructions
1269/// from the different blocks.
1270staticboolisUsedOutsideBlock(Value *V) {
1271auto *I = dyn_cast<Instruction>(V);
1272if (!I)
1273returntrue;
1274// Limits the number of uses to save compile time.
1275return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1276all_of(I->users(), [I](User *U) {
1277 auto *IU = dyn_cast<Instruction>(U);
1278 if (!IU)
1279 return true;
1280 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1281 });
1282}
1283
1284/// Checks if the specified value does not require scheduling. It does not
1285/// require scheduling if all operands and all users do not need to be scheduled
1286/// in the current basic block.
1287staticbooldoesNotNeedToBeScheduled(Value *V) {
1288returnareAllOperandsNonInsts(V) &&isUsedOutsideBlock(V);
1289}
1290
1291/// Checks if the specified array of instructions does not require scheduling.
1292/// This is the case if either all instructions have operands that do not
1293/// require scheduling, or all of their users do not require scheduling since
1294/// they are phis or live in other basic blocks.
1295staticbooldoesNotNeedToSchedule(ArrayRef<Value *> VL) {
1296return !VL.empty() &&
1297 (all_of(VL,isUsedOutsideBlock) ||all_of(VL,areAllOperandsNonInsts));
1298}
1299
1300/// Returns true if the widened type of \p Sz elements of type \p Ty represents
1301/// a full vector type, i.e. adding an extra element results in extra parts upon
1302/// type legalization.
1303staticboolhasFullVectorsOrPowerOf2(constTargetTransformInfo &TTI,Type *Ty,
1304unsigned Sz) {
1305if (Sz <= 1)
1306returnfalse;
1307if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1308returnfalse;
1309if (has_single_bit(Sz))
1310returntrue;
1311constunsigned NumParts =TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1312return NumParts > 0 && NumParts < Sz &&has_single_bit(Sz / NumParts) &&
1313 Sz % NumParts == 0;
1314}
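// Worked examples (hypothetical targets): Sz == 8 is accepted because it is a
// power of two. Sz == 6 with TTI.getNumberOfParts(<6 x Ty>) == 3 is accepted
// (each part holds 2 elements), whereas Sz == 6 split into 2 parts of 3
// elements is rejected because 3 is not a power of two.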
1315
1316namespace slpvectorizer {
1317
1318/// Bottom Up SLP Vectorizer.
1319classBoUpSLP {
1320structTreeEntry;
1321structScheduleData;
1322classShuffleCostEstimator;
1323classShuffleInstructionBuilder;
1324
1325public:
1326 /// Tracks the state in which we can represent the loads in the given sequence.
1327enum classLoadsState {
1328Gather,
1329Vectorize,
1330ScatterVectorize,
1331StridedVectorize
1332 };
1333
1334usingValueList =SmallVector<Value *, 8>;
1335usingInstrList =SmallVector<Instruction *, 16>;
1336usingValueSet =SmallPtrSet<Value *, 16>;
1337usingStoreList =SmallVector<StoreInst *, 8>;
1338usingExtraValueToDebugLocsMap =SmallDenseSet<Value *, 4>;
1339usingOrdersType =SmallVector<unsigned, 4>;
1340
1341BoUpSLP(Function *Func,ScalarEvolution *Se,TargetTransformInfo *Tti,
1342TargetLibraryInfo *TLi,AAResults *Aa,LoopInfo *Li,
1343DominatorTree *Dt,AssumptionCache *AC,DemandedBits *DB,
1344constDataLayout *DL,OptimizationRemarkEmitter *ORE)
1345 : BatchAA(*Aa),F(Func), SE(Se),TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1346 AC(AC), DB(DB),DL(DL), ORE(ORE),
1347 Builder(Se->getContext(),TargetFolder(*DL)) {
1348CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1349// Use the vector register size specified by the target unless overridden
1350// by a command-line option.
1351// TODO: It would be better to limit the vectorization factor based on
1352// data type rather than just register size. For example, x86 AVX has
1353// 256-bit registers, but it does not support integer operations
1354// at that width (that requires AVX2).
1355if (MaxVectorRegSizeOption.getNumOccurrences())
1356 MaxVecRegSize =MaxVectorRegSizeOption;
1357else
1358 MaxVecRegSize =
1359TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
1360 .getFixedValue();
1361
1362if (MinVectorRegSizeOption.getNumOccurrences())
1363 MinVecRegSize =MinVectorRegSizeOption;
1364else
1365 MinVecRegSize =TTI->getMinVectorRegisterBitWidth();
1366 }
1367
1368 /// Vectorize the tree that starts with the elements in \p VL.
1369 /// Returns the vectorized root.
1370Value *vectorizeTree();
1371
1372 /// Vectorize the tree, but with the list of externally used values \p
1373 /// ExternallyUsedValues. Values in this container can be replaced by the
1374 /// generated extractvalue instructions.
1375Value *
1376vectorizeTree(constExtraValueToDebugLocsMap &ExternallyUsedValues,
1377Instruction *ReductionRoot =nullptr);
1378
1379 /// \returns the cost incurred by unwanted spills and fills, caused by
1380 /// holding live values over call sites.
1381InstructionCostgetSpillCost()const;
1382
1383 /// \returns the vectorization cost of the subtree that starts at \p VL.
1384 /// A negative number means that this is profitable.
1385InstructionCostgetTreeCost(ArrayRef<Value *> VectorizedVals = {});
1386
1387 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1388 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1389voidbuildTree(ArrayRef<Value *> Roots,
1390constSmallDenseSet<Value *> &UserIgnoreLst);
1391
1392 /// Construct a vectorizable tree that starts at \p Roots.
1393voidbuildTree(ArrayRef<Value *> Roots);
1394
1395 /// Returns whether the root node has in-tree uses.
1396booldoesRootHaveInTreeUses() const{
1397return !VectorizableTree.empty() &&
1398 !VectorizableTree.front()->UserTreeIndices.empty();
1399 }
1400
1401 /// Return the scalars of the root node.
1402ArrayRef<Value *>getRootNodeScalars() const{
1403assert(!VectorizableTree.empty() &&"No graph to get the first node from");
1404return VectorizableTree.front()->Scalars;
1405 }
1406
1407 /// Returns the type/is-signed info for the root node in the graph without
1408 /// casting.
1409 std::optional<std::pair<Type *, bool>>getRootNodeTypeWithNoCast() const{
1410const TreeEntry &Root = *VectorizableTree.front().get();
1411if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1412 !Root.Scalars.front()->getType()->isIntegerTy())
1413return std::nullopt;
1414auto It = MinBWs.find(&Root);
1415if (It != MinBWs.end())
1416return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
1417 It->second.first),
1418 It->second.second);
1419if (Root.getOpcode() == Instruction::ZExt ||
1420 Root.getOpcode() == Instruction::SExt)
1421return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1422 Root.getOpcode() == Instruction::SExt);
1423return std::nullopt;
1424 }
1425
1426 /// Checks if the root graph node can be emitted with narrower bitwidth at
1427 /// codegen and returns its signedness, if so.
1428boolisSignedMinBitwidthRootNode() const{
1429return MinBWs.at(VectorizableTree.front().get()).second;
1430 }
1431
1432 /// Returns the reduction type after minbitwidth analysis.
1433FixedVectorType *getReductionType() const{
1434if (ReductionBitWidth == 0 ||
1435 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1436 ReductionBitWidth >=
1437DL->getTypeSizeInBits(
1438 VectorizableTree.front()->Scalars.front()->getType()))
1439returngetWidenedType(
1440 VectorizableTree.front()->Scalars.front()->getType(),
1441 VectorizableTree.front()->getVectorFactor());
1442returngetWidenedType(
1443IntegerType::get(
1444 VectorizableTree.front()->Scalars.front()->getContext(),
1445 ReductionBitWidth),
1446 VectorizableTree.front()->getVectorFactor());
1447 }
1448
1449 /// Builds external uses of the vectorized scalars, i.e. the list of
1450 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1451 /// ExternallyUsedValues contains an additional list of external uses to handle
1452 /// vectorization of reductions.
1453void
1454buildExternalUses(constExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1455
1456 /// Transforms graph nodes to target specific representations, if profitable.
1457voidtransformNodes();
1458
1459 /// Clear the internal data structures that are created by 'buildTree'.
1460voiddeleteTree() {
1461 VectorizableTree.clear();
1462 ScalarToTreeEntry.clear();
1463 MultiNodeScalars.clear();
1464 MustGather.clear();
1465 NonScheduledFirst.clear();
1466 EntryToLastInstruction.clear();
1467 LoadEntriesToVectorize.clear();
1468 IsGraphTransformMode =false;
1469 GatheredLoadsEntriesFirst.reset();
1470 ExternalUses.clear();
1471 ExternalUsesAsOriginalScalar.clear();
1472for (auto &Iter : BlocksSchedules) {
1473 BlockScheduling *BS = Iter.second.get();
1474 BS->clear();
1475 }
1476 MinBWs.clear();
1477 ReductionBitWidth = 0;
1478 BaseGraphSize = 1;
1479 CastMaxMinBWSizes.reset();
1480 ExtraBitWidthNodes.clear();
1481 InstrElementSize.clear();
1482 UserIgnoreList =nullptr;
1483 PostponedGathers.clear();
1484 ValueToGatherNodes.clear();
1485 }
1486
1487 unsigned getTreeSize() const { return VectorizableTree.size(); }
1488
1489 /// Returns the base graph size, before any transformations.
1490 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1491
1492 /// Perform LICM and CSE on the newly generated gather sequences.
1493voidoptimizeGatherSequence();
1494
1495 /// Does this non-empty order represent an identity order? Identity
1496 /// should be represented as an empty order, so this is used to
1497 /// decide if we can canonicalize a computed order. Undef elements
1498 /// (represented as size) are ignored.
1499 bool isIdentityOrder(ArrayRef<unsigned> Order) const {
1500   assert(!Order.empty() && "expected non-empty order");
1501   const unsigned Sz = Order.size();
1502   return all_of(enumerate(Order), [&](const auto &P) {
1503     return P.value() == P.index() || P.value() == Sz;
1504   });
1505 }
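// Illustrative note (not from the original source): with Sz == 4, the orders
// {0, 1, 2, 3} and {0, 4, 2, 3} (4 == Sz marks an ignored/undef element) are
// both treated as identity, while {1, 0, 2, 3} is not.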
1506
1507 /// Checks if the specified gather tree entry \p TE can be represented as a
1508 /// shuffled vector entry + (possibly) permutation with other gathers. It
1509 /// implements the checks only for possibly ordered scalars (Loads,
1510 /// ExtractElement, ExtractValue), which can be part of the graph.
1511 std::optional<OrdersType>findReusedOrderedScalars(const TreeEntry &TE);
1512
1513 /// Sort loads into increasing pointer offsets to allow greater clustering.
1514 std::optional<OrdersType>findPartiallyOrderedLoads(const TreeEntry &TE);
1515
1516 /// Gets reordering data for the given tree entry. If the entry is vectorized
1517 /// - just return ReorderIndices, otherwise check if the scalars can be
1518 /// reordered and return the most optimal order.
1519 /// \return std::nullopt if ordering is not important, empty order, if
1520 /// identity order is important, or the actual order.
1521 /// \param TopToBottom If true, include the order of vectorized stores and
1522 /// insertelement nodes, otherwise skip them.
1523 std::optional<OrdersType>getReorderingData(const TreeEntry &TE,
1524bool TopToBottom);
1525
1526 /// Reorders the current graph to the most profitable order starting from the
1527 /// root node to the leaf nodes. The best order is chosen only from the nodes
1528 /// of the same size (vectorization factor). Smaller nodes are considered
1529 /// parts of a subgraph with a smaller VF and are reordered independently. We
1530 /// can do this because we still need to extend smaller nodes to the wider VF,
1531 /// and we can merge the reordering shuffles with the widening shuffles.
1532voidreorderTopToBottom();
1533
1534 /// Reorders the current graph to the most profitable order starting from
1535 /// the leaves to the root. It allows rotating small subgraphs and reduces the
1536 /// number of reshuffles if the leaf nodes use the same order. In this case we
1537 /// can merge the orders and just shuffle the user node instead of shuffling its
1538 /// operands. Moreover, even if the leaf nodes have different orders, it allows
1539 /// sinking the reordering closer to the root node of the graph and merging it
1540 /// later during analysis.
1541voidreorderBottomToTop(bool IgnoreReorder =false);
1542
1543 /// \return The vector element size in bits to use when vectorizing the
1544 /// expression tree ending at \p V. If V is a store, the size is the width of
1545 /// the stored value. Otherwise, the size is the width of the largest loaded
1546 /// value reaching V. This method is used by the vectorizer to calculate
1547 /// vectorization factors.
1548unsignedgetVectorElementSize(Value *V);
1549
1550 /// Compute the minimum type sizes required to represent the entries in a
1551 /// vectorizable tree.
1552voidcomputeMinimumValueSizes();
1553
1554// \returns maximum vector register size as set by TTI or overridden by cl::opt.
1555 unsigned getMaxVecRegSize() const {
1556   return MaxVecRegSize;
1557 }
1558
1559 // \returns minimum vector register size as set by cl::opt.
1560 unsigned getMinVecRegSize() const {
1561   return MinVecRegSize;
1562 }
1563
1564 unsigned getMinVF(unsigned Sz) const {
1565   return std::max(2U, getMinVecRegSize() / Sz);
1566 }
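// Worked example (illustrative, not from the original source): assuming a
// 128-bit minimum vector register size and 32-bit scalars (Sz == 32),
// getMinVF returns std::max(2U, 128 / 32) == 4; with 64-bit scalars it
// returns the floor of 2 lanes.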
1567
1568unsignedgetMaximumVF(unsigned ElemWidth,unsigned Opcode) const{
1569unsigned MaxVF =MaxVFOption.getNumOccurrences() ?
1570MaxVFOption :TTI->getMaximumVF(ElemWidth, Opcode);
1571return MaxVF ? MaxVF : UINT_MAX;
1572 }
1573
1574 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1575 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1576 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1577 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1578 ///
1579 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1580unsignedcanMapToVector(Type *T)const;
1581
1582 /// \returns True if the VectorizableTree is both tiny and not fully
1583 /// vectorizable. We do not vectorize such trees.
1584boolisTreeTinyAndNotFullyVectorizable(bool ForReduction =false)const;
1585
1586 /// Checks if the graph and all its subgraphs cannot be better vectorized.
1587 /// It may happen if all gather nodes are loads and they cannot be
1588 /// "clusterized". In this case even subgraphs cannot be vectorized more
1589 /// effectively than the base graph.
1590boolisTreeNotExtendable()const;
1591
1592 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1593 /// can be load combined in the backend. Load combining may not be allowed in
1594 /// the IR optimizer, so we do not want to alter the pattern. For example,
1595 /// partially transforming a scalar bswap() pattern into vector code is
1596 /// effectively impossible for the backend to undo.
1597 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1598 /// may not be necessary.
1599boolisLoadCombineReductionCandidate(RecurKind RdxKind)const;
1600
1601 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1602 /// can be load combined in the backend. Load combining may not be allowed in
1603 /// the IR optimizer, so we do not want to alter the pattern. For example,
1604 /// partially transforming a scalar bswap() pattern into vector code is
1605 /// effectively impossible for the backend to undo.
1606 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1607 /// may not be necessary.
1608boolisLoadCombineCandidate(ArrayRef<Value *> Stores)const;
1609
1610 /// Checks if the given array of loads can be represented as a vectorized,
1611 /// scatter or just simple gather.
1612 /// \param VL list of loads.
1613 /// \param VL0 main load value.
1614 /// \param Order returned order of load instructions.
1615 /// \param PointerOps returned list of pointer operands.
1616 /// \param BestVF returns the best vector factor, if the recursive check found
1617 /// better vectorization sequences than a masked gather.
1618 /// \param TryRecursiveCheck used to check if a long masked gather can be
1619 /// represented as a series of loads/insert-subvector operations, if profitable.
1620LoadsStatecanVectorizeLoads(ArrayRef<Value *> VL,constValue *VL0,
1621SmallVectorImpl<unsigned> &Order,
1622SmallVectorImpl<Value *> &PointerOps,
1623unsigned *BestVF =nullptr,
1624bool TryRecursiveCheck =true)const;
1625
1626 /// Registers a non-vectorizable sequence of loads.
1627 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
1628   ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1629 }
1630
1631 /// Checks if the given load sequence is already known to be non-vectorizable.
1632 template <typename T>
1633 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
1634   return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1635 }
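// Usage sketch (illustrative, not from the original source), assuming a
// SmallVector<LoadInst *> Loads that we are about to analyze:
//   if (!areKnownNonVectorizableLoads(ArrayRef(Loads))) {
//     bool Succeeded = /* ... attempt to vectorize the loads ... */ false;
//     if (!Succeeded)
//       registerNonVectorizableLoads(ArrayRef(Loads));
//   }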
1636
1637OptimizationRemarkEmitter *getORE() {return ORE; }
1638
1639 /// This structure holds any data we need about the edges being traversed
1640 /// during buildTree_rec(). We keep track of:
1641 /// (i) the user TreeEntry index, and
1642 /// (ii) the index of the edge.
1643structEdgeInfo {
1644EdgeInfo() =default;
1645EdgeInfo(TreeEntry *UserTE,unsignedEdgeIdx)
1646 :UserTE(UserTE),EdgeIdx(EdgeIdx) {}
1647 /// The user TreeEntry.
1648 TreeEntry *UserTE =nullptr;
1649 /// The operand index of the use.
1650unsignedEdgeIdx = UINT_MAX;
1651#ifndef NDEBUG
1652friendinlineraw_ostream &operator<<(raw_ostream &OS,
1653constBoUpSLP::EdgeInfo &EI) {
1654 EI.dump(OS);
1655returnOS;
1656 }
1657 /// Debug print.
1658voiddump(raw_ostream &OS) const{
1659OS <<"{User:" << (UserTE ? std::to_string(UserTE->Idx) :"null")
1660 <<" EdgeIdx:" <<EdgeIdx <<"}";
1661 }
1662LLVM_DUMP_METHODvoiddump() const{dump(dbgs()); }
1663#endif
1664bool operator == (constEdgeInfo &Other) const{
1665returnUserTE ==Other.UserTE &&EdgeIdx ==Other.EdgeIdx;
1666 }
1667 };
1668
1669 /// A helper class used for scoring candidates for two consecutive lanes.
1670classLookAheadHeuristics {
1671constTargetLibraryInfo &TLI;
1672constDataLayout &DL;
1673ScalarEvolution &SE;
1674constBoUpSLP &R;
1675int NumLanes;// Total number of lanes (aka vectorization factor).
1676int MaxLevel;// The maximum recursion depth for accumulating score.
1677
1678public:
1679LookAheadHeuristics(constTargetLibraryInfo &TLI,constDataLayout &DL,
1680ScalarEvolution &SE,constBoUpSLP &R,int NumLanes,
1681int MaxLevel)
1682 : TLI(TLI),DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1683 MaxLevel(MaxLevel) {}
1684
1685// The hard-coded scores listed here are not very important, though they should
1686// be higher for better matches to improve the resulting cost. When
1687// computing the scores of matching one sub-tree with another, we are
1688// basically counting the number of values that are matching. So even if all
1689// scores are set to 1, we would still get a decent matching result.
1690// However, sometimes we have to break ties. For example we may have to
1691// choose between matching loads vs matching opcodes. This is what these
1692// scores are helping us with: they provide the order of preference. Also,
1693// this is important if the scalar is externally used or used in another
1694// tree entry node in a different lane.
1695
1696 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1697staticconstintScoreConsecutiveLoads = 4;
1698 /// The same load multiple times. This should have a better score than
1699 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1700 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
1701 /// a vector load and 1.0 for a broadcast.
1702staticconstintScoreSplatLoads = 3;
1703 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1704staticconstintScoreReversedLoads = 3;
1705 /// A load candidate for masked gather.
1706staticconstintScoreMaskedGatherCandidate = 1;
1707 /// ExtractElementInst from same vector and consecutive indexes.
1708staticconstintScoreConsecutiveExtracts = 4;
1709 /// ExtractElementInst from same vector and reversed indices.
1710staticconstintScoreReversedExtracts = 3;
1711 /// Constants.
1712staticconstintScoreConstants = 2;
1713 /// Instructions with the same opcode.
1714staticconstintScoreSameOpcode = 2;
1715 /// Instructions with alt opcodes (e.g., add + sub).
1716staticconstintScoreAltOpcodes = 1;
1717 /// Identical instructions (a.k.a. splat or broadcast).
1718staticconstintScoreSplat = 1;
1719 /// Matching with an undef is preferable to failing.
1720staticconstintScoreUndef = 1;
1721 /// Score for failing to find a decent match.
1722staticconstintScoreFail = 0;
1723 /// Score if all users are vectorized.
1724staticconstintScoreAllUserVectorized = 1;
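// Illustrative comparison (not from the original source): a pair of loads
// from A[i] and A[i+1] scores ScoreConsecutiveLoads (4), which wins over two
// unrelated adds scoring ScoreSameOpcode (2), which in turn wins over an
// add/sub pair scoring ScoreAltOpcodes (1).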
1725
1726 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1727 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1728 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1729 /// MainAltOps.
1730intgetShallowScore(Value *V1,Value *V2,Instruction *U1,Instruction *U2,
1731ArrayRef<Value *> MainAltOps) const{
1732if (!isValidElementType(V1->getType()) ||
1733 !isValidElementType(V2->getType()))
1734returnLookAheadHeuristics::ScoreFail;
1735
1736if (V1 == V2) {
1737if (isa<LoadInst>(V1)) {
1738// Returns true if the users of V1 and V2 won't need to be extracted.
1739auto AllUsersAreInternal = [U1, U2,this](Value *V1,Value *V2) {
1740// Bail out if we have too many uses to save compilation time.
1741if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1742returnfalse;
1743
1744auto AllUsersVectorized = [U1, U2,this](Value *V) {
1745returnllvm::all_of(V->users(), [U1, U2,this](Value *U) {
1746 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1747 });
1748 };
1749return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1750 };
1751// A broadcast of a load can be cheaper on some targets.
1752if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1753ElementCount::getFixed(NumLanes)) &&
1754 ((int)V1->getNumUses() == NumLanes ||
1755 AllUsersAreInternal(V1, V2)))
1756returnLookAheadHeuristics::ScoreSplatLoads;
1757 }
1758returnLookAheadHeuristics::ScoreSplat;
1759 }
1760
1761auto CheckSameEntryOrFail = [&]() {
1762if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1763 TE1 && TE1 == R.getTreeEntry(V2))
1764returnLookAheadHeuristics::ScoreSplatLoads;
1765returnLookAheadHeuristics::ScoreFail;
1766 };
1767
1768auto *LI1 = dyn_cast<LoadInst>(V1);
1769auto *LI2 = dyn_cast<LoadInst>(V2);
1770if (LI1 && LI2) {
1771if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1772 !LI2->isSimple())
1773return CheckSameEntryOrFail();
1774
1775 std::optional<int> Dist =getPointersDiff(
1776 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1777 LI2->getPointerOperand(),DL, SE,/*StrictCheck=*/true);
1778if (!Dist || *Dist == 0) {
1779if (getUnderlyingObject(LI1->getPointerOperand()) ==
1780getUnderlyingObject(LI2->getPointerOperand()) &&
1781 R.TTI->isLegalMaskedGather(
1782getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1783returnLookAheadHeuristics::ScoreMaskedGatherCandidate;
1784return CheckSameEntryOrFail();
1785 }
1786// The distance is too large - still may be profitable to use masked
1787// loads/gathers.
1788if (std::abs(*Dist) > NumLanes / 2)
1789returnLookAheadHeuristics::ScoreMaskedGatherCandidate;
1790// This still will detect consecutive loads, but we might have "holes"
1791// in some cases. It is ok for non-power-2 vectorization and may produce
1792// better results. It should not affect current vectorization.
1793return (*Dist > 0) ?LookAheadHeuristics::ScoreConsecutiveLoads
1794 :LookAheadHeuristics::ScoreReversedLoads;
1795 }
1796
1797auto *C1 = dyn_cast<Constant>(V1);
1798auto *C2 = dyn_cast<Constant>(V2);
1799if (C1 && C2)
1800returnLookAheadHeuristics::ScoreConstants;
1801
1802// Extracts from consecutive indexes of the same vector get a better score, as
1803// the extracts could be optimized away.
1804Value *EV1;
1805ConstantInt *Ex1Idx;
1806if (match(V1,m_ExtractElt(m_Value(EV1),m_ConstantInt(Ex1Idx)))) {
1807// Undefs are always profitable for extractelements.
1808// Compiler can easily combine poison and extractelement <non-poison> or
1809// undef and extractelement <poison>. But combining undef +
1810// extractelement <non-poison-but-may-produce-poison> requires some
1811// extra operations.
1812if (isa<UndefValue>(V2))
1813return (isa<PoisonValue>(V2) ||isUndefVector(EV1).all())
1814 ?LookAheadHeuristics::ScoreConsecutiveExtracts
1815 :LookAheadHeuristics::ScoreSameOpcode;
1816Value *EV2 =nullptr;
1817ConstantInt *Ex2Idx =nullptr;
1818if (match(V2,
1819m_ExtractElt(m_Value(EV2),m_CombineOr(m_ConstantInt(Ex2Idx),
1820m_Undef())))) {
1821// Undefs are always profitable for extractelements.
1822if (!Ex2Idx)
1823returnLookAheadHeuristics::ScoreConsecutiveExtracts;
1824if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1825returnLookAheadHeuristics::ScoreConsecutiveExtracts;
1826if (EV2 == EV1) {
1827int Idx1 = Ex1Idx->getZExtValue();
1828int Idx2 = Ex2Idx->getZExtValue();
1829int Dist = Idx2 - Idx1;
1830// The distance is too large - still may be profitable to use
1831// shuffles.
1832if (std::abs(Dist) == 0)
1833returnLookAheadHeuristics::ScoreSplat;
1834if (std::abs(Dist) > NumLanes / 2)
1835returnLookAheadHeuristics::ScoreSameOpcode;
1836return (Dist > 0) ?LookAheadHeuristics::ScoreConsecutiveExtracts
1837 :LookAheadHeuristics::ScoreReversedExtracts;
1838 }
1839returnLookAheadHeuristics::ScoreAltOpcodes;
1840 }
1841return CheckSameEntryOrFail();
1842 }
1843
1844auto *I1 = dyn_cast<Instruction>(V1);
1845auto *I2 = dyn_cast<Instruction>(V2);
1846if (I1 && I2) {
1847if (I1->getParent() != I2->getParent())
1848return CheckSameEntryOrFail();
1849SmallVector<Value *, 4> Ops(MainAltOps);
1850 Ops.push_back(I1);
1851 Ops.push_back(I2);
1852 InstructionsState S =getSameOpcode(Ops, TLI);
1853// Note: Only consider instructions with <= 2 operands to avoid
1854// complexity explosion.
1855if (S &&
1856 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
1857 !S.isAltShuffle()) &&
1858all_of(Ops, [&S](Value *V) {
1859return isa<PoisonValue>(V) ||
1860 cast<Instruction>(V)->getNumOperands() ==
1861 S.getMainOp()->getNumOperands();
1862 }))
1863return S.isAltShuffle() ?LookAheadHeuristics::ScoreAltOpcodes
1864 :LookAheadHeuristics::ScoreSameOpcode;
1865 }
1866
1867if (I1 && isa<PoisonValue>(V2))
1868returnLookAheadHeuristics::ScoreSameOpcode;
1869
1870if (isa<UndefValue>(V2))
1871returnLookAheadHeuristics::ScoreUndef;
1872
1873return CheckSameEntryOrFail();
1874 }
1875
1876 /// Go through the operands of \p LHS and \p RHS recursively until
1877 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1878 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1879 /// of \p U1 and \p U2), except at the beginning of the recursion where
1880 /// these are set to nullptr.
1881 ///
1882 /// For example:
1883 /// \verbatim
1884 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1885 /// \ / \ / \ / \ /
1886 /// + + + +
1887 /// G1 G2 G3 G4
1888 /// \endverbatim
1889 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1890 /// each level recursively, accumulating the score. It starts from matching
1891 /// the additions at level 0, then moves on to the loads (level 1). The
1892 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1893 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1894 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1895 /// Please note that the order of the operands does not matter, as we
1896 /// evaluate the score of all profitable combinations of operands. In
1897 /// other words the score of G1 and G4 is the same as G1 and G2. This
1898 /// heuristic is based on ideas described in:
1899 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1900 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1901 /// Luís F. W. Góes
1902intgetScoreAtLevelRec(Value *LHS,Value *RHS,Instruction *U1,
1903Instruction *U2,int CurrLevel,
1904ArrayRef<Value *> MainAltOps) const{
1905
1906// Get the shallow score of V1 and V2.
1907int ShallowScoreAtThisLevel =
1908getShallowScore(LHS,RHS, U1, U2, MainAltOps);
1909
1910// If reached MaxLevel,
1911// or if V1 and V2 are not instructions,
1912// or if they are SPLAT,
1913// or if they are not consecutive,
1914// or if profitable to vectorize loads or extractelements, early return
1915// the current cost.
1916auto *I1 = dyn_cast<Instruction>(LHS);
1917auto *I2 = dyn_cast<Instruction>(RHS);
1918if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1919 ShallowScoreAtThisLevel ==LookAheadHeuristics::ScoreFail ||
1920 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1921 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1922 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1923 ShallowScoreAtThisLevel))
1924return ShallowScoreAtThisLevel;
1925assert(I1 && I2 &&"Should have early exited.");
1926
1927// Contains the I2 operand indexes that got matched with I1 operands.
1928SmallSet<unsigned, 4> Op2Used;
1929
1930// Recursion towards the operands of I1 and I2. We are trying all possible
1931// operand pairs, and keeping track of the best score.
1932for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1933 OpIdx1 != NumOperands1; ++OpIdx1) {
1934// Try to pair op1I with the best operand of I2.
1935int MaxTmpScore = 0;
1936unsigned MaxOpIdx2 = 0;
1937bool FoundBest =false;
1938// If I2 is commutative try all combinations.
1939unsigned FromIdx =isCommutative(I2) ? 0 : OpIdx1;
1940unsigned ToIdx =isCommutative(I2)
1941 ? I2->getNumOperands()
1942 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1943assert(FromIdx <= ToIdx &&"Bad index");
1944for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1945// Skip operands already paired with OpIdx1.
1946if (Op2Used.count(OpIdx2))
1947continue;
1948// Recursively calculate the cost at each level
1949int TmpScore =
1950getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1951 I1, I2, CurrLevel + 1, {});
1952// Look for the best score.
1953if (TmpScore >LookAheadHeuristics::ScoreFail &&
1954 TmpScore > MaxTmpScore) {
1955 MaxTmpScore = TmpScore;
1956 MaxOpIdx2 = OpIdx2;
1957 FoundBest =true;
1958 }
1959 }
1960if (FoundBest) {
1961// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1962 Op2Used.insert(MaxOpIdx2);
1963 ShallowScoreAtThisLevel += MaxTmpScore;
1964 }
1965 }
1966return ShallowScoreAtThisLevel;
1967 }
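// Worked example (illustrative, not from the original source): for
// G1 = A[0] + B[0] and G2 = A[1] + B[1] from the comment above, and assuming
// simple same-block loads, level 1 contributes ScoreSameOpcode (2) for the
// two adds, and the operand pairs {A[0], A[1]} and {B[0], B[1]} each add
// ScoreConsecutiveLoads (4), for a total of roughly 10; matching G1 with
// G3 = C[0] + D[0] stays at 2 because the cross-array operand pairs fail.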
1968 };
1969 /// A helper data structure to hold the operands of a vector of instructions.
1970 /// This supports a fixed vector length for all operand vectors.
1971classVLOperands {
1972 /// For each operand we need (i) the value, and (ii) the opcode that it
1973 /// would be attached to if the expression was in a left-linearized form.
1974 /// This is required to avoid illegal operand reordering.
1975 /// For example:
1976 /// \verbatim
1977 /// 0 Op1
1978 /// |/
1979 /// Op1 Op2 Linearized + Op2
1980 /// \ / ----------> |/
1981 /// - -
1982 ///
1983 /// Op1 - Op2 (0 + Op1) - Op2
1984 /// \endverbatim
1985 ///
1986 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1987 ///
1988 /// Another way to think of this is to track all the operations across the
1989 /// path from the operand all the way to the root of the tree and to
1990 /// calculate the operation that corresponds to this path. For example, the
1991 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1992 /// corresponding operation is a '-' (which matches the one in the
1993 /// linearized tree, as shown above).
1994 ///
1995 /// For lack of a better term, we refer to this operation as Accumulated
1996 /// Path Operation (APO).
1997structOperandData {
1998 OperandData() =default;
1999 OperandData(Value *V,bool APO,bool IsUsed)
2000 : V(V), APO(APO), IsUsed(IsUsed) {}
2001 /// The operand value.
2002Value *V =nullptr;
2003 /// TreeEntries only allow a single opcode, or an alternate sequence of
2004 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
2005 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2006 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2007 /// (e.g., Add/Mul)
2008bool APO =false;
2009 /// Helper data for the reordering function.
2010bool IsUsed =false;
2011 };
2012
2013 /// During operand reordering, we are trying to select the operand at lane
2014 /// that matches best with the operand at the neighboring lane. Our
2015 /// selection is based on the type of value we are looking for. For example,
2016 /// if the neighboring lane has a load, we need to look for a load that is
2017 /// accessing a consecutive address. These strategies are summarized in the
2018 /// 'ReorderingMode' enumerator.
2019enum class ReorderingMode {
2020 Load,///< Matching loads to consecutive memory addresses
2021 Opcode,///< Matching instructions based on opcode (same or alternate)
2022Constant,///< Matching constants
2023Splat,///< Matching the same instruction multiple times (broadcast)
2024Failed,///< We failed to create a vectorizable group
2025 };
2026
2027usingOperandDataVec =SmallVector<OperandData, 2>;
2028
2029 /// A vector of operand vectors.
2030SmallVector<OperandDataVec, 4> OpsVec;
2031 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2032 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2033unsigned ArgSize = 0;
2034
2035constTargetLibraryInfo &TLI;
2036constDataLayout &DL;
2037ScalarEvolution &SE;
2038constBoUpSLP &R;
2039constLoop *L =nullptr;
2040
2041 /// \returns the operand data at \p OpIdx and \p Lane.
2042 OperandData &getData(unsigned OpIdx,unsigned Lane) {
2043return OpsVec[OpIdx][Lane];
2044 }
2045
2046 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2047const OperandData &getData(unsigned OpIdx,unsigned Lane) const{
2048return OpsVec[OpIdx][Lane];
2049 }
2050
2051 /// Clears the used flag for all entries.
2052void clearUsed() {
2053for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2054 OpIdx != NumOperands; ++OpIdx)
2055for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2056 ++Lane)
2057 OpsVec[OpIdx][Lane].IsUsed =false;
2058 }
2059
2060 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2061void swap(unsigned OpIdx1,unsigned OpIdx2,unsigned Lane) {
2062std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2063 }
2064
2065 /// \param Lane lane of the operands under analysis.
2066 /// \param OpIdx operand index in lane \p Lane for which we're looking for the
2067 /// best candidate.
2068 /// \param Idx operand index of the current candidate value.
2069 /// \returns The additional score due to possible broadcasting of the
2070 /// elements in the lane. It is more profitable to have a power-of-2 number of
2071 /// unique elements in the lane, as it will be vectorized with a higher
2072 /// probability after removing duplicates. Currently the SLP vectorizer
2073 /// supports only vectorization of a power-of-2 number of unique scalars.
2074int getSplatScore(unsigned Lane,unsigned OpIdx,unsignedIdx,
2075constSmallBitVector &UsedLanes) const{
2076Value *IdxLaneV = getData(Idx, Lane).V;
2077if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2078 isa<ExtractElementInst>(IdxLaneV))
2079return 0;
2080SmallDenseMap<Value *, unsigned, 4> Uniques;
2081for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2082if (Ln == Lane)
2083continue;
2084Value *OpIdxLnV = getData(OpIdx, Ln).V;
2085if (!isa<Instruction>(OpIdxLnV))
2086return 0;
2087 Uniques.try_emplace(OpIdxLnV, Ln);
2088 }
2089unsigned UniquesCount = Uniques.size();
2090auto IdxIt = Uniques.find(IdxLaneV);
2091unsigned UniquesCntWithIdxLaneV =
2092 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2093Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2094auto OpIdxIt = Uniques.find(OpIdxLaneV);
2095unsigned UniquesCntWithOpIdxLaneV =
2096 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2097if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2098return 0;
2099return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2100 UniquesCntWithOpIdxLaneV,
2101 UniquesCntWithOpIdxLaneV -
2102bit_floor(UniquesCntWithOpIdxLaneV)) -
2103 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2104 ? UniquesCntWithIdxLaneV -bit_floor(UniquesCntWithIdxLaneV)
2105 :bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2106 }
2107
2108 /// \param Lane lane of the operands under analysis.
2109 /// \param OpIdx operand index in lane \p Lane for which we're looking for the
2110 /// best candidate.
2111 /// \param Idx operand index of the current candidate value.
2112 /// \returns The additional score for the scalar which users are all
2113 /// vectorized.
2114int getExternalUseScore(unsigned Lane,unsigned OpIdx,unsignedIdx) const{
2115Value *IdxLaneV = getData(Idx, Lane).V;
2116Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2117// Do not care about the number of uses for vector-like instructions
2118// (extractelement/extractvalue with constant indices); they are extracts
2119// themselves and already externally used. Vectorization of such
2120// instructions does not add an extra extractelement instruction, it just may
2121// remove it.
2122if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2123isVectorLikeInstWithConstOps(OpIdxLaneV))
2124returnLookAheadHeuristics::ScoreAllUserVectorized;
2125auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2126if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2127return 0;
2128return R.areAllUsersVectorized(IdxLaneI)
2129 ?LookAheadHeuristics::ScoreAllUserVectorized
2130 : 0;
2131 }
2132
2133 /// Score scaling factor for fully compatible instructions but with
2134 /// different number of external uses. Allows better selection of the
2135 /// instructions with less external uses.
2136staticconstint ScoreScaleFactor = 10;
2137
2138 /// \Returns the look-ahead score, which tells us how much the sub-trees
2139 /// rooted at \p LHS and \p RHS match; the more they match, the higher the
2140 /// score. This helps break ties in an informed way when we cannot decide on
2141 /// the order of the operands by just considering the immediate
2142 /// predecessors.
2143int getLookAheadScore(Value *LHS,Value *RHS,ArrayRef<Value *> MainAltOps,
2144int Lane,unsigned OpIdx,unsignedIdx,
2145bool &IsUsed,constSmallBitVector &UsedLanes) {
2146LookAheadHeuristics LookAhead(TLI,DL, SE, R, getNumLanes(),
2147LookAheadMaxDepth);
2148// Keep track of the instruction stack as we recurse into the operands
2149// during the look-ahead score exploration.
2150int Score =
2151 LookAhead.getScoreAtLevelRec(LHS,RHS,/*U1=*/nullptr,/*U2=*/nullptr,
2152/*CurrLevel=*/1, MainAltOps);
2153if (Score) {
2154int SplatScore = getSplatScore(Lane, OpIdx,Idx, UsedLanes);
2155if (Score <= -SplatScore) {
2156// Failed score.
2157 Score = 0;
2158 }else {
2159 Score += SplatScore;
2160// Scale score to see the difference between different operands
2161// and similar operands but all vectorized/not all vectorized
2162// uses. It does not affect actual selection of the best
2163// compatible operand in general, just allows to select the
2164// operand with all vectorized uses.
2165 Score *= ScoreScaleFactor;
2166 Score += getExternalUseScore(Lane, OpIdx,Idx);
2167 IsUsed =true;
2168 }
2169 }
2170return Score;
2171 }
2172
2173 /// Best defined scores per lanes between the passes. Used to choose the
2174 /// best operand (with the highest score) between the passes.
2175 /// The key - {Operand Index, Lane}.
2176 /// The value - the best score between the passes for the lane and the
2177 /// operand.
2178SmallDenseMap<std::pair<unsigned, unsigned>,unsigned, 8>
2179 BestScoresPerLanes;
2180
2181// Search all operands in Ops[*][Lane] for the one that best matches
2182// Ops[OpIdx][LastLane] and return its operand index.
2183// If no good match can be found, return std::nullopt.
2184 std::optional<unsigned>
2185 getBestOperand(unsigned OpIdx,int Lane,int LastLane,
2186ArrayRef<ReorderingMode> ReorderingModes,
2187ArrayRef<Value *> MainAltOps,
2188constSmallBitVector &UsedLanes) {
2189unsigned NumOperands = getNumOperands();
2190
2191// The operand of the previous lane at OpIdx.
2192Value *OpLastLane = getData(OpIdx, LastLane).V;
2193
2194// Our strategy mode for OpIdx.
2195 ReorderingMode RMode = ReorderingModes[OpIdx];
2196if (RMode == ReorderingMode::Failed)
2197return std::nullopt;
2198
2199// The linearized opcode of the operand at OpIdx, Lane.
2200bool OpIdxAPO = getData(OpIdx, Lane).APO;
2201
2202// The best operand index and its score.
2203// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2204// are using the score to differentiate between the two.
2205structBestOpData {
2206 std::optional<unsigned>Idx;
2207unsigned Score = 0;
2208 } BestOp;
2209 BestOp.Score =
2210 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2211 .first->second;
2212
2213// Track if the operand must be marked as used. If the operand is set to
2214// Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
2215// we may want to reestimate the operands again on the following iterations.
2216bool IsUsed = RMode == ReorderingMode::Splat ||
2217 RMode == ReorderingMode::Constant ||
2218 RMode == ReorderingMode::Load;
2219// Iterate through all unused operands and look for the best.
2220for (unsignedIdx = 0;Idx != NumOperands; ++Idx) {
2221// Get the operand at Idx and Lane.
2222 OperandData &OpData = getData(Idx, Lane);
2223Value *Op = OpData.V;
2224bool OpAPO = OpData.APO;
2225
2226// Skip already selected operands.
2227if (OpData.IsUsed)
2228continue;
2229
2230// Skip if we are trying to move the operand to a position with a
2231// different opcode in the linearized tree form. This would break the
2232// semantics.
2233if (OpAPO != OpIdxAPO)
2234continue;
2235
2236// Look for an operand that matches the current mode.
2237switch (RMode) {
2238case ReorderingMode::Load:
2239case ReorderingMode::Opcode: {
2240bool LeftToRight = Lane > LastLane;
2241Value *OpLeft = (LeftToRight) ? OpLastLane :Op;
2242Value *OpRight = (LeftToRight) ?Op : OpLastLane;
2243int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2244 OpIdx,Idx, IsUsed, UsedLanes);
2245if (Score >static_cast<int>(BestOp.Score) ||
2246 (Score > 0 && Score ==static_cast<int>(BestOp.Score) &&
2247Idx == OpIdx)) {
2248 BestOp.Idx =Idx;
2249 BestOp.Score = Score;
2250 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2251 }
2252break;
2253 }
2254case ReorderingMode::Constant:
2255if (isa<Constant>(Op) ||
2256 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2257 BestOp.Idx =Idx;
2258if (isa<Constant>(Op)) {
2259 BestOp.Score =LookAheadHeuristics::ScoreConstants;
2260 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2261LookAheadHeuristics::ScoreConstants;
2262 }
2263if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2264 IsUsed =false;
2265 }
2266break;
2267case ReorderingMode::Splat:
2268if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2269 IsUsed =Op == OpLastLane;
2270if (Op == OpLastLane) {
2271 BestOp.Score =LookAheadHeuristics::ScoreSplat;
2272 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2273LookAheadHeuristics::ScoreSplat;
2274 }
2275 BestOp.Idx =Idx;
2276 }
2277break;
2278case ReorderingMode::Failed:
2279llvm_unreachable("Not expected Failed reordering mode.");
2280 }
2281 }
2282
2283if (BestOp.Idx) {
2284 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2285return BestOp.Idx;
2286 }
2287// If we could not find a good match return std::nullopt.
2288return std::nullopt;
2289 }
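// Usage sketch (illustrative, not from the original source): reorder() walks
// the lanes outward from the best starting lane and, for each operand
// position OpIdx, calls getBestOperand(OpIdx, Lane, LastLane, ...); when an
// index is returned, the candidate is swapped into column OpIdx for that
// lane via swap(OpIdx, *BestIdx, Lane).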
2290
2291 /// Helper for reorderOperandVecs.
2292 /// \returns the lane that we should start reordering from. This is the one
2293 /// which has the least number of operands that can freely move about, or is
2294 /// less profitable because it already has the most optimal set of operands.
2295unsigned getBestLaneToStartReordering() const{
2296unsigned Min = UINT_MAX;
2297unsigned SameOpNumber = 0;
2298// std::pair<unsigned, unsigned> is used to implement a simple voting
2299// algorithm and choose the lane with the least number of operands that
2300// can freely move about, or that is less profitable because it already has
2301// the most optimal set of operands. The first unsigned is a counter for
2302// voting, the second unsigned is the counter of lanes with instructions
2303// with same/alternate opcodes and same parent basic block.
2304MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
2305// Try to be closer to the original results, if we have multiple lanes
2306// with same cost. If 2 lanes have the same cost, use the one with the
2307// highest index.
2308for (intI = getNumLanes();I > 0; --I) {
2309unsigned Lane =I - 1;
2310 OperandsOrderData NumFreeOpsHash =
2311 getMaxNumOperandsThatCanBeReordered(Lane);
2312// Compare the number of operands that can move and choose the one with
2313// the least number.
2314if (NumFreeOpsHash.NumOfAPOs < Min) {
2315 Min = NumFreeOpsHash.NumOfAPOs;
2316 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2317 HashMap.clear();
2318 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2319 }elseif (NumFreeOpsHash.NumOfAPOs == Min &&
2320 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2321// Select the most optimal lane in terms of number of operands that
2322// should be moved around.
2323 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2324 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2325 }elseif (NumFreeOpsHash.NumOfAPOs == Min &&
2326 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2327auto [It, Inserted] =
2328 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2329if (!Inserted)
2330 ++It->second.first;
2331 }
2332 }
2333// Select the lane with the minimum counter.
2334unsigned BestLane = 0;
2335unsigned CntMin = UINT_MAX;
2336for (constauto &Data :reverse(HashMap)) {
2337if (Data.second.first < CntMin) {
2338 CntMin =Data.second.first;
2339 BestLane =Data.second.second;
2340 }
2341 }
2342return BestLane;
2343 }
2344
2345 /// Data structure that helps to reorder operands.
2346structOperandsOrderData {
2347 /// The best number of operands with the same APOs, which can be
2348 /// reordered.
2349unsigned NumOfAPOs = UINT_MAX;
2350 /// Number of operands with the same/alternate instruction opcode and
2351 /// parent.
2352unsigned NumOpsWithSameOpcodeParent = 0;
2353 /// Hash for the actual operands ordering.
2354 /// Used to count operands, actually their position id and opcode
2355 /// value. It is used in the voting mechanism to find the lane with the
2356 /// least number of operands that can freely move about, or that is less
2357 /// profitable because it already has the most optimal set of operands. Can be
2358 /// replaced with SmallVector<unsigned> instead but hash code is faster
2359 /// and requires less memory.
2360unsigned Hash = 0;
2361 };
2362 /// \returns the maximum number of operands that are allowed to be reordered
2363 /// for \p Lane and the number of compatible instructions(with the same
2364 /// parent/opcode). This is used as a heuristic for selecting the first lane
2365 /// to start operand reordering.
2366 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const{
2367unsigned CntTrue = 0;
2368unsigned NumOperands = getNumOperands();
2369// Operands with the same APO can be reordered. We therefore need to count
2370// how many of them we have for each APO, like this: Cnt[APO] = x.
2371// Since we only have two APOs, namely true and false, we can avoid using
2372// a map. Instead we can simply count the number of operands that
2373// correspond to one of them (in this case the 'true' APO), and calculate
2374// the other by subtracting it from the total number of operands.
2375// Operands with the same instruction opcode and parent are more
2376// profitable since we don't need to move them in many cases, with a high
2377// probability such lane already can be vectorized effectively.
2378bool AllUndefs =true;
2379unsigned NumOpsWithSameOpcodeParent = 0;
2380Instruction *OpcodeI =nullptr;
2381BasicBlock *Parent =nullptr;
2382unsigned Hash = 0;
2383for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384const OperandData &OpData = getData(OpIdx, Lane);
2385if (OpData.APO)
2386 ++CntTrue;
2387// Use Boyer-Moore majority voting for finding the majority opcode and
2388// the number of times it occurs.
2389if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2390if (!OpcodeI || !getSameOpcode({OpcodeI,I}, TLI) ||
2391I->getParent() != Parent) {
2392if (NumOpsWithSameOpcodeParent == 0) {
2393 NumOpsWithSameOpcodeParent = 1;
2394 OpcodeI =I;
2395 Parent =I->getParent();
2396 }else {
2397 --NumOpsWithSameOpcodeParent;
2398 }
2399 }else {
2400 ++NumOpsWithSameOpcodeParent;
2401 }
2402 }
2403 Hash =hash_combine(
2404 Hash,hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2405 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2406 }
2407if (AllUndefs)
2408return {};
2409 OperandsOrderDataData;
2410Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2411Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2412Data.Hash = Hash;
2413returnData;
2414 }
2415
2416 /// Go through the instructions in VL and append their operands.
2417void appendOperandsOfVL(ArrayRef<Value *> VL,const InstructionsState &S) {
2418assert(!VL.empty() &&"Bad VL");
2419assert((empty() || VL.size() == getNumLanes()) &&
2420"Expected same number of lanes");
2421assert(S.valid() &&"InstructionsState is invalid.");
2422// IntrinsicInst::isCommutative returns true if swapping the first "two"
2423// arguments to the intrinsic produces the same result.
2424constexprunsigned IntrinsicNumOperands = 2;
2425Instruction *MainOp = S.getMainOp();
2426unsigned NumOperands = MainOp->getNumOperands();
2427 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
2428 OpsVec.resize(NumOperands);
2429unsigned NumLanes = VL.size();
2430for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2431 OpsVec[OpIdx].resize(NumLanes);
2432for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2433assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2434"Expected instruction or poison value");
2435// Our tree has just 3 nodes: the root and two operands.
2436// It is therefore trivial to get the APO. We only need to check the
2437// opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2438// RHS operand. The LHS operand of both add and sub is never attached
2439// to an inverse operation in the linearized form, therefore its APO
2440// is false. The RHS APO is true only if VL[Lane] is an inverse operation.
2441
2442// Since operand reordering is performed on groups of commutative
2443// operations or alternating sequences (e.g., +, -), we can safely
2444// tell the inverse operations by checking commutativity.
2445if (isa<PoisonValue>(VL[Lane])) {
2446if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
2447if (OpIdx == 0) {
2448 OpsVec[OpIdx][Lane] = {EI->getVectorOperand(),true,false};
2449continue;
2450 }
2451 }elseif (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
2452if (OpIdx == 0) {
2453 OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(),true,false};
2454continue;
2455 }
2456 }
2457 OpsVec[OpIdx][Lane] = {
2458PoisonValue::get(MainOp->getOperand(OpIdx)->getType()),true,
2459false};
2460continue;
2461 }
2462bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2463bool APO = (OpIdx == 0) ?false : IsInverseOperation;
2464 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2465 APO,false};
2466 }
2467 }
2468 }
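// Worked example (illustrative, not from the original source): for
// VL = { %a0 - %b0, %a1 - %b1 } the LHS operands {%a0, %a1} get APO == false
// (OpIdx == 0), while the RHS operands {%b0, %b1} get APO == true, because
// sub is not commutative and its RHS sits behind an inverse operation in the
// left-linearized form.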
2469
2470 /// \returns the number of operands.
2471unsigned getNumOperands() const{return ArgSize; }
2472
2473 /// \returns the number of lanes.
2474unsigned getNumLanes() const{return OpsVec[0].size(); }
2475
2476 /// \returns the operand value at \p OpIdx and \p Lane.
2477Value *getValue(unsigned OpIdx,unsigned Lane) const{
2478return getData(OpIdx, Lane).V;
2479 }
2480
2481 /// \returns true if the data structure is empty.
2482bool empty() const{return OpsVec.empty(); }
2483
2484 /// Clears the data.
2485void clear() { OpsVec.clear(); }
2486
2487 /// \Returns true if there are enough operands identical to \p Op to fill
2488 /// the whole vector (it is mixed with constants or loop invariant values).
2489 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2490bool shouldBroadcast(Value *Op,unsigned OpIdx,unsigned Lane) {
2491assert(Op == getValue(OpIdx, Lane) &&
2492"Op is expected to be getValue(OpIdx, Lane).");
2493// Small number of loads - try load matching.
2494if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2495returnfalse;
2496bool OpAPO = getData(OpIdx, Lane).APO;
2497bool IsInvariant = L && L->isLoopInvariant(Op);
2498unsigned Cnt = 0;
2499for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2500if (Ln == Lane)
2501continue;
2502// This is set to true if we found a candidate for broadcast at Lane.
2503bool FoundCandidate =false;
2504for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2505 OperandData &Data = getData(OpI, Ln);
2506if (Data.APO != OpAPO ||Data.IsUsed)
2507continue;
2508Value *OpILane = getValue(OpI, Lane);
2509bool IsConstantOp = isa<Constant>(OpILane);
2510// Consider the broadcast candidate if:
2511// 1. Same value is found in one of the operands.
2512if (Data.V ==Op ||
2513// 2. The operand in the given lane is not constant but there is a
2514// constant operand in another lane (which can be moved to the
2515// given lane). In this case we can represent it as a simple
2516// permutation of constant and broadcast.
2517 (!IsConstantOp &&
2518 ((Lns > 2 && isa<Constant>(Data.V)) ||
2519// 2.1. If we have only 2 lanes, we need to check that the value in the
2520// next lane does not build the same opcode sequence.
2521 (Lns == 2 &&
2522 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
2523 isa<Constant>(Data.V)))) ||
2524// 3. The operand in the current lane is loop invariant (can be
2525// hoisted out) and another operand is also a loop invariant
2526// (though not a constant). In this case the whole vector can be
2527// hoisted out.
2528// FIXME: need to teach the cost model about this case for better
2529// estimation.
2530 (IsInvariant && !isa<Constant>(Data.V) &&
2531 !getSameOpcode({Op,Data.V}, TLI) &&
2532 L->isLoopInvariant(Data.V))) {
2533 FoundCandidate =true;
2534Data.IsUsed =Data.V ==Op;
2535if (Data.V ==Op)
2536 ++Cnt;
2537break;
2538 }
2539 }
2540if (!FoundCandidate)
2541returnfalse;
2542 }
2543return getNumLanes() == 2 || Cnt > 1;
2544 }
2545
2546 /// Checks if there is at least a single operand in lanes other than
2547 /// \p Lane that is compatible with the operand \p Op.
2548bool canBeVectorized(Instruction *Op,unsigned OpIdx,unsigned Lane) const{
2549assert(Op == getValue(OpIdx, Lane) &&
2550"Op is expected to be getValue(OpIdx, Lane).");
2551bool OpAPO = getData(OpIdx, Lane).APO;
2552for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2553if (Ln == Lane)
2554continue;
2555if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2556const OperandData &Data = getData(OpI, Ln);
2557if (Data.APO != OpAPO ||Data.IsUsed)
2558returntrue;
2559Value *OpILn = getValue(OpI, Ln);
2560return (L && L->isLoopInvariant(OpILn)) ||
2561 (getSameOpcode({Op, OpILn}, TLI) &&
2562allSameBlock({Op, OpILn}));
2563 }))
2564returntrue;
2565 }
2566returnfalse;
2567 }
2568
2569public:
2570 /// Initialize with all the operands of the instruction vector \p RootVL.
2571VLOperands(ArrayRef<Value *> RootVL,const InstructionsState &S,
2572constBoUpSLP &R)
2573 : TLI(*R.TLI),DL(*R.DL), SE(*R.SE), R(R),
2574 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
2575// Append all the operands of RootVL.
2576 appendOperandsOfVL(RootVL, S);
2577 }
2578
2579 /// \Returns a value vector with the operands across all lanes for the
2580 /// operand at \p OpIdx.
2581ValueListgetVL(unsigned OpIdx) const{
2582ValueList OpVL(OpsVec[OpIdx].size());
2583assert(OpsVec[OpIdx].size() == getNumLanes() &&
2584"Expected same num of lanes across all operands");
2585for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2586 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2587return OpVL;
2588 }
2589
2590// Performs operand reordering for 2 or more operands.
2591// The original operands are in OrigOps[OpIdx][Lane].
2592// The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2593voidreorder() {
2594unsigned NumOperands = getNumOperands();
2595unsigned NumLanes = getNumLanes();
2596// Each operand has its own mode. We are using this mode to help us select
2597// the instructions for each lane, so that they match best with the ones
2598// we have selected so far.
2599SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2600
2601// This is a greedy single-pass algorithm. We are going over each lane
2602// once and deciding on the best order right away with no back-tracking.
2603// However, in order to increase its effectiveness, we start with the lane
2604// that has operands that can move the least. For example, given the
2605// following lanes:
2606// Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2607// Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2608// Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2609// Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2610// we will start at Lane 1, since the operands of the subtraction cannot
2611// be reordered. Then we will visit the rest of the lanes in a circular
2612// fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2613
2614// Find the first lane that we will start our search from.
2615unsigned FirstLane = getBestLaneToStartReordering();
2616
2617// Initialize the modes.
2618for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2619Value *OpLane0 = getValue(OpIdx, FirstLane);
2620// Keep track if we have instructions with all the same opcode on one
2621// side.
2622if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2623// Check if OpLane0 should be broadcast.
2624if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2625 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2626 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2627elseif (isa<LoadInst>(OpILane0))
2628 ReorderingModes[OpIdx] = ReorderingMode::Load;
2629else
2630 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2631 }elseif (isa<Constant>(OpLane0)) {
2632 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2633 }elseif (isa<Argument>(OpLane0)) {
2634// Our best hope is a Splat. It may save some cost in some cases.
2635 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2636 }else {
2637llvm_unreachable("Unexpected value kind.");
2638 }
2639 }
2640
2641// Check that we don't have the same operands. There is no need to reorder
2642// if the operands are just a perfect diamond or shuffled diamond match. The
2643// only exceptions are possible broadcasts or a non-power-of-2 number of
2644// scalars (just for now).
2645auto &&SkipReordering = [this]() {
2646SmallPtrSet<Value *, 4> UniqueValues;
2647ArrayRef<OperandData> Op0 = OpsVec.front();
2648for (const OperandData &Data : Op0)
2649 UniqueValues.insert(Data.V);
2650for (ArrayRef<OperandData>Op :
2651ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
2652if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2653return !UniqueValues.contains(Data.V);
2654 }))
2655returnfalse;
2656 }
2657// TODO: Check if we can remove a check for non-power-2 number of
2658// scalars after full support of non-power-2 vectorization.
2659return UniqueValues.size() != 2 &&
2660hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
2661 UniqueValues.size());
2662 };
2663
2664// If the initial strategy fails for any of the operand indexes, then we
2665// perform reordering again in a second pass. This helps avoid assigning
2666// high priority to the failed strategy, and should improve reordering for
2667// the non-failed operand indexes.
2668for (intPass = 0;Pass != 2; ++Pass) {
2669// Check if there is no need to reorder the operands because they are a
2670// perfect or shuffled diamond match.
2671// This is needed to avoid counting extra external-use cost for shuffled
2672// matches, which may cause regressions.
2673if (SkipReordering())
2674break;
2675// Skip the second pass if the first pass did not fail.
2676bool StrategyFailed =false;
2677// Mark all operand data as free to use.
2678 clearUsed();
2679// We keep the original operand order for the FirstLane, so reorder the
2680// rest of the lanes. We are visiting the nodes in a circular fashion,
2681// using FirstLane as the center point and increasing the radius
2682// distance.
2683SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2684for (unsignedI = 0;I < NumOperands; ++I)
2685 MainAltOps[I].push_back(getData(I, FirstLane).V);
2686
2687SmallBitVector UsedLanes(NumLanes);
2688 UsedLanes.set(FirstLane);
2689for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2690// Visit the lane on the right and then the lane on the left.
2691for (intDirection : {+1, -1}) {
2692int Lane = FirstLane +Direction * Distance;
2693if (Lane < 0 || Lane >= (int)NumLanes)
2694continue;
2695 UsedLanes.set(Lane);
2696int LastLane = Lane -Direction;
2697assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2698"Out of bounds");
2699// Look for a good match for each operand.
2700for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2701// Search for the operand that matches SortedOps[OpIdx][Lane-1].
2702 std::optional<unsigned> BestIdx =
2703 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2704 MainAltOps[OpIdx], UsedLanes);
2705// By not selecting a value, we allow the operands that follow to
2706// select a better matching value. We will get a non-null value in
2707// the next run of getBestOperand().
2708if (BestIdx) {
2709// Swap the current operand with the one returned by
2710// getBestOperand().
2711 swap(OpIdx, *BestIdx, Lane);
2712 }else {
2713// Enable the second pass.
2714 StrategyFailed =true;
2715 }
2716// Try to get the alternate opcode and follow it during analysis.
2717if (MainAltOps[OpIdx].size() != 2) {
2718 OperandData &AltOp = getData(OpIdx, Lane);
2719 InstructionsState OpS =
2720getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2721if (OpS && OpS.isAltShuffle())
2722 MainAltOps[OpIdx].push_back(AltOp.V);
2723 }
2724 }
2725 }
2726 }
2727// Skip second pass if the strategy did not fail.
2728if (!StrategyFailed)
2729break;
2730 }
2731 }
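// Illustrative note (not from the original source): with NumLanes == 4 and
// FirstLane == 1 (as in the example above), the Distance/Direction loops
// visit the remaining lanes in the order 2, 0, 3, matching the circular
// traversal described at the top of this function.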
2732
2733#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2734LLVM_DUMP_METHODstaticStringRefgetModeStr(ReorderingMode RMode) {
2735switch (RMode) {
2736case ReorderingMode::Load:
2737return"Load";
2738case ReorderingMode::Opcode:
2739return"Opcode";
2740case ReorderingMode::Constant:
2741return"Constant";
2742case ReorderingMode::Splat:
2743return"Splat";
2744case ReorderingMode::Failed:
2745return"Failed";
2746 }
2747llvm_unreachable("Unimplemented Reordering Type");
2748 }
2749
2750LLVM_DUMP_METHODstaticraw_ostream &printMode(ReorderingMode RMode,
2751raw_ostream &OS) {
2752returnOS <<getModeStr(RMode);
2753 }
2754
2755 /// Debug print.
2756LLVM_DUMP_METHODstaticvoiddumpMode(ReorderingMode RMode) {
2757printMode(RMode,dbgs());
2758 }
2759
2760friendraw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2761returnprintMode(RMode,OS);
2762 }
2763
2764LLVM_DUMP_METHODraw_ostream &print(raw_ostream &OS) const{
2765constunsigned Indent = 2;
2766unsigned Cnt = 0;
2767for (constOperandDataVec &OpDataVec : OpsVec) {
2768OS <<"Operand " << Cnt++ <<"\n";
2769for (const OperandData &OpData : OpDataVec) {
2770OS.indent(Indent) <<"{";
2771if (Value *V = OpData.V)
2772OS << *V;
2773else
2774OS <<"null";
2775OS <<", APO:" << OpData.APO <<"}\n";
2776 }
2777OS <<"\n";
2778 }
2779returnOS;
2780 }
2781
2782 /// Debug print.
2783LLVM_DUMP_METHODvoiddump() const{print(dbgs()); }
2784#endif
2785 };
2786
2787 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2788 /// of the pair with the highest score, deemed to have the best chance to form
2789 /// the root of a profitable tree to vectorize. Return std::nullopt if no
2790 /// candidate scored above LookAheadHeuristics::ScoreFail.
2791 /// \param Limit Lower limit of the cost, considered to be a good enough score.
2792 std::optional<int>
2793findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2794int Limit =LookAheadHeuristics::ScoreFail) const{
2795LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this,/*NumLanes=*/2,
2796RootLookAheadMaxDepth);
2797int BestScore = Limit;
2798 std::optional<int> Index;
2799for (intI : seq<int>(0, Candidates.size())) {
2800int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2801 Candidates[I].second,
2802/*U1=*/nullptr,/*U2=*/nullptr,
2803/*CurrLevel=*/1, {});
2804if (Score > BestScore) {
2805 BestScore = Score;
2806 Index =I;
2807 }
2808 }
2809return Index;
2810 }
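// Usage sketch (illustrative, not from the original source): a caller that
// has several candidate scalar pairs which could seed a tree collects them
// into Candidates and uses the returned index, if any, to pick the pair
// whose look-ahead score exceeded the given Limit.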
2811
2812 /// Checks if the instruction is marked for deletion.
2813boolisDeleted(Instruction *I) const{return DeletedInstructions.count(I); }
2814
2815 /// Removes an instruction from its block and eventually deletes it.
2816 /// It's like Instruction::eraseFromParent() except that the actual deletion
2817 /// is delayed until BoUpSLP is destructed.
2818voideraseInstruction(Instruction *I) {
2819 DeletedInstructions.insert(I);
2820 }
2821
2822 /// Remove instructions from the parent function and clear the operands of \p
2823 /// DeadVals instructions, marking trivially dead operands for deletion.
  template <typename T>
  void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
    SmallVector<WeakTrackingVH> DeadInsts;
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    }
    DenseSet<Value *> Processed;
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
        continue;
      auto *I = cast<Instruction>(V);
      salvageDebugInfo(*I);
      SmallVector<const TreeEntry *> Entries;
      if (const TreeEntry *Entry = getTreeEntry(I)) {
        Entries.push_back(Entry);
        auto It = MultiNodeScalars.find(I);
        if (It != MultiNodeScalars.end())
          Entries.append(It->second.begin(), It->second.end());
      }
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            wouldInstructionBeTriviallyDead(OpI, TLI) &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
             })))
          DeadInsts.push_back(OpI);
      }
      I->dropAllReferences();
    }
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
        continue;
      assert((I->use_empty() || all_of(I->uses(),
                                       [&](Use &U) {
                                         return isDeleted(
                                             cast<Instruction>(U.getUser()));
                                       })) &&
             "trying to erase instruction with users.");
      I->removeFromParent();
      SE->forgetValue(I);
    }
    // Process the dead instruction list until empty.
    while (!DeadInsts.empty()) {
      Value *V = DeadInsts.pop_back_val();
      Instruction *VI = cast_or_null<Instruction>(V);
      if (!VI || !VI->getParent())
        continue;
      assert(isInstructionTriviallyDead(VI, TLI) &&
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      // Don't lose the debug info while deleting the instructions.
      salvageDebugInfo(*VI);

      // Null out all of the instruction's operands to see if any operand
      // becomes dead as we go.
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!OpV)
          continue;
        OpU.set(nullptr);

        if (!OpV->use_empty())
          continue;

        // If the operand is an instruction that became dead as we nulled out
        // the operand, and if it is 'trivially' dead, delete it in a future
        // loop iteration.
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
              isInstructionTriviallyDead(OpI, TLI))
            DeadInsts.push_back(OpI);
      }

      VI->removeFromParent();
      DeletedInstructions.insert(VI);
      SE->forgetValue(VI);
    }
  }
2906
2907 /// Checks if the instruction was already analyzed for being possible
2908 /// reduction root.
2909boolisAnalyzedReductionRoot(Instruction *I) const{
2910return AnalyzedReductionsRoots.count(I);
2911 }
2912 /// Register given instruction as already analyzed for being possible
2913 /// reduction root.
2914voidanalyzedReductionRoot(Instruction *I) {
2915 AnalyzedReductionsRoots.insert(I);
2916 }
2917 /// Checks if the provided list of reduced values was checked already for
2918 /// vectorization.
2919boolareAnalyzedReductionVals(ArrayRef<Value *> VL) const{
2920return AnalyzedReductionVals.contains(hash_value(VL));
2921 }
2922 /// Adds the list of reduced values to list of already checked values for the
2923 /// vectorization.
2924voidanalyzedReductionVals(ArrayRef<Value *> VL) {
2925 AnalyzedReductionVals.insert(hash_value(VL));
2926 }
2927 /// Clear the list of the analyzed reduction root instructions.
2928voidclearReductionData() {
2929 AnalyzedReductionsRoots.clear();
2930 AnalyzedReductionVals.clear();
2931 AnalyzedMinBWVals.clear();
2932 }
2933 /// Checks if the given value is gathered in one of the nodes.
2934boolisAnyGathered(constSmallDenseSet<Value *> &Vals) const{
2935returnany_of(MustGather, [&](Value *V) {return Vals.contains(V); });
2936 }
2937 /// Checks if the given value is gathered in one of the nodes.
2938boolisGathered(constValue *V) const{
2939return MustGather.contains(V);
2940 }
  /// Checks if the specified value was not scheduled.
  bool isNotScheduled(const Value *V) const {
    return NonScheduledFirst.contains(V);
  }
2945
2946 /// Check if the value is vectorized in the tree.
2947boolisVectorized(Value *V) const{return getTreeEntry(V); }
2948
2949~BoUpSLP();
2950
2951private:
  /// Determine if a node \p E can be demoted to a smaller type with a
  /// truncation. We collect the entries that will be demoted in ToDemote.
  /// \param E Node for analysis.
  /// \param ToDemote indices of the nodes to be demoted.
2956bool collectValuesToDemote(
2957const TreeEntry &E,bool IsProfitableToDemoteRoot,unsigned &BitWidth,
2958SmallVectorImpl<unsigned> &ToDemote,DenseSet<const TreeEntry *> &Visited,
2959constSmallDenseSet<unsigned, 8> &NodesToKeepBWs,unsigned &MaxDepthLevel,
2960bool &IsProfitableToDemote,bool IsTruncRoot)const;
2961
  /// Check if the operands on the edges \p Edges of the \p UserTE allow
  /// reordering (i.e. the operands can be reordered because they have only one
  /// user and are reorderable).
  /// \param ReorderableGathers List of all gather nodes that require reordering
  /// (e.g., gathers of extractelements or partially vectorizable loads).
  /// \param GatherOps List of gather operand nodes for \p UserTE that require
  /// reordering, subset of \p NonVectorized.
2969bool
2970 canReorderOperands(TreeEntry *UserTE,
2971SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2972ArrayRef<TreeEntry *> ReorderableGathers,
2973SmallVectorImpl<TreeEntry *> &GatherOps);
2974
2975 /// Checks if the given \p TE is a gather node with clustered reused scalars
2976 /// and reorders it per given \p Mask.
2977void reorderNodeWithReuses(TreeEntry &TE,ArrayRef<int> Mask)const;
2978
2979 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2980 /// if any. If it is not vectorized (gather node), returns nullptr.
2981 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,unsigned OpIdx) {
2982ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2983 TreeEntry *TE =nullptr;
2984constauto *It =find_if(VL, [&](Value *V) {
2985 TE = getTreeEntry(V);
2986if (TE &&is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2987returntrue;
2988auto It = MultiNodeScalars.find(V);
2989if (It != MultiNodeScalars.end()) {
2990for (TreeEntry *E : It->second) {
2991if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2992 TE = E;
2993returntrue;
2994 }
2995 }
2996 }
2997returnfalse;
2998 });
2999if (It != VL.end()) {
3000assert(TE->isSame(VL) &&"Expected same scalars.");
3001returnTE;
3002 }
3003returnnullptr;
3004 }
3005
3006 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
3007 /// if any. If it is not vectorized (gather node), returns nullptr.
3008const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
3009unsigned OpIdx) const{
3010returnconst_cast<BoUpSLP *>(this)->getVectorizedOperand(
3011const_cast<TreeEntry *>(UserTE), OpIdx);
3012 }
3013
3014 /// Checks if all users of \p I are the part of the vectorization tree.
3015bool areAllUsersVectorized(
3016Instruction *I,
3017constSmallDenseSet<Value *> *VectorizedVals =nullptr)const;
3018
3019 /// Return information about the vector formed for the specified index
3020 /// of a vector of (the same) instruction.
3021TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3022
  /// \returns the graph entry for the \p Idx operand of the \p E entry.
3024const TreeEntry *getOperandEntry(const TreeEntry *E,unsignedIdx)const;
3025
3026 /// Gets the root instruction for the given node. If the node is a strided
3027 /// load/store node with the reverse order, the root instruction is the last
3028 /// one.
3029Instruction *getRootEntryInstruction(const TreeEntry &Entry)const;
3030
3031 /// \returns Cast context for the given graph node.
3032TargetTransformInfo::CastContextHint
3033 getCastContextHint(const TreeEntry &TE)const;
3034
3035 /// \returns the cost of the vectorizable entry.
3036InstructionCost getEntryCost(const TreeEntry *E,
3037ArrayRef<Value *> VectorizedVals,
3038SmallPtrSetImpl<Value *> &CheckedExtracts);
3039
3040 /// This is the recursive part of buildTree.
3041void buildTree_rec(ArrayRef<Value *> Roots,unsignedDepth,
3042const EdgeInfo &EI,unsigned InterleaveFactor = 0);
3043
3044 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3045 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3046 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3047 /// returns false, setting \p CurrentOrder to either an empty vector or a
3048 /// non-identity permutation that allows to reuse extract instructions.
3049 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3050 /// extract order.
3051bool canReuseExtract(ArrayRef<Value *> VL,
3052SmallVectorImpl<unsigned> &CurrentOrder,
3053bool ResizeAllowed =false)const;
3054
3055 /// Vectorize a single entry in the tree.
  /// \param PostponedPHIs true if the emission of phi nodes must be postponed
  /// to avoid issues with def-use order.
3058Value *vectorizeTree(TreeEntry *E,bool PostponedPHIs);
3059
3060 /// Returns vectorized operand node, that matches the order of the scalars
3061 /// operand number \p NodeIdx in entry \p E.
3062 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,unsigned NodeIdx);
3063const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
3064unsigned NodeIdx) const{
3065returnconst_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
3066 }
3067
3068 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3069 /// \p E.
  /// \param PostponedPHIs true if the emission of phi nodes must be postponed
  /// to avoid issues with def-use order.
3072Value *vectorizeOperand(TreeEntry *E,unsigned NodeIdx,bool PostponedPHIs);
3073
3074 /// Create a new vector from a list of scalar values. Produces a sequence
3075 /// which exploits values reused across lanes, and arranges the inserts
3076 /// for ease of later optimization.
3077template <typename BVTy,typename ResTy,typename...Args>
3078 ResTy processBuildVector(const TreeEntry *E,Type *ScalarTy, Args &...Params);
3079
3080 /// Create a new vector from a list of scalar values. Produces a sequence
3081 /// which exploits values reused across lanes, and arranges the inserts
3082 /// for ease of later optimization.
3083Value *createBuildVector(const TreeEntry *E,Type *ScalarTy,
3084bool PostponedPHIs);
3085
3086 /// Returns the instruction in the bundle, which can be used as a base point
3087 /// for scheduling. Usually it is the last instruction in the bundle, except
3088 /// for the case when all operands are external (in this case, it is the first
3089 /// instruction in the list).
3090Instruction &getLastInstructionInBundle(const TreeEntry *E);
3091
3092 /// Tries to find extractelement instructions with constant indices from fixed
3093 /// vector type and gather such instructions into a bunch, which highly likely
3094 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3095 /// was successful, the matched scalars are replaced by poison values in \p VL
3096 /// for future analysis.
3097 std::optional<TargetTransformInfo::ShuffleKind>
3098 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3099SmallVectorImpl<int> &Mask)const;
3100
3101 /// Tries to find extractelement instructions with constant indices from fixed
3102 /// vector type and gather such instructions into a bunch, which highly likely
3103 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3104 /// was successful, the matched scalars are replaced by poison values in \p VL
3105 /// for future analysis.
3106SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3107 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3108SmallVectorImpl<int> &Mask,
3109unsigned NumParts)const;
3110
3111 /// Checks if the gathered \p VL can be represented as a single register
3112 /// shuffle(s) of previous tree entries.
3113 /// \param TE Tree entry checked for permutation.
3114 /// \param VL List of scalars (a subset of the TE scalar), checked for
3115 /// permutations. Must form single-register vector.
3116 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3117 /// commands to build the mask using the original vector value, without
3118 /// relying on the potential reordering.
3119 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3120 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3121 std::optional<TargetTransformInfo::ShuffleKind>
3122 isGatherShuffledSingleRegisterEntry(
3123const TreeEntry *TE,ArrayRef<Value *> VL,MutableArrayRef<int> Mask,
3124SmallVectorImpl<const TreeEntry *> &Entries,unsigned Part,
3125bool ForOrder);
3126
3127 /// Checks if the gathered \p VL can be represented as multi-register
3128 /// shuffle(s) of previous tree entries.
3129 /// \param TE Tree entry checked for permutation.
3130 /// \param VL List of scalars (a subset of the TE scalar), checked for
3131 /// permutations.
3132 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3133 /// commands to build the mask using the original vector value, without
3134 /// relying on the potential reordering.
3135 /// \returns per-register series of ShuffleKind, if gathered values can be
3136 /// represented as shuffles of previous tree entries. \p Mask is filled with
3137 /// the shuffle mask (also on per-register base).
3138SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3139 isGatherShuffledEntry(
3140const TreeEntry *TE,ArrayRef<Value *> VL,SmallVectorImpl<int> &Mask,
3141SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3142unsigned NumParts,bool ForOrder =false);
3143
3144 /// \returns the cost of gathering (inserting) the values in \p VL into a
3145 /// vector.
3146 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3147InstructionCost getGatherCost(ArrayRef<Value *> VL,bool ForPoisonSrc,
3148Type *ScalarTy)const;
3149
3150 /// Set the Builder insert point to one after the last instruction in
3151 /// the bundle
3152void setInsertPointAfterBundle(const TreeEntry *E);
3153
3154 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3155 /// specified, the starting vector value is poison.
3156Value *
3157 gather(ArrayRef<Value *> VL,Value *Root,Type *ScalarTy,
3158function_ref<Value *(Value *,Value *,ArrayRef<int>)> CreateShuffle);
3159
3160 /// \returns whether the VectorizableTree is fully vectorizable and will
3161 /// be beneficial even the tree height is tiny.
3162bool isFullyVectorizableTinyTree(bool ForReduction)const;
3163
  /// Run through the list of all gathered loads in the graph and try to find
  /// vector loads/masked gathers instead of regular gathers. Later these loads
  /// are reshuffled to build final gathered nodes.
3167void tryToVectorizeGatheredLoads(
3168constSmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
3169SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
3170 8> &GatheredLoads);
3171
3172 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3173 /// users of \p TE and collects the stores. It returns the map from the store
3174 /// pointers to the collected stores.
3175SmallVector<SmallVector<StoreInst *>>
3176 collectUserStores(const BoUpSLP::TreeEntry *TE)const;
3177
3178 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3179 /// stores in \p StoresVec can form a vector instruction. If so it returns
3180 /// true and populates \p ReorderIndices with the shuffle indices of the
3181 /// stores when compared to the sorted vector.
3182bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3183OrdersType &ReorderIndices)const;
3184
3185 /// Iterates through the users of \p TE, looking for scalar stores that can be
3186 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3187 /// their order and builds an order index vector for each store bundle. It
3188 /// returns all these order vectors found.
3189 /// We run this after the tree has formed, otherwise we may come across user
3190 /// instructions that are not yet in the tree.
3191SmallVector<OrdersType, 1>
3192 findExternalStoreUsersReorderIndices(TreeEntry *TE)const;
3193
3194 /// Tries to reorder the gathering node for better vectorization
3195 /// opportunities.
3196void reorderGatherNode(TreeEntry &TE);
3197
3198structTreeEntry {
3199usingVecTreeTy =SmallVector<std::unique_ptr<TreeEntry>, 8>;
3200 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3201
3202 /// \returns Common mask for reorder indices and reused scalars.
3203SmallVector<int> getCommonMask() const{
3204SmallVector<int>Mask;
3205inversePermutation(ReorderIndices, Mask);
3206::addMask(Mask, ReuseShuffleIndices);
3207returnMask;
3208 }
3209
3210 /// \returns true if the scalars in VL are equal to this entry.
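    /// For example, with Scalars = {A, B} and ReuseShuffleIndices =
    /// {0, 1, 0, 1}, the list VL = {A, B, A, B} is considered the same as this
    /// entry.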
3211bool isSame(ArrayRef<Value *> VL) const{
3212auto &&IsSame = [VL](ArrayRef<Value *> Scalars,ArrayRef<int>Mask) {
3213if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3214return std::equal(VL.begin(), VL.end(), Scalars.begin());
3215return VL.size() ==Mask.size() &&
3216 std::equal(VL.begin(), VL.end(),Mask.begin(),
3217 [Scalars](Value *V,int Idx) {
3218 return (isa<UndefValue>(V) &&
3219 Idx == PoisonMaskElem) ||
3220 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3221 });
3222 };
3223if (!ReorderIndices.empty()) {
3224// TODO: implement matching if the nodes are just reordered, still can
3225// treat the vector as the same if the list of scalars matches VL
3226// directly, without reordering.
3227SmallVector<int>Mask;
3228inversePermutation(ReorderIndices, Mask);
3229if (VL.size() == Scalars.size())
3230return IsSame(Scalars, Mask);
3231if (VL.size() == ReuseShuffleIndices.size()) {
3232::addMask(Mask, ReuseShuffleIndices);
3233return IsSame(Scalars, Mask);
3234 }
3235returnfalse;
3236 }
3237return IsSame(Scalars, ReuseShuffleIndices);
3238 }
3239
3240bool isOperandGatherNode(const EdgeInfo &UserEI) const{
3241returnisGather() && !UserTreeIndices.empty() &&
3242 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3243 UserTreeIndices.front().UserTE == UserEI.UserTE;
3244 }
3245
3246 /// \returns true if current entry has same operands as \p TE.
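    /// Operands are matched greedily and order-insensitively: every operand of
    /// \p TE must be equal to some distinct operand of this entry.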
3247bool hasEqualOperands(const TreeEntry &TE) const{
3248if (TE.getNumOperands() != getNumOperands())
3249returnfalse;
3250SmallBitVectorUsed(getNumOperands());
3251for (unsignedI = 0, E = getNumOperands();I < E; ++I) {
3252unsigned PrevCount =Used.count();
3253for (unsigned K = 0;K < E; ++K) {
3254if (Used.test(K))
3255continue;
3256if (getOperand(K) ==TE.getOperand(I)) {
3257Used.set(K);
3258break;
3259 }
3260 }
3261// Check if we actually found the matching operand.
3262if (PrevCount ==Used.count())
3263returnfalse;
3264 }
3265returntrue;
3266 }
3267
3268 /// \return Final vectorization factor for the node. Defined by the total
3269 /// number of vectorized scalars, including those, used several times in the
3270 /// entry and counted in the \a ReuseShuffleIndices, if any.
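    /// E.g., an entry with two unique scalars and ReuseShuffleIndices =
    /// {0, 1, 0, 1} has a vector factor of 4.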
3271unsigned getVectorFactor() const{
3272if (!ReuseShuffleIndices.empty())
3273return ReuseShuffleIndices.size();
3274return Scalars.size();
3275 };
3276
3277 /// Checks if the current node is a gather node.
3278boolisGather() const{return State == NeedToGather; }
3279
3280 /// A vector of scalars.
3281ValueList Scalars;
3282
3283 /// The Scalars are vectorized into this value. It is initialized to Null.
3284WeakTrackingVH VectorizedValue =nullptr;
3285
3286 /// New vector phi instructions emitted for the vectorized phi nodes.
3287PHINode *PHI =nullptr;
3288
3289 /// Do we need to gather this sequence or vectorize it
3290 /// (either with vector instruction or with scatter/gather
3291 /// intrinsics for store/load)?
3292enum EntryState {
3293 Vectorize,///< The node is regularly vectorized.
3294 ScatterVectorize,///< Masked scatter/gather node.
3295 StridedVectorize,///< Strided loads (and stores)
3296 NeedToGather,///< Gather/buildvector node.
3297 CombinedVectorize,///< Vectorized node, combined with its user into more
3298 ///< complex node like select/cmp to minmax, mul/add to
3299 ///< fma, etc. Must be used for the following nodes in
3300 ///< the pattern, not the very first one.
3301 };
3302 EntryState State;
3303
3304 /// List of combined opcodes supported by the vectorizer.
3305enum CombinedOpcode {
3306 NotCombinedOp = -1,
3307MinMax = Instruction::OtherOpsEnd + 1,
3308 };
3309 CombinedOpcode CombinedOp = NotCombinedOp;
3310
3311 /// Does this sequence require some shuffling?
3312SmallVector<int, 4> ReuseShuffleIndices;
3313
3314 /// Does this entry require reordering?
3315SmallVector<unsigned, 4> ReorderIndices;
3316
3317 /// Points back to the VectorizableTree.
3318 ///
3319 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3320 /// to be a pointer and needs to be able to initialize the child iterator.
3321 /// Thus we need a reference back to the container to translate the indices
3322 /// to entries.
3323 VecTreeTy &Container;
3324
3325 /// The TreeEntry index containing the user of this entry. We can actually
3326 /// have multiple users so the data structure is not truly a tree.
3327SmallVector<EdgeInfo, 1> UserTreeIndices;
3328
3329 /// The index of this treeEntry in VectorizableTree.
3330unsignedIdx = 0;
3331
3332 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3333 /// other nodes as a series of insertvector instructions.
3334SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3335
3336private:
3337 /// The operands of each instruction in each lane Operands[op_index][lane].
3338 /// Note: This helps avoid the replication of the code that performs the
3339 /// reordering of operands during buildTree_rec() and vectorizeTree().
3340SmallVector<ValueList, 2>Operands;
3341
3342 /// MainOp and AltOp are recorded inside. S should be obtained from
3343 /// newTreeEntry.
3344 InstructionsState S = InstructionsState::invalid();
3345
3346 /// Interleaving factor for interleaved loads Vectorize nodes.
3347unsigned InterleaveFactor = 0;
3348
3349public:
3350 /// Returns interleave factor for interleave nodes.
3351unsigned getInterleaveFactor() const{return InterleaveFactor; }
3352 /// Sets interleaving factor for the interleaving nodes.
3353void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3354
3355 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3356void setOperand(unsigned OpIdx,ArrayRef<Value *> OpVL) {
3357if (Operands.size() < OpIdx + 1)
3358Operands.resize(OpIdx + 1);
3359assert(Operands[OpIdx].empty() &&"Already resized?");
3360assert(OpVL.size() <= Scalars.size() &&
3361"Number of operands is greater than the number of scalars.");
3362Operands[OpIdx].resize(OpVL.size());
3363copy(OpVL, Operands[OpIdx].begin());
3364 }
3365
3366 /// Set this bundle's operand from Scalars.
3367void setOperand(constBoUpSLP &R,bool RequireReorder =false) {
3368 VLOperands Ops(Scalars, S, R);
3369if (RequireReorder)
3370 Ops.reorder();
3371for (unsignedI : seq<unsigned>(S.getMainOp()->getNumOperands()))
3372 setOperand(I, Ops.getVL(I));
3373 }
3374
3375 /// Reorders operands of the node to the given mask \p Mask.
3376void reorderOperands(ArrayRef<int> Mask) {
3377for (ValueList &Operand : Operands)
3378reorderScalars(Operand, Mask);
3379 }
3380
3381 /// \returns the \p OpIdx operand of this TreeEntry.
3382ValueList &getOperand(unsigned OpIdx) {
3383assert(OpIdx <Operands.size() &&"Off bounds");
3384returnOperands[OpIdx];
3385 }
3386
3387 /// \returns the \p OpIdx operand of this TreeEntry.
3388ArrayRef<Value *> getOperand(unsigned OpIdx) const{
3389assert(OpIdx <Operands.size() &&"Off bounds");
3390returnOperands[OpIdx];
3391 }
3392
3393 /// \returns the number of operands.
3394unsigned getNumOperands() const{returnOperands.size(); }
3395
3396 /// \return the single \p OpIdx operand.
3397Value *getSingleOperand(unsigned OpIdx) const{
3398assert(OpIdx <Operands.size() &&"Off bounds");
3399assert(!Operands[OpIdx].empty() &&"No operand available");
3400returnOperands[OpIdx][0];
3401 }
3402
3403 /// Some of the instructions in the list have alternate opcodes.
3404bool isAltShuffle() const{return S.isAltShuffle(); }
3405
3406bool isOpcodeOrAlt(Instruction *I) const{return S.isOpcodeOrAlt(I); }
3407
    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as the main operation of this entry, the key is
    /// \p Op. Otherwise the key is the main operation.
3411Value *isOneOf(Value *Op) const{
3412auto *I = dyn_cast<Instruction>(Op);
3413if (I && isOpcodeOrAlt(I))
3414returnOp;
3415return S.getMainOp();
3416 }
3417
3418void setOperations(const InstructionsState &S) {
3419assert(S &&"InstructionsState is invalid.");
3420 this->S = S;
3421 }
3422
3423Instruction *getMainOp() const{return S.getMainOp(); }
3424
3425Instruction *getAltOp() const{return S.getAltOp(); }
3426
3427 /// The main/alternate opcodes for the list of instructions.
3428unsigned getOpcode() const{return S.getOpcode(); }
3429
3430unsigned getAltOpcode() const{return S.getAltOpcode(); }
3431
3432bool hasState() const{return S.valid(); }
3433
3434 /// When ReuseReorderShuffleIndices is empty it just returns position of \p
3435 /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
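    /// E.g., with Scalars = {A, B} and ReuseShuffleIndices = {1, 0, 1, 0}, the
    /// lane for B is remapped from position 1 in Scalars to reuse lane 0 (the
    /// first reuse slot that refers to it).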
3436int findLaneForValue(Value *V) const{
3437unsigned FoundLane = getVectorFactor();
3438for (auto *It =find(Scalars, V), *End = Scalars.end(); It !=End;
3439 std::advance(It, 1)) {
3440if (*It != V)
3441continue;
3442 FoundLane = std::distance(Scalars.begin(), It);
3443assert(FoundLane < Scalars.size() &&"Couldn't find extract lane");
3444if (!ReorderIndices.empty())
3445 FoundLane = ReorderIndices[FoundLane];
3446assert(FoundLane < Scalars.size() &&"Couldn't find extract lane");
3447if (ReuseShuffleIndices.empty())
3448break;
3449if (auto *RIt =find(ReuseShuffleIndices, FoundLane);
3450 RIt != ReuseShuffleIndices.end()) {
3451 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3452break;
3453 }
3454 }
3455assert(FoundLane < getVectorFactor() &&"Unable to find given value.");
3456return FoundLane;
3457 }
3458
3459 /// Build a shuffle mask for graph entry which represents a merge of main
3460 /// and alternate operations.
3461void
3462 buildAltOpShuffleMask(constfunction_ref<bool(Instruction *)> IsAltOp,
3463SmallVectorImpl<int> &Mask,
3464SmallVectorImpl<Value *> *OpScalars =nullptr,
3465SmallVectorImpl<Value *> *AltScalars =nullptr)const;
3466
3467 /// Return true if this is a non-power-of-2 node.
3468bool isNonPowOf2Vec() const{
3469bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3470return IsNonPowerOf2;
3471 }
3472
3473 /// Return true if this is a node, which tries to vectorize number of
3474 /// elements, forming whole vectors.
3475bool
3476 hasNonWholeRegisterOrNonPowerOf2Vec(constTargetTransformInfo &TTI) const{
3477bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3478TTI,getValueType(Scalars.front()), Scalars.size());
3479assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3480"Reshuffling not supported with non-power-of-2 vectors yet.");
3481return IsNonPowerOf2;
3482 }
3483
3484Value *getOrdered(unsigned Idx) const{
3485assert(isGather() &&"Must be used only for buildvectors/gathers.");
3486if (ReorderIndices.empty())
3487return Scalars[Idx];
3488SmallVector<int>Mask;
3489inversePermutation(ReorderIndices, Mask);
3490return Scalars[Mask[Idx]];
3491 }
3492
3493#ifndef NDEBUG
3494 /// Debug printer.
3495LLVM_DUMP_METHODvoiddump() const{
3496dbgs() <<Idx <<".\n";
3497for (unsigned OpI = 0, OpE =Operands.size(); OpI != OpE; ++OpI) {
3498dbgs() <<"Operand " << OpI <<":\n";
3499for (constValue *V : Operands[OpI])
3500dbgs().indent(2) << *V <<"\n";
3501 }
3502dbgs() <<"Scalars: \n";
3503for (Value *V : Scalars)
3504dbgs().indent(2) << *V <<"\n";
3505dbgs() <<"State: ";
3506switch (State) {
3507case Vectorize:
3508if (InterleaveFactor > 0) {
3509dbgs() <<"Vectorize with interleave factor " << InterleaveFactor
3510 <<"\n";
3511 }else {
3512dbgs() <<"Vectorize\n";
3513 }
3514break;
3515case ScatterVectorize:
3516dbgs() <<"ScatterVectorize\n";
3517break;
3518case StridedVectorize:
3519dbgs() <<"StridedVectorize\n";
3520break;
3521case NeedToGather:
3522dbgs() <<"NeedToGather\n";
3523break;
3524case CombinedVectorize:
3525dbgs() <<"CombinedVectorize\n";
3526break;
3527 }
3528if (S) {
3529dbgs() <<"MainOp: " << *S.getMainOp() <<"\n";
3530dbgs() <<"AltOp: " << *S.getAltOp() <<"\n";
3531 }else {
3532dbgs() <<"MainOp: NULL\n";
3533dbgs() <<"AltOp: NULL\n";
3534 }
3535dbgs() <<"VectorizedValue: ";
3536if (VectorizedValue)
3537dbgs() << *VectorizedValue <<"\n";
3538else
3539dbgs() <<"NULL\n";
3540dbgs() <<"ReuseShuffleIndices: ";
3541if (ReuseShuffleIndices.empty())
3542dbgs() <<"Empty";
3543else
3544for (int ReuseIdx : ReuseShuffleIndices)
3545dbgs() << ReuseIdx <<", ";
3546dbgs() <<"\n";
3547dbgs() <<"ReorderIndices: ";
3548for (unsigned ReorderIdx : ReorderIndices)
3549dbgs() << ReorderIdx <<", ";
3550dbgs() <<"\n";
3551dbgs() <<"UserTreeIndices: ";
3552for (constauto &EInfo : UserTreeIndices)
3553dbgs() << EInfo <<", ";
3554dbgs() <<"\n";
3555if (!CombinedEntriesWithIndices.empty()) {
3556dbgs() <<"Combined entries: ";
3557interleaveComma(CombinedEntriesWithIndices,dbgs(), [&](constauto &P) {
3558dbgs() <<"Entry index " <<P.first <<" with offset " <<P.second;
3559 });
3560dbgs() <<"\n";
3561 }
3562 }
3563#endif
3564 };
3565
3566#ifndef NDEBUG
3567void dumpTreeCosts(const TreeEntry *E,InstructionCost ReuseShuffleCost,
3568InstructionCost VecCost,InstructionCost ScalarCost,
3569StringRef Banner) const{
3570dbgs() <<"SLP: " << Banner <<":\n";
3571 E->dump();
3572dbgs() <<"SLP: Costs:\n";
3573dbgs() <<"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<"\n";
3574dbgs() <<"SLP: VectorCost = " << VecCost <<"\n";
3575dbgs() <<"SLP: ScalarCost = " << ScalarCost <<"\n";
3576dbgs() <<"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3577 << ReuseShuffleCost + VecCost - ScalarCost <<"\n";
3578 }
3579#endif
3580
3581 /// Create a new VectorizableTree entry.
3582 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3583 std::optional<ScheduleData *> Bundle,
3584const InstructionsState &S,
3585const EdgeInfo &UserTreeIdx,
3586ArrayRef<int> ReuseShuffleIndices = {},
3587ArrayRef<unsigned> ReorderIndices = {},
3588unsigned InterleaveFactor = 0) {
3589 TreeEntry::EntryState EntryState =
3590 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3591 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3592 ReuseShuffleIndices, ReorderIndices);
3593if (E && InterleaveFactor > 0)
3594 E->setInterleave(InterleaveFactor);
3595return E;
3596 }
3597
3598 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3599 TreeEntry::EntryState EntryState,
3600 std::optional<ScheduleData *> Bundle,
3601const InstructionsState &S,
3602const EdgeInfo &UserTreeIdx,
3603ArrayRef<int> ReuseShuffleIndices = {},
3604ArrayRef<unsigned> ReorderIndices = {}) {
3605assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3606 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3607"Need to vectorize gather entry?");
3608// Gathered loads still gathered? Do not create entry, use the original one.
3609if (GatheredLoadsEntriesFirst.has_value() &&
3610 EntryState == TreeEntry::NeedToGather && S &&
3611 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3612 !UserTreeIdx.UserTE)
3613returnnullptr;
3614 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3615 TreeEntry *Last = VectorizableTree.back().get();
3616Last->Idx = VectorizableTree.size() - 1;
3617Last->State = EntryState;
3618// FIXME: Remove once support for ReuseShuffleIndices has been implemented
3619// for non-power-of-two vectors.
3620assert(
3621 (hasFullVectorsOrPowerOf2(*TTI,getValueType(VL.front()), VL.size()) ||
3622 ReuseShuffleIndices.empty()) &&
3623"Reshuffling scalars not yet supported for nodes with padding");
3624Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3625 ReuseShuffleIndices.end());
3626if (ReorderIndices.empty()) {
3627Last->Scalars.assign(VL.begin(), VL.end());
3628if (S)
3629Last->setOperations(S);
3630 }else {
3631// Reorder scalars and build final mask.
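      // E.g., VL = {A, B, C, D} with ReorderIndices = {1, 0, 3, 2} produces
      // Scalars = {B, A, D, C}.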
3632Last->Scalars.assign(VL.size(),nullptr);
3633transform(ReorderIndices,Last->Scalars.begin(),
3634 [VL](unsignedIdx) ->Value * {
3635 if (Idx >= VL.size())
3636 return UndefValue::get(VL.front()->getType());
3637 return VL[Idx];
3638 });
3639 InstructionsState S =getSameOpcode(Last->Scalars, *TLI);
3640if (S)
3641Last->setOperations(S);
3642Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3643 }
3644if (!Last->isGather()) {
3645for (Value *V : VL) {
3646if (isa<PoisonValue>(V))
3647continue;
3648const TreeEntry *TE = getTreeEntry(V);
3649assert((!TE || TE ==Last ||doesNotNeedToBeScheduled(V)) &&
3650"Scalar already in tree!");
3651if (TE) {
3652if (TE !=Last)
3653 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3654continue;
3655 }
3656 ScalarToTreeEntry[V] =Last;
3657 }
3658// Update the scheduler bundle to point to this TreeEntry.
3659 ScheduleData *BundleMember = *Bundle;
3660assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3661isVectorLikeInstWithConstOps(S.getMainOp()) ||
3662doesNotNeedToSchedule(VL)) &&
3663"Bundle and VL out of sync");
3664if (BundleMember) {
3665for (Value *V : VL) {
3666if (doesNotNeedToBeScheduled(V))
3667continue;
3668if (!BundleMember)
3669continue;
3670 BundleMember->TE =Last;
3671 BundleMember = BundleMember->NextInBundle;
3672 }
3673 }
3674assert(!BundleMember &&"Bundle and VL out of sync");
3675 }else {
3676// Build a map for gathered scalars to the nodes where they are used.
3677bool AllConstsOrCasts =true;
3678for (Value *V : VL)
3679if (!isConstant(V)) {
3680auto *I = dyn_cast<CastInst>(V);
3681 AllConstsOrCasts &=I &&I->getType()->isIntegerTy();
3682if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3683 !UserTreeIdx.UserTE->isGather())
3684 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3685 }
3686if (AllConstsOrCasts)
3687 CastMaxMinBWSizes =
3688 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3689 MustGather.insert(VL.begin(), VL.end());
3690 }
3691
3692if (UserTreeIdx.UserTE)
3693Last->UserTreeIndices.push_back(UserTreeIdx);
3694returnLast;
3695 }
3696
3697 /// -- Vectorization State --
3698 /// Holds all of the tree entries.
3699 TreeEntry::VecTreeTy VectorizableTree;
3700
3701#ifndef NDEBUG
3702 /// Debug printer.
3703LLVM_DUMP_METHODvoid dumpVectorizableTree() const{
3704for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3705 VectorizableTree[Id]->dump();
3706dbgs() <<"\n";
3707 }
3708 }
3709#endif
3710
3711 TreeEntry *getTreeEntry(Value *V) {
3712assert(V &&"V cannot be nullptr.");
3713return ScalarToTreeEntry.lookup(V);
3714 }
3715
3716const TreeEntry *getTreeEntry(Value *V) const{
3717assert(V &&"V cannot be nullptr.");
3718return ScalarToTreeEntry.lookup(V);
3719 }
3720
  /// Check that the operand nodes of an alternate-opcode node do not generate
  /// buildvector sequences. If they do, it is likely not worth building the
  /// alternate shuffle, since the number of buildvector operands plus the
  /// alternate instruction exceeds the number of buildvector instructions.
  /// \param S the instructions state of the analyzed values.
  /// \param VL list of the instructions with alternate opcodes.
3727bool areAltOperandsProfitable(const InstructionsState &S,
3728ArrayRef<Value *> VL)const;
3729
3730 /// Checks if the specified list of the instructions/values can be vectorized
3731 /// and fills required data before actual scheduling of the instructions.
3732 TreeEntry::EntryState
3733 getScalarsVectorizationState(const InstructionsState &S,ArrayRef<Value *> VL,
3734bool IsScatterVectorizeUserTE,
3735OrdersType &CurrentOrder,
3736SmallVectorImpl<Value *> &PointerOps);
3737
3738 /// Maps a specific scalar to its tree entry.
3739SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3740
3741 /// List of scalars, used in several vectorize nodes, and the list of the
3742 /// nodes.
3743SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3744
3745 /// Maps a value to the proposed vectorizable size.
3746SmallDenseMap<Value *, unsigned> InstrElementSize;
3747
3748 /// A list of scalars that we found that we need to keep as scalars.
3749ValueSet MustGather;
3750
3751 /// A set of first non-schedulable values.
3752ValueSet NonScheduledFirst;
3753
  /// A map between the vectorized entries and the last instructions in the
  /// bundles. The bundles are built in use order, not in the def order of the
  /// instructions, so we cannot rely on the last instruction in a bundle also
  /// being the last instruction in program order during vectorization; the
  /// basic blocks are modified, so these instructions need to be pre-gathered
  /// beforehand.
3760DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3761
3762 /// List of gather nodes, depending on other gather/vector nodes, which should
3763 /// be emitted after the vector instruction emission process to correctly
3764 /// handle order of the vector instructions and shuffles.
3765SetVector<const TreeEntry *> PostponedGathers;
3766
3767usingValueToGatherNodesMap =
3768DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3769 ValueToGatherNodesMap ValueToGatherNodes;
3770
3771 /// A list of the load entries (node indices), which can be vectorized using
3772 /// strided or masked gather approach, but attempted to be represented as
3773 /// contiguous loads.
3774SetVector<unsigned> LoadEntriesToVectorize;
3775
3776 /// true if graph nodes transforming mode is on.
3777bool IsGraphTransformMode =false;
3778
3779 /// The index of the first gathered load entry in the VectorizeTree.
3780 std::optional<unsigned> GatheredLoadsEntriesFirst;
3781
3782 /// This POD struct describes one external user in the vectorized tree.
3783structExternalUser {
3784 ExternalUser(Value *S,llvm::User *U,int L)
3785 :Scalar(S),User(U), Lane(L) {}
3786
3787// Which scalar in our function.
3788Value *Scalar;
3789
3790// Which user that uses the scalar.
3791llvm::User *User;
3792
3793// Which lane does the scalar belong to.
3794int Lane;
3795 };
3796usingUserList =SmallVector<ExternalUser, 16>;
3797
3798 /// Checks if two instructions may access the same memory.
3799 ///
3800 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3801 /// is invariant in the calling loop.
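  /// Results are cached symmetrically, under both the (Inst1, Inst2) and the
  /// (Inst2, Inst1) keys.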
3802bool isAliased(constMemoryLocation &Loc1,Instruction *Inst1,
3803Instruction *Inst2) {
3804if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3805returntrue;
3806// First check if the result is already in the cache.
3807 AliasCacheKeyKey = std::make_pair(Inst1, Inst2);
3808auto It = AliasCache.find(Key);
3809if (It != AliasCache.end())
3810return It->second;
3811bool Aliased =isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3812// Store the result in the cache.
3813 AliasCache.try_emplace(Key, Aliased);
3814 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3815return Aliased;
3816 }
3817
3818usingAliasCacheKey = std::pair<Instruction *, Instruction *>;
3819
3820 /// Cache for alias results.
3821 /// TODO: consider moving this to the AliasAnalysis itself.
3822DenseMap<AliasCacheKey, bool> AliasCache;
3823
3824// Cache for pointerMayBeCaptured calls inside AA. This is preserved
3825// globally through SLP because we don't perform any action which
3826// invalidates capture results.
3827BatchAAResults BatchAA;
3828
3829 /// Temporary store for deleted instructions. Instructions will be deleted
3830 /// eventually when the BoUpSLP is destructed. The deferral is required to
3831 /// ensure that there are no incorrect collisions in the AliasCache, which
3832 /// can happen if a new instruction is allocated at the same address as a
3833 /// previously deleted instruction.
3834DenseSet<Instruction *> DeletedInstructions;
3835
3836 /// Set of the instruction, being analyzed already for reductions.
3837SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3838
3839 /// Set of hashes for the list of reduction values already being analyzed.
3840DenseSet<size_t> AnalyzedReductionVals;
3841
  /// Values which have already been analyzed for minimal bitwidth and found to
  /// be non-profitable.
3844DenseSet<Value *> AnalyzedMinBWVals;
3845
3846 /// A list of values that need to extracted out of the tree.
3847 /// This list holds pairs of (Internal Scalar : External User). External User
3848 /// can be nullptr, it means that this Internal Scalar will be used later,
3849 /// after vectorization.
3850 UserList ExternalUses;
3851
  /// A list of GEPs which can be replaced by scalar GEPs instead of
  /// extractelement instructions.
3854SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3855
3856 /// Values used only by @llvm.assume calls.
3857SmallPtrSet<const Value *, 32> EphValues;
3858
3859 /// Holds all of the instructions that we gathered, shuffle instructions and
3860 /// extractelements.
3861SetVector<Instruction *> GatherShuffleExtractSeq;
3862
3863 /// A list of blocks that we are going to CSE.
3864DenseSet<BasicBlock *> CSEBlocks;
3865
  /// List of hashes of vectors of loads, which are known to be non-vectorizable.
3867DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3868
3869 /// Contains all scheduling relevant data for an instruction.
3870 /// A ScheduleData either represents a single instruction or a member of an
3871 /// instruction bundle (= a group of instructions which is combined into a
3872 /// vector instruction).
3873structScheduleData {
3874// The initial value for the dependency counters. It means that the
3875// dependencies are not calculated yet.
3876enum { InvalidDeps = -1 };
3877
3878 ScheduleData() =default;
3879
3880voidinit(int BlockSchedulingRegionID,Instruction *I) {
3881 FirstInBundle =this;
3882 NextInBundle =nullptr;
3883 NextLoadStore =nullptr;
3884 IsScheduled =false;
3885 SchedulingRegionID = BlockSchedulingRegionID;
3886 clearDependencies();
3887 Inst =I;
3888TE =nullptr;
3889 }
3890
3891 /// Verify basic self consistency properties
3892voidverify() {
3893if (hasValidDependencies()) {
3894assert(UnscheduledDeps <= Dependencies &&"invariant");
3895 }else {
3896assert(UnscheduledDeps == Dependencies &&"invariant");
3897 }
3898
3899if (IsScheduled) {
3900assert(isSchedulingEntity() &&
3901"unexpected scheduled state");
3902for (const ScheduleData *BundleMember =this; BundleMember;
3903 BundleMember = BundleMember->NextInBundle) {
3904assert(BundleMember->hasValidDependencies() &&
3905 BundleMember->UnscheduledDeps == 0 &&
3906"unexpected scheduled state");
3907assert((BundleMember ==this || !BundleMember->IsScheduled) &&
3908"only bundle is marked scheduled");
3909 }
3910 }
3911
3912assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3913"all bundle members must be in same basic block");
3914 }
3915
3916 /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
3918 /// a single bundle.
3919bool hasValidDependencies() const{return Dependencies != InvalidDeps; }
3920
3921 /// Returns true for single instructions and for bundle representatives
3922 /// (= the head of a bundle).
3923bool isSchedulingEntity() const{return FirstInBundle ==this; }
3924
3925 /// Returns true if it represents an instruction bundle and not only a
3926 /// single instruction.
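    /// An entry with an associated TreeEntry is also treated as part of a
    /// bundle, even if it contains a single instruction.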
3927bool isPartOfBundle() const{
3928return NextInBundle !=nullptr || FirstInBundle !=this ||TE;
3929 }
3930
3931 /// Returns true if it is ready for scheduling, i.e. it has no more
3932 /// unscheduled depending instructions/bundles.
3933bool isReady() const{
3934assert(isSchedulingEntity() &&
3935"can't consider non-scheduling entity for ready list");
3936return unscheduledDepsInBundle() == 0 && !IsScheduled;
3937 }
3938
3939 /// Modifies the number of unscheduled dependencies for this instruction,
3940 /// and returns the number of remaining dependencies for the containing
3941 /// bundle.
3942int incrementUnscheduledDeps(int Incr) {
3943assert(hasValidDependencies() &&
3944"increment of unscheduled deps would be meaningless");
3945 UnscheduledDeps += Incr;
3946return FirstInBundle->unscheduledDepsInBundle();
3947 }
3948
3949 /// Sets the number of unscheduled dependencies to the number of
3950 /// dependencies.
3951void resetUnscheduledDeps() {
3952 UnscheduledDeps = Dependencies;
3953 }
3954
3955 /// Clears all dependency information.
3956void clearDependencies() {
3957 Dependencies = InvalidDeps;
3958 resetUnscheduledDeps();
3959 MemoryDependencies.clear();
3960 ControlDependencies.clear();
3961 }
3962
3963int unscheduledDepsInBundle() const{
3964assert(isSchedulingEntity() &&"only meaningful on the bundle");
3965int Sum = 0;
3966for (const ScheduleData *BundleMember =this; BundleMember;
3967 BundleMember = BundleMember->NextInBundle) {
3968if (BundleMember->UnscheduledDeps == InvalidDeps)
3969return InvalidDeps;
3970 Sum += BundleMember->UnscheduledDeps;
3971 }
3972return Sum;
3973 }
3974
3975voiddump(raw_ostream &os) const{
3976if (!isSchedulingEntity()) {
3977 os <<"/ " << *Inst;
3978 }elseif (NextInBundle) {
3979 os <<'[' << *Inst;
3980 ScheduleData *SD = NextInBundle;
3981while (SD) {
3982 os <<';' << *SD->Inst;
3983 SD = SD->NextInBundle;
3984 }
3985 os <<']';
3986 }else {
3987 os << *Inst;
3988 }
3989 }
3990
3991LLVM_DUMP_METHODvoiddump() const{dump(dbgs()); }
3992
3993Instruction *Inst =nullptr;
3994
3995 /// The TreeEntry that this instruction corresponds to.
3996 TreeEntry *TE =nullptr;
3997
3998 /// Points to the head in an instruction bundle (and always to this for
3999 /// single instructions).
4000 ScheduleData *FirstInBundle =nullptr;
4001
4002 /// Single linked list of all instructions in a bundle. Null if it is a
4003 /// single instruction.
4004 ScheduleData *NextInBundle =nullptr;
4005
4006 /// Single linked list of all memory instructions (e.g. load, store, call)
4007 /// in the block - until the end of the scheduling region.
4008 ScheduleData *NextLoadStore =nullptr;
4009
4010 /// The dependent memory instructions.
4011 /// This list is derived on demand in calculateDependencies().
4012SmallVector<ScheduleData *, 4> MemoryDependencies;
4013
4014 /// List of instructions which this instruction could be control dependent
4015 /// on. Allowing such nodes to be scheduled below this one could introduce
4016 /// a runtime fault which didn't exist in the original program.
4017 /// ex: this is a load or udiv following a readonly call which inf loops
4018SmallVector<ScheduleData *, 4> ControlDependencies;
4019
4020 /// This ScheduleData is in the current scheduling region if this matches
4021 /// the current SchedulingRegionID of BlockScheduling.
4022int SchedulingRegionID = 0;
4023
4024 /// Used for getting a "good" final ordering of instructions.
4025int SchedulingPriority = 0;
4026
    /// The number of dependencies. It consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
4029 /// This value is calculated on demand.
4030 /// If InvalidDeps, the number of dependencies is not calculated yet.
4031int Dependencies = InvalidDeps;
4032
4033 /// The number of dependencies minus the number of dependencies of scheduled
4034 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4035 /// for scheduling.
4036 /// Note that this is negative as long as Dependencies is not calculated.
4037int UnscheduledDeps = InvalidDeps;
4038
4039 /// True if this instruction is scheduled (or considered as scheduled in the
4040 /// dry-run).
4041bool IsScheduled =false;
4042 };
4043
4044#ifndef NDEBUG
4045friendinlineraw_ostream &operator<<(raw_ostream &os,
4046const BoUpSLP::ScheduleData &SD) {
4047 SD.dump(os);
4048return os;
4049 }
4050#endif
4051
4052friendstructGraphTraits<BoUpSLP *>;
4053friendstructDOTGraphTraits<BoUpSLP *>;
4054
 /// Contains all scheduling data for a basic block.
 /// It does not schedule instructions that are not memory reads/writes and
 /// whose operands are all constants, arguments, phis, or instructions from
 /// other blocks, or whose users are phis or in other blocks. The resulting
 /// vector instructions can be placed at the beginning of the basic block
 /// without scheduling (if the operands do not need to be scheduled) or at the
 /// end of the block (if the users are outside of the block). This saves some
 /// compile time and memory used by the compiler.
 /// ScheduleData is assigned to each instruction between the boundaries of the
 /// tree entry, even to those which are not part of the graph. This is
 /// required to correctly follow the dependencies between the instructions and
 /// their correct scheduling. ScheduleData is not allocated for instructions
 /// which do not require scheduling, like phis, nodes with only
 /// extractelements/insertelements, or nodes whose instructions have
 /// uses/operands outside of the block.
4071structBlockScheduling {
4072 BlockScheduling(BasicBlock *BB)
4073 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4074
4075void clear() {
4076 ReadyInsts.clear();
4077 ScheduleStart =nullptr;
4078 ScheduleEnd =nullptr;
4079 FirstLoadStoreInRegion =nullptr;
4080 LastLoadStoreInRegion =nullptr;
4081 RegionHasStackSave =false;
4082
4083// Reduce the maximum schedule region size by the size of the
4084// previous scheduling run.
4085 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4086if (ScheduleRegionSizeLimit <MinScheduleRegionSize)
4087 ScheduleRegionSizeLimit =MinScheduleRegionSize;
4088 ScheduleRegionSize = 0;
4089
4090// Make a new scheduling region, i.e. all existing ScheduleData is not
4091// in the new region yet.
4092 ++SchedulingRegionID;
4093 }
4094
4095 ScheduleData *getScheduleData(Instruction *I) {
4096if (BB !=I->getParent())
4097// Avoid lookup if can't possibly be in map.
4098returnnullptr;
4099 ScheduleData *SD = ScheduleDataMap.lookup(I);
4100if (SD && isInSchedulingRegion(SD))
4101return SD;
4102returnnullptr;
4103 }
4104
4105 ScheduleData *getScheduleData(Value *V) {
4106if (auto *I = dyn_cast<Instruction>(V))
4107return getScheduleData(I);
4108returnnullptr;
4109 }
4110
4111bool isInSchedulingRegion(ScheduleData *SD) const{
4112return SD->SchedulingRegionID == SchedulingRegionID;
4113 }
4114
4115 /// Marks an instruction as scheduled and puts all dependent ready
4116 /// instructions into the ready-list.
4117template <typename ReadyListType>
4118void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4119 SD->IsScheduled =true;
4120LLVM_DEBUG(dbgs() <<"SLP: schedule " << *SD <<"\n");
4121
4122for (ScheduleData *BundleMember = SD; BundleMember;
4123 BundleMember = BundleMember->NextInBundle) {
4124
4125// Handle the def-use chain dependencies.
4126
4127// Decrement the unscheduled counter and insert to ready list if ready.
4128auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
4129 ScheduleData *OpDef = getScheduleData(I);
4130if (OpDef && OpDef->hasValidDependencies() &&
4131 OpDef->incrementUnscheduledDeps(-1) == 0) {
4132// There are no more unscheduled dependencies after
4133// decrementing, so we can put the dependent instruction
4134// into the ready list.
4135 ScheduleData *DepBundle = OpDef->FirstInBundle;
4136assert(!DepBundle->IsScheduled &&
4137"already scheduled bundle gets ready");
4138 ReadyList.insert(DepBundle);
4139LLVM_DEBUG(dbgs()
4140 <<"SLP: gets ready (def): " << *DepBundle <<"\n");
4141 }
4142 };
4143
4144// If BundleMember is a vector bundle, its operands may have been
4145// reordered during buildTree(). We therefore need to get its operands
4146// through the TreeEntry.
4147if (TreeEntry *TE = BundleMember->TE) {
4148// Need to search for the lane since the tree entry can be reordered.
4149auto *In = BundleMember->Inst;
4150int Lane = std::distance(TE->Scalars.begin(),
4151find(TE->Scalars, In));
4152assert(Lane >= 0 &&"Lane not set");
4153
4154// Since vectorization tree is being built recursively this assertion
4155// ensures that the tree entry has all operands set before reaching
4156// this code. Couple of exceptions known at the moment are extracts
4157// where their second (immediate) operand is not added. Since
4158// immediates do not affect scheduler behavior this is considered
4159// okay.
4160assert(
4161 In &&
4162 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4163In->getNumOperands() ==TE->getNumOperands()) &&
4164"Missed TreeEntry operands?");
4165
4166for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
4167if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4168 DecrUnsched(I);
4169 }else {
4170// If BundleMember is a stand-alone instruction, no operand reordering
4171// has taken place, so we directly access its operands.
4172for (Use &U : BundleMember->Inst->operands())
4173if (auto *I = dyn_cast<Instruction>(U.get()))
4174 DecrUnsched(I);
4175 }
4176// Handle the memory dependencies.
4177for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4178if (MemoryDepSD->hasValidDependencies() &&
4179 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4180// There are no more unscheduled dependencies after decrementing,
4181// so we can put the dependent instruction into the ready list.
4182 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4183assert(!DepBundle->IsScheduled &&
4184"already scheduled bundle gets ready");
4185 ReadyList.insert(DepBundle);
4186LLVM_DEBUG(dbgs()
4187 <<"SLP: gets ready (mem): " << *DepBundle <<"\n");
4188 }
4189 }
4190// Handle the control dependencies.
4191for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4192if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4193// There are no more unscheduled dependencies after decrementing,
4194// so we can put the dependent instruction into the ready list.
4195 ScheduleData *DepBundle = DepSD->FirstInBundle;
4196assert(!DepBundle->IsScheduled &&
4197"already scheduled bundle gets ready");
4198 ReadyList.insert(DepBundle);
4199LLVM_DEBUG(dbgs()
4200 <<"SLP: gets ready (ctl): " << *DepBundle <<"\n");
4201 }
4202 }
4203 }
4204 }
4205
4206 /// Verify basic self consistency properties of the data structure.
4207voidverify() {
4208if (!ScheduleStart)
4209return;
4210
4211assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4212 ScheduleStart->comesBefore(ScheduleEnd) &&
4213"Not a valid scheduling region?");
4214
4215for (auto *I = ScheduleStart;I != ScheduleEnd;I =I->getNextNode()) {
4216auto *SD = getScheduleData(I);
4217if (!SD)
4218continue;
4219assert(isInSchedulingRegion(SD) &&
4220"primary schedule data not in window?");
4221assert(isInSchedulingRegion(SD->FirstInBundle) &&
4222"entire bundle in window!");
4223 SD->verify();
4224 }
4225
4226for (auto *SD : ReadyInsts) {
4227assert(SD->isSchedulingEntity() && SD->isReady() &&
4228"item in ready list not ready?");
4229 (void)SD;
4230 }
4231 }
4232
4233 /// Put all instructions into the ReadyList which are ready for scheduling.
4234template <typename ReadyListType>
4235void initialFillReadyList(ReadyListType &ReadyList) {
4236for (auto *I = ScheduleStart;I != ScheduleEnd;I =I->getNextNode()) {
4237 ScheduleData *SD = getScheduleData(I);
4238if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4239 SD->isReady()) {
4240 ReadyList.insert(SD);
4241LLVM_DEBUG(dbgs()
4242 <<"SLP: initially in ready list: " << *SD <<"\n");
4243 }
4244 }
4245 }
4246
4247 /// Build a bundle from the ScheduleData nodes corresponding to the
4248 /// scalar instruction for each lane.
4249 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4250
4251 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4252 /// cyclic dependencies. This is only a dry-run, no instructions are
4253 /// actually moved at this stage.
4254 /// \returns the scheduling bundle. The returned Optional value is not
4255 /// std::nullopt if \p VL is allowed to be scheduled.
4256 std::optional<ScheduleData *>
4257 tryScheduleBundle(ArrayRef<Value *> VL,BoUpSLP *SLP,
4258const InstructionsState &S);
4259
4260 /// Un-bundles a group of instructions.
4261void cancelScheduling(ArrayRef<Value *> VL,Value *OpValue);
4262
4263 /// Allocates schedule data chunk.
4264 ScheduleData *allocateScheduleDataChunks();
4265
4266 /// Extends the scheduling region so that V is inside the region.
4267 /// \returns true if the region size is within the limit.
4268bool extendSchedulingRegion(Value *V,const InstructionsState &S);
4269
4270 /// Initialize the ScheduleData structures for new instructions in the
4271 /// scheduling region.
4272void initScheduleData(Instruction *FromI,Instruction *ToI,
4273 ScheduleData *PrevLoadStore,
4274 ScheduleData *NextLoadStore);
4275
4276 /// Updates the dependency information of a bundle and of all instructions/
4277 /// bundles which depend on the original bundle.
4278void calculateDependencies(ScheduleData *SD,bool InsertInReadyList,
4279BoUpSLP *SLP);
4280
4281 /// Sets all instructions in the scheduling region to un-scheduled.
4282void resetSchedule();
4283
4284BasicBlock *BB;
4285
4286 /// Simple memory allocation for ScheduleData.
4287SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
4288
4289 /// The size of a ScheduleData array in ScheduleDataChunks.
4290int ChunkSize;
4291
4292 /// The allocator position in the current chunk, which is the last entry
4293 /// of ScheduleDataChunks.
4294int ChunkPos;
4295
4296 /// Attaches ScheduleData to Instruction.
4297 /// Note that the mapping survives during all vectorization iterations, i.e.
4298 /// ScheduleData structures are recycled.
4299DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
4300
4301 /// The ready-list for scheduling (only used for the dry-run).
4302SetVector<ScheduleData *> ReadyInsts;
4303
4304 /// The first instruction of the scheduling region.
4305Instruction *ScheduleStart =nullptr;
4306
4307 /// The first instruction _after_ the scheduling region.
4308Instruction *ScheduleEnd =nullptr;
4309
4310 /// The first memory accessing instruction in the scheduling region
4311 /// (can be null).
4312 ScheduleData *FirstLoadStoreInRegion =nullptr;
4313
4314 /// The last memory accessing instruction in the scheduling region
4315 /// (can be null).
4316 ScheduleData *LastLoadStoreInRegion =nullptr;
4317
4318 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4319 /// region? Used to optimize the dependence calculation for the
4320 /// common case where there isn't.
4321bool RegionHasStackSave =false;
4322
4323 /// The current size of the scheduling region.
4324int ScheduleRegionSize = 0;
4325
4326 /// The maximum size allowed for the scheduling region.
4327int ScheduleRegionSizeLimit =ScheduleRegionSizeBudget;
4328
4329 /// The ID of the scheduling region. For a new vectorization iteration this
4330 /// is incremented, which "removes" all ScheduleData from the region.
4331 /// Make sure that the initial SchedulingRegionID is greater than the
4332 /// initial SchedulingRegionID in ScheduleData (which is 0).
4333int SchedulingRegionID = 1;
4334 };
4335
4336 /// Attaches the BlockScheduling structures to basic blocks.
4337MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
4338
4339 /// Performs the "real" scheduling. Done before vectorization is actually
4340 /// performed in a basic block.
4341void scheduleBlock(BlockScheduling *BS);
4342
4343 /// List of users to ignore during scheduling and that don't need extracting.
4344constSmallDenseSet<Value *> *UserIgnoreList =nullptr;
4345
4346 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4347 /// sorted SmallVectors of unsigned.
4348structOrdersTypeDenseMapInfo {
4349staticOrdersType getEmptyKey() {
4350OrdersTypeV;
4351V.push_back(~1U);
4352returnV;
4353 }
4354
4355staticOrdersType getTombstoneKey() {
4356OrdersTypeV;
4357V.push_back(~2U);
4358returnV;
4359 }
4360
4361staticunsigned getHashValue(constOrdersType &V) {
4362returnstatic_cast<unsigned>(hash_combine_range(V.begin(),V.end()));
4363 }
4364
4365staticboolisEqual(constOrdersType &LHS,constOrdersType &RHS) {
4366returnLHS ==RHS;
4367 }
4368 };
4369
4370// Analysis and block reference.
4371Function *F;
4372ScalarEvolution *SE;
4373TargetTransformInfo *TTI;
4374TargetLibraryInfo *TLI;
4375LoopInfo *LI;
4376DominatorTree *DT;
4377AssumptionCache *AC;
4378DemandedBits *DB;
4379constDataLayout *DL;
4380OptimizationRemarkEmitter *ORE;
4381
4382unsigned MaxVecRegSize;// This is set by TTI or overridden by cl::opt.
4383unsigned MinVecRegSize;// Set by cl::opt (default: 128).
4384
4385 /// Instruction builder to construct the vectorized tree.
4386IRBuilder<TargetFolder> Builder;
4387
4388 /// A map of scalar integer values to the smallest bit width with which they
4389 /// can legally be represented. The values map to (width, signed) pairs,
4390 /// where "width" indicates the minimum bit width and "signed" is True if the
4391 /// value must be sign-extended, rather than zero-extended, back to its
4392 /// original width.
4393DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
4394
4395 /// Final size of the reduced vector, if the current graph represents the
4396 /// input for the reduction and it was possible to narrow the size of the
4397 /// reduction.
4398unsigned ReductionBitWidth = 0;
4399
4400 /// Canonical graph size before the transformations.
4401unsigned BaseGraphSize = 1;
4402
4403 /// If the tree contains any zext/sext/trunc nodes, this holds the max-min
4404 /// pair of the type sizes used in the tree.
4405 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4406
4407 /// Indices of the vectorized nodes, which are supposed to be the roots of
4408 /// the new bitwidth analysis attempt, like trunc, IToFP or ICmp.
4409DenseSet<unsigned> ExtraBitWidthNodes;
4410};
4411
4412}// end namespace slpvectorizer
4413
4414template <>structGraphTraits<BoUpSLP *> {
4415usingTreeEntry = BoUpSLP::TreeEntry;
4416
4417 /// NodeRef has to be a pointer per the GraphWriter.
4418usingNodeRef =TreeEntry *;
4419
4420usingContainerTy =BoUpSLP::TreeEntry::VecTreeTy;
4421
4422 /// Add the VectorizableTree to the index iterator to be able to return
4423 /// TreeEntry pointers.
4424structChildIteratorType
4425 :publiciterator_adaptor_base<
4426 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4427ContainerTy &VectorizableTree;
4428
4429ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4430ContainerTy &VT)
4431 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4432
4433NodeRefoperator*() {returnI->UserTE; }
4434 };
4435
4436staticNodeRefgetEntryNode(BoUpSLP &R) {
4437return R.VectorizableTree[0].get();
4438 }
4439
4440static ChildIteratorTypechild_begin(NodeRefN) {
4441return {N->UserTreeIndices.begin(),N->Container};
4442 }
4443
4444static ChildIteratorTypechild_end(NodeRefN) {
4445return {N->UserTreeIndices.end(),N->Container};
4446 }
4447
4448 /// For the node iterator we just need to turn the TreeEntry iterator into a
4449 /// TreeEntry* iterator so that it dereferences to NodeRef.
4450classnodes_iterator {
4451usingItTy =ContainerTy::iterator;
4452ItTy It;
4453
4454public:
4455nodes_iterator(constItTy &It2) : It(It2) {}
4456NodeRefoperator*() {return It->get(); }
4457 nodes_iteratoroperator++() {
4458 ++It;
4459return *this;
4460 }
4461booloperator!=(const nodes_iterator &N2) const{return N2.It != It; }
4462 };
4463
4464static nodes_iteratornodes_begin(BoUpSLP *R) {
4465return nodes_iterator(R->VectorizableTree.begin());
4466 }
4467
4468static nodes_iteratornodes_end(BoUpSLP *R) {
4469return nodes_iterator(R->VectorizableTree.end());
4470 }
4471
4472staticunsignedsize(BoUpSLP *R) {return R->VectorizableTree.size(); }
4473};
4474
4475template <>structDOTGraphTraits<BoUpSLP *> :publicDefaultDOTGraphTraits {
4476usingTreeEntry = BoUpSLP::TreeEntry;
4477
4478DOTGraphTraits(bool IsSimple =false) :DefaultDOTGraphTraits(IsSimple) {}
4479
4480 std::stringgetNodeLabel(constTreeEntry *Entry,constBoUpSLP *R) {
4481 std::string Str;
4482raw_string_ostreamOS(Str);
4483OS << Entry->Idx <<".\n";
4484if (isSplat(Entry->Scalars))
4485OS <<"<splat> ";
4486for (auto *V : Entry->Scalars) {
4487OS << *V;
4488if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4489 return EU.Scalar == V;
4490 }))
4491OS <<" <extract>";
4492OS <<"\n";
4493 }
4494return Str;
4495 }
4496
4497static std::stringgetNodeAttributes(constTreeEntry *Entry,
4498constBoUpSLP *) {
4499if (Entry->isGather())
4500return"color=red";
4501if (Entry->State == TreeEntry::ScatterVectorize ||
4502 Entry->State == TreeEntry::StridedVectorize)
4503return"color=blue";
4504return"";
4505 }
4506};
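// Usage sketch (an editorial example, assuming only the standard GraphWriter
// API from llvm/Support/GraphWriter.h): the GraphTraits and DOTGraphTraits
// specializations above let the whole SLP tree be rendered as a DOT graph,
// e.g. from a debugger or from temporary debugging code:
//
//   BoUpSLP &R = ...;                    // a fully built vectorizable tree
//   llvm::ViewGraph(&R, "slp-tree");     // open the rendered graph
//   llvm::WriteGraph(llvm::errs(), &R);  // or dump DOT text to a stream
//
// Gather nodes are drawn red and scatter/strided nodes blue, per
// getNodeAttributes above.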
4507
4508}// end namespace llvm
4509
4510BoUpSLP::~BoUpSLP() {
4511SmallVector<WeakTrackingVH> DeadInsts;
4512for (auto *I : DeletedInstructions) {
4513if (!I->getParent()) {
4514// Temporarily insert instruction back to erase them from parent and
4515// memory later.
4516if (isa<PHINode>(I))
4517// Phi nodes must be the very first instructions in the block.
4518I->insertBefore(F->getEntryBlock(),
4519F->getEntryBlock().getFirstNonPHIIt());
4520else
4521I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
4522continue;
4523 }
4524for (Use &U :I->operands()) {
4525auto *Op = dyn_cast<Instruction>(U.get());
4526if (Op && !DeletedInstructions.count(Op) &&Op->hasOneUser() &&
4527wouldInstructionBeTriviallyDead(Op, TLI))
4528 DeadInsts.emplace_back(Op);
4529 }
4530I->dropAllReferences();
4531 }
4532for (auto *I : DeletedInstructions) {
4533assert(I->use_empty() &&
4534"trying to erase instruction with users.");
4535I->eraseFromParent();
4536 }
4537
4538// Cleanup any dead scalar code feeding the vectorized instructions
4539RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4540
4541#ifdef EXPENSIVE_CHECKS
4542// If we could guarantee that this call is not extremely slow, we could
4543// remove the ifdef limitation (see PR47712).
4544assert(!verifyFunction(*F, &dbgs()));
4545#endif
4546}
4547
4548 /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4549 /// contains the original mask for the scalars reused in the node. The procedure
4550 /// transforms this mask in accordance with the given \p Mask.
4551staticvoidreorderReuses(SmallVectorImpl<int> &Reuses,ArrayRef<int> Mask) {
4552assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4553"Expected non-empty mask.");
4554SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4555 Prev.swap(Reuses);
4556for (unsignedI = 0,E = Prev.size();I <E; ++I)
4557if (Mask[I] !=PoisonMaskElem)
4558 Reuses[Mask[I]] = Prev[I];
4559}
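// Worked example (illustrative, not upstream text): with Reuses = {3, 2, 1, 0}
// and Mask = {1, 0, 3, 2}, each previous element Prev[I] is written to position
// Mask[I], giving Reuses = {2, 3, 0, 1}. Entries of Prev whose Mask element is
// PoisonMaskElem are dropped, and any position not targeted by the mask keeps
// its previous value.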
4560
4561 /// Reorders the given \p Order according to the given \p Mask. \p Order is
4562 /// the original order of the scalars. The procedure transforms the provided
4563 /// order in accordance with the given \p Mask. If the resulting \p Order is
4564 /// just an identity order, \p Order is cleared.
4565staticvoidreorderOrder(SmallVectorImpl<unsigned> &Order,ArrayRef<int> Mask,
4566bool BottomOrder =false) {
4567assert(!Mask.empty() &&"Expected non-empty mask.");
4568unsigned Sz = Mask.size();
4569if (BottomOrder) {
4570SmallVector<unsigned> PrevOrder;
4571if (Order.empty()) {
4572 PrevOrder.resize(Sz);
4573 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4574 }else {
4575 PrevOrder.swap(Order);
4576 }
4577 Order.assign(Sz, Sz);
4578for (unsignedI = 0;I < Sz; ++I)
4579if (Mask[I] !=PoisonMaskElem)
4580 Order[I] = PrevOrder[Mask[I]];
4581if (all_of(enumerate(Order), [&](constauto &Data) {
4582returnData.value() == Sz ||Data.index() ==Data.value();
4583 })) {
4584 Order.clear();
4585return;
4586 }
4587fixupOrderingIndices(Order);
4588return;
4589 }
4590SmallVector<int> MaskOrder;
4591if (Order.empty()) {
4592 MaskOrder.resize(Sz);
4593 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4594 }else {
4595inversePermutation(Order, MaskOrder);
4596 }
4597reorderReuses(MaskOrder, Mask);
4598if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4599 Order.clear();
4600return;
4601 }
4602 Order.assign(Sz, Sz);
4603for (unsignedI = 0;I < Sz; ++I)
4604if (MaskOrder[I] !=PoisonMaskElem)
4605 Order[MaskOrder[I]] =I;
4606fixupOrderingIndices(Order);
4607}
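// Worked example (illustrative, not upstream text): in the top-down case
// (BottomOrder == false) an empty Order is treated as the identity, so
// reorderOrder(Order = {}, Mask = {1, 0, 3, 2}) produces Order = {1, 0, 3, 2}.
// If the composition turns out to be an identity (e.g. Order = {2, 3, 0, 1}
// combined with Mask = {2, 3, 0, 1}), Order is cleared instead.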
4608
4609std::optional<BoUpSLP::OrdersType>
4610BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4611assert(TE.isGather() &&"Expected gather node only.");
4612// Try to find subvector extract/insert patterns and reorder only such
4613// patterns.
4614SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4615Type *ScalarTy = GatheredScalars.front()->getType();
4616int NumScalars = GatheredScalars.size();
4617if (!isValidElementType(ScalarTy))
4618return std::nullopt;
4619auto *VecTy =getWidenedType(ScalarTy, NumScalars);
4620int NumParts =TTI->getNumberOfParts(VecTy);
4621if (NumParts == 0 || NumParts >= NumScalars ||
4622 VecTy->getNumElements() % NumParts != 0 ||
4623 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4624 VecTy->getNumElements() / NumParts))
4625 NumParts = 1;
4626SmallVector<int> ExtractMask;
4627SmallVector<int> Mask;
4628SmallVector<SmallVector<const TreeEntry *>> Entries;
4629SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
4630 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4631SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
4632 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4633/*ForOrder=*/true);
4634// No shuffled operands - ignore.
4635if (GatherShuffles.empty() && ExtractShuffles.empty())
4636return std::nullopt;
4637OrdersType CurrentOrder(NumScalars, NumScalars);
4638if (GatherShuffles.size() == 1 &&
4639 *GatherShuffles.front() ==TTI::SK_PermuteSingleSrc &&
4640 Entries.front().front()->isSame(TE.Scalars)) {
4641// Perfect match in the graph, will reuse the previously vectorized
4642// node. Cost is 0.
4643 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4644return CurrentOrder;
4645 }
4646auto IsSplatMask = [](ArrayRef<int> Mask) {
4647int SingleElt =PoisonMaskElem;
4648returnall_of(Mask, [&](intI) {
4649if (SingleElt ==PoisonMaskElem &&I !=PoisonMaskElem)
4650 SingleElt =I;
4651returnI ==PoisonMaskElem ||I == SingleElt;
4652 });
4653 };
4654// Exclusive broadcast mask - ignore.
4655if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4656 (Entries.size() != 1 ||
4657 Entries.front().front()->ReorderIndices.empty())) ||
4658 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4659return std::nullopt;
4660SmallBitVector ShuffledSubMasks(NumParts);
4661auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4662ArrayRef<int> Mask,int PartSz,int NumParts,
4663function_ref<unsigned(unsigned)> GetVF) {
4664for (intI : seq<int>(0, NumParts)) {
4665if (ShuffledSubMasks.test(I))
4666continue;
4667constint VF = GetVF(I);
4668if (VF == 0)
4669continue;
4670unsigned Limit =getNumElems(CurrentOrder.size(), PartSz,I);
4671MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4672// Shuffle of at least 2 vectors - ignore.
4673if (any_of(Slice, [&](intI) {returnI != NumScalars; })) {
4674 std::fill(Slice.begin(), Slice.end(), NumScalars);
4675 ShuffledSubMasks.set(I);
4676continue;
4677 }
4678      // Try to include as many elements from the mask as possible.
4679      int FirstMin = INT_MAX;
4680      bool SecondVecFound = false;
4681for (int K : seq<int>(Limit)) {
4682intIdx = Mask[I * PartSz + K];
4683if (Idx ==PoisonMaskElem) {
4684Value *V = GatheredScalars[I * PartSz + K];
4685if (isConstant(V) && !isa<PoisonValue>(V)) {
4686 SecondVecFound =true;
4687break;
4688 }
4689continue;
4690 }
4691if (Idx < VF) {
4692if (FirstMin >Idx)
4693 FirstMin =Idx;
4694 }else {
4695 SecondVecFound =true;
4696break;
4697 }
4698 }
4699 FirstMin = (FirstMin / PartSz) * PartSz;
4700// Shuffle of at least 2 vectors - ignore.
4701if (SecondVecFound) {
4702 std::fill(Slice.begin(), Slice.end(), NumScalars);
4703 ShuffledSubMasks.set(I);
4704continue;
4705 }
4706for (int K : seq<int>(Limit)) {
4707intIdx = Mask[I * PartSz + K];
4708if (Idx ==PoisonMaskElem)
4709continue;
4710Idx -= FirstMin;
4711if (Idx >= PartSz) {
4712 SecondVecFound =true;
4713break;
4714 }
4715if (CurrentOrder[I * PartSz +Idx] >
4716static_cast<unsigned>(I * PartSz + K) &&
4717 CurrentOrder[I * PartSz +Idx] !=
4718static_cast<unsigned>(I * PartSz +Idx))
4719 CurrentOrder[I * PartSz +Idx] =I * PartSz + K;
4720 }
4721// Shuffle of at least 2 vectors - ignore.
4722if (SecondVecFound) {
4723 std::fill(Slice.begin(), Slice.end(), NumScalars);
4724 ShuffledSubMasks.set(I);
4725continue;
4726 }
4727 }
4728 };
4729int PartSz =getPartNumElems(NumScalars, NumParts);
4730if (!ExtractShuffles.empty())
4731 TransformMaskToOrder(
4732 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsignedI) {
4733if (!ExtractShuffles[I])
4734return 0U;
4735unsigned VF = 0;
4736unsigned Sz =getNumElems(TE.getVectorFactor(), PartSz,I);
4737for (unsignedIdx : seq<unsigned>(Sz)) {
4738int K =I * PartSz +Idx;
4739if (ExtractMask[K] ==PoisonMaskElem)
4740continue;
4741if (!TE.ReuseShuffleIndices.empty())
4742 K = TE.ReuseShuffleIndices[K];
4743if (K ==PoisonMaskElem)
4744continue;
4745if (!TE.ReorderIndices.empty())
4746 K = std::distance(TE.ReorderIndices.begin(),
4747find(TE.ReorderIndices, K));
4748auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4749if (!EI)
4750continue;
4751 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4752 ->getElementCount()
4753 .getKnownMinValue());
4754 }
4755return VF;
4756 });
4757// Check special corner case - single shuffle of the same entry.
4758if (GatherShuffles.size() == 1 && NumParts != 1) {
4759if (ShuffledSubMasks.any())
4760return std::nullopt;
4761 PartSz = NumScalars;
4762 NumParts = 1;
4763 }
4764if (!Entries.empty())
4765 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsignedI) {
4766if (!GatherShuffles[I])
4767return 0U;
4768return std::max(Entries[I].front()->getVectorFactor(),
4769 Entries[I].back()->getVectorFactor());
4770 });
4771int NumUndefs =
4772count_if(CurrentOrder, [&](intIdx) {returnIdx == NumScalars; });
4773if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4774return std::nullopt;
4775return std::move(CurrentOrder);
4776}
4777
4778staticboolarePointersCompatible(Value *Ptr1,Value *Ptr2,
4779constTargetLibraryInfo &TLI,
4780bool CompareOpcodes =true) {
4781if (getUnderlyingObject(Ptr1,RecursionMaxDepth) !=
4782getUnderlyingObject(Ptr2,RecursionMaxDepth))
4783returnfalse;
4784auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4785auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4786return (!GEP1 || GEP1->getNumOperands() == 2) &&
4787 (!GEP2 || GEP2->getNumOperands() == 2) &&
4788 (((!GEP1 ||isConstant(GEP1->getOperand(1))) &&
4789 (!GEP2 ||isConstant(GEP2->getOperand(1)))) ||
4790 !CompareOpcodes ||
4791 (GEP1 && GEP2 &&
4792getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
4793}
4794
4795/// Calculates minimal alignment as a common alignment.
4796template <typename T>
4797staticAligncomputeCommonAlignment(ArrayRef<Value *> VL) {
4798Align CommonAlignment = cast<T>(VL.front())->getAlign();
4799for (Value *V : VL.drop_front())
4800 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4801return CommonAlignment;
4802}
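// Small example (illustrative): for a bundle of loads with alignments
// {16, 8, 4, 16}, the helper above returns Align(4) - the weakest alignment in
// the list - which the callers then use for TTI legality and cost queries on
// the whole vector access.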
4803
4804/// Check if \p Order represents reverse order.
4805staticboolisReverseOrder(ArrayRef<unsigned> Order) {
4806assert(!Order.empty() &&
4807"Order is empty. Please check it before using isReverseOrder.");
4808unsigned Sz = Order.size();
4809returnall_of(enumerate(Order), [&](constauto &Pair) {
4810return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4811 });
4812}
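// Example (illustrative): for Sz == 4, the order {3, 2, 1, 0} is a reverse
// order, and so is {4, 2, 1, 0}, because entries equal to Sz are treated as
// "don't care" positions by the check above.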
4813
4814 /// Checks if the provided list of pointers \p PointerOps represents strided
4815 /// pointers for the type ElemTy. If they are not, std::nullopt is returned.
4816 /// Otherwise, if \p Inst is not specified, an optional holding nullptr is
4817 /// returned to show that the pointers represent strided pointers. If \p Inst
4818 /// is specified, the runtime stride is materialized before the given \p Inst.
4819 /// \returns std::nullopt if the pointers do not have a runtime stride;
4820 /// otherwise nullptr or the actual stride value.
4821static std::optional<Value *>
4822calculateRtStride(ArrayRef<Value *> PointerOps,Type *ElemTy,
4823constDataLayout &DL,ScalarEvolution &SE,
4824SmallVectorImpl<unsigned> &SortedIndices,
4825Instruction *Inst =nullptr) {
4826SmallVector<const SCEV *> SCEVs;
4827constSCEV *PtrSCEVLowest =nullptr;
4828constSCEV *PtrSCEVHighest =nullptr;
4829// Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4830// addresses).
4831for (Value *Ptr : PointerOps) {
4832constSCEV *PtrSCEV = SE.getSCEV(Ptr);
4833if (!PtrSCEV)
4834return std::nullopt;
4835 SCEVs.push_back(PtrSCEV);
4836if (!PtrSCEVLowest && !PtrSCEVHighest) {
4837 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4838continue;
4839 }
4840constSCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4841if (isa<SCEVCouldNotCompute>(Diff))
4842return std::nullopt;
4843if (Diff->isNonConstantNegative()) {
4844 PtrSCEVLowest = PtrSCEV;
4845continue;
4846 }
4847constSCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4848if (isa<SCEVCouldNotCompute>(Diff1))
4849return std::nullopt;
4850if (Diff1->isNonConstantNegative()) {
4851 PtrSCEVHighest = PtrSCEV;
4852continue;
4853 }
4854 }
4855// Dist = PtrSCEVHighest - PtrSCEVLowest;
4856constSCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4857if (isa<SCEVCouldNotCompute>(Dist))
4858return std::nullopt;
4859intSize =DL.getTypeStoreSize(ElemTy);
4860auto TryGetStride = [&](constSCEV *Dist,
4861constSCEV *Multiplier) ->constSCEV * {
4862if (constauto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4863if (M->getOperand(0) == Multiplier)
4864return M->getOperand(1);
4865if (M->getOperand(1) == Multiplier)
4866return M->getOperand(0);
4867returnnullptr;
4868 }
4869if (Multiplier == Dist)
4870return SE.getConstant(Dist->getType(), 1);
4871return SE.getUDivExactExpr(Dist, Multiplier);
4872 };
4873  // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4874constSCEV *Stride =nullptr;
4875if (Size != 1 || SCEVs.size() > 2) {
4876constSCEV *Sz = SE.getConstant(Dist->getType(),Size * (SCEVs.size() - 1));
4877 Stride = TryGetStride(Dist, Sz);
4878if (!Stride)
4879return std::nullopt;
4880 }
4881if (!Stride || isa<SCEVConstant>(Stride))
4882return std::nullopt;
4883  // Iterate through all pointers and check that each distance is a
4884  // unique multiple of Stride.
4885usingDistOrdPair = std::pair<int64_t, int>;
4886auto Compare =llvm::less_first();
4887 std::set<DistOrdPair,decltype(Compare)> Offsets(Compare);
4888int Cnt = 0;
4889bool IsConsecutive =true;
4890for (constSCEV *PtrSCEV : SCEVs) {
4891unsigned Dist = 0;
4892if (PtrSCEV != PtrSCEVLowest) {
4893constSCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4894constSCEV *Coeff = TryGetStride(Diff, Stride);
4895if (!Coeff)
4896return std::nullopt;
4897constauto *SC = dyn_cast<SCEVConstant>(Coeff);
4898if (!SC || isa<SCEVCouldNotCompute>(SC))
4899return std::nullopt;
4900if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4901 SE.getMulExpr(Stride, SC)))
4902 ->isZero())
4903return std::nullopt;
4904 Dist = SC->getAPInt().getZExtValue();
4905 }
4906// If the strides are not the same or repeated, we can't vectorize.
4907if ((Dist /Size) *Size != Dist || (Dist /Size) >= SCEVs.size())
4908return std::nullopt;
4909auto Res = Offsets.emplace(Dist, Cnt);
4910if (!Res.second)
4911return std::nullopt;
4912// Consecutive order if the inserted element is the last one.
4913 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4914 ++Cnt;
4915 }
4916if (Offsets.size() != SCEVs.size())
4917return std::nullopt;
4918 SortedIndices.clear();
4919if (!IsConsecutive) {
4920// Fill SortedIndices array only if it is non-consecutive.
4921 SortedIndices.resize(PointerOps.size());
4922 Cnt = 0;
4923for (const std::pair<int64_t, int> &Pair : Offsets) {
4924 SortedIndices[Cnt] = Pair.second;
4925 ++Cnt;
4926 }
4927 }
4928if (!Inst)
4929returnnullptr;
4930SCEVExpander Expander(SE,DL,"strided-load-vec");
4931return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4932}
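// Illustrative example (a sketch under simplifying assumptions): for pointers
// %p, %p+%s, %p+2*%s, %p+3*%s with a runtime (non-constant) stride %s and i8
// elements, the SCEV distance between the lowest and highest pointer is 3*%s,
// TryGetStride recovers %s, and every pointer's offset is a distinct multiple
// of %s, so the function reports a strided pattern. SortedIndices is filled
// only when the pointers are not already in offset order, and the stride value
// is materialized via SCEVExpander only when an insertion point \p Inst is
// given.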
4933
4934static std::pair<InstructionCost, InstructionCost>
4935getGEPCosts(constTargetTransformInfo &TTI,ArrayRef<Value *> Ptrs,
4936Value *BasePtr,unsigned Opcode,TTI::TargetCostKindCostKind,
4937Type *ScalarTy,VectorType *VecTy);
4938
4939/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4940 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
4941 /// insert-subvector pattern.
4942staticInstructionCost
4943getShuffleCost(constTargetTransformInfo &TTI,TTI::ShuffleKind Kind,
4944VectorType *Tp,ArrayRef<int> Mask = {},
4945TTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput,
4946intIndex = 0,VectorType *SubTp =nullptr,
4947ArrayRef<const Value *>Args = {}) {
4948if (Kind !=TTI::SK_PermuteTwoSrc)
4949returnTTI.getShuffleCost(Kind, Tp, Mask,CostKind,Index, SubTp, Args);
4950int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4951int NumSubElts;
4952if (Mask.size() > 2 &&ShuffleVectorInst::isInsertSubvectorMask(
4953 Mask, NumSrcElts, NumSubElts,Index)) {
4954if (Index + NumSubElts > NumSrcElts &&
4955Index + NumSrcElts <=static_cast<int>(Mask.size()))
4956returnTTI.getShuffleCost(
4957TTI::SK_InsertSubvector,
4958getWidenedType(Tp->getElementType(),Mask.size()), Mask,
4959TTI::TCK_RecipThroughput,Index, Tp);
4960 }
4961returnTTI.getShuffleCost(Kind, Tp, Mask,CostKind,Index, SubTp, Args);
4962}
4963
4964 /// Correctly creates insert_subvector, checking that the index is a multiple
4965 /// of the subvector's length. Otherwise, generates a shuffle using
4966 /// \p Generator or the default shuffle.
4967staticValue *createInsertVector(
4968IRBuilderBase &Builder,Value *Vec,Value *V,unsignedIndex,
4969function_ref<Value *(Value *,Value *,ArrayRef<int>)> Generator = {}) {
4970constunsigned SubVecVF =getNumElements(V->getType());
4971if (Index % SubVecVF == 0) {
4972 Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
4973 Builder.getInt64(Index));
4974 }else {
4975    // Create a shuffle; CreateInsertVector requires that the index is a
4976    // multiple of the subvector length.
4977constunsigned VecVF =getNumElements(Vec->getType());
4978SmallVector<int>Mask(VecVF,PoisonMaskElem);
4979 std::iota(Mask.begin(),Mask.end(), 0);
4980for (unsignedI : seq<unsigned>(SubVecVF))
4981Mask[I +Index] =I + VecVF;
4982if (Generator) {
4983 Vec = Generator(Vec, V, Mask);
4984 }else {
4985// 1. Resize V to the size of Vec.
4986SmallVector<int> ResizeMask(VecVF,PoisonMaskElem);
4987 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
4988V = Builder.CreateShuffleVector(V, ResizeMask);
4989 Vec = Builder.CreateShuffleVector(Vec, V, Mask);
4990 }
4991 }
4992return Vec;
4993}
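// Worked example (illustrative, not upstream text): inserting a <2 x float>
// subvector V into an <8 x float> vector Vec at Index == 3 cannot use
// Builder.CreateInsertVector (3 is not a multiple of 2), so the fallback
// builds the shuffle mask {0, 1, 2, 8, 9, 5, 6, 7}: lanes 3 and 4 are taken
// from the widened V, everything else from Vec. At Index == 4 the fast path
// emits the insert-vector intrinsic directly.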
4994
4995 /// Correctly creates extract_subvector, checking that the index is a multiple
4996 /// of the subvector's length. Otherwise, extracts the subvector elements with
4997 /// a regular shuffle.
4998staticValue *createExtractVector(IRBuilderBase &Builder,Value *Vec,
4999unsigned SubVecVF,unsignedIndex) {
5000if (Index % SubVecVF == 0) {
5001VectorType *SubVecTy =
5002getWidenedType(Vec->getType()->getScalarType(), SubVecVF);
5003return Builder.CreateExtractVector(SubVecTy, Vec, Builder.getInt64(Index));
5004 }
5005  // Create a shuffle; extract_subvector requires that the index is a multiple
5006  // of the subvector length.
5007SmallVector<int> Mask(SubVecVF,PoisonMaskElem);
5008 std::iota(Mask.begin(), Mask.end(),Index);
5009return Builder.CreateShuffleVector(Vec, Mask);
5010}
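// Worked example (illustrative, not upstream text): extracting a 2-element
// subvector from an <8 x float> value at Index == 3 is not a multiple of the
// subvector length, so the helper emits a shuffle with mask {3, 4}; at
// Index == 4 it uses Builder.CreateExtractVector instead.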
5011
5012BoUpSLP::LoadsState
5013BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL,constValue *VL0,
5014SmallVectorImpl<unsigned> &Order,
5015SmallVectorImpl<Value *> &PointerOps,
5016unsigned *BestVF,bool TryRecursiveCheck) const{
5017// Check that a vectorized load would load the same memory as a scalar
5018// load. For example, we don't want to vectorize loads that are smaller
5019// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
5020// treats loading/storing it as an i8 struct. If we vectorize loads/stores
5021// from such a struct, we read/write packed bits disagreeing with the
5022// unvectorized version.
5023if (BestVF)
5024 *BestVF = 0;
5025if (areKnownNonVectorizableLoads(VL))
5026returnLoadsState::Gather;
5027Type *ScalarTy = VL0->getType();
5028
5029if (DL->getTypeSizeInBits(ScalarTy) !=DL->getTypeAllocSizeInBits(ScalarTy))
5030returnLoadsState::Gather;
5031
5032// Make sure all loads in the bundle are simple - we can't vectorize
5033// atomic or volatile loads.
5034 PointerOps.clear();
5035constunsigned Sz = VL.size();
5036 PointerOps.resize(Sz);
5037auto *POIter = PointerOps.begin();
5038for (Value *V : VL) {
5039auto *L = dyn_cast<LoadInst>(V);
5040if (!L || !L->isSimple())
5041returnLoadsState::Gather;
5042 *POIter = L->getPointerOperand();
5043 ++POIter;
5044 }
5045
5046 Order.clear();
5047// Check the order of pointer operands or that all pointers are the same.
5048bool IsSorted =sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
5049
5050auto *VecTy =getWidenedType(ScalarTy, Sz);
5051Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5052if (!IsSorted) {
5053if (Sz >MinProfitableStridedLoads &&TTI->isTypeLegal(VecTy)) {
5054if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
5055calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
5056returnLoadsState::StridedVectorize;
5057 }
5058
5059if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5060TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5061returnLoadsState::Gather;
5062
5063if (!all_of(PointerOps, [&](Value *P) {
5064returnarePointersCompatible(P, PointerOps.front(), *TLI);
5065 }))
5066returnLoadsState::Gather;
5067
5068 }else {
5069Value *Ptr0;
5070Value *PtrN;
5071if (Order.empty()) {
5072 Ptr0 = PointerOps.front();
5073 PtrN = PointerOps.back();
5074 }else {
5075 Ptr0 = PointerOps[Order.front()];
5076 PtrN = PointerOps[Order.back()];
5077 }
5078 std::optional<int> Diff =
5079getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5080// Check that the sorted loads are consecutive.
5081if (static_cast<unsigned>(*Diff) == Sz - 1)
5082returnLoadsState::Vectorize;
5083if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5084TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5085returnLoadsState::Gather;
5086// Simple check if not a strided access - clear order.
5087bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5088// Try to generate strided load node if:
5089// 1. Target with strided load support is detected.
5090// 2. The number of loads is greater than MinProfitableStridedLoads,
5091// or the potential stride <= MaxProfitableLoadStride and the
5092// potential stride is power-of-2 (to avoid perf regressions for the very
5093// small number of loads) and max distance > number of loads, or potential
5094// stride is -1.
5095// 3. The loads are ordered, or number of unordered loads <=
5096// MaxProfitableUnorderedLoads, or loads are in reversed order.
5097// (this check is to avoid extra costs for very expensive shuffles).
5098// 4. Any pointer operand is an instruction with the users outside of the
5099// current graph (for masked gathers extra extractelement instructions
5100// might be required).
5101auto IsAnyPointerUsedOutGraph =
5102 IsPossibleStrided &&any_of(PointerOps, [&](Value *V) {
5103return isa<Instruction>(V) &&any_of(V->users(), [&](User *U) {
5104 return !getTreeEntry(U) && !MustGather.contains(U);
5105 });
5106 });
5107constunsigned AbsoluteDiff = std::abs(*Diff);
5108if (IsPossibleStrided &&
5109 (IsAnyPointerUsedOutGraph ||
5110 (AbsoluteDiff > Sz &&
5111 (Sz >MinProfitableStridedLoads ||
5112 (AbsoluteDiff <=MaxProfitableLoadStride * Sz &&
5113 AbsoluteDiff % Sz == 0 &&has_single_bit(AbsoluteDiff / Sz)))) ||
5114 *Diff == -(static_cast<int>(Sz) - 1))) {
5115int Stride = *Diff /static_cast<int>(Sz - 1);
5116if (*Diff == Stride *static_cast<int>(Sz - 1)) {
5117Align Alignment =
5118 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
5119 ->getAlign();
5120if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
5121// Iterate through all pointers and check if all distances are
5122// unique multiple of Dist.
5123SmallSet<int, 4> Dists;
5124for (Value *Ptr : PointerOps) {
5125int Dist = 0;
5126if (Ptr == PtrN)
5127 Dist = *Diff;
5128elseif (Ptr != Ptr0)
5129 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy,Ptr, *DL, *SE);
5130                // If the distances are not multiples of the stride, or are
5131                // repeated, we can't vectorize.
5132if (((Dist / Stride) * Stride) != Dist ||
5133 !Dists.insert(Dist).second)
5134break;
5135 }
5136if (Dists.size() == Sz)
5137returnLoadsState::StridedVectorize;
5138 }
5139 }
5140 }
5141 }
5142  // Compare the cost of loads + shuffles against the cost of strided/masked
5143  // gather loads. Returns true if the vectorized-loads + shuffles
5144  // representation is better than just a gather.
5145auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
5146unsigned *BestVF,
5147bool ProfitableGatherPointers) {
5148if (BestVF)
5149 *BestVF = 0;
5150// Compare masked gather cost and loads + insert subvector costs.
5151TTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
5152auto [ScalarGEPCost, VectorGEPCost] =
5153getGEPCosts(TTI, PointerOps, PointerOps.front(),
5154 Instruction::GetElementPtr,CostKind, ScalarTy, VecTy);
5155// Estimate the cost of masked gather GEP. If not a splat, roughly
5156// estimate as a buildvector, otherwise estimate as splat.
5157APInt DemandedElts =APInt::getAllOnes(VecTy->getNumElements());
5158VectorType *PtrVecTy =
5159getWidenedType(PointerOps.front()->getType()->getScalarType(),
5160 VecTy->getNumElements());
5161if (static_cast<unsigned>(count_if(
5162 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
5163any_of(PointerOps, [&](Value *V) {
5164returngetUnderlyingObject(V) !=
5165getUnderlyingObject(PointerOps.front());
5166 }))
5167 VectorGEPCost +=TTI.getScalarizationOverhead(
5168 PtrVecTy, DemandedElts,/*Insert=*/true,/*Extract=*/false,CostKind);
5169else
5170 VectorGEPCost +=
5171TTI.getScalarizationOverhead(
5172 PtrVecTy,APInt::getOneBitSet(VecTy->getNumElements(), 0),
5173/*Insert=*/true,/*Extract=*/false,CostKind) +
5174::getShuffleCost(TTI,TTI::SK_Broadcast, PtrVecTy, {},CostKind);
5175// The cost of scalar loads.
5176InstructionCost ScalarLoadsCost =
5177 std::accumulate(VL.begin(), VL.end(),InstructionCost(),
5178 [&](InstructionCostC,Value *V) {
5179returnC +TTI.getInstructionCost(
5180 cast<Instruction>(V),CostKind);
5181 }) +
5182 ScalarGEPCost;
5183// The cost of masked gather.
5184InstructionCost MaskedGatherCost =
5185TTI.getGatherScatterOpCost(
5186 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5187/*VariableMask=*/false, CommonAlignment,CostKind) +
5188 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5189InstructionCost GatherCost =
5190TTI.getScalarizationOverhead(VecTy, DemandedElts,/*Insert=*/true,
5191/*Extract=*/false,CostKind) +
5192 ScalarLoadsCost;
5193    // The list of loads is small, or a partial check was already performed -
5194    // directly compare the masked gather cost and the gather cost.
5195constexprunsigned ListLimit = 4;
5196if (!TryRecursiveCheck || VL.size() < ListLimit)
5197return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5198
5199// FIXME: The following code has not been updated for non-power-of-2
5200// vectors (and not whole registers). The splitting logic here does not
5201// cover the original vector if the vector factor is not a power of two.
5202if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
5203returnfalse;
5204
5205unsigned Sz =DL->getTypeSizeInBits(ScalarTy);
5206unsigned MinVF =getMinVF(2 * Sz);
5207 DemandedElts.clearAllBits();
5208// Iterate through possible vectorization factors and check if vectorized +
5209// shuffles is better than just gather.
5210for (unsigned VF =
5211getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
5212 VF >= MinVF;
5213 VF =getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
5214SmallVector<LoadsState> States;
5215for (unsigned Cnt = 0,End = VL.size(); Cnt + VF <=End; Cnt += VF) {
5216ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
5217SmallVector<unsigned> Order;
5218SmallVector<Value *> PointerOps;
5219LoadsState LS =
5220canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
5221/*TryRecursiveCheck=*/false);
5222        // The slice is gathered - record its lanes (or give up when searching for the best VF).
5223if (LS ==LoadsState::Gather) {
5224if (BestVF) {
5225 DemandedElts.setAllBits();
5226break;
5227 }
5228 DemandedElts.setBits(Cnt, Cnt + VF);
5229continue;
5230 }
5231        // If reordering is needed, consider it a high-cost masked gather for now.
5232if ((LS ==LoadsState::Vectorize ||
5233 LS ==LoadsState::StridedVectorize) &&
5234 !Order.empty() && !isReverseOrder(Order))
5235 LS =LoadsState::ScatterVectorize;
5236 States.push_back(LS);
5237 }
5238if (DemandedElts.isAllOnes())
5239// All loads gathered - try smaller VF.
5240continue;
5241      // Can be vectorized later as a series of loads/insertelements.
5242InstructionCost VecLdCost = 0;
5243if (!DemandedElts.isZero()) {
5244 VecLdCost =
5245TTI.getScalarizationOverhead(VecTy, DemandedElts,/*Insert=*/true,
5246/*Extract=*/false,CostKind) +
5247 ScalarGEPCost;
5248for (unsignedIdx : seq<unsigned>(VL.size()))
5249if (DemandedElts[Idx])
5250 VecLdCost +=
5251TTI.getInstructionCost(cast<Instruction>(VL[Idx]),CostKind);
5252 }
5253unsigned ScalarTyNumElements =getNumElements(ScalarTy);
5254auto *SubVecTy =getWidenedType(ScalarTy, VF);
5255for (auto [I, LS] :enumerate(States)) {
5256auto *LI0 = cast<LoadInst>(VL[I * VF]);
5257InstructionCost VectorGEPCost =
5258 (LS ==LoadsState::ScatterVectorize && ProfitableGatherPointers)
5259 ? 0
5260 :getGEPCosts(TTI,ArrayRef(PointerOps).slice(I * VF, VF),
5261 LI0->getPointerOperand(),
5262 Instruction::GetElementPtr,CostKind, ScalarTy,
5263 SubVecTy)
5264 .second;
5265if (LS ==LoadsState::ScatterVectorize) {
5266if (static_cast<unsigned>(
5267count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5268 PointerOps.size() - 1 ||
5269any_of(PointerOps, [&](Value *V) {
5270returngetUnderlyingObject(V) !=
5271getUnderlyingObject(PointerOps.front());
5272 }))
5273 VectorGEPCost +=TTI.getScalarizationOverhead(
5274 SubVecTy,APInt::getAllOnes(VF),
5275/*Insert=*/true,/*Extract=*/false,CostKind);
5276else
5277 VectorGEPCost +=
5278TTI.getScalarizationOverhead(
5279 SubVecTy,APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
5280/*Insert=*/true,/*Extract=*/false,CostKind) +
5281::getShuffleCost(TTI,TTI::SK_Broadcast, SubVecTy, {},
5282CostKind);
5283 }
5284switch (LS) {
5285caseLoadsState::Vectorize:
5286 VecLdCost +=
5287TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
5288 LI0->getPointerAddressSpace(),CostKind,
5289TTI::OperandValueInfo()) +
5290 VectorGEPCost;
5291break;
5292caseLoadsState::StridedVectorize:
5293 VecLdCost +=TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
5294 LI0->getPointerOperand(),
5295/*VariableMask=*/false,
5296 CommonAlignment,CostKind) +
5297 VectorGEPCost;
5298break;
5299caseLoadsState::ScatterVectorize:
5300 VecLdCost +=TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
5301 LI0->getPointerOperand(),
5302/*VariableMask=*/false,
5303 CommonAlignment,CostKind) +
5304 VectorGEPCost;
5305break;
5306caseLoadsState::Gather:
5307// Gathers are already calculated - ignore.
5308continue;
5309 }
5310SmallVector<int> ShuffleMask(VL.size());
5311for (intIdx : seq<int>(0, VL.size()))
5312 ShuffleMask[Idx] =Idx / VF ==I ? VL.size() +Idx % VF :Idx;
5313if (I > 0)
5314 VecLdCost +=
5315::getShuffleCost(TTI,TTI::SK_InsertSubvector, VecTy, ShuffleMask,
5316CostKind,I * VF, SubVecTy);
5317 }
5318// If masked gather cost is higher - better to vectorize, so
5319// consider it as a gather node. It will be better estimated
5320// later.
5321if (MaskedGatherCost >= VecLdCost &&
5322 VecLdCost - GatherCost < -SLPCostThreshold) {
5323if (BestVF)
5324 *BestVF = VF;
5325returntrue;
5326 }
5327 }
5328return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5329 };
5330// TODO: need to improve analysis of the pointers, if not all of them are
5331// GEPs or have > 2 operands, we end up with a gather node, which just
5332// increases the cost.
5333Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
5334bool ProfitableGatherPointers =
5335 L && Sz > 2 &&static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
5336return L->isLoopInvariant(V);
5337 })) <= Sz / 2;
5338if (ProfitableGatherPointers ||all_of(PointerOps, [](Value *P) {
5339auto *GEP = dyn_cast<GetElementPtrInst>(P);
5340return (!GEP &&doesNotNeedToBeScheduled(P)) ||
5341 (GEP &&GEP->getNumOperands() == 2 &&
5342 isa<Constant, Instruction>(GEP->getOperand(1)));
5343 })) {
5344// Check if potential masked gather can be represented as series
5345// of loads + insertsubvectors.
5346// If masked gather cost is higher - better to vectorize, so
5347// consider it as a gather node. It will be better estimated
5348// later.
5349if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5350 ProfitableGatherPointers))
5351returnLoadsState::ScatterVectorize;
5352 }
5353
5354returnLoadsState::Gather;
5355}
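// Summary with small examples (illustrative, not upstream text): for i32 loads
// from p[0], p[1], p[2], p[3] the pointer diff equals Sz - 1, so the bundle is
// classified as LoadsState::Vectorize; loads from p[0], p[2], p[4], p[6] have
// a constant stride of 2 and may become LoadsState::StridedVectorize when the
// target reports isLegalStridedLoadStore; unordered pointers that only pass
// the masked-gather legality and cost checks may end up as
// LoadsState::ScatterVectorize, and anything else falls back to
// LoadsState::Gather.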
5356
5357staticboolclusterSortPtrAccesses(ArrayRef<Value *> VL,
5358ArrayRef<BasicBlock *> BBs,Type *ElemTy,
5359constDataLayout &DL,ScalarEvolution &SE,
5360SmallVectorImpl<unsigned> &SortedIndices) {
5361assert(
5362all_of(VL, [](constValue *V) {return V->getType()->isPointerTy(); }) &&
5363"Expected list of pointer operands.");
5364  // Map from bases to a vector of (Ptr, Offset, OrigIdx) triples. We insert
5365  // each Ptr into the map, sort each group by Offset, and return the sorted
5366  // indices so that related values end up next to one another.
5367SmallMapVector<std::pair<BasicBlock *, Value *>,
5368SmallVector<SmallVector<std::tuple<Value *, int, unsigned>>>, 8>
5369 Bases;
5370 Bases
5371 .try_emplace(std::make_pair(
5372 BBs.front(),getUnderlyingObject(VL.front(),RecursionMaxDepth)))
5373 .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
5374
5375 SortedIndices.clear();
5376for (auto [Cnt,Ptr] :enumerate(VL.drop_front())) {
5377auto Key = std::make_pair(BBs[Cnt + 1],
5378getUnderlyingObject(Ptr,RecursionMaxDepth));
5379bool Found =any_of(Bases.try_emplace(Key).first->second,
5380 [&, &Cnt = Cnt, &Ptr =Ptr](auto &Base) {
5381 std::optional<int> Diff = getPointersDiff(
5382 ElemTy, std::get<0>(Base.front()), ElemTy,
5383 Ptr, DL, SE,
5384/*StrictCheck=*/true);
5385 if (!Diff)
5386 return false;
5387
5388 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5389 return true;
5390 });
5391
5392if (!Found) {
5393// If we haven't found enough to usefully cluster, return early.
5394if (Bases.size() > VL.size() / 2 - 1)
5395returnfalse;
5396
5397// Not found already - add a new Base
5398 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
5399 }
5400 }
5401
5402if (Bases.size() == VL.size())
5403returnfalse;
5404
5405if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
5406 Bases.front().second.size() == VL.size()))
5407returnfalse;
5408
5409  // For each of the bases, sort the pointers by Offset and check if any of
5410  // the bases become consecutively allocated.
5411auto ComparePointers = [](Value *Ptr1,Value *Ptr2) {
5412SmallPtrSet<Value *, 13> FirstPointers;
5413SmallPtrSet<Value *, 13> SecondPointers;
5414Value *P1 = Ptr1;
5415Value *P2 = Ptr2;
5416unsignedDepth = 0;
5417while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
5418if (P1 == P2 ||Depth >RecursionMaxDepth)
5419returnfalse;
5420 FirstPointers.insert(P1);
5421 SecondPointers.insert(P2);
5422 P1 =getUnderlyingObject(P1,/*MaxLookup=*/1);
5423 P2 =getUnderlyingObject(P2,/*MaxLookup=*/1);
5424 ++Depth;
5425 }
5426assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
5427"Unable to find matching root.");
5428return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
5429 };
5430for (auto &Base : Bases) {
5431for (auto &Vec :Base.second) {
5432if (Vec.size() > 1) {
5433stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
5434const std::tuple<Value *, int, unsigned> &Y) {
5435return std::get<1>(X) < std::get<1>(Y);
5436 });
5437int InitialOffset = std::get<1>(Vec[0]);
5438bool AnyConsecutive =
5439all_of(enumerate(Vec), [InitialOffset](constauto &P) {
5440return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
5441 });
5442// Fill SortedIndices array only if it looks worth-while to sort the
5443// ptrs.
5444if (!AnyConsecutive)
5445returnfalse;
5446 }
5447 }
5448stable_sort(Base.second, [&](constauto &V1,constauto &V2) {
5449 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5450 });
5451 }
5452
5453for (auto &T : Bases)
5454for (constauto &Vec :T.second)
5455for (constauto &P : Vec)
5456 SortedIndices.push_back(std::get<2>(P));
5457
5458assert(SortedIndices.size() == VL.size() &&
5459"Expected SortedIndices to be the size of VL");
5460returntrue;
5461}
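// Illustrative example (a sketch): for pointers {A, B, A+1, B+1} coming from
// two different underlying objects A and B, the function groups the pointers
// per (basic block, base) pair, sorts each group by constant offset, and emits
// SortedIndices so that A, A+1 appear next to each other followed by B, B+1
// (or vice versa, depending on how ComparePointers orders the two bases).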
5462
5463std::optional<BoUpSLP::OrdersType>
5464BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5465assert(TE.isGather() &&"Expected gather node only.");
5466Type *ScalarTy = TE.Scalars[0]->getType();
5467
5468SmallVector<Value *> Ptrs;
5469 Ptrs.reserve(TE.Scalars.size());
5470SmallVector<BasicBlock *> BBs;
5471 BBs.reserve(TE.Scalars.size());
5472for (Value *V : TE.Scalars) {
5473auto *L = dyn_cast<LoadInst>(V);
5474if (!L || !L->isSimple())
5475return std::nullopt;
5476 Ptrs.push_back(L->getPointerOperand());
5477 BBs.push_back(L->getParent());
5478 }
5479
5480BoUpSLP::OrdersType Order;
5481if (!LoadEntriesToVectorize.contains(TE.Idx) &&
5482clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
5483return std::move(Order);
5484return std::nullopt;
5485}
5486
5487/// Check if two insertelement instructions are from the same buildvector.
5488staticboolareTwoInsertFromSameBuildVector(
5489InsertElementInst *VU,InsertElementInst *V,
5490function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5491// Instructions must be from the same basic blocks.
5492if (VU->getParent() != V->getParent())
5493returnfalse;
5494// Checks if 2 insertelements are from the same buildvector.
5495if (VU->getType() != V->getType())
5496returnfalse;
5497// Multiple used inserts are separate nodes.
5498if (!VU->hasOneUse() && !V->hasOneUse())
5499returnfalse;
5500auto *IE1 = VU;
5501auto *IE2 = V;
5502 std::optional<unsigned> Idx1 =getElementIndex(IE1);
5503 std::optional<unsigned> Idx2 =getElementIndex(IE2);
5504if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5505returnfalse;
5506// Go through the vector operand of insertelement instructions trying to find
5507// either VU as the original vector for IE2 or V as the original vector for
5508// IE1.
5509SmallBitVector ReusedIdx(
5510 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5511bool IsReusedIdx =false;
5512do {
5513if (IE2 == VU && !IE1)
5514return VU->hasOneUse();
5515if (IE1 == V && !IE2)
5516return V->hasOneUse();
5517if (IE1 && IE1 != V) {
5518unsigned Idx1 =getElementIndex(IE1).value_or(*Idx2);
5519 IsReusedIdx |= ReusedIdx.test(Idx1);
5520 ReusedIdx.set(Idx1);
5521if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5522 IE1 =nullptr;
5523else
5524 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5525 }
5526if (IE2 && IE2 != VU) {
5527unsigned Idx2 =getElementIndex(IE2).value_or(*Idx1);
5528 IsReusedIdx |= ReusedIdx.test(Idx2);
5529 ReusedIdx.set(Idx2);
5530if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5531 IE2 =nullptr;
5532else
5533 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5534 }
5535 }while (!IsReusedIdx && (IE1 || IE2));
5536returnfalse;
5537}
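// Illustrative example (assuming GetBaseOperand returns the vector operand of
// an insertelement): for the buildvector chain
//   %b0 = insertelement <2 x float> poison, float %x, i32 0
//   %b1 = insertelement <2 x float> %b0,    float %y, i32 1
// areTwoInsertFromSameBuildVector(%b1, %b0, GetBaseOperand) walks the vector
// operands, reaches %b0 as the base of %b1, and returns true as long as %b0 is
// used only by %b1 and no lane index is written twice along the chain.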
5538
5539std::optional<BoUpSLP::OrdersType>
5540BoUpSLP::getReorderingData(const TreeEntry &TE,bool TopToBottom) {
5541  // No need to reorder if we need to shuffle reuses - the node still needs to
5542  // be shuffled anyway.
5543if (!TE.ReuseShuffleIndices.empty()) {
5544// FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5545assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
5546"Reshuffling scalars not yet supported for nodes with padding");
5547
5548if (isSplat(TE.Scalars))
5549return std::nullopt;
5550    // Check if the reuse shuffle indices can be improved by reordering.
5551    // For this, check that the reuse mask is "clustered", i.e. each scalar
5552    // value is used once in each submask of size <number_of_scalars>.
5553// Example: 4 scalar values.
5554// ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5555// 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5556// element 3 is used twice in the second submask.
5557unsigned Sz = TE.Scalars.size();
5558if (TE.isGather()) {
5559if (std::optional<OrdersType> CurrentOrder =
5560findReusedOrderedScalars(TE)) {
5561SmallVector<int> Mask;
5562fixupOrderingIndices(*CurrentOrder);
5563inversePermutation(*CurrentOrder, Mask);
5564::addMask(Mask, TE.ReuseShuffleIndices);
5565OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5566unsigned Sz = TE.Scalars.size();
5567for (int K = 0,E = TE.getVectorFactor() / Sz; K <E; ++K) {
5568for (auto [I,Idx] :enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5569if (Idx !=PoisonMaskElem)
5570 Res[Idx + K * Sz] =I + K * Sz;
5571 }
5572return std::move(Res);
5573 }
5574 }
5575if (Sz == 2 && TE.getVectorFactor() == 4 &&
5576TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5577 2 * TE.getVectorFactor())) == 1)
5578return std::nullopt;
5579if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5580 Sz)) {
5581SmallVector<int> ReorderMask(Sz,PoisonMaskElem);
5582if (TE.ReorderIndices.empty())
5583 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5584else
5585inversePermutation(TE.ReorderIndices, ReorderMask);
5586::addMask(ReorderMask, TE.ReuseShuffleIndices);
5587unsigned VF = ReorderMask.size();
5588OrdersType ResOrder(VF, VF);
5589unsigned NumParts =divideCeil(VF, Sz);
5590SmallBitVector UsedVals(NumParts);
5591for (unsignedI = 0;I < VF;I += Sz) {
5592int Val =PoisonMaskElem;
5593unsigned UndefCnt = 0;
5594unsigned Limit = std::min(Sz, VF -I);
5595if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5596 [&](intIdx) {
5597if (Val ==PoisonMaskElem &&Idx !=PoisonMaskElem)
5598 Val =Idx;
5599if (Idx ==PoisonMaskElem)
5600 ++UndefCnt;
5601returnIdx !=PoisonMaskElem &&Idx != Val;
5602 }) ||
5603 Val >=static_cast<int>(NumParts) || UsedVals.test(Val) ||
5604 UndefCnt > Sz / 2)
5605return std::nullopt;
5606 UsedVals.set(Val);
5607for (unsigned K = 0; K < NumParts; ++K) {
5608unsignedIdx = Val + Sz * K;
5609if (Idx < VF)
5610 ResOrder[Idx] =I + K;
5611 }
5612 }
5613return std::move(ResOrder);
5614 }
5615unsigned VF = TE.getVectorFactor();
5616    // Try to build the correct order for extractelement instructions.
5617SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5618 TE.ReuseShuffleIndices.end());
5619if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
5620all_of(TE.Scalars, [Sz](Value *V) {
5621 if (isa<PoisonValue>(V))
5622 return true;
5623 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5624 return Idx && *Idx < Sz;
5625 })) {
5626assert(!TE.isAltShuffle() &&"Alternate instructions are only supported "
5627"by BinaryOperator and CastInst.");
5628SmallVector<int> ReorderMask(Sz,PoisonMaskElem);
5629if (TE.ReorderIndices.empty())
5630 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5631else
5632inversePermutation(TE.ReorderIndices, ReorderMask);
5633for (unsignedI = 0;I < VF; ++I) {
5634int &Idx = ReusedMask[I];
5635if (Idx ==PoisonMaskElem)
5636continue;
5637Value *V = TE.Scalars[ReorderMask[Idx]];
5638 std::optional<unsigned> EI =getExtractIndex(cast<Instruction>(V));
5639Idx = std::distance(ReorderMask.begin(),find(ReorderMask, *EI));
5640 }
5641 }
5642    // Build the order of VF size; the reuses shuffles need to be reordered,
5643    // since they are always of VF size.
5644OrdersType ResOrder(VF);
5645 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5646auto *It = ResOrder.begin();
5647for (unsigned K = 0; K < VF; K += Sz) {
5648OrdersType CurrentOrder(TE.ReorderIndices);
5649SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5650if (SubMask.front() ==PoisonMaskElem)
5651 std::iota(SubMask.begin(), SubMask.end(), 0);
5652reorderOrder(CurrentOrder, SubMask);
5653transform(CurrentOrder, It, [K](unsigned Pos) {return Pos + K; });
5654 std::advance(It, Sz);
5655 }
5656if (TE.isGather() &&all_of(enumerate(ResOrder), [](constauto &Data) {
5657returnData.index() ==Data.value();
5658 }))
5659return std::nullopt;// No need to reorder.
5660return std::move(ResOrder);
5661 }
5662if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5663any_of(TE.UserTreeIndices,
5664 [](constEdgeInfo &EI) {
5665 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5666 }) &&
5667 (TE.ReorderIndices.empty() ||isReverseOrder(TE.ReorderIndices)))
5668return std::nullopt;
5669if ((TE.State == TreeEntry::Vectorize ||
5670 TE.State == TreeEntry::StridedVectorize) &&
5671 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5672 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5673assert(!TE.isAltShuffle() &&"Alternate instructions are only supported by "
5674"BinaryOperator and CastInst.");
5675return TE.ReorderIndices;
5676 }
5677if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5678if (!TE.ReorderIndices.empty())
5679return TE.ReorderIndices;
5680
5681SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
5682for (auto [I, V] :zip(UserBVHead, TE.Scalars)) {
5683if (!V->hasNUsesOrMore(1))
5684continue;
5685auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
5686if (!II)
5687continue;
5688Instruction *BVHead =nullptr;
5689BasicBlock *BB =II->getParent();
5690while (II &&II->hasOneUse() &&II->getParent() == BB) {
5691 BVHead =II;
5692II = dyn_cast<InsertElementInst>(II->getOperand(0));
5693 }
5694I = BVHead;
5695 }
5696
5697auto CompareByBasicBlocks = [&](BasicBlock *BB1,BasicBlock *BB2) {
5698assert(BB1 != BB2 &&"Expected different basic blocks.");
5699auto *NodeA = DT->getNode(BB1);
5700auto *NodeB = DT->getNode(BB2);
5701assert(NodeA &&"Should only process reachable instructions");
5702assert(NodeB &&"Should only process reachable instructions");
5703assert((NodeA == NodeB) ==
5704 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5705"Different nodes should have different DFS numbers");
5706return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5707 };
5708auto PHICompare = [&](unsigned I1,unsigned I2) {
5709Value *V1 = TE.Scalars[I1];
5710Value *V2 = TE.Scalars[I2];
5711if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5712returnfalse;
5713if (isa<PoisonValue>(V1))
5714returntrue;
5715if (isa<PoisonValue>(V2))
5716returnfalse;
5717if (V1->getNumUses() < V2->getNumUses())
5718returntrue;
5719if (V1->getNumUses() > V2->getNumUses())
5720returnfalse;
5721auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5722auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5723if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5724return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5725 FirstUserOfPhi2->getParent());
5726auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5727auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5728auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5729auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5730if (IE1 && !IE2)
5731returntrue;
5732if (!IE1 && IE2)
5733returnfalse;
5734if (IE1 && IE2) {
5735if (UserBVHead[I1] && !UserBVHead[I2])
5736returntrue;
5737if (!UserBVHead[I1])
5738returnfalse;
5739if (UserBVHead[I1] == UserBVHead[I2])
5740returngetElementIndex(IE1) <getElementIndex(IE2);
5741if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
5742return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
5743 UserBVHead[I2]->getParent());
5744return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5745 }
5746if (EE1 && !EE2)
5747returntrue;
5748if (!EE1 && EE2)
5749returnfalse;
5750if (EE1 && EE2) {
5751auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5752auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5753auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5754auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5755if (!Inst2 && !P2)
5756return Inst1 || P1;
5757if (EE1->getOperand(0) == EE2->getOperand(0))
5758returngetElementIndex(EE1) <getElementIndex(EE2);
5759if (!Inst1 && Inst2)
5760returnfalse;
5761if (Inst1 && Inst2) {
5762if (Inst1->getParent() != Inst2->getParent())
5763return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
5764return Inst1->comesBefore(Inst2);
5765 }
5766if (!P1 && P2)
5767returnfalse;
5768assert(P1 && P2 &&
5769"Expected either instructions or arguments vector operands.");
5770return P1->getArgNo() < P2->getArgNo();
5771 }
5772returnfalse;
5773 };
5774OrdersType Phis(TE.Scalars.size());
5775 std::iota(Phis.begin(), Phis.end(), 0);
5776stable_sort(Phis, PHICompare);
5777if (isIdentityOrder(Phis))
5778return std::nullopt;// No need to reorder.
5779return std::move(Phis);
5780 }
5781if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
5782allSameType(TE.Scalars)) {
5783// TODO: add analysis of other gather nodes with extractelement
5784// instructions and other values/instructions, not only undefs.
5785if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
5786 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5787any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5788all_of(TE.Scalars, [](Value *V) {
5789 auto *EE = dyn_cast<ExtractElementInst>(V);
5790 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5791 })) {
5792// Check that gather of extractelements can be represented as
5793// just a shuffle of a single vector.
5794OrdersType CurrentOrder;
5795bool Reuse =
5796 canReuseExtract(TE.Scalars, CurrentOrder,/*ResizeAllowed=*/true);
5797if (Reuse || !CurrentOrder.empty())
5798return std::move(CurrentOrder);
5799 }
5800// If the gather node is <undef, v, .., poison> and
5801// insertelement poison, v, 0 [+ permute]
5802// is cheaper than
5803// insertelement poison, v, n - try to reorder.
5804// If rotating the whole graph, exclude the permute cost, the whole graph
5805// might be transformed.
5806int Sz = TE.Scalars.size();
5807if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5808count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5809constauto *It =
5810find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5811if (It == TE.Scalars.begin())
        return OrdersType();
5813auto *Ty =getWidenedType(TE.Scalars.front()->getType(), Sz);
5814if (It != TE.Scalars.end()) {
5815OrdersType Order(Sz, Sz);
5816unsignedIdx = std::distance(TE.Scalars.begin(), It);
5817 Order[Idx] = 0;
5818fixupOrderingIndices(Order);
5819SmallVector<int> Mask;
5820inversePermutation(Order, Mask);
5821InstructionCost PermuteCost =
5822 TopToBottom
5823 ? 0
                : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
5825InstructionCost InsertFirstCost =TTI->getVectorInstrCost(
5826 Instruction::InsertElement, Ty,TTI::TCK_RecipThroughput, 0,
5827PoisonValue::get(Ty), *It);
5828InstructionCost InsertIdxCost =TTI->getVectorInstrCost(
5829 Instruction::InsertElement, Ty,TTI::TCK_RecipThroughput,Idx,
5830PoisonValue::get(Ty), *It);
5831if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5832OrdersType Order(Sz, Sz);
5833 Order[Idx] = 0;
5834return std::move(Order);
5835 }
5836 }
5837 }
5838if (isSplat(TE.Scalars))
5839return std::nullopt;
5840if (TE.Scalars.size() >= 3)
5841if (std::optional<OrdersType> Order =findPartiallyOrderedLoads(TE))
5842return Order;
    // Check if we can include the order of vectorized loads. For masked
    // gathers do extra analysis later, so include such nodes into a special
    // list.
5845if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
5846SmallVector<Value *> PointerOps;
5847OrdersType CurrentOrder;
5848LoadsState Res =canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
5849 CurrentOrder, PointerOps);
5850if (Res ==LoadsState::Vectorize || Res ==LoadsState::StridedVectorize)
5851return std::move(CurrentOrder);
5852 }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been audited for correctness with non-power-of-two vectors.
5855if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
5856if (std::optional<OrdersType> CurrentOrder =findReusedOrderedScalars(TE))
5857return CurrentOrder;
5858 }
5859return std::nullopt;
5860}
5861
5862/// Checks if the given mask is a "clustered" mask with the same clusters of
5863/// size \p Sz, which are not identity submasks.
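/// For example, with \p Sz == 2 the mask {1, 0, 1, 0} qualifies, while
/// {1, 0, 3, 2} (clusters differ) and {0, 1, 0, 1} (identity first cluster)
/// do not.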
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}
5876
5877void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,ArrayRef<int> Mask) const{
5878// Reorder reuses mask.
5879reorderReuses(TE.ReuseShuffleIndices, Mask);
5880constunsigned Sz =TE.Scalars.size();
  // For vectorized nodes and non-clustered reuses, no need to do anything else.
5882if (!TE.isGather() ||
5883 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5884 Sz) ||
5885 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5886return;
5887SmallVector<int> NewMask;
5888inversePermutation(TE.ReorderIndices, NewMask);
5889addMask(NewMask,TE.ReuseShuffleIndices);
5890// Clear reorder since it is going to be applied to the new mask.
5891TE.ReorderIndices.clear();
5892// Try to improve gathered nodes with clustered reuses, if possible.
5893ArrayRef<int> Slice =ArrayRef(NewMask).slice(0, Sz);
5894SmallVector<unsigned> NewOrder(Slice);
5895inversePermutation(NewOrder, NewMask);
5896reorderScalars(TE.Scalars, NewMask);
5897// Fill the reuses mask with the identity submasks.
5898for (auto *It =TE.ReuseShuffleIndices.begin(),
5899 *End =TE.ReuseShuffleIndices.end();
5900 It !=End; std::advance(It, Sz))
5901 std::iota(It, std::next(It, Sz), 0);
5902}
5903
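/// Combines two orders: positions of \p Order that are still unset (equal to
/// the size marker Sz) are filled with their own index if \p SecondaryOrder is
/// empty, or with the matching \p SecondaryOrder entry otherwise, but only
/// when the candidate index is not already used in \p Order. Illustrative
/// example (Sz == 4): Order = {2, 4, 1, 4} combined with SecondaryOrder =
/// {2, 3, 1, 0} yields {2, 3, 1, 0}; combined with an empty SecondaryOrder it
/// yields {2, 4, 1, 3}.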
static void combineOrders(MutableArrayRef<unsigned> Order,
                          ArrayRef<unsigned> SecondaryOrder) {
  assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
         "Expected same size of orders");
  unsigned Sz = Order.size();
  SmallBitVector UsedIndices(Sz);
  for (unsigned Idx : seq<unsigned>(0, Sz)) {
    if (Order[Idx] != Sz)
      UsedIndices.set(Order[Idx]);
  }
  if (SecondaryOrder.empty()) {
    for (unsigned Idx : seq<unsigned>(0, Sz))
      if (Order[Idx] == Sz && !UsedIndices.test(Idx))
        Order[Idx] = Idx;
  } else {
    for (unsigned Idx : seq<unsigned>(0, Sz))
      if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
          !UsedIndices.test(SecondaryOrder[Idx]))
        Order[Idx] = SecondaryOrder[Idx];
  }
}
5925
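// Top-to-bottom reordering: collect reorderable tree entries grouped by their
// vector factor, count how many entries prefer each order (preferring the
// identity order on ties), and apply the winning order to all nodes with that
// vector factor, merging it into reuse masks where needed.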
void BoUpSLP::reorderTopToBottom() {
5927// Maps VF to the graph nodes.
5928DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5929// ExtractElement gather nodes which can be vectorized and need to handle
5930// their ordering.
5931DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5932
5933// Phi nodes can have preferred ordering based on their result users
5934DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5935
5936// AltShuffles can also have a preferred ordering that leads to fewer
5937// instructions, e.g., the addsub instruction in x86.
5938DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5939
5940// Maps a TreeEntry to the reorder indices of external users.
5941DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5942 ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores, loads, extracts + some gathering of
  // extracts.
5946for_each(VectorizableTree, [&, &TTIRef = *TTI](
5947const std::unique_ptr<TreeEntry> &TE) {
5948// Look for external users that will probably be vectorized.
5949SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5950 findExternalStoreUsersReorderIndices(TE.get());
5951if (!ExternalUserReorderIndices.empty()) {
5952 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5953 ExternalUserReorderMap.try_emplace(TE.get(),
5954 std::move(ExternalUserReorderIndices));
5955 }
5956
5957// Patterns like [fadd,fsub] can be combined into a single instruction in
5958// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5959// to take into account their order when looking for the most used order.
5960if (TE->hasState() && TE->isAltShuffle()) {
5961VectorType *VecTy =
5962getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5963unsigned Opcode0 = TE->getOpcode();
5964unsigned Opcode1 = TE->getAltOpcode();
5965SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5966// If this pattern is supported by the target then we consider the order.
5967if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5968 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5969 AltShufflesToOrders.try_emplace(TE.get(),OrdersType());
5970 }
5971// TODO: Check the reverse order too.
5972 }
5973
5974if (std::optional<OrdersType> CurrentOrder =
5975getReorderingData(*TE,/*TopToBottom=*/true)) {
      // Do not include ordering for nodes used in the alt opcode vectorization,
      // better to reorder them during bottom-to-top stage. If we follow the
      // order here, it causes reordering of the whole graph though actually it
      // is profitable just to reorder the subgraph that starts from the
      // alternate opcode vectorization node. Such nodes already end up with
      // the shuffle instruction and it is just enough to change this shuffle
      // rather than rotate the scalars for the whole graph.
5983unsigned Cnt = 0;
5984const TreeEntry *UserTE = TE.get();
5985while (UserTE && Cnt <RecursionMaxDepth) {
5986if (UserTE->UserTreeIndices.size() != 1)
5987break;
5988if (all_of(UserTE->UserTreeIndices, [](constEdgeInfo &EI) {
5989 return EI.UserTE->State == TreeEntry::Vectorize &&
5990 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5991 }))
5992return;
5993 UserTE = UserTE->UserTreeIndices.back().UserTE;
5994 ++Cnt;
5995 }
5996 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5997if (!(TE->State == TreeEntry::Vectorize ||
5998 TE->State == TreeEntry::StridedVectorize) ||
5999 !TE->ReuseShuffleIndices.empty())
6000 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
6001if (TE->State == TreeEntry::Vectorize &&
6002 TE->getOpcode() == Instruction::PHI)
6003 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
6004 }
6005 });
6006
6007// Reorder the graph nodes according to their vectorization factor.
6008for (unsigned VF = VectorizableTree.front()->getVectorFactor();
6009 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
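    // Note: the step expression decreases VF by 2 while it is even and by 1
    // when it is odd, e.g. 16 -> 14 -> ... -> 2 or 7 -> 6 -> 4 -> 2, so an
    // odd starting vector factor is still visited.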
6010auto It = VFToOrderedEntries.find(VF);
6011if (It == VFToOrderedEntries.end())
6012continue;
    // Try to find the most profitable order. We are just looking for the most
    // used order and reorder scalar elements in the nodes according to this
    // most used order.
6016ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
6017// Delete VF entry upon exit.
    auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
6019
6020// All operands are reordered and used only in this node - propagate the
6021// most used order to the user node.
6022MapVector<OrdersType,unsigned,
6023DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
6024 OrdersUses;
6025SmallPtrSet<const TreeEntry *, 4> VisitedOps;
6026for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes, still need to extend and to use
      // shuffle, just need to merge reordering shuffle and the reuse shuffle.
6029if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6030continue;
6031// Count number of orders uses.
6032constauto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6033 &PhisToOrders]() ->constOrdersType & {
6034if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6035auto It = GathersToOrders.find(OpTE);
6036if (It != GathersToOrders.end())
6037return It->second;
6038 }
6039if (OpTE->hasState() && OpTE->isAltShuffle()) {
6040auto It = AltShufflesToOrders.find(OpTE);
6041if (It != AltShufflesToOrders.end())
6042return It->second;
6043 }
6044if (OpTE->State == TreeEntry::Vectorize &&
6045 OpTE->getOpcode() == Instruction::PHI) {
6046auto It = PhisToOrders.find(OpTE);
6047if (It != PhisToOrders.end())
6048return It->second;
6049 }
6050return OpTE->ReorderIndices;
6051 }();
6052// First consider the order of the external scalar users.
6053auto It = ExternalUserReorderMap.find(OpTE);
6054if (It != ExternalUserReorderMap.end()) {
6055constauto &ExternalUserReorderIndices = It->second;
6056// If the OpTE vector factor != number of scalars - use natural order,
6057// it is an attempt to reorder node with reused scalars but with
6058// external uses.
6059if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6060 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
6061 ExternalUserReorderIndices.size();
6062 }else {
6063for (constOrdersType &ExtOrder : ExternalUserReorderIndices)
6064 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6065 }
6066// No other useful reorder data in this entry.
6067if (Order.empty())
6068continue;
6069 }
6070// Stores actually store the mask, not the order, need to invert.
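      // (Illustrative example: a recorded store "order" of {2, 0, 1} is really
      // a mask; inverting it yields the order {1, 2, 0}, which is what gets
      // counted in OrdersUses.)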
6071if (OpTE->State == TreeEntry::Vectorize &&
6072 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6073assert(!OpTE->isAltShuffle() &&
6074"Alternate instructions are only supported by BinaryOperator "
6075"and CastInst.");
6076SmallVector<int> Mask;
6077inversePermutation(Order, Mask);
6078unsignedE = Order.size();
6079OrdersType CurrentOrder(E,E);
6080transform(Mask, CurrentOrder.begin(), [E](intIdx) {
6081 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6082 });
6083fixupOrderingIndices(CurrentOrder);
6084 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6085 }else {
6086 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6087 }
6088 }
6089if (OrdersUses.empty())
6090continue;
6091// Choose the most used order.
6092unsigned IdentityCnt = 0;
6093unsigned FilledIdentityCnt = 0;
6094OrdersType IdentityOrder(VF, VF);
6095for (auto &Pair : OrdersUses) {
6096if (Pair.first.empty() ||isIdentityOrder(Pair.first)) {
6097if (!Pair.first.empty())
6098 FilledIdentityCnt += Pair.second;
6099 IdentityCnt += Pair.second;
6100combineOrders(IdentityOrder, Pair.first);
6101 }
6102 }
6103MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6104unsigned Cnt = IdentityCnt;
6105for (auto &Pair : OrdersUses) {
      // Prefer identity order. But if a filled identity (non-empty order) was
      // found with the same number of uses as the new candidate order, we can
      // choose this candidate order.
6109if (Cnt < Pair.second ||
6110 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6111 Cnt == Pair.second && !BestOrder.empty() &&
6112isIdentityOrder(BestOrder))) {
6113combineOrders(Pair.first, BestOrder);
6114 BestOrder = Pair.first;
6115 Cnt = Pair.second;
6116 }else {
6117combineOrders(BestOrder, Pair.first);
6118 }
6119 }
6120// Set order of the user node.
6121if (isIdentityOrder(BestOrder))
6122continue;
6123fixupOrderingIndices(BestOrder);
6124SmallVector<int> Mask;
6125inversePermutation(BestOrder, Mask);
6126SmallVector<int> MaskOrder(BestOrder.size(),PoisonMaskElem);
6127unsignedE = BestOrder.size();
6128transform(BestOrder, MaskOrder.begin(), [E](unsignedI) {
6129 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6130 });
6131// Do an actual reordering, if profitable.
6132for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6133// Just do the reordering for the nodes with the given VF.
6134if (TE->Scalars.size() != VF) {
6135if (TE->ReuseShuffleIndices.size() == VF) {
6136// Need to reorder the reuses masks of the operands with smaller VF to
6137// be able to find the match between the graph nodes and scalar
6138// operands of the given node during vectorization/cost estimation.
6139assert(all_of(TE->UserTreeIndices,
6140 [VF, &TE](constEdgeInfo &EI) {
6141 return EI.UserTE->Scalars.size() == VF ||
6142 EI.UserTE->Scalars.size() ==
6143 TE->Scalars.size();
6144 }) &&
6145"All users must be of VF size.");
6146if (SLPReVec) {
6147assert(SLPReVec &&"Only supported by REVEC.");
6148// ShuffleVectorInst does not do reorderOperands (and it should not
6149// because ShuffleVectorInst supports only a limited set of
6150// patterns). Only do reorderNodeWithReuses if all of the users are
6151// not ShuffleVectorInst.
6152if (all_of(TE->UserTreeIndices, [&](constEdgeInfo &EI) {
6153 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6154 }))
6155continue;
6156assert(none_of(TE->UserTreeIndices,
6157 [&](constEdgeInfo &EI) {
6158 return isa<ShuffleVectorInst>(
6159 EI.UserTE->getMainOp());
6160 }) &&
6161"Does not know how to reorder.");
6162 }
6163// Update ordering of the operands with the smaller VF than the given
6164// one.
6165 reorderNodeWithReuses(*TE, Mask);
6166 }
6167continue;
6168 }
6169if ((TE->State == TreeEntry::Vectorize ||
6170 TE->State == TreeEntry::StridedVectorize) &&
6171 (isa<ExtractElementInst,ExtractValueInst,LoadInst,StoreInst,
6172InsertElementInst>(TE->getMainOp()) ||
6173 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6174assert(!TE->isAltShuffle() &&
6175"Alternate instructions are only supported by BinaryOperator "
6176"and CastInst.");
6177// Build correct orders for extract{element,value}, loads and
6178// stores.
6179reorderOrder(TE->ReorderIndices, Mask);
6180if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6181 TE->reorderOperands(Mask);
6182 }else {
6183// Reorder the node and its operands.
6184 TE->reorderOperands(Mask);
6185assert(TE->ReorderIndices.empty() &&
6186"Expected empty reorder sequence.");
6187reorderScalars(TE->Scalars, Mask);
6188 }
6189if (!TE->ReuseShuffleIndices.empty()) {
6190// Apply reversed order to keep the original ordering of the reused
6191// elements to avoid extra reorder indices shuffling.
6192OrdersType CurrentOrder;
6193reorderOrder(CurrentOrder, MaskOrder);
6194SmallVector<int> NewReuses;
6195inversePermutation(CurrentOrder, NewReuses);
6196addMask(NewReuses, TE->ReuseShuffleIndices);
6197 TE->ReuseShuffleIndices.swap(NewReuses);
6198 }
6199 }
6200 }
6201}
6202
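/// Checks whether the operands of \p UserTE can be reordered: each operand
/// must either already be a vectorized edge, be a vectorized/strided node
/// used only by \p UserTE, or be a single non-vectorized (gather) node for
/// this edge, which is then collected into \p GatherOps for scalar
/// reordering. Multiple matching gather nodes are only tolerated for
/// all-constant operands.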
6203bool BoUpSLP::canReorderOperands(
6204 TreeEntry *UserTE,SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6205ArrayRef<TreeEntry *> ReorderableGathers,
6206SmallVectorImpl<TreeEntry *> &GatherOps) {
6207for (unsignedI = 0,E = UserTE->getNumOperands();I <E; ++I) {
6208if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
6209return OpData.first ==I &&
6210 (OpData.second->State == TreeEntry::Vectorize ||
6211 OpData.second->State == TreeEntry::StridedVectorize);
6212 }))
6213continue;
6214if (TreeEntry *TE = getVectorizedOperand(UserTE,I)) {
6215// Do not reorder if operand node is used by many user nodes.
6216if (any_of(TE->UserTreeIndices,
6217 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6218returnfalse;
6219// Add the node to the list of the ordered nodes with the identity
6220// order.
6221 Edges.emplace_back(I, TE);
6222// Add ScatterVectorize nodes to the list of operands, where just
6223// reordering of the scalars is required. Similar to the gathers, so
6224// simply add to the list of gathered ops.
6225// If there are reused scalars, process this node as a regular vectorize
6226// node, just reorder reuses mask.
6227if (TE->State != TreeEntry::Vectorize &&
6228 TE->State != TreeEntry::StridedVectorize &&
6229 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6230 GatherOps.push_back(TE);
6231continue;
6232 }
6233 TreeEntry *Gather =nullptr;
6234if (count_if(ReorderableGathers,
6235 [&Gather, UserTE,I](TreeEntry *TE) {
6236assert(TE->State != TreeEntry::Vectorize &&
6237 TE->State != TreeEntry::StridedVectorize &&
6238"Only non-vectorized nodes are expected.");
6239if (any_of(TE->UserTreeIndices,
6240 [UserTE,I](const EdgeInfo &EI) {
6241 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6242 })) {
6243assert(TE->isSame(UserTE->getOperand(I)) &&
6244"Operand entry does not match operands.");
6245Gather = TE;
6246returntrue;
6247 }
6248returnfalse;
6249 }) > 1 &&
6250 !allConstant(UserTE->getOperand(I)))
6251returnfalse;
6252if (Gather)
6253 GatherOps.push_back(Gather);
6254 }
6255returntrue;
6256}
6257
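// Bottom-to-top reordering: start from reorderable leaves, propagate the most
// popular operand order up to their user nodes (again preferring the identity
// order), and keep sinking the chosen order deeper while the users allow it;
// the root's reorder is dropped at the end if it turned out to be unnecessary.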
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
6259SetVector<TreeEntry *> OrderedEntries;
6260DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads, extracts without alternate operands
  // + some gathering of extracts.
6264SmallVector<TreeEntry *> NonVectorized;
6265for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6266if (TE->State != TreeEntry::Vectorize &&
6267 TE->State != TreeEntry::StridedVectorize)
6268 NonVectorized.push_back(TE.get());
6269if (std::optional<OrdersType> CurrentOrder =
6270getReorderingData(*TE,/*TopToBottom=*/false)) {
6271 OrderedEntries.insert(TE.get());
6272if (!(TE->State == TreeEntry::Vectorize ||
6273 TE->State == TreeEntry::StridedVectorize) ||
6274 !TE->ReuseShuffleIndices.empty())
6275 GathersToOrders.insert(TE.get());
6276 }
6277 }
6278
6279// 1. Propagate order to the graph nodes, which use only reordered nodes.
6280// I.e., if the node has operands, that are reordered, try to make at least
6281// one operand order in the natural order and reorder others + reorder the
6282// user node itself.
6283SmallPtrSet<const TreeEntry *, 4> Visited;
6284while (!OrderedEntries.empty()) {
6285// 1. Filter out only reordered nodes.
6286// 2. If the entry has multiple uses - skip it and jump to the next node.
6287DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>Users;
6288SmallVector<TreeEntry *> Filtered;
6289for (TreeEntry *TE : OrderedEntries) {
6290if (!(TE->State == TreeEntry::Vectorize ||
6291 TE->State == TreeEntry::StridedVectorize ||
6292 (TE->isGather() && GathersToOrders.contains(TE))) ||
6293 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6294 !all_of(drop_begin(TE->UserTreeIndices),
6295 [TE](constEdgeInfo &EI) {
6296 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6297 }) ||
6298 !Visited.insert(TE).second) {
6299 Filtered.push_back(TE);
6300continue;
6301 }
      // Build a map between user nodes and their operands' order to speed up
      // the search. The graph currently does not provide this dependency
      // directly.
6304for (EdgeInfo &EI : TE->UserTreeIndices)
6305Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
6306 }
6307// Erase filtered entries.
6308for (TreeEntry *TE : Filtered)
6309 OrderedEntries.remove(TE);
6310SmallVector<
6311 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6312 UsersVec(Users.begin(),Users.end());
6313sort(UsersVec, [](constauto &Data1,constauto &Data2) {
6314return Data1.first->Idx > Data2.first->Idx;
6315 });
6316for (auto &Data : UsersVec) {
6317// Check that operands are used only in the User node.
6318SmallVector<TreeEntry *> GatherOps;
6319if (!canReorderOperands(Data.first,Data.second, NonVectorized,
6320 GatherOps)) {
6321for (const std::pair<unsigned, TreeEntry *> &Op :Data.second)
6322 OrderedEntries.remove(Op.second);
6323continue;
6324 }
6325// All operands are reordered and used only in this node - propagate the
6326// most used order to the user node.
6327MapVector<OrdersType,unsigned,
6328DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
6329 OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though it might not be
      // profitable.
6333SmallPtrSet<const TreeEntry *, 4> VisitedOps;
6334SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
6335for (constauto &Op :Data.second) {
6336 TreeEntry *OpTE =Op.second;
6337if (!VisitedOps.insert(OpTE).second)
6338continue;
6339if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6340continue;
6341constauto Order = [&]() ->constOrdersType {
6342if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6343returngetReorderingData(*OpTE,/*TopToBottom=*/false)
6344 .value_or(OrdersType(1));
6345return OpTE->ReorderIndices;
6346 }();
6347// The order is partially ordered, skip it in favor of fully non-ordered
6348// orders.
6349if (Order.size() == 1)
6350continue;
6351unsigned NumOps =count_if(
6352Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
6353 return P.second == OpTE;
6354 });
6355// Stores actually store the mask, not the order, need to invert.
6356if (OpTE->State == TreeEntry::Vectorize &&
6357 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6358assert(!OpTE->isAltShuffle() &&
6359"Alternate instructions are only supported by BinaryOperator "
6360"and CastInst.");
6361SmallVector<int> Mask;
6362inversePermutation(Order, Mask);
6363unsignedE = Order.size();
6364OrdersType CurrentOrder(E,E);
6365transform(Mask, CurrentOrder.begin(), [E](intIdx) {
6366 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6367 });
6368fixupOrderingIndices(CurrentOrder);
6369 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6370 NumOps;
6371 }else {
6372 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6373 }
6374auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
6375constauto AllowsReordering = [&](const TreeEntry *TE) {
6376if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6377 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6378 (IgnoreReorder && TE->Idx == 0))
6379returntrue;
6380if (TE->isGather()) {
6381if (GathersToOrders.contains(TE))
6382return !getReorderingData(*TE,/*TopToBottom=*/false)
6383 .value_or(OrdersType(1))
6384 .empty();
6385returntrue;
6386 }
6387returnfalse;
6388 };
6389for (constEdgeInfo &EI : OpTE->UserTreeIndices) {
6390 TreeEntry *UserTE = EI.UserTE;
6391if (!VisitedUsers.insert(UserTE).second)
6392continue;
6393// May reorder user node if it requires reordering, has reused
6394// scalars, is an alternate op vectorize node or its op nodes require
6395// reordering.
6396if (AllowsReordering(UserTE))
6397continue;
6398// Check if users allow reordering.
6399// Currently look up just 1 level of operands to avoid increase of
6400// the compile time.
6401// Profitable to reorder if definitely more operands allow
6402// reordering rather than those with natural order.
6403ArrayRef<std::pair<unsigned, TreeEntry *>> Ops =Users[UserTE];
6404if (static_cast<unsigned>(count_if(
6405 Ops, [UserTE, &AllowsReordering](
6406const std::pair<unsigned, TreeEntry *> &Op) {
6407return AllowsReordering(Op.second) &&
6408all_of(Op.second->UserTreeIndices,
6409 [UserTE](constEdgeInfo &EI) {
6410 return EI.UserTE == UserTE;
6411 });
6412 })) <= Ops.size() / 2)
6413 ++Res.first->second;
6414 }
6415 }
6416if (OrdersUses.empty()) {
6417for (const std::pair<unsigned, TreeEntry *> &Op :Data.second)
6418 OrderedEntries.remove(Op.second);
6419continue;
6420 }
6421// Choose the most used order.
6422unsigned IdentityCnt = 0;
6423unsigned VF =Data.second.front().second->getVectorFactor();
6424OrdersType IdentityOrder(VF, VF);
6425for (auto &Pair : OrdersUses) {
6426if (Pair.first.empty() ||isIdentityOrder(Pair.first)) {
6427 IdentityCnt += Pair.second;
6428combineOrders(IdentityOrder, Pair.first);
6429 }
6430 }
6431MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6432unsigned Cnt = IdentityCnt;
6433for (auto &Pair : OrdersUses) {
        // Prefer identity order. But if a filled identity (non-empty order)
        // was found with the same number of uses as the new candidate order,
        // we can choose this candidate order.
6437if (Cnt < Pair.second) {
6438combineOrders(Pair.first, BestOrder);
6439 BestOrder = Pair.first;
6440 Cnt = Pair.second;
6441 }else {
6442combineOrders(BestOrder, Pair.first);
6443 }
6444 }
6445// Set order of the user node.
6446if (isIdentityOrder(BestOrder)) {
6447for (const std::pair<unsigned, TreeEntry *> &Op :Data.second)
6448 OrderedEntries.remove(Op.second);
6449continue;
6450 }
6451fixupOrderingIndices(BestOrder);
6452// Erase operands from OrderedEntries list and adjust their orders.
6453 VisitedOps.clear();
6454SmallVector<int> Mask;
6455inversePermutation(BestOrder, Mask);
6456SmallVector<int> MaskOrder(BestOrder.size(),PoisonMaskElem);
6457unsignedE = BestOrder.size();
6458transform(BestOrder, MaskOrder.begin(), [E](unsignedI) {
6459 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6460 });
6461for (const std::pair<unsigned, TreeEntry *> &Op :Data.second) {
6462 TreeEntry *TE =Op.second;
6463 OrderedEntries.remove(TE);
6464if (!VisitedOps.insert(TE).second)
6465continue;
6466if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
6467 reorderNodeWithReuses(*TE, Mask);
6468continue;
6469 }
6470// Gathers are processed separately.
6471if (TE->State != TreeEntry::Vectorize &&
6472 TE->State != TreeEntry::StridedVectorize &&
6473 (TE->State != TreeEntry::ScatterVectorize ||
6474 TE->ReorderIndices.empty()))
6475continue;
6476assert((BestOrder.size() == TE->ReorderIndices.size() ||
6477 TE->ReorderIndices.empty()) &&
6478"Non-matching sizes of user/operand entries.");
6479reorderOrder(TE->ReorderIndices, Mask);
6480if (IgnoreReorder && TE == VectorizableTree.front().get())
6481 IgnoreReorder =false;
6482 }
6483// For gathers just need to reorder its scalars.
6484for (TreeEntry *Gather : GatherOps) {
6485assert(Gather->ReorderIndices.empty() &&
6486"Unexpected reordering of gathers.");
6487if (!Gather->ReuseShuffleIndices.empty()) {
6488// Just reorder reuses indices.
6489reorderReuses(Gather->ReuseShuffleIndices, Mask);
6490continue;
6491 }
6492reorderScalars(Gather->Scalars, Mask);
6493 OrderedEntries.remove(Gather);
6494 }
6495// Reorder operands of the user node and set the ordering for the user
6496// node itself.
6497if (Data.first->State != TreeEntry::Vectorize ||
6498 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6499Data.first->getMainOp()) ||
6500Data.first->isAltShuffle())
6501Data.first->reorderOperands(Mask);
6502if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
6503Data.first->isAltShuffle() ||
6504Data.first->State == TreeEntry::StridedVectorize) {
6505reorderScalars(Data.first->Scalars, Mask);
6506reorderOrder(Data.first->ReorderIndices, MaskOrder,
6507/*BottomOrder=*/true);
6508if (Data.first->ReuseShuffleIndices.empty() &&
6509 !Data.first->ReorderIndices.empty() &&
6510 !Data.first->isAltShuffle()) {
6511// Insert user node to the list to try to sink reordering deeper in
6512// the graph.
6513 OrderedEntries.insert(Data.first);
6514 }
6515 }else {
6516reorderOrder(Data.first->ReorderIndices, Mask);
6517 }
6518 }
6519 }
6520// If the reordering is unnecessary, just remove the reorder.
6521if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6522 VectorizableTree.front()->ReuseShuffleIndices.empty())
6523 VectorizableTree.front()->ReorderIndices.clear();
6524}
6525
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
6534
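// Walks all vectorized tree entries and records, for every scalar that is
// still used outside the tree (or listed as an externally used value), an
// ExternalUse of the form (scalar, user, lane) so an extract can be emitted
// later. A null user marks a scalar that must be extracted regardless of any
// particular user.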
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
6537DenseMap<Value *, unsigned> ScalarToExtUses;
6538// Collect the values that we need to extract from the tree.
6539for (auto &TEPtr : VectorizableTree) {
6540 TreeEntry *Entry = TEPtr.get();
6541
6542// No need to handle users of gathered values.
6543if (Entry->isGather())
6544continue;
6545
6546// For each lane:
6547for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6548Value *Scalar = Entry->Scalars[Lane];
6549if (!isa<Instruction>(Scalar))
6550continue;
6551// All uses must be replaced already? No need to do it again.
6552auto It = ScalarToExtUses.find(Scalar);
6553if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
6554continue;
6555
6556// Check if the scalar is externally used as an extra arg.
6557constauto ExtI = ExternallyUsedValues.find(Scalar);
6558if (ExtI != ExternallyUsedValues.end()) {
6559int FoundLane = Entry->findLaneForValue(Scalar);
6560LLVM_DEBUG(dbgs() <<"SLP: Need to extract: Extra arg from lane "
6561 << FoundLane <<" from " << *Scalar <<".\n");
6562 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
6563 ExternalUses.emplace_back(Scalar,nullptr, FoundLane);
6564continue;
6565 }
6566for (User *U : Scalar->users()) {
6567LLVM_DEBUG(dbgs() <<"SLP: Checking user:" << *U <<".\n");
6568
6569Instruction *UserInst = dyn_cast<Instruction>(U);
6570if (!UserInst ||isDeleted(UserInst))
6571continue;
6572
6573// Ignore users in the user ignore list.
6574if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6575continue;
6576
6577// Skip in-tree scalars that become vectors
6578if (TreeEntry *UseEntry = getTreeEntry(U)) {
6579// Some in-tree scalars will remain as scalar in vectorized
6580// instructions. If that is the case, the one in FoundLane will
6581// be used.
6582if (UseEntry->State == TreeEntry::ScatterVectorize ||
6583 !doesInTreeUserNeedToExtract(
6584 Scalar, getRootEntryInstruction(*UseEntry), TLI,TTI)) {
6585LLVM_DEBUG(dbgs() <<"SLP: \tInternal user will be removed:" << *U
6586 <<".\n");
6587assert(!UseEntry->isGather() &&"Bad state");
6588continue;
6589 }
6590 U =nullptr;
6591if (It != ScalarToExtUses.end()) {
6592 ExternalUses[It->second].User =nullptr;
6593break;
6594 }
6595 }
6596
6597if (U && Scalar->hasNUsesOrMore(UsesLimit))
6598 U =nullptr;
6599int FoundLane = Entry->findLaneForValue(Scalar);
6600LLVM_DEBUG(dbgs() <<"SLP: Need to extract:" << *UserInst
6601 <<" from lane " << FoundLane <<" from " << *Scalar
6602 <<".\n");
6603 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
6604 ExternalUses.emplace_back(Scalar, U, FoundLane);
6605if (!U)
6606break;
6607 }
6608 }
6609 }
6610}
6611
6612SmallVector<SmallVector<StoreInst *>>
6613BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const{
6614SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
6615SmallVector<StoreInst *>, 8>
6616 PtrToStoresMap;
6617for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6618Value *V = TE->Scalars[Lane];
6619// Don't iterate over the users of constant data.
6620if (!isa<Instruction>(V))
6621continue;
6622// To save compilation time we don't visit if we have too many users.
6623if (V->hasNUsesOrMore(UsesLimit))
6624break;
6625
6626// Collect stores per pointer object.
6627for (User *U : V->users()) {
6628auto *SI = dyn_cast<StoreInst>(U);
6629// Test whether we can handle the store. V might be a global, which could
6630// be used in a different function.
6631if (SI ==nullptr || !SI->isSimple() || SI->getFunction() !=F ||
6632 !isValidElementType(SI->getValueOperand()->getType()))
6633continue;
      // Skip entry if already vectorized.
6635if (getTreeEntry(U))
6636continue;
6637
6638Value *Ptr =
6639getUnderlyingObject(SI->getPointerOperand(),RecursionMaxDepth);
6640auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6641 SI->getValueOperand()->getType(),Ptr}];
6642// For now just keep one store per pointer object per lane.
6643// TODO: Extend this to support multiple stores per pointer per lane
6644if (StoresVec.size() > Lane)
6645continue;
6646if (!StoresVec.empty()) {
6647 std::optional<int> Diff =getPointersDiff(
6648 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6649 SI->getValueOperand()->getType(),
6650 StoresVec.front()->getPointerOperand(), *DL, *SE,
6651/*StrictCheck=*/true);
6652// We failed to compare the pointers so just abandon this store.
6653if (!Diff)
6654continue;
6655 }
6656 StoresVec.push_back(SI);
6657 }
6658 }
6659SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
6660unsignedI = 0;
6661for (auto &P : PtrToStoresMap) {
6662 Res[I].swap(P.second);
6663 ++I;
6664 }
6665return Res;
6666}
6667
6668bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6669 OrdersType &ReorderIndices) const{
6670// We check whether the stores in StoreVec can form a vector by sorting them
6671// and checking whether they are consecutive.
6672
6673// To avoid calling getPointersDiff() while sorting we create a vector of
6674// pairs {store, offset from first} and sort this instead.
6675SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
6676StoreInst *S0 = StoresVec[0];
6677 StoreOffsetVec.emplace_back(0, 0);
6678Type *S0Ty = S0->getValueOperand()->getType();
6679Value *S0Ptr = S0->getPointerOperand();
6680for (unsignedIdx : seq<unsigned>(1, StoresVec.size())) {
6681StoreInst *SI = StoresVec[Idx];
6682 std::optional<int> Diff =
6683getPointersDiff(S0Ty, S0Ptr,SI->getValueOperand()->getType(),
6684SI->getPointerOperand(), *DL, *SE,
6685/*StrictCheck=*/true);
6686 StoreOffsetVec.emplace_back(*Diff,Idx);
6687 }
6688
6689// Check if the stores are consecutive by checking if their difference is 1.
6690if (StoreOffsetVec.size() != StoresVec.size())
6691returnfalse;
6692sort(StoreOffsetVec,
6693 [](const std::pair<int, unsigned> &L,
6694const std::pair<int, unsigned> &R) {returnL.first <R.first; });
6695unsignedIdx = 0;
6696int PrevDist = 0;
6697for (constauto &P : StoreOffsetVec) {
6698if (Idx > 0 &&P.first != PrevDist + 1)
6699returnfalse;
6700 PrevDist =P.first;
6701 ++Idx;
6702 }
6703
6704// Calculate the shuffle indices according to their offset against the sorted
6705// StoreOffsetVec.
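  // Illustrative example: stores S0, S1, S2 at offsets {0, -1, 1} sort to
  // {S1, S0, S2}, giving ReorderIndices = {1, 0, 2}; offsets {0, 1, 2} would
  // produce the identity order and ReorderIndices is cleared below.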
6706 ReorderIndices.assign(StoresVec.size(), 0);
6707bool IsIdentity =true;
6708for (auto [I,P] :enumerate(StoreOffsetVec)) {
6709 ReorderIndices[P.second] =I;
6710 IsIdentity &=P.second ==I;
6711 }
6712// Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6713// reorderTopToBottom() and reorderBottomToTop(), so we are following the
6714// same convention here.
6715if (IsIdentity)
6716 ReorderIndices.clear();
6717
6718returntrue;
6719}
6720
#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
6728
6729SmallVector<BoUpSLP::OrdersType, 1>
6730BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const{
6731unsigned NumLanes =TE->Scalars.size();
6732
6733SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
6734
6735// Holds the reorder indices for each candidate store vector that is a user of
6736// the current TreeEntry.
6737SmallVector<OrdersType, 1> ExternalReorderIndices;
6738
6739// Now inspect the stores collected per pointer and look for vectorization
6740// candidates. For each candidate calculate the reorder index vector and push
6741// it into `ExternalReorderIndices`
6742for (ArrayRef<StoreInst *> StoresVec : Stores) {
6743// If we have fewer than NumLanes stores, then we can't form a vector.
6744if (StoresVec.size() != NumLanes)
6745continue;
6746
6747// If the stores are not consecutive then abandon this StoresVec.
6748OrdersType ReorderIndices;
6749if (!canFormVector(StoresVec, ReorderIndices))
6750continue;
6751
6752// We now know that the scalars in StoresVec can form a vector instruction,
6753// so set the reorder indices.
6754 ExternalReorderIndices.push_back(ReorderIndices);
6755 }
6756return ExternalReorderIndices;
6757}
6758
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
6774
/// Tries to find a subvector of loads and builds a new vector of only loads if
/// it can be profitable.
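/// All loads in \p VL are expected to share the parent block, type and
/// underlying pointer object; they are grouped into clusters keyed by the
/// constant pointer distance to the cluster's first load, and clusters are
/// later merged into \p GatheredLoads when enough new (non-repeated) loads
/// match an existing group.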
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
    bool AddNew = true) {
6782if (VL.empty())
6783return;
6784Type *ScalarTy =getValueType(VL.front());
6785if (!isValidElementType(ScalarTy))
6786return;
6787SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
6788SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
6789for (Value *V : VL) {
6790auto *LI = dyn_cast<LoadInst>(V);
6791if (!LI)
6792continue;
6793if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6794continue;
6795bool IsFound =false;
6796for (auto [Map,Data] :zip(ClusteredDistToLoad, ClusteredLoads)) {
6797assert(LI->getParent() ==Data.front().first->getParent() &&
6798 LI->getType() ==Data.front().first->getType() &&
6799getUnderlyingObject(LI->getPointerOperand(),RecursionMaxDepth) ==
6800getUnderlyingObject(Data.front().first->getPointerOperand(),
6801RecursionMaxDepth) &&
6802"Expected loads with the same type, same parent and same "
6803"underlying pointer.");
6804 std::optional<int> Dist =getPointersDiff(
6805 LI->getType(), LI->getPointerOperand(),Data.front().first->getType(),
6806Data.front().first->getPointerOperand(),DL, SE,
6807/*StrictCheck=*/true);
6808if (!Dist)
6809continue;
6810auto It = Map.find(*Dist);
6811if (It != Map.end() && It->second != LI)
6812continue;
6813if (It == Map.end()) {
6814Data.emplace_back(LI, *Dist);
6815 Map.try_emplace(*Dist, LI);
6816 }
6817 IsFound =true;
6818break;
6819 }
6820if (!IsFound) {
6821 ClusteredLoads.emplace_back().emplace_back(LI, 0);
6822 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
6823 }
6824 }
6825auto FindMatchingLoads =
6826 [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
6827SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
6828 &GatheredLoads,
6829SetVector<unsigned> &ToAdd,SetVector<unsigned> &Repeated,
6830int &Offset,unsigned &Start) {
6831if (Loads.empty())
6832return GatheredLoads.end();
6833SmallVector<std::pair<int, int>> Res;
6834LoadInst *LI = Loads.front().first;
6835for (auto [Idx,Data] :enumerate(GatheredLoads)) {
6836if (Idx < Start)
6837continue;
6838 ToAdd.clear();
6839if (LI->getParent() !=Data.front().first->getParent() ||
6840 LI->getType() !=Data.front().first->getType())
6841continue;
6842 std::optional<int> Dist =
6843getPointersDiff(LI->getType(), LI->getPointerOperand(),
6844Data.front().first->getType(),
6845Data.front().first->getPointerOperand(),DL, SE,
6846/*StrictCheck=*/true);
6847if (!Dist)
6848continue;
6849SmallSet<int, 4> DataDists;
6850SmallPtrSet<LoadInst *, 4> DataLoads;
6851for (std::pair<LoadInst *, int>P :Data) {
6852 DataDists.insert(P.second);
6853 DataLoads.insert(P.first);
6854 }
6855// Found matching gathered loads - check if all loads are unique or
6856// can be effectively vectorized.
6857unsigned NumUniques = 0;
6858for (auto [Cnt, Pair] :enumerate(Loads)) {
6859bool Used = DataLoads.contains(Pair.first);
6860if (!Used && !DataDists.contains(*Dist + Pair.second)) {
6861 ++NumUniques;
6862 ToAdd.insert(Cnt);
          } else if (Used) {
6864 Repeated.insert(Cnt);
6865 }
6866 }
6867if (NumUniques > 0 &&
6868 (Loads.size() == NumUniques ||
6869 (Loads.size() - NumUniques >= 2 &&
6870 Loads.size() - NumUniques >= Loads.size() / 2 &&
6871 (has_single_bit(Data.size() + NumUniques) ||
6872bit_ceil(Data.size()) <
6873bit_ceil(Data.size() + NumUniques))))) {
6874Offset = *Dist;
6875 Start =Idx + 1;
6876return std::next(GatheredLoads.begin(),Idx);
6877 }
6878 }
6879 ToAdd.clear();
6880return GatheredLoads.end();
6881 };
6882for (ArrayRef<std::pair<LoadInst *, int>>Data : ClusteredLoads) {
6883unsigned Start = 0;
6884SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
6885intOffset = 0;
6886auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
6887Offset, Start);
6888while (It != GatheredLoads.end()) {
6889assert(!LocalToAdd.empty() &&"Expected some elements to add.");
6890for (unsignedIdx : LocalToAdd)
6891 It->emplace_back(Data[Idx].first,Data[Idx].second +Offset);
6892 ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
6893 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,Offset,
6894 Start);
6895 }
6896if (any_of(seq<unsigned>(Data.size()), [&](unsignedIdx) {
6897 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6898 })) {
6899auto AddNewLoads =
6900 [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
6901for (unsignedIdx : seq<unsigned>(Data.size())) {
6902if (ToAdd.contains(Idx) || Repeated.contains(Idx))
6903continue;
6904 Loads.push_back(Data[Idx]);
6905 }
6906 };
6907if (!AddNew) {
6908LoadInst *LI =Data.front().first;
6909 It =find_if(
6910 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6911return PD.front().first->getParent() == LI->getParent() &&
6912 PD.front().first->getType() == LI->getType();
6913 });
6914while (It != GatheredLoads.end()) {
6915 AddNewLoads(*It);
6916 It = std::find_if(
6917 std::next(It), GatheredLoads.end(),
6918 [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6919 return PD.front().first->getParent() == LI->getParent() &&
6920 PD.front().first->getType() == LI->getType();
6921 });
6922 }
6923 }
6924 GatheredLoads.emplace_back().append(Data.begin(),Data.end());
6925 AddNewLoads(GatheredLoads.emplace_back());
6926 }
6927 }
6928}
6929
6930void BoUpSLP::tryToVectorizeGatheredLoads(
6931constSmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
6932SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
6933 8> &GatheredLoads) {
6934 GatheredLoadsEntriesFirst = VectorizableTree.size();
6935
6936SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
6937 LoadEntriesToVectorize.size());
6938for (auto [Idx, Set] :zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6939Set.insert(VectorizableTree[Idx]->Scalars.begin(),
6940 VectorizableTree[Idx]->Scalars.end());
6941
6942// Sort loads by distance.
6943auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
6944const std::pair<LoadInst *, int> &L2) {
6945return L1.second > L2.second;
6946 };
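  // Note: LoadSorter sorts in descending distance order, so the first element
  // has the maximum distance and LastDist (used further below) only decreases
  // while walking the sorted list.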
6947
6948auto IsMaskedGatherSupported = [&,TTI =TTI](ArrayRef<LoadInst *> Loads) {
6949ArrayRef<Value *> Values(reinterpret_cast<Value *const*>(Loads.begin()),
6950 Loads.size());
6951Align Alignment = computeCommonAlignment<LoadInst>(Values);
6952auto *Ty =getWidenedType(Loads.front()->getType(), Loads.size());
6953returnTTI->isLegalMaskedGather(Ty, Alignment) &&
6954 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
6955 };
6956
6957auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
6958BoUpSLP::ValueSet &VectorizedLoads,
6959SmallVectorImpl<LoadInst *> &NonVectorized,
6960bool Final,unsigned MaxVF) {
6961SmallVector<std::pair<ArrayRef<Value *>,LoadsState>>Results;
6962unsigned StartIdx = 0;
6963SmallVector<int> CandidateVFs;
6964if (VectorizeNonPowerOf2 &&has_single_bit(MaxVF + 1))
6965 CandidateVFs.push_back(MaxVF);
6966for (int NumElts =getFloorFullVectorNumberOfElements(
6967 *TTI, Loads.front()->getType(), MaxVF);
6968 NumElts > 1; NumElts =getFloorFullVectorNumberOfElements(
6969 *TTI, Loads.front()->getType(), NumElts - 1)) {
6970 CandidateVFs.push_back(NumElts);
6971if (VectorizeNonPowerOf2 && NumElts > 2)
6972 CandidateVFs.push_back(NumElts - 1);
6973 }
6974
6975if (Final && CandidateVFs.empty())
6976returnResults;
6977
6978unsigned BestVF = Final ? CandidateVFs.back() : 0;
6979for (unsigned NumElts : CandidateVFs) {
6980if (Final && NumElts > BestVF)
6981continue;
6982SmallVector<unsigned> MaskedGatherVectorized;
6983for (unsigned Cnt = StartIdx,E = Loads.size(); Cnt <E;
6984 ++Cnt) {
6985ArrayRef<LoadInst *> Slice =
6986ArrayRef(Loads).slice(Cnt, std::min(NumElts,E - Cnt));
6987if (VectorizedLoads.count(Slice.front()) ||
6988 VectorizedLoads.count(Slice.back()) ||
6989areKnownNonVectorizableLoads(Slice))
6990continue;
6991// Check if it is profitable to try vectorizing gathered loads. It is
6992// profitable if we have more than 3 consecutive loads or if we have
6993// less but all users are vectorized or deleted.
6994bool AllowToVectorize =false;
6995// Check if it is profitable to vectorize 2-elements loads.
6996if (NumElts == 2) {
6997bool IsLegalBroadcastLoad =TTI->isLegalBroadcastLoad(
6998 Slice.front()->getType(),ElementCount::getFixed(NumElts));
6999auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
7000for (LoadInst *LI : Slice) {
7001// If single use/user - allow to vectorize.
7002if (LI->hasOneUse())
7003continue;
7004// 1. Check if number of uses equals number of users.
7005// 2. All users are deleted.
7006// 3. The load broadcasts are not allowed or the load is not
7007// broadcasted.
7008if (static_cast<unsignedint>(std::distance(
7009 LI->user_begin(), LI->user_end())) != LI->getNumUses())
7010returnfalse;
7011if (!IsLegalBroadcastLoad)
7012continue;
7013if (LI->hasNUsesOrMore(UsesLimit))
7014returnfalse;
7015for (User *U : LI->users()) {
7016if (auto *UI = dyn_cast<Instruction>(U); UI &&isDeleted(UI))
7017continue;
7018if (const TreeEntry *UTE = getTreeEntry(U)) {
7019for (intI : seq<int>(UTE->getNumOperands())) {
7020if (all_of(UTE->getOperand(I),
7021 [LI](Value *V) { return V == LI; }))
7022// Found legal broadcast - do not vectorize.
7023returnfalse;
7024 }
7025 }
7026 }
7027 }
7028returntrue;
7029 };
7030 AllowToVectorize = CheckIfAllowed(Slice);
7031 }else {
7032 AllowToVectorize =
7033 (NumElts >= 3 ||
7034any_of(ValueToGatherNodes.at(Slice.front()),
7035 [=](const TreeEntry *TE) {
7036 return TE->Scalars.size() == 2 &&
7037 ((TE->Scalars.front() == Slice.front() &&
7038 TE->Scalars.back() == Slice.back()) ||
7039 (TE->Scalars.front() == Slice.back() &&
7040 TE->Scalars.back() == Slice.front()));
7041 })) &&
7042hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
7043 Slice.size());
7044 }
7045if (AllowToVectorize) {
7046SmallVector<Value *> PointerOps;
7047OrdersType CurrentOrder;
7048// Try to build vector load.
7049ArrayRef<Value *> Values(
7050reinterpret_cast<Value *const*>(Slice.begin()), Slice.size());
7051LoadsStateLS =canVectorizeLoads(Values, Slice.front(), CurrentOrder,
7052 PointerOps, &BestVF);
7053if (LS !=LoadsState::Gather ||
7054 (BestVF > 1 &&static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7055if (LS ==LoadsState::ScatterVectorize) {
7056if (MaskedGatherVectorized.empty() ||
7057 Cnt >= MaskedGatherVectorized.back() + NumElts)
7058 MaskedGatherVectorized.push_back(Cnt);
7059continue;
7060 }
7061if (LS !=LoadsState::Gather) {
7062Results.emplace_back(Values, LS);
7063 VectorizedLoads.insert(Slice.begin(), Slice.end());
7064// If we vectorized initial block, no need to try to vectorize it
7065// again.
7066if (Cnt == StartIdx)
7067 StartIdx += NumElts;
7068 }
7069// Check if the whole array was vectorized already - exit.
7070if (StartIdx >= Loads.size())
7071break;
7072// Erase last masked gather candidate, if another candidate within
7073// the range is found to be better.
7074if (!MaskedGatherVectorized.empty() &&
7075 Cnt < MaskedGatherVectorized.back() + NumElts)
7076 MaskedGatherVectorized.pop_back();
7077 Cnt += NumElts - 1;
7078continue;
7079 }
7080 }
7081if (!AllowToVectorize || BestVF == 0)
7082registerNonVectorizableLoads(Slice);
7083 }
7084// Mark masked gathers candidates as vectorized, if any.
7085for (unsigned Cnt : MaskedGatherVectorized) {
7086ArrayRef<LoadInst *> Slice =ArrayRef(Loads).slice(
7087 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
7088ArrayRef<Value *> Values(
7089reinterpret_cast<Value *const*>(Slice.begin()), Slice.size());
7090Results.emplace_back(Values,LoadsState::ScatterVectorize);
7091 VectorizedLoads.insert(Slice.begin(), Slice.end());
7092// If we vectorized initial block, no need to try to vectorize it again.
7093if (Cnt == StartIdx)
7094 StartIdx += NumElts;
7095 }
7096 }
7097for (LoadInst *LI : Loads) {
7098if (!VectorizedLoads.contains(LI))
7099 NonVectorized.push_back(LI);
7100 }
7101returnResults;
7102 };
7103auto ProcessGatheredLoads =
7104 [&, &TTI = *TTI](
7105ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
7106bool Final =false) {
7107SmallVector<LoadInst *> NonVectorized;
7108for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7109if (LoadsDists.size() <= 1) {
7110 NonVectorized.push_back(LoadsDists.back().first);
7111continue;
7112 }
7113SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
7114SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
7115transform(LoadsDists, OriginalLoads.begin(),
7116 [](const std::pair<LoadInst *, int> &L) ->LoadInst * {
7117 return L.first;
7118 });
7119stable_sort(LocalLoadsDists, LoadSorter);
7120SmallVector<LoadInst *> Loads;
7121unsigned MaxConsecutiveDistance = 0;
7122unsigned CurrentConsecutiveDist = 1;
7123int LastDist = LocalLoadsDists.front().second;
7124bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7125for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7126if (getTreeEntry(L.first))
7127continue;
7128assert(LastDist >=L.second &&
7129"Expected first distance always not less than second");
7130if (static_cast<unsigned>(LastDist -L.second) ==
7131 CurrentConsecutiveDist) {
7132 ++CurrentConsecutiveDist;
7133 MaxConsecutiveDistance =
7134 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7135 Loads.push_back(L.first);
7136continue;
7137 }
7138if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7139 !Loads.empty())
7140 Loads.pop_back();
7141 CurrentConsecutiveDist = 1;
7142 LastDist =L.second;
7143 Loads.push_back(L.first);
7144 }
7145if (Loads.size() <= 1)
7146continue;
7147if (AllowMaskedGather)
7148 MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
7150continue;
7151BoUpSLP::ValueSet VectorizedLoads;
7152SmallVector<LoadInst *> SortedNonVectorized;
7153SmallVector<std::pair<ArrayRef<Value *>,LoadsState>>Results =
7154 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7155 Final, MaxConsecutiveDistance);
7156if (!Results.empty() && !SortedNonVectorized.empty() &&
7157 OriginalLoads.size() == Loads.size() &&
7158 MaxConsecutiveDistance == Loads.size() &&
7159all_of(Results,
7160 [](const std::pair<ArrayRef<Value *>,LoadsState> &P) {
7161returnP.second ==LoadsState::ScatterVectorize;
7162 })) {
7163 VectorizedLoads.clear();
7164SmallVector<LoadInst *> UnsortedNonVectorized;
7165SmallVector<std::pair<ArrayRef<Value *>,LoadsState>>
7166 UnsortedResults =
7167 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7168 UnsortedNonVectorized, Final,
7169 OriginalLoads.size());
7170if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
7171 SortedNonVectorized.swap(UnsortedNonVectorized);
7172Results.swap(UnsortedResults);
7173 }
7174 }
7175for (auto [Slice,_] :Results) {
7176LLVM_DEBUG(dbgs() <<"SLP: Trying to vectorize gathered loads ("
7177 << Slice.size() <<")\n");
7178if (any_of(Slice, [&](Value *V) {return getTreeEntry(V); })) {
7179for (Value *L : Slice)
7180if (!getTreeEntry(L))
7181 SortedNonVectorized.push_back(cast<LoadInst>(L));
7182continue;
7183 }
7184
7185// Select maximum VF as a maximum of user gathered nodes and
7186// distance between scalar loads in these nodes.
7187unsigned MaxVF = Slice.size();
7188unsigned UserMaxVF = 0;
7189unsigned InterleaveFactor = 0;
7190if (MaxVF == 2) {
7191 UserMaxVF = MaxVF;
7192 }else {
7193// Found distance between segments of the interleaved loads.
7194 std::optional<unsigned> InterleavedLoadsDistance = 0;
7195unsigned Order = 0;
7196 std::optional<unsigned> CommonVF = 0;
7197DenseMap<const TreeEntry *, unsigned> EntryToPosition;
7198SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
7199for (auto [Idx, V] :enumerate(Slice)) {
7200for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
7201 UserMaxVF = std::max<unsigned>(UserMaxVF,E->Scalars.size());
7202unsigned Pos =
7203 EntryToPosition.try_emplace(E,Idx).first->second;
7204 UserMaxVF = std::max<unsigned>(UserMaxVF,Idx - Pos + 1);
7205if (CommonVF) {
7206if (*CommonVF == 0) {
7207 CommonVF =E->Scalars.size();
7208continue;
7209 }
7210if (*CommonVF !=E->Scalars.size())
7211 CommonVF.reset();
7212 }
7213// Check if the load is the part of the interleaved load.
7214if (Pos !=Idx && InterleavedLoadsDistance) {
7215if (!DeinterleavedNodes.contains(E) &&
7216any_of(E->Scalars, [&, Slice = Slice](Value *V) {
7217 if (isa<Constant>(V))
7218 return false;
7219 if (getTreeEntry(V))
7220 return true;
7221 const auto &Nodes = ValueToGatherNodes.at(V);
7222 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7223 !is_contained(Slice, V);
7224 })) {
7225 InterleavedLoadsDistance.reset();
7226continue;
7227 }
7228 DeinterleavedNodes.insert(E);
7229if (*InterleavedLoadsDistance == 0) {
7230 InterleavedLoadsDistance =Idx - Pos;
7231continue;
7232 }
7233if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7234 (Idx - Pos) / *InterleavedLoadsDistance < Order)
7235 InterleavedLoadsDistance.reset();
7236 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7237 }
7238 }
7239 }
7240 DeinterleavedNodes.clear();
7241// Check if the large load represents interleaved load operation.
7242if (InterleavedLoadsDistance.value_or(0) > 1 &&
7243 CommonVF.value_or(0) != 0) {
7244 InterleaveFactor =bit_ceil(*InterleavedLoadsDistance);
7245unsigned VF = *CommonVF;
7246OrdersType Order;
7247SmallVector<Value *> PointerOps;
7248// Segmented load detected - vectorize at maximum vector factor.
7249if (InterleaveFactor <= Slice.size() &&
7250TTI.isLegalInterleavedAccessType(
7251getWidenedType(Slice.front()->getType(), VF),
7252 InterleaveFactor,
7253 cast<LoadInst>(Slice.front())->getAlign(),
7254 cast<LoadInst>(Slice.front())
7255 ->getPointerAddressSpace()) &&
7256canVectorizeLoads(Slice, Slice.front(), Order,
7257 PointerOps) ==LoadsState::Vectorize) {
7258 UserMaxVF = InterleaveFactor * VF;
7259 }else {
7260 InterleaveFactor = 0;
7261 }
7262 }
7263// Cannot represent the loads as consecutive vectorizable nodes -
7264// just exit.
7265unsigned ConsecutiveNodesSize = 0;
7266if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
7267any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7268 [&, Slice = Slice](constauto &P) {
7269constauto *It =find_if(Slice, [&](Value *V) {
7270return std::get<1>(P).contains(V);
7271 });
7272if (It == Slice.end())
7273returnfalse;
7274ArrayRef<Value *> VL =
7275 VectorizableTree[std::get<0>(P)]->Scalars;
7276 ConsecutiveNodesSize += VL.size();
7277unsigned Start = std::distance(Slice.begin(), It);
7278unsigned Sz = Slice.size() - Start;
7279return Sz < VL.size() ||
7280 Slice.slice(std::distance(Slice.begin(), It),
7281 VL.size()) != VL;
7282 }))
7283continue;
7284// Try to build long masked gather loads.
7285 UserMaxVF =bit_ceil(UserMaxVF);
7286if (InterleaveFactor == 0 &&
7287any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7288 [&, Slice = Slice](unsignedIdx) {
7289 OrdersType Order;
7290 SmallVector<Value *> PointerOps;
7291 return canVectorizeLoads(
7292 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7293 Slice[Idx * UserMaxVF], Order,
7294 PointerOps) ==
7295 LoadsState::ScatterVectorize;
7296 }))
7297 UserMaxVF = MaxVF;
7298if (Slice.size() != ConsecutiveNodesSize)
7299 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7300 }
7301for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7302bool IsVectorized =true;
7303for (unsignedI = 0,E = Slice.size();I <E;I += VF) {
7304ArrayRef<Value *> SubSlice =
7305 Slice.slice(I, std::min(VF,E -I));
7306if (getTreeEntry(SubSlice.front()))
7307continue;
7308            // Check if the subslice is a to-be-vectorized entry that is not
7309            // equal to an existing entry.
7310if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7311 [&](constauto &P) {
7312return !SubSlice.equals(
7313 VectorizableTree[std::get<0>(P)]
7314 ->Scalars) &&
7315set_is_subset(SubSlice, std::get<1>(P));
7316 }))
7317continue;
7318unsigned Sz = VectorizableTree.size();
7319 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7320if (Sz == VectorizableTree.size()) {
7321 IsVectorized =false;
7322// Try non-interleaved vectorization with smaller vector
7323// factor.
7324if (InterleaveFactor > 0) {
7325 VF = 2 * (MaxVF / InterleaveFactor);
7326 InterleaveFactor = 0;
7327 }
7328continue;
7329 }
7330 }
7331if (IsVectorized)
7332break;
7333 }
7334 }
7335 NonVectorized.append(SortedNonVectorized);
7336 }
7337return NonVectorized;
7338 };
7339for (constauto &GLs : GatheredLoads) {
7340constauto &Ref = GLs.second;
7341SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
7342if (!Ref.empty() && !NonVectorized.empty() &&
7343 std::accumulate(
7344Ref.begin(),Ref.end(), 0u,
7345 [](unsigned S,
7346ArrayRef<std::pair<LoadInst *, int>> LoadsDists) ->unsigned {
7347 return S + LoadsDists.size();
7348 }) != NonVectorized.size() &&
7349 IsMaskedGatherSupported(NonVectorized)) {
7350SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
7351for (LoadInst *LI : NonVectorized) {
7352        // Reinsert non-vectorized loads into other lists of loads with the same
7353        // base pointers.
7354gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
7355 FinalGatheredLoads,
7356/*AddNew=*/false);
7357 }
7358// Final attempt to vectorize non-vectorized loads.
7359 (void)ProcessGatheredLoads(FinalGatheredLoads,/*Final=*/true);
7360 }
7361 }
7362// Try to vectorize postponed load entries, previously marked as gathered.
7363for (unsignedIdx : LoadEntriesToVectorize) {
7364const TreeEntry &E = *VectorizableTree[Idx];
7365SmallVector<Value *> GatheredScalars(E.Scalars.begin(),E.Scalars.end());
7366// Avoid reordering, if possible.
7367if (!E.ReorderIndices.empty()) {
7368// Build a mask out of the reorder indices and reorder scalars per this
7369// mask.
7370SmallVector<int> ReorderMask;
7371inversePermutation(E.ReorderIndices, ReorderMask);
7372reorderScalars(GatheredScalars, ReorderMask);
7373 }
7374 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7375 }
7376  // If no new entries were created, there are no gathered-load entries left to
7377  // handle.
7378if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7379 VectorizableTree.size())
7380 GatheredLoadsEntriesFirst.reset();
7381}
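// A minimal walkthrough of the interleaved-load detection above, with
// hypothetical values: suppose the (pointer-sorted) Slice contains 8 loads of
// a[0..7], where one user gather node needs the even elements a[0], a[2],
// a[4], a[6] and another needs the odd elements a[1], a[3], a[5], a[7]. Each
// node has CommonVF = 4, and members of the same node appear 2 lanes apart in
// the slice, so InterleavedLoadsDistance = 2 and
// InterleaveFactor = bit_ceil(2) = 2. The candidate UserMaxVF then becomes
// InterleaveFactor * VF = 8, provided TTI.isLegalInterleavedAccessType()
// accepts the widened type and canVectorizeLoads() returns
// LoadsState::Vectorize for the whole slice.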
7382
7383 /// \return true if the specified list of values has only one instruction that
7384 /// requires scheduling, false otherwise.
7385 #ifndef NDEBUG
7386 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
7387   Value *NeedsScheduling = nullptr;
7388   for (Value *V : VL) {
7389     if (doesNotNeedToBeScheduled(V))
7390       continue;
7391     if (!NeedsScheduling) {
7392       NeedsScheduling = V;
7393       continue;
7394     }
7395     return false;
7396   }
7397   return NeedsScheduling;
7398 }
7399 #endif
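// A minimal sketch of the debug-only helper above (hypothetical bundle): for
// VL = { i32 7, %a = add i32 %x, 1, poison }, the constant and the poison
// value are not instructions and so do not need to be scheduled; the add is
// the only value that requires scheduling and the helper returns true. A
// bundle containing two instructions that both require scheduling returns
// false.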
7400
7401 /// Generates a key/subkey pair for the given value to provide effective
7402 /// sorting of the values and better detection of vectorizable value
7403 /// sequences. The keys/subkeys can be used for better sorting of the values
7404 /// themselves (keys) and within value subgroups (subkeys).
7405 static std::pair<size_t, size_t> generateKeySubkey(
7406     Value *V, const TargetLibraryInfo *TLI,
7407     function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
7408     bool AllowAlternate) {
7409   hash_code Key = hash_value(V->getValueID() + 2);
7410   hash_code SubKey = hash_value(0);
7411// Sort the loads by the distance between the pointers.
7412if (auto *LI = dyn_cast<LoadInst>(V)) {
7413 Key =hash_combine(LI->getType(),hash_value(Instruction::Load), Key);
7414if (LI->isSimple())
7415 SubKey =hash_value(LoadsSubkeyGenerator(Key, LI));
7416else
7417 Key = SubKey =hash_value(LI);
7418 }elseif (isVectorLikeInstWithConstOps(V)) {
7419// Sort extracts by the vector operands.
7420if (isa<ExtractElementInst, UndefValue>(V))
7421 Key =hash_value(Value::UndefValueVal + 1);
7422if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
7423if (!isUndefVector(EI->getVectorOperand()).all() &&
7424 !isa<UndefValue>(EI->getIndexOperand()))
7425 SubKey =hash_value(EI->getVectorOperand());
7426 }
7427 }elseif (auto *I = dyn_cast<Instruction>(V)) {
7428// Sort other instructions just by the opcodes except for CMPInst.
7429// For CMP also sort by the predicate kind.
7430if ((isa<BinaryOperator, CastInst>(I)) &&
7431isValidForAlternation(I->getOpcode())) {
7432if (AllowAlternate)
7433 Key =hash_value(isa<BinaryOperator>(I) ? 1 : 0);
7434else
7435 Key =hash_combine(hash_value(I->getOpcode()), Key);
7436 SubKey =hash_combine(
7437hash_value(I->getOpcode()),hash_value(I->getType()),
7438hash_value(isa<BinaryOperator>(I)
7439 ?I->getType()
7440 : cast<CastInst>(I)->getOperand(0)->getType()));
7441// For casts, look through the only operand to improve compile time.
7442if (isa<CastInst>(I)) {
7443 std::pair<size_t, size_t> OpVals =
7444generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
7445/*AllowAlternate=*/true);
7446 Key =hash_combine(OpVals.first, Key);
7447 SubKey =hash_combine(OpVals.first, SubKey);
7448 }
7449 }elseif (auto *CI = dyn_cast<CmpInst>(I)) {
7450CmpInst::Predicate Pred = CI->getPredicate();
7451if (CI->isCommutative())
7452 Pred = std::min(Pred,CmpInst::getInversePredicate(Pred));
7453CmpInst::Predicate SwapPred =CmpInst::getSwappedPredicate(Pred);
7454 SubKey =hash_combine(hash_value(I->getOpcode()),hash_value(Pred),
7455hash_value(SwapPred),
7456hash_value(CI->getOperand(0)->getType()));
7457 }elseif (auto *Call = dyn_cast<CallInst>(I)) {
7458Intrinsic::IDID =getVectorIntrinsicIDForCall(Call, TLI);
7459if (isTriviallyVectorizable(ID)) {
7460 SubKey =hash_combine(hash_value(I->getOpcode()),hash_value(ID));
7461 }elseif (!VFDatabase(*Call).getMappings(*Call).empty()) {
7462 SubKey =hash_combine(hash_value(I->getOpcode()),
7463hash_value(Call->getCalledFunction()));
7464 }else {
7465 Key =hash_combine(hash_value(Call), Key);
7466 SubKey =hash_combine(hash_value(I->getOpcode()),hash_value(Call));
7467 }
7468for (constCallBase::BundleOpInfo &Op : Call->bundle_op_infos())
7469 SubKey =hash_combine(hash_value(Op.Begin),hash_value(Op.End),
7470hash_value(Op.Tag), SubKey);
7471 }elseif (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
7472if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7473 SubKey =hash_value(Gep->getPointerOperand());
7474else
7475 SubKey =hash_value(Gep);
7476 }elseif (BinaryOperator::isIntDivRem(I->getOpcode()) &&
7477 !isa<ConstantInt>(I->getOperand(1))) {
7478// Do not try to vectorize instructions with potentially high cost.
7479 SubKey =hash_value(I);
7480 }else {
7481 SubKey =hash_value(I->getOpcode());
7482 }
7483 Key =hash_combine(hash_value(I->getParent()), Key);
7484 }
7485return std::make_pair(Key, SubKey);
7486}
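// A small illustration of the grouping this hashing enables (hypothetical
// values): two simple loads of the same type hash to the same Key
// (type + Instruction::Load), and the caller-provided LoadsSubkeyGenerator
// can give them the same SubKey when their pointers are a small known
// distance apart, so they land in one candidate subgroup. By contrast, an
// integer division whose divisor is not a constant gets hash_value(I) as its
// SubKey, so it is never grouped with other instructions and is effectively
// kept out of vectorization attempts.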
7487
7488 /// Checks if the specified instruction \p I is an alternate operation for
7489 /// the given \p MainOp and \p AltOp instructions.
7490 static bool isAlternateInstruction(const Instruction *I,
7491                                    const Instruction *MainOp,
7492                                    const Instruction *AltOp,
7493                                    const TargetLibraryInfo &TLI);
7494
7495 bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
7496                                        ArrayRef<Value *> VL) const {
7497   unsigned Opcode0 = S.getOpcode();
7498   unsigned Opcode1 = S.getAltOpcode();
7499   SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
7500// If this pattern is supported by the target then consider it profitable.
7501if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
7502 Opcode0, Opcode1, OpcodeMask))
7503returntrue;
7504SmallVector<ValueList>Operands;
7505for (unsignedI : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7506Operands.emplace_back();
7507// Prepare the operand vector.
7508for (Value *V : VL) {
7509if (isa<PoisonValue>(V)) {
7510Operands.back().push_back(
7511PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
7512continue;
7513 }
7514Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
7515 }
7516 }
7517if (Operands.size() == 2) {
7518     // Try to find the best operand candidates.
7519for (unsignedI : seq<unsigned>(0, VL.size() - 1)) {
7520SmallVector<std::pair<Value *, Value *>> Candidates(3);
7521 Candidates[0] = std::make_pair(Operands[0][I],Operands[0][I + 1]);
7522 Candidates[1] = std::make_pair(Operands[0][I],Operands[1][I + 1]);
7523 Candidates[2] = std::make_pair(Operands[1][I],Operands[0][I + 1]);
7524 std::optional<int> Res =findBestRootPair(Candidates);
7525switch (Res.value_or(0)) {
7526case 0:
7527break;
7528case 1:
7529std::swap(Operands[0][I + 1],Operands[1][I + 1]);
7530break;
7531case 2:
7532std::swap(Operands[0][I],Operands[1][I]);
7533break;
7534default:
7535llvm_unreachable("Unexpected index.");
7536 }
7537 }
7538 }
7539DenseSet<unsigned> UniqueOpcodes;
7540constexprunsigned NumAltInsts = 3;// main + alt + shuffle.
7541unsigned NonInstCnt = 0;
7542   // Estimate the number of instructions required for the vectorized node and
7543   // for the buildvector node.
7544   unsigned UndefCnt = 0;
7545   // Count the number of extra shuffles required for vector nodes.
7546   unsigned ExtraShuffleInsts = 0;
7547   // Check whether the operands contain the same values, forming either a
7548   // perfect diamond match or a shuffled match.
7549if (Operands.size() == 2) {
7550// Do not count same operands twice.
7551if (Operands.front() ==Operands.back()) {
7552Operands.erase(Operands.begin());
7553 }elseif (!allConstant(Operands.front()) &&
7554all_of(Operands.front(), [&](Value *V) {
7555 return is_contained(Operands.back(), V);
7556 })) {
7557Operands.erase(Operands.begin());
7558 ++ExtraShuffleInsts;
7559 }
7560 }
7561constLoop *L = LI->getLoopFor(S.getMainOp()->getParent());
7562   // Vectorize the node if:
7563   // 1. at least one operand is constant or a splat.
7564   // 2. the operands have many loop invariants (while the instructions
7565   //    themselves are not loop invariant).
7566   // 3. at least one unique operand is expected to be vectorized.
7567returnnone_of(Operands,
7568 [&](ArrayRef<Value *>Op) {
7569if (allConstant(Op) ||
7570 (!isSplat(Op) &&allSameBlock(Op) &&allSameType(Op) &&
7571getSameOpcode(Op, *TLI)))
7572returnfalse;
7573DenseMap<Value *, unsigned> Uniques;
7574for (Value *V :Op) {
7575if (isa<Constant, ExtractElementInst>(V) ||
7576 getTreeEntry(V) || (L &&L->isLoopInvariant(V))) {
7577 if (isa<UndefValue>(V))
7578 ++UndefCnt;
7579 continue;
7580 }
7581auto Res = Uniques.try_emplace(V, 0);
7582// Found first duplicate - need to add shuffle.
7583if (!Res.second && Res.first->second == 1)
7584 ++ExtraShuffleInsts;
7585 ++Res.first->getSecond();
7586if (auto *I = dyn_cast<Instruction>(V))
7587 UniqueOpcodes.insert(I->getOpcode());
7588elseif (Res.second)
7589 ++NonInstCnt;
7590 }
7591returnnone_of(Uniques, [&](constauto &P) {
7592returnP.first->hasNUsesOrMore(P.second + 1) &&
7593none_of(P.first->users(), [&](User *U) {
7594 return getTreeEntry(U) || Uniques.contains(U);
7595 });
7596 });
7597 }) ||
7598          // Do not vectorize the node if the estimated number of vector
7599          // instructions exceeds the estimated number of buildvector
7600          // instructions. The number of vector operations is the number of
7601          // vector instructions plus those required for the operands
7602          // (buildvectors); the buildvector count is number_of_operands * number_of_scalars.
7603 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7604 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
7605 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7606}
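// A rough cost comparison with hypothetical numbers, following the estimate
// above: for VL.size() == 4 and a two-operand main opcode, the buildvector
// estimate is 2 * 4 = 8 instructions. If the operands contribute 3 unique
// opcodes, 1 non-instruction value and 2 extra shuffles, the vector estimate
// is 3 + 1 + 2 + NumAltInsts(3) = 9, which is not smaller than 8, so this
// part of the check does not consider the alternate node profitable.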
7607
7608 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7609     const InstructionsState &S, ArrayRef<Value *> VL,
7610     bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7611     SmallVectorImpl<Value *> &PointerOps) {
7612   assert(S.getMainOp() &&
7613          "Expected instructions with same/alternate opcodes only.");
7614
7615   unsigned ShuffleOrOp =
7616       S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
7617   Instruction *VL0 = S.getMainOp();
7618switch (ShuffleOrOp) {
7619case Instruction::PHI: {
7620// Too many operands - gather, most probably won't be vectorized.
7621if (VL0->getNumOperands() >MaxPHINumOperands)
7622return TreeEntry::NeedToGather;
7623// Check for terminator values (e.g. invoke).
7624for (Value *V : VL) {
7625auto *PHI = dyn_cast<PHINode>(V);
7626if (!PHI)
7627continue;
7628for (Value *Incoming :PHI->incoming_values()) {
7629Instruction *Term = dyn_cast<Instruction>(Incoming);
7630if (Term &&Term->isTerminator()) {
7631LLVM_DEBUG(dbgs()
7632 <<"SLP: Need to swizzle PHINodes (terminator use).\n");
7633return TreeEntry::NeedToGather;
7634 }
7635 }
7636 }
7637
7638return TreeEntry::Vectorize;
7639 }
7640case Instruction::ExtractValue:
7641case Instruction::ExtractElement: {
7642bool Reuse = canReuseExtract(VL, CurrentOrder);
7643// FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
7644// non-full registers).
7645if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
7646return TreeEntry::NeedToGather;
7647if (Reuse || !CurrentOrder.empty())
7648return TreeEntry::Vectorize;
7649LLVM_DEBUG(dbgs() <<"SLP: Gather extract sequence.\n");
7650return TreeEntry::NeedToGather;
7651 }
7652case Instruction::InsertElement: {
7653// Check that we have a buildvector and not a shuffle of 2 or more
7654// different vectors.
7655ValueSet SourceVectors;
7656for (Value *V : VL) {
7657 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
7658assert(getElementIndex(V) != std::nullopt &&
7659"Non-constant or undef index?");
7660 }
7661
7662if (count_if(VL, [&SourceVectors](Value *V) {
7663return !SourceVectors.contains(V);
7664 }) >= 2) {
7665// Found 2nd source vector - cancel.
7666LLVM_DEBUG(dbgs() <<"SLP: Gather of insertelement vectors with "
7667"different source vectors.\n");
7668return TreeEntry::NeedToGather;
7669 }
7670
7671if (any_of(VL, [&SourceVectors](Value *V) {
7672// The last InsertElement can have multiple uses.
7673return SourceVectors.contains(V) && !V->hasOneUse();
7674 })) {
7675assert(SLPReVec &&"Only supported by REVEC.");
7676LLVM_DEBUG(dbgs() <<"SLP: Gather of insertelement vectors with "
7677"multiple uses.\n");
7678return TreeEntry::NeedToGather;
7679 }
7680
7681return TreeEntry::Vectorize;
7682 }
7683case Instruction::Load: {
7684// Check that a vectorized load would load the same memory as a scalar
7685// load. For example, we don't want to vectorize loads that are smaller
7686// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7687// treats loading/storing it as an i8 struct. If we vectorize loads/stores
7688// from such a struct, we read/write packed bits disagreeing with the
7689// unvectorized version.
7690switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
7691caseLoadsState::Vectorize:
7692return TreeEntry::Vectorize;
7693caseLoadsState::ScatterVectorize:
7694if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7695// Delay slow vectorized nodes for better vectorization attempts.
7696 LoadEntriesToVectorize.insert(VectorizableTree.size());
7697return TreeEntry::NeedToGather;
7698 }
7699return TreeEntry::ScatterVectorize;
7700caseLoadsState::StridedVectorize:
7701if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7702// Delay slow vectorized nodes for better vectorization attempts.
7703 LoadEntriesToVectorize.insert(VectorizableTree.size());
7704return TreeEntry::NeedToGather;
7705 }
7706return TreeEntry::StridedVectorize;
7707caseLoadsState::Gather:
7708#ifndef NDEBUG
7709Type *ScalarTy = VL0->getType();
7710if (DL->getTypeSizeInBits(ScalarTy) !=
7711DL->getTypeAllocSizeInBits(ScalarTy))
7712LLVM_DEBUG(dbgs() <<"SLP: Gathering loads of non-packed type.\n");
7713elseif (any_of(VL, [](Value *V) {
7714auto *LI = dyn_cast<LoadInst>(V);
7715return !LI || !LI->isSimple();
7716 }))
7717LLVM_DEBUG(dbgs() <<"SLP: Gathering non-simple loads.\n");
7718else
7719LLVM_DEBUG(dbgs() <<"SLP: Gathering non-consecutive loads.\n");
7720#endif// NDEBUG
7721registerNonVectorizableLoads(VL);
7722return TreeEntry::NeedToGather;
7723 }
7724llvm_unreachable("Unexpected state of loads");
7725 }
7726case Instruction::ZExt:
7727case Instruction::SExt:
7728case Instruction::FPToUI:
7729case Instruction::FPToSI:
7730case Instruction::FPExt:
7731case Instruction::PtrToInt:
7732case Instruction::IntToPtr:
7733case Instruction::SIToFP:
7734case Instruction::UIToFP:
7735case Instruction::Trunc:
7736case Instruction::FPTrunc:
7737case Instruction::BitCast: {
7738Type *SrcTy = VL0->getOperand(0)->getType();
7739for (Value *V : VL) {
7740if (isa<PoisonValue>(V))
7741continue;
7742Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7743if (Ty != SrcTy || !isValidElementType(Ty)) {
7744LLVM_DEBUG(
7745dbgs() <<"SLP: Gathering casts with different src types.\n");
7746return TreeEntry::NeedToGather;
7747 }
7748 }
7749return TreeEntry::Vectorize;
7750 }
7751case Instruction::ICmp:
7752case Instruction::FCmp: {
7753// Check that all of the compares have the same predicate.
7754CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7755CmpInst::Predicate SwapP0 =CmpInst::getSwappedPredicate(P0);
7756Type *ComparedTy = VL0->getOperand(0)->getType();
7757for (Value *V : VL) {
7758if (isa<PoisonValue>(V))
7759continue;
7760auto *Cmp = cast<CmpInst>(V);
7761if ((Cmp->getPredicate() != P0 &&Cmp->getPredicate() != SwapP0) ||
7762Cmp->getOperand(0)->getType() != ComparedTy) {
7763LLVM_DEBUG(dbgs() <<"SLP: Gathering cmp with different predicate.\n");
7764return TreeEntry::NeedToGather;
7765 }
7766 }
7767return TreeEntry::Vectorize;
7768 }
7769case Instruction::Select:
7770case Instruction::FNeg:
7771case Instruction::Add:
7772case Instruction::FAdd:
7773case Instruction::Sub:
7774case Instruction::FSub:
7775case Instruction::Mul:
7776case Instruction::FMul:
7777case Instruction::UDiv:
7778case Instruction::SDiv:
7779case Instruction::FDiv:
7780case Instruction::URem:
7781case Instruction::SRem:
7782case Instruction::FRem:
7783case Instruction::Shl:
7784case Instruction::LShr:
7785case Instruction::AShr:
7786case Instruction::And:
7787case Instruction::Or:
7788case Instruction::Xor:
7789case Instruction::Freeze:
7790if (S.getMainOp()->getType()->isFloatingPointTy() &&
7791TTI->isFPVectorizationPotentiallyUnsafe() &&any_of(VL, [](Value *V) {
7792auto *I = dyn_cast<Instruction>(V);
7793returnI &&I->isBinaryOp() && !I->isFast();
7794 }))
7795return TreeEntry::NeedToGather;
7796return TreeEntry::Vectorize;
7797case Instruction::GetElementPtr: {
7798// We don't combine GEPs with complicated (nested) indexing.
7799for (Value *V : VL) {
7800auto *I = dyn_cast<GetElementPtrInst>(V);
7801if (!I)
7802continue;
7803if (I->getNumOperands() != 2) {
7804LLVM_DEBUG(dbgs() <<"SLP: not-vectorizable GEP (nested indexes).\n");
7805return TreeEntry::NeedToGather;
7806 }
7807 }
7808
7809// We can't combine several GEPs into one vector if they operate on
7810// different types.
7811Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7812for (Value *V : VL) {
7813auto *GEP = dyn_cast<GEPOperator>(V);
7814if (!GEP)
7815continue;
7816Type *CurTy =GEP->getSourceElementType();
7817if (Ty0 != CurTy) {
7818LLVM_DEBUG(dbgs() <<"SLP: not-vectorizable GEP (different types).\n");
7819return TreeEntry::NeedToGather;
7820 }
7821 }
7822
7823// We don't combine GEPs with non-constant indexes.
7824Type *Ty1 = VL0->getOperand(1)->getType();
7825for (Value *V : VL) {
7826auto *I = dyn_cast<GetElementPtrInst>(V);
7827if (!I)
7828continue;
7829auto *Op =I->getOperand(1);
7830if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7831 (Op->getType() != Ty1 &&
7832 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7833Op->getType()->getScalarSizeInBits() >
7834DL->getIndexSizeInBits(
7835V->getType()->getPointerAddressSpace())))) {
7836LLVM_DEBUG(
7837dbgs() <<"SLP: not-vectorizable GEP (non-constant indexes).\n");
7838return TreeEntry::NeedToGather;
7839 }
7840 }
7841
7842return TreeEntry::Vectorize;
7843 }
7844case Instruction::Store: {
7845// Check if the stores are consecutive or if we need to swizzle them.
7846llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7847// Avoid types that are padded when being allocated as scalars, while
7848// being packed together in a vector (such as i1).
7849if (DL->getTypeSizeInBits(ScalarTy) !=
7850DL->getTypeAllocSizeInBits(ScalarTy)) {
7851LLVM_DEBUG(dbgs() <<"SLP: Gathering stores of non-packed type.\n");
7852return TreeEntry::NeedToGather;
7853 }
7854// Make sure all stores in the bundle are simple - we can't vectorize
7855// atomic or volatile stores.
7856for (Value *V : VL) {
7857auto *SI = cast<StoreInst>(V);
7858if (!SI->isSimple()) {
7859LLVM_DEBUG(dbgs() <<"SLP: Gathering non-simple stores.\n");
7860return TreeEntry::NeedToGather;
7861 }
7862 PointerOps.push_back(SI->getPointerOperand());
7863 }
7864
7865// Check the order of pointer operands.
7866if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
7867Value *Ptr0;
7868Value *PtrN;
7869if (CurrentOrder.empty()) {
7870 Ptr0 = PointerOps.front();
7871 PtrN = PointerOps.back();
7872 }else {
7873 Ptr0 = PointerOps[CurrentOrder.front()];
7874 PtrN = PointerOps[CurrentOrder.back()];
7875 }
7876 std::optional<int> Dist =
7877getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7878// Check that the sorted pointer operands are consecutive.
7879if (static_cast<unsigned>(*Dist) == VL.size() - 1)
7880return TreeEntry::Vectorize;
7881 }
7882
7883LLVM_DEBUG(dbgs() <<"SLP: Non-consecutive store.\n");
7884return TreeEntry::NeedToGather;
7885 }
7886case Instruction::Call: {
7887if (S.getMainOp()->getType()->isFloatingPointTy() &&
7888TTI->isFPVectorizationPotentiallyUnsafe() &&any_of(VL, [](Value *V) {
7889auto *I = dyn_cast<Instruction>(V);
7890returnI && !I->isFast();
7891 }))
7892return TreeEntry::NeedToGather;
7893// Check if the calls are all to the same vectorizable intrinsic or
7894// library function.
7895CallInst *CI = cast<CallInst>(VL0);
7896Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
7897
7898VFShape Shape =VFShape::get(
7899 CI->getFunctionType(),
7900ElementCount::getFixed(static_cast<unsignedint>(VL.size())),
7901false/*HasGlobalPred*/);
7902Function *VecFunc =VFDatabase(*CI).getVectorizedFunction(Shape);
7903
7904if (!VecFunc && !isTriviallyVectorizable(ID)) {
7905LLVM_DEBUG(dbgs() <<"SLP: Non-vectorizable call.\n");
7906return TreeEntry::NeedToGather;
7907 }
7908Function *F = CI->getCalledFunction();
7909unsigned NumArgs = CI->arg_size();
7910SmallVector<Value *, 4> ScalarArgs(NumArgs,nullptr);
7911for (unsigned J = 0; J != NumArgs; ++J)
7912if (isVectorIntrinsicWithScalarOpAtArg(ID, J,TTI))
7913 ScalarArgs[J] = CI->getArgOperand(J);
7914for (Value *V : VL) {
7915CallInst *CI2 = dyn_cast<CallInst>(V);
7916if (!CI2 || CI2->getCalledFunction() !=F ||
7917getVectorIntrinsicIDForCall(CI2, TLI) !=ID ||
7918 (VecFunc &&
7919 VecFunc !=VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
7920 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
7921LLVM_DEBUG(dbgs() <<"SLP: mismatched calls:" << *CI <<"!=" << *V
7922 <<"\n");
7923return TreeEntry::NeedToGather;
7924 }
7925       // Some intrinsics have scalar arguments, which must be the same across
7926       // the calls for them to be vectorized.
7927for (unsigned J = 0; J != NumArgs; ++J) {
7928if (isVectorIntrinsicWithScalarOpAtArg(ID, J,TTI)) {
7929Value *A1J = CI2->getArgOperand(J);
7930if (ScalarArgs[J] != A1J) {
7931LLVM_DEBUG(dbgs()
7932 <<"SLP: mismatched arguments in call:" << *CI
7933 <<" argument " << ScalarArgs[J] <<"!=" << A1J <<"\n");
7934return TreeEntry::NeedToGather;
7935 }
7936 }
7937 }
7938// Verify that the bundle operands are identical between the two calls.
7939if (CI->hasOperandBundles() &&
7940 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
7941 CI->op_begin() + CI->getBundleOperandsEndIndex(),
7942 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
7943LLVM_DEBUG(dbgs() <<"SLP: mismatched bundle operands in calls:" << *CI
7944 <<"!=" << *V <<'\n');
7945return TreeEntry::NeedToGather;
7946 }
7947 }
7948
7949return TreeEntry::Vectorize;
7950 }
7951case Instruction::ShuffleVector: {
7952if (!S.isAltShuffle()) {
7953       // REVEC can support non-alternate shuffles.
7954if (SLPReVec &&getShufflevectorNumGroups(VL))
7955return TreeEntry::Vectorize;
7956// If this is not an alternate sequence of opcode like add-sub
7957// then do not vectorize this instruction.
7958LLVM_DEBUG(dbgs() <<"SLP: ShuffleVector are not vectorized.\n");
7959return TreeEntry::NeedToGather;
7960 }
7961if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
7962LLVM_DEBUG(
7963dbgs()
7964 <<"SLP: ShuffleVector not vectorized, operands are buildvector and "
7965"the whole alt sequence is not profitable.\n");
7966return TreeEntry::NeedToGather;
7967 }
7968
7969return TreeEntry::Vectorize;
7970 }
7971default:
7972LLVM_DEBUG(dbgs() <<"SLP: Gathering unknown instruction.\n");
7973return TreeEntry::NeedToGather;
7974 }
7975}
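// A minimal sketch of the possible outcomes for a load bundle (hypothetical
// IR): four consecutive simple loads from a common base pointer yield
// TreeEntry::Vectorize; loads from unrelated pointers may yield
// TreeEntry::ScatterVectorize (lowered via llvm.masked.gather) or, when even
// that is not possible, TreeEntry::NeedToGather, in which case the loads are
// also registered as non-vectorizable.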
7976
7977 namespace {
7978 /// Correctly handles the operands of phi nodes, based on the order of
7979 /// incoming basic blocks/values in the \p Main PHINode.
7980 class PHIHandler {
7981   DominatorTree &DT;
7982   PHINode *Main = nullptr;
7983   SmallVector<Value *> Phis;
7984   SmallVector<SmallVector<Value *>> Operands;
7985
7986 public:
7987   PHIHandler() = delete;
7988   PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
7989       : DT(DT), Main(Main), Phis(Phis),
7990         Operands(Main->getNumIncomingValues(),
7991                  SmallVector<Value *>(Phis.size(), nullptr)) {}
7992void buildOperands() {
7993constexprunsigned FastLimit = 4;
7994if (Main->getNumIncomingValues() <= FastLimit) {
7995for (unsignedI : seq<unsigned>(0, Main->getNumIncomingValues())) {
7996BasicBlock *InBB = Main->getIncomingBlock(I);
7997if (!DT.isReachableFromEntry(InBB)) {
7998Operands[I].assign(Phis.size(),PoisonValue::get(Main->getType()));
7999continue;
8000 }
8001// Prepare the operand vector.
8002for (auto [Idx, V] :enumerate(Phis)) {
8003auto *P = dyn_cast<PHINode>(V);
8004if (!P) {
8005assert(isa<PoisonValue>(V) &&
8006"Expected isa instruction or poison value.");
8007Operands[I][Idx] =V;
8008continue;
8009 }
8010if (P->getIncomingBlock(I) == InBB)
8011Operands[I][Idx] =P->getIncomingValue(I);
8012else
8013Operands[I][Idx] =P->getIncomingValueForBlock(InBB);
8014 }
8015 }
8016return;
8017 }
8018SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4>Blocks;
8019for (unsignedI : seq<unsigned>(0, Main->getNumIncomingValues())) {
8020BasicBlock *InBB = Main->getIncomingBlock(I);
8021if (!DT.isReachableFromEntry(InBB)) {
8022Operands[I].assign(Phis.size(),PoisonValue::get(Main->getType()));
8023continue;
8024 }
8025Blocks.try_emplace(InBB).first->second.push_back(I);
8026 }
8027for (auto [Idx, V] :enumerate(Phis)) {
8028if (isa<PoisonValue>(V)) {
8029for (unsignedI : seq<unsigned>(Main->getNumIncomingValues()))
8030Operands[I][Idx] =V;
8031continue;
8032 }
8033auto *P = cast<PHINode>(V);
8034for (unsignedI : seq<unsigned>(0,P->getNumIncomingValues())) {
8035BasicBlock *InBB =P->getIncomingBlock(I);
8036if (InBB == Main->getIncomingBlock(I)) {
8037if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
8038continue;
8039Operands[I][Idx] =P->getIncomingValue(I);
8040continue;
8041 }
8042auto It =Blocks.find(InBB);
8043if (It ==Blocks.end())
8044continue;
8045Operands[It->second.front()][Idx] =P->getIncomingValue(I);
8046 }
8047 }
8048for (constauto &P :Blocks) {
8049if (P.getSecond().size() <= 1)
8050continue;
8051unsigned BasicI =P.getSecond().front();
8052for (unsignedI :ArrayRef(P.getSecond()).drop_front()) {
8053assert(all_of(enumerate(Operands[I]),
8054 [&](constauto &Data) {
8055return !Data.value() ||
8056 Data.value() ==Operands[BasicI][Data.index()];
8057 }) &&
8058"Expected empty operands list.");
8059Operands[I] =Operands[BasicI];
8060 }
8061 }
8062 }
8063ArrayRef<Value *>getOperands(unsignedI) const{returnOperands[I]; }
8064};
8065}// namespace
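// Typical usage of PHIHandler, mirroring the PHI case in buildTree_rec below
// (sketch only):
//   PHIHandler Handler(*DT, cast<PHINode>(VL0), VL);
//   Handler.buildOperands();
//   for (unsigned I : seq<unsigned>(PH->getNumOperands()))
//     TE->setOperand(I, Handler.getOperands(I));
// buildOperands() lines up each phi's incoming values by the incoming blocks
// of the main phi, so getOperands(I) returns one operand lane per bundled phi
// for incoming block I, even if the other phis list their incoming blocks in
// a different order.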
8066
8067 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
8068                             const EdgeInfo &UserTreeIdx,
8069                             unsigned InterleaveFactor) {
8070   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
8071
8072SmallVector<int> ReuseShuffleIndices;
8073SmallVector<Value *> UniqueValues;
8074SmallVector<Value *> NonUniqueValueVL;
8075auto TryToFindDuplicates = [&](const InstructionsState &S,
8076bool DoNotFail =false) {
8077// Check that every instruction appears once in this bundle.
8078SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
8079for (Value *V : VL) {
8080if (isConstant(V)) {
8081 ReuseShuffleIndices.emplace_back(
8082 isa<PoisonValue>(V) ?PoisonMaskElem : UniqueValues.size());
8083 UniqueValues.emplace_back(V);
8084continue;
8085 }
8086auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
8087 ReuseShuffleIndices.emplace_back(Res.first->second);
8088if (Res.second)
8089 UniqueValues.emplace_back(V);
8090 }
8091size_t NumUniqueScalarValues = UniqueValues.size();
8092bool IsFullVectors =hasFullVectorsOrPowerOf2(
8093 *TTI,getValueType(UniqueValues.front()), NumUniqueScalarValues);
8094if (NumUniqueScalarValues == VL.size() &&
8095 (VectorizeNonPowerOf2 || IsFullVectors)) {
8096 ReuseShuffleIndices.clear();
8097 }else {
8098     // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
8099if ((UserTreeIdx.UserTE &&
8100 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8101 !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
8102LLVM_DEBUG(dbgs() <<"SLP: Reshuffling scalars not yet supported "
8103"for nodes with padding.\n");
8104 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8105returnfalse;
8106 }
8107LLVM_DEBUG(dbgs() <<"SLP: Shuffle for reused scalars.\n");
8108if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8109 (UniquePositions.size() == 1 &&all_of(UniqueValues, [](Value *V) {
8110return isa<UndefValue>(V) || !isConstant(V);
8111 }))) {
8112if (DoNotFail && UniquePositions.size() > 1 &&
8113 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8114all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8115// Find the number of elements, which forms full vectors.
8116unsigned PWSz =getFullVectorNumberOfElements(
8117 *TTI, UniqueValues.front()->getType(), UniqueValues.size());
8118if (PWSz == VL.size()) {
8119 ReuseShuffleIndices.clear();
8120 }else {
8121 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
8122 NonUniqueValueVL.append(
8123 PWSz - UniqueValues.size(),
8124PoisonValue::get(UniqueValues.front()->getType()));
8125// Check that extended with poisons operations are still valid for
8126// vectorization (div/rem are not allowed).
8127if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
8128LLVM_DEBUG(dbgs() <<"SLP: Scalar used twice in bundle.\n");
8129 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8130returnfalse;
8131 }
8132 VL = NonUniqueValueVL;
8133 }
8134returntrue;
8135 }
8136LLVM_DEBUG(dbgs() <<"SLP: Scalar used twice in bundle.\n");
8137 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8138returnfalse;
8139 }
8140 VL = UniqueValues;
8141 }
8142returntrue;
8143 };
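// A minimal sketch of TryToFindDuplicates (hypothetical bundle): for
// VL = { %a, %b, %a, %b }, UniqueValues becomes { %a, %b } and
// ReuseShuffleIndices becomes { 0, 1, 0, 1 }; the node is then built from the
// unique scalars and the reuse mask reconstructs the original lane order. If
// the unique scalars do not form a full vector, they may be padded with
// poison values, provided the padded opcode sequence is still valid (div/rem
// are not allowed).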
8144
8145 InstructionsState S =getSameOpcode(VL, *TLI);
8146
8147// Don't go into catchswitch blocks, which can happen with PHIs.
8148// Such blocks can only have PHIs and the catchswitch. There is no
8149// place to insert a shuffle if we need to, so just avoid that issue.
8150if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8151LLVM_DEBUG(dbgs() <<"SLP: bundle in catchswitch block.\n");
8152 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8153return;
8154 }
8155
8156// Check if this is a duplicate of another entry.
8157if (S) {
8158if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8159LLVM_DEBUG(dbgs() <<"SLP: \tChecking bundle: " << *S.getMainOp()
8160 <<".\n");
8161if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8162auto It = MultiNodeScalars.find(S.getMainOp());
8163if (It != MultiNodeScalars.end()) {
8164auto *TEIt =find_if(It->getSecond(),
8165 [&](TreeEntry *ME) { return ME->isSame(VL); });
8166if (TEIt != It->getSecond().end())
8167 E = *TEIt;
8168else
8169 E =nullptr;
8170 }else {
8171 E =nullptr;
8172 }
8173 }
8174if (!E) {
8175if (!doesNotNeedToBeScheduled(S.getMainOp())) {
8176LLVM_DEBUG(dbgs() <<"SLP: Gathering due to partial overlap.\n");
8177if (TryToFindDuplicates(S))
8178 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8179 ReuseShuffleIndices);
8180return;
8181 }
8182SmallPtrSet<const TreeEntry *, 4> Nodes;
8183 Nodes.insert(getTreeEntry(S.getMainOp()));
8184for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
8185 Nodes.insert(E);
8186SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
8187if (any_of(Nodes, [&](const TreeEntry *E) {
8188if (all_of(E->Scalars,
8189 [&](Value *V) { return Values.contains(V); }))
8190returntrue;
8191SmallPtrSet<Value *, 8> EValues(E->Scalars.begin(),
8192 E->Scalars.end());
8193 return (
8194all_of(VL, [&](Value *V) {return EValues.contains(V); }));
8195 })) {
8196LLVM_DEBUG(dbgs() <<"SLP: Gathering due to full overlap.\n");
8197if (TryToFindDuplicates(S))
8198 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8199 ReuseShuffleIndices);
8200return;
8201 }
8202 }else {
8203// Record the reuse of the tree node. FIXME, currently this is only
8204// used to properly draw the graph rather than for the actual
8205// vectorization.
8206 E->UserTreeIndices.push_back(UserTreeIdx);
8207LLVM_DEBUG(dbgs() <<"SLP: Perfect diamond merge at " << *S.getMainOp()
8208 <<".\n");
8209return;
8210 }
8211 }
8212 }
8213
8214// Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
8215// a load), in which case peek through to include it in the tree, without
8216// ballooning over-budget.
8217if (Depth >=RecursionMaxDepth &&
8218 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8219 (match(S.getMainOp(),m_Load(m_Value())) ||
8220all_of(VL, [&S](constValue *I) {
8221returnmatch(I,
8222m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
8223 cast<Instruction>(I)->getOpcode() == S.getOpcode();
8224 })))) {
8225LLVM_DEBUG(dbgs() <<"SLP: Gathering due to max recursion depth.\n");
8226if (TryToFindDuplicates(S))
8227 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8228 ReuseShuffleIndices);
8229return;
8230 }
8231
8232// Don't handle scalable vectors
8233if (S && S.getOpcode() == Instruction::ExtractElement &&
8234 isa<ScalableVectorType>(
8235 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8236LLVM_DEBUG(dbgs() <<"SLP: Gathering due to scalable vector type.\n");
8237if (TryToFindDuplicates(S))
8238 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8239 ReuseShuffleIndices);
8240return;
8241 }
8242
8243// Don't handle vectors.
8244if (!SLPReVec &&getValueType(VL.front())->isVectorTy()) {
8245LLVM_DEBUG(dbgs() <<"SLP: Gathering due to vector type.\n");
8246 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8247return;
8248 }
8249
8250   // If all of the operands are identical or constant we have a simple solution.
8251   // If we deal with insert/extract instructions, they all must have constant
8252   // indices, otherwise we should gather them, not try to vectorize.
8253   // If this is an alternate-opcode node with 2 elements whose operands are
8254   // gathered - do not vectorize.
8255auto &&NotProfitableForVectorization = [&S,this,
8256Depth](ArrayRef<Value *> VL) {
8257if (!S || !S.isAltShuffle() || VL.size() > 2)
8258returnfalse;
8259if (VectorizableTree.size() <MinTreeSize)
8260returnfalse;
8261if (Depth >=RecursionMaxDepth - 1)
8262returntrue;
8263// Check if all operands are extracts, part of vector node or can build a
8264// regular vectorize node.
8265SmallVector<unsigned, 8> InstsCount;
8266for (Value *V : VL) {
8267auto *I = cast<Instruction>(V);
8268 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
8269 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8270 }));
8271 }
8272bool IsCommutative =
8273isCommutative(S.getMainOp()) ||isCommutative(S.getAltOp());
8274if ((IsCommutative &&
8275 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
8276 (!IsCommutative &&
8277all_of(InstsCount, [](unsigned ICnt) {return ICnt < 2; })))
8278returntrue;
8279assert(VL.size() == 2 &&"Expected only 2 alternate op instructions.");
8280SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
8281auto *I1 = cast<Instruction>(VL.front());
8282auto *I2 = cast<Instruction>(VL.back());
8283for (intOp : seq<int>(S.getMainOp()->getNumOperands()))
8284 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8285 I2->getOperand(Op));
8286if (static_cast<unsigned>(count_if(
8287 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8288returnfindBestRootPair(Cand,LookAheadHeuristics::ScoreSplat);
8289 })) >= S.getMainOp()->getNumOperands() / 2)
8290returnfalse;
8291if (S.getMainOp()->getNumOperands() > 2)
8292returntrue;
8293if (IsCommutative) {
8294// Check permuted operands.
8295 Candidates.clear();
8296for (intOp = 0, E = S.getMainOp()->getNumOperands();Op < E; ++Op)
8297 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8298 I2->getOperand((Op + 1) % E));
8299if (any_of(
8300 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8301returnfindBestRootPair(Cand,LookAheadHeuristics::ScoreSplat);
8302 }))
8303returnfalse;
8304 }
8305returntrue;
8306 };
8307SmallVector<unsigned> SortedIndices;
8308BasicBlock *BB =nullptr;
8309bool IsScatterVectorizeUserTE =
8310 UserTreeIdx.UserTE &&
8311 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8312bool AreAllSameBlock = S &&allSameBlock(VL);
8313bool AreScatterAllGEPSameBlock =
8314 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8315 VL.size() > 2 &&
8316all_of(VL,
8317 [&BB](Value *V) {
8318auto *I = dyn_cast<GetElementPtrInst>(V);
8319if (!I)
8320returndoesNotNeedToBeScheduled(V);
8321if (!BB)
8322 BB =I->getParent();
8323return BB ==I->getParent() &&I->getNumOperands() == 2;
8324 }) &&
8325 BB &&
8326sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8327 SortedIndices));
8328bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8329if (!AreAllSameInsts || (!S &&allConstant(VL)) ||isSplat(VL) ||
8330 (S &&
8331 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8332 S.getMainOp()) &&
8333 !all_of(VL,isVectorLikeInstWithConstOps)) ||
8334 NotProfitableForVectorization(VL)) {
8335LLVM_DEBUG(dbgs() <<"SLP: Gathering due to C,S,B,O, small shuffle. \n");
8336if (TryToFindDuplicates(S))
8337 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8338 ReuseShuffleIndices);
8339return;
8340 }
8341
8342// Don't vectorize ephemeral values.
8343if (S && !EphValues.empty()) {
8344for (Value *V : VL) {
8345if (EphValues.count(V)) {
8346LLVM_DEBUG(dbgs() <<"SLP: The instruction (" << *V
8347 <<") is ephemeral.\n");
8348 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8349return;
8350 }
8351 }
8352 }
8353
8354// We now know that this is a vector of instructions of the same type from
8355// the same block.
8356
8357// Check that none of the instructions in the bundle are already in the tree.
8358for (Value *V : VL) {
8359if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8360doesNotNeedToBeScheduled(V))
8361continue;
8362if (getTreeEntry(V)) {
8363LLVM_DEBUG(dbgs() <<"SLP: The instruction (" << *V
8364 <<") is already in tree.\n");
8365if (TryToFindDuplicates(S))
8366 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8367 ReuseShuffleIndices);
8368return;
8369 }
8370 }
8371
8372// The reduction nodes (stored in UserIgnoreList) also should stay scalar.
8373if (UserIgnoreList && !UserIgnoreList->empty()) {
8374for (Value *V : VL) {
8375if (UserIgnoreList->contains(V)) {
8376LLVM_DEBUG(dbgs() <<"SLP: Gathering due to gathered scalar.\n");
8377if (TryToFindDuplicates(S))
8378 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8379 ReuseShuffleIndices);
8380return;
8381 }
8382 }
8383 }
8384
8385   // Special processing for sorted pointers for a ScatterVectorize node with
8386   // constant indices only.
8387if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8388assert(VL.front()->getType()->isPointerTy() &&
8389count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8390"Expected pointers only.");
8391// Reset S to make it GetElementPtr kind of node.
8392constauto *It =find_if(VL, IsaPred<GetElementPtrInst>);
8393assert(It != VL.end() &&"Expected at least one GEP.");
8394 S =getSameOpcode(*It, *TLI);
8395 }
8396
8397// Check that all of the users of the scalars that we want to vectorize are
8398// schedulable.
8399Instruction *VL0 = S.getMainOp();
8400 BB = VL0->getParent();
8401
8402if (S &&
8403 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
8404 !DT->isReachableFromEntry(BB))) {
8405// Don't go into unreachable blocks. They may contain instructions with
8406// dependency cycles which confuse the final scheduling.
8407// Do not vectorize EH and non-returning blocks, not profitable in most
8408// cases.
8409LLVM_DEBUG(dbgs() <<"SLP: bundle in unreachable block.\n");
8410 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx);
8411return;
8412 }
8413
8414// Check that every instruction appears once in this bundle.
8415if (!TryToFindDuplicates(S,/*DoNotFail=*/true))
8416return;
8417
8418// Perform specific checks for each particular instruction kind.
8419OrdersType CurrentOrder;
8420SmallVector<Value *> PointerOps;
8421 TreeEntry::EntryState State = getScalarsVectorizationState(
8422 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8423if (State == TreeEntry::NeedToGather) {
8424 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8425 ReuseShuffleIndices);
8426return;
8427 }
8428
8429auto &BSRef = BlocksSchedules[BB];
8430if (!BSRef)
8431 BSRef = std::make_unique<BlockScheduling>(BB);
8432
8433 BlockScheduling &BS = *BSRef;
8434
8435 std::optional<ScheduleData *> Bundle =
8436 BS.tryScheduleBundle(UniqueValues,this, S);
8437#ifdef EXPENSIVE_CHECKS
8438// Make sure we didn't break any internal invariants
8439 BS.verify();
8440#endif
8441if (!Bundle) {
8442LLVM_DEBUG(dbgs() <<"SLP: We are not able to schedule this bundle!\n");
8443assert((!BS.getScheduleData(VL0) ||
8444 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8445"tryScheduleBundle should cancelScheduling on failure");
8446 newTreeEntry(VL, std::nullopt/*not vectorized*/, S, UserTreeIdx,
8447 ReuseShuffleIndices);
8448 NonScheduledFirst.insert(VL.front());
8449if (S.getOpcode() == Instruction::Load &&
8450 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8451registerNonVectorizableLoads(VL);
8452return;
8453 }
8454LLVM_DEBUG(dbgs() <<"SLP: We are able to schedule this bundle.\n");
8455
8456unsigned ShuffleOrOp =
8457 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
8458auto CreateOperandNodes = [&](TreeEntry *TE,constauto &Operands) {
8459// Postpone PHI nodes creation
8460SmallVector<unsigned> PHIOps;
8461for (unsignedI : seq<unsigned>(Operands.size())) {
8462ArrayRef<Value *>Op =Operands[I];
8463if (Op.empty())
8464continue;
8465 InstructionsState S =getSameOpcode(Op, *TLI);
8466if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8467 buildTree_rec(Op,Depth + 1, {TE,I});
8468else
8469 PHIOps.push_back(I);
8470 }
8471for (unsignedI : PHIOps)
8472 buildTree_rec(Operands[I],Depth + 1, {TE,I});
8473 };
8474switch (ShuffleOrOp) {
8475case Instruction::PHI: {
8476auto *PH = cast<PHINode>(VL0);
8477
8478 TreeEntry *TE =
8479 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8480LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (PHINode).\n";
8481TE->dump());
8482
8483// Keeps the reordered operands to avoid code duplication.
8484 PHIHandler Handler(*DT, PH, VL);
8485 Handler.buildOperands();
8486for (unsignedI : seq<unsigned>(PH->getNumOperands()))
8487TE->setOperand(I, Handler.getOperands(I));
8488SmallVector<ArrayRef<Value *>>Operands(PH->getNumOperands());
8489for (unsignedI : seq<unsigned>(PH->getNumOperands()))
8490Operands[I] = Handler.getOperands(I);
8491 CreateOperandNodes(TE,Operands);
8492return;
8493 }
8494case Instruction::ExtractValue:
8495case Instruction::ExtractElement: {
8496if (CurrentOrder.empty()) {
8497LLVM_DEBUG(dbgs() <<"SLP: Reusing or shuffling extract sequence.\n");
8498 }else {
8499LLVM_DEBUG({
8500dbgs() <<"SLP: Reusing or shuffling of reordered extract sequence "
8501"with order";
8502for (unsignedIdx : CurrentOrder)
8503dbgs() <<" " <<Idx;
8504dbgs() <<"\n";
8505 });
8506fixupOrderingIndices(CurrentOrder);
8507 }
8508// Insert new order with initial value 0, if it does not exist,
8509// otherwise return the iterator to the existing one.
8510 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8511 ReuseShuffleIndices, CurrentOrder);
8512LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry "
8513"(ExtractValueInst/ExtractElementInst).\n";
8514TE->dump());
8515// This is a special case, as it does not gather, but at the same time
8516// we are not extending buildTree_rec() towards the operands.
8517TE->setOperand(*this);
8518return;
8519 }
8520case Instruction::InsertElement: {
8521assert(ReuseShuffleIndices.empty() &&"All inserts should be unique");
8522
8523auto OrdCompare = [](const std::pair<int, int> &P1,
8524const std::pair<int, int> &P2) {
8525returnP1.first > P2.first;
8526 };
8527PriorityQueue<std::pair<int, int>,SmallVector<std::pair<int, int>>,
8528decltype(OrdCompare)>
8529 Indices(OrdCompare);
8530for (intI = 0, E = VL.size();I < E; ++I) {
8531unsignedIdx = *getElementIndex(VL[I]);
8532 Indices.emplace(Idx,I);
8533 }
8534OrdersType CurrentOrder(VL.size(), VL.size());
8535bool IsIdentity =true;
8536for (intI = 0, E = VL.size();I < E; ++I) {
8537 CurrentOrder[Indices.top().second] =I;
8538 IsIdentity &= Indices.top().second ==I;
8539 Indices.pop();
8540 }
8541if (IsIdentity)
8542 CurrentOrder.clear();
8543 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8544 {}, CurrentOrder);
8545LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (InsertElementInst).\n";
8546TE->dump());
8547
8548TE->setOperand(*this);
8549 buildTree_rec(TE->getOperand(1),Depth + 1, {TE, 1});
8550return;
8551 }
8552case Instruction::Load: {
8553// Check that a vectorized load would load the same memory as a scalar
8554// load. For example, we don't want to vectorize loads that are smaller
8555// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
8556// treats loading/storing it as an i8 struct. If we vectorize loads/stores
8557// from such a struct, we read/write packed bits disagreeing with the
8558// unvectorized version.
8559 TreeEntry *TE =nullptr;
8560fixupOrderingIndices(CurrentOrder);
8561switch (State) {
8562case TreeEntry::Vectorize:
8563TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8564 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8565if (CurrentOrder.empty())
8566LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (LoadInst).\n";
8567TE->dump());
8568else
8569LLVM_DEBUG(dbgs()
8570 <<"SLP: added a new TreeEntry (jumbled LoadInst).\n";
8571TE->dump());
8572break;
8573case TreeEntry::StridedVectorize:
8574       // Vectorizing non-consecutive loads as strided loads.
8575TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8576 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8577LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (strided LoadInst).\n";
8578TE->dump());
8579break;
8580case TreeEntry::ScatterVectorize:
8581// Vectorizing non-consecutive loads with `llvm.masked.gather`.
8582TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8583 UserTreeIdx, ReuseShuffleIndices);
8584LLVM_DEBUG(
8585dbgs()
8586 <<"SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8587TE->dump());
8588break;
8589case TreeEntry::CombinedVectorize:
8590case TreeEntry::NeedToGather:
8591llvm_unreachable("Unexpected loads state.");
8592 }
8593TE->setOperand(*this);
8594if (State == TreeEntry::ScatterVectorize)
8595 buildTree_rec(PointerOps,Depth + 1, {TE, 0});
8596return;
8597 }
8598case Instruction::ZExt:
8599case Instruction::SExt:
8600case Instruction::FPToUI:
8601case Instruction::FPToSI:
8602case Instruction::FPExt:
8603case Instruction::PtrToInt:
8604case Instruction::IntToPtr:
8605case Instruction::SIToFP:
8606case Instruction::UIToFP:
8607case Instruction::Trunc:
8608case Instruction::FPTrunc:
8609case Instruction::BitCast: {
8610auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8611 std::make_pair(std::numeric_limits<unsigned>::min(),
8612 std::numeric_limits<unsigned>::max()));
8613if (ShuffleOrOp == Instruction::ZExt ||
8614 ShuffleOrOp == Instruction::SExt) {
8615 CastMaxMinBWSizes = std::make_pair(
8616 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8617 PrevMaxBW),
8618 std::min<unsigned>(
8619 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8620 PrevMinBW));
8621 }elseif (ShuffleOrOp == Instruction::Trunc) {
8622 CastMaxMinBWSizes = std::make_pair(
8623 std::max<unsigned>(
8624 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8625 PrevMaxBW),
8626 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8627 PrevMinBW));
8628 }
8629 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8630 ReuseShuffleIndices);
8631LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (CastInst).\n";
8632TE->dump());
8633
8634TE->setOperand(*this);
8635for (unsignedI : seq<unsigned>(VL0->getNumOperands()))
8636 buildTree_rec(TE->getOperand(I),Depth + 1, {TE, I});
8637if (ShuffleOrOp == Instruction::Trunc) {
8638 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8639 }elseif (ShuffleOrOp == Instruction::SIToFP ||
8640 ShuffleOrOp == Instruction::UIToFP) {
8641unsigned NumSignBits =
8642ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC,nullptr, DT);
8643if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
8644APIntMask = DB->getDemandedBits(OpI);
8645 NumSignBits = std::max(NumSignBits,Mask.countl_zero());
8646 }
8647if (NumSignBits * 2 >=
8648 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8649 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8650 }
8651return;
8652 }
8653case Instruction::ICmp:
8654case Instruction::FCmp: {
8655// Check that all of the compares have the same predicate.
8656CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
8657 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8658 ReuseShuffleIndices);
8659LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (CmpInst).\n";
8660TE->dump());
8661
8662ValueListLeft,Right;
8663 VLOperands Ops(VL, S, *this);
8664if (cast<CmpInst>(VL0)->isCommutative()) {
8665// Commutative predicate - collect + sort operands of the instructions
8666// so that each side is more likely to have the same opcode.
8667assert(P0 ==CmpInst::getSwappedPredicate(P0) &&
8668"Commutative Predicate mismatch");
8669 Ops.reorder();
8670Left = Ops.getVL(0);
8671Right = Ops.getVL(1);
8672 }else {
8673// Collect operands - commute if it uses the swapped predicate.
8674for (Value *V : VL) {
8675if (isa<PoisonValue>(V)) {
8676Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
8677Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
8678continue;
8679 }
8680auto *Cmp = cast<CmpInst>(V);
8681Value *LHS =Cmp->getOperand(0);
8682Value *RHS =Cmp->getOperand(1);
8683if (Cmp->getPredicate() != P0)
8684std::swap(LHS, RHS);
8685Left.push_back(LHS);
8686Right.push_back(RHS);
8687 }
8688 }
8689TE->setOperand(0,Left);
8690TE->setOperand(1,Right);
8691 buildTree_rec(Left,Depth + 1, {TE, 0});
8692 buildTree_rec(Right,Depth + 1, {TE, 1});
8693if (ShuffleOrOp == Instruction::ICmp) {
8694unsigned NumSignBits0 =
8695ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC,nullptr, DT);
8696if (NumSignBits0 * 2 >=
8697 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8698 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8699unsigned NumSignBits1 =
8700ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC,nullptr, DT);
8701if (NumSignBits1 * 2 >=
8702 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
8703 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
8704 }
8705return;
8706 }
8707case Instruction::Select:
8708case Instruction::FNeg:
8709case Instruction::Add:
8710case Instruction::FAdd:
8711case Instruction::Sub:
8712case Instruction::FSub:
8713case Instruction::Mul:
8714case Instruction::FMul:
8715case Instruction::UDiv:
8716case Instruction::SDiv:
8717case Instruction::FDiv:
8718case Instruction::URem:
8719case Instruction::SRem:
8720case Instruction::FRem:
8721case Instruction::Shl:
8722case Instruction::LShr:
8723case Instruction::AShr:
8724case Instruction::And:
8725case Instruction::Or:
8726case Instruction::Xor:
8727case Instruction::Freeze: {
8728 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8729 ReuseShuffleIndices);
8730LLVM_DEBUG(
8731dbgs() <<"SLP: added a new TreeEntry "
8732"(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8733TE->dump());
8734
8735TE->setOperand(*this, isa<BinaryOperator>(VL0) &&isCommutative(VL0));
8736for (unsignedI : seq<unsigned>(VL0->getNumOperands()))
8737 buildTree_rec(TE->getOperand(I),Depth + 1, {TE, I});
8738return;
8739 }
8740case Instruction::GetElementPtr: {
8741 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8742 ReuseShuffleIndices);
8743LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (GetElementPtrInst).\n";
8744TE->dump());
8745SmallVector<ValueList, 2>Operands(2);
8746// Prepare the operand vector for pointer operands.
8747for (Value *V : VL) {
8748auto *GEP = dyn_cast<GetElementPtrInst>(V);
8749if (!GEP) {
8750Operands.front().push_back(V);
8751continue;
8752 }
8753Operands.front().push_back(GEP->getPointerOperand());
8754 }
8755TE->setOperand(0,Operands.front());
8756     // All indices need to be cast to the same type before vectorization to
8757     // avoid a crash.
8758     // This is also required to find correct matches between different gather
8759     // nodes and to reuse the vectorized values rather than trying to gather
8760     // them again.
8761int IndexIdx = 1;
8762Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8763Type *Ty =all_of(VL,
8764 [VL0Ty, IndexIdx](Value *V) {
8765auto *GEP = dyn_cast<GetElementPtrInst>(V);
8766if (!GEP)
8767returntrue;
8768return VL0Ty ==GEP->getOperand(IndexIdx)->getType();
8769 })
8770 ? VL0Ty
8771 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
8772 ->getPointerOperandType()
8773 ->getScalarType());
8774// Prepare the operand vector.
8775for (Value *V : VL) {
8776auto *I = dyn_cast<GetElementPtrInst>(V);
8777if (!I) {
8778Operands.back().push_back(
8779 ConstantInt::get(Ty, 0,/*isSigned=*/false));
8780continue;
8781 }
8782auto *Op =I->getOperand(IndexIdx);
8783auto *CI = dyn_cast<ConstantInt>(Op);
8784if (!CI)
8785Operands.back().push_back(Op);
8786else
8787Operands.back().push_back(ConstantFoldIntegerCast(
8788 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8789 }
8790TE->setOperand(IndexIdx,Operands.back());
8791
8792for (unsignedI = 0, Ops =Operands.size();I < Ops; ++I)
8793 buildTree_rec(Operands[I],Depth + 1, {TE,I});
8794return;
8795 }
8796case Instruction::Store: {
8797bool Consecutive = CurrentOrder.empty();
8798if (!Consecutive)
8799fixupOrderingIndices(CurrentOrder);
8800 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8801 ReuseShuffleIndices, CurrentOrder);
8802if (Consecutive)
8803LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (StoreInst).\n";
8804TE->dump());
8805else
8806LLVM_DEBUG(
8807dbgs() <<"SLP: added a new TreeEntry (jumbled StoreInst).\n";
8808TE->dump());
8809TE->setOperand(*this);
8810 buildTree_rec(TE->getOperand(0),Depth + 1, {TE, 0});
8811return;
8812 }
8813case Instruction::Call: {
8814// Check if the calls are all to the same vectorizable intrinsic or
8815// library function.
8816CallInst *CI = cast<CallInst>(VL0);
8817Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
8818
8819 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8820 ReuseShuffleIndices);
8821LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (CallInst).\n";
8822TE->dump());
8823TE->setOperand(*this,isCommutative(VL0));
8824for (unsignedI : seq<unsigned>(CI->arg_size())) {
        // For scalar operands there is no need to create a tree entry, since
        // they are not vectorized.
8827if (isVectorIntrinsicWithScalarOpAtArg(ID,I,TTI))
8828continue;
8829 buildTree_rec(TE->getOperand(I),Depth + 1, {TE, I});
8830 }
8831return;
8832 }
8833case Instruction::ShuffleVector: {
8834 TreeEntry *TE = newTreeEntry(VL, Bundle/*vectorized*/, S, UserTreeIdx,
8835 ReuseShuffleIndices);
8836if (S.isAltShuffle()) {
8837LLVM_DEBUG(dbgs() <<"SLP: added a new TreeEntry (isAltShuffle).\n";
8838TE->dump());
8839 }else {
8840assert(SLPReVec &&"Only supported by REVEC.");
8841LLVM_DEBUG(
8842dbgs() <<"SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8843TE->dump());
8844 }
8845
8846// Reorder operands if reordering would enable vectorization.
8847auto *CI = dyn_cast<CmpInst>(VL0);
8848if (CI &&any_of(VL, [](Value *V) {
8849return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8850 })) {
8851auto *MainCI = cast<CmpInst>(S.getMainOp());
8852auto *AltCI = cast<CmpInst>(S.getAltOp());
8853CmpInst::Predicate MainP = MainCI->getPredicate();
8854CmpInst::Predicate AltP = AltCI->getPredicate();
8855assert(MainP != AltP &&
8856"Expected different main/alternate predicates.");
8857ValueListLeft,Right;
8858// Collect operands - commute if it uses the swapped predicate or
8859// alternate operation.
8860for (Value *V : VL) {
8861if (isa<PoisonValue>(V)) {
8862Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
8863Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
8864continue;
8865 }
8866auto *Cmp = cast<CmpInst>(V);
8867Value *LHS =Cmp->getOperand(0);
8868Value *RHS =Cmp->getOperand(1);
8869
8870if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
8871if (AltP ==CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8872std::swap(LHS, RHS);
8873 }else {
8874if (MainP ==CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8875std::swap(LHS, RHS);
8876 }
8877Left.push_back(LHS);
8878Right.push_back(RHS);
8879 }
8880TE->setOperand(0,Left);
8881TE->setOperand(1,Right);
8882 buildTree_rec(Left,Depth + 1, {TE, 0});
8883 buildTree_rec(Right,Depth + 1, {TE, 1});
8884return;
8885 }
8886
8887TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
8888for (unsignedI : seq<unsigned>(VL0->getNumOperands()))
8889 buildTree_rec(TE->getOperand(I),Depth + 1, {TE, I});
8890return;
8891 }
8892default:
8893break;
8894 }
8895llvm_unreachable("Unexpected vectorization of the instructions.");
8896}
8897
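// Example (illustrative): canMapToVector() flattens a homogeneous aggregate to
// a number of elements, e.g. [4 x i32] or { i32, i32, i32, i32 } map to N = 4,
// provided the widened <4 x i32> fits the MinVecRegSize/MaxVecRegSize limits
// and its store size matches that of the original aggregate type.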
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (EltTy->isEmptyTy())
      return 0;
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
8930
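// Example (illustrative): if VL extracts lanes 0, 1, 2, 3 of the same
// <4 x i32> vector in that order, the extracts can be reused directly and the
// function returns true with an empty CurrentOrder; if the lanes appear as
// 0, 2, 1, 3, it returns false and CurrentOrder holds the permutation to apply.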
8931bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
8932SmallVectorImpl<unsigned> &CurrentOrder,
8933bool ResizeAllowed) const{
8934constauto *It =find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8935assert(It != VL.end() &&"Expected at least one extract instruction.");
8936auto *E0 = cast<Instruction>(*It);
8937assert(
8938all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8939"Invalid opcode");
8940// Check if all of the extracts come from the same vector and from the
8941// correct offset.
8942Value *Vec = E0->getOperand(0);
8943
8944 CurrentOrder.clear();
8945
8946// We have to extract from a vector/aggregate with the same number of elements.
8947unsigned NElts;
8948if (E0->getOpcode() == Instruction::ExtractValue) {
8949 NElts =canMapToVector(Vec->getType());
8950if (!NElts)
8951returnfalse;
8952// Check if load can be rewritten as load of vector.
8953LoadInst *LI = dyn_cast<LoadInst>(Vec);
8954if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
8955returnfalse;
8956 }else {
8957 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
8958 }
8959
8960unsigned E = VL.size();
8961if (!ResizeAllowed && NElts != E)
8962returnfalse;
8963SmallVector<int> Indices(E,PoisonMaskElem);
8964unsigned MinIdx = NElts, MaxIdx = 0;
8965for (auto [I, V] :enumerate(VL)) {
8966auto *Inst = dyn_cast<Instruction>(V);
8967if (!Inst)
8968continue;
8969if (Inst->getOperand(0) != Vec)
8970returnfalse;
8971if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
8972if (isa<UndefValue>(EE->getIndexOperand()))
8973continue;
8974 std::optional<unsigned>Idx =getExtractIndex(Inst);
8975if (!Idx)
8976returnfalse;
8977constunsigned ExtIdx = *Idx;
8978if (ExtIdx >= NElts)
8979continue;
8980 Indices[I] = ExtIdx;
8981if (MinIdx > ExtIdx)
8982 MinIdx = ExtIdx;
8983if (MaxIdx < ExtIdx)
8984 MaxIdx = ExtIdx;
8985 }
8986if (MaxIdx - MinIdx + 1 > E)
8987returnfalse;
8988if (MaxIdx + 1 <= E)
8989 MinIdx = 0;
8990
8991// Check that all of the indices extract from the correct offset.
8992bool ShouldKeepOrder =true;
  // Assign to all items the initial value E so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E.
8998 CurrentOrder.assign(E, E);
8999for (unsignedI = 0;I < E; ++I) {
9000if (Indices[I] ==PoisonMaskElem)
9001continue;
9002constunsigned ExtIdx = Indices[I] - MinIdx;
9003if (CurrentOrder[ExtIdx] != E) {
9004 CurrentOrder.clear();
9005returnfalse;
9006 }
9007 ShouldKeepOrder &= ExtIdx ==I;
9008 CurrentOrder[ExtIdx] =I;
9009 }
9010if (ShouldKeepOrder)
9011 CurrentOrder.clear();
9012
9013return ShouldKeepOrder;
9014}
9015
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return ScalarToTreeEntry.contains(U) ||
                  isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
9025
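// Illustrative note: the returned pair is {intrinsic cost, library-call cost};
// when no vector library variant exists (or the call is "nobuiltin"), both
// members are the intrinsic cost. E.g. a <4 x float> llvm.sin bundle might be
// priced more cheaply as a vector math-library call, making the second member
// the smaller of the two.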
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);

  auto Shape = VFShape::get(CI->getFunctionType(),
                            ElementCount::getFixed(VecTy->getNumElements()),
                            false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
  auto LibCost = IntrinsicCost;
  if (!CI->isNoBuiltin() && VecFunc) {
    // Calculate the cost of the vector library call.
    // If the corresponding vector call is cheaper, return its cost.
    LibCost =
        TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
  }
  return {IntrinsicCost, LibCost};
}
9053
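// Worked example (illustrative): for an alternate-opcode node with
// Scalars = {add, sub, add, sub}, no reorder/reuse indices, and IsAltOp
// matching the subs, the resulting Mask is <0, 5, 2, 7>: lane I selects element
// I of the "main" vector (add) or element Sz + I of the "alternate" vector
// (sub).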
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
9088
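// Example (illustrative): with MainOp = icmp slt and AltOp = icmp eq, an
// "icmp slt" (or its operand-swapped "icmp sgt" form) is classified as the main
// operation and returns false, while an "icmp eq" is classified as the
// alternate operation and returns true.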
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    if (isCmpSameOrSwapped(MainCI, CI, TLI))
      return false;
    if (isCmpSameOrSwapped(AltCI, CI, TLI))
      return true;
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    return MainP != P && MainP != SwappedP;
  }
  return I->getOpcode() == AltOp->getOpcode();
}
9113
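// Example (illustrative): for Ops = {i32 8, i32 8, i32 8, i32 8} this returns
// {OK_UniformConstantValue, OP_PowerOf2}; for Ops = {i32 3, i32 5} it returns
// {OK_NonUniformConstantValue, OP_None}.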
TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
  assert(!Ops.empty());
  const auto *Op0 = Ops.front();

  const bool IsConstant = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // TODO: We should allow undef elements here
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;

  TTI::OperandValueProperties VP = TTI::OP_None;
  VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
  VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;

  return {VK, VP};
}
9153
9154namespace{
9155/// The base class for shuffle instruction emission and shuffle cost estimation.
9156classBaseShuffleAnalysis {
9157protected:
9158Type *ScalarTy =nullptr;
9159
9160 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
9161
9162 /// V is expected to be a vectorized value.
9163 /// When REVEC is disabled, there is no difference between VF and
9164 /// VNumElements.
9165 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
9166 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
9167 /// of 8.
9168unsigned getVF(Value *V) const{
9169assert(V &&"V cannot be nullptr");
9170assert(isa<FixedVectorType>(V->getType()) &&
9171"V does not have FixedVectorType");
9172assert(ScalarTy &&"ScalarTy cannot be nullptr");
9173unsigned ScalarTyNumElements =getNumElements(ScalarTy);
9174unsigned VNumElements =
9175 cast<FixedVectorType>(V->getType())->getNumElements();
9176assert(VNumElements > ScalarTyNumElements &&
9177"the number of elements of V is not large enough");
9178assert(VNumElements % ScalarTyNumElements == 0 &&
9179"the number of elements of V is not a vectorized value");
9180return VNumElements / ScalarTyNumElements;
9181 }
9182
9183 /// Checks if the mask is an identity mask.
  /// \param IsStrict if it is true, the function returns false if the mask
  /// size does not match the vector size.
9186staticbool isIdentityMask(ArrayRef<int> Mask,constFixedVectorType *VecTy,
9187bool IsStrict) {
9188int Limit =Mask.size();
9189int VF = VecTy->getNumElements();
9190intIndex = -1;
9191if (VF == Limit &&ShuffleVectorInst::isIdentityMask(Mask, Limit))
9192returntrue;
9193if (!IsStrict) {
9194// Consider extract subvector starting from index 0.
9195if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF,Index) &&
9196Index == 0)
9197returntrue;
9198// All VF-size submasks are identity (e.g.
9199// <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
9200if (Limit % VF == 0 &&all_of(seq<int>(0, Limit / VF), [=](intIdx) {
9201ArrayRef<int> Slice =Mask.slice(Idx * VF, VF);
9202returnall_of(Slice, [](intI) {returnI ==PoisonMaskElem; }) ||
9203ShuffleVectorInst::isIdentityMask(Slice, VF);
9204 }))
9205returntrue;
9206 }
9207returnfalse;
9208 }
9209
  /// Tries to combine 2 different masks into a single one.
  /// \param LocalVF Vector length of the permuted input vector. \p Mask may
  /// change the size of the vector, \p LocalVF is the original size of the
  /// shuffled vector.
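  /// For example (illustrative), with LocalVF = 2, Mask = <1, 0> and
  /// ExtMask = <1, 0, 3, 2>, the combined mask becomes <0, 1, 0, 1>: each
  /// ExtMask element is looked up through Mask and reduced modulo LocalVF.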
9214staticvoid combineMasks(unsigned LocalVF,SmallVectorImpl<int> &Mask,
9215ArrayRef<int> ExtMask) {
9216unsigned VF =Mask.size();
9217SmallVector<int> NewMask(ExtMask.size(),PoisonMaskElem);
9218for (intI = 0, Sz = ExtMask.size();I < Sz; ++I) {
9219if (ExtMask[I] ==PoisonMaskElem)
9220continue;
9221int MaskedIdx =Mask[ExtMask[I] % VF];
9222 NewMask[I] =
9223 MaskedIdx ==PoisonMaskElem ?PoisonMaskElem : MaskedIdx % LocalVF;
9224 }
9225Mask.swap(NewMask);
9226 }
9227
  /// Looks through shuffles trying to reduce the final number of shuffles in
  /// the code. The function looks through the previously emitted shuffle
  /// instructions and properly marks indices in the mask as undef.
9231 /// For example, given the code
9232 /// \code
9233 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
9234 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
9235 /// \endcode
9236 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
9237 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9238 /// <0, 1, 2, 3> for the shuffle.
9239 /// If 2 operands are of different size, the smallest one will be resized and
9240 /// the mask recalculated properly.
9241 /// For example, given the code
9242 /// \code
9243 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
9244 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
9245 /// \endcode
9246 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
9247 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9248 /// <0, 1, 2, 3> for the shuffle.
9249 /// So, it tries to transform permutations to simple vector merge, if
9250 /// possible.
9251 /// \param V The input vector which must be shuffled using the given \p Mask.
9252 /// If the better candidate is found, \p V is set to this best candidate
9253 /// vector.
9254 /// \param Mask The input mask for the shuffle. If the best candidate is found
9255 /// during looking-through-shuffles attempt, it is updated accordingly.
9256 /// \param SinglePermute true if the shuffle operation is originally a
9257 /// single-value-permutation. In this case the look-through-shuffles procedure
9258 /// may look for resizing shuffles as the best candidates.
9259 /// \return true if the shuffle results in the non-resizing identity shuffle
9260 /// (and thus can be ignored), false - otherwise.
9261staticbool peekThroughShuffles(Value *&V,SmallVectorImpl<int> &Mask,
9262bool SinglePermute) {
9263Value *Op =V;
9264ShuffleVectorInst *IdentityOp =nullptr;
9265SmallVector<int> IdentityMask;
9266while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
9267// Exit if not a fixed vector type or changing size shuffle.
9268auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9269if (!SVTy)
9270break;
9271// Remember the identity or broadcast mask, if it is not a resizing
9272// shuffle. If no better candidates are found, this Op and Mask will be
9273// used in the final shuffle.
9274if (isIdentityMask(Mask, SVTy,/*IsStrict=*/false)) {
9275if (!IdentityOp || !SinglePermute ||
9276 (isIdentityMask(Mask, SVTy,/*IsStrict=*/true) &&
9277 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
9278 IdentityMask.size()))) {
9279 IdentityOp = SV;
          // Store the current mask in IdentityMask so that we do not lose this
          // info later if IdentityOp is selected as the best candidate for the
          // permutation.
9283 IdentityMask.assign(Mask);
9284 }
9285 }
9286// Remember the broadcast mask. If no better candidates are found, this Op
9287// and Mask will be used in the final shuffle.
9288// Zero splat can be used as identity too, since it might be used with
9289// mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
      // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
      // is expensive, and the analysis finds out that the source vector is just
      // a broadcast, the original mask can be transformed to the identity mask
      // <0, 1, 2, 3>.
9294// \code
9295// %0 = shuffle %v, poison, zeroinitalizer
9296// %res = shuffle %0, poison, <3, 1, 2, 0>
9297// \endcode
9298// may be transformed to
9299// \code
9300// %0 = shuffle %v, poison, zeroinitalizer
9301// %res = shuffle %0, poison, <0, 1, 2, 3>
9302// \endcode
9303if (SV->isZeroEltSplat()) {
9304 IdentityOp = SV;
9305 IdentityMask.assign(Mask);
9306 }
9307int LocalVF =Mask.size();
9308if (auto *SVOpTy =
9309 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9310 LocalVF = SVOpTy->getNumElements();
9311SmallVector<int> ExtMask(Mask.size(),PoisonMaskElem);
9312for (auto [Idx,I] :enumerate(Mask)) {
9313if (I ==PoisonMaskElem ||
9314static_cast<unsigned>(I) >= SV->getShuffleMask().size())
9315continue;
9316 ExtMask[Idx] = SV->getMaskValue(I);
9317 }
9318bool IsOp1Undef =isUndefVector</*isPoisonOnly=*/true>(
9319 SV->getOperand(0),
9320buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
9321 .all();
9322bool IsOp2Undef =isUndefVector</*isPoisonOnly=*/true>(
9323 SV->getOperand(1),
9324buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
9325 .all();
9326if (!IsOp1Undef && !IsOp2Undef) {
9327// Update mask and mark undef elems.
9328for (int &I : Mask) {
9329if (I ==PoisonMaskElem)
9330continue;
9331if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
9332PoisonMaskElem)
9333I =PoisonMaskElem;
9334 }
9335break;
9336 }
9337SmallVector<int> ShuffleMask(SV->getShuffleMask());
9338 combineMasks(LocalVF, ShuffleMask, Mask);
9339Mask.swap(ShuffleMask);
9340if (IsOp2Undef)
9341Op = SV->getOperand(0);
9342else
9343Op = SV->getOperand(1);
9344 }
9345if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
9346 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9347ShuffleVectorInst::isZeroEltSplatMask(Mask,Mask.size())) {
9348if (IdentityOp) {
9349V = IdentityOp;
9350assert(Mask.size() == IdentityMask.size() &&
9351"Expected masks of same sizes.");
9352// Clear known poison elements.
9353for (auto [I,Idx] :enumerate(Mask))
9354if (Idx ==PoisonMaskElem)
9355 IdentityMask[I] =PoisonMaskElem;
9356Mask.swap(IdentityMask);
9357auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9358return SinglePermute &&
9359 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
9360/*IsStrict=*/true) ||
9361 (Shuffle &&Mask.size() == Shuffle->getShuffleMask().size() &&
9362 Shuffle->isZeroEltSplat() &&
9363ShuffleVectorInst::isZeroEltSplatMask(Mask,Mask.size())));
9364 }
9365V =Op;
9366returnfalse;
9367 }
9368V =Op;
9369returntrue;
9370 }
9371
9372 /// Smart shuffle instruction emission, walks through shuffles trees and
9373 /// tries to find the best matching vector for the actual shuffle
9374 /// instruction.
9375template <typename T,typename ShuffleBuilderTy>
9376staticT createShuffle(Value *V1,Value *V2,ArrayRef<int> Mask,
9377 ShuffleBuilderTy &Builder,Type *ScalarTy) {
9378assert(V1 &&"Expected at least one vector value.");
9379unsigned ScalarTyNumElements =getNumElements(ScalarTy);
9380SmallVector<int> NewMask(Mask);
9381if (ScalarTyNumElements != 1) {
9382assert(SLPReVec &&"FixedVectorType is not expected.");
9383transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewMask);
9384Mask = NewMask;
9385 }
9386if (V2)
9387 Builder.resizeToMatch(V1, V2);
9388int VF =Mask.size();
9389if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
9390 VF = FTy->getNumElements();
9391if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9392 V2,buildUseMask(VF, Mask, UseMask::SecondArg))
9393 .all()) {
9394// Peek through shuffles.
9395Value *Op1 = V1;
9396Value *Op2 =V2;
9397int VF =
9398 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9399SmallVector<int> CombinedMask1(Mask.size(),PoisonMaskElem);
9400SmallVector<int> CombinedMask2(Mask.size(),PoisonMaskElem);
9401for (intI = 0,E =Mask.size();I <E; ++I) {
9402if (Mask[I] < VF)
9403 CombinedMask1[I] =Mask[I];
9404else
9405 CombinedMask2[I] =Mask[I] - VF;
9406 }
9407Value *PrevOp1;
9408Value *PrevOp2;
9409do {
9410 PrevOp1 = Op1;
9411 PrevOp2 = Op2;
9412 (void)peekThroughShuffles(Op1, CombinedMask1,/*SinglePermute=*/false);
9413 (void)peekThroughShuffles(Op2, CombinedMask2,/*SinglePermute=*/false);
9414// Check if we have 2 resizing shuffles - need to peek through operands
9415// again.
9416if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9417if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9418SmallVector<int> ExtMask1(Mask.size(),PoisonMaskElem);
9419for (auto [Idx,I] :enumerate(CombinedMask1)) {
9420if (I ==PoisonMaskElem)
9421continue;
9422 ExtMask1[Idx] = SV1->getMaskValue(I);
9423 }
9424SmallBitVector UseMask1 =buildUseMask(
9425 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9426 ->getNumElements(),
9427 ExtMask1, UseMask::SecondArg);
9428SmallVector<int> ExtMask2(CombinedMask2.size(),PoisonMaskElem);
9429for (auto [Idx,I] :enumerate(CombinedMask2)) {
9430if (I ==PoisonMaskElem)
9431continue;
9432 ExtMask2[Idx] = SV2->getMaskValue(I);
9433 }
9434SmallBitVector UseMask2 =buildUseMask(
9435 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9436 ->getNumElements(),
9437 ExtMask2, UseMask::SecondArg);
9438if (SV1->getOperand(0)->getType() ==
9439 SV2->getOperand(0)->getType() &&
9440 SV1->getOperand(0)->getType() != SV1->getType() &&
9441isUndefVector(SV1->getOperand(1), UseMask1).all() &&
9442isUndefVector(SV2->getOperand(1), UseMask2).all()) {
9443 Op1 = SV1->getOperand(0);
9444 Op2 = SV2->getOperand(0);
9445SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
9446int LocalVF = ShuffleMask1.size();
9447if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
9448 LocalVF = FTy->getNumElements();
9449 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9450 CombinedMask1.swap(ShuffleMask1);
9451SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
9452 LocalVF = ShuffleMask2.size();
9453if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
9454 LocalVF = FTy->getNumElements();
9455 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9456 CombinedMask2.swap(ShuffleMask2);
9457 }
9458 }
9459 }while (PrevOp1 != Op1 || PrevOp2 != Op2);
9460 Builder.resizeToMatch(Op1, Op2);
9461 VF = std::max(cast<VectorType>(Op1->getType())
9462 ->getElementCount()
9463 .getKnownMinValue(),
9464 cast<VectorType>(Op2->getType())
9465 ->getElementCount()
9466 .getKnownMinValue());
9467for (intI = 0,E =Mask.size();I <E; ++I) {
9468if (CombinedMask2[I] !=PoisonMaskElem) {
9469assert(CombinedMask1[I] ==PoisonMaskElem &&
9470"Expected undefined mask element");
9471 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
9472 }
9473 }
9474if (Op1 == Op2 &&
9475 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
9476 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
9477 isa<ShuffleVectorInst>(Op1) &&
9478 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9479ArrayRef(CombinedMask1))))
9480return Builder.createIdentity(Op1);
9481return Builder.createShuffleVector(
9482 Op1, Op1 == Op2 ?PoisonValue::get(Op1->getType()) : Op2,
9483 CombinedMask1);
9484 }
9485if (isa<PoisonValue>(V1))
9486return Builder.createPoison(
9487 cast<VectorType>(V1->getType())->getElementType(),Mask.size());
9488bool IsIdentity = peekThroughShuffles(V1, NewMask,/*SinglePermute=*/true);
9489assert(V1 &&"Expected non-null value after looking through shuffles.");
9490
9491if (!IsIdentity)
9492return Builder.createShuffleVector(V1, NewMask);
9493return Builder.createIdentity(V1);
9494 }
9495
  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned I : seq<unsigned>(CommonMask.size()))
      if (Mask[I] != PoisonMaskElem)
        CommonMask[I] = I;
  }
};
} // namespace
9506
9507/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
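/// Illustrative note: for pointers feeding a consecutive wide load/store the
/// vector side only pays for pointers that must stay scalar (everything else
/// folds into the single wide access), while for scattered accesses the scalar
/// side prices the whole pointer chain and the vector side prices a single
/// representative GEP.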
9508static std::pair<InstructionCost, InstructionCost>
9509getGEPCosts(constTargetTransformInfo &TTI,ArrayRef<Value *> Ptrs,
9510Value *BasePtr,unsigned Opcode,TTI::TargetCostKindCostKind,
9511Type *ScalarTy,VectorType *VecTy) {
9512InstructionCost ScalarCost = 0;
9513InstructionCost VecCost = 0;
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as a plain wide unit-stride load/store since all the
  // loads/stores are known to be from/to adjacent locations.
9519if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9520// Case 2: estimate costs for pointer related costs when vectorizing to
9521// a wide load/store.
9522// Scalar cost is estimated as a set of pointers with known relationship
9523// between them.
9524// For vector code we will use BasePtr as argument for the wide load/store
9525// but we also need to account all the instructions which are going to
9526// stay in vectorized code due to uses outside of these scalar
9527// loads/stores.
9528 ScalarCost =TTI.getPointersChainCost(
9529 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9530CostKind);
9531
9532SmallVector<const Value *> PtrsRetainedInVecCode;
9533for (Value *V : Ptrs) {
9534if (V == BasePtr) {
9535 PtrsRetainedInVecCode.push_back(V);
9536continue;
9537 }
9538auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity, assume Ptr stays in the vectorized code if it's not a
      // GEP instruction. We don't care since its cost is considered free.
9541// TODO: We should check for any uses outside of vectorizable tree
9542// rather than just single use.
9543if (!Ptr || !Ptr->hasOneUse())
9544 PtrsRetainedInVecCode.push_back(V);
9545 }
9546
9547if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
9548// If all pointers stay in vectorized code then we don't have
9549// any savings on that.
9550return std::make_pair(TTI::TCC_Free,TTI::TCC_Free);
9551 }
9552 VecCost =TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
9553 TTI::PointersChainInfo::getKnownStride(),
9554 VecTy,CostKind);
9555 }else {
9556// Case 1: Ptrs are the arguments of loads that we are going to transform
9557// into masked gather load intrinsic.
9558// All the scalar GEPs will be removed as a result of vectorization.
9559// For any external uses of some lanes extract element instructions will
9560// be generated (which cost is estimated separately).
9561TTI::PointersChainInfo PtrsInfo =
9562all_of(Ptrs,
9563 [](constValue *V) {
9564auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9565returnPtr && !Ptr->hasAllConstantIndices();
9566 })
9567 ? TTI::PointersChainInfo::getUnknownStride()
9568 : TTI::PointersChainInfo::getKnownStride();
9569
9570 ScalarCost =
9571TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,CostKind);
9572auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9573if (!BaseGEP) {
9574auto *It =find_if(Ptrs, IsaPred<GEPOperator>);
9575if (It != Ptrs.end())
9576 BaseGEP = cast<GEPOperator>(*It);
9577 }
9578if (BaseGEP) {
9579SmallVector<const Value *> Indices(BaseGEP->indices());
9580 VecCost =TTI.getGEPCost(BaseGEP->getSourceElementType(),
9581 BaseGEP->getPointerOperand(), Indices, VecTy,
9582CostKind);
9583 }
9584 }
9585
9586return std::make_pair(ScalarCost, VecCost);
9587}
9588
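// Illustrative summary: this clusters the scalars of a gather node by
// opcode / load-pointer keys, e.g. a mixed bundle {A0, B0, A1, B1} may become
// {A0, A1, B0, B1} so that the A and B halves can later be matched as
// subvectors; the explicit reorder indices are kept only when the estimated
// shuffle + insert cost beats the plain build-vector estimate.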
9589void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9590assert(TE.isGather() &&TE.ReorderIndices.empty() &&
9591"Expected gather node without reordering.");
9592DenseMap<std::pair<size_t, Value *>,SmallVector<LoadInst *>> LoadsMap;
9593SmallSet<size_t, 2> LoadKeyUsed;
9594
  // Do not reorder the node if it is small (just 2 elements), all-constant, or
  // if all its instructions already have the same opcode.
9597if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
9598all_of(TE.Scalars,isConstant))
9599return;
9600
9601if (any_of(seq<unsigned>(TE.Idx), [&](unsignedIdx) {
9602 return VectorizableTree[Idx]->isSame(TE.Scalars);
9603 }))
9604return;
9605
9606auto GenerateLoadsSubkey = [&](size_tKey,LoadInst *LI) {
9607Key =hash_combine(hash_value(LI->getParent()), Key);
9608Value *Ptr =
9609getUnderlyingObject(LI->getPointerOperand(),RecursionMaxDepth);
9610if (LoadKeyUsed.contains(Key)) {
9611auto LIt = LoadsMap.find(std::make_pair(Key,Ptr));
9612if (LIt != LoadsMap.end()) {
9613for (LoadInst *RLI : LIt->second) {
9614if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
9615 LI->getType(), LI->getPointerOperand(), *DL, *SE,
9616/*StrictCheck=*/true))
9617returnhash_value(RLI->getPointerOperand());
9618 }
9619for (LoadInst *RLI : LIt->second) {
9620if (arePointersCompatible(RLI->getPointerOperand(),
9621 LI->getPointerOperand(), *TLI)) {
9622hash_code SubKey =hash_value(RLI->getPointerOperand());
9623return SubKey;
9624 }
9625 }
9626if (LIt->second.size() > 2) {
9627hash_code SubKey =
9628hash_value(LIt->second.back()->getPointerOperand());
9629return SubKey;
9630 }
9631 }
9632 }
9633 LoadKeyUsed.insert(Key);
9634 LoadsMap.try_emplace(std::make_pair(Key,Ptr)).first->second.push_back(LI);
9635returnhash_value(LI->getPointerOperand());
9636 };
9637MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
9638SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
9639bool IsOrdered =true;
9640unsigned NumInstructions = 0;
9641// Try to "cluster" scalar instructions, to be able to build extra vectorized
9642// nodes.
9643for (auto [I, V] :enumerate(TE.Scalars)) {
9644size_tKey = 1,Idx = 1;
9645if (auto *Inst = dyn_cast<Instruction>(V);
9646 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9647 !isDeleted(Inst) && !isVectorized(V)) {
9648 std::tie(Key,Idx) =generateKeySubkey(V, TLI, GenerateLoadsSubkey,
9649/*AllowAlternate=*/false);
9650 ++NumInstructions;
9651 }
9652auto &Container = SortedValues[Key];
9653if (IsOrdered && !KeyToIndex.contains(V) &&
9654 !(isa<Constant, ExtractElementInst>(V) ||
9655isVectorLikeInstWithConstOps(V)) &&
9656 ((Container.contains(Idx) &&
9657 KeyToIndex.at(Container[Idx].back()).back() !=I - 1) ||
9658 (!Container.empty() && !Container.contains(Idx) &&
9659 KeyToIndex.at(Container.back().second.back()).back() !=I - 1)))
9660 IsOrdered =false;
9661auto &KTI = KeyToIndex[V];
9662if (KTI.empty())
9663 Container[Idx].push_back(V);
9664 KTI.push_back(I);
9665 }
9666SmallVector<std::pair<unsigned, unsigned>> SubVectors;
9667APInt DemandedElts =APInt::getAllOnes(TE.Scalars.size());
9668if (!IsOrdered && NumInstructions > 1) {
9669unsigned Cnt = 0;
9670TE.ReorderIndices.resize(TE.Scalars.size(),TE.Scalars.size());
9671for (constauto &D : SortedValues) {
9672for (constauto &P :D.second) {
9673unsigned Sz = 0;
9674for (Value *V :P.second) {
9675ArrayRef<unsigned> Indices = KeyToIndex.at(V);
9676for (auto [K,Idx] :enumerate(Indices)) {
9677TE.ReorderIndices[Cnt +K] =Idx;
9678TE.Scalars[Cnt +K] =V;
9679 }
9680 Sz += Indices.size();
9681 Cnt += Indices.size();
9682 }
9683if (Sz > 1 && isa<Instruction>(P.second.front())) {
9684constunsigned SubVF =getFloorFullVectorNumberOfElements(
9685 *TTI,TE.Scalars.front()->getType(), Sz);
9686 SubVectors.emplace_back(Cnt - Sz, SubVF);
9687for (unsignedI : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9688 DemandedElts.clearBit(I);
9689 }elseif (!P.second.empty() &&isConstant(P.second.front())) {
9690for (unsignedI : seq<unsigned>(Cnt - Sz, Cnt))
9691 DemandedElts.clearBit(I);
9692 }
9693 }
9694 }
9695 }
9696// Reuses always require shuffles, so consider it as profitable.
9697if (!TE.ReuseShuffleIndices.empty() ||TE.ReorderIndices.empty())
9698return;
9699// Do simple cost estimation.
9700constexprTTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
9701InstructionCostCost = 0;
9702auto *ScalarTy =TE.Scalars.front()->getType();
9703auto *VecTy =getWidenedType(ScalarTy,TE.Scalars.size());
9704for (auto [Idx, Sz] : SubVectors) {
9705Cost +=::getShuffleCost(*TTI,TTI::SK_InsertSubvector, VecTy, {},CostKind,
9706Idx,getWidenedType(ScalarTy, Sz));
9707 }
9708if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9709assert(SLPReVec &&"Only supported by REVEC.");
9710// If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
9711// of CreateInsertElement.
9712unsigned ScalarTyNumElements =getNumElements(ScalarTy);
9713for (unsignedI : seq<unsigned>(TE.Scalars.size()))
9714if (DemandedElts[I])
9715Cost +=
9716TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
9717CostKind,I * ScalarTyNumElements, FTy);
9718 }else {
9719Cost +=TTI->getScalarizationOverhead(VecTy, DemandedElts,/*Insert=*/true,
9720/*Extract=*/false,CostKind);
9721 }
9722int Sz =TE.Scalars.size();
9723SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
9724TE.ReorderIndices.end());
9725for (unsignedI : seq<unsigned>(Sz)) {
9726Value *V =TE.getOrdered(I);
9727if (isa<PoisonValue>(V)) {
9728 ReorderMask[I] =PoisonMaskElem;
9729 }elseif (isConstant(V) || DemandedElts[I]) {
9730 ReorderMask[I] =I +TE.ReorderIndices.size();
9731 }
9732 }
9733Cost +=::getShuffleCost(*TTI,
9734any_of(ReorderMask, [&](intI) {returnI >= Sz; })
9735 ?TTI::SK_PermuteTwoSrc
9736 :TTI::SK_PermuteSingleSrc,
9737 VecTy, ReorderMask);
9738 DemandedElts =APInt::getAllOnes(VecTy->getNumElements());
9739 ReorderMask.assign(Sz,PoisonMaskElem);
9740for (unsignedI : seq<unsigned>(Sz)) {
9741Value *V =TE.getOrdered(I);
9742if (isConstant(V)) {
9743 DemandedElts.clearBit(I);
9744if (!isa<PoisonValue>(V))
9745 ReorderMask[I] =I;
9746 }else {
9747 ReorderMask[I] =I + Sz;
9748 }
9749 }
9750InstructionCost BVCost =TTI->getScalarizationOverhead(
9751 VecTy, DemandedElts,/*Insert=*/true,/*Extract=*/false,CostKind);
9752if (!DemandedElts.isAllOnes())
9753 BVCost +=::getShuffleCost(*TTI,TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
9754if (Cost >= BVCost) {
9755SmallVector<int>Mask(TE.ReorderIndices.begin(),TE.ReorderIndices.end());
9756reorderScalars(TE.Scalars, Mask);
9757TE.ReorderIndices.clear();
9758 }
9759}
9760
9761voidBoUpSLP::transformNodes() {
9762constexprTTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
9763 BaseGraphSize = VectorizableTree.size();
9764// Turn graph transforming mode on and off, when done.
9765classGraphTransformModeRAAI {
9766bool &SavedIsGraphTransformMode;
9767
9768public:
9769 GraphTransformModeRAAI(bool &IsGraphTransformMode)
9770 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9771 IsGraphTransformMode =true;
9772 }
9773 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode =false; }
9774 } TransformContext(IsGraphTransformMode);
9775// Operands are profitable if they are:
9776// 1. At least one constant
9777// or
9778// 2. Splats
9779// or
9780// 3. Results in good vectorization opportunity, i.e. may generate vector
9781// nodes and reduce cost of the graph.
9782auto CheckOperandsProfitability = [this](Instruction *I1,Instruction *I2,
9783const InstructionsState &S) {
9784SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
9785for (unsignedOp : seq<unsigned>(S.getMainOp()->getNumOperands()))
9786 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9787 I2->getOperand(Op));
9788returnall_of(
9789 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9790returnall_of(Cand,
9791 [](const std::pair<Value *, Value *> &P) {
9792return isa<Constant>(P.first) ||
9793 isa<Constant>(P.second) ||P.first ==P.second;
9794 }) ||
9795findBestRootPair(Cand,LookAheadHeuristics::ScoreSplatLoads);
9796 });
9797 };
9798
9799// Try to reorder gather nodes for better vectorization opportunities.
9800for (unsignedIdx : seq<unsigned>(BaseGraphSize)) {
9801 TreeEntry &E = *VectorizableTree[Idx];
9802if (E.isGather())
9803 reorderGatherNode(E);
9804 }
9805
9806// The tree may grow here, so iterate over nodes, built before.
9807for (unsignedIdx : seq<unsigned>(BaseGraphSize)) {
9808 TreeEntry &E = *VectorizableTree[Idx];
9809if (E.isGather()) {
9810ArrayRef<Value *> VL = E.Scalars;
9811constunsigned Sz =getVectorElementSize(VL.front());
9812unsigned MinVF =getMinVF(2 * Sz);
9813// Do not try partial vectorization for small nodes (<= 2), nodes with the
9814// same opcode and same parent block or all constants.
9815if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
9816 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
9817 E.isAltShuffle() || !allSameBlock(VL)) ||
9818allConstant(VL) ||isSplat(VL))
9819continue;
9820// Try to find vectorizable sequences and transform them into a series of
9821// insertvector instructions.
9822unsigned StartIdx = 0;
9823unsignedEnd = VL.size();
9824for (unsigned VF =getFloorFullVectorNumberOfElements(
9825 *TTI, VL.front()->getType(), VL.size() - 1);
9826 VF >= MinVF; VF =getFloorFullVectorNumberOfElements(
9827 *TTI, VL.front()->getType(), VF - 1)) {
9828if (StartIdx + VF >End)
9829continue;
9830SmallVector<std::pair<unsigned, unsigned>> Slices;
9831for (unsigned Cnt = StartIdx; Cnt + VF <=End; Cnt += VF) {
9832ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9833// If any instruction is vectorized already - do not try again.
9834// Reuse the existing node, if it fully matches the slice.
9835if (const TreeEntry *SE = getTreeEntry(Slice.front());
9836 SE || getTreeEntry(Slice.back())) {
9837if (!SE)
9838continue;
9839if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9840continue;
9841 }
9842// Constant already handled effectively - skip.
9843if (allConstant(Slice))
9844continue;
9845// Do not try to vectorize small splats (less than vector register and
9846// only with the single non-undef element).
9847bool IsSplat =isSplat(Slice);
9848if (Slices.empty() || !IsSplat ||
9849 (VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9850 Slice.front()->getType(), VF)),
9851 1U, VF - 1) !=
9852 std::clamp(TTI->getNumberOfParts(getWidenedType(
9853 Slice.front()->getType(), 2 * VF)),
9854 1U, 2 * VF)) ||
9855count(Slice, Slice.front()) ==
9856static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
9857 : 1)) {
9858if (IsSplat)
9859continue;
9860 InstructionsState S =getSameOpcode(Slice, *TLI);
9861if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
9862 (S.getOpcode() == Instruction::Load &&
9863areKnownNonVectorizableLoads(Slice)) ||
9864 (S.getOpcode() != Instruction::Load &&
9865 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
9866continue;
9867if (VF == 2) {
9868// Try to vectorize reduced values or if all users are vectorized.
9869// For expensive instructions extra extracts might be profitable.
9870if ((!UserIgnoreList || E.Idx != 0) &&
9871TTI->getInstructionCost(S.getMainOp(),CostKind) <
9872TTI::TCC_Expensive &&
9873 !all_of(Slice, [&](Value *V) {
9874if (isa<PoisonValue>(V))
9875returntrue;
9876return areAllUsersVectorized(cast<Instruction>(V),
9877 UserIgnoreList);
9878 }))
9879continue;
9880if (S.getOpcode() == Instruction::Load) {
9881OrdersType Order;
9882SmallVector<Value *> PointerOps;
9883LoadsState Res =
9884canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9885// Do not vectorize gathers.
9886if (Res ==LoadsState::ScatterVectorize ||
9887 Res ==LoadsState::Gather) {
9888if (Res ==LoadsState::Gather) {
9889registerNonVectorizableLoads(Slice);
9890// If reductions and the scalars from the root node are
9891// analyzed - mark as non-vectorizable reduction.
9892if (UserIgnoreList && E.Idx == 0)
9893analyzedReductionVals(Slice);
9894 }
9895continue;
9896 }
9897 }elseif (S.getOpcode() == Instruction::ExtractElement ||
9898 (TTI->getInstructionCost(S.getMainOp(),CostKind) <
9899TTI::TCC_Expensive &&
9900 !CheckOperandsProfitability(
9901 S.getMainOp(),
9902 cast<Instruction>(*find_if(reverse(Slice),
9903 IsaPred<Instruction>)),
9904 S))) {
              // Do not vectorize extractelements (handled effectively
              // already). Do not vectorize non-profitable instructions (with
              // low cost and non-vectorizable operands).
9908continue;
9909 }
9910 }
9911 }
9912 Slices.emplace_back(Cnt, Slice.size());
9913 }
9914auto AddCombinedNode = [&](unsignedIdx,unsigned Cnt,unsigned Sz) {
9915 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9916if (StartIdx == Cnt)
9917 StartIdx = Cnt + Sz;
9918if (End == Cnt + Sz)
9919End = Cnt;
9920 };
9921for (auto [Cnt, Sz] : Slices) {
9922ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
9923// If any instruction is vectorized already - do not try again.
9924if (TreeEntry *SE = getTreeEntry(Slice.front());
9925 SE || getTreeEntry(Slice.back())) {
9926if (!SE)
9927continue;
9928if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9929continue;
9930 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9931 AddCombinedNode(SE->Idx, Cnt, Sz);
9932continue;
9933 }
9934unsigned PrevSize = VectorizableTree.size();
9935 [[maybe_unused]]unsigned PrevEntriesSize =
9936 LoadEntriesToVectorize.size();
9937 buildTree_rec(Slice, 0,EdgeInfo(&E, UINT_MAX));
9938if (PrevSize + 1 == VectorizableTree.size() &&
9939 VectorizableTree[PrevSize]->isGather() &&
9940 VectorizableTree[PrevSize]->hasState() &&
9941 VectorizableTree[PrevSize]->getOpcode() !=
9942 Instruction::ExtractElement &&
9943 !isSplat(Slice)) {
9944if (UserIgnoreList && E.Idx == 0 && VF == 2)
9945analyzedReductionVals(Slice);
9946 VectorizableTree.pop_back();
9947assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9948"LoadEntriesToVectorize expected to remain the same");
9949continue;
9950 }
9951 AddCombinedNode(PrevSize, Cnt, Sz);
9952 }
9953 }
9954// Restore ordering, if no extra vectorization happened.
9955if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9956SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9957reorderScalars(E.Scalars, Mask);
9958 E.ReorderIndices.clear();
9959 }
9960 }
9961if (!E.hasState())
9962continue;
9963switch (E.getOpcode()) {
9964case Instruction::Load: {
9965// No need to reorder masked gather loads, just reorder the scalar
9966// operands.
9967if (E.State != TreeEntry::Vectorize)
9968break;
9969Type *ScalarTy = E.getMainOp()->getType();
9970auto *VecTy =getWidenedType(ScalarTy, E.Scalars.size());
9971Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9972// Check if profitable to represent consecutive load + reverse as strided
9973// load with stride -1.
9974if (!E.ReorderIndices.empty() &&isReverseOrder(E.ReorderIndices) &&
9975TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9976SmallVector<int> Mask;
9977inversePermutation(E.ReorderIndices, Mask);
9978auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9979InstructionCost OriginalVecCost =
9980TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
9981 BaseLI->getPointerAddressSpace(),CostKind,
9982TTI::OperandValueInfo()) +
9983::getShuffleCost(*TTI,TTI::SK_Reverse, VecTy, Mask,CostKind);
9984InstructionCost StridedCost =TTI->getStridedMemoryOpCost(
9985 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9986/*VariableMask=*/false, CommonAlignment,CostKind, BaseLI);
9987if (StridedCost < OriginalVecCost)
9988// Strided load is more profitable than consecutive load + reverse -
9989// transform the node to strided load.
9990 E.State = TreeEntry::StridedVectorize;
9991 }
9992break;
9993 }
9994case Instruction::Store: {
9995Type *ScalarTy =
9996 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9997auto *VecTy =getWidenedType(ScalarTy, E.Scalars.size());
9998Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as strided
      // store with stride -1.
10001if (!E.ReorderIndices.empty() &&isReverseOrder(E.ReorderIndices) &&
10002TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
10003SmallVector<int> Mask;
10004inversePermutation(E.ReorderIndices, Mask);
10005auto *BaseSI = cast<StoreInst>(E.Scalars.back());
10006InstructionCost OriginalVecCost =
10007TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
10008 BaseSI->getPointerAddressSpace(),CostKind,
10009TTI::OperandValueInfo()) +
10010::getShuffleCost(*TTI,TTI::SK_Reverse, VecTy, Mask,CostKind);
10011InstructionCost StridedCost =TTI->getStridedMemoryOpCost(
10012 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
10013/*VariableMask=*/false, CommonAlignment,CostKind, BaseSI);
10014if (StridedCost < OriginalVecCost)
10015// Strided store is more profitable than reverse + consecutive store -
10016// transform the node to strided store.
10017 E.State = TreeEntry::StridedVectorize;
10018 }elseif (!E.ReorderIndices.empty()) {
10019// Check for interleaved stores.
10020auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
10021auto *BaseSI = cast<StoreInst>(E.Scalars.front());
10022assert(Mask.size() > 1 &&"Expected mask greater than 1 element.");
10023if (Mask.size() < 4)
10024return 0u;
10025for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
10026if (ShuffleVectorInst::isInterleaveMask(
10027 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
10028TTI.isLegalInterleavedAccessType(
10029 VecTy, Factor, BaseSI->getAlign(),
10030 BaseSI->getPointerAddressSpace()))
10031return Factor;
10032 }
10033
10034return 0u;
10035 };
10036SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
10037unsigned InterleaveFactor = IsInterleaveMask(Mask);
10038if (InterleaveFactor != 0)
10039 E.setInterleave(InterleaveFactor);
10040 }
10041break;
10042 }
10043case Instruction::Select: {
10044if (E.State != TreeEntry::Vectorize)
10045break;
10046auto [MinMaxID, SelectOnly] =canConvertToMinOrMaxIntrinsic(E.Scalars);
10047if (MinMaxID ==Intrinsic::not_intrinsic)
10048break;
10049// This node is a minmax node.
10050 E.CombinedOp = TreeEntry::MinMax;
10051 TreeEntry *CondEntry =const_cast<TreeEntry *>(getOperandEntry(&E, 0));
10052if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
10053 CondEntry->State == TreeEntry::Vectorize) {
10054// The condition node is part of the combined minmax node.
10055 CondEntry->State = TreeEntry::CombinedVectorize;
10056 }
10057break;
10058 }
10059default:
10060break;
10061 }
10062 }
10063
10064if (LoadEntriesToVectorize.empty()) {
10065// Single load node - exit.
10066if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
10067 VectorizableTree.front()->getOpcode() == Instruction::Load)
10068return;
10069// Small graph with small VF - exit.
10070constexprunsigned SmallTree = 3;
10071constexprunsigned SmallVF = 2;
10072if ((VectorizableTree.size() <= SmallTree &&
10073 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10074 (VectorizableTree.size() <= 2 && UserIgnoreList))
10075return;
10076
10077if (VectorizableTree.front()->isNonPowOf2Vec() &&
10078getCanonicalGraphSize() !=getTreeSize() && UserIgnoreList &&
10079getCanonicalGraphSize() <= SmallTree &&
10080count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
10081 [](const std::unique_ptr<TreeEntry> &TE) {
10082return TE->isGather() && TE->hasState() &&
10083 TE->getOpcode() == Instruction::Load &&
10084 !allSameBlock(TE->Scalars);
10085 }) == 1)
10086return;
10087 }
10088
10089// A list of loads to be gathered during the vectorization process. We can
10090// try to vectorize them at the end, if profitable.
10091SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
10092SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
10093 GatheredLoads;
10094
10095for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10096 TreeEntry &E = *TE;
10097if (E.isGather() &&
10098 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
10099 (!E.hasState() &&any_of(E.Scalars,
10100 [&](Value *V) {
10101 return isa<LoadInst>(V) &&
10102 !isVectorized(V) &&
10103 !isDeleted(cast<Instruction>(V));
10104 }))) &&
10105 !isSplat(E.Scalars)) {
10106for (Value *V : E.Scalars) {
10107auto *LI = dyn_cast<LoadInst>(V);
10108if (!LI)
10109continue;
10110if (isDeleted(LI) ||isVectorized(LI) || !LI->isSimple())
10111continue;
10112gatherPossiblyVectorizableLoads(
10113 *this, V, *DL, *SE, *TTI,
10114 GatheredLoads[std::make_tuple(
10115 LI->getParent(),
10116getUnderlyingObject(LI->getPointerOperand(),RecursionMaxDepth),
10117 LI->getType())]);
10118 }
10119 }
10120 }
10121// Try to vectorize gathered loads if this is not just a gather of loads.
10122if (!GatheredLoads.empty())
10123 tryToVectorizeGatheredLoads(GatheredLoads);
10124}
10125
10126/// Merges shuffle masks and emits final shuffle instruction, if required. It
10127/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
10128/// when the actual shuffle instruction is generated only if this is actually
10129/// required. Otherwise, the shuffle instruction emission is delayed till the
10130/// end of the process, to reduce the number of emitted instructions and further
10131/// analysis/transformations.
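/// Illustrative note: CommonMask accumulates the pending permutation and
/// InVectors tracks (at most two) pending sources; the combined shuffle is only
/// priced once a result is finally requested, so shuffles that later fold into
/// an identity need not be charged.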
10132classBoUpSLP::ShuffleCostEstimator :public BaseShuffleAnalysis {
10133bool IsFinalized =false;
10134SmallVector<int> CommonMask;
10135SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
10136constTargetTransformInfo &TTI;
10137InstructionCostCost = 0;
10138SmallDenseSet<Value *> VectorizedVals;
10139BoUpSLP &R;
10140SmallPtrSetImpl<Value *> &CheckedExtracts;
10141constexprstaticTTI::TargetCostKind CostKind =TTI::TCK_RecipThroughput;
10142 /// While set, still trying to estimate the cost for the same nodes and we
10143 /// can delay actual cost estimation (virtual shuffle instruction emission).
10144 /// May help better estimate the cost if same nodes must be permuted + allows
10145 /// to move most of the long shuffles cost estimation to TTI.
10146bool SameNodesEstimated =true;
10147
10148staticConstant *getAllOnesValue(constDataLayout &DL,Type *Ty) {
10149if (Ty->getScalarType()->isPointerTy()) {
10150Constant *Res =ConstantExpr::getIntToPtr(
10151ConstantInt::getAllOnesValue(
10152IntegerType::get(Ty->getContext(),
10153DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
10154 Ty->getScalarType());
10155if (auto *VTy = dyn_cast<VectorType>(Ty))
10156 Res =ConstantVector::getSplat(VTy->getElementCount(), Res);
10157return Res;
10158 }
10159returnConstant::getAllOnesValue(Ty);
10160 }
10161
10162InstructionCost getBuildVectorCost(ArrayRef<Value *> VL,Value *Root) {
10163if ((!Root &&allConstant(VL)) ||all_of(VL, IsaPred<UndefValue>))
10164returnTTI::TCC_Free;
10165auto *VecTy =getWidenedType(ScalarTy, VL.size());
10166InstructionCost GatherCost = 0;
10167SmallVector<Value *> Gathers(VL);
10168if (!Root &&isSplat(VL)) {
10169// Found the broadcasting of the single scalar, calculate the cost as
10170// the broadcast.
10171constauto *It =find_if_not(VL, IsaPred<UndefValue>);
10172assert(It != VL.end() &&"Expected at least one non-undef value.");
10173// Add broadcast for non-identity shuffle only.
10174bool NeedShuffle =
10175count(VL, *It) > 1 &&
10176 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
10177if (!NeedShuffle) {
10178if (isa<FixedVectorType>(ScalarTy)) {
10179assert(SLPReVec &&"FixedVectorType is not expected.");
10180returnTTI.getShuffleCost(
10181TTI::SK_InsertSubvector, VecTy, {}, CostKind,
10182 std::distance(VL.begin(), It) *getNumElements(ScalarTy),
10183 cast<FixedVectorType>(ScalarTy));
10184 }
10185returnTTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
10186 CostKind, std::distance(VL.begin(), It),
10187PoisonValue::get(VecTy), *It);
10188 }
10189
10190SmallVector<int> ShuffleMask(VL.size(),PoisonMaskElem);
10191transform(VL, ShuffleMask.begin(), [](Value *V) {
10192 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10193 });
10194InstructionCost InsertCost =
10195TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
10196PoisonValue::get(VecTy), *It);
10197return InsertCost +::getShuffleCost(TTI,
10198TargetTransformInfo::SK_Broadcast,
10199 VecTy, ShuffleMask, CostKind,
10200/*Index=*/0,/*SubTp=*/nullptr,
10201/*Args=*/*It);
10202 }
10203return GatherCost +
10204 (all_of(Gathers, IsaPred<UndefValue>)
10205 ?TTI::TCC_Free
10206 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
10207 ScalarTy));
10208 };
10209
10210 /// Compute the cost of creating a vector containing the extracted values from
10211 /// \p VL.
10212InstructionCost
10213 computeExtractCost(ArrayRef<Value *> VL,ArrayRef<int> Mask,
10214ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10215unsigned NumParts) {
10216assert(VL.size() > NumParts &&"Unexpected scalarized shuffle.");
10217unsigned NumElts =
10218 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz,Value *V) {
10219 auto *EE = dyn_cast<ExtractElementInst>(V);
10220 if (!EE)
10221 return Sz;
10222 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10223 if (!VecTy)
10224 return Sz;
10225 return std::max(Sz, VecTy->getNumElements());
10226 });
10227// FIXME: this must be moved to TTI for better estimation.
10228unsigned EltsPerVector =getPartNumElems(VL.size(), NumParts);
10229auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
10230SmallVectorImpl<unsigned> &Indices)
10231 -> std::optional<TTI::ShuffleKind> {
10232if (NumElts <= EltsPerVector)
10233return std::nullopt;
10234int OffsetReg0 =
10235alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10236 [](int S,intI) {
10237 if (I == PoisonMaskElem)
10238 return S;
10239 return std::min(S, I);
10240 }),
10241 EltsPerVector);
10242int OffsetReg1 = OffsetReg0;
10243DenseSet<int> RegIndices;
      // Check if we are trying to permute the same single or 2 input vectors.
10245TTI::ShuffleKind ShuffleKind =TTI::SK_PermuteSingleSrc;
10246int FirstRegId = -1;
10247 Indices.assign(1, OffsetReg0);
10248for (auto [Pos,I] :enumerate(Mask)) {
10249if (I ==PoisonMaskElem)
10250continue;
10251intIdx =I - OffsetReg0;
10252int RegId =
10253 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
10254if (FirstRegId < 0)
10255 FirstRegId = RegId;
10256 RegIndices.insert(RegId);
10257if (RegIndices.size() > 2)
10258return std::nullopt;
10259if (RegIndices.size() == 2) {
10260 ShuffleKind =TTI::SK_PermuteTwoSrc;
10261if (Indices.size() == 1) {
10262 OffsetReg1 =alignDown(
10263 std::accumulate(
10264 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10265 [&](int S,intI) {
10266 if (I == PoisonMaskElem)
10267 return S;
10268 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10269 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10270 if (RegId == FirstRegId)
10271 return S;
10272 return std::min(S, I);
10273 }),
10274 EltsPerVector);
10275 Indices.push_back(OffsetReg1 % NumElts);
10276 }
10277Idx =I - OffsetReg1;
10278 }
10279I = (Idx % NumElts) % EltsPerVector +
10280 (RegId == FirstRegId ? 0 : EltsPerVector);
10281 }
10282return ShuffleKind;
10283 };
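// Illustration of CheckPerRegistersShuffle: with EltsPerVector = 4, the
// sub-mask {8, 9, PoisonMaskElem, 10} only references register 2 of the
// source, so it is rewritten to {0, 1, PoisonMaskElem, 2}, Indices gets the
// register offset 8, and SK_PermuteSingleSrc is returned. std::nullopt means
// either everything already fits into one register or more than two registers
// are referenced.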
10284InstructionCostCost = 0;
10285
10286// Process extracts in blocks of EltsPerVector to check if the source vector
10287// operand can be re-used directly. If not, add the cost of creating a
10288// shuffle to extract the values into a vector register.
10289for (unsigned Part : seq<unsigned>(NumParts)) {
10290if (!ShuffleKinds[Part])
10291continue;
10292ArrayRef<int> MaskSlice = Mask.slice(
10293 Part * EltsPerVector,getNumElems(Mask.size(), EltsPerVector, Part));
10294SmallVector<int> SubMask(EltsPerVector,PoisonMaskElem);
10295copy(MaskSlice, SubMask.begin());
10296SmallVector<unsigned, 2> Indices;
10297 std::optional<TTI::ShuffleKind> RegShuffleKind =
10298 CheckPerRegistersShuffle(SubMask, Indices);
10299if (!RegShuffleKind) {
10300if (*ShuffleKinds[Part] !=TTI::SK_PermuteSingleSrc ||
10301 !ShuffleVectorInst::isIdentityMask(
10302 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
10303Cost +=
10304::getShuffleCost(TTI, *ShuffleKinds[Part],
10305getWidenedType(ScalarTy, NumElts), MaskSlice);
10306continue;
10307 }
10308if (*RegShuffleKind !=TTI::SK_PermuteSingleSrc ||
10309 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
10310Cost +=
10311::getShuffleCost(TTI, *RegShuffleKind,
10312getWidenedType(ScalarTy, EltsPerVector), SubMask);
10313 }
10314constunsigned BaseVF =getFullVectorNumberOfElements(
10315 *R.TTI, VL.front()->getType(),alignTo(NumElts, EltsPerVector));
10316for (unsignedIdx : Indices) {
10317assert((Idx + EltsPerVector) <= BaseVF &&
10318"SK_ExtractSubvector index out of range");
10319Cost +=::getShuffleCost(TTI,TTI::SK_ExtractSubvector,
10320getWidenedType(ScalarTy, BaseVF), {}, CostKind,
10321Idx,getWidenedType(ScalarTy, EltsPerVector));
10322 }
10323// Second attempt: check whether a single permute of the whole slice is
10324// estimated to be cheaper than the subvector extracts.
10325 SubMask.assign(NumElts,PoisonMaskElem);
10326copy(MaskSlice, SubMask.begin());
10327InstructionCost OriginalCost =::getShuffleCost(
10328TTI, *ShuffleKinds[Part],getWidenedType(ScalarTy, NumElts), SubMask);
10329if (OriginalCost <Cost)
10330Cost = OriginalCost;
10331 }
10332returnCost;
10333 }
10334 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
10335 /// mask \p Mask, register number \p Part, that includes \p SliceSize
10336 /// elements.
10337void estimateNodesPermuteCost(const TreeEntry &E1,const TreeEntry *E2,
10338ArrayRef<int> Mask,unsigned Part,
10339unsigned SliceSize) {
10340if (SameNodesEstimated) {
10341// Delay the cost estimation if the same nodes are being reshuffled.
10342// If the cost of reshuffling E1 and E2 was already requested before, there is
10343// no need to estimate another cost with the sub-Mask; instead, include this
10344// sub-Mask into the CommonMask so it is estimated later, avoiding double cost
10345// estimation.
10346if ((InVectors.size() == 2 &&
10347 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
10348 cast<const TreeEntry *>(InVectors.back()) == E2) ||
10349 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
10350unsigned Limit =getNumElems(Mask.size(), SliceSize, Part);
10351assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
10352 [](intIdx) {returnIdx ==PoisonMaskElem; }) &&
10353"Expected all poisoned elements.");
10354ArrayRef<int> SubMask =ArrayRef(Mask).slice(Part * SliceSize, Limit);
10355copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
10356return;
10357 }
10358// Found non-matching nodes - need to estimate the cost for the nodes matched
10359// so far and transform the mask.
10360Cost += createShuffle(InVectors.front(),
10361 InVectors.size() == 1 ?nullptr : InVectors.back(),
10362 CommonMask);
10363 transformMaskAfterShuffle(CommonMask, CommonMask);
10364 }elseif (InVectors.size() == 2) {
10365Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10366 transformMaskAfterShuffle(CommonMask, CommonMask);
10367 }
10368 SameNodesEstimated =false;
10369if (!E2 && InVectors.size() == 1) {
10370unsigned VF = E1.getVectorFactor();
10371if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
10372 VF = std::max(VF,
10373 cast<FixedVectorType>(V1->getType())->getNumElements());
10374 }else {
10375constauto *E = cast<const TreeEntry *>(InVectors.front());
10376 VF = std::max(VF, E->getVectorFactor());
10377 }
10378for (unsignedIdx = 0, Sz = CommonMask.size();Idx < Sz; ++Idx)
10379if (Mask[Idx] !=PoisonMaskElem && CommonMask[Idx] ==PoisonMaskElem)
10380 CommonMask[Idx] = Mask[Idx] + VF;
10381Cost += createShuffle(InVectors.front(), &E1, CommonMask);
10382 transformMaskAfterShuffle(CommonMask, CommonMask);
10383 }else {
10384autoP = InVectors.front();
10385Cost += createShuffle(&E1, E2, Mask);
10386unsigned VF = Mask.size();
10387if (Value *V1 =P.dyn_cast<Value *>()) {
10388 VF = std::max(VF,
10389getNumElements(V1->getType()));
10390 }else {
10391constauto *E = cast<const TreeEntry *>(P);
10392 VF = std::max(VF, E->getVectorFactor());
10393 }
10394for (unsignedIdx = 0, Sz = CommonMask.size();Idx < Sz; ++Idx)
10395if (Mask[Idx] !=PoisonMaskElem)
10396 CommonMask[Idx] =Idx + (InVectors.empty() ? 0 : VF);
10397Cost += createShuffle(P, InVectors.front(), CommonMask);
10398 transformMaskAfterShuffle(CommonMask, CommonMask);
10399 }
10400 }
10401
10402classShuffleCostBuilder {
10403constTargetTransformInfo &TTI;
10404
10405staticbool isEmptyOrIdentity(ArrayRef<int> Mask,unsigned VF) {
10406int Index = -1;
10407return Mask.empty() ||
10408 (VF == Mask.size() &&
10409ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
10410 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
10411 Index == 0);
10412 }
10413
10414public:
10415 ShuffleCostBuilder(constTargetTransformInfo &TTI) :TTI(TTI) {}
10416 ~ShuffleCostBuilder() =default;
10417InstructionCost createShuffleVector(Value *V1,Value *,
10418ArrayRef<int> Mask) const{
10419// Empty mask or identity mask are free.
10420unsigned VF =
10421 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10422if (isEmptyOrIdentity(Mask, VF))
10423returnTTI::TCC_Free;
10424 return ::getShuffleCost(TTI,TTI::SK_PermuteTwoSrc,
10425 cast<VectorType>(V1->getType()), Mask);
10426 }
10427InstructionCost createShuffleVector(Value *V1,ArrayRef<int> Mask) const{
10428// Empty mask or identity mask are free.
10429unsigned VF =
10430 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10431if (isEmptyOrIdentity(Mask, VF))
10432returnTTI::TCC_Free;
10433 return ::getShuffleCost(TTI,TTI::SK_PermuteSingleSrc,
10434 cast<VectorType>(V1->getType()), Mask);
10435 }
10436InstructionCost createIdentity(Value *) const{returnTTI::TCC_Free; }
10437InstructionCost createPoison(Type *Ty,unsigned VF) const{
10438returnTTI::TCC_Free;
10439 }
10440void resizeToMatch(Value *&,Value *&) const{}
10441 };
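// The builder above is used by createShuffle below to price shuffle trees:
// empty masks, identity masks and extract-subvector-at-index-0 masks are
// treated as free, anything else is charged as SK_PermuteSingleSrc or
// SK_PermuteTwoSrc. E.g. for VF = 4 the mask {0, 1, 2, 3} is free, while
// {1, 0, 3, 2} is charged as a single-source permute.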
10442
10443 /// Smart shuffle instruction emission, walks through shuffles trees and
10444 /// tries to find the best matching vector for the actual shuffle
10445 /// instruction.
10446InstructionCost
10447 createShuffle(constPointerUnion<Value *, const TreeEntry *> &P1,
10448constPointerUnion<Value *, const TreeEntry *> &P2,
10449ArrayRef<int> Mask) {
10450 ShuffleCostBuilder Builder(TTI);
10451SmallVector<int> CommonMask(Mask);
10452Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
10453unsigned CommonVF = Mask.size();
10454InstructionCost ExtraCost = 0;
10455auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
10456unsigned VF) ->InstructionCost {
10457if (E.isGather() &&allConstant(E.Scalars))
10458returnTTI::TCC_Free;
10459Type *EScalarTy = E.Scalars.front()->getType();
10460bool IsSigned =true;
10461if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10462 EScalarTy =IntegerType::get(EScalarTy->getContext(), It->second.first);
10463 IsSigned = It->second.second;
10464 }
10465if (EScalarTy != ScalarTy) {
10466unsigned CastOpcode = Instruction::Trunc;
10467unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10468unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10469if (DstSz > SrcSz)
10470 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10471returnTTI.getCastInstrCost(CastOpcode,getWidenedType(ScalarTy, VF),
10472getWidenedType(EScalarTy, VF),
10473TTI::CastContextHint::None, CostKind);
10474 }
10475returnTTI::TCC_Free;
10476 };
10477auto GetValueMinBWAffectedCost = [&](constValue *V) ->InstructionCost {
10478if (isa<Constant>(V))
10479returnTTI::TCC_Free;
10480auto *VecTy = cast<VectorType>(V->getType());
10481Type *EScalarTy = VecTy->getElementType();
10482if (EScalarTy != ScalarTy) {
10483bool IsSigned = !isKnownNonNegative(V,SimplifyQuery(*R.DL));
10484unsigned CastOpcode = Instruction::Trunc;
10485unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10486unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10487if (DstSz > SrcSz)
10488 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10489returnTTI.getCastInstrCost(
10490 CastOpcode,VectorType::get(ScalarTy, VecTy->getElementCount()),
10491 VecTy,TTI::CastContextHint::None, CostKind);
10492 }
10493returnTTI::TCC_Free;
10494 };
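// Both lambdas above account for the cast that becomes necessary when a node
// (or vector value) was analyzed with a narrower MinBWs type than ScalarTy.
// For example, if ScalarTy is i32 but a 16-bit minimum width was recorded for
// the entry, the s/zext from <VF x i16> to <VF x i32> is added to ExtraCost
// (or a trunc in the opposite direction).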
10495if (!V1 && !V2 && !P2.isNull()) {
10496// Shuffle 2 entry nodes.
10497const TreeEntry *E = cast<const TreeEntry *>(P1);
10498unsigned VF = E->getVectorFactor();
10499const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10500 CommonVF = std::max(VF, E2->getVectorFactor());
10501assert(all_of(Mask,
10502 [=](intIdx) {
10503return Idx < 2 * static_cast<int>(CommonVF);
10504 }) &&
10505"All elements in mask must be less than 2 * CommonVF.");
10506if (E->Scalars.size() == E2->Scalars.size()) {
10507SmallVector<int> EMask = E->getCommonMask();
10508SmallVector<int> E2Mask = E2->getCommonMask();
10509if (!EMask.empty() || !E2Mask.empty()) {
10510for (int &Idx : CommonMask) {
10511if (Idx ==PoisonMaskElem)
10512continue;
10513if (Idx <static_cast<int>(CommonVF) && !EMask.empty())
10514Idx = EMask[Idx];
10515elseif (Idx >=static_cast<int>(CommonVF))
10516Idx = (E2Mask.empty() ?Idx - CommonVF : E2Mask[Idx - CommonVF]) +
10517 E->Scalars.size();
10518 }
10519 }
10520 CommonVF = E->Scalars.size();
10521 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10522 GetNodeMinBWAffectedCost(*E2, CommonVF);
10523 }else {
10524 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10525 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10526 }
10527 V1 =Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10528 V2 = getAllOnesValue(*R.DL,getWidenedType(ScalarTy, CommonVF));
10529 }elseif (!V1 && P2.isNull()) {
10530// Shuffle single entry node.
10531const TreeEntry *E = cast<const TreeEntry *>(P1);
10532unsigned VF = E->getVectorFactor();
10533 CommonVF = VF;
10534assert(
10535all_of(Mask,
10536 [=](intIdx) {return Idx < static_cast<int>(CommonVF); }) &&
10537"All elements in mask must be less than CommonVF.");
10538if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10539SmallVector<int> EMask = E->getCommonMask();
10540assert(!EMask.empty() &&"Expected non-empty common mask.");
10541for (int &Idx : CommonMask) {
10542if (Idx !=PoisonMaskElem)
10543Idx = EMask[Idx];
10544 }
10545 CommonVF = E->Scalars.size();
10546 }elseif (unsigned Factor = E->getInterleaveFactor();
10547 Factor > 0 && E->Scalars.size() != Mask.size() &&
10548ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
10549 Factor)) {
10550// Deinterleaved nodes are free.
10551 std::iota(CommonMask.begin(), CommonMask.end(), 0);
10552 }
10553 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10554 V1 =Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10555// Not identity/broadcast? Try to see if the original vector is better.
10556if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10557 CommonVF == CommonMask.size() &&
10558any_of(enumerate(CommonMask),
10559 [](constauto &&P) {
10560returnP.value() !=PoisonMaskElem &&
10561static_cast<unsigned>(P.value()) !=P.index();
10562 }) &&
10563any_of(CommonMask,
10564 [](intIdx) {returnIdx !=PoisonMaskElem &&Idx != 0; })) {
10565SmallVector<int> ReorderMask;
10566inversePermutation(E->ReorderIndices, ReorderMask);
10567::addMask(CommonMask, ReorderMask);
10568 }
10569 }elseif (V1 && P2.isNull()) {
10570// Shuffle single vector.
10571 ExtraCost += GetValueMinBWAffectedCost(V1);
10572 CommonVF = getVF(V1);
10573assert(
10574all_of(Mask,
10575 [=](intIdx) {return Idx < static_cast<int>(CommonVF); }) &&
10576"All elements in mask must be less than CommonVF.");
10577 }elseif (V1 && !V2) {
10578// Shuffle vector and tree node.
10579unsigned VF = getVF(V1);
10580const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10581 CommonVF = std::max(VF, E2->getVectorFactor());
10582assert(all_of(Mask,
10583 [=](intIdx) {
10584return Idx < 2 * static_cast<int>(CommonVF);
10585 }) &&
10586"All elements in mask must be less than 2 * CommonVF.");
10587if (E2->Scalars.size() == VF && VF != CommonVF) {
10588SmallVector<int> E2Mask = E2->getCommonMask();
10589assert(!E2Mask.empty() &&"Expected non-empty common mask.");
10590for (int &Idx : CommonMask) {
10591if (Idx ==PoisonMaskElem)
10592continue;
10593if (Idx >=static_cast<int>(CommonVF))
10594Idx = E2Mask[Idx - CommonVF] + VF;
10595 }
10596 CommonVF = VF;
10597 }
10598 ExtraCost += GetValueMinBWAffectedCost(V1);
10599 V1 =Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10600 ExtraCost += GetNodeMinBWAffectedCost(
10601 *E2, std::min(CommonVF, E2->getVectorFactor()));
10602 V2 = getAllOnesValue(*R.DL,getWidenedType(ScalarTy, CommonVF));
10603 }elseif (!V1 && V2) {
10604// Shuffle vector and tree node.
10605unsigned VF = getVF(V2);
10606const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10607 CommonVF = std::max(VF, E1->getVectorFactor());
10608assert(all_of(Mask,
10609 [=](intIdx) {
10610return Idx < 2 * static_cast<int>(CommonVF);
10611 }) &&
10612"All elements in mask must be less than 2 * CommonVF.");
10613if (E1->Scalars.size() == VF && VF != CommonVF) {
10614SmallVector<int> E1Mask = E1->getCommonMask();
10615assert(!E1Mask.empty() &&"Expected non-empty common mask.");
10616for (int &Idx : CommonMask) {
10617if (Idx ==PoisonMaskElem)
10618continue;
10619if (Idx >=static_cast<int>(CommonVF))
10620Idx = E1Mask[Idx - CommonVF] + VF;
10621else
10622Idx = E1Mask[Idx];
10623 }
10624 CommonVF = VF;
10625 }
10626 ExtraCost += GetNodeMinBWAffectedCost(
10627 *E1, std::min(CommonVF, E1->getVectorFactor()));
10628 V1 =Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10629 ExtraCost += GetValueMinBWAffectedCost(V2);
10630 V2 = getAllOnesValue(*R.DL,getWidenedType(ScalarTy, CommonVF));
10631 }else {
10632assert(V1 && V2 &&"Expected both vectors.");
10633unsigned VF = getVF(V1);
10634 CommonVF = std::max(VF, getVF(V2));
10635assert(all_of(Mask,
10636 [=](intIdx) {
10637return Idx < 2 * static_cast<int>(CommonVF);
10638 }) &&
10639"All elements in mask must be less than 2 * CommonVF.");
10640 ExtraCost +=
10641 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10642if (V1->getType() != V2->getType()) {
10643 V1 =Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10644 V2 = getAllOnesValue(*R.DL,getWidenedType(ScalarTy, CommonVF));
10645 }else {
10646if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
10647 V1 =Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10648if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10649 V2 = getAllOnesValue(*R.DL,getWidenedType(ScalarTy, CommonVF));
10650 }
10651 }
10652 InVectors.front() =
10653Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10654if (InVectors.size() == 2)
10655 InVectors.pop_back();
10656return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10657 V1, V2, CommonMask, Builder, ScalarTy);
10658 }
10659
10660public:
10661ShuffleCostEstimator(Type *ScalarTy,TargetTransformInfo &TTI,
10662ArrayRef<Value *> VectorizedVals,BoUpSLP &R,
10663SmallPtrSetImpl<Value *> &CheckedExtracts)
10664 : BaseShuffleAnalysis(ScalarTy),TTI(TTI),
10665 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10666 CheckedExtracts(CheckedExtracts) {}
10667Value *adjustExtracts(const TreeEntry *E,MutableArrayRef<int> Mask,
10668ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10669unsigned NumParts,bool &UseVecBaseAsInput) {
10670 UseVecBaseAsInput =false;
10671if (Mask.empty())
10672returnnullptr;
10673Value *VecBase =nullptr;
10674SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
10675if (!E->ReorderIndices.empty()) {
10676SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
10677 E->ReorderIndices.end());
10678reorderScalars(VL, ReorderMask);
10679 }
10680// Check if the extracts can be considered reused, i.e. whether the same
10681// extractelements were already vectorized in an earlier tree entry.
10682bool PrevNodeFound =any_of(
10683ArrayRef(R.VectorizableTree).take_front(E->Idx),
10684 [&](const std::unique_ptr<TreeEntry> &TE) {
10685 return ((TE->hasState() && !TE->isAltShuffle() &&
10686 TE->getOpcode() == Instruction::ExtractElement) ||
10687 TE->isGather()) &&
10688 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10689 return VL.size() > Data.index() &&
10690 (Mask[Data.index()] == PoisonMaskElem ||
10691 isa<UndefValue>(VL[Data.index()]) ||
10692 Data.value() == VL[Data.index()]);
10693 });
10694 });
10695SmallPtrSet<Value *, 4> UniqueBases;
10696unsigned SliceSize =getPartNumElems(VL.size(), NumParts);
10697for (unsigned Part : seq<unsigned>(NumParts)) {
10698unsigned Limit =getNumElems(VL.size(), SliceSize, Part);
10699ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10700for (auto [I, V] :
10701enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
10702// Ignore non-extractelement scalars.
10703if (isa<UndefValue>(V) ||
10704 (!SubMask.empty() && SubMask[I] ==PoisonMaskElem))
10705continue;
10706// If all users of the instruction are going to be vectorized and this
10707// instruction itself is not going to be vectorized, consider this
10708// instruction as dead and remove its cost from the final cost of the
10709// vectorized tree.
10710// Also, avoid adjusting the cost for extractelements with multiple uses
10711// in different graph entries.
10712auto *EE = cast<ExtractElementInst>(V);
10713 VecBase = EE->getVectorOperand();
10714 UniqueBases.insert(VecBase);
10715const TreeEntry *VE = R.getTreeEntry(V);
10716if (!CheckedExtracts.insert(V).second ||
10717 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10718any_of(EE->users(),
10719 [&](User *U) {
10720 return isa<GetElementPtrInst>(U) &&
10721 !R.areAllUsersVectorized(cast<Instruction>(U),
10722 &VectorizedVals);
10723 }) ||
10724 (VE && VE != E))
10725continue;
10726 std::optional<unsigned> EEIdx =getExtractIndex(EE);
10727if (!EEIdx)
10728continue;
10729unsignedIdx = *EEIdx;
10730// Take credit for instruction that will become dead.
10731if (EE->hasOneUse() || !PrevNodeFound) {
10732Instruction *Ext = EE->user_back();
10733if (isa<SExtInst, ZExtInst>(Ext) &&
10734all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10735// Use getExtractWithExtendCost() to calculate the cost of
10736// extractelement/ext pair.
10737Cost -=
10738TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
10739 EE->getVectorOperandType(),Idx);
10740// Add back the cost of s|zext which is subtracted separately.
10741Cost +=TTI.getCastInstrCost(
10742 Ext->getOpcode(), Ext->getType(), EE->getType(),
10743TTI::getCastContextHint(Ext), CostKind, Ext);
10744continue;
10745 }
10746 }
10747Cost -=TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10748 CostKind,Idx);
10749 }
10750 }
10751// Check that the gather of extractelements can be represented as just a
10752// shuffle of the single/two vectors the scalars are extracted from.
10753// We found a bunch of extractelement instructions that must be gathered
10754// into a vector and can be represented as a permutation of the elements of
10755// a single input vector or of 2 input vectors.
10756// This is skipped for reused extracts, i.e. if the same extractelements were vectorized already.
10757if (!PrevNodeFound)
10758Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10759 InVectors.assign(1, E);
10760 CommonMask.assign(Mask.begin(),Mask.end());
10761 transformMaskAfterShuffle(CommonMask, CommonMask);
10762 SameNodesEstimated =false;
10763if (NumParts != 1 && UniqueBases.size() != 1) {
10764 UseVecBaseAsInput =true;
10765 VecBase =
10766Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10767 }
10768return VecBase;
10769 }
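// In short, adjustExtracts above credits the cost of extractelements that are
// expected to become dead once the tree is vectorized (including
// extractelement + s/zext pairs priced via getExtractWithExtendCost), and then
// charges the shuffle needed to gather the extracted lanes unless an earlier
// tree entry already vectorized the same extracts.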
10770 /// Checks if the specified entry \p E needs to be delayed because of its
10771 /// dependency nodes.
10772 std::optional<InstructionCost>
10773needToDelay(const TreeEntry *,
10774ArrayRef<SmallVector<const TreeEntry *>>) const{
10775// No need to delay the cost estimation during analysis.
10776return std::nullopt;
10777 }
10778voidadd(const TreeEntry &E1,const TreeEntry &E2,ArrayRef<int> Mask) {
10779if (&E1 == &E2) {
10780assert(all_of(Mask,
10781 [&](intIdx) {
10782return Idx < static_cast<int>(E1.getVectorFactor());
10783 }) &&
10784"Expected single vector shuffle mask.");
10785 add(E1, Mask);
10786return;
10787 }
10788if (InVectors.empty()) {
10789 CommonMask.assign(Mask.begin(), Mask.end());
10790 InVectors.assign({&E1, &E2});
10791return;
10792 }
10793assert(!CommonMask.empty() &&"Expected non-empty common mask.");
10794auto *MaskVecTy =getWidenedType(ScalarTy, Mask.size());
10795unsigned NumParts =TTI.getNumberOfParts(MaskVecTy);
10796if (NumParts == 0 || NumParts >= Mask.size() ||
10797 MaskVecTy->getNumElements() % NumParts != 0 ||
10798 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10799 MaskVecTy->getNumElements() / NumParts))
10800 NumParts = 1;
10801unsigned SliceSize =getPartNumElems(Mask.size(), NumParts);
10802constauto *It =
10803find_if(Mask, [](intIdx) {returnIdx !=PoisonMaskElem; });
10804unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10805 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10806 }
10807voidadd(const TreeEntry &E1,ArrayRef<int> Mask) {
10808if (InVectors.empty()) {
10809 CommonMask.assign(Mask.begin(), Mask.end());
10810 InVectors.assign(1, &E1);
10811return;
10812 }
10813assert(!CommonMask.empty() &&"Expected non-empty common mask.");
10814auto *MaskVecTy =getWidenedType(ScalarTy, Mask.size());
10815unsigned NumParts =TTI.getNumberOfParts(MaskVecTy);
10816if (NumParts == 0 || NumParts >= Mask.size() ||
10817 MaskVecTy->getNumElements() % NumParts != 0 ||
10818 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10819 MaskVecTy->getNumElements() / NumParts))
10820 NumParts = 1;
10821unsigned SliceSize =getPartNumElems(Mask.size(), NumParts);
10822constauto *It =
10823find_if(Mask, [](intIdx) {returnIdx !=PoisonMaskElem; });
10824unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10825 estimateNodesPermuteCost(E1,nullptr, Mask, Part, SliceSize);
10826if (!SameNodesEstimated && InVectors.size() == 1)
10827 InVectors.emplace_back(&E1);
10828 }
10829 /// Adds 2 input vectors and the mask for their shuffling.
10830voidadd(Value *V1,Value *V2,ArrayRef<int> Mask) {
10831// This may only be reached when shuffling 2 vectors of extractelements, which
10832// were already handled in adjustExtracts.
10833assert(InVectors.size() == 1 &&
10834all_of(enumerate(CommonMask),
10835 [&](autoP) {
10836if (P.value() ==PoisonMaskElem)
10837return Mask[P.index()] ==PoisonMaskElem;
10838auto *EI = cast<ExtractElementInst>(
10839 cast<const TreeEntry *>(InVectors.front())
10840 ->getOrdered(P.index()));
10841return EI->getVectorOperand() == V1 ||
10842 EI->getVectorOperand() == V2;
10843 }) &&
10844"Expected extractelement vectors.");
10845 }
10846 /// Adds another one input vector and the mask for the shuffling.
10847voidadd(Value *V1,ArrayRef<int> Mask,bool ForExtracts =false) {
10848if (InVectors.empty()) {
10849assert(CommonMask.empty() && !ForExtracts &&
10850"Expected empty input mask/vectors.");
10851 CommonMask.assign(Mask.begin(), Mask.end());
10852 InVectors.assign(1, V1);
10853return;
10854 }
10855if (ForExtracts) {
10856// No need to add vectors here, already handled them in adjustExtracts.
10857assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10858 !CommonMask.empty() &&
10859all_of(enumerate(CommonMask),
10860 [&](autoP) {
10861Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10862 ->getOrdered(P.index());
10863if (P.value() ==PoisonMaskElem)
10864returnP.value() == Mask[P.index()] ||
10865 isa<UndefValue>(Scalar);
10866if (isa<Constant>(V1))
10867returntrue;
10868auto *EI = cast<ExtractElementInst>(Scalar);
10869return EI->getVectorOperand() == V1;
10870 }) &&
10871"Expected only tree entry for extractelement vectors.");
10872return;
10873 }
10874assert(!InVectors.empty() && !CommonMask.empty() &&
10875"Expected only tree entries from extracts/reused buildvectors.");
10876unsigned VF = getVF(V1);
10877if (InVectors.size() == 2) {
10878Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10879 transformMaskAfterShuffle(CommonMask, CommonMask);
10880 VF = std::max<unsigned>(VF, CommonMask.size());
10881 }elseif (constauto *InTE =
10882 InVectors.front().dyn_cast<const TreeEntry *>()) {
10883 VF = std::max(VF, InTE->getVectorFactor());
10884 }else {
10885 VF = std::max(
10886 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
10887 ->getNumElements());
10888 }
10889 InVectors.push_back(V1);
10890for (unsignedIdx = 0, Sz = CommonMask.size();Idx < Sz; ++Idx)
10891if (Mask[Idx] !=PoisonMaskElem && CommonMask[Idx] ==PoisonMaskElem)
10892 CommonMask[Idx] = Mask[Idx] + VF;
10893 }
10894Value *gather(ArrayRef<Value *> VL,unsigned MaskVF = 0,
10895Value *Root =nullptr) {
10896Cost += getBuildVectorCost(VL, Root);
10897if (!Root) {
10898// FIXME: Need to find a way to avoid use of getNullValue here.
10899SmallVector<Constant *> Vals;
10900unsigned VF = VL.size();
10901if (MaskVF != 0)
10902 VF = std::min(VF, MaskVF);
10903for (Value *V : VL.take_front(VF)) {
10904if (isa<UndefValue>(V)) {
10905 Vals.push_back(cast<Constant>(V));
10906continue;
10907 }
10908 Vals.push_back(Constant::getNullValue(V->getType()));
10909 }
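// If ScalarTy is itself a FixedVectorType (the REVEC case), the placeholder
// constants built above are expanded element by element below; e.g. two
// <2 x i32> placeholders become four i32 constants.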
10910if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
10911assert(SLPReVec &&"FixedVectorType is not expected.");
10912// When REVEC is enabled, we need to expand vector types into scalar
10913// types.
10914unsigned VecTyNumElements = VecTy->getNumElements();
10915SmallVector<Constant *> NewVals(VF * VecTyNumElements,nullptr);
10916for (auto [I, V] :enumerate(Vals)) {
10917Type *ScalarTy = V->getType()->getScalarType();
10918Constant *NewVal;
10919if (isa<PoisonValue>(V))
10920 NewVal =PoisonValue::get(ScalarTy);
10921elseif (isa<UndefValue>(V))
10922 NewVal =UndefValue::get(ScalarTy);
10923else
10924 NewVal =Constant::getNullValue(ScalarTy);
10925 std::fill_n(NewVals.begin() +I * VecTyNumElements, VecTyNumElements,
10926 NewVal);
10927 }
10928 Vals.swap(NewVals);
10929 }
10930returnConstantVector::get(Vals);
10931 }
10932returnConstantVector::getSplat(
10933ElementCount::getFixed(
10934 cast<FixedVectorType>(Root->getType())->getNumElements()),
10935 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
10936 }
10937InstructionCostcreateFreeze(InstructionCostCost) {returnCost; }
10938 /// Finalize emission of the shuffles.
10939InstructionCost
10940finalize(ArrayRef<int> ExtMask,
10941ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10942ArrayRef<int> SubVectorsMask,unsigned VF = 0,
10943function_ref<void(Value *&,SmallVectorImpl<int> &)> Action = {}) {
10944 IsFinalized =true;
10945if (Action) {
10946constPointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10947if (InVectors.size() == 2)
10948Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10949else
10950Cost += createShuffle(Vec,nullptr, CommonMask);
10951 transformMaskAfterShuffle(CommonMask, CommonMask);
10952assert(VF > 0 &&
10953"Expected vector length for the final value before action.");
10954Value *V = cast<Value *>(Vec);
10955 Action(V, CommonMask);
10956 InVectors.front() = V;
10957 }
10958if (!SubVectors.empty()) {
10959constPointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10960if (InVectors.size() == 2)
10961Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10962else
10963Cost += createShuffle(Vec,nullptr, CommonMask);
10964 transformMaskAfterShuffle(CommonMask, CommonMask);
10965// Add subvectors permutation cost.
10966if (!SubVectorsMask.empty()) {
10967assert(SubVectorsMask.size() <= CommonMask.size() &&
10968"Expected same size of masks for subvectors and common mask.");
10969SmallVector<int> SVMask(CommonMask.size(),PoisonMaskElem);
10970copy(SubVectorsMask, SVMask.begin());
10971for (auto [I1, I2] :zip(SVMask, CommonMask)) {
10972if (I2 !=PoisonMaskElem) {
10973assert(I1 ==PoisonMaskElem &&"Expected unused subvectors mask");
10974 I1 = I2 + CommonMask.size();
10975 }
10976 }
10977Cost +=::getShuffleCost(TTI,TTI::SK_PermuteTwoSrc,
10978getWidenedType(ScalarTy, CommonMask.size()),
10979 SVMask,CostKind);
10980 }
10981for (auto [E,Idx] : SubVectors) {
10982Type *EScalarTy = E->Scalars.front()->getType();
10983bool IsSigned =true;
10984if (auto It =R.MinBWs.find(E); It !=R.MinBWs.end()) {
10985 EScalarTy =
10986IntegerType::get(EScalarTy->getContext(), It->second.first);
10987 IsSigned = It->second.second;
10988 }
10989if (ScalarTy != EScalarTy) {
10990unsigned CastOpcode = Instruction::Trunc;
10991unsigned DstSz =R.DL->getTypeSizeInBits(ScalarTy);
10992unsigned SrcSz =R.DL->getTypeSizeInBits(EScalarTy);
10993if (DstSz > SrcSz)
10994 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10995Cost +=TTI.getCastInstrCost(
10996 CastOpcode,getWidenedType(ScalarTy, E->getVectorFactor()),
10997getWidenedType(EScalarTy, E->getVectorFactor()),
10998TTI::CastContextHint::Normal,CostKind);
10999 }
11000Cost +=::getShuffleCost(
11001TTI,TTI::SK_InsertSubvector,
11002getWidenedType(ScalarTy, CommonMask.size()), {},CostKind,Idx,
11003getWidenedType(ScalarTy, E->getVectorFactor()));
11004if (!CommonMask.empty()) {
11005 std::iota(std::next(CommonMask.begin(),Idx),
11006 std::next(CommonMask.begin(),Idx + E->getVectorFactor()),
11007Idx);
11008 }
11009 }
11010 }
11011
11012if (!ExtMask.empty()) {
11013if (CommonMask.empty()) {
11014 CommonMask.assign(ExtMask.begin(), ExtMask.end());
11015 }else {
11016SmallVector<int> NewMask(ExtMask.size(),PoisonMaskElem);
11017for (intI = 0, Sz = ExtMask.size();I < Sz; ++I) {
11018if (ExtMask[I] ==PoisonMaskElem)
11019continue;
11020 NewMask[I] = CommonMask[ExtMask[I]];
11021 }
11022 CommonMask.swap(NewMask);
11023 }
11024 }
11025if (CommonMask.empty()) {
11026assert(InVectors.size() == 1 &&"Expected only one vector with no mask");
11027returnCost;
11028 }
11029returnCost +
11030 createShuffle(InVectors.front(),
11031 InVectors.size() == 2 ? InVectors.back() :nullptr,
11032 CommonMask);
11033 }
11034
11035~ShuffleCostEstimator() {
11036assert((IsFinalized || CommonMask.empty()) &&
11037"Shuffle construction must be finalized.");
11038 }
11039};
11040
11041const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
11042unsignedIdx) const{
11043if (const TreeEntry *VE = getMatchedVectorizedOperand(E,Idx))
11044return VE;
11045constauto *It =
11046find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11047return TE->isGather() &&
11048find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
11049 return EI.EdgeIdx == Idx && EI.UserTE == E;
11050 }) != TE->UserTreeIndices.end();
11051 });
11052assert(It != VectorizableTree.end() &&"Expected vectorizable entry.");
11053return It->get();
11054}
11055
11056TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const{
11057if (TE.State == TreeEntry::ScatterVectorize ||
11058 TE.State == TreeEntry::StridedVectorize)
11059returnTTI::CastContextHint::GatherScatter;
11060if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
11061 !TE.isAltShuffle()) {
11062if (TE.ReorderIndices.empty())
11063returnTTI::CastContextHint::Normal;
11064SmallVector<int> Mask;
11065inversePermutation(TE.ReorderIndices, Mask);
11066if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
11067returnTTI::CastContextHint::Reversed;
11068 }
11069returnTTI::CastContextHint::None;
11070}
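// Example: a vectorized load whose ReorderIndices invert to a reverse mask is
// reported as CastContextHint::Reversed, while strided and scatter entries are
// reported as CastContextHint::GatherScatter.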
11071
11072/// Builds the arguments types vector for the given call instruction with the
11073/// given \p ID for the specified vector factor.
11074staticSmallVector<Type *>
11075buildIntrinsicArgTypes(constCallInst *CI,constIntrinsic::IDID,
11076constunsigned VF,unsigned MinBW,
11077constTargetTransformInfo *TTI) {
11078SmallVector<Type *> ArgTys;
11079for (auto [Idx, Arg] :enumerate(CI->args())) {
11080if (ID !=Intrinsic::not_intrinsic) {
11081if (isVectorIntrinsicWithScalarOpAtArg(ID,Idx,TTI)) {
11082 ArgTys.push_back(Arg->getType());
11083continue;
11084 }
11085if (MinBW > 0) {
11086 ArgTys.push_back(
11087getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
11088continue;
11089 }
11090 }
11091 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
11092 }
11093return ArgTys;
11094}
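// Note: operands that must stay scalar for the intrinsic (per
// isVectorIntrinsicWithScalarOpAtArg) keep their original type; all other
// operands are widened to VF lanes, using the MinBW integer type when a
// narrower bit width was computed for the call.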
11095
11096InstructionCost
11097BoUpSLP::getEntryCost(const TreeEntry *E,ArrayRef<Value *> VectorizedVals,
11098SmallPtrSetImpl<Value *> &CheckedExtracts) {
11099ArrayRef<Value *> VL = E->Scalars;
11100
11101Type *ScalarTy =getValueType(VL[0]);
11102if (!isValidElementType(ScalarTy))
11103returnInstructionCost::getInvalid();
11104TTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
11105
11106// If we have computed a smaller type for the expression, update VecTy so
11107// that the costs will be accurate.
11108auto It = MinBWs.find(E);
11109Type *OrigScalarTy = ScalarTy;
11110if (It != MinBWs.end()) {
11111auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11112 ScalarTy =IntegerType::get(F->getContext(), It->second.first);
11113if (VecTy)
11114 ScalarTy =getWidenedType(ScalarTy, VecTy->getNumElements());
11115 }
11116auto *VecTy =getWidenedType(ScalarTy, VL.size());
11117unsigned EntryVF = E->getVectorFactor();
11118auto *FinalVecTy =getWidenedType(ScalarTy, EntryVF);
11119
11120if (E->isGather()) {
11121if (allConstant(VL))
11122return 0;
11123if (isa<InsertElementInst>(VL[0]))
11124returnInstructionCost::getInvalid();
11125if (isa<CmpInst>(VL.front()))
11126 ScalarTy = VL.front()->getType();
11127return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11128 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
11129 }
11130InstructionCost CommonCost = 0;
11131SmallVector<int>Mask;
11132if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11133 !isReverseOrder(E->ReorderIndices))) {
11134SmallVector<int> NewMask;
11135if (E->getOpcode() == Instruction::Store) {
11136// For stores the order is actually a mask.
11137 NewMask.resize(E->ReorderIndices.size());
11138copy(E->ReorderIndices, NewMask.begin());
11139 }else {
11140inversePermutation(E->ReorderIndices, NewMask);
11141 }
11142::addMask(Mask, NewMask);
11143 }
11144if (!E->ReuseShuffleIndices.empty())
11145::addMask(Mask, E->ReuseShuffleIndices);
11146if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask,Mask.size()))
11147 CommonCost =
11148::getShuffleCost(*TTI,TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
11149assert((E->State == TreeEntry::Vectorize ||
11150 E->State == TreeEntry::ScatterVectorize ||
11151 E->State == TreeEntry::StridedVectorize) &&
11152"Unhandled state");
11153assert(E->getOpcode() &&
11154 ((allSameType(VL) &&allSameBlock(VL)) ||
11155 (E->getOpcode() == Instruction::GetElementPtr &&
11156 E->getMainOp()->getType()->isPointerTy())) &&
11157"Invalid VL");
11158Instruction *VL0 = E->getMainOp();
11159unsigned ShuffleOrOp =
11160 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11161if (E->CombinedOp != TreeEntry::NotCombinedOp)
11162 ShuffleOrOp = E->CombinedOp;
11163SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
11164constunsigned Sz = UniqueValues.size();
11165SmallBitVector UsedScalars(Sz,false);
11166for (unsignedI = 0;I < Sz; ++I) {
11167if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
11168continue;
11169 UsedScalars.set(I);
11170 }
11171auto GetCastContextHint = [&](Value *V) {
11172if (const TreeEntry *OpTE = getTreeEntry(V))
11173return getCastContextHint(*OpTE);
11174 InstructionsState SrcState =getSameOpcode(E->getOperand(0), *TLI);
11175if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11176 !SrcState.isAltShuffle())
11177returnTTI::CastContextHint::GatherScatter;
11178returnTTI::CastContextHint::None;
11179 };
11180auto GetCostDiff =
11181 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
11182function_ref<InstructionCost(InstructionCost)> VectorCost) {
11183// Calculate the cost of this instruction.
11184InstructionCost ScalarCost = 0;
11185if (isa<CastInst, CallInst>(VL0)) {
11186// For some of the instructions there is no need to calculate the cost for
11187// each particular instruction; we can use the cost of a single
11188// instruction multiplied by the total number of scalar instructions.
11189 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11190 }else {
11191for (unsignedI = 0;I < Sz; ++I) {
11192if (UsedScalars.test(I))
11193continue;
11194 ScalarCost += ScalarEltCost(I);
11195 }
11196 }
11197
11198InstructionCost VecCost = VectorCost(CommonCost);
11199// Check if the current node must be resized because the parent node is not
11200// resized.
11201if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
11202 E->Idx != 0 &&
11203 (E->getOpcode() != Instruction::Load ||
11204 !E->UserTreeIndices.empty())) {
11205const EdgeInfo &EI =
11206 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
11207 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11208 });
11209if (EI.UserTE->getOpcode() != Instruction::Select ||
11210 EI.EdgeIdx != 0) {
11211auto UserBWIt = MinBWs.find(EI.UserTE);
11212Type *UserScalarTy =
11213 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11214if (UserBWIt != MinBWs.end())
11215 UserScalarTy =IntegerType::get(ScalarTy->getContext(),
11216 UserBWIt->second.first);
11217if (ScalarTy != UserScalarTy) {
11218unsigned BWSz =DL->getTypeSizeInBits(ScalarTy);
11219unsigned SrcBWSz =DL->getTypeSizeInBits(UserScalarTy);
11220unsigned VecOpcode;
11221auto *UserVecTy =getWidenedType(UserScalarTy, E->Scalars.size());
11222if (BWSz > SrcBWSz)
11223 VecOpcode = Instruction::Trunc;
11224else
11225 VecOpcode =
11226 It->second.second ? Instruction::SExt : Instruction::ZExt;
11227TTI::CastContextHint CCH = GetCastContextHint(VL0);
11228 VecCost +=TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
11229CostKind);
11230 }
11231 }
11232 }
11233LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11234 ScalarCost,"Calculated costs for Tree"));
11235return VecCost - ScalarCost;
11236 };
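// GetCostDiff returns VecCost - ScalarCost, so a negative result means that
// vectorizing this node is expected to be profitable; CommonCost (the
// reorder/reuse shuffle computed above) is folded into the vector side.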
11237// Calculate cost difference from vectorizing set of GEPs.
11238// Negative value means vectorizing is profitable.
11239auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs,Value *BasePtr) {
11240assert((E->State == TreeEntry::Vectorize ||
11241 E->State == TreeEntry::StridedVectorize) &&
11242"Entry state expected to be Vectorize or StridedVectorize here.");
11243InstructionCost ScalarCost = 0;
11244InstructionCost VecCost = 0;
11245 std::tie(ScalarCost, VecCost) =getGEPCosts(
11246 *TTI, Ptrs, BasePtr, E->getOpcode(),CostKind, OrigScalarTy, VecTy);
11247LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11248"Calculated GEPs cost for Tree"));
11249
11250return VecCost - ScalarCost;
11251 };
11252
11253auto GetMinMaxCost = [&](Type *Ty,Instruction *VI =nullptr) {
11254auto [MinMaxID, SelectOnly] =canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
11255if (MinMaxID ==Intrinsic::not_intrinsic)
11256returnInstructionCost::getInvalid();
11257Type *CanonicalType = Ty;
11258if (CanonicalType->isPtrOrPtrVectorTy())
11259 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
11260 CanonicalType->getContext(),
11261DL->getTypeSizeInBits(CanonicalType->getScalarType())));
11262
11263IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
11264 {CanonicalType, CanonicalType});
11265InstructionCost IntrinsicCost =
11266TTI->getIntrinsicInstrCost(CostAttrs,CostKind);
11267// If the selects are the only uses of the compares, they will be
11268// dead and we can adjust the cost by removing their cost.
11269if (VI && SelectOnly) {
11270assert((!Ty->isVectorTy() ||SLPReVec) &&
11271"Expected only for scalar type.");
11272auto *CI = cast<CmpInst>(VI->getOperand(0));
11273 IntrinsicCost -=TTI->getCmpSelInstrCost(
11274 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11275CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11276 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11277 }
11278return IntrinsicCost;
11279 };
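// GetMinMaxCost models a cmp + select pair that can be converted into a
// min/max intrinsic; when the select is the only user of the compare, the
// compare cost is subtracted because it becomes dead.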
11280switch (ShuffleOrOp) {
11281case Instruction::PHI: {
11282// Count reused scalars.
11283InstructionCost ScalarCost = 0;
11284SmallPtrSet<const TreeEntry *, 4> CountedOps;
11285for (Value *V : UniqueValues) {
11286auto *PHI = dyn_cast<PHINode>(V);
11287if (!PHI)
11288continue;
11289
11290ValueListOperands(PHI->getNumIncomingValues(),nullptr);
11291for (unsignedI = 0,N =PHI->getNumIncomingValues();I <N; ++I) {
11292Value *Op =PHI->getIncomingValue(I);
11293Operands[I] =Op;
11294 }
11295if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
11296if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
11297if (!OpTE->ReuseShuffleIndices.empty())
11298 ScalarCost +=TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11299 OpTE->Scalars.size());
11300 }
11301
11302return CommonCost - ScalarCost;
11303 }
11304case Instruction::ExtractValue:
11305case Instruction::ExtractElement: {
11306auto GetScalarCost = [&](unsignedIdx) {
11307if (isa<PoisonValue>(UniqueValues[Idx]))
11308returnInstructionCost(TTI::TCC_Free);
11309
11310auto *I = cast<Instruction>(UniqueValues[Idx]);
11311VectorType *SrcVecTy;
11312if (ShuffleOrOp == Instruction::ExtractElement) {
11313auto *EE = cast<ExtractElementInst>(I);
11314 SrcVecTy = EE->getVectorOperandType();
11315 }else {
11316auto *EV = cast<ExtractValueInst>(I);
11317Type *AggregateTy = EV->getAggregateOperand()->getType();
11318unsigned NumElts;
11319if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11320 NumElts = ATy->getNumElements();
11321else
11322 NumElts = AggregateTy->getStructNumElements();
11323 SrcVecTy =getWidenedType(OrigScalarTy, NumElts);
11324 }
11325if (I->hasOneUse()) {
11326Instruction *Ext =I->user_back();
11327if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11328all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
11329// Use getExtractWithExtendCost() to calculate the cost of
11330// extractelement/ext pair.
11331InstructionCostCost =TTI->getExtractWithExtendCost(
11332Ext->getOpcode(),Ext->getType(), SrcVecTy, *getExtractIndex(I));
11333// Subtract the cost of s|zext which is subtracted separately.
11334Cost -=TTI->getCastInstrCost(
11335Ext->getOpcode(),Ext->getType(),I->getType(),
11336TTI::getCastContextHint(Ext),CostKind, Ext);
11337returnCost;
11338 }
11339 }
11340returnTTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
11341CostKind, *getExtractIndex(I));
11342 };
11343auto GetVectorCost = [](InstructionCost CommonCost) {return CommonCost; };
11344return GetCostDiff(GetScalarCost, GetVectorCost);
11345 }
11346case Instruction::InsertElement: {
11347assert(E->ReuseShuffleIndices.empty() &&
11348"Unique insertelements only are expected.");
11349auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
11350unsignedconst NumElts = SrcVecTy->getNumElements();
11351unsignedconst NumScalars = VL.size();
11352
11353unsigned NumOfParts =TTI->getNumberOfParts(SrcVecTy);
11354
11355SmallVector<int> InsertMask(NumElts,PoisonMaskElem);
11356unsigned OffsetBeg = *getElementIndex(VL.front());
11357unsigned OffsetEnd = OffsetBeg;
11358 InsertMask[OffsetBeg] = 0;
11359for (auto [I, V] :enumerate(VL.drop_front())) {
11360unsignedIdx = *getElementIndex(V);
11361if (OffsetBeg >Idx)
11362 OffsetBeg =Idx;
11363elseif (OffsetEnd <Idx)
11364 OffsetEnd =Idx;
11365 InsertMask[Idx] =I + 1;
11366 }
11367unsigned VecScalarsSz =PowerOf2Ceil(NumElts);
11368if (NumOfParts > 0 && NumOfParts < NumElts)
11369 VecScalarsSz =PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11370unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11371 VecScalarsSz;
11372unsignedOffset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11373unsigned InsertVecSz = std::min<unsigned>(
11374PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
11375 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11376bool IsWholeSubvector =
11377 OffsetBeg ==Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11378// Check if we can safely insert a subvector. If it is not possible, just
11379// generate a whole-sized vector and shuffle the source vector and the new
11380// subvector.
11381if (OffsetBeg + InsertVecSz > VecSz) {
11382// Align OffsetBeg to generate correct mask.
11383 OffsetBeg =alignDown(OffsetBeg, VecSz,Offset);
11384 InsertVecSz = VecSz;
11385 }
11386
11387APInt DemandedElts =APInt::getZero(NumElts);
11388// TODO: Add support for Instruction::InsertValue.
11389SmallVector<int>Mask;
11390if (!E->ReorderIndices.empty()) {
11391inversePermutation(E->ReorderIndices, Mask);
11392Mask.append(InsertVecSz -Mask.size(),PoisonMaskElem);
11393 }else {
11394Mask.assign(VecSz,PoisonMaskElem);
11395 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
11396 }
11397bool IsIdentity =true;
11398SmallVector<int> PrevMask(InsertVecSz,PoisonMaskElem);
11399Mask.swap(PrevMask);
11400for (unsignedI = 0;I < NumScalars; ++I) {
11401unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
11402 DemandedElts.setBit(InsertIdx);
11403 IsIdentity &= InsertIdx - OffsetBeg ==I;
11404Mask[InsertIdx - OffsetBeg] =I;
11405 }
11406assert(Offset < NumElts &&"Failed to find vector index offset");
11407
11408InstructionCostCost = 0;
11409Cost -=TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
11410/*Insert*/true,/*Extract*/false,
11411CostKind);
11412
11413// First cost - resize to the actual vector size if this is not an identity
11414// shuffle or the vector needs to be shifted.
11415// Do not calculate the cost if the actual size is the register size and
11416// we can merge this shuffle with the following SK_Select.
11417auto *InsertVecTy =getWidenedType(ScalarTy, InsertVecSz);
11418if (!IsIdentity)
11419Cost +=::getShuffleCost(*TTI,TargetTransformInfo::SK_PermuteSingleSrc,
11420 InsertVecTy, Mask);
11421auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11422 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11423 }));
11424// Second cost - a permutation with the subvector, when some elements come
11425// from the initial vector, or the cost of inserting a subvector.
11426// TODO: Implement the analysis of the FirstInsert->getOperand(0)
11427// subvector of ActualVecTy.
11428SmallBitVector InMask =
11429isUndefVector(FirstInsert->getOperand(0),
11430buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11431if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
11432if (InsertVecSz != VecSz) {
11433auto *ActualVecTy =getWidenedType(ScalarTy, VecSz);
11434Cost +=::getShuffleCost(*TTI,TTI::SK_InsertSubvector, ActualVecTy, {},
11435CostKind, OffsetBeg -Offset, InsertVecTy);
11436 }else {
11437for (unsignedI = 0,End = OffsetBeg -Offset;I <End; ++I)
11438 Mask[I] = InMask.test(I) ?PoisonMaskElem :I;
11439for (unsignedI = OffsetBeg -Offset,End = OffsetEnd -Offset;
11440I <=End; ++I)
11441if (Mask[I] !=PoisonMaskElem)
11442Mask[I] =I + VecSz;
11443for (unsignedI = OffsetEnd + 1 -Offset;I < VecSz; ++I)
11444 Mask[I] =
11445 ((I >= InMask.size()) || InMask.test(I)) ?PoisonMaskElem :I;
11446Cost +=
11447::getShuffleCost(*TTI,TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
11448 }
11449 }
11450returnCost;
11451 }
11452case Instruction::ZExt:
11453case Instruction::SExt:
11454case Instruction::FPToUI:
11455case Instruction::FPToSI:
11456case Instruction::FPExt:
11457case Instruction::PtrToInt:
11458case Instruction::IntToPtr:
11459case Instruction::SIToFP:
11460case Instruction::UIToFP:
11461case Instruction::Trunc:
11462case Instruction::FPTrunc:
11463case Instruction::BitCast: {
11464auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11465Type *SrcScalarTy = VL0->getOperand(0)->getType();
11466auto *SrcVecTy =getWidenedType(SrcScalarTy, VL.size());
11467unsigned Opcode = ShuffleOrOp;
11468unsigned VecOpcode = Opcode;
11469if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
11470 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
11471// Check if the values are candidates to demote.
11472unsigned SrcBWSz =DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
11473if (SrcIt != MinBWs.end()) {
11474 SrcBWSz = SrcIt->second.first;
11475unsigned SrcScalarTyNumElements =getNumElements(SrcScalarTy);
11476 SrcScalarTy =IntegerType::get(F->getContext(), SrcBWSz);
11477 SrcVecTy =
11478getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
11479 }
11480unsigned BWSz =DL->getTypeSizeInBits(ScalarTy->getScalarType());
11481if (BWSz == SrcBWSz) {
11482 VecOpcode = Instruction::BitCast;
11483 }elseif (BWSz < SrcBWSz) {
11484 VecOpcode = Instruction::Trunc;
11485 }elseif (It != MinBWs.end()) {
11486assert(BWSz > SrcBWSz &&"Invalid cast!");
11487 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11488 }elseif (SrcIt != MinBWs.end()) {
11489assert(BWSz > SrcBWSz &&"Invalid cast!");
11490 VecOpcode =
11491 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11492 }
11493 }elseif (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
11494 !SrcIt->second.second) {
11495 VecOpcode = Instruction::UIToFP;
11496 }
11497auto GetScalarCost = [&](unsignedIdx) ->InstructionCost {
11498assert(Idx == 0 &&"Expected 0 index only");
11499returnTTI->getCastInstrCost(Opcode, VL0->getType(),
11500 VL0->getOperand(0)->getType(),
11501TTI::getCastContextHint(VL0),CostKind, VL0);
11502 };
11503auto GetVectorCost = [=](InstructionCost CommonCost) {
11504// Do not count the cost here if minimum bitwidth is in effect and it is just
11505// a bitcast (which is just a noop here).
11506if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11507return CommonCost;
11508auto *VI = VL0->getOpcode() == Opcode ? VL0 :nullptr;
11509TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
11510
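// A zext/sext at the tree root that only feeds an add/mul/and/or/xor (or the
// FP forms) reduction is not charged below, presumably because such an
// extension is expected to be folded into an extended reduction by the
// target.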
11511bool IsArithmeticExtendedReduction =
11512 E->Idx == 0 && UserIgnoreList &&
11513all_of(*UserIgnoreList, [](Value *V) {
11514auto *I = cast<Instruction>(V);
11515returnis_contained({Instruction::Add, Instruction::FAdd,
11516 Instruction::Mul, Instruction::FMul,
11517 Instruction::And, Instruction::Or,
11518 Instruction::Xor},
11519I->getOpcode());
11520 });
11521if (IsArithmeticExtendedReduction &&
11522 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11523return CommonCost;
11524return CommonCost +
11525TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH,CostKind,
11526 VecOpcode == Opcode ? VI :nullptr);
11527 };
11528return GetCostDiff(GetScalarCost, GetVectorCost);
11529 }
11530case Instruction::FCmp:
11531case Instruction::ICmp:
11532case Instruction::Select: {
11533CmpPredicate VecPred, SwappedVecPred;
11534auto MatchCmp =m_Cmp(VecPred,m_Value(),m_Value());
11535if (match(VL0,m_Select(MatchCmp,m_Value(),m_Value())) ||
11536match(VL0, MatchCmp))
11537 SwappedVecPred =CmpInst::getSwappedPredicate(VecPred);
11538else
11539 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
11540 ?CmpInst::BAD_FCMP_PREDICATE
11541 :CmpInst::BAD_ICMP_PREDICATE;
11542auto GetScalarCost = [&](unsignedIdx) {
11543if (isa<PoisonValue>(UniqueValues[Idx]))
11544returnInstructionCost(TTI::TCC_Free);
11545
11546auto *VI = cast<Instruction>(UniqueValues[Idx]);
11547CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
11548 ?CmpInst::BAD_FCMP_PREDICATE
11549 :CmpInst::BAD_ICMP_PREDICATE;
11550auto MatchCmp =m_Cmp(CurrentPred,m_Value(),m_Value());
11551if ((!match(VI,m_Select(MatchCmp,m_Value(),m_Value())) &&
11552 !match(VI, MatchCmp)) ||
11553 (CurrentPred !=static_cast<CmpInst::Predicate>(VecPred) &&
11554 CurrentPred !=static_cast<CmpInst::Predicate>(SwappedVecPred)))
11555 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
11556 ?CmpInst::BAD_FCMP_PREDICATE
11557 :CmpInst::BAD_ICMP_PREDICATE;
11558
11559InstructionCost ScalarCost =TTI->getCmpSelInstrCost(
11560 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11561CostKind, getOperandInfo(VI->getOperand(0)),
11562 getOperandInfo(VI->getOperand(1)), VI);
11563InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
11564if (IntrinsicCost.isValid())
11565 ScalarCost = IntrinsicCost;
11566
11567return ScalarCost;
11568 };
11569auto GetVectorCost = [&](InstructionCost CommonCost) {
11570auto *MaskTy =getWidenedType(Builder.getInt1Ty(), VL.size());
11571
11572InstructionCost VecCost =
11573TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
11574CostKind, getOperandInfo(E->getOperand(0)),
11575 getOperandInfo(E->getOperand(1)), VL0);
11576if (auto *SI = dyn_cast<SelectInst>(VL0)) {
11577auto *CondType =
11578getWidenedType(SI->getCondition()->getType(), VL.size());
11579unsigned CondNumElements = CondType->getNumElements();
11580unsigned VecTyNumElements =getNumElements(VecTy);
11581assert(VecTyNumElements >= CondNumElements &&
11582 VecTyNumElements % CondNumElements == 0 &&
11583"Cannot vectorize Instruction::Select");
11584if (CondNumElements != VecTyNumElements) {
11585// When the return type is i1 but the source is a fixed vector type, we
11586// need to duplicate the condition value.
11587 VecCost +=::getShuffleCost(
11588 *TTI,TTI::SK_PermuteSingleSrc, CondType,
11589createReplicatedMask(VecTyNumElements / CondNumElements,
11590 CondNumElements));
11591 }
11592 }
11593return VecCost + CommonCost;
11594 };
11595return GetCostDiff(GetScalarCost, GetVectorCost);
11596 }
11597case TreeEntry::MinMax: {
11598auto GetScalarCost = [&](unsignedIdx) {
11599return GetMinMaxCost(OrigScalarTy);
11600 };
11601auto GetVectorCost = [&](InstructionCost CommonCost) {
11602InstructionCost VecCost = GetMinMaxCost(VecTy);
11603return VecCost + CommonCost;
11604 };
11605return GetCostDiff(GetScalarCost, GetVectorCost);
11606 }
11607case Instruction::FNeg:
11608case Instruction::Add:
11609case Instruction::FAdd:
11610case Instruction::Sub:
11611case Instruction::FSub:
11612case Instruction::Mul:
11613case Instruction::FMul:
11614case Instruction::UDiv:
11615case Instruction::SDiv:
11616case Instruction::FDiv:
11617case Instruction::URem:
11618case Instruction::SRem:
11619case Instruction::FRem:
11620case Instruction::Shl:
11621case Instruction::LShr:
11622case Instruction::AShr:
11623case Instruction::And:
11624case Instruction::Or:
11625case Instruction::Xor: {
11626auto GetScalarCost = [&](unsignedIdx) {
11627if (isa<PoisonValue>(UniqueValues[Idx]))
11628returnInstructionCost(TTI::TCC_Free);
11629
11630auto *VI = cast<Instruction>(UniqueValues[Idx]);
11631unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11632TTI::OperandValueInfo Op1Info =TTI::getOperandInfo(VI->getOperand(0));
11633TTI::OperandValueInfo Op2Info =
11634TTI::getOperandInfo(VI->getOperand(OpIdx));
11635SmallVector<const Value *>Operands(VI->operand_values());
11636returnTTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy,CostKind,
11637 Op1Info, Op2Info,Operands, VI);
11638 };
11639auto GetVectorCost = [=](InstructionCost CommonCost) {
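// If the minimum bit width applies and every constant operand of the 'and'
// has at least that many trailing ones, the 'and' is in effect a no-op on the
// demoted type, so only the common (shuffle) cost is charged. E.g. 'and i32
// %x, 255' with a computed minimum width of 8 bits.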
11640if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
11641for (unsignedI : seq<unsigned>(0, E->getNumOperands())) {
11642ArrayRef<Value *> Ops = E->getOperand(I);
11643if (all_of(Ops, [&](Value *Op) {
11644auto *CI = dyn_cast<ConstantInt>(Op);
11645return CI && CI->getValue().countr_one() >= It->second.first;
11646 }))
11647return CommonCost;
11648 }
11649 }
11650unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11651TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
11652TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
11653returnTTI->getArithmeticInstrCost(ShuffleOrOp, VecTy,CostKind, Op1Info,
11654 Op2Info, {},nullptr, TLI) +
11655 CommonCost;
11656 };
11657return GetCostDiff(GetScalarCost, GetVectorCost);
11658 }
11659case Instruction::GetElementPtr: {
11660return CommonCost + GetGEPCostDiff(VL, VL0);
11661 }
11662case Instruction::Load: {
11663auto GetScalarCost = [&](unsignedIdx) {
11664auto *VI = cast<LoadInst>(UniqueValues[Idx]);
11665returnTTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
11666VI->getAlign(),VI->getPointerAddressSpace(),
11667CostKind,TTI::OperandValueInfo(), VI);
11668 };
11669auto *LI0 = cast<LoadInst>(VL0);
11670auto GetVectorCost = [&](InstructionCost CommonCost) {
11671InstructionCost VecLdCost;
11672switch (E->State) {
11673case TreeEntry::Vectorize:
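// A vectorized load is either part of an interleaved group, costed as the
// whole interleaved access, or a single consecutive wide load.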
11674if (unsigned Factor = E->getInterleaveFactor()) {
11675 VecLdCost =TTI->getInterleavedMemoryOpCost(
11676 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11677 LI0->getPointerAddressSpace(),CostKind);
11678
11679 }else {
11680 VecLdCost =TTI->getMemoryOpCost(
11681 Instruction::Load, VecTy, LI0->getAlign(),
11682 LI0->getPointerAddressSpace(),CostKind,TTI::OperandValueInfo());
11683 }
11684break;
11685case TreeEntry::StridedVectorize: {
11686Align CommonAlignment =
11687 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11688 VecLdCost =TTI->getStridedMemoryOpCost(
11689 Instruction::Load, VecTy, LI0->getPointerOperand(),
11690/*VariableMask=*/false, CommonAlignment,CostKind);
11691break;
11692 }
11693case TreeEntry::ScatterVectorize: {
11694Align CommonAlignment =
11695 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11696 VecLdCost =TTI->getGatherScatterOpCost(
11697 Instruction::Load, VecTy, LI0->getPointerOperand(),
11698/*VariableMask=*/false, CommonAlignment,CostKind);
11699break;
11700 }
11701case TreeEntry::CombinedVectorize:
11702case TreeEntry::NeedToGather:
11703llvm_unreachable("Unexpected vectorization state.");
11704 }
11705return VecLdCost + CommonCost;
11706 };
11707
11708InstructionCostCost = GetCostDiff(GetScalarCost, GetVectorCost);
11709// If this node generates a masked gather load, it is not a terminal node.
11710// Hence the address operand cost is estimated separately.
11711if (E->State == TreeEntry::ScatterVectorize)
11712returnCost;
11713
11714// Estimate cost of GEPs since this tree node is a terminator.
11715SmallVector<Value *> PointerOps(VL.size());
11716for (auto [I, V] :enumerate(VL))
11717 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
11718returnCost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11719 }
11720case Instruction::Store: {
11721bool IsReorder = !E->ReorderIndices.empty();
11722auto GetScalarCost = [=](unsignedIdx) {
11723auto *VI = cast<StoreInst>(VL[Idx]);
11724TTI::OperandValueInfo OpInfo =TTI::getOperandInfo(VI->getValueOperand());
11725returnTTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
11726VI->getAlign(),VI->getPointerAddressSpace(),
11727CostKind, OpInfo, VI);
11728 };
11729auto *BaseSI =
11730 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11731auto GetVectorCost = [=](InstructionCost CommonCost) {
11732// We know that we can merge the stores. Calculate the cost.
11733InstructionCost VecStCost;
11734if (E->State == TreeEntry::StridedVectorize) {
11735Align CommonAlignment =
11736 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11737 VecStCost =TTI->getStridedMemoryOpCost(
11738 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11739/*VariableMask=*/false, CommonAlignment,CostKind);
11740 }else {
11741assert(E->State == TreeEntry::Vectorize &&
11742"Expected either strided or consecutive stores.");
11743if (unsigned Factor = E->getInterleaveFactor()) {
11744assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11745"No reused shuffles expected");
11746 CommonCost = 0;
11747 VecStCost =TTI->getInterleavedMemoryOpCost(
11748 Instruction::Store, VecTy, Factor, std::nullopt,
11749 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(),CostKind);
11750 }else {
11751TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
11752 VecStCost =TTI->getMemoryOpCost(
11753 Instruction::Store, VecTy, BaseSI->getAlign(),
11754 BaseSI->getPointerAddressSpace(),CostKind, OpInfo);
11755 }
11756 }
11757return VecStCost + CommonCost;
11758 };
11759SmallVector<Value *> PointerOps(VL.size());
11760for (auto [I, V] :enumerate(VL)) {
11761unsignedIdx = IsReorder ? E->ReorderIndices[I] :I;
11762 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
11763 }
11764
11765return GetCostDiff(GetScalarCost, GetVectorCost) +
11766 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11767 }
11768case Instruction::Call: {
11769auto GetScalarCost = [&](unsignedIdx) {
11770auto *CI = cast<CallInst>(UniqueValues[Idx]);
11771Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
11772if (ID !=Intrinsic::not_intrinsic) {
11773IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
11774returnTTI->getIntrinsicInstrCost(CostAttrs,CostKind);
11775 }
11776returnTTI->getCallInstrCost(CI->getCalledFunction(),
11777 CI->getFunctionType()->getReturnType(),
11778 CI->getFunctionType()->params(),CostKind);
11779 };
11780auto GetVectorCost = [=](InstructionCost CommonCost) {
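// Cost the call as the cheaper of a widened vector intrinsic and a vector
// library call.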
11781auto *CI = cast<CallInst>(VL0);
11782Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
11783SmallVector<Type *> ArgTys =buildIntrinsicArgTypes(
11784 CI,ID, VecTy->getNumElements(),
11785 It != MinBWs.end() ? It->second.first : 0,TTI);
11786auto VecCallCosts =getVectorCallCosts(CI, VecTy,TTI, TLI, ArgTys);
11787return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11788 };
11789return GetCostDiff(GetScalarCost, GetVectorCost);
11790 }
11791case Instruction::ShuffleVector: {
11792if (!SLPReVec || E->isAltShuffle())
11793assert(E->isAltShuffle() &&
11794 ((Instruction::isBinaryOp(E->getOpcode()) &&
11795Instruction::isBinaryOp(E->getAltOpcode())) ||
11796 (Instruction::isCast(E->getOpcode()) &&
11797Instruction::isCast(E->getAltOpcode())) ||
11798 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11799"Invalid Shuffle Vector Operand");
11800// Try to find the previous shuffle node with the same operands and same
11801// main/alternate ops.
11802auto TryFindNodeWithEqualOperands = [=]() {
11803for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11804if (TE.get() == E)
11805break;
11806if (TE->hasState() &&TE->isAltShuffle() &&
11807 ((TE->getOpcode() == E->getOpcode() &&
11808TE->getAltOpcode() == E->getAltOpcode()) ||
11809 (TE->getOpcode() == E->getAltOpcode() &&
11810TE->getAltOpcode() == E->getOpcode())) &&
11811TE->hasEqualOperands(*E))
11812returntrue;
11813 }
11814returnfalse;
11815 };
11816auto GetScalarCost = [&](unsignedIdx) {
11817if (isa<PoisonValue>(UniqueValues[Idx]))
11818returnInstructionCost(TTI::TCC_Free);
11819
11820auto *VI = cast<Instruction>(UniqueValues[Idx]);
11821assert(E->isOpcodeOrAlt(VI) &&"Unexpected main/alternate opcode");
11822 (void)E;
11823returnTTI->getInstructionCost(VI,CostKind);
11824 };
11825// Need to clear CommonCost since the final shuffle cost is included into
11826// vector cost.
11827auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
11828// VecCost is equal to sum of the cost of creating 2 vectors
11829// and the cost of creating shuffle.
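// For example, scalars [a0+b0, a1-b1, a2+b2, a3-b3] are costed as one vector
// add, one vector sub and a two-source blend with mask <0,5,2,7>.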
11830InstructionCost VecCost = 0;
11831if (TryFindNodeWithEqualOperands()) {
11832LLVM_DEBUG({
11833dbgs() <<"SLP: diamond match for alternate node found.\n";
11834 E->dump();
11835 });
11836// No need to add new vector costs here since we're going to reuse
11837// same main/alternate vector ops, just do different shuffling.
11838 }elseif (Instruction::isBinaryOp(E->getOpcode())) {
11839 VecCost =
11840 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy,CostKind);
11841 VecCost +=
11842 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy,CostKind);
11843 }elseif (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11844auto *MaskTy =getWidenedType(Builder.getInt1Ty(), VL.size());
11845 VecCost = TTIRef.getCmpSelInstrCost(
11846 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(),CostKind,
11847 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11848 VL0);
11849 VecCost += TTIRef.getCmpSelInstrCost(
11850 E->getOpcode(), VecTy, MaskTy,
11851 cast<CmpInst>(E->getAltOp())->getPredicate(),CostKind,
11852 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11853 E->getAltOp());
11854 }else {
11855Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11856auto *SrcTy =getWidenedType(SrcSclTy, VL.size());
11857if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
11858auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11859unsigned BWSz =DL->getTypeSizeInBits(ScalarTy);
11860unsigned SrcBWSz =
11861DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11862if (SrcIt != MinBWs.end()) {
11863 SrcBWSz = SrcIt->second.first;
11864 SrcSclTy =IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
11865 SrcTy =getWidenedType(SrcSclTy, VL.size());
11866 }
11867if (BWSz <= SrcBWSz) {
11868if (BWSz < SrcBWSz)
11869 VecCost =
11870 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11871TTI::CastContextHint::None,CostKind);
11872LLVM_DEBUG({
11873dbgs()
11874 <<"SLP: alternate extension, which should be truncated.\n";
11875 E->dump();
11876 });
11877return VecCost;
11878 }
11879 }
11880 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11881TTI::CastContextHint::None,CostKind);
11882 VecCost +=
11883 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11884TTI::CastContextHint::None,CostKind);
11885 }
11886SmallVector<int>Mask;
11887 E->buildAltOpShuffleMask(
11888 [&](Instruction *I) {
11889assert(E->isOpcodeOrAlt(I) &&"Unexpected main/alternate opcode");
11890returnisAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
11891 *TLI);
11892 },
11893Mask);
11894 VecCost +=::getShuffleCost(TTIRef,TargetTransformInfo::SK_PermuteTwoSrc,
11895 FinalVecTy, Mask,CostKind);
11896// Patterns like [fadd,fsub] can be combined into a single instruction
11897// in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
11898// need to take into account their order when looking for the most used
11899// order.
11900unsigned Opcode0 = E->getOpcode();
11901unsigned Opcode1 = E->getAltOpcode();
11902SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
11903// If this pattern is supported by the target then we consider the
11904// order.
11905if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11906InstructionCost AltVecCost = TTIRef.getAltInstrCost(
11907 VecTy, Opcode0, Opcode1, OpcodeMask,CostKind);
11908return AltVecCost < VecCost ? AltVecCost : VecCost;
11909 }
11910// TODO: Check the reverse order too.
11911return VecCost;
11912 };
11913if (SLPReVec && !E->isAltShuffle())
11914return GetCostDiff(
11915 GetScalarCost, [&](InstructionCost) ->InstructionCost {
11916// If the shufflevectors in a group extract subvectors in order, they can be
11917// eliminated by instcombine, so the cost is 0.
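// For example, if each scalar extracts a <4 x i32> subvector from an
// <8 x i32> source, GroupSize is 2 and the two shuffles of a group must
// extract at indices 0 and 4 for the group to be free.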
11918assert(isa<ShuffleVectorInst>(VL.front()) &&
11919"Not supported shufflevector usage.");
11920auto *SV = cast<ShuffleVectorInst>(VL.front());
11921unsigned SVNumElements =
11922 cast<FixedVectorType>(SV->getOperand(0)->getType())
11923 ->getNumElements();
11924unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11925for (size_tI = 0,End = VL.size();I !=End;I += GroupSize) {
11926ArrayRef<Value *> Group = VL.slice(I, GroupSize);
11927int NextIndex = 0;
11928if (!all_of(Group, [&](Value *V) {
11929assert(isa<ShuffleVectorInst>(V) &&
11930"Not supported shufflevector usage.");
11931auto *SV = cast<ShuffleVectorInst>(V);
11932intIndex;
11933 [[maybe_unused]]bool IsExtractSubvectorMask =
11934 SV->isExtractSubvectorMask(Index);
11935assert(IsExtractSubvectorMask &&
11936"Not supported shufflevector usage.");
11937if (NextIndex != Index)
11938returnfalse;
11939 NextIndex += SV->getShuffleMask().size();
11940returntrue;
11941 }))
11942 return ::getShuffleCost(
11943 *TTI,TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
11944calculateShufflevectorMask(E->Scalars));
11945 }
11946returnTTI::TCC_Free;
11947 });
11948return GetCostDiff(GetScalarCost, GetVectorCost);
11949 }
11950case Instruction::Freeze:
11951return CommonCost;
11952default:
11953llvm_unreachable("Unknown instruction");
11954 }
11955}
11956
11957bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const{
11958LLVM_DEBUG(dbgs() <<"SLP: Check whether the tree with height "
11959 << VectorizableTree.size() << " is fully vectorizable.\n");
11960
11961auto &&AreVectorizableGathers = [this](const TreeEntry *TE,unsigned Limit) {
11962SmallVector<int>Mask;
11963returnTE->isGather() &&
11964 !any_of(TE->Scalars,
11965 [this](Value *V) { return EphValues.contains(V); }) &&
11966 (allConstant(TE->Scalars) ||isSplat(TE->Scalars) ||
11967TE->Scalars.size() < Limit ||
11968 (((TE->hasState() &&
11969TE->getOpcode() == Instruction::ExtractElement) ||
11970all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11971isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
11972 (TE->hasState() &&TE->getOpcode() == Instruction::Load &&
11973 !TE->isAltShuffle()) ||
11974any_of(TE->Scalars, IsaPred<LoadInst>));
11975 };
11976
11977// We only handle trees of heights 1 and 2.
11978if (VectorizableTree.size() == 1 &&
11979 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11980 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11981 (ForReduction &&
11982 AreVectorizableGathers(VectorizableTree[0].get(),
11983 VectorizableTree[0]->Scalars.size()) &&
11984 VectorizableTree[0]->getVectorFactor() > 2)))
11985returntrue;
11986
11987if (VectorizableTree.size() != 2)
11988returnfalse;
11989
11990// Handle splat and all-constants stores. Also try to vectorize tiny trees
11991// with a second gather node if it has fewer scalar operands than the initial
11992// tree element (it may be profitable to shuffle the second gather), or if its
11993// scalars are extractelements, which form a shuffle.
11994SmallVector<int>Mask;
11995if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11996 AreVectorizableGathers(VectorizableTree[1].get(),
11997 VectorizableTree[0]->Scalars.size()))
11998returntrue;
11999
12000// Gathering cost would be too much for tiny trees.
12001if (VectorizableTree[0]->isGather() ||
12002 (VectorizableTree[1]->isGather() &&
12003 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
12004 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
12005returnfalse;
12006
12007returntrue;
12008}
12009
12010staticboolisLoadCombineCandidateImpl(Value *Root,unsigned NumElts,
12011TargetTransformInfo *TTI,
12012bool MustMatchOrInst) {
12013// Look past the root to find a source value. Arbitrarily follow the
12014// path through operand 0 of any 'or'. Also, peek through optional
12015// shift-left-by-multiple-of-8-bits.
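// A typical candidate looks like
// (zext(load i8) << 24) | (zext(load i8) << 16) | (zext(load i8) << 8) | zext(load i8),
// which the backend is expected to fold into a single wide load.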
12016Value *ZextLoad = Root;
12017constAPInt *ShAmtC;
12018bool FoundOr =false;
12019while (!isa<ConstantExpr>(ZextLoad) &&
12020 (match(ZextLoad,m_Or(m_Value(),m_Value())) ||
12021 (match(ZextLoad,m_Shl(m_Value(),m_APInt(ShAmtC))) &&
12022 ShAmtC->urem(8) == 0))) {
12023auto *BinOp = cast<BinaryOperator>(ZextLoad);
12024 ZextLoad = BinOp->getOperand(0);
12025if (BinOp->getOpcode() == Instruction::Or)
12026 FoundOr =true;
12027 }
12028// Check if the input is an extended load of the required or/shift expression.
12029Value *Load;
12030if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
12031 !match(ZextLoad,m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
12032returnfalse;
12033
12034// Require that the total load bit width is a legal integer type.
12035// For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
12036// But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
12037Type *SrcTy = Load->getType();
12038unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
12039if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
12040returnfalse;
12041
12042// Everything matched - assume that we can fold the whole sequence using
12043// load combining.
12044LLVM_DEBUG(dbgs() <<"SLP: Assume load combining for tree starting at "
12045 << *(cast<Instruction>(Root)) <<"\n");
12046
12047returntrue;
12048}
12049
12050boolBoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const{
12051if (RdxKind !=RecurKind::Or)
12052returnfalse;
12053
12054unsigned NumElts = VectorizableTree[0]->Scalars.size();
12055Value *FirstReduced = VectorizableTree[0]->Scalars[0];
12056returnisLoadCombineCandidateImpl(FirstReduced, NumElts,TTI,
12057/* MatchOr */false);
12058}
12059
12060boolBoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const{
12061// Peek through a final sequence of stores and check if all operations are
12062// likely to be load-combined.
12063unsigned NumElts = Stores.size();
12064for (Value *Scalar : Stores) {
12065Value *X;
12066if (!match(Scalar,m_Store(m_Value(X),m_Value())) ||
12067 !isLoadCombineCandidateImpl(X, NumElts,TTI,/* MatchOr */true))
12068returnfalse;
12069 }
12070returntrue;
12071}
12072
12073boolBoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const{
12074if (!DebugCounter::shouldExecute(VectorizedGraphs))
12075returntrue;
12076
12077// Graph is empty - do nothing.
12078if (VectorizableTree.empty()) {
12079assert(ExternalUses.empty() &&"We shouldn't have any external users");
12080
12081returntrue;
12082 }
12083
12084// No need to vectorize inserts of gathered values.
12085if (VectorizableTree.size() == 2 &&
12086 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
12087 VectorizableTree[1]->isGather() &&
12088 (VectorizableTree[1]->getVectorFactor() <= 2 ||
12089 !(isSplat(VectorizableTree[1]->Scalars) ||
12090allConstant(VectorizableTree[1]->Scalars))))
12091returntrue;
12092
12093// If the graph includes only PHI nodes and gathers, it is definitely not
12094// profitable for vectorization and we can skip it, provided the cost
12095// threshold is the default. The cost of vectorized PHI nodes is almost
12096// always 0 plus the cost of gathers/buildvectors.
12097constexprint Limit = 4;
12098if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
12099 !VectorizableTree.empty() &&
12100all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12101return (TE->isGather() &&
12102 (!TE->hasState() ||
12103 TE->getOpcode() != Instruction::ExtractElement) &&
12104count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
12105 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
12106 }))
12107returntrue;
12108
12109// We can vectorize the tree if its size is greater than or equal to the
12110// minimum size specified by the MinTreeSize command line option.
12111if (VectorizableTree.size() >=MinTreeSize)
12112returnfalse;
12113
12114// If we have a tiny tree (a tree whose size is less than MinTreeSize), we
12115// can vectorize it if we can prove it fully vectorizable.
12116if (isFullyVectorizableTinyTree(ForReduction))
12117returnfalse;
12118
12119// Check if any of the gather nodes forms an insertelement buildvector
12120// somewhere.
12121bool IsAllowedSingleBVNode =
12122 VectorizableTree.size() > 1 ||
12123 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
12124 !VectorizableTree.front()->isAltShuffle() &&
12125 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12126 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12127allSameBlock(VectorizableTree.front()->Scalars));
12128if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12129return TE->isGather() &&all_of(TE->Scalars, [&](Value *V) {
12130 return isa<ExtractElementInst, UndefValue>(V) ||
12131 (IsAllowedSingleBVNode &&
12132 !V->hasNUsesOrMore(UsesLimit) &&
12133 any_of(V->users(), IsaPred<InsertElementInst>));
12134 });
12135 }))
12136returnfalse;
12137
12138if (VectorizableTree.back()->isGather() &&
12139 VectorizableTree.back()->hasState() &&
12140 VectorizableTree.back()->isAltShuffle() &&
12141 VectorizableTree.back()->getVectorFactor() > 2 &&
12142allSameBlock(VectorizableTree.back()->Scalars) &&
12143 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12144TTI->getScalarizationOverhead(
12145getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12146 VectorizableTree.back()->getVectorFactor()),
12147APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
12148/*Insert=*/true,/*Extract=*/false,
12149TTI::TCK_RecipThroughput) > -SLPCostThreshold)
12150returnfalse;
12151
12152// Otherwise, we can't vectorize the tree. It is both tiny and not fully
12153// vectorizable.
12154returntrue;
12155}
12156
12157boolBoUpSLP::isTreeNotExtendable() const{
12158if (getCanonicalGraphSize() !=getTreeSize()) {
12159constexprunsigned SmallTree = 3;
12160if (VectorizableTree.front()->isNonPowOf2Vec() &&
12161getCanonicalGraphSize() <= SmallTree &&
12162count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
12163 [](const std::unique_ptr<TreeEntry> &TE) {
12164return TE->isGather() && TE->hasState() &&
12165 TE->getOpcode() == Instruction::Load &&
12166 !allSameBlock(TE->Scalars);
12167 }) == 1)
12168returntrue;
12169returnfalse;
12170 }
12171bool Res =false;
12172for (unsignedIdx : seq<unsigned>(getTreeSize())) {
12173 TreeEntry &E = *VectorizableTree[Idx];
12174if (!E.isGather())
12175continue;
12176if (E.hasState() && E.getOpcode() != Instruction::Load)
12177returnfalse;
12178if (isSplat(E.Scalars) ||allConstant(E.Scalars))
12179continue;
12180 Res =true;
12181 }
12182return Res;
12183}
12184
12185InstructionCostBoUpSLP::getSpillCost() const{
12186// Walk from the bottom of the tree to the top, tracking which values are
12187// live. When we see a call instruction that is not part of our tree,
12188// query TTI to see if there is a cost to keeping values live over it
12189// (for example, if spills and fills are required).
12190unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12191InstructionCostCost = 0;
12192
12193SmallPtrSet<Instruction *, 4> LiveValues;
12194Instruction *PrevInst =nullptr;
12195
12196// The entries in VectorizableTree are not necessarily ordered by their
12197// position in basic blocks. Collect them and order them by dominance so later
12198// instructions are guaranteed to be visited first. For instructions in
12199// different basic blocks, we only scan to the beginning of the block, so
12200// their order does not matter, as long as all instructions in a basic block
12201// are grouped together. Using dominance ensures a deterministic order.
12202SmallVector<Instruction *, 16> OrderedScalars;
12203for (constauto &TEPtr : VectorizableTree) {
12204if (TEPtr->State != TreeEntry::Vectorize)
12205continue;
12206Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12207if (!Inst)
12208continue;
12209 OrderedScalars.push_back(Inst);
12210 }
12211llvm::sort(OrderedScalars, [&](Instruction *A,Instruction *B) {
12212auto *NodeA = DT->getNode(A->getParent());
12213auto *NodeB = DT->getNode(B->getParent());
12214assert(NodeA &&"Should only process reachable instructions");
12215assert(NodeB &&"Should only process reachable instructions");
12216assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12217"Different nodes should have different DFS numbers");
12218if (NodeA != NodeB)
12219return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12220returnB->comesBefore(A);
12221 });
12222
12223for (Instruction *Inst : OrderedScalars) {
12224if (!PrevInst) {
12225 PrevInst = Inst;
12226continue;
12227 }
12228
12229// Update LiveValues.
12230 LiveValues.erase(PrevInst);
12231for (auto &J : PrevInst->operands()) {
12232if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12233 LiveValues.insert(cast<Instruction>(&*J));
12234 }
12235
12236LLVM_DEBUG({
12237dbgs() <<"SLP: #LV: " << LiveValues.size();
12238for (auto *X : LiveValues)
12239dbgs() <<" " <<X->getName();
12240dbgs() <<", Looking at ";
12241 Inst->dump();
12242 });
12243
12244// Now find the sequence of instructions between PrevInst and Inst.
12245unsigned NumCalls = 0;
12246BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12247 PrevInstIt =
12248 PrevInst->getIterator().getReverse();
12249while (InstIt != PrevInstIt) {
12250if (PrevInstIt == PrevInst->getParent()->rend()) {
12251 PrevInstIt = Inst->getParent()->rbegin();
12252continue;
12253 }
12254
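// Assume-like intrinsics, and intrinsics the target lowers more cheaply than
// a real call, are treated as non-calls here: they are not expected to force
// live values to be spilled.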
12255auto NoCallIntrinsic = [this](Instruction *I) {
12256if (auto *II = dyn_cast<IntrinsicInst>(I)) {
12257if (II->isAssumeLikeIntrinsic())
12258returntrue;
12259IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
12260InstructionCost IntrCost =
12261TTI->getIntrinsicInstrCost(ICA,TTI::TCK_RecipThroughput);
12262InstructionCost CallCost =
12263TTI->getCallInstrCost(nullptr,II->getType(), ICA.getArgTypes(),
12264TTI::TCK_RecipThroughput);
12265if (IntrCost < CallCost)
12266returntrue;
12267 }
12268returnfalse;
12269 };
12270
12271// Debug information does not impact spill cost.
12272if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12273 &*PrevInstIt != PrevInst)
12274 NumCalls++;
12275
12276 ++PrevInstIt;
12277 }
12278
12279if (NumCalls) {
12280SmallVector<Type *, 4> V;
12281for (auto *II : LiveValues) {
12282auto *ScalarTy =II->getType();
12283if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12284 ScalarTy = VectorTy->getElementType();
12285 V.push_back(getWidenedType(ScalarTy, BundleWidth));
12286 }
12287Cost += NumCalls *TTI->getCostOfKeepingLiveOverCall(V);
12288 }
12289
12290 PrevInst = Inst;
12291 }
12292
12293returnCost;
12294}
12295
12296/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
12297/// the buildvector sequence.
12298staticboolisFirstInsertElement(constInsertElementInst *IE1,
12299constInsertElementInst *IE2) {
12300if (IE1 == IE2)
12301returnfalse;
12302constauto *I1 = IE1;
12303constauto *I2 = IE2;
12304constInsertElementInst *PrevI1;
12305constInsertElementInst *PrevI2;
12306unsigned Idx1 = *getElementIndex(IE1);
12307unsigned Idx2 = *getElementIndex(IE2);
12308do {
12309if (I2 == IE1)
12310returntrue;
12311if (I1 == IE2)
12312returnfalse;
12313 PrevI1 = I1;
12314 PrevI2 = I2;
12315if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12316getElementIndex(I1).value_or(Idx2) != Idx2)
12317 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12318if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
12319getElementIndex(I2).value_or(Idx1) != Idx1)
12320 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12321 }while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12322llvm_unreachable("Two different buildvectors not expected.");
12323}
12324
12325namespace{
12326/// Returns the incoming Value * if the requested type is Value * too, or a
12327/// default-constructed value otherwise.
12328structValueSelect {
12329template <typename U>
12330static std::enable_if_t<std::is_same_v<Value *, U>,Value *>get(Value *V) {
12331returnV;
12332 }
12333template <typename U>
12334static std::enable_if_t<!std::is_same_v<Value *, U>,U>get(Value *) {
12335returnU();
12336 }
12337};
12338}// namespace
12339
12340/// Does the analysis of the provided shuffle masks and performs the requested
12341/// actions on the vectors with the given shuffle masks. It tries to do it in
12342/// several steps.
12343/// 1. If the Base vector is not an undef vector, resize the very first mask to
12344/// have a common VF and perform the action for 2 input vectors (including the
12345/// non-undef Base). Other shuffle masks are combined with the mask resulting
12346/// from the first stage and processed as a shuffle of 2 elements.
12347/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
12348/// the action only for 1 vector with the given mask, if it is not the identity
12349/// mask.
12350/// 3. If 2 or more masks are used, perform the remaining shuffle actions for 2
12351/// vectors, combining the masks properly between the steps.
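/// For example, with an undef Base and two masks <0,1,poison,poison> and
/// <poison,poison,2,3> over two 4-element vectors, step 3 produces a single
/// two-source shuffle with mask <0,1,6,7>.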
12352template <typename T>
12353staticT *performExtractsShuffleAction(
12354MutableArrayRef<std::pair<T *,SmallVector<int>>> ShuffleMask,Value *Base,
12355function_ref<unsigned(T *)> GetVF,
12356function_ref<std::pair<T *, bool>(T *,ArrayRef<int>,bool)> ResizeAction,
12357function_ref<T *(ArrayRef<int>,ArrayRef<T *>)> Action) {
12358assert(!ShuffleMask.empty() &&"Empty list of shuffles for inserts.");
12359SmallVector<int> Mask(ShuffleMask.begin()->second);
12360auto VMIt = std::next(ShuffleMask.begin());
12361T *Prev =nullptr;
12362SmallBitVector UseMask =
12363buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12364SmallBitVector IsBaseUndef =isUndefVector(Base, UseMask);
12365if (!IsBaseUndef.all()) {
12366// Base is not undef, need to combine it with the next subvectors.
12367 std::pair<T *, bool> Res =
12368 ResizeAction(ShuffleMask.begin()->first, Mask,/*ForSingleMask=*/false);
12369SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
12370for (unsignedIdx = 0, VF = Mask.size();Idx < VF; ++Idx) {
12371if (Mask[Idx] ==PoisonMaskElem)
12372 Mask[Idx] = IsBasePoison.test(Idx) ?PoisonMaskElem :Idx;
12373else
12374 Mask[Idx] = (Res.second ?Idx : Mask[Idx]) + VF;
12375 }
12376 [[maybe_unused]]auto *V = ValueSelect::get<T *>(Base);
12377assert((!V || GetVF(V) == Mask.size()) &&
12378"Expected base vector of VF number of elements.");
12379 Prev = Action(Mask, {nullptr, Res.first});
12380 }elseif (ShuffleMask.size() == 1) {
12381// Base is undef and only 1 vector is shuffled - perform the action only
12382// for a single vector, if the mask is not the identity mask.
12383 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12384/*ForSingleMask=*/true);
12385if (Res.second)
12386// Identity mask is found.
12387 Prev = Res.first;
12388else
12389 Prev = Action(Mask, {ShuffleMask.begin()->first});
12390 }else {
12391// Base is undef and at least 2 input vectors shuffled - perform 2 vectors
12392// shuffles step by step, combining shuffle between the steps.
12393unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12394unsigned Vec2VF = GetVF(VMIt->first);
12395if (Vec1VF == Vec2VF) {
12396// No need to resize the input vectors since they are of the same size, we
12397// can shuffle them directly.
12398ArrayRef<int> SecMask = VMIt->second;
12399for (unsignedI = 0, VF = Mask.size();I < VF; ++I) {
12400if (SecMask[I] !=PoisonMaskElem) {
12401assert(Mask[I] ==PoisonMaskElem &&"Multiple uses of scalars.");
12402 Mask[I] = SecMask[I] + Vec1VF;
12403 }
12404 }
12405 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12406 }else {
12407// Vectors of different sizes - resize and reshuffle.
12408 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12409/*ForSingleMask=*/false);
12410 std::pair<T *, bool> Res2 =
12411 ResizeAction(VMIt->first, VMIt->second,/*ForSingleMask=*/false);
12412ArrayRef<int> SecMask = VMIt->second;
12413for (unsignedI = 0, VF = Mask.size();I < VF; ++I) {
12414if (Mask[I] !=PoisonMaskElem) {
12415assert(SecMask[I] ==PoisonMaskElem &&"Multiple uses of scalars.");
12416if (Res1.second)
12417 Mask[I] =I;
12418 }elseif (SecMask[I] !=PoisonMaskElem) {
12419assert(Mask[I] ==PoisonMaskElem &&"Multiple uses of scalars.");
12420 Mask[I] = (Res2.second ?I : SecMask[I]) + VF;
12421 }
12422 }
12423 Prev = Action(Mask, {Res1.first, Res2.first});
12424 }
12425 VMIt = std::next(VMIt);
12426 }
12427 [[maybe_unused]]bool IsBaseNotUndef = !IsBaseUndef.all();
12428// Perform requested actions for the remaining masks/vectors.
12429for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12430// Shuffle other input vectors, if any.
12431 std::pair<T *, bool> Res =
12432 ResizeAction(VMIt->first, VMIt->second,/*ForSingleMask=*/false);
12433ArrayRef<int> SecMask = VMIt->second;
12434for (unsignedI = 0, VF = Mask.size();I < VF; ++I) {
12435if (SecMask[I] !=PoisonMaskElem) {
12436assert((Mask[I] ==PoisonMaskElem || IsBaseNotUndef) &&
12437"Multiple uses of scalars.");
12438 Mask[I] = (Res.second ?I : SecMask[I]) + VF;
12439 }elseif (Mask[I] !=PoisonMaskElem) {
12440 Mask[I] =I;
12441 }
12442 }
12443 Prev = Action(Mask, {Prev, Res.first});
12444 }
12445return Prev;
12446}
12447
12448namespace{
12449/// Data type for handling buildvector sequences with the reused scalars from
12450/// other tree entries.
12451template <typename T>structShuffledInsertData {
12452 /// List of insertelements to be replaced by shuffles.
12453SmallVector<InsertElementInst *> InsertElements;
12454 /// The parent vectors and shuffle mask for the given list of inserts.
12455MapVector<T, SmallVector<int>> ValueMasks;
12456};
12457}// namespace
12458
12459InstructionCostBoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
12460InstructionCostCost = 0;
12461LLVM_DEBUG(dbgs() <<"SLP: Calculating cost for tree of size "
12462 << VectorizableTree.size() <<".\n");
12463
12464unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12465
12466SmallPtrSet<Value *, 4> CheckedExtracts;
12467for (unsignedI = 0, E = VectorizableTree.size();I < E; ++I) {
12468 TreeEntry &TE = *VectorizableTree[I];
12469// No need to count the cost for combined entries, they are combined and
12470// just skip their cost.
12471if (TE.State == TreeEntry::CombinedVectorize) {
12472LLVM_DEBUG(
12473dbgs() <<"SLP: Skipping cost for combined node that starts with "
12474 << *TE.Scalars[0] <<".\n";
12475 TE.dump();dbgs() <<"SLP: Current total cost = " <<Cost <<"\n");
12476continue;
12477 }
12478if (TE.isGather() && TE.hasState()) {
12479if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
12480 E && E->getVectorFactor() == TE.getVectorFactor() &&
12481 E->isSame(TE.Scalars)) {
12482// Some gather nodes might be exactly the same as some vectorizable nodes
12483// after reordering; handle that case here.
12484LLVM_DEBUG(dbgs() <<"SLP: Adding cost 0 for bundle "
12485 <<shortBundleName(TE.Scalars, TE.Idx) <<".\n"
12486 <<"SLP: Current total cost = " <<Cost <<"\n");
12487continue;
12488 }
12489 }
12490
12491// Exclude the cost of gathered-loads nodes which are not used. These nodes
12492// were built as part of the final attempt to vectorize gathered loads.
12493assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12494"Expected gather nodes with users only.");
12495
12496InstructionCostC = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
12497Cost +=C;
12498LLVM_DEBUG(dbgs() <<"SLP: Adding cost " <<C <<" for bundle "
12499 <<shortBundleName(TE.Scalars, TE.Idx) <<".\n"
12500 <<"SLP: Current total cost = " <<Cost <<"\n");
12501 }
12502
12503SmallPtrSet<Value *, 16> ExtractCostCalculated;
12504InstructionCost ExtractCost = 0;
12505SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
12506SmallVector<APInt> DemandedElts;
12507SmallDenseSet<Value *, 4> UsedInserts;
12508DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
12509 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12510DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
12511SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
12512// Keep track {Scalar, Index, User} tuple.
12513// On AArch64, this helps in fusing a mov instruction, associated with
12514// extractelement, with fmul in the backend so that extractelement is free.
12515SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
12516for (ExternalUser &EU : ExternalUses) {
12517 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
12518 }
12519for (ExternalUser &EU : ExternalUses) {
12520// Uses by ephemeral values are free (because the ephemeral value will be
12521// removed prior to code generation, and so the extraction will be
12522// removed as well).
12523if (EphValues.count(EU.User))
12524continue;
12525
12526// Skip uses in unreachable blocks or in EH pads (rarely executed), or in
12527// blocks terminated by an unreachable instruction.
12528if (BasicBlock *UserParent =
12529 EU.User ? cast<Instruction>(EU.User)->getParent() :nullptr;
12530 UserParent &&
12531 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
12532 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12533continue;
12534
12535// We only add extract cost once for the same scalar.
12536if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12537 !ExtractCostCalculated.insert(EU.Scalar).second)
12538continue;
12539
12540// No extract cost for vector "scalar"
12541if (isa<FixedVectorType>(EU.Scalar->getType()))
12542continue;
12543
12544// If found user is an insertelement, do not calculate extract cost but try
12545// to detect it as a final shuffled/identity match.
12546if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12547 VU && VU->getOperand(1) == EU.Scalar) {
12548if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
12549if (!UsedInserts.insert(VU).second)
12550continue;
12551 std::optional<unsigned> InsertIdx =getElementIndex(VU);
12552if (InsertIdx) {
12553const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12554auto *It =find_if(
12555 ShuffledInserts,
12556 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
12557// Checks if 2 insertelements are from the same buildvector.
12558InsertElementInst *VecInsert =Data.InsertElements.front();
12559returnareTwoInsertFromSameBuildVector(
12560 VU, VecInsert, [this](InsertElementInst *II) ->Value * {
12561Value *Op0 =II->getOperand(0);
12562if (getTreeEntry(II) && !getTreeEntry(Op0))
12563returnnullptr;
12564return Op0;
12565 });
12566 });
12567int VecId = -1;
12568if (It == ShuffledInserts.end()) {
12569auto &Data = ShuffledInserts.emplace_back();
12570Data.InsertElements.emplace_back(VU);
12571 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
12572 VecId = ShuffledInserts.size() - 1;
12573auto It = MinBWs.find(ScalarTE);
12574if (It != MinBWs.end() &&
12575 VectorCasts
12576 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
12577 .second) {
12578unsigned BWSz = It->second.first;
12579unsigned DstBWSz =DL->getTypeSizeInBits(FTy->getElementType());
12580unsigned VecOpcode;
12581if (DstBWSz < BWSz)
12582 VecOpcode = Instruction::Trunc;
12583else
12584 VecOpcode =
12585 It->second.second ? Instruction::SExt : Instruction::ZExt;
12586TTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
12587InstructionCostC =TTI->getCastInstrCost(
12588 VecOpcode, FTy,
12589getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
12590 FTy->getNumElements()),
12591TTI::CastContextHint::None,CostKind);
12592LLVM_DEBUG(dbgs() <<"SLP: Adding cost " <<C
12593 <<" for extending externally used vector with "
12594"non-equal minimum bitwidth.\n");
12595Cost +=C;
12596 }
12597 }else {
12598if (isFirstInsertElement(VU, It->InsertElements.front()))
12599 It->InsertElements.front() = VU;
12600 VecId = std::distance(ShuffledInserts.begin(), It);
12601 }
12602int InIdx = *InsertIdx;
12603SmallVectorImpl<int> &Mask =
12604 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12605if (Mask.empty())
12606 Mask.assign(FTy->getNumElements(),PoisonMaskElem);
12607 Mask[InIdx] = EU.Lane;
12608 DemandedElts[VecId].setBit(InIdx);
12609continue;
12610 }
12611 }
12612 }
12613
12614TTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
12615// If we plan to rewrite the tree in a smaller type, we will need to sign
12616// extend the extracted value back to the original type. Here, we account
12617// for the extract and the added cost of the sign extend if needed.
12618InstructionCost ExtraCost =TTI::TCC_Free;
12619auto *VecTy =getWidenedType(EU.Scalar->getType(), BundleWidth);
12620const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12621auto It = MinBWs.find(Entry);
12622if (It != MinBWs.end()) {
12623auto *MinTy =IntegerType::get(F->getContext(), It->second.first);
12624unsigned Extend =isKnownNonNegative(EU.Scalar,SimplifyQuery(*DL))
12625 ? Instruction::ZExt
12626 : Instruction::SExt;
12627 VecTy =getWidenedType(MinTy, BundleWidth);
12628 ExtraCost =TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
12629 VecTy, EU.Lane);
12630 }else {
12631 ExtraCost =
12632TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,CostKind,
12633 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12634 }
12635// Leave the scalar instructions as is if they are cheaper than extracts.
12636if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12637 Entry->getOpcode() == Instruction::Load) {
12638// Checks if the user of the external scalar is phi in loop body.
12639auto IsPhiInLoop = [&](const ExternalUser &U) {
12640if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12641auto *I = cast<Instruction>(U.Scalar);
12642constLoop *L = LI->getLoopFor(Phi->getParent());
12643return L && (Phi->getParent() ==I->getParent() ||
12644 L == LI->getLoopFor(I->getParent()));
12645 }
12646returnfalse;
12647 };
12648if (!ValueToExtUses) {
12649 ValueToExtUses.emplace();
12650for_each(enumerate(ExternalUses), [&](constauto &P) {
12651// Ignore phis in loops.
12652if (IsPhiInLoop(P.value()))
12653return;
12654
12655 ValueToExtUses->try_emplace(P.value().Scalar,P.index());
12656 });
12657 }
12658// Can use the original instruction if no operands are vectorized or they
12659// are already marked as externally used.
12660auto *Inst = cast<Instruction>(EU.Scalar);
12661InstructionCost ScalarCost =TTI->getInstructionCost(Inst,CostKind);
12662auto OperandIsScalar = [&](Value *V) {
12663if (!getTreeEntry(V)) {
12664// Some extractelements might not be vectorized, but instead
12665// transformed into a shuffle and removed from the function;
12666// consider them here.
12667if (auto *EE = dyn_cast<ExtractElementInst>(V))
12668return !EE->hasOneUse() || !MustGather.contains(EE);
12669returntrue;
12670 }
12671return ValueToExtUses->contains(V);
12672 };
12673bool CanBeUsedAsScalar =all_of(Inst->operands(), OperandIsScalar);
12674bool CanBeUsedAsScalarCast =false;
12675if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12676if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
12677Op &&all_of(Op->operands(), OperandIsScalar)) {
12678InstructionCost OpCost =
12679 (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
12680 ?TTI->getInstructionCost(Op,CostKind)
12681 : 0;
12682if (ScalarCost + OpCost <= ExtraCost) {
12683 CanBeUsedAsScalar = CanBeUsedAsScalarCast =true;
12684 ScalarCost += OpCost;
12685 }
12686 }
12687 }
12688if (CanBeUsedAsScalar) {
12689bool KeepScalar = ScalarCost <= ExtraCost;
12690// Try to keep the original scalar if the user is a phi node from the same
12691// block as the root phis currently being vectorized. This preserves better
12692// ordering info for the PHIs being vectorized.
12693bool IsProfitablePHIUser =
12694 (KeepScalar || (ScalarCost - ExtraCost <=TTI::TCC_Basic &&
12695 VectorizableTree.front()->Scalars.size() > 2)) &&
12696 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12697 !Inst->hasNUsesOrMore(UsesLimit) &&
12698none_of(Inst->users(),
12699 [&](User *U) {
12700 auto *PHIUser = dyn_cast<PHINode>(U);
12701 return (!PHIUser ||
12702 PHIUser->getParent() !=
12703 cast<Instruction>(
12704 VectorizableTree.front()->getMainOp())
12705 ->getParent()) &&
12706 !getTreeEntry(U);
12707 }) &&
12708count_if(Entry->Scalars, [&](Value *V) {
12709 return ValueToExtUses->contains(V);
12710 }) <= 2;
12711if (IsProfitablePHIUser) {
12712 KeepScalar =true;
12713 }elseif (KeepScalar && ScalarCost !=TTI::TCC_Free &&
12714 ExtraCost - ScalarCost <=TTI::TCC_Basic &&
12715 (!GatheredLoadsEntriesFirst.has_value() ||
12716 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12717unsigned ScalarUsesCount =count_if(Entry->Scalars, [&](Value *V) {
12718 return ValueToExtUses->contains(V);
12719 });
12720auto It = ExtractsCount.find(Entry);
12721if (It != ExtractsCount.end()) {
12722assert(ScalarUsesCount >= It->getSecond().size() &&
12723"Expected total number of external uses not less than "
12724"number of scalar uses.");
12725 ScalarUsesCount -= It->getSecond().size();
12726 }
12727// Keep the original scalar if the number of externally used instructions
12728// in the same entry is not a power of 2. It may help to do some extra
12729// vectorization for now.
12730 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
12731 }
12732if (KeepScalar) {
12733 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
12734for_each(Inst->operands(), [&](Value *V) {
12735 auto It = ValueToExtUses->find(V);
12736 if (It != ValueToExtUses->end()) {
12737// Replace all uses to avoid compiler crash.
12738 ExternalUses[It->second].User = nullptr;
12739 }
12740 });
12741 ExtraCost = ScalarCost;
12742if (!IsPhiInLoop(EU))
12743 ExtractsCount[Entry].insert(Inst);
12744if (CanBeUsedAsScalarCast) {
12745 ScalarOpsFromCasts.insert(Inst->getOperand(0));
12746// Update the users of the operands of the cast operand to avoid
12747// compiler crash.
12748if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12749for_each(IOp->operands(), [&](Value *V) {
12750 auto It = ValueToExtUses->find(V);
12751 if (It != ValueToExtUses->end()) {
12752// Replace all uses to avoid compiler crash.
12753 ExternalUses[It->second].User = nullptr;
12754 }
12755 });
12756 }
12757 }
12758 }
12759 }
12760 }
12761
12762 ExtractCost += ExtraCost;
12763 }
12764// Register external uses for operands of casts that are emitted as scalars
12765// instead of being extracted from a vector.
12766for (Value *V : ScalarOpsFromCasts) {
12767 ExternalUsesAsOriginalScalar.insert(V);
12768if (const TreeEntry *E = getTreeEntry(V)) {
12769 ExternalUses.emplace_back(V,nullptr, E->findLaneForValue(V));
12770 }
12771 }
12772// Add reduced value cost, if resized.
12773if (!VectorizedVals.empty()) {
12774const TreeEntry &Root = *VectorizableTree.front();
12775auto BWIt = MinBWs.find(&Root);
12776if (BWIt != MinBWs.end()) {
12777Type *DstTy = Root.Scalars.front()->getType();
12778unsigned OriginalSz =DL->getTypeSizeInBits(DstTy->getScalarType());
12779unsigned SrcSz =
12780 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12781if (OriginalSz != SrcSz) {
12782unsigned Opcode = Instruction::Trunc;
12783if (OriginalSz > SrcSz)
12784 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12785Type *SrcTy =IntegerType::get(DstTy->getContext(), SrcSz);
12786if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12787assert(SLPReVec &&"Only supported by REVEC.");
12788 SrcTy =getWidenedType(SrcTy, VecTy->getNumElements());
12789 }
12790Cost +=TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
12791TTI::CastContextHint::None,
12792TTI::TCK_RecipThroughput);
12793 }
12794 }
12795 }
12796
12797InstructionCost SpillCost = getSpillCost();
12798Cost += SpillCost + ExtractCost;
12799auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE,ArrayRef<int>Mask,
12800bool) {
12801InstructionCostC = 0;
12802unsigned VF =Mask.size();
12803unsigned VecVF =TE->getVectorFactor();
12804if (VF != VecVF &&
12805 (any_of(Mask, [VF](intIdx) {returnIdx >=static_cast<int>(VF); }) ||
12806 !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
12807SmallVector<int> OrigMask(VecVF,PoisonMaskElem);
12808 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12809 OrigMask.begin());
12810C =::getShuffleCost(*TTI,TTI::SK_PermuteSingleSrc,
12811getWidenedType(TE->getMainOp()->getType(), VecVF),
12812 OrigMask);
12813LLVM_DEBUG(
12814dbgs() <<"SLP: Adding cost " <<C
12815 <<" for final shuffle of insertelement external users.\n";
12816TE->dump();dbgs() <<"SLP: Current total cost = " <<Cost <<"\n");
12817Cost +=C;
12818return std::make_pair(TE,true);
12819 }
12820return std::make_pair(TE,false);
12821 };
12822// Calculate the cost of the reshuffled vectors, if any.
12823for (intI = 0, E = ShuffledInserts.size();I < E; ++I) {
12824Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
12825autoVector = ShuffledInserts[I].ValueMasks.takeVector();
12826unsigned VF = 0;
12827auto EstimateShufflesCost = [&](ArrayRef<int>Mask,
12828ArrayRef<const TreeEntry *> TEs) {
12829assert((TEs.size() == 1 || TEs.size() == 2) &&
12830"Expected exactly 1 or 2 tree entries.");
12831if (TEs.size() == 1) {
12832if (VF == 0)
12833 VF = TEs.front()->getVectorFactor();
12834auto *FTy =getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12835if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
12836 !all_of(enumerate(Mask), [=](constauto &Data) {
12837returnData.value() ==PoisonMaskElem ||
12838 (Data.index() < VF &&
12839static_cast<int>(Data.index()) ==Data.value());
12840 })) {
12841InstructionCostC =
12842::getShuffleCost(*TTI,TTI::SK_PermuteSingleSrc, FTy, Mask);
12843LLVM_DEBUG(dbgs() <<"SLP: Adding cost " <<C
12844 <<" for final shuffle of insertelement "
12845"external users.\n";
12846 TEs.front()->dump();
12847dbgs() <<"SLP: Current total cost = " <<Cost <<"\n");
12848Cost +=C;
12849 }
12850 }else {
12851if (VF == 0) {
12852if (TEs.front() &&
12853 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12854 VF = TEs.front()->getVectorFactor();
12855else
12856 VF =Mask.size();
12857 }
12858auto *FTy =getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12859InstructionCostC =
12860::getShuffleCost(*TTI,TTI::SK_PermuteTwoSrc, FTy, Mask);
12861LLVM_DEBUG(dbgs() <<"SLP: Adding cost " <<C
12862 <<" for final shuffle of vector node and external "
12863"insertelement users.\n";
12864if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12865dbgs() <<"SLP: Current total cost = " <<Cost <<"\n");
12866Cost +=C;
12867 }
12868 VF =Mask.size();
12869return TEs.back();
12870 };
12871 (void)performExtractsShuffleAction<const TreeEntry>(
12872MutableArrayRef(Vector.data(),Vector.size()),Base,
12873 [](const TreeEntry *E) {return E->getVectorFactor(); }, ResizeToVF,
12874 EstimateShufflesCost);
12875InstructionCost InsertCost =TTI->getScalarizationOverhead(
12876 cast<FixedVectorType>(
12877 ShuffledInserts[I].InsertElements.front()->getType()),
12878 DemandedElts[I],
12879/*Insert*/true,/*Extract*/false,TTI::TCK_RecipThroughput);
12880Cost -= InsertCost;
12881 }
12882
12883// Add the cost for reduced value resize (if required).
12884if (ReductionBitWidth != 0) {
12885assert(UserIgnoreList &&"Expected reduction tree.");
12886const TreeEntry &E = *VectorizableTree.front();
12887auto It = MinBWs.find(&E);
12888if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12889unsigned SrcSize = It->second.first;
12890unsigned DstSize = ReductionBitWidth;
12891unsigned Opcode = Instruction::Trunc;
12892if (SrcSize < DstSize) {
12893bool IsArithmeticExtendedReduction =
12894all_of(*UserIgnoreList, [](Value *V) {
12895auto *I = cast<Instruction>(V);
12896returnis_contained({Instruction::Add, Instruction::FAdd,
12897 Instruction::Mul, Instruction::FMul,
12898 Instruction::And, Instruction::Or,
12899 Instruction::Xor},
12900I->getOpcode());
12901 });
12902if (IsArithmeticExtendedReduction)
12903 Opcode =
12904 Instruction::BitCast;// Handle it by getExtendedReductionCost
12905else
12906 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12907 }
12908if (Opcode != Instruction::BitCast) {
12909auto *SrcVecTy =
12910getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12911auto *DstVecTy =
12912getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12913TTI::CastContextHint CCH = getCastContextHint(E);
12914InstructionCost CastCost;
12915switch (E.getOpcode()) {
12916case Instruction::SExt:
12917case Instruction::ZExt:
12918case Instruction::Trunc: {
12919const TreeEntry *OpTE = getOperandEntry(&E, 0);
12920 CCH = getCastContextHint(*OpTE);
12921break;
12922 }
12923default:
12924break;
12925 }
12926 CastCost +=TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
12927TTI::TCK_RecipThroughput);
12928Cost += CastCost;
12929LLVM_DEBUG(dbgs() <<"SLP: Adding cost " << CastCost
12930 <<" for final resize for reduction from " << SrcVecTy
12931 <<" to " << DstVecTy <<"\n";
12932dbgs() <<"SLP: Current total cost = " <<Cost <<"\n");
12933 }
12934 }
12935 }
12936
12937#ifndef NDEBUG
12938SmallString<256> Str;
12939 {
12940raw_svector_ostreamOS(Str);
12941OS <<"SLP: Spill Cost = " << SpillCost <<".\n"
12942 <<"SLP: Extract Cost = " << ExtractCost <<".\n"
12943 <<"SLP: Total Cost = " <<Cost <<".\n";
12944 }
12945LLVM_DEBUG(dbgs() << Str);
12946if (ViewSLPTree)
12947ViewGraph(this,"SLP" +F->getName(),false, Str);
12948#endif
12949
12950returnCost;
12951}
12952
12953/// Tries to find extractelement instructions with constant indices from fixed
12954/// vector type and gather such instructions into a bunch, which highly likely
12955/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
12956/// successful, the matched scalars are replaced by poison values in \p VL for
12957/// future analysis.
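/// For example, a gather of { extractelement %a, 0, extractelement %b, 1,
/// poison, extractelement %a, 3 } from two fixed vectors %a and %b can be
/// modeled as a single two-source shuffle of %a and %b.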
12958std::optional<TTI::ShuffleKind>
12959BoUpSLP::tryToGatherSingleRegisterExtractElements(
12960MutableArrayRef<Value *> VL,SmallVectorImpl<int> &Mask) const{
12961// Scan list of gathered scalars for extractelements that can be represented
12962// as shuffles.
12963MapVector<Value *, SmallVector<int>> VectorOpToIdx;
12964SmallVector<int> UndefVectorExtracts;
12965for (intI = 0, E = VL.size();I < E; ++I) {
12966auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12967if (!EI) {
12968if (isa<UndefValue>(VL[I]))
12969 UndefVectorExtracts.push_back(I);
12970continue;
12971 }
12972auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12973if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12974continue;
12975 std::optional<unsigned>Idx =getExtractIndex(EI);
12976// Undefined index.
12977if (!Idx) {
12978 UndefVectorExtracts.push_back(I);
12979continue;
12980 }
12981if (Idx >= VecTy->getNumElements()) {
12982 UndefVectorExtracts.push_back(I);
12983continue;
12984 }
12985SmallBitVector ExtractMask(VecTy->getNumElements(),true);
12986 ExtractMask.reset(*Idx);
12987if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
12988 UndefVectorExtracts.push_back(I);
12989continue;
12990 }
12991 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
12992 }
12993// Sort the vector operands by the maximum number of uses in extractelements.
12994SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
12995 VectorOpToIdx.takeVector();
12996stable_sort(Vectors, [](constauto &P1,constauto &P2) {
12997returnP1.second.size() > P2.second.size();
12998 });
12999// Find the best pair of the vectors or a single vector.
13000constint UndefSz = UndefVectorExtracts.size();
13001unsigned SingleMax = 0;
13002unsigned PairMax = 0;
13003if (!Vectors.empty()) {
13004 SingleMax = Vectors.front().second.size() + UndefSz;
13005if (Vectors.size() > 1) {
13006auto *ItNext = std::next(Vectors.begin());
13007 PairMax = SingleMax + ItNext->second.size();
13008 }
13009 }
13010if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
13011return std::nullopt;
13012// Check whether it is better to perform a shuffle of 2 vectors or just of
13013// a single vector.
13014SmallVector<Value *> SavedVL(VL.begin(), VL.end());
13015SmallVector<Value *> GatheredExtracts(
13016 VL.size(),PoisonValue::get(VL.front()->getType()));
13017if (SingleMax >= PairMax && SingleMax) {
13018for (intIdx : Vectors.front().second)
13019std::swap(GatheredExtracts[Idx], VL[Idx]);
13020 }elseif (!Vectors.empty()) {
13021for (unsignedIdx : {0, 1})
13022for (intIdx : Vectors[Idx].second)
13023std::swap(GatheredExtracts[Idx], VL[Idx]);
13024 }
13025// Add extracts from undefs too.
13026for (intIdx : UndefVectorExtracts)
13027std::swap(GatheredExtracts[Idx], VL[Idx]);
13028// Check that the gather of extractelements can be represented as just a
13029// shuffle of one or two vectors from which the scalars are extracted.
13030 std::optional<TTI::ShuffleKind> Res =
13031isFixedVectorShuffle(GatheredExtracts, Mask, AC);
13032if (!Res ||all_of(Mask, [](intIdx) {returnIdx ==PoisonMaskElem; })) {
13033// TODO: try to check other subsets if possible.
13034// Restore the original VL if attempt was not successful.
13035copy(SavedVL, VL.begin());
13036return std::nullopt;
13037 }
13038// Restore unused scalars from mask, if some of the extractelements were not
13039// selected for shuffle.
13040for (intI = 0, E = GatheredExtracts.size();I < E; ++I) {
13041if (Mask[I] ==PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
13042 isa<UndefValue>(GatheredExtracts[I])) {
13043std::swap(VL[I], GatheredExtracts[I]);
13044continue;
13045 }
13046auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
13047if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
13048 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
13049is_contained(UndefVectorExtracts,I))
13050continue;
13051 }
13052return Res;
13053}
13054
13055/// Tries to find extractelement instructions with constant indices from fixed
13056/// vector type and gather such instructions into a bunch, which highly likely
13057/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
13058/// successful, the matched scalars are replaced by poison values in \p VL for
13059/// future analysis.
13060SmallVector<std::optional<TTI::ShuffleKind>>
13061BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
13062SmallVectorImpl<int> &Mask,
13063unsigned NumParts) const{
13064assert(NumParts > 0 &&"NumParts expected be greater than or equal to 1.");
13065SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
13066Mask.assign(VL.size(),PoisonMaskElem);
13067unsigned SliceSize =getPartNumElems(VL.size(), NumParts);
13068for (unsigned Part : seq<unsigned>(NumParts)) {
13069// Scan list of gathered scalars for extractelements that can be represented
13070// as shuffles.
13071MutableArrayRef<Value *> SubVL =MutableArrayRef(VL).slice(
13072 Part * SliceSize,getNumElems(VL.size(), SliceSize, Part));
13073SmallVector<int> SubMask;
13074 std::optional<TTI::ShuffleKind> Res =
13075 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
13076 ShufflesRes[Part] = Res;
13077copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
13078 }
13079if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
13080return Res.has_value();
13081 }))
13082 ShufflesRes.clear();
13083return ShufflesRes;
13084}
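// Illustrative sketch (hypothetical values, not taken from a real test): for
// VL = { extractelement <4 x float> %v, i32 0,
//        extractelement <4 x float> %v, i32 2,
//        %x,                                     ; not an extractelement
//        extractelement <4 x float> %v, i32 3 }
// and NumParts == 1, the routine above reports SK_PermuteSingleSrc with
// Mask = <0, 2, poison, 3>; the three matched scalars in VL are replaced by
// poison, while %x is left in place for the regular gather path.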
13085
13086std::optional<TargetTransformInfo::ShuffleKind>
13087BoUpSLP::isGatherShuffledSingleRegisterEntry(
13088const TreeEntry *TE,ArrayRef<Value *> VL,MutableArrayRef<int> Mask,
13089SmallVectorImpl<const TreeEntry *> &Entries,unsigned Part,bool ForOrder) {
13090 Entries.clear();
13091// TODO: currently checking only for Scalars in the tree entry, need to count
13092// reused elements too for better cost estimation.
13093const EdgeInfo &TEUseEI =TE == VectorizableTree.front().get()
13094 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
13095 :TE->UserTreeIndices.front();
13096constInstruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13097constBasicBlock *TEInsertBlock =nullptr;
13098// Main node of PHI entries keeps the correct order of operands/incoming
13099// blocks.
13100if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13101 TEInsertBlock =PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13102 TEInsertPt = TEInsertBlock->getTerminator();
13103 }else {
13104 TEInsertBlock = TEInsertPt->getParent();
13105 }
13106if (!DT->isReachableFromEntry(TEInsertBlock))
13107return std::nullopt;
13108auto *NodeUI = DT->getNode(TEInsertBlock);
13109assert(NodeUI &&"Should only process reachable instructions");
13110SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
13111auto CheckOrdering = [&](constInstruction *InsertPt) {
13112// Argument InsertPt is an instruction where vector code for some other
13113// tree entry (one that shares one or more scalars with TE) is going to be
13114// generated. This lambda returns true if insertion point of vector code
13115// for the TE dominates that point (otherwise dependency is the other way
13116// around). The other node is not required to be a gather node. Gather nodes
13117// are not scheduled and their vector code is inserted before their first
13118// user. If the user is a PHI, that point is at the end of the corresponding
13119// predecessor block. Otherwise it is the last instruction among the scalars
13120// of the user node. So, instead of checking the dependency between the
13121// instructions themselves, we check the dependency between their insertion
13122// points for vector code (since each scalar instruction ends up as a lane of
13123// a vector instruction).
13124constBasicBlock *InsertBlock = InsertPt->getParent();
13125auto *NodeEUI = DT->getNode(InsertBlock);
13126if (!NodeEUI)
13127returnfalse;
13128assert((NodeUI == NodeEUI) ==
13129 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13130"Different nodes should have different DFS numbers");
13131// Check the order of the gather nodes users.
13132if (TEInsertPt->getParent() != InsertBlock &&
13133 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
13134returnfalse;
13135if (TEInsertPt->getParent() == InsertBlock &&
13136 TEInsertPt->comesBefore(InsertPt))
13137returnfalse;
13138returntrue;
13139 };
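  // For illustration (hypothetical CFG): if the candidate entry's vector code
  // is emitted in %entry while this entry's insertion point sits in a block
  // dominated by %entry (e.g. %loop.body), CheckOrdering returns true and the
  // candidate may serve as a shuffle source. If both insertion points are in
  // the same block and this entry's point comes first, it returns false,
  // since the candidate's vector value would not exist yet at that point.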
13140// Find all tree entries used by the gathered values. If no common entries
13141// are found, this is not a shuffle.
13142// Here we build a set of tree nodes for each gathered value and try to find
13143// the intersection between these sets. If we have at least one common tree
13144// node for each gathered value, we have just a permutation of a single
13145// vector. If we end up with 2 different sets, we have a permutation of 2
13146// input vectors.
13147SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
13148DenseMap<Value *, int> UsedValuesEntry;
13149for (Value *V : VL) {
13150if (isConstant(V))
13151continue;
13152// Build a list of tree entries where V is used.
13153SmallPtrSet<const TreeEntry *, 4> VToTEs;
13154for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13155if (TEPtr == TE || TEPtr->Idx == 0)
13156continue;
13157assert(any_of(TEPtr->Scalars,
13158 [&](Value *V) { return GatheredScalars.contains(V); }) &&
13159"Must contain at least single gathered value.");
13160assert(TEPtr->UserTreeIndices.size() == 1 &&
13161"Expected only single user of a gather node.");
13162const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13163
13164PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13165constInstruction *InsertPt =
13166 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
13167 : &getLastInstructionInBundle(UseEI.UserTE);
13168if (TEInsertPt == InsertPt) {
13169// If 2 gathers are operands of the same entry (regardless of whether
13170// user is PHI or else), compare operands indices, use the earlier one
13171// as the base.
13172if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13173continue;
13174// If the user instruction is used for some reason in different
13175// vectorized nodes - make it depend on index.
13176if (TEUseEI.UserTE != UseEI.UserTE &&
13177 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13178continue;
13179 }
13180
13181// Check if the user node of the TE comes after user node of TEPtr,
13182// otherwise TEPtr depends on TE.
13183if ((TEInsertBlock != InsertPt->getParent() ||
13184 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13185 !CheckOrdering(InsertPt))
13186continue;
13187 VToTEs.insert(TEPtr);
13188 }
13189if (const TreeEntry *VTE = getTreeEntry(V)) {
13190if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13191if (VTE->State != TreeEntry::Vectorize) {
13192auto It = MultiNodeScalars.find(V);
13193if (It == MultiNodeScalars.end())
13194continue;
13195 VTE = *It->getSecond().begin();
13196// Iterate through all vectorized nodes.
13197auto *MIt =find_if(It->getSecond(), [](const TreeEntry *MTE) {
13198 return MTE->State == TreeEntry::Vectorize;
13199 });
13200if (MIt == It->getSecond().end())
13201continue;
13202 VTE = *MIt;
13203 }
13204 }
13205if (none_of(TE->CombinedEntriesWithIndices,
13206 [&](constauto &P) { return P.first == VTE->Idx; })) {
13207Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13208if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13209continue;
13210 }
13211 VToTEs.insert(VTE);
13212 }
13213if (VToTEs.empty())
13214continue;
13215if (UsedTEs.empty()) {
13216// The first iteration, just insert the list of nodes to vector.
13217 UsedTEs.push_back(VToTEs);
13218 UsedValuesEntry.try_emplace(V, 0);
13219 }else {
13220// Need to check if there are any previously used tree nodes which use V.
13221// If there are no such nodes, consider that we have one more input
13222// vector.
13223SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
13224unsignedIdx = 0;
13225for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
13226// Do we have a non-empty intersection of previously listed tree entries
13227// and tree entries using current V?
13228set_intersect(VToTEs, Set);
13229if (!VToTEs.empty()) {
13230// Yes, write the new subset and continue analysis for the next
13231// scalar.
13232Set.swap(VToTEs);
13233break;
13234 }
13235 VToTEs = SavedVToTEs;
13236 ++Idx;
13237 }
13238// No non-empty intersection found - need to add a second set of possible
13239// source vectors.
13240if (Idx == UsedTEs.size()) {
13241// If the number of input vectors is greater than 2 - not a permutation,
13242// fallback to the regular gather.
13243// TODO: support multiple reshuffled nodes.
13244if (UsedTEs.size() == 2)
13245continue;
13246 UsedTEs.push_back(SavedVToTEs);
13247Idx = UsedTEs.size() - 1;
13248 }
13249 UsedValuesEntry.try_emplace(V,Idx);
13250 }
13251 }
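  // Worked example (hypothetical entries): for VL = {a, b, c} where a is used
  // by tree entries {TE1, TE2}, b by {TE2, TE3} and c by {TE4}: after a,
  // UsedTEs = [{TE1, TE2}]; b intersects the first set down to [{TE2}]; c has
  // an empty intersection, so a second set is added, giving
  // UsedTEs = [{TE2}, {TE4}] and UsedValuesEntry = {a->0, b->0, c->1}, i.e. a
  // permutation of two source vectors.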
13252
13253if (UsedTEs.empty()) {
13254 Entries.clear();
13255return std::nullopt;
13256 }
13257
13258unsigned VF = 0;
13259if (UsedTEs.size() == 1) {
13260// Keep the order to avoid non-determinism.
13261SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
13262 UsedTEs.front().end());
13263sort(FirstEntries, [](const TreeEntry *TE1,const TreeEntry *TE2) {
13264return TE1->Idx < TE2->Idx;
13265 });
13266// Try to find the perfect match in another gather node at first.
13267auto *It =find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
13268return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
13269 });
13270if (It != FirstEntries.end() &&
13271 ((*It)->getVectorFactor() == VL.size() ||
13272 ((*It)->getVectorFactor() ==TE->Scalars.size() &&
13273TE->ReuseShuffleIndices.size() == VL.size() &&
13274 (*It)->isSame(TE->Scalars)))) {
13275 Entries.push_back(*It);
13276if ((*It)->getVectorFactor() == VL.size()) {
13277 std::iota(std::next(Mask.begin(), Part * VL.size()),
13278 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
13279 }else {
13280SmallVector<int> CommonMask =TE->getCommonMask();
13281copy(CommonMask,Mask.begin());
13282 }
13283// Clear undef scalars.
13284for (unsignedI : seq<unsigned>(VL.size()))
13285if (isa<PoisonValue>(VL[I]))
13286Mask[Part * VL.size() +I] =PoisonMaskElem;
13287returnTargetTransformInfo::SK_PermuteSingleSrc;
13288 }
13289// No perfect match, just shuffle, so choose the first tree node from the
13290// tree.
13291 Entries.push_back(FirstEntries.front());
13292 VF = FirstEntries.front()->getVectorFactor();
13293 }else {
13294// Try to find nodes with the same vector factor.
13295assert(UsedTEs.size() == 2 &&"Expected at max 2 permuted entries.");
13296// Keep the order of tree nodes to avoid non-determinism.
13297DenseMap<int, const TreeEntry *> VFToTE;
13298for (const TreeEntry *TE : UsedTEs.front()) {
13299unsigned VF =TE->getVectorFactor();
13300auto It = VFToTE.find(VF);
13301if (It != VFToTE.end()) {
13302if (It->second->Idx >TE->Idx)
13303 It->getSecond() =TE;
13304continue;
13305 }
13306 VFToTE.try_emplace(VF, TE);
13307 }
13308// Same, keep the order to avoid non-determinism.
13309SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
13310 UsedTEs.back().end());
13311sort(SecondEntries, [](const TreeEntry *TE1,const TreeEntry *TE2) {
13312return TE1->Idx < TE2->Idx;
13313 });
13314for (const TreeEntry *TE : SecondEntries) {
13315auto It = VFToTE.find(TE->getVectorFactor());
13316if (It != VFToTE.end()) {
13317 VF = It->first;
13318 Entries.push_back(It->second);
13319 Entries.push_back(TE);
13320break;
13321 }
13322 }
13323// No 2 source vectors with the same vector factor - just choose 2 with max
13324// index.
13325if (Entries.empty()) {
13326 Entries.push_back(*llvm::max_element(
13327 UsedTEs.front(), [](const TreeEntry *TE1,const TreeEntry *TE2) {
13328 return TE1->Idx < TE2->Idx;
13329 }));
13330 Entries.push_back(SecondEntries.front());
13331 VF = std::max(Entries.front()->getVectorFactor(),
13332 Entries.back()->getVectorFactor());
13333 }else {
13334 VF = Entries.front()->getVectorFactor();
13335 }
13336 }
13337
13338bool IsSplatOrUndefs =isSplat(VL) ||all_of(VL, IsaPred<UndefValue>);
13339// Checks if the 2 PHIs are compatible, i.e. have a high chance of being
13340// vectorized together.
13341auto AreCompatiblePHIs = [&](Value *V,Value *V1) {
13342auto *PHI = cast<PHINode>(V);
13343auto *PHI1 = cast<PHINode>(V1);
13344// Check that all incoming values are compatible/from the same parent (if
13345// they are instructions).
13346// The incoming values are compatible if they all are constants, or are
13347// instructions with the same/alternate opcodes from the same basic block.
13348for (intI = 0, E =PHI->getNumIncomingValues();I < E; ++I) {
13349Value *In =PHI->getIncomingValue(I);
13350Value *In1 = PHI1->getIncomingValue(I);
13351if (isConstant(In) &&isConstant(In1))
13352continue;
13353if (!getSameOpcode({In, In1}, *TLI))
13354returnfalse;
13355if (cast<Instruction>(In)->getParent() !=
13356 cast<Instruction>(In1)->getParent())
13357returnfalse;
13358 }
13359returntrue;
13360 };
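  // For example (hypothetical IR), the following PHIs are compatible:
  //   %p = phi i32 [ %a, %bb1 ], [ 0, %bb2 ]
  //   %q = phi i32 [ %b, %bb1 ], [ 7, %bb2 ]
  // where %a and %b are 'add' instructions defined in %bb1: the first pair of
  // incoming values shares the opcode and the parent block, the second pair
  // is all-constant.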
13361// Check if the value can be ignored during analysis for shuffled gathers.
13362// We suppose it is better to ignore instructions that do not form splats,
13363// are not vectorized/not extractelements (these instructions will be handled
13364// by the extractelements processing) or may form a vector node in the future.
13365auto MightBeIgnored = [=](Value *V) {
13366auto *I = dyn_cast<Instruction>(V);
13367returnI && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
13368 !isVectorLikeInstWithConstOps(I) &&
13369 !areAllUsersVectorized(I, UserIgnoreList) &&isSimple(I);
13370 };
13371// Check that the neighbor instruction may form a full vector node with the
13372// current instruction V. This is possible if they have the same/alternate
13373// opcode and the same parent basic block.
13374auto NeighborMightBeIgnored = [&](Value *V,intIdx) {
13375Value *V1 = VL[Idx];
13376bool UsedInSameVTE =false;
13377auto It = UsedValuesEntry.find(V1);
13378if (It != UsedValuesEntry.end())
13379 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
13380returnV != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13381getSameOpcode({V, V1}, *TLI) &&
13382 cast<Instruction>(V)->getParent() ==
13383 cast<Instruction>(V1)->getParent() &&
13384 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13385 };
13386// Build a shuffle mask for better cost estimation and vector emission.
13387SmallBitVector UsedIdxs(Entries.size());
13388SmallVector<std::pair<unsigned, int>> EntryLanes;
13389for (intI = 0, E = VL.size();I < E; ++I) {
13390Value *V = VL[I];
13391auto It = UsedValuesEntry.find(V);
13392if (It == UsedValuesEntry.end())
13393continue;
13394// Do not try to shuffle scalars if they are constants, or instructions
13395// that may be vectorized later as part of the subsequent buildvector
13396// vectorization.
13397if (isConstant(V) || (MightBeIgnored(V) &&
13398 ((I > 0 && NeighborMightBeIgnored(V,I - 1)) ||
13399 (I != E - 1 && NeighborMightBeIgnored(V,I + 1)))))
13400continue;
13401unsignedIdx = It->second;
13402 EntryLanes.emplace_back(Idx,I);
13403 UsedIdxs.set(Idx);
13404 }
13405// Iterate through all shuffled scalars and select entries, which can be used
13406// for final shuffle.
13407SmallVector<const TreeEntry *> TempEntries;
13408for (unsignedI = 0, Sz = Entries.size();I < Sz; ++I) {
13409if (!UsedIdxs.test(I))
13410continue;
13411// Fix the entry number for the given scalar. If it is the first entry, set
13412// Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
13413// These indices are used when calculating final shuffle mask as the vector
13414// offset.
13415for (std::pair<unsigned, int> &Pair : EntryLanes)
13416if (Pair.first ==I)
13417 Pair.first = TempEntries.size();
13418 TempEntries.push_back(Entries[I]);
13419 }
13420 Entries.swap(TempEntries);
13421if (EntryLanes.size() == Entries.size() &&
13422 !VL.equals(ArrayRef(TE->Scalars)
13423 .slice(Part * VL.size(),
13424 std::min<int>(VL.size(),TE->Scalars.size())))) {
13425// We may have only 1 or 2 entries here. If the number of scalars is equal
13426// to the number of entries, there is no need to do the analysis, it is not
13427// very profitable. Since VL is not the same as TE->Scalars, it means we
13428// already have some shuffles before. Cut off this unprofitable case.
13429 Entries.clear();
13430return std::nullopt;
13431 }
13432// Build the final mask, check for the identity shuffle, if possible.
13433bool IsIdentity = Entries.size() == 1;
13434// Pair.first is the offset to the vector, while Pair.second is the index of
13435// scalar in the list.
13436for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13437unsignedIdx = Part * VL.size() + Pair.second;
13438Mask[Idx] =
13439 Pair.first * VF +
13440 (ForOrder ? std::distance(
13441 Entries[Pair.first]->Scalars.begin(),
13442find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13443 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13444 IsIdentity &=Mask[Idx] == Pair.second;
13445 }
13446if (ForOrder || IsIdentity || Entries.empty()) {
13447switch (Entries.size()) {
13448case 1:
13449if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13450returnTargetTransformInfo::SK_PermuteSingleSrc;
13451break;
13452case 2:
13453if (EntryLanes.size() > 2 || VL.size() <= 2)
13454returnTargetTransformInfo::SK_PermuteTwoSrc;
13455break;
13456default:
13457break;
13458 }
13459 }elseif (!isa<VectorType>(VL.front()->getType()) &&
13460 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13461// Estimate whether the shuffle is more beneficial than a buildvector.
13462SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13463 std::next(Mask.begin(), (Part + 1) * VL.size()));
13464int MinElement = SubMask.front(), MaxElement = SubMask.front();
13465for (intIdx : SubMask) {
13466if (Idx ==PoisonMaskElem)
13467continue;
13468if (MinElement ==PoisonMaskElem || MinElement % VF >Idx % VF)
13469 MinElement =Idx;
13470if (MaxElement ==PoisonMaskElem || MaxElement % VF <Idx % VF)
13471 MaxElement =Idx;
13472 }
13473assert(MaxElement >= 0 && MinElement >= 0 &&
13474 MaxElement % VF >= MinElement % VF &&
13475"Expected at least single element.");
13476unsigned NewVF = std::max<unsigned>(
13477 VL.size(),getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13478 (MaxElement % VF) -
13479 (MinElement % VF) + 1));
13480if (NewVF < VF) {
13481for_each(SubMask, [&](int &Idx) {
13482if (Idx ==PoisonMaskElem)
13483return;
13484Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
13485 (Idx >=static_cast<int>(VF) ? NewVF : 0);
13486 });
13487 }else {
13488 NewVF = VF;
13489 }
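    // Worked example (hypothetical numbers): with VF = 16, used lanes spanning
    // MinElement % VF = 8 .. MaxElement % VF = 11 and NewVF = 4, an element of
    // the first source with Idx = 10 becomes (10 - 8) % 4 = 2, and an element
    // of the second source with Idx = 25 (lane 9 of that source) becomes
    // (9 - 8) % 4 + NewVF = 5, i.e. lane 1 of the narrowed second source.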
13490
13491constexprTTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
13492auto *VecTy =getWidenedType(VL.front()->getType(), NewVF);
13493auto *MaskVecTy =getWidenedType(VL.front()->getType(), SubMask.size());
13494auto GetShuffleCost = [&,
13495 &TTI = *TTI](ArrayRef<int>Mask,
13496ArrayRef<const TreeEntry *> Entries,
13497VectorType *VecTy) ->InstructionCost {
13498if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13499ShuffleVectorInst::isDeInterleaveMaskOfFactor(
13500 Mask, Entries.front()->getInterleaveFactor()))
13501returnTTI::TCC_Free;
13502 return ::getShuffleCost(TTI,
13503 Entries.size() > 1 ?TTI::SK_PermuteTwoSrc
13504 :TTI::SK_PermuteSingleSrc,
13505 VecTy, Mask,CostKind);
13506 };
13507InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13508InstructionCost FirstShuffleCost = 0;
13509SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13510if (Entries.size() == 1 || !Entries[0]->isGather()) {
13511 FirstShuffleCost = ShuffleCost;
13512 }else {
13513// Transform mask to include only the first entry.
13514APInt DemandedElts =APInt::getAllOnes(SubMask.size());
13515bool IsIdentity =true;
13516for (auto [I,Idx] :enumerate(FirstMask)) {
13517if (Idx >=static_cast<int>(NewVF)) {
13518Idx =PoisonMaskElem;
13519 }else {
13520 DemandedElts.clearBit(I);
13521if (Idx !=PoisonMaskElem)
13522 IsIdentity &=static_cast<int>(I) ==Idx;
13523 }
13524 }
13525if (!IsIdentity)
13526 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13527 FirstShuffleCost +=TTI->getScalarizationOverhead(
13528 MaskVecTy, DemandedElts,/*Insert=*/true,
13529/*Extract=*/false,CostKind);
13530 }
13531InstructionCost SecondShuffleCost = 0;
13532SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13533if (Entries.size() == 1 || !Entries[1]->isGather()) {
13534 SecondShuffleCost = ShuffleCost;
13535 }else {
13536// Transform mask to include only the second entry.
13537APInt DemandedElts =APInt::getAllOnes(SubMask.size());
13538bool IsIdentity =true;
13539for (auto [I,Idx] :enumerate(SecondMask)) {
13540if (Idx <static_cast<int>(NewVF) &&Idx >= 0) {
13541Idx =PoisonMaskElem;
13542 }else {
13543 DemandedElts.clearBit(I);
13544if (Idx !=PoisonMaskElem) {
13545Idx -= NewVF;
13546 IsIdentity &=static_cast<int>(I) ==Idx;
13547 }
13548 }
13549 }
13550if (!IsIdentity)
13551 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13552 SecondShuffleCost +=TTI->getScalarizationOverhead(
13553 MaskVecTy, DemandedElts,/*Insert=*/true,
13554/*Extract=*/false,CostKind);
13555 }
13556APInt DemandedElts =APInt::getAllOnes(SubMask.size());
13557for (auto [I,Idx] :enumerate(SubMask))
13558if (Idx ==PoisonMaskElem)
13559 DemandedElts.clearBit(I);
13560InstructionCost BuildVectorCost =
13561TTI->getScalarizationOverhead(MaskVecTy, DemandedElts,/*Insert=*/true,
13562/*Extract=*/false,CostKind);
13563const TreeEntry *BestEntry =nullptr;
13564if (FirstShuffleCost < ShuffleCost) {
13565 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13566 std::next(Mask.begin(), (Part + 1) * VL.size()),
13567 [&](int &Idx) {
13568 if (Idx >= static_cast<int>(VF))
13569 Idx = PoisonMaskElem;
13570 });
13571 BestEntry = Entries.front();
13572 ShuffleCost = FirstShuffleCost;
13573 }
13574if (SecondShuffleCost < ShuffleCost) {
13575 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13576 std::next(Mask.begin(), (Part + 1) * VL.size()),
13577 [&](int &Idx) {
13578 if (Idx < static_cast<int>(VF))
13579 Idx = PoisonMaskElem;
13580 else
13581 Idx -= VF;
13582 });
13583 BestEntry = Entries[1];
13584 ShuffleCost = SecondShuffleCost;
13585 }
13586if (BuildVectorCost >= ShuffleCost) {
13587if (BestEntry) {
13588 Entries.clear();
13589 Entries.push_back(BestEntry);
13590 }
13591return Entries.size() > 1 ?TargetTransformInfo::SK_PermuteTwoSrc
13592 :TargetTransformInfo::SK_PermuteSingleSrc;
13593 }
13594 }
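  // For illustration, with hypothetical costs ShuffleCost = 4 (two sources),
  // FirstShuffleCost = 3, SecondShuffleCost = 6 and BuildVectorCost = 5: the
  // first entry alone wins (3 < 4), the second does not (6 >= 3), and since
  // 5 >= 3 the block above keeps only the first entry and returns
  // SK_PermuteSingleSrc; with BuildVectorCost = 2 it would instead fall
  // through to the cleanup below and the scalars would be gathered.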
13595 Entries.clear();
13596// Clear the corresponding mask elements.
13597 std::fill(std::next(Mask.begin(), Part * VL.size()),
13598 std::next(Mask.begin(), (Part + 1) * VL.size()),PoisonMaskElem);
13599return std::nullopt;
13600}
13601
13602SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
13603BoUpSLP::isGatherShuffledEntry(
13604const TreeEntry *TE,ArrayRef<Value *> VL,SmallVectorImpl<int> &Mask,
13605SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,unsigned NumParts,
13606bool ForOrder) {
13607assert(NumParts > 0 && NumParts < VL.size() &&
13608"Expected positive number of registers.");
13609 Entries.clear();
13610// No need to check for the topmost gather node.
13611if (TE == VectorizableTree.front().get() &&
13612 (!GatheredLoadsEntriesFirst.has_value() ||
13613none_of(ArrayRef(VectorizableTree).drop_front(),
13614 [](const std::unique_ptr<TreeEntry> &TE) {
13615return !TE->isGather();
13616 })))
13617return {};
13618// FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
13619// implemented yet.
13620if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
13621return {};
13622Mask.assign(VL.size(),PoisonMaskElem);
13623assert((TE->UserTreeIndices.size() == 1 ||
13624 TE == VectorizableTree.front().get()) &&
13625"Expected only single user of the gather node.");
13626assert(VL.size() % NumParts == 0 &&
13627"Number of scalars must be divisible by NumParts.");
13628if (!TE->UserTreeIndices.empty() &&
13629TE->UserTreeIndices.front().UserTE->isGather() &&
13630TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13631assert(
13632 (TE->Idx == 0 ||
13633 (TE->hasState() &&TE->getOpcode() == Instruction::ExtractElement) ||
13634isSplat(TE->Scalars)) &&
13635"Expected splat or extractelements only node.");
13636return {};
13637 }
13638unsigned SliceSize =getPartNumElems(VL.size(), NumParts);
13639SmallVector<std::optional<TTI::ShuffleKind>> Res;
13640for (unsigned Part : seq<unsigned>(NumParts)) {
13641ArrayRef<Value *> SubVL =
13642 VL.slice(Part * SliceSize,getNumElems(VL.size(), SliceSize, Part));
13643SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
13644 std::optional<TTI::ShuffleKind> SubRes =
13645 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13646 ForOrder);
13647if (!SubRes)
13648 SubEntries.clear();
13649 Res.push_back(SubRes);
13650if (SubEntries.size() == 1 && *SubRes ==TTI::SK_PermuteSingleSrc &&
13651 SubEntries.front()->getVectorFactor() == VL.size() &&
13652 (SubEntries.front()->isSame(TE->Scalars) ||
13653 SubEntries.front()->isSame(VL))) {
13654SmallVector<const TreeEntry *> LocalSubEntries;
13655 LocalSubEntries.swap(SubEntries);
13656 Entries.clear();
13657 Res.clear();
13658 std::iota(Mask.begin(),Mask.end(), 0);
13659// Clear undef scalars.
13660for (intI = 0, Sz = VL.size();I < Sz; ++I)
13661if (isa<PoisonValue>(VL[I]))
13662Mask[I] =PoisonMaskElem;
13663 Entries.emplace_back(1, LocalSubEntries.front());
13664 Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
13665return Res;
13666 }
13667 }
13668if (all_of(Res,
13669 [](const std::optional<TTI::ShuffleKind> &SK) {return !SK; })) {
13670 Entries.clear();
13671return {};
13672 }
13673return Res;
13674}
13675
13676InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,bool ForPoisonSrc,
13677Type *ScalarTy) const{
13678auto *VecTy =getWidenedType(ScalarTy, VL.size());
13679bool DuplicateNonConst =false;
13680// Find the cost of inserting/extracting values from the vector.
13681// Check if the same elements are inserted several times and count them as
13682// shuffle candidates.
13683APInt ShuffledElements =APInt::getZero(VL.size());
13684DenseMap<Value *, unsigned> UniqueElements;
13685constexprTTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
13686InstructionCostCost;
13687auto EstimateInsertCost = [&](unsignedI,Value *V) {
13688if (V->getType() != ScalarTy) {
13689Cost +=TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,V->getType(),
13690TTI::CastContextHint::None,CostKind);
13691V =nullptr;
13692 }
13693if (!ForPoisonSrc)
13694Cost +=
13695TTI->getVectorInstrCost(Instruction::InsertElement, VecTy,CostKind,
13696I,Constant::getNullValue(VecTy),V);
13697 };
13698SmallVector<int> ShuffleMask(VL.size(),PoisonMaskElem);
13699for (unsignedI = 0, E = VL.size();I < E; ++I) {
13700Value *V = VL[I];
13701// No need to shuffle duplicates for constants.
13702if ((ForPoisonSrc &&isConstant(V)) || isa<UndefValue>(V)) {
13703 ShuffledElements.setBit(I);
13704 ShuffleMask[I] = isa<PoisonValue>(V) ?PoisonMaskElem :I;
13705continue;
13706 }
13707
13708auto Res = UniqueElements.try_emplace(V,I);
13709if (Res.second) {
13710 EstimateInsertCost(I, V);
13711 ShuffleMask[I] =I;
13712continue;
13713 }
13714
13715 DuplicateNonConst =true;
13716 ShuffledElements.setBit(I);
13717 ShuffleMask[I] = Res.first->second;
13718 }
13719if (ForPoisonSrc) {
13720if (isa<FixedVectorType>(ScalarTy)) {
13721assert(SLPReVec &&"Only supported by REVEC.");
13722// We don't need to insert elements one by one. Instead, we can insert the
13723// entire vector into the destination.
13724Cost = 0;
13725unsigned ScalarTyNumElements =getNumElements(ScalarTy);
13726for (unsignedI : seq<unsigned>(VL.size()))
13727if (!ShuffledElements[I])
13728Cost +=TTI->getShuffleCost(
13729TTI::SK_InsertSubvector, VecTy, std::nullopt,CostKind,
13730I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13731 }else {
13732Cost =TTI->getScalarizationOverhead(VecTy,
13733/*DemandedElts*/ ~ShuffledElements,
13734/*Insert*/true,
13735/*Extract*/false,CostKind, VL);
13736 }
13737 }
13738if (DuplicateNonConst)
13739Cost +=::getShuffleCost(*TTI,TargetTransformInfo::SK_PermuteSingleSrc,
13740 VecTy, ShuffleMask);
13741returnCost;
13742}
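// Worked example (hypothetical scalars): for VL = {%a, %b, %a, 0} with
// ForPoisonSrc == true, lanes 0 and 1 hold the unique non-constant values,
// while the repeated %a in lane 2 and the constant 0 in lane 3 are marked as
// shuffled, so the returned cost is the scalarization overhead for lanes
// {0, 1} plus one SK_PermuteSingleSrc shuffle with mask <0, 1, 0, 3>.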
13743
13744Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
13745auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
13746if (Res)
13747return *Res;
13748// Get the basic block this bundle is in. All instructions in the bundle
13749// should be in this block (except for extractelement-like instructions with
13750// constant indices or gathered loads).
13751auto *Front = E->getMainOp();
13752auto *BB = Front->getParent();
13753assert(((GatheredLoadsEntriesFirst.has_value() &&
13754 E->getOpcode() == Instruction::Load && E->isGather() &&
13755 E->Idx < *GatheredLoadsEntriesFirst) ||
13756all_of(E->Scalars,
13757 [=](Value *V) ->bool {
13758 if (E->getOpcode() == Instruction::GetElementPtr &&
13759 !isa<GetElementPtrInst>(V))
13760 return true;
13761 auto *I = dyn_cast<Instruction>(V);
13762 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13763 isVectorLikeInstWithConstOps(I);
13764 })) &&
13765"Expected gathered loads or GEPs or instructions from same basic "
13766"block.");
13767
13768auto FindLastInst = [&]() {
13769Instruction *LastInst = Front;
13770for (Value *V : E->Scalars) {
13771auto *I = dyn_cast<Instruction>(V);
13772if (!I)
13773continue;
13774if (LastInst->getParent() ==I->getParent()) {
13775if (LastInst->comesBefore(I))
13776 LastInst =I;
13777continue;
13778 }
13779assert(((E->getOpcode() == Instruction::GetElementPtr &&
13780 !isa<GetElementPtrInst>(I)) ||
13781 (isVectorLikeInstWithConstOps(LastInst) &&
13782isVectorLikeInstWithConstOps(I)) ||
13783 (GatheredLoadsEntriesFirst.has_value() &&
13784 E->getOpcode() == Instruction::Load && E->isGather() &&
13785 E->Idx < *GatheredLoadsEntriesFirst)) &&
13786"Expected vector-like or non-GEP in GEP node insts only.");
13787if (!DT->isReachableFromEntry(LastInst->getParent())) {
13788 LastInst =I;
13789continue;
13790 }
13791if (!DT->isReachableFromEntry(I->getParent()))
13792continue;
13793auto *NodeA = DT->getNode(LastInst->getParent());
13794auto *NodeB = DT->getNode(I->getParent());
13795assert(NodeA &&"Should only process reachable instructions");
13796assert(NodeB &&"Should only process reachable instructions");
13797assert((NodeA == NodeB) ==
13798 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13799"Different nodes should have different DFS numbers");
13800if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13801 LastInst =I;
13802 }
13803 BB = LastInst->getParent();
13804return LastInst;
13805 };
13806
13807auto FindFirstInst = [&]() {
13808Instruction *FirstInst = Front;
13809for (Value *V : E->Scalars) {
13810auto *I = dyn_cast<Instruction>(V);
13811if (!I)
13812continue;
13813if (FirstInst->getParent() ==I->getParent()) {
13814if (I->comesBefore(FirstInst))
13815 FirstInst =I;
13816continue;
13817 }
13818assert(((E->getOpcode() == Instruction::GetElementPtr &&
13819 !isa<GetElementPtrInst>(I)) ||
13820 (isVectorLikeInstWithConstOps(FirstInst) &&
13821isVectorLikeInstWithConstOps(I))) &&
13822"Expected vector-like or non-GEP in GEP node insts only.");
13823if (!DT->isReachableFromEntry(FirstInst->getParent())) {
13824 FirstInst =I;
13825continue;
13826 }
13827if (!DT->isReachableFromEntry(I->getParent()))
13828continue;
13829auto *NodeA = DT->getNode(FirstInst->getParent());
13830auto *NodeB = DT->getNode(I->getParent());
13831assert(NodeA &&"Should only process reachable instructions");
13832assert(NodeB &&"Should only process reachable instructions");
13833assert((NodeA == NodeB) ==
13834 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13835"Different nodes should have different DFS numbers");
13836if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13837 FirstInst =I;
13838 }
13839return FirstInst;
13840 };
13841
13842// Set insertpoint for gathered loads to the very first load.
13843if (GatheredLoadsEntriesFirst.has_value() &&
13844 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13845 E->getOpcode() == Instruction::Load) {
13846 Res = FindFirstInst();
13847return *Res;
13848 }
13849
13850// Set the insert point to the beginning of the basic block if the entry
13851// should not be scheduled.
13852if (doesNotNeedToSchedule(E->Scalars) ||
13853 (!E->isGather() &&all_of(E->Scalars,isVectorLikeInstWithConstOps))) {
13854if ((E->getOpcode() == Instruction::GetElementPtr &&
13855any_of(E->Scalars,
13856 [](Value *V) {
13857 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13858 })) ||
13859all_of(E->Scalars,
13860 [](Value *V) {
13861 return isa<PoisonValue>(V) ||
13862 (!isVectorLikeInstWithConstOps(V) &&
13863 isUsedOutsideBlock(V));
13864 }) ||
13865 (E->isGather() && E->Idx == 0 &&all_of(E->Scalars, [](Value *V) {
13866 return isa<ExtractElementInst, UndefValue>(V) ||
13867 areAllOperandsNonInsts(V);
13868 })))
13869 Res = FindLastInst();
13870else
13871 Res = FindFirstInst();
13872return *Res;
13873 }
13874
13875// Find the last instruction. The common case should be that BB has been
13876// scheduled, and the last instruction is VL.back(). So we start with
13877// VL.back() and iterate over schedule data until we reach the end of the
13878// bundle. The end of the bundle is marked by null ScheduleData.
13879if (BlocksSchedules.count(BB) && !E->isGather()) {
13880Value *V = E->isOneOf(E->Scalars.back());
13881if (doesNotNeedToBeScheduled(V))
13882V = *find_if_not(E->Scalars,doesNotNeedToBeScheduled);
13883auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13884if (Bundle && Bundle->isPartOfBundle())
13885for (; Bundle; Bundle = Bundle->NextInBundle)
13886 Res = Bundle->Inst;
13887 }
13888
13889// LastInst can still be null at this point if there's either not an entry
13890// for BB in BlocksSchedules or there's no ScheduleData available for
13891// VL.back(). This can be the case if buildTree_rec aborts for various
13892// reasons (e.g., the maximum recursion depth is reached, the maximum region
13893// size is reached, etc.). ScheduleData is initialized in the scheduling
13894// "dry-run".
13895//
13896// If this happens, we can still find the last instruction by brute force. We
13897// iterate forwards from Front (inclusive) until we either see all
13898// instructions in the bundle or reach the end of the block. If Front is the
13899// last instruction in program order, LastInst will be set to Front, and we
13900// will visit all the remaining instructions in the block.
13901//
13902// One of the reasons we exit early from buildTree_rec is to place an upper
13903// bound on compile-time. Thus, taking an additional compile-time hit here is
13904// not ideal. However, this should be exceedingly rare since it requires that
13905// we both exit early from buildTree_rec and that the bundle be out-of-order
13906// (causing us to iterate all the way to the end of the block).
13907if (!Res)
13908 Res = FindLastInst();
13909assert(Res &&"Failed to find last instruction in bundle");
13910return *Res;
13911}
13912
13913void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
13914auto *Front = E->getMainOp();
13915Instruction *LastInst = &getLastInstructionInBundle(E);
13916assert(LastInst &&"Failed to find last instruction in bundle");
13917BasicBlock::iterator LastInstIt = LastInst->getIterator();
13918// If the instruction is PHI, set the insert point after all the PHIs.
13919bool IsPHI = isa<PHINode>(LastInst);
13920if (IsPHI)
13921 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
13922if (IsPHI || (!E->isGather() &&doesNotNeedToSchedule(E->Scalars))) {
13923 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
13924 }else {
13925// Set the insertion point after the last instruction in the bundle. Set the
13926// debug location to Front.
13927 Builder.SetInsertPoint(
13928 LastInst->getParent(),
13929 LastInst->getNextNonDebugInstruction()->getIterator());
13930 }
13931 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13932}
13933
13934Value *BoUpSLP::gather(
13935ArrayRef<Value *> VL,Value *Root,Type *ScalarTy,
13936function_ref<Value *(Value *,Value *,ArrayRef<int>)> CreateShuffle) {
13937// List of instructions/lanes from the current block and/or the blocks which
13938// are part of the current loop. These instructions will be inserted at the
13939// end to make it possible to optimize loops and hoist invariant instructions
13940// out of the loop's body with better chances for success.
13941SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
13942SmallSet<int, 4> PostponedIndices;
13943Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
13944auto &&CheckPredecessor = [](BasicBlock *InstBB,BasicBlock *InsertBB) {
13945SmallPtrSet<BasicBlock *, 4> Visited;
13946while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
13947 InsertBB = InsertBB->getSinglePredecessor();
13948return InsertBB && InsertBB == InstBB;
13949 };
13950for (intI = 0, E = VL.size();I < E; ++I) {
13951if (auto *Inst = dyn_cast<Instruction>(VL[I]))
13952if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13953 getTreeEntry(Inst) ||
13954 (L && (!Root ||L->isLoopInvariant(Root)) &&L->contains(Inst))) &&
13955 PostponedIndices.insert(I).second)
13956 PostponedInsts.emplace_back(Inst,I);
13957 }
13958
13959auto &&CreateInsertElement = [this](Value *Vec,Value *V,unsigned Pos,
13960Type *Ty) {
13961Value *Scalar =V;
13962if (Scalar->getType() != Ty) {
13963assert(Scalar->getType()->isIntOrIntVectorTy() &&
13964 Ty->isIntOrIntVectorTy() &&"Expected integer types only.");
13965Value *V =Scalar;
13966if (auto *CI = dyn_cast<CastInst>(Scalar);
13967 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13968Value *Op = CI->getOperand(0);
13969if (auto *IOp = dyn_cast<Instruction>(Op);
13970 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
13971V =Op;
13972 }
13973Scalar = Builder.CreateIntCast(
13974 V, Ty, !isKnownNonNegative(Scalar,SimplifyQuery(*DL)));
13975 }
13976
13977Instruction *InsElt;
13978if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
13979assert(SLPReVec &&"FixedVectorType is not expected.");
13980 Vec =
13981createInsertVector(Builder, Vec, Scalar, Pos *getNumElements(VecTy));
13982auto *II = dyn_cast<IntrinsicInst>(Vec);
13983if (!II ||II->getIntrinsicID() != Intrinsic::vector_insert)
13984return Vec;
13985 InsElt =II;
13986 }else {
13987 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13988 InsElt = dyn_cast<InsertElementInst>(Vec);
13989if (!InsElt)
13990return Vec;
13991 }
13992 GatherShuffleExtractSeq.insert(InsElt);
13993 CSEBlocks.insert(InsElt->getParent());
13994// Add to our 'need-to-extract' list.
13995if (isa<Instruction>(V)) {
13996if (TreeEntry *Entry = getTreeEntry(V)) {
13997// Find which lane we need to extract.
13998User *UserOp =nullptr;
13999if (Scalar != V) {
14000if (auto *SI = dyn_cast<Instruction>(Scalar))
14001 UserOp =SI;
14002 }else {
14003 UserOp = InsElt;
14004 }
14005if (UserOp) {
14006unsigned FoundLane =Entry->findLaneForValue(V);
14007 ExternalUses.emplace_back(V, UserOp, FoundLane);
14008 }
14009 }
14010 }
14011return Vec;
14012 };
14013auto *VecTy =getWidenedType(ScalarTy, VL.size());
14014Value *Vec =PoisonValue::get(VecTy);
14015SmallVector<int> NonConsts;
14016SmallVector<int>Mask(VL.size());
14017 std::iota(Mask.begin(),Mask.end(), 0);
14018Value *OriginalRoot = Root;
14019if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
14020 SV && isa<PoisonValue>(SV->getOperand(1)) &&
14021 SV->getOperand(0)->getType() == VecTy) {
14022 Root = SV->getOperand(0);
14023Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
14024 }
14025// Insert constant values at first.
14026for (intI = 0, E = VL.size();I < E; ++I) {
14027if (PostponedIndices.contains(I))
14028continue;
14029if (!isConstant(VL[I])) {
14030 NonConsts.push_back(I);
14031continue;
14032 }
14033if (isa<PoisonValue>(VL[I]))
14034continue;
14035 Vec = CreateInsertElement(Vec, VL[I],I, ScalarTy);
14036Mask[I] =I + E;
14037 }
14038if (Root) {
14039if (isa<PoisonValue>(Vec)) {
14040 Vec = OriginalRoot;
14041 }else {
14042 Vec = CreateShuffle(Root, Vec, Mask);
14043if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
14044 OI && OI->hasNUses(0) &&
14045none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14046returnTE->VectorizedValue == OI;
14047 }))
14048eraseInstruction(OI);
14049 }
14050 }
14051// Insert non-constant values.
14052for (intI : NonConsts)
14053 Vec = CreateInsertElement(Vec, VL[I],I, ScalarTy);
14054// Append instructions which are (or may be) part of the loop at the end, to
14055// make it possible to hoist non-loop-based instructions.
14056for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
14057 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
14058
14059return Vec;
14060}
14061
14062/// Merges shuffle masks and emits the final shuffle instruction, if required.
14063/// It supports shuffling of 2 input vectors. It implements lazy shuffle
14064/// emission: the actual shuffle instruction is generated only if it is really
14065/// required. Otherwise, the shuffle instruction emission is delayed till the
14066/// end of the process, to reduce the number of emitted instructions and
14067/// further analysis/transformations.
14068/// The class will also look through the previously emitted shuffle
14069/// instructions and properly mark indices in the mask as undef.
14070/// For example, given the code
14071/// \code
14072/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
14073/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
14074/// \endcode
14075/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
14076/// look through %s1 and %s2 and emit
14077/// \code
14078/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14079/// \endcode
14080/// instead.
14081/// If 2 operands are of different size, the smallest one will be resized and
14082/// the mask recalculated properly.
14083/// For example, given the code
14084/// \code
14085/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
14086/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
14087/// \endcode
14088/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
14089/// look through %s1 and %s2 and emit
14090/// \code
14091/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14092/// \endcode
14093/// instead.
14094classBoUpSLP::ShuffleInstructionBuilder final :public BaseShuffleAnalysis {
14095bool IsFinalized =false;
14096 /// Combined mask for all applied operands and masks. It is built during
14097 /// analysis and actual emission of shuffle vector instructions.
14098SmallVector<int> CommonMask;
14099 /// List of operands for the shuffle vector instruction. It holds at most 2
14100 /// operands; if a 3rd one is going to be added, the first 2 are combined
14101 /// into a shuffle with the \p CommonMask mask, the first operand is set to
14102 /// the resulting shuffle and the second operand is set to the newly added
14103 /// operand. The \p CommonMask is transformed in the proper way after that.
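 /// For illustration (a hypothetical state): if InVectors already holds
 /// {V1, V2} with CommonMask <0, 5, 2, 7> and a third operand is added, the
 /// builder first emits shufflevector(V1, V2, <0, 5, 2, 7>), keeps that result
 /// as the single first operand, rewrites CommonMask accordingly and only then
 /// records the new operand as the second input.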
14104SmallVector<Value *, 2> InVectors;
14105IRBuilderBase &Builder;
14106BoUpSLP &R;
14107
14108classShuffleIRBuilder {
14109IRBuilderBase &Builder;
14110 /// Holds all of the instructions that we gathered.
14111SetVector<Instruction *> &GatherShuffleExtractSeq;
14112 /// A list of blocks that we are going to CSE.
14113DenseSet<BasicBlock *> &CSEBlocks;
14114 /// Data layout.
14115constDataLayout &DL;
14116
14117public:
14118 ShuffleIRBuilder(IRBuilderBase &Builder,
14119SetVector<Instruction *> &GatherShuffleExtractSeq,
14120DenseSet<BasicBlock *> &CSEBlocks,constDataLayout &DL)
14121 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14122 CSEBlocks(CSEBlocks),DL(DL) {}
14123 ~ShuffleIRBuilder() =default;
14124 /// Creates shufflevector for the 2 operands with the given mask.
14125Value *createShuffleVector(Value *V1,Value *V2,ArrayRef<int> Mask) {
14126if (V1->getType() != V2->getType()) {
14127assert(V1->getType()->isIntOrIntVectorTy() &&
14128 V1->getType()->isIntOrIntVectorTy() &&
14129"Expected integer vector types only.");
14130if (V1->getType() != V2->getType()) {
14131if (cast<VectorType>(V2->getType())
14132 ->getElementType()
14133 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
14134 ->getElementType()
14135 ->getIntegerBitWidth())
14136 V2 = Builder.CreateIntCast(
14137 V2, V1->getType(), !isKnownNonNegative(V2,SimplifyQuery(DL)));
14138else
14139 V1 = Builder.CreateIntCast(
14140 V1, V2->getType(), !isKnownNonNegative(V1,SimplifyQuery(DL)));
14141 }
14142 }
14143Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
14144if (auto *I = dyn_cast<Instruction>(Vec)) {
14145 GatherShuffleExtractSeq.insert(I);
14146 CSEBlocks.insert(I->getParent());
14147 }
14148return Vec;
14149 }
14150 /// Creates permutation of the single vector operand with the given mask, if
14151 /// it is not identity mask.
14152Value *createShuffleVector(Value *V1,ArrayRef<int> Mask) {
14153if (Mask.empty())
14154return V1;
14155unsigned VF = Mask.size();
14156unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
14157if (VF == LocalVF &&ShuffleVectorInst::isIdentityMask(Mask, VF))
14158return V1;
14159Value *Vec = Builder.CreateShuffleVector(V1, Mask);
14160if (auto *I = dyn_cast<Instruction>(Vec)) {
14161 GatherShuffleExtractSeq.insert(I);
14162 CSEBlocks.insert(I->getParent());
14163 }
14164return Vec;
14165 }
14166Value *createIdentity(Value *V) {return V; }
14167Value *createPoison(Type *Ty,unsigned VF) {
14168returnPoisonValue::get(getWidenedType(Ty, VF));
14169 }
14170 /// Resizes 2 input vectors to match their sizes, if they are not equal
14171 /// yet. The smallest vector is resized to the size of the larger vector.
14172void resizeToMatch(Value *&V1,Value *&V2) {
14173if (V1->getType() == V2->getType())
14174return;
14175int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14176int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14177int VF = std::max(V1VF, V2VF);
14178int MinVF = std::min(V1VF, V2VF);
14179SmallVector<int> IdentityMask(VF,PoisonMaskElem);
14180 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
14181 0);
14182Value *&Op = MinVF == V1VF ? V1 : V2;
14183Op = Builder.CreateShuffleVector(Op, IdentityMask);
14184if (auto *I = dyn_cast<Instruction>(Op)) {
14185 GatherShuffleExtractSeq.insert(I);
14186 CSEBlocks.insert(I->getParent());
14187 }
14188if (MinVF == V1VF)
14189 V1 =Op;
14190else
14191 V2 =Op;
14192 }
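    // For example (hypothetical operands): if V1 is <2 x i32> and V2 is
    // <4 x i32>, V1 is widened with the identity mask <0, 1, poison, poison>
    // so that both operands become <4 x i32> before the two-source shuffle.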
14193 };
14194
14195 /// Smart shuffle instruction emission, walks through the shuffle trees and
14196 /// tries to find the best matching vector for the actual shuffle
14197 /// instruction.
14198Value *createShuffle(Value *V1,Value *V2,ArrayRef<int> Mask) {
14199assert(V1 &&"Expected at least one vector value.");
14200 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14201 R.CSEBlocks, *R.DL);
14202return BaseShuffleAnalysis::createShuffle<Value *>(
14203 V1, V2, Mask, ShuffleBuilder, ScalarTy);
14204 }
14205
14206 /// Cast value \p V to the vector type with the same number of elements, but
14207 /// the base type \p ScalarTy.
14208Value *castToScalarTyElem(Value *V,
14209 std::optional<bool> IsSigned = std::nullopt) {
14210auto *VecTy = cast<VectorType>(V->getType());
14211assert(getNumElements(VecTy) %getNumElements(ScalarTy) == 0);
14212if (VecTy->getElementType() == ScalarTy->getScalarType())
14213return V;
14214return Builder.CreateIntCast(
14215 V,VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
14216 IsSigned.value_or(!isKnownNonNegative(V,SimplifyQuery(*R.DL))));
14217 }
14218
14219public:
14220ShuffleInstructionBuilder(Type *ScalarTy,IRBuilderBase &Builder,BoUpSLP &R)
14221 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14222
14223 /// Adjusts extractelements after reusing them.
14224Value *adjustExtracts(const TreeEntry *E,MutableArrayRef<int> Mask,
14225ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14226unsigned NumParts,bool &UseVecBaseAsInput) {
14227 UseVecBaseAsInput =false;
14228SmallPtrSet<Value *, 4> UniqueBases;
14229Value *VecBase =nullptr;
14230SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14231if (!E->ReorderIndices.empty()) {
14232SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14233 E->ReorderIndices.end());
14234reorderScalars(VL, ReorderMask);
14235 }
14236for (intI = 0, Sz = Mask.size();I < Sz; ++I) {
14237intIdx = Mask[I];
14238if (Idx ==PoisonMaskElem)
14239continue;
14240auto *EI = cast<ExtractElementInst>(VL[I]);
14241 VecBase = EI->getVectorOperand();
14242if (const TreeEntry *TE = R.getTreeEntry(VecBase))
14243 VecBase = TE->VectorizedValue;
14244assert(VecBase &&"Expected vectorized value.");
14245 UniqueBases.insert(VecBase);
14246// If the only use is vectorized, we can delete the extractelement
14247// itself.
14248if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14249 (NumParts != 1 &&count(VL, EI) > 1) ||
14250any_of(EI->users(), [&](User *U) {
14251 const TreeEntry *UTE = R.getTreeEntry(U);
14252 return !UTE || R.MultiNodeScalars.contains(U) ||
14253 (isa<GetElementPtrInst>(U) &&
14254 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14255 count_if(R.VectorizableTree,
14256 [&](const std::unique_ptr<TreeEntry> &TE) {
14257 return any_of(TE->UserTreeIndices,
14258 [&](const EdgeInfo &Edge) {
14259 return Edge.UserTE == UTE;
14260 }) &&
14261 is_contained(VL, EI);
14262 }) != 1;
14263 }))
14264continue;
14265 R.eraseInstruction(EI);
14266 }
14267if (NumParts == 1 || UniqueBases.size() == 1) {
14268assert(VecBase &&"Expected vectorized value.");
14269return castToScalarTyElem(VecBase);
14270 }
14271 UseVecBaseAsInput =true;
14272auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
14273for (auto [I,Idx] :enumerate(Mask))
14274if (Idx !=PoisonMaskElem)
14275Idx =I;
14276 };
14277// Perform a multi-register vector shuffle, joining the parts into a single
14278// virtual long vector.
14279// Each part needs to be shuffled independently and then all these parts are
14280// inserted into a long virtual vector register, forming the original vector.
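    // Rough illustration (hypothetical masks, both bases <4 x float>): with
    // NumParts == 2, part 0 may produce SubMask <1, 0, poison, 3> over base %A
    // and part 1 SubMask <0, 2, 1, poison> over base %B; the second part's
    // indices are offset by the width of the already built vector, giving a
    // combined VecMask <1, 0, poison, 3, 4, 6, 5, poison> for a single shuffle
    // of %A and %B.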
14281Value *Vec =nullptr;
14282SmallVector<int> VecMask(Mask.size(),PoisonMaskElem);
14283unsigned SliceSize =getPartNumElems(VL.size(), NumParts);
14284for (unsigned Part : seq<unsigned>(NumParts)) {
14285unsigned Limit =getNumElems(VL.size(), SliceSize, Part);
14286ArrayRef<Value *> SubVL =ArrayRef(VL).slice(Part * SliceSize, Limit);
14287MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14288constexprint MaxBases = 2;
14289SmallVector<Value *, MaxBases> Bases(MaxBases);
14290auto VLMask =zip(SubVL, SubMask);
14291constunsigned VF = std::accumulate(
14292 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S,constauto &D) {
14293 if (std::get<1>(D) == PoisonMaskElem)
14294 return S;
14295 Value *VecOp =
14296 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14297 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14298 VecOp = TE->VectorizedValue;
14299 assert(VecOp &&"Expected vectorized value.");
14300 const unsigned Size =
14301 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14302 return std::max(S, Size);
14303 });
14304for (constauto [V,I] : VLMask) {
14305if (I ==PoisonMaskElem)
14306continue;
14307Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14308if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14309 VecOp = TE->VectorizedValue;
14310assert(VecOp &&"Expected vectorized value.");
14311 VecOp = castToScalarTyElem(VecOp);
14312 Bases[I / VF] = VecOp;
14313 }
14314if (!Bases.front())
14315continue;
14316Value *SubVec;
14317if (Bases.back()) {
14318 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14319 TransformToIdentity(SubMask);
14320 }else {
14321 SubVec = Bases.front();
14322 }
14323if (!Vec) {
14324 Vec = SubVec;
14325assert((Part == 0 ||all_of(seq<unsigned>(0, Part),
14326 [&](unsignedP) {
14327ArrayRef<int> SubMask =
14328Mask.slice(P * SliceSize,
14329getNumElems(Mask.size(),
14330 SliceSize,P));
14331returnall_of(SubMask, [](intIdx) {
14332returnIdx ==PoisonMaskElem;
14333 });
14334 })) &&
14335"Expected first part or all previous parts masked.");
14336copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14337 }else {
14338unsigned NewVF =
14339 cast<FixedVectorType>(Vec->getType())->getNumElements();
14340if (Vec->getType() != SubVec->getType()) {
14341unsigned SubVecVF =
14342 cast<FixedVectorType>(SubVec->getType())->getNumElements();
14343 NewVF = std::max(NewVF, SubVecVF);
14344 }
14345// Adjust SubMask.
14346for (int &Idx : SubMask)
14347if (Idx !=PoisonMaskElem)
14348Idx += NewVF;
14349copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14350 Vec = createShuffle(Vec, SubVec, VecMask);
14351 TransformToIdentity(VecMask);
14352 }
14353 }
14354copy(VecMask,Mask.begin());
14355return Vec;
14356 }
14357 /// Checks if the specified entry \p E needs to be delayed because of its
14358 /// dependency nodes.
14359 std::optional<Value *>
14360needToDelay(const TreeEntry *E,
14361ArrayRef<SmallVector<const TreeEntry *>> Deps) const{
14362// No need to delay emission if all deps are ready.
14363if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
14364returnall_of(
14365 TEs, [](const TreeEntry *TE) {return TE->VectorizedValue; });
14366 }))
14367return std::nullopt;
14368// Postpone gather emission, will be emitted after the end of the
14369// process to keep correct order.
14370auto *ResVecTy =getWidenedType(ScalarTy, E->getVectorFactor());
14371return Builder.CreateAlignedLoad(
14372 ResVecTy,
14373PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
14374MaybeAlign());
14375 }
14376 /// Adds 2 input vectors (in form of tree entries) and the mask for their
14377 /// shuffling.
14378voidadd(const TreeEntry &E1,const TreeEntry &E2,ArrayRef<int> Mask) {
14379Value *V1 = E1.VectorizedValue;
14380if (V1->getType()->isIntOrIntVectorTy())
14381 V1 = castToScalarTyElem(V1,any_of(E1.Scalars, [&](Value *V) {
14382 if (isa<PoisonValue>(V))
14383 return false;
14384 return !isKnownNonNegative(
14385 V, SimplifyQuery(*R.DL));
14386 }));
14387Value *V2 = E2.VectorizedValue;
14388if (V2->getType()->isIntOrIntVectorTy())
14389 V2 = castToScalarTyElem(V2,any_of(E2.Scalars, [&](Value *V) {
14390 if (isa<PoisonValue>(V))
14391 return false;
14392 return !isKnownNonNegative(
14393 V, SimplifyQuery(*R.DL));
14394 }));
14395 add(V1, V2, Mask);
14396 }
14397 /// Adds single input vector (in form of tree entry) and the mask for its
14398 /// shuffling.
14399voidadd(const TreeEntry &E1,ArrayRef<int> Mask) {
14400Value *V1 = E1.VectorizedValue;
14401if (V1->getType()->isIntOrIntVectorTy())
14402 V1 = castToScalarTyElem(V1,any_of(E1.Scalars, [&](Value *V) {
14403 if (isa<PoisonValue>(V))
14404 return false;
14405 return !isKnownNonNegative(
14406 V, SimplifyQuery(*R.DL));
14407 }));
14408 add(V1, Mask);
14409 }
14410 /// Adds 2 input vectors and the mask for their shuffling.
14411voidadd(Value *V1,Value *V2,ArrayRef<int> Mask) {
14412assert(V1 && V2 && !Mask.empty() &&"Expected non-empty input vectors.");
14413assert(isa<FixedVectorType>(V1->getType()) &&
14414 isa<FixedVectorType>(V2->getType()) &&
14415"castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14416 V1 = castToScalarTyElem(V1);
14417 V2 = castToScalarTyElem(V2);
14418if (InVectors.empty()) {
14419 InVectors.push_back(V1);
14420 InVectors.push_back(V2);
14421 CommonMask.assign(Mask.begin(), Mask.end());
14422return;
14423 }
14424Value *Vec = InVectors.front();
14425if (InVectors.size() == 2) {
14426 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14427 transformMaskAfterShuffle(CommonMask, CommonMask);
14428 }elseif (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
14429 Mask.size()) {
14430 Vec = createShuffle(Vec,nullptr, CommonMask);
14431 transformMaskAfterShuffle(CommonMask, CommonMask);
14432 }
14433 V1 = createShuffle(V1, V2, Mask);
14434unsigned VF = std::max(getVF(V1), getVF(Vec));
14435for (unsignedIdx = 0, Sz = CommonMask.size();Idx < Sz; ++Idx)
14436if (Mask[Idx] !=PoisonMaskElem)
14437 CommonMask[Idx] =Idx + VF;
14438 InVectors.front() = Vec;
14439if (InVectors.size() == 2)
14440 InVectors.back() = V1;
14441else
14442 InVectors.push_back(V1);
14443 }
14444 /// Adds another one input vector and the mask for the shuffling.
14445voidadd(Value *V1,ArrayRef<int> Mask,bool =false) {
14446assert(isa<FixedVectorType>(V1->getType()) &&
14447"castToScalarTyElem expects V1 to be FixedVectorType");
14448 V1 = castToScalarTyElem(V1);
14449if (InVectors.empty()) {
14450 InVectors.push_back(V1);
14451 CommonMask.assign(Mask.begin(), Mask.end());
14452return;
14453 }
14454constauto *It =find(InVectors, V1);
14455if (It == InVectors.end()) {
14456if (InVectors.size() == 2 ||
14457 InVectors.front()->getType() != V1->getType()) {
14458Value *V = InVectors.front();
14459if (InVectors.size() == 2) {
14460 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14461 transformMaskAfterShuffle(CommonMask, CommonMask);
14462 }elseif (cast<FixedVectorType>(V->getType())->getNumElements() !=
14463 CommonMask.size()) {
14464 V = createShuffle(InVectors.front(),nullptr, CommonMask);
14465 transformMaskAfterShuffle(CommonMask, CommonMask);
14466 }
14467unsigned VF = std::max(CommonMask.size(), Mask.size());
14468for (unsignedIdx = 0, Sz = CommonMask.size();Idx < Sz; ++Idx)
14469if (CommonMask[Idx] ==PoisonMaskElem && Mask[Idx] !=PoisonMaskElem)
14470 CommonMask[Idx] =
14471 V->getType() != V1->getType()
14472 ?Idx + VF
14473 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
14474 ->getNumElements();
14475if (V->getType() != V1->getType())
14476 V1 = createShuffle(V1,nullptr, Mask);
14477 InVectors.front() = V;
14478if (InVectors.size() == 2)
14479 InVectors.back() = V1;
14480else
14481 InVectors.push_back(V1);
14482return;
14483 }
14484      // Check if the second vector is actually required, i.e. whether it
14485      // supplies any elements that are not already taken from the first one.
14486for (unsignedIdx = 0, Sz = CommonMask.size();Idx < Sz; ++Idx)
14487if (Mask[Idx] !=PoisonMaskElem && CommonMask[Idx] ==PoisonMaskElem) {
14488 InVectors.push_back(V1);
14489break;
14490 }
14491 }
14492unsigned VF = 0;
14493for (Value *V : InVectors)
14494 VF = std::max(VF, getVF(V));
14495for (unsignedIdx = 0, Sz = CommonMask.size();Idx < Sz; ++Idx)
14496if (Mask[Idx] !=PoisonMaskElem && CommonMask[Idx] ==PoisonMaskElem)
14497 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
14498 }
14499  /// Adds one more input vector and the order in which it should be shuffled.
14500  void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
14501SmallVector<int> NewMask;
14502inversePermutation(Order, NewMask);
14503 add(V1, NewMask);
14504 }
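  // For illustration (assumed values, not from the source): given
  // Order == {2, 0, 1}, inversePermutation produces NewMask == {1, 2, 0}, so
  // source lane I of V1 ends up at result position Order[I].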
14505  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14506                Value *Root = nullptr) {
14507return R.gather(VL, Root, ScalarTy,
14508 [&](Value *V1,Value *V2,ArrayRef<int> Mask) {
14509return createShuffle(V1, V2, Mask);
14510 });
14511 }
14512  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
14513  /// Finalizes the emission of the shuffles.
14514  /// \param Action the action (if any) to be performed before the final
14515  /// application of the \p ExtMask mask.
14516  Value *
14517  finalize(ArrayRef<int> ExtMask,
14518           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14519           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14520           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
14521 IsFinalized =true;
14522if (Action) {
14523Value *Vec = InVectors.front();
14524if (InVectors.size() == 2) {
14525 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14526 InVectors.pop_back();
14527 }else {
14528 Vec = createShuffle(Vec,nullptr, CommonMask);
14529 }
14530 transformMaskAfterShuffle(CommonMask, CommonMask);
14531assert(VF > 0 &&
14532"Expected vector length for the final value before action.");
14533unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14534if (VecVF < VF) {
14535SmallVector<int> ResizeMask(VF,PoisonMaskElem);
14536 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14537 Vec = createShuffle(Vec,nullptr, ResizeMask);
14538 }
14539 Action(Vec, CommonMask);
14540 InVectors.front() = Vec;
14541 }
14542if (!SubVectors.empty()) {
14543Value *Vec = InVectors.front();
14544if (InVectors.size() == 2) {
14545 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14546 InVectors.pop_back();
14547 }else {
14548 Vec = createShuffle(Vec,nullptr, CommonMask);
14549 }
14550 transformMaskAfterShuffle(CommonMask, CommonMask);
14551auto CreateSubVectors = [&](Value *Vec,
14552SmallVectorImpl<int> &CommonMask) {
14553for (auto [E,Idx] : SubVectors) {
14554Value *V = E->VectorizedValue;
14555if (V->getType()->isIntOrIntVectorTy())
14556 V = castToScalarTyElem(V,any_of(E->Scalars, [&](Value *V) {
14557 if (isa<PoisonValue>(V))
14558 return false;
14559 return !isKnownNonNegative(
14560 V, SimplifyQuery(*R.DL));
14561 }));
14562unsigned InsertionIndex =Idx *getNumElements(ScalarTy);
14563 Vec =createInsertVector(
14564 Builder, Vec, V, InsertionIndex,
14565 std::bind(&ShuffleInstructionBuilder::createShuffle,this, _1, _2,
14566 _3));
14567if (!CommonMask.empty()) {
14568 std::iota(std::next(CommonMask.begin(),Idx),
14569 std::next(CommonMask.begin(),Idx + E->getVectorFactor()),
14570Idx);
14571 }
14572 }
14573return Vec;
14574 };
14575if (SubVectorsMask.empty()) {
14576 Vec = CreateSubVectors(Vec, CommonMask);
14577 }else {
14578SmallVector<int> SVMask(CommonMask.size(),PoisonMaskElem);
14579copy(SubVectorsMask, SVMask.begin());
14580for (auto [I1, I2] :zip(SVMask, CommonMask)) {
14581if (I2 !=PoisonMaskElem) {
14582assert(I1 ==PoisonMaskElem &&"Expected unused subvectors mask");
14583I1 = I2 + CommonMask.size();
14584 }
14585 }
14586Value *InsertVec =
14587 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
14588 Vec = createShuffle(InsertVec, Vec, SVMask);
14589 transformMaskAfterShuffle(CommonMask, SVMask);
14590 }
14591 InVectors.front() = Vec;
14592 }
14593
14594if (!ExtMask.empty()) {
14595if (CommonMask.empty()) {
14596 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14597 }else {
14598SmallVector<int> NewMask(ExtMask.size(),PoisonMaskElem);
14599for (intI = 0, Sz = ExtMask.size();I < Sz; ++I) {
14600if (ExtMask[I] ==PoisonMaskElem)
14601continue;
14602 NewMask[I] = CommonMask[ExtMask[I]];
14603 }
14604 CommonMask.swap(NewMask);
14605 }
14606 }
14607if (CommonMask.empty()) {
14608assert(InVectors.size() == 1 &&"Expected only one vector with no mask");
14609return InVectors.front();
14610 }
14611if (InVectors.size() == 2)
14612return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14613return createShuffle(InVectors.front(),nullptr, CommonMask);
14614 }
14615
14616~ShuffleInstructionBuilder() {
14617assert((IsFinalized || CommonMask.empty()) &&
14618"Shuffle construction must be finalized.");
14619 }
14620};
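// A minimal usage sketch of the builder above (illustration only, not part of
// the pass source): callers construct it with the scalar type, feed one or two
// inputs with their masks, and call finalize() exactly once, e.g.
//   ShuffleInstructionBuilder SB(ScalarTy, Builder, *this);
//   SB.add(Vec0, Mask0);
//   SB.add(Vec1, Mask1);
//   Value *Res = SB.finalize(ReuseShuffleIndices, SubVectors, SubVectorsMask);
// The destructor asserts that finalize() was called whenever a mask was
// accumulated.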
14621
14622 BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
14623                                                          unsigned NodeIdx) {
14624ArrayRef<Value *> VL = E->getOperand(NodeIdx);
14625 InstructionsState S =getSameOpcode(VL, *TLI);
14626  // Special processing for a GEP bundle, which may include non-GEP values.
14627  if (!S && VL.front()->getType()->isPointerTy()) {
14628    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
14629    if (It != VL.end())
14630      S = getSameOpcode(*It, *TLI);
14631  }
14632  if (!S)
14633    return nullptr;
14634auto CheckSameVE = [&](const TreeEntry *VE) {
14635return VE->isSame(VL) &&
14636 (any_of(VE->UserTreeIndices,
14637 [E, NodeIdx](const EdgeInfo &EI) {
14638 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14639 }) ||
14640any_of(VectorizableTree,
14641 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
14642return TE->isOperandGatherNode(
14643 {const_cast<TreeEntry *>(E), NodeIdx}) &&
14644 VE->isSame(TE->Scalars);
14645 }));
14646 };
14647 TreeEntry *VE = getTreeEntry(S.getMainOp());
14648if (VE && CheckSameVE(VE))
14649return VE;
14650auto It = MultiNodeScalars.find(S.getMainOp());
14651if (It != MultiNodeScalars.end()) {
14652auto *I =find_if(It->getSecond(), [&](const TreeEntry *TE) {
14653 return TE != VE && CheckSameVE(TE);
14654 });
14655if (I != It->getSecond().end())
14656return *I;
14657 }
14658returnnullptr;
14659}
14660
14661 Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
14662                                  bool PostponedPHIs) {
14663ValueList &VL = E->getOperand(NodeIdx);
14664constunsigned VF = VL.size();
14665if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14666auto FinalShuffle = [&](Value *V,ArrayRef<int>Mask) {
14667// V may be affected by MinBWs.
14668// We want ShuffleInstructionBuilder to correctly support REVEC. The key
14669// factor is the number of elements, not their type.
14670Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
14671unsigned NumElements =getNumElements(VL.front()->getType());
14672 ShuffleInstructionBuilder ShuffleBuilder(
14673 NumElements != 1 ?FixedVectorType::get(ScalarTy, NumElements)
14674 : ScalarTy,
14675 Builder, *this);
14676 ShuffleBuilder.add(V, Mask);
14677SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14678 E->CombinedEntriesWithIndices.size());
14679transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14680 [&](constauto &P) {
14681 return std::make_pair(VectorizableTree[P.first].get(),
14682 P.second);
14683 });
14684assert((E->CombinedEntriesWithIndices.empty() ||
14685 E->ReorderIndices.empty()) &&
14686"Expected either combined subnodes or reordering");
14687return ShuffleBuilder.finalize({}, SubVectors, {});
14688 };
14689Value *V =vectorizeTree(VE, PostponedPHIs);
14690if (VF *getNumElements(VL[0]->getType()) !=
14691 cast<FixedVectorType>(V->getType())->getNumElements()) {
14692if (!VE->ReuseShuffleIndices.empty()) {
14693// Reshuffle to get only unique values.
14694// If some of the scalars are duplicated in the vectorization
14695// tree entry, we do not vectorize them but instead generate a
14696// mask for the reuses. But if there are several users of the
14697// same entry, they may have different vectorization factors.
14698// This is especially important for PHI nodes. In this case, we
14699// need to adapt the resulting instruction for the user
14700// vectorization factor and have to reshuffle it again to take
14701        // only unique elements of the vector. Without this code the
14702        // function would incorrectly return a reduced vector instruction
14703        // with repeated elements instead of only the unique ones.
14704
14705// block:
14706// %phi = phi <2 x > { .., %entry} {%shuffle, %block}
14707// %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
14708// ... (use %2)
14709// %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
14710// br %block
14711SmallVector<int>Mask(VF,PoisonMaskElem);
14712for (auto [I, V] :enumerate(VL)) {
14713if (isa<PoisonValue>(V))
14714continue;
14715Mask[I] = VE->findLaneForValue(V);
14716 }
14717V = FinalShuffle(V, Mask);
14718 }else {
14719assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
14720"Expected vectorization factor less "
14721"than original vector size.");
14722SmallVector<int> UniformMask(VF, 0);
14723 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14724V = FinalShuffle(V, UniformMask);
14725 }
14726 }
14727    // Need to update the operand gather node if the operand is actually not a
14728    // vectorized node but a buildvector/gather node that matches one of the
14729    // vectorized nodes.
14730if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
14731 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14732 }) == VE->UserTreeIndices.end()) {
14733auto *It =
14734find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14735returnTE->isGather() &&TE->UserTreeIndices.front().UserTE == E &&
14736TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14737 });
14738assert(It != VectorizableTree.end() &&"Expected gather node operand.");
14739 (*It)->VectorizedValue =V;
14740 }
14741returnV;
14742 }
14743
14744  // Find the corresponding gather entry and vectorize it.
14745  // This allows being more accurate with the tree/graph transformations and
14746  // checks the correctness of the transformations in many cases.
14747auto *I =find_if(VectorizableTree,
14748 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
14749returnTE->isOperandGatherNode({E, NodeIdx});
14750 });
14751assert(I != VectorizableTree.end() &&"Gather node is not in the graph.");
14752assert(I->get()->UserTreeIndices.size() == 1 &&
14753"Expected only single user for the gather node.");
14754assert(I->get()->isSame(VL) &&"Expected same list of scalars.");
14755returnvectorizeTree(I->get(), PostponedPHIs);
14756}
14757
14758 template <typename BVTy, typename ResTy, typename... Args>
14759 ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
14760                                   Args &...Params) {
14761assert(E->isGather() &&"Expected gather node.");
14762unsigned VF = E->getVectorFactor();
14763
14764bool NeedFreeze =false;
14765SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
14766 E->ReuseShuffleIndices.end());
14767SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
14768// Clear values, to be replaced by insertvector instructions.
14769for (auto [EIdx,Idx] : E->CombinedEntriesWithIndices)
14770for_each(MutableArrayRef(GatheredScalars)
14771 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
14772 [&](Value *&V) {V =PoisonValue::get(V->getType()); });
14773SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14774 E->CombinedEntriesWithIndices.size());
14775transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14776 [&](constauto &P) {
14777 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14778 });
14779// Build a mask out of the reorder indices and reorder scalars per this
14780// mask.
14781SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14782 E->ReorderIndices.end());
14783if (!ReorderMask.empty())
14784reorderScalars(GatheredScalars, ReorderMask);
14785SmallVector<int> SubVectorsMask;
14786inversePermutation(E->ReorderIndices, SubVectorsMask);
14787// Transform non-clustered elements in the mask to poison (-1).
14788// "Clustered" operations will be reordered using this mask later.
14789if (!SubVectors.empty() && !SubVectorsMask.empty()) {
14790for (unsignedI : seq<unsigned>(GatheredScalars.size()))
14791if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
14792 SubVectorsMask[ReorderMask[I]] =PoisonMaskElem;
14793 }else {
14794 SubVectorsMask.clear();
14795 }
14796SmallVector<Value *> StoredGS(GatheredScalars);
14797auto FindReusedSplat = [&](MutableArrayRef<int>Mask,unsigned InputVF,
14798unsignedI,unsigned SliceSize,
14799bool IsNotPoisonous) {
14800if (!isSplat(E->Scalars) ||none_of(E->Scalars, [](Value *V) {
14801 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14802 }))
14803returnfalse;
14804 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14805unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14806if (UserTE->getNumOperands() != 2)
14807returnfalse;
14808if (!IsNotPoisonous) {
14809auto *It =
14810find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
14811returnfind_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
14812 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14813 }) !=TE->UserTreeIndices.end();
14814 });
14815if (It == VectorizableTree.end())
14816returnfalse;
14817SmallVector<Value *>GS((*It)->Scalars.begin(), (*It)->Scalars.end());
14818if (!(*It)->ReorderIndices.empty()) {
14819inversePermutation((*It)->ReorderIndices, ReorderMask);
14820reorderScalars(GS, ReorderMask);
14821 }
14822if (!all_of(zip(GatheredScalars, GS), [&](constauto &P) {
14823Value *V0 = std::get<0>(P);
14824Value *V1 = std::get<1>(P);
14825return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14826 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14827is_contained(E->Scalars, V1));
14828 }))
14829returnfalse;
14830 }
14831intIdx;
14832if ((Mask.size() < InputVF &&
14833ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF,Idx) &&
14834Idx == 0) ||
14835 (Mask.size() == InputVF &&
14836ShuffleVectorInst::isIdentityMask(Mask,Mask.size()))) {
14837 std::iota(
14838 std::next(Mask.begin(),I * SliceSize),
14839 std::next(Mask.begin(),
14840I * SliceSize +getNumElems(Mask.size(), SliceSize,I)),
14841 0);
14842 }else {
14843unsigned IVal =
14844 *find_if_not(Mask, [](intIdx) {returnIdx ==PoisonMaskElem; });
14845 std::fill(
14846 std::next(Mask.begin(),I * SliceSize),
14847 std::next(Mask.begin(),
14848I * SliceSize +getNumElems(Mask.size(), SliceSize,I)),
14849 IVal);
14850 }
14851returntrue;
14852 };
14853 BVTy ShuffleBuilder(ScalarTy, Params...);
14854 ResTy Res = ResTy();
14855SmallVector<int>Mask;
14856SmallVector<int> ExtractMask(GatheredScalars.size(),PoisonMaskElem);
14857SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
14858Value *ExtractVecBase =nullptr;
14859bool UseVecBaseAsInput =false;
14860SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
14861SmallVector<SmallVector<const TreeEntry *>> Entries;
14862Type *OrigScalarTy = GatheredScalars.front()->getType();
14863auto *VecTy =getWidenedType(ScalarTy, GatheredScalars.size());
14864unsigned NumParts =TTI->getNumberOfParts(VecTy);
14865if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14866 VecTy->getNumElements() % NumParts != 0 ||
14867 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14868 VecTy->getNumElements() / NumParts))
14869 NumParts = 1;
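  // For illustration (target-dependent, assumed here): a <16 x i8> gather on a
  // target with 128-bit vector registers reports getNumberOfParts() == 1; the
  // check above also falls back to a single part whenever splitting would
  // produce uneven slices or slices that are neither full vectors nor a
  // power-of-2 number of elements.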
14870if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
14871// Check for gathered extracts.
14872bool Resized =false;
14873 ExtractShuffles =
14874 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14875if (!ExtractShuffles.empty()) {
14876SmallVector<const TreeEntry *> ExtractEntries;
14877for (auto [Idx,I] :enumerate(ExtractMask)) {
14878if (I ==PoisonMaskElem)
14879continue;
14880if (constauto *TE = getTreeEntry(
14881 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
14882 ExtractEntries.push_back(TE);
14883 }
14884if (std::optional<ResTy> Delayed =
14885 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14886// Delay emission of gathers which are not ready yet.
14887 PostponedGathers.insert(E);
14888        // Postpone gather emission; it will be emitted after the end of the
14889        // process to keep the correct order.
14890return *Delayed;
14891 }
14892if (Value *VecBase = ShuffleBuilder.adjustExtracts(
14893 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14894 ExtractVecBase = VecBase;
14895if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14896if (VF == VecBaseTy->getNumElements() &&
14897 GatheredScalars.size() != VF) {
14898 Resized =true;
14899 GatheredScalars.append(VF - GatheredScalars.size(),
14900PoisonValue::get(OrigScalarTy));
14901 }
14902 }
14903 }
14904    // Gather the extracts only after we have checked for fully matched gathers.
14905if (!ExtractShuffles.empty() || !E->hasState() ||
14906 E->getOpcode() != Instruction::Load ||
14907 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
14908any_of(E->Scalars, IsaPred<LoadInst>)) &&
14909any_of(E->Scalars,
14910 [this](Value *V) {
14911 return isa<LoadInst>(V) && getTreeEntry(V);
14912 })) ||
14913 (E->hasState() && E->isAltShuffle()) ||
14914all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
14915isSplat(E->Scalars) ||
14916 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14917 GatherShuffles =
14918 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14919 }
14920if (!GatherShuffles.empty()) {
14921if (std::optional<ResTy> Delayed =
14922 ShuffleBuilder.needToDelay(E, Entries)) {
14923// Delay emission of gathers which are not ready yet.
14924 PostponedGathers.insert(E);
14925        // Postpone gather emission; it will be emitted after the end of the
14926        // process to keep the correct order.
14927return *Delayed;
14928 }
14929if (GatherShuffles.size() == 1 &&
14930 *GatherShuffles.front() ==TTI::SK_PermuteSingleSrc &&
14931 Entries.front().front()->isSame(E->Scalars)) {
14932// Perfect match in the graph, will reuse the previously vectorized
14933// node. Cost is 0.
14934LLVM_DEBUG(dbgs() <<"SLP: perfect diamond match for gather bundle "
14935 <<shortBundleName(E->Scalars, E->Idx) <<".\n");
14936// Restore the mask for previous partially matched values.
14937Mask.resize(E->Scalars.size());
14938const TreeEntry *FrontTE = Entries.front().front();
14939if (FrontTE->ReorderIndices.empty() &&
14940 ((FrontTE->ReuseShuffleIndices.empty() &&
14941 E->Scalars.size() == FrontTE->Scalars.size()) ||
14942 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14943 std::iota(Mask.begin(),Mask.end(), 0);
14944 }else {
14945for (auto [I, V] :enumerate(E->Scalars)) {
14946if (isa<PoisonValue>(V)) {
14947Mask[I] =PoisonMaskElem;
14948continue;
14949 }
14950Mask[I] = FrontTE->findLaneForValue(V);
14951 }
14952 }
14953 ShuffleBuilder.add(*FrontTE, Mask);
14954// Full matched entry found, no need to insert subvectors.
14955 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
14956return Res;
14957 }
14958if (!Resized) {
14959if (GatheredScalars.size() != VF &&
14960any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
14961returnany_of(TEs, [&](const TreeEntry *TE) {
14962returnTE->getVectorFactor() == VF;
14963 });
14964 }))
14965 GatheredScalars.append(VF - GatheredScalars.size(),
14966PoisonValue::get(OrigScalarTy));
14967 }
14968// Remove shuffled elements from list of gathers.
14969for (intI = 0, Sz =Mask.size();I < Sz; ++I) {
14970if (Mask[I] !=PoisonMaskElem)
14971 GatheredScalars[I] =PoisonValue::get(OrigScalarTy);
14972 }
14973 }
14974 }
14975auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
14976SmallVectorImpl<int> &ReuseMask,
14977bool IsRootPoison) {
14978    // For splats we can emit broadcasts instead of gathers, so try to find
14979    // such sequences.
14980bool IsSplat = IsRootPoison &&isSplat(Scalars) &&
14981 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
14982 Scalars.append(VF - Scalars.size(),PoisonValue::get(OrigScalarTy));
14983SmallVector<int> UndefPos;
14984DenseMap<Value *, unsigned> UniquePositions;
14985// Gather unique non-const values and all constant values.
14986// For repeated values, just shuffle them.
14987int NumNonConsts = 0;
14988int SinglePos = 0;
14989for (auto [I, V] :enumerate(Scalars)) {
14990if (isa<UndefValue>(V)) {
14991if (!isa<PoisonValue>(V)) {
14992 ReuseMask[I] =I;
14993 UndefPos.push_back(I);
14994 }
14995continue;
14996 }
14997if (isConstant(V)) {
14998 ReuseMask[I] =I;
14999continue;
15000 }
15001 ++NumNonConsts;
15002 SinglePos =I;
15003Value *OrigV =V;
15004 Scalars[I] =PoisonValue::get(OrigScalarTy);
15005if (IsSplat) {
15006 Scalars.front() = OrigV;
15007 ReuseMask[I] = 0;
15008 }else {
15009constauto Res = UniquePositions.try_emplace(OrigV,I);
15010 Scalars[Res.first->second] = OrigV;
15011 ReuseMask[I] = Res.first->second;
15012 }
15013 }
15014if (NumNonConsts == 1) {
15015// Restore single insert element.
15016if (IsSplat) {
15017 ReuseMask.assign(VF,PoisonMaskElem);
15018std::swap(Scalars.front(), Scalars[SinglePos]);
15019if (!UndefPos.empty() && UndefPos.front() == 0)
15020 Scalars.front() =UndefValue::get(OrigScalarTy);
15021 }
15022 ReuseMask[SinglePos] = SinglePos;
15023 }elseif (!UndefPos.empty() && IsSplat) {
15024      // For undef values, try to replace them with a simple broadcast.
15025      // We can do this if the broadcast value is guaranteed to be
15026      // non-poisonous, or by freezing the incoming scalar value first.
15027auto *It =find_if(Scalars, [this, E](Value *V) {
15028return !isa<UndefValue>(V) &&
15029 (getTreeEntry(V) ||isGuaranteedNotToBePoison(V, AC) ||
15030 (E->UserTreeIndices.size() == 1 &&
15031any_of(V->uses(), [E](constUse &U) {
15032                     // Check if the value is already used in the same
15033                     // operation in one of the other nodes.
15034 return E->UserTreeIndices.front().EdgeIdx !=
15035 U.getOperandNo() &&
15036 is_contained(
15037 E->UserTreeIndices.front().UserTE->Scalars,
15038 U.getUser());
15039 })));
15040 });
15041if (It != Scalars.end()) {
15042// Replace undefs by the non-poisoned scalars and emit broadcast.
15043int Pos = std::distance(Scalars.begin(), It);
15044for (intI : UndefPos) {
15045          // Point the undef position at the non-poisoned scalar.
15046          ReuseMask[I] = Pos;
15047          // Replace the undef by poison; in the mask it has already been
15048          // replaced by the non-poisoned scalar.
15049if (I != Pos)
15050 Scalars[I] =PoisonValue::get(OrigScalarTy);
15051 }
15052 }else {
15053        // Replace undefs by poison, emit the broadcast and then emit a
15054        // freeze.
15055for (intI : UndefPos) {
15056 ReuseMask[I] =PoisonMaskElem;
15057if (isa<UndefValue>(Scalars[I]))
15058 Scalars[I] =PoisonValue::get(OrigScalarTy);
15059 }
15060 NeedFreeze =true;
15061 }
15062 }
15063 };
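  // Worked example for the lambda above (editorial illustration): for
  // Scalars == {%a, %a, %a, %a} with a non-constant %a and IsRootPoison set,
  // the loop rewrites Scalars to {%a, poison, poison, poison} and ReuseMask to
  // {0, 0, 0, 0}, so a single broadcast shuffle reproduces the splat.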
15064if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
15065bool IsNonPoisoned =true;
15066bool IsUsedInExpr =true;
15067Value *Vec1 =nullptr;
15068if (!ExtractShuffles.empty()) {
15069      // A gather of extractelements can be represented as just a shuffle of
15070      // the one or two vectors from which the scalars are extracted.
15071      // Find the input vectors.
15072Value *Vec2 =nullptr;
15073for (unsignedI = 0, Sz = ExtractMask.size();I < Sz; ++I) {
15074if (!Mask.empty() && Mask[I] !=PoisonMaskElem)
15075 ExtractMask[I] =PoisonMaskElem;
15076 }
15077if (UseVecBaseAsInput) {
15078 Vec1 = ExtractVecBase;
15079 }else {
15080for (unsignedI = 0, Sz = ExtractMask.size();I < Sz; ++I) {
15081if (ExtractMask[I] ==PoisonMaskElem)
15082continue;
15083if (isa<UndefValue>(E->Scalars[I]))
15084continue;
15085auto *EI = cast<ExtractElementInst>(StoredGS[I]);
15086Value *VecOp = EI->getVectorOperand();
15087if (constauto *TE = getTreeEntry(VecOp))
15088if (TE->VectorizedValue)
15089 VecOp =TE->VectorizedValue;
15090if (!Vec1) {
15091 Vec1 = VecOp;
15092 }elseif (Vec1 != VecOp) {
15093assert((!Vec2 || Vec2 == VecOp) &&
15094"Expected only 1 or 2 vectors shuffle.");
15095 Vec2 = VecOp;
15096 }
15097 }
15098 }
15099if (Vec2) {
15100 IsUsedInExpr =false;
15101 IsNonPoisoned &=isGuaranteedNotToBePoison(Vec1, AC) &&
15102isGuaranteedNotToBePoison(Vec2, AC);
15103 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15104 }elseif (Vec1) {
15105bool IsNotPoisonedVec =isGuaranteedNotToBePoison(Vec1, AC);
15106 IsUsedInExpr &= FindReusedSplat(
15107 ExtractMask,
15108 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
15109 ExtractMask.size(), IsNotPoisonedVec);
15110 ShuffleBuilder.add(Vec1, ExtractMask,/*ForExtracts=*/true);
15111 IsNonPoisoned &= IsNotPoisonedVec;
15112 }else {
15113 IsUsedInExpr =false;
15114 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
15115/*ForExtracts=*/true);
15116 }
15117 }
15118if (!GatherShuffles.empty()) {
15119unsigned SliceSize =getPartNumElems(E->Scalars.size(), NumParts);
15120SmallVector<int> VecMask(Mask.size(),PoisonMaskElem);
15121for (constauto [I, TEs] :enumerate(Entries)) {
15122if (TEs.empty()) {
15123assert(!GatherShuffles[I] &&
15124"No shuffles with empty entries list expected.");
15125continue;
15126 }
15127assert((TEs.size() == 1 || TEs.size() == 2) &&
15128"Expected shuffle of 1 or 2 entries.");
15129unsigned Limit =getNumElems(Mask.size(), SliceSize,I);
15130auto SubMask =ArrayRef(Mask).slice(I * SliceSize, Limit);
15131 VecMask.assign(VecMask.size(),PoisonMaskElem);
15132copy(SubMask, std::next(VecMask.begin(),I * SliceSize));
15133if (TEs.size() == 1) {
15134bool IsNotPoisonedVec =
15135 TEs.front()->VectorizedValue
15136 ?isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
15137 :true;
15138 IsUsedInExpr &=
15139 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(),I,
15140 SliceSize, IsNotPoisonedVec);
15141 ShuffleBuilder.add(*TEs.front(), VecMask);
15142 IsNonPoisoned &= IsNotPoisonedVec;
15143 }else {
15144 IsUsedInExpr =false;
15145 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
15146if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
15147 IsNonPoisoned &=
15148isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
15149isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
15150 }
15151 }
15152 }
15153    // Try to figure out the best way to combine the values: build a shuffle
15154    // and insert elements, or just build several shuffles.
15155    // Insert non-constant scalars.
15156SmallVector<Value *> NonConstants(GatheredScalars);
15157int EMSz = ExtractMask.size();
15158int MSz =Mask.size();
15159    // Try to build a constant vector and shuffle with it only if we currently
15160    // have a single permutation and more than one constant scalar.
15161bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
15162bool IsIdentityShuffle =
15163 ((UseVecBaseAsInput ||
15164all_of(ExtractShuffles,
15165 [](const std::optional<TTI::ShuffleKind> &SK) {
15166return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15167TTI::SK_PermuteSingleSrc;
15168 })) &&
15169none_of(ExtractMask, [&](intI) {returnI >= EMSz; }) &&
15170ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
15171 (!GatherShuffles.empty() &&
15172all_of(GatherShuffles,
15173 [](const std::optional<TTI::ShuffleKind> &SK) {
15174return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15175TTI::SK_PermuteSingleSrc;
15176 }) &&
15177none_of(Mask, [&](intI) {returnI >= MSz; }) &&
15178ShuffleVectorInst::isIdentityMask(Mask, MSz));
15179bool EnoughConstsForShuffle =
15180 IsSingleShuffle &&
15181 (none_of(GatheredScalars,
15182 [](Value *V) {
15183return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15184 }) ||
15185any_of(GatheredScalars,
15186 [](Value *V) {
15187return isa<Constant>(V) && !isa<UndefValue>(V);
15188 })) &&
15189 (!IsIdentityShuffle ||
15190 (GatheredScalars.size() == 2 &&
15191any_of(GatheredScalars,
15192 [](Value *V) {return !isa<UndefValue>(V); })) ||
15193count_if(GatheredScalars, [](Value *V) {
15194return isa<Constant>(V) && !isa<PoisonValue>(V);
15195 }) > 1);
15196    // The NonConstants array contains just non-constant values; GatheredScalars
15197    // contains only the constants used to build the final vector for shuffling.
15198for (intI = 0, Sz = GatheredScalars.size();I < Sz; ++I) {
15199if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
15200 NonConstants[I] =PoisonValue::get(OrigScalarTy);
15201else
15202 GatheredScalars[I] =PoisonValue::get(OrigScalarTy);
15203 }
15204// Generate constants for final shuffle and build a mask for them.
15205if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15206SmallVector<int> BVMask(GatheredScalars.size(),PoisonMaskElem);
15207 TryPackScalars(GatheredScalars, BVMask,/*IsRootPoison=*/true);
15208Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15209 ShuffleBuilder.add(BV, BVMask);
15210 }
15211if (all_of(NonConstants, [=](Value *V) {
15212return isa<PoisonValue>(V) ||
15213 (IsSingleShuffle && ((IsIdentityShuffle &&
15214 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15215 }))
15216 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15217 SubVectorsMask);
15218else
15219 Res = ShuffleBuilder.finalize(
15220 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15221 [&](Value *&Vec,SmallVectorImpl<int> &Mask) {
15222 TryPackScalars(NonConstants, Mask,/*IsRootPoison=*/false);
15223 Vec = ShuffleBuilder.gather(NonConstants,Mask.size(), Vec);
15224 });
15225 }elseif (!allConstant(GatheredScalars)) {
15226// Gather unique scalars and all constants.
15227SmallVector<int> ReuseMask(GatheredScalars.size(),PoisonMaskElem);
15228 TryPackScalars(GatheredScalars, ReuseMask,/*IsRootPoison=*/true);
15229Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
15230 ShuffleBuilder.add(BV, ReuseMask);
15231 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15232 SubVectorsMask);
15233 }else {
15234// Gather all constants.
15235SmallVector<int>Mask(GatheredScalars.size(),PoisonMaskElem);
15236for (auto [I, V] :enumerate(GatheredScalars)) {
15237if (!isa<PoisonValue>(V))
15238Mask[I] =I;
15239 }
15240Value *BV = ShuffleBuilder.gather(GatheredScalars);
15241 ShuffleBuilder.add(BV, Mask);
15242 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15243 SubVectorsMask);
15244 }
15245
15246if (NeedFreeze)
15247 Res = ShuffleBuilder.createFreeze(Res);
15248return Res;
15249}
15250
15251 Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
15252                                   bool PostponedPHIs) {
15253for (auto [EIdx,_] : E->CombinedEntriesWithIndices)
15254 (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
15255return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15256 Builder, *this);
15257}
15258
15259 /// \returns \p Inst after propagating metadata from the instructions in
15260 /// \p VL.
15261 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
15262SmallVector<Value *> Insts;
15263for (Value *V : VL)
15264if (isa<Instruction>(V))
15265 Insts.push_back(V);
15266returnllvm::propagateMetadata(Inst, Insts);
15267}
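// For illustration: when a bundle of scalar loads is replaced by one vector
// load, this is used to keep only metadata that can be safely merged across
// the whole bundle (e.g. !tbaa, !alias.scope, !noalias, !nontemporal) and to
// drop metadata that is not common to every scalar instruction.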
15268
15269 Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
15270IRBuilderBase::InsertPointGuard Guard(Builder);
15271
15272if (E->VectorizedValue &&
15273 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15274 E->isAltShuffle())) {
15275LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *E->Scalars[0] <<".\n");
15276return E->VectorizedValue;
15277 }
15278
15279Value *V = E->Scalars.front();
15280Type *ScalarTy =V->getType();
15281if (!isa<CmpInst>(V))
15282 ScalarTy =getValueType(V);
15283auto It = MinBWs.find(E);
15284if (It != MinBWs.end()) {
15285auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15286 ScalarTy =IntegerType::get(F->getContext(), It->second.first);
15287if (VecTy)
15288 ScalarTy =getWidenedType(ScalarTy, VecTy->getNumElements());
15289 }
15290auto *VecTy =getWidenedType(ScalarTy, E->Scalars.size());
15291if (E->isGather()) {
15292// Set insert point for non-reduction initial nodes.
15293if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
15294 setInsertPointAfterBundle(E);
15295Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15296 E->VectorizedValue = Vec;
15297return Vec;
15298 }
15299
15300bool IsReverseOrder =
15301 !E->ReorderIndices.empty() &&isReverseOrder(E->ReorderIndices);
15302auto FinalShuffle = [&](Value *V,const TreeEntry *E) {
15303 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
15304if (E->getOpcode() == Instruction::Store &&
15305 E->State == TreeEntry::Vectorize) {
15306ArrayRef<int>Mask =
15307ArrayRef(reinterpret_cast<constint *>(E->ReorderIndices.begin()),
15308 E->ReorderIndices.size());
15309 ShuffleBuilder.add(V, Mask);
15310 }elseif (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15311 ShuffleBuilder.addOrdered(V, {});
15312 }else {
15313 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15314 }
15315SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
15316 E->CombinedEntriesWithIndices.size());
15317transform(
15318 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](constauto &P) {
15319 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15320 });
15321assert(
15322 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15323"Expected either combined subnodes or reordering");
15324return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15325 };
15326
15327assert(!E->isGather() &&"Unhandled state");
15328unsigned ShuffleOrOp =
15329 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15330Instruction *VL0 = E->getMainOp();
15331auto GetOperandSignedness = [&](unsignedIdx) {
15332const TreeEntry *OpE = getOperandEntry(E,Idx);
15333bool IsSigned =false;
15334auto It = MinBWs.find(OpE);
15335if (It != MinBWs.end())
15336 IsSigned = It->second.second;
15337else
15338      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
15339        if (isa<PoisonValue>(R))
15340          return false;
15341        return !isKnownNonNegative(R, SimplifyQuery(*DL));
15342      });
15343return IsSigned;
15344 };
15345switch (ShuffleOrOp) {
15346case Instruction::PHI: {
15347assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15348 E != VectorizableTree.front().get() ||
15349 !E->UserTreeIndices.empty()) &&
15350"PHI reordering is free.");
15351if (PostponedPHIs && E->VectorizedValue)
15352return E->VectorizedValue;
15353auto *PH = cast<PHINode>(VL0);
15354 Builder.SetInsertPoint(PH->getParent(),
15355 PH->getParent()->getFirstNonPHIIt());
15356 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15357if (PostponedPHIs || !E->VectorizedValue) {
15358PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
15359 E->PHI = NewPhi;
15360Value *V = NewPhi;
15361
15362// Adjust insertion point once all PHI's have been generated.
15363 Builder.SetInsertPoint(PH->getParent(),
15364 PH->getParent()->getFirstInsertionPt());
15365 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15366
15367V = FinalShuffle(V, E);
15368
15369 E->VectorizedValue =V;
15370if (PostponedPHIs)
15371returnV;
15372 }
15373PHINode *NewPhi = cast<PHINode>(E->PHI);
15374    // If the phi node is fully emitted, exit.
15375if (NewPhi->getNumIncomingValues() != 0)
15376return NewPhi;
15377
15378// PHINodes may have multiple entries from the same block. We want to
15379// visit every block once.
15380SmallPtrSet<BasicBlock *, 4> VisitedBBs;
15381
15382for (unsignedI : seq<unsigned>(0, PH->getNumIncomingValues())) {
15383ValueListOperands;
15384BasicBlock *IBB = PH->getIncomingBlock(I);
15385
15386// Stop emission if all incoming values are generated.
15387if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
15388LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15389return NewPhi;
15390 }
15391
15392if (!VisitedBBs.insert(IBB).second) {
15393 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
15394continue;
15395 }
15396
15397 Builder.SetInsertPoint(IBB->getTerminator());
15398 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15399Value *Vec = vectorizeOperand(E,I,/*PostponedPHIs=*/true);
15400if (VecTy != Vec->getType()) {
15401assert((It != MinBWs.end() || getOperandEntry(E,I)->isGather() ||
15402 MinBWs.contains(getOperandEntry(E,I))) &&
15403"Expected item in MinBWs.");
15404 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
15405 }
15406 NewPhi->addIncoming(Vec, IBB);
15407 }
15408
15409assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
15410"Invalid number of incoming values");
15411assert(E->VectorizedValue &&"Expected vectorized value.");
15412return E->VectorizedValue;
15413 }
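      // Illustration (editorial, hypothetical IR): for a loop-carried vector
      // PHI the first pass (PostponedPHIs == true) only creates the empty
      //   %vec.phi = phi <4 x i32>
      // and returns; a later pass fills in the incoming vectors per
      // predecessor once they have been vectorized, so cyclic dependencies
      // between tree entries resolve cleanly.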
15414
15415case Instruction::ExtractElement: {
15416Value *V = E->getSingleOperand(0);
15417if (const TreeEntry *TE = getTreeEntry(V))
15418V =TE->VectorizedValue;
15419 setInsertPointAfterBundle(E);
15420V = FinalShuffle(V, E);
15421 E->VectorizedValue =V;
15422returnV;
15423 }
15424case Instruction::ExtractValue: {
15425auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15426 Builder.SetInsertPoint(LI);
15427Value *Ptr = LI->getPointerOperand();
15428LoadInst *V = Builder.CreateAlignedLoad(VecTy,Ptr, LI->getAlign());
15429Value *NewV =::propagateMetadata(V, E->Scalars);
15430 NewV = FinalShuffle(NewV, E);
15431 E->VectorizedValue = NewV;
15432return NewV;
15433 }
15434case Instruction::InsertElement: {
15435assert(E->ReuseShuffleIndices.empty() &&"All inserts should be unique");
15436 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
15437Value *V = vectorizeOperand(E, 1, PostponedPHIs);
15438ArrayRef<Value *>Op = E->getOperand(1);
15439Type *ScalarTy =Op.front()->getType();
15440if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
15441assert(ScalarTy->isIntegerTy() &&"Expected item in MinBWs.");
15442 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
15443assert(Res.first > 0 &&"Expected item in MinBWs.");
15444V = Builder.CreateIntCast(
15445 V,
15446getWidenedType(
15447 ScalarTy,
15448 cast<FixedVectorType>(V->getType())->getNumElements()),
15449 Res.second);
15450 }
15451
15452// Create InsertVector shuffle if necessary
15453auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15454 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15455 }));
15456constunsigned NumElts =
15457 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15458constunsigned NumScalars = E->Scalars.size();
15459
15460unsignedOffset = *getElementIndex(VL0);
15461assert(Offset < NumElts &&"Failed to find vector index offset");
15462
15463// Create shuffle to resize vector
15464SmallVector<int>Mask;
15465if (!E->ReorderIndices.empty()) {
15466inversePermutation(E->ReorderIndices, Mask);
15467Mask.append(NumElts - NumScalars,PoisonMaskElem);
15468 }else {
15469Mask.assign(NumElts,PoisonMaskElem);
15470 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
15471 }
15472// Create InsertVector shuffle if necessary
15473bool IsIdentity =true;
15474SmallVector<int> PrevMask(NumElts,PoisonMaskElem);
15475Mask.swap(PrevMask);
15476for (unsignedI = 0;I < NumScalars; ++I) {
15477Value *Scalar = E->Scalars[PrevMask[I]];
15478unsigned InsertIdx = *getElementIndex(Scalar);
15479 IsIdentity &= InsertIdx -Offset ==I;
15480Mask[InsertIdx -Offset] =I;
15481 }
15482if (!IsIdentity || NumElts != NumScalars) {
15483Value *V2 =nullptr;
15484bool IsVNonPoisonous =
15485 !isConstant(V) &&isGuaranteedNotToBePoison(V, AC);
15486SmallVector<int> InsertMask(Mask);
15487if (NumElts != NumScalars &&Offset == 0) {
15488// Follow all insert element instructions from the current buildvector
15489// sequence.
15490InsertElementInst *Ins = cast<InsertElementInst>(VL0);
15491do {
15492 std::optional<unsigned> InsertIdx =getElementIndex(Ins);
15493if (!InsertIdx)
15494break;
15495if (InsertMask[*InsertIdx] ==PoisonMaskElem)
15496 InsertMask[*InsertIdx] = *InsertIdx;
15497if (!Ins->hasOneUse())
15498break;
15499Ins = dyn_cast_or_null<InsertElementInst>(
15500Ins->getUniqueUndroppableUser());
15501 }while (Ins);
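        // For illustration (hypothetical IR, not from the source): a chain
        // such as
        //   %i0 = insertelement <4 x float> poison, float %a, i32 0
        //   %i1 = insertelement <4 x float> %i0, float %b, i32 1
        //   %i2 = insertelement <4 x float> %i1, float %c, i32 2
        // is followed here through its single-use chain, marking in InsertMask
        // the lanes that the buildvector sequence itself defines.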
15502SmallBitVector UseMask =
15503buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15504SmallBitVector IsFirstPoison =
15505 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15506SmallBitVector IsFirstUndef =
15507isUndefVector(FirstInsert->getOperand(0), UseMask);
15508if (!IsFirstPoison.all()) {
15509unsignedIdx = 0;
15510for (unsignedI = 0;I < NumElts;I++) {
15511if (InsertMask[I] ==PoisonMaskElem && !IsFirstPoison.test(I) &&
15512 IsFirstUndef.test(I)) {
15513if (IsVNonPoisonous) {
15514 InsertMask[I] =I < NumScalars ?I : 0;
15515continue;
15516 }
15517if (!V2)
15518V2 =UndefValue::get(V->getType());
15519if (Idx >= NumScalars)
15520Idx = NumScalars - 1;
15521 InsertMask[I] = NumScalars +Idx;
15522 ++Idx;
15523 }elseif (InsertMask[I] !=PoisonMaskElem &&
15524 Mask[I] ==PoisonMaskElem) {
15525 InsertMask[I] =PoisonMaskElem;
15526 }
15527 }
15528 }else {
15529 InsertMask =Mask;
15530 }
15531 }
15532if (!V2)
15533V2 =PoisonValue::get(V->getType());
15534V = Builder.CreateShuffleVector(V, V2, InsertMask);
15535if (auto *I = dyn_cast<Instruction>(V)) {
15536 GatherShuffleExtractSeq.insert(I);
15537 CSEBlocks.insert(I->getParent());
15538 }
15539 }
15540
15541SmallVector<int> InsertMask(NumElts,PoisonMaskElem);
15542for (unsignedI = 0;I < NumElts;I++) {
15543if (Mask[I] !=PoisonMaskElem)
15544 InsertMask[Offset +I] =I;
15545 }
15546SmallBitVector UseMask =
15547buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15548SmallBitVector IsFirstUndef =
15549isUndefVector(FirstInsert->getOperand(0), UseMask);
15550if ((!IsIdentity ||Offset != 0 || !IsFirstUndef.all()) &&
15551 NumElts != NumScalars) {
15552if (IsFirstUndef.all()) {
15553if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
15554SmallBitVector IsFirstPoison =
15555 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15556if (!IsFirstPoison.all()) {
15557for (unsignedI = 0;I < NumElts;I++) {
15558if (InsertMask[I] ==PoisonMaskElem && !IsFirstPoison.test(I))
15559 InsertMask[I] =I + NumElts;
15560 }
15561 }
15562V = Builder.CreateShuffleVector(
15563 V,
15564 IsFirstPoison.all() ?PoisonValue::get(V->getType())
15565 : FirstInsert->getOperand(0),
15566 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
15567if (auto *I = dyn_cast<Instruction>(V)) {
15568 GatherShuffleExtractSeq.insert(I);
15569 CSEBlocks.insert(I->getParent());
15570 }
15571 }
15572 }else {
15573SmallBitVector IsFirstPoison =
15574 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15575for (unsignedI = 0;I < NumElts;I++) {
15576if (InsertMask[I] ==PoisonMaskElem)
15577 InsertMask[I] = IsFirstPoison.test(I) ?PoisonMaskElem :I;
15578else
15579 InsertMask[I] += NumElts;
15580 }
15581V = Builder.CreateShuffleVector(
15582 FirstInsert->getOperand(0), V, InsertMask,
15583 cast<Instruction>(E->Scalars.back())->getName());
15584if (auto *I = dyn_cast<Instruction>(V)) {
15585 GatherShuffleExtractSeq.insert(I);
15586 CSEBlocks.insert(I->getParent());
15587 }
15588 }
15589 }
15590
15591 ++NumVectorInstructions;
15592 E->VectorizedValue =V;
15593returnV;
15594 }
15595case Instruction::ZExt:
15596case Instruction::SExt:
15597case Instruction::FPToUI:
15598case Instruction::FPToSI:
15599case Instruction::FPExt:
15600case Instruction::PtrToInt:
15601case Instruction::IntToPtr:
15602case Instruction::SIToFP:
15603case Instruction::UIToFP:
15604case Instruction::Trunc:
15605case Instruction::FPTrunc:
15606case Instruction::BitCast: {
15607 setInsertPointAfterBundle(E);
15608
15609Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15610if (E->VectorizedValue) {
15611LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15612return E->VectorizedValue;
15613 }
15614
15615auto *CI = cast<CastInst>(VL0);
15616Instruction::CastOps VecOpcode = CI->getOpcode();
15617Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
15618auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15619if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15620 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
15621 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
15622// Check if the values are candidates to demote.
15623unsigned SrcBWSz =DL->getTypeSizeInBits(SrcScalarTy);
15624if (SrcIt != MinBWs.end())
15625 SrcBWSz = SrcIt->second.first;
15626unsigned BWSz =DL->getTypeSizeInBits(ScalarTy->getScalarType());
15627if (BWSz == SrcBWSz) {
15628 VecOpcode = Instruction::BitCast;
15629 }elseif (BWSz < SrcBWSz) {
15630 VecOpcode = Instruction::Trunc;
15631 }elseif (It != MinBWs.end()) {
15632assert(BWSz > SrcBWSz &&"Invalid cast!");
15633 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15634 }elseif (SrcIt != MinBWs.end()) {
15635assert(BWSz > SrcBWSz &&"Invalid cast!");
15636 VecOpcode =
15637 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15638 }
15639 }elseif (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15640 !SrcIt->second.second) {
15641 VecOpcode = Instruction::UIToFP;
15642 }
15643Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15644 ? InVec
15645 : Builder.CreateCast(VecOpcode, InVec, VecTy);
15646V = FinalShuffle(V, E);
15647
15648 E->VectorizedValue =V;
15649 ++NumVectorInstructions;
15650returnV;
15651 }
15652case Instruction::FCmp:
15653case Instruction::ICmp: {
15654 setInsertPointAfterBundle(E);
15655
15656Value *L = vectorizeOperand(E, 0, PostponedPHIs);
15657if (E->VectorizedValue) {
15658LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15659return E->VectorizedValue;
15660 }
15661Value *R = vectorizeOperand(E, 1, PostponedPHIs);
15662if (E->VectorizedValue) {
15663LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15664return E->VectorizedValue;
15665 }
15666if (L->getType() !=R->getType()) {
15667assert((getOperandEntry(E, 0)->isGather() ||
15668 getOperandEntry(E, 1)->isGather() ||
15669 MinBWs.contains(getOperandEntry(E, 0)) ||
15670 MinBWs.contains(getOperandEntry(E, 1))) &&
15671"Expected item in MinBWs.");
15672if (cast<VectorType>(L->getType())
15673 ->getElementType()
15674 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
15675 ->getElementType()
15676 ->getIntegerBitWidth()) {
15677Type *CastTy =R->getType();
15678L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
15679 }else {
15680Type *CastTy =L->getType();
15681R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
15682 }
15683 }
15684
15685CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
15686Value *V = Builder.CreateCmp(P0, L, R);
15687propagateIRFlags(V, E->Scalars, VL0);
15688if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
15689 ICmp->setSameSign(/*B=*/false);
15690// Do not cast for cmps.
15691 VecTy = cast<FixedVectorType>(V->getType());
15692V = FinalShuffle(V, E);
15693
15694 E->VectorizedValue =V;
15695 ++NumVectorInstructions;
15696returnV;
15697 }
15698case Instruction::Select: {
15699 setInsertPointAfterBundle(E);
15700
15701Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
15702if (E->VectorizedValue) {
15703LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15704return E->VectorizedValue;
15705 }
15706Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15707if (E->VectorizedValue) {
15708LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15709return E->VectorizedValue;
15710 }
15711Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15712if (E->VectorizedValue) {
15713LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15714return E->VectorizedValue;
15715 }
15716if (True->getType() != VecTy || False->getType() != VecTy) {
15717assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
15718 getOperandEntry(E, 2)->isGather() ||
15719 MinBWs.contains(getOperandEntry(E, 1)) ||
15720 MinBWs.contains(getOperandEntry(E, 2))) &&
15721"Expected item in MinBWs.");
15722if (True->getType() != VecTy)
15723 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
15724if (False->getType() != VecTy)
15725 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
15726 }
15727
15728unsigned CondNumElements =getNumElements(Cond->getType());
15729unsigned TrueNumElements =getNumElements(True->getType());
15730assert(TrueNumElements >= CondNumElements &&
15731 TrueNumElements % CondNumElements == 0 &&
15732"Cannot vectorize Instruction::Select");
15733assert(TrueNumElements ==getNumElements(False->getType()) &&
15734"Cannot vectorize Instruction::Select");
15735if (CondNumElements != TrueNumElements) {
15736        // When the return type is i1 but the source is a fixed vector type, we
15737        // need to duplicate the condition value.
15738Cond = Builder.CreateShuffleVector(
15739Cond,createReplicatedMask(TrueNumElements / CondNumElements,
15740 CondNumElements));
15741 }
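        // Example (assumed types, editorial): if Cond is <2 x i1> and
        // True/False are <4 x float>, the replicated mask is {0, 0, 1, 1}, so
        // each condition bit drives a pair of adjacent result lanes.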
15742assert(getNumElements(Cond->getType()) == TrueNumElements &&
15743"Cannot vectorize Instruction::Select");
15744Value *V = Builder.CreateSelect(Cond, True, False);
15745V = FinalShuffle(V, E);
15746
15747 E->VectorizedValue =V;
15748 ++NumVectorInstructions;
15749returnV;
15750 }
15751case Instruction::FNeg: {
15752 setInsertPointAfterBundle(E);
15753
15754Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15755
15756if (E->VectorizedValue) {
15757LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15758return E->VectorizedValue;
15759 }
15760
15761Value *V = Builder.CreateUnOp(
15762static_cast<Instruction::UnaryOps>(E->getOpcode()),Op);
15763propagateIRFlags(V, E->Scalars, VL0);
15764if (auto *I = dyn_cast<Instruction>(V))
15765V =::propagateMetadata(I, E->Scalars);
15766
15767V = FinalShuffle(V, E);
15768
15769 E->VectorizedValue =V;
15770 ++NumVectorInstructions;
15771
15772returnV;
15773 }
15774case Instruction::Freeze: {
15775 setInsertPointAfterBundle(E);
15776
15777Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15778
15779if (E->VectorizedValue) {
15780LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15781return E->VectorizedValue;
15782 }
15783
15784if (Op->getType() != VecTy) {
15785assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15786 MinBWs.contains(getOperandEntry(E, 0))) &&
15787"Expected item in MinBWs.");
15788Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
15789 }
15790Value *V = Builder.CreateFreeze(Op);
15791V = FinalShuffle(V, E);
15792
15793 E->VectorizedValue =V;
15794 ++NumVectorInstructions;
15795
15796returnV;
15797 }
15798case Instruction::Add:
15799case Instruction::FAdd:
15800case Instruction::Sub:
15801case Instruction::FSub:
15802case Instruction::Mul:
15803case Instruction::FMul:
15804case Instruction::UDiv:
15805case Instruction::SDiv:
15806case Instruction::FDiv:
15807case Instruction::URem:
15808case Instruction::SRem:
15809case Instruction::FRem:
15810case Instruction::Shl:
15811case Instruction::LShr:
15812case Instruction::AShr:
15813case Instruction::And:
15814case Instruction::Or:
15815case Instruction::Xor: {
15816 setInsertPointAfterBundle(E);
15817
15818Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
15819if (E->VectorizedValue) {
15820LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15821return E->VectorizedValue;
15822 }
15823Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
15824if (E->VectorizedValue) {
15825LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15826return E->VectorizedValue;
15827 }
15828if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15829for (unsignedI : seq<unsigned>(0, E->getNumOperands())) {
15830ArrayRef<Value *> Ops = E->getOperand(I);
15831if (all_of(Ops, [&](Value *Op) {
15832auto *CI = dyn_cast<ConstantInt>(Op);
15833return CI && CI->getValue().countr_one() >= It->second.first;
15834 })) {
15835V = FinalShuffle(I == 0 ? RHS : LHS, E);
15836 E->VectorizedValue =V;
15837 ++NumVectorInstructions;
15838returnV;
15839 }
15840 }
15841 }
15842if (LHS->getType() != VecTy ||RHS->getType() != VecTy) {
15843assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15844 getOperandEntry(E, 1)->isGather() ||
15845 MinBWs.contains(getOperandEntry(E, 0)) ||
15846 MinBWs.contains(getOperandEntry(E, 1))) &&
15847"Expected item in MinBWs.");
15848if (LHS->getType() != VecTy)
15849LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
15850if (RHS->getType() != VecTy)
15851RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
15852 }
15853
15854Value *V = Builder.CreateBinOp(
15855static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
15856 RHS);
15857propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
15858if (auto *I = dyn_cast<Instruction>(V)) {
15859V =::propagateMetadata(I, E->Scalars);
15860// Drop nuw flags for abs(sub(commutative), true).
15861if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
15862any_of(E->Scalars, [](Value *V) {
15863 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15864 }))
15865I->setHasNoUnsignedWrap(/*b=*/false);
15866 }
15867
15868V = FinalShuffle(V, E);
15869
15870 E->VectorizedValue =V;
15871 ++NumVectorInstructions;
15872
15873returnV;
15874 }
15875case Instruction::Load: {
15876// Loads are inserted at the head of the tree because we don't want to
15877// sink them all the way down past store instructions.
15878 setInsertPointAfterBundle(E);
15879
15880LoadInst *LI = cast<LoadInst>(VL0);
15881Instruction *NewLI;
15882Value *PO = LI->getPointerOperand();
15883if (E->State == TreeEntry::Vectorize) {
15884 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
15885 }elseif (E->State == TreeEntry::StridedVectorize) {
15886Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
15887Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
15888 PO = IsReverseOrder ? PtrN : Ptr0;
15889 std::optional<int> Diff =getPointersDiff(
15890 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
15891Type *StrideTy =DL->getIndexType(PO->getType());
15892Value *StrideVal;
15893if (Diff) {
15894int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
15895 StrideVal =
15896 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
15897DL->getTypeAllocSize(ScalarTy));
15898 }else {
15899SmallVector<Value *> PointerOps(E->Scalars.size(),nullptr);
15900transform(E->Scalars, PointerOps.begin(), [](Value *V) {
15901 return cast<LoadInst>(V)->getPointerOperand();
15902 });
15903OrdersType Order;
15904 std::optional<Value *> Stride =
15905calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
15906 &*Builder.GetInsertPoint());
15907Value *NewStride =
15908 Builder.CreateIntCast(*Stride, StrideTy,/*isSigned=*/true);
15909 StrideVal = Builder.CreateMul(
15910 NewStride,
15911 ConstantInt::get(
15912 StrideTy,
15913 (IsReverseOrder ? -1 : 1) *
15914static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
15915 }
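      // Worked example for the constant-stride branch above (assumed i32
      // scalars): four loads whose pointers are 0, 8, 16 and 24 bytes apart
      // give Diff == 6 elements, so Stride == 6 / (4 - 1) == 2 and
      // StrideVal == 2 * 4 == 8 bytes (negated when IsReverseOrder is set).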
15916Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15917auto *Inst = Builder.CreateIntrinsic(
15918 Intrinsic::experimental_vp_strided_load,
15919 {VecTy, PO->getType(), StrideTy},
15920 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
15921 Builder.getInt32(E->Scalars.size())});
15922 Inst->addParamAttr(
15923/*ArgNo=*/0,
15924Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15925 NewLI = Inst;
15926 }else {
15927assert(E->State == TreeEntry::ScatterVectorize &&"Unhandled state");
15928Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
15929if (E->VectorizedValue) {
15930LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
15931return E->VectorizedValue;
15932 }
15933if (isa<FixedVectorType>(ScalarTy)) {
15934assert(SLPReVec &&"FixedVectorType is not expected.");
15935        // CreateMaskedGather expects VecTy and VecPtr to have the same size.
15936        // We need to expand VecPtr if ScalarTy is a vector type.
15937unsigned ScalarTyNumElements =
15938 cast<FixedVectorType>(ScalarTy)->getNumElements();
15939unsigned VecTyNumElements =
15940 cast<FixedVectorType>(VecTy)->getNumElements();
15941assert(VecTyNumElements % ScalarTyNumElements == 0 &&
15942"Cannot expand getelementptr.");
15943unsigned VF = VecTyNumElements / ScalarTyNumElements;
15944SmallVector<Constant *> Indices(VecTyNumElements);
15945transform(seq(VecTyNumElements), Indices.begin(), [=](unsignedI) {
15946 return Builder.getInt64(I % ScalarTyNumElements);
15947 });
15948 VecPtr = Builder.CreateGEP(
15949 VecTy->getElementType(),
15950 Builder.CreateShuffleVector(
15951 VecPtr,createReplicatedMask(ScalarTyNumElements, VF)),
15952ConstantVector::get(Indices));
15953 }
15954// Use the minimum alignment of the gathered loads.
15955Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15956 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
15957 }
15958Value *V =::propagateMetadata(NewLI, E->Scalars);
15959
15960V = FinalShuffle(V, E);
15961 E->VectorizedValue =V;
15962 ++NumVectorInstructions;
15963returnV;
15964 }
15965case Instruction::Store: {
15966auto *SI = cast<StoreInst>(VL0);
15967
15968 setInsertPointAfterBundle(E);
15969
15970Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
15971if (VecValue->getType() != VecTy)
15972 VecValue =
15973 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
15974 VecValue = FinalShuffle(VecValue, E);
15975
15976Value *Ptr =SI->getPointerOperand();
15977Instruction *ST;
15978if (E->State == TreeEntry::Vectorize) {
15979ST = Builder.CreateAlignedStore(VecValue,Ptr,SI->getAlign());
15980 }else {
15981assert(E->State == TreeEntry::StridedVectorize &&
15982"Expected either strided or consecutive stores.");
15983if (!E->ReorderIndices.empty()) {
15984SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
15985Ptr =SI->getPointerOperand();
15986 }
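      // As emitted here, the stride is the constant -TypeAllocSize(ScalarTy),
      // so lane i is written at a decreasing address starting from the
      // (possibly reordered) base pointer. The store becomes an
      // llvm.experimental.vp.strided.store with an all-true mask and EVL equal
      // to the number of scalars.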
15987Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
15988Type *StrideTy =DL->getIndexType(SI->getPointerOperandType());
15989auto *Inst = Builder.CreateIntrinsic(
15990 Intrinsic::experimental_vp_strided_store,
15991 {VecTy,Ptr->getType(), StrideTy},
15992 {VecValue,Ptr,
15993 ConstantInt::get(
15994 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
15995 Builder.getAllOnesMask(VecTy->getElementCount()),
15996 Builder.getInt32(E->Scalars.size())});
15997 Inst->addParamAttr(
15998/*ArgNo=*/1,
15999Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
16000ST = Inst;
16001 }
16002
16003Value *V =::propagateMetadata(ST, E->Scalars);
16004
16005 E->VectorizedValue =V;
16006 ++NumVectorInstructions;
16007returnV;
16008 }
16009case Instruction::GetElementPtr: {
16010auto *GEP0 = cast<GetElementPtrInst>(VL0);
16011 setInsertPointAfterBundle(E);
16012
16013Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
16014if (E->VectorizedValue) {
16015LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
16016return E->VectorizedValue;
16017 }
16018
16019SmallVector<Value *> OpVecs;
16020for (int J = 1,N = GEP0->getNumOperands(); J <N; ++J) {
16021Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
16022if (E->VectorizedValue) {
16023LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
16024return E->VectorizedValue;
16025 }
16026 OpVecs.push_back(OpVec);
16027 }
16028
16029Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
16030if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
16031SmallVector<Value *> GEPs;
16032for (Value *V : E->Scalars) {
16033if (isa<GetElementPtrInst>(V))
16034 GEPs.push_back(V);
16035 }
16036V =::propagateMetadata(I, GEPs);
16037 }
16038
16039V = FinalShuffle(V, E);
16040
16041 E->VectorizedValue =V;
16042 ++NumVectorInstructions;
16043
16044returnV;
16045 }
16046case Instruction::Call: {
16047CallInst *CI = cast<CallInst>(VL0);
16048 setInsertPointAfterBundle(E);
16049
16050Intrinsic::IDID =getVectorIntrinsicIDForCall(CI, TLI);
16051
16052SmallVector<Type *> ArgTys =buildIntrinsicArgTypes(
16053 CI,ID, VecTy->getNumElements(),
16054 It != MinBWs.end() ? It->second.first : 0,TTI);
16055auto VecCallCosts =getVectorCallCosts(CI, VecTy,TTI, TLI, ArgTys);
16056bool UseIntrinsic =ID !=Intrinsic::not_intrinsic &&
16057 VecCallCosts.first <= VecCallCosts.second;
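      // The call is widened either through a vector intrinsic declaration (when
      // one exists and its cost is no worse than a vector library call) or
      // through a vector library function looked up in VFDatabase; TysForDecl
      // accumulates the overloaded types needed to obtain the right intrinsic
      // declaration.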
16058
16059Value *ScalarArg =nullptr;
16060SmallVector<Value *> OpVecs;
16061SmallVector<Type *, 2> TysForDecl;
16062// Add return type if intrinsic is overloaded on it.
16063if (UseIntrinsic &&isVectorIntrinsicWithOverloadTypeAtArg(ID, -1,TTI))
16064 TysForDecl.push_back(VecTy);
16065auto *CEI = cast<CallInst>(VL0);
16066for (unsignedI : seq<unsigned>(0, CI->arg_size())) {
16067ValueList OpVL;
16068// Some intrinsics have scalar arguments. Such an argument must not be
16069// vectorized.
16070if (UseIntrinsic &&isVectorIntrinsicWithScalarOpAtArg(ID,I,TTI)) {
16071 ScalarArg = CEI->getArgOperand(I);
16072// If we decided to reduce the bitwidth of the abs intrinsic, its second
16073// argument must be set to false (do not return poison if the value is the signed min).
16074if (ID == Intrinsic::abs && It != MinBWs.end() &&
16075 It->second.first <DL->getTypeSizeInBits(CEI->getType()))
16076 ScalarArg = Builder.getFalse();
16077 OpVecs.push_back(ScalarArg);
16078if (isVectorIntrinsicWithOverloadTypeAtArg(ID,I,TTI))
16079 TysForDecl.push_back(ScalarArg->getType());
16080continue;
16081 }
16082
16083Value *OpVec = vectorizeOperand(E,I, PostponedPHIs);
16084if (E->VectorizedValue) {
16085LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
16086return E->VectorizedValue;
16087 }
16088 ScalarArg = CEI->getArgOperand(I);
16089if (cast<VectorType>(OpVec->getType())->getElementType() !=
16090 ScalarArg->getType()->getScalarType() &&
16091 It == MinBWs.end()) {
16092auto *CastTy =
16093getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
16094 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
16095 }elseif (It != MinBWs.end()) {
16096 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
16097 }
16098LLVM_DEBUG(dbgs() <<"SLP: OpVec[" <<I <<"]: " << *OpVec <<"\n");
16099 OpVecs.push_back(OpVec);
16100if (UseIntrinsic &&isVectorIntrinsicWithOverloadTypeAtArg(ID,I,TTI))
16101 TysForDecl.push_back(OpVec->getType());
16102 }
16103
16104Function *CF;
16105if (!UseIntrinsic) {
16106VFShape Shape =
16107VFShape::get(CI->getFunctionType(),
16108ElementCount::getFixed(
16109static_cast<unsigned>(VecTy->getNumElements())),
16110false/*HasGlobalPred*/);
16111 CF =VFDatabase(*CI).getVectorizedFunction(Shape);
16112 }else {
16113 CF =Intrinsic::getOrInsertDeclaration(F->getParent(),ID, TysForDecl);
16114 }
16115
16116SmallVector<OperandBundleDef, 1> OpBundles;
16117 CI->getOperandBundlesAsDefs(OpBundles);
16118Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
16119
16120propagateIRFlags(V, E->Scalars, VL0);
16121V = FinalShuffle(V, E);
16122
16123 E->VectorizedValue =V;
16124 ++NumVectorInstructions;
16125returnV;
16126 }
16127case Instruction::ShuffleVector: {
16128Value *V;
16129if (SLPReVec && !E->isAltShuffle()) {
16130 setInsertPointAfterBundle(E);
16131Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
16132if (E->VectorizedValue) {
16133LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
16134return E->VectorizedValue;
16135 }
16136SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
16137if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
16138assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
16139"Not supported shufflevector usage.");
16140SmallVector<int> NewMask(ThisMask.size());
16141transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
16142 return SVSrc->getShuffleMask()[Mask];
16143 });
16144V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
16145 }else {
16146V = Builder.CreateShuffleVector(Src, ThisMask);
16147 }
16148propagateIRFlags(V, E->Scalars, VL0);
16149if (auto *I = dyn_cast<Instruction>(V))
16150V =::propagateMetadata(I, E->Scalars);
16151V = FinalShuffle(V, E);
16152 }else {
16153assert(E->isAltShuffle() &&
16154 ((Instruction::isBinaryOp(E->getOpcode()) &&
16155Instruction::isBinaryOp(E->getAltOpcode())) ||
16156 (Instruction::isCast(E->getOpcode()) &&
16157Instruction::isCast(E->getAltOpcode())) ||
16158 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16159"Invalid Shuffle Vector Operand");
16160
16161Value *LHS =nullptr, *RHS =nullptr;
16162if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
16163 setInsertPointAfterBundle(E);
16164LHS = vectorizeOperand(E, 0, PostponedPHIs);
16165if (E->VectorizedValue) {
16166LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
16167return E->VectorizedValue;
16168 }
16169RHS = vectorizeOperand(E, 1, PostponedPHIs);
16170 }else {
16171 setInsertPointAfterBundle(E);
16172LHS = vectorizeOperand(E, 0, PostponedPHIs);
16173 }
16174if (E->VectorizedValue) {
16175LLVM_DEBUG(dbgs() <<"SLP: Diamond merged for " << *VL0 <<".\n");
16176return E->VectorizedValue;
16177 }
16178if (LHS && RHS &&
16179 ((Instruction::isBinaryOp(E->getOpcode()) &&
16180 (LHS->getType() != VecTy ||RHS->getType() != VecTy)) ||
16181 (isa<CmpInst>(VL0) &&LHS->getType() !=RHS->getType()))) {
16182assert((It != MinBWs.end() ||
16183 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
16184 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
16185 MinBWs.contains(getOperandEntry(E, 0)) ||
16186 MinBWs.contains(getOperandEntry(E, 1))) &&
16187"Expected item in MinBWs.");
16188Type *CastTy = VecTy;
16189if (isa<CmpInst>(VL0) &&LHS->getType() !=RHS->getType()) {
16190if (cast<VectorType>(LHS->getType())
16191 ->getElementType()
16192 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
16193 ->getElementType()
16194 ->getIntegerBitWidth())
16195 CastTy =RHS->getType();
16196else
16197 CastTy =LHS->getType();
16198 }
16199if (LHS->getType() != CastTy)
16200LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
16201if (RHS->getType() != CastTy)
16202RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
16203 }
16204
16205Value *V0, *V1;
16206if (Instruction::isBinaryOp(E->getOpcode())) {
16207 V0 = Builder.CreateBinOp(
16208static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
16209 V1 = Builder.CreateBinOp(
16210static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
16211 }elseif (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
16212 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
16213auto *AltCI = cast<CmpInst>(E->getAltOp());
16214CmpInst::Predicate AltPred = AltCI->getPredicate();
16215 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
16216 }else {
16217if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
16218unsigned SrcBWSz =DL->getTypeSizeInBits(
16219 cast<VectorType>(LHS->getType())->getElementType());
16220unsigned BWSz =DL->getTypeSizeInBits(ScalarTy);
16221if (BWSz <= SrcBWSz) {
16222if (BWSz < SrcBWSz)
16223LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
16224assert(LHS->getType() == VecTy &&
16225"Expected same type as operand.");
16226if (auto *I = dyn_cast<Instruction>(LHS))
16227LHS =::propagateMetadata(I, E->Scalars);
16228LHS = FinalShuffle(LHS, E);
16229 E->VectorizedValue =LHS;
16230 ++NumVectorInstructions;
16231returnLHS;
16232 }
16233 }
16234 V0 = Builder.CreateCast(
16235static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
16236 V1 = Builder.CreateCast(
16237static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
16238 }
16239// Add V0 and V1 to later analysis to try to find and remove matching
16240// instruction, if any.
16241for (Value *V : {V0, V1}) {
16242if (auto *I = dyn_cast<Instruction>(V)) {
16243 GatherShuffleExtractSeq.insert(I);
16244 CSEBlocks.insert(I->getParent());
16245 }
16246 }
16247
16248// Create shuffle to take alternate operations from the vector.
16249// Also, gather up main and alt scalar ops to propagate IR flags to
16250// each vector operation.
16251ValueList OpScalars, AltScalars;
16252SmallVector<int>Mask;
16253 E->buildAltOpShuffleMask(
16254 [E,this](Instruction *I) {
16255assert(E->isOpcodeOrAlt(I) &&"Unexpected main/alternate opcode");
16256returnisAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
16257 *TLI);
16258 },
16259Mask, &OpScalars, &AltScalars);
16260
16261propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
16262propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
16263auto DropNuwFlag = [&](Value *Vec,unsigned Opcode) {
16264// Drop nuw flags for abs(sub(commutative), true).
16265if (auto *I = dyn_cast<Instruction>(Vec);
16266I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
16267any_of(E->Scalars, [](Value *V) {
16268 if (isa<PoisonValue>(V))
16269 return false;
16270 auto *IV = cast<Instruction>(V);
16271 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
16272 }))
16273I->setHasNoUnsignedWrap(/*b=*/false);
16274 };
16275 DropNuwFlag(V0, E->getOpcode());
16276 DropNuwFlag(V1, E->getAltOpcode());
16277
16278if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
16279assert(SLPReVec &&"FixedVectorType is not expected.");
16280transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
16281 }
16282V = Builder.CreateShuffleVector(V0, V1, Mask);
16283if (auto *I = dyn_cast<Instruction>(V)) {
16284V =::propagateMetadata(I, E->Scalars);
16285 GatherShuffleExtractSeq.insert(I);
16286 CSEBlocks.insert(I->getParent());
16287 }
16288 }
16289
16290 E->VectorizedValue =V;
16291 ++NumVectorInstructions;
16292
16293returnV;
16294 }
16295default:
16296llvm_unreachable("unknown inst");
16297 }
16298returnnullptr;
16299}
16300
16301Value *BoUpSLP::vectorizeTree() {
16302ExtraValueToDebugLocsMap ExternallyUsedValues;
16303returnvectorizeTree(ExternallyUsedValues);
16304}
16305
16306Value *
16307BoUpSLP::vectorizeTree(constExtraValueToDebugLocsMap &ExternallyUsedValues,
16308Instruction *ReductionRoot) {
16309// All blocks must be scheduled before any instructions are inserted.
16310for (auto &BSIter : BlocksSchedules) {
16311 scheduleBlock(BSIter.second.get());
16312 }
16313// Clear the Entry-to-LastInstruction table. It can be affected by scheduling,
16314// so it needs to be rebuilt.
16315 EntryToLastInstruction.clear();
16316
16317if (ReductionRoot)
16318 Builder.SetInsertPoint(ReductionRoot->getParent(),
16319 ReductionRoot->getIterator());
16320else
16321 Builder.SetInsertPoint(&F->getEntryBlock(),F->getEntryBlock().begin());
16322
16323// Emit gathered loads first to emit better code for the users of those
16324// gathered loads.
16325for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16326if (GatheredLoadsEntriesFirst.has_value() &&
16327 TE->Idx >= *GatheredLoadsEntriesFirst &&
16328 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16329assert((!TE->UserTreeIndices.empty() ||
16330 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16331"Expected gathered load node.");
16332 (void)vectorizeTree(TE.get(),/*PostponedPHIs=*/false);
16333 }
16334 }
16335// Postpone emission of PHIs operands to avoid cyclic dependencies issues.
16336 (void)vectorizeTree(VectorizableTree[0].get(),/*PostponedPHIs=*/true);
16337for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16338if (TE->State == TreeEntry::Vectorize &&
16339 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16340 TE->VectorizedValue)
16341 (void)vectorizeTree(TE.get(),/*PostponedPHIs=*/false);
16342// Run through the list of postponed gathers and emit them, replacing the temp
16343// emitted allocas with actual vector instructions.
16344ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
16345DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
16346for (const TreeEntry *E : PostponedNodes) {
16347auto *TE =const_cast<TreeEntry *>(E);
16348if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
16349if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
16350 TE->UserTreeIndices.front().EdgeIdx)) &&
16351 VecTE->isSame(TE->Scalars))
16352// Found gather node which is absolutely the same as one of the
16353// vectorized nodes. It may happen after reordering.
16354continue;
16355auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16356 TE->VectorizedValue =nullptr;
16357auto *UserI =
16358 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
16359// If the user is a PHI node, its vector code has to be inserted right before
16360// the block terminator. Since the node was delayed, there were some unresolved
16361// dependencies at the moment the stub instruction was emitted. If any of these
16362// dependencies turns out to be an operand of another PHI coming from this same
16363// block, the position of the stub instruction becomes invalid. This is because
16364// the source vector that is supposed to feed this gather node was inserted at
16365// the end of the block [after the stub instruction]. So we need to adjust the
16366// insertion point again, to the end of the block.
16367if (isa<PHINode>(UserI)) {
16368// Insert before all users.
16369Instruction *InsertPt = PrevVec->getParent()->getTerminator();
16370for (User *U : PrevVec->users()) {
16371if (U == UserI)
16372continue;
16373auto *UI = dyn_cast<Instruction>(U);
16374if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
16375continue;
16376if (UI->comesBefore(InsertPt))
16377 InsertPt = UI;
16378 }
16379 Builder.SetInsertPoint(InsertPt);
16380 }else {
16381 Builder.SetInsertPoint(PrevVec);
16382 }
16383 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
16384Value *Vec =vectorizeTree(TE,/*PostponedPHIs=*/false);
16385if (auto *VecI = dyn_cast<Instruction>(Vec);
16386 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
16387 Builder.GetInsertPoint()->comesBefore(VecI))
16388 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
16389 Builder.GetInsertPoint());
16390if (Vec->getType() != PrevVec->getType()) {
16391assert(Vec->getType()->isIntOrIntVectorTy() &&
16392 PrevVec->getType()->isIntOrIntVectorTy() &&
16393"Expected integer vector types only.");
16394 std::optional<bool> IsSigned;
16395for (Value *V : TE->Scalars) {
16396if (const TreeEntry *BaseTE = getTreeEntry(V)) {
16397auto It = MinBWs.find(BaseTE);
16398if (It != MinBWs.end()) {
16399 IsSigned = IsSigned.value_or(false) || It->second.second;
16400if (*IsSigned)
16401break;
16402 }
16403for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
16404auto It = MinBWs.find(MNTE);
16405if (It != MinBWs.end()) {
16406 IsSigned = IsSigned.value_or(false) || It->second.second;
16407if (*IsSigned)
16408break;
16409 }
16410 }
16411if (IsSigned.value_or(false))
16412break;
16413// Scan through gather nodes.
16414for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16415auto It = MinBWs.find(BVE);
16416if (It != MinBWs.end()) {
16417 IsSigned = IsSigned.value_or(false) || It->second.second;
16418if (*IsSigned)
16419break;
16420 }
16421 }
16422if (IsSigned.value_or(false))
16423break;
16424if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
16425 IsSigned =
16426 IsSigned.value_or(false) ||
16427 !isKnownNonNegative(EE->getVectorOperand(),SimplifyQuery(*DL));
16428continue;
16429 }
16430if (IsSigned.value_or(false))
16431break;
16432 }
16433 }
16434if (IsSigned.value_or(false)) {
16435// Final attempt - check user node.
16436auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
16437if (It != MinBWs.end())
16438 IsSigned = It->second.second;
16439 }
16440assert(IsSigned &&
16441"Expected user node or perfect diamond match in MinBWs.");
16442 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
16443 }
16444 PrevVec->replaceAllUsesWith(Vec);
16445 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
16446// Replace the stub vector node, if it was used before for one of the
16447// buildvector nodes already.
16448auto It = PostponedValues.find(PrevVec);
16449if (It != PostponedValues.end()) {
16450for (TreeEntry *VTE : It->getSecond())
16451 VTE->VectorizedValue = Vec;
16452 }
16453eraseInstruction(PrevVec);
16454 }
16455
16456LLVM_DEBUG(dbgs() <<"SLP: Extracting " << ExternalUses.size()
16457 <<" values .\n");
16458
16459SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
16460// Maps vector instruction to original insertelement instruction
16461DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
16462// Maps extract Scalar to the corresponding extractelement instruction in the
16463// basic block. Only one extractelement per block should be emitted.
16464DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
16465 ScalarToEEs;
16466SmallDenseSet<Value *, 4> UsedInserts;
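  // Caches an int cast of a vectorized value per (vector, element type) pair,
  // so that when the tree root was narrowed (MinBWs) the widening cast is
  // emitted only once.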
16467DenseMap<std::pair<Value *, Type *>,Value *> VectorCasts;
16468SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
16469SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
16470// Extract all of the elements with the external uses.
16471for (constauto &ExternalUse : ExternalUses) {
16472Value *Scalar = ExternalUse.Scalar;
16473llvm::User *User = ExternalUse.User;
16474
16475// Skip users that we already RAUWed. This happens when one instruction
16476// has multiple uses of the same value.
16477if (User && !is_contained(Scalar->users(),User))
16478continue;
16479 TreeEntry *E = getTreeEntry(Scalar);
16480assert(E &&"Invalid scalar");
16481assert(!E->isGather() &&"Extracting from a gather list");
16482// Non-instruction pointers are not deleted, just skip them.
16483if (E->getOpcode() == Instruction::GetElementPtr &&
16484 !isa<GetElementPtrInst>(Scalar))
16485continue;
16486
16487Value *Vec = E->VectorizedValue;
16488assert(Vec &&"Can't find vectorizable value");
16489
16490Value *Lane = Builder.getInt32(ExternalUse.Lane);
16491auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
16492if (Scalar->getType() != Vec->getType()) {
16493Value *Ex =nullptr;
16494Value *ExV =nullptr;
16495auto *Inst = dyn_cast<Instruction>(Scalar);
16496bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
16497auto It = ScalarToEEs.find(Scalar);
16498if (It != ScalarToEEs.end()) {
16499// No need to emit many extracts, just move the only one in the
16500// current block.
16501auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16502 : Builder.GetInsertBlock());
16503if (EEIt != It->second.end()) {
16504Value *PrevV = EEIt->second.first;
16505if (auto *I = dyn_cast<Instruction>(PrevV);
16506I && !ReplaceInst &&
16507 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
16508 Builder.GetInsertPoint()->comesBefore(I)) {
16509I->moveBefore(*Builder.GetInsertPoint()->getParent(),
16510 Builder.GetInsertPoint());
16511if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16512 CI->moveAfter(I);
16513 }
16514 Ex = PrevV;
16515 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16516 }
16517 }
16518if (!Ex) {
16519// "Reuse" the existing extract to improve final codegen.
16520if (ReplaceInst) {
16521// Leave the instruction as is if extracting it is cheaper and all
16522// of its operands are scalar.
16523if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16524 IgnoredExtracts.insert(EE);
16525 Ex = EE;
16526 }else {
16527auto *CloneInst = Inst->clone();
16528 CloneInst->insertBefore(Inst->getIterator());
16529if (Inst->hasName())
16530 CloneInst->takeName(Inst);
16531 Ex = CloneInst;
16532 }
16533 }elseif (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16534 ES && isa<Instruction>(Vec)) {
16535Value *V = ES->getVectorOperand();
16536auto *IVec = cast<Instruction>(Vec);
16537if (const TreeEntry *ETE = getTreeEntry(V))
16538 V = ETE->VectorizedValue;
16539if (auto *IV = dyn_cast<Instruction>(V);
16540 !IV ||IV == Vec ||IV->getParent() != IVec->getParent() ||
16541IV->comesBefore(IVec))
16542 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
16543else
16544 Ex = Builder.CreateExtractElement(Vec, Lane);
16545 }elseif (auto *VecTy =
16546 dyn_cast<FixedVectorType>(Scalar->getType())) {
16547assert(SLPReVec &&"FixedVectorType is not expected.");
16548unsigned VecTyNumElements = VecTy->getNumElements();
16549// When REVEC is enabled, we need to extract a vector.
16550// Note: The element size of Scalar may be different from the
16551// element size of Vec.
16552 Ex =createExtractVector(Builder, Vec, VecTyNumElements,
16553 ExternalUse.Lane * VecTyNumElements);
16554 }else {
16555 Ex = Builder.CreateExtractElement(Vec, Lane);
16556 }
16557// If necessary, sign-extend or zero-extend ScalarRoot
16558// to the larger type.
16559 ExV = Ex;
16560if (Scalar->getType() != Ex->getType())
16561 ExV = Builder.CreateIntCast(
16562 Ex, Scalar->getType(),
16563 !isKnownNonNegative(Scalar,SimplifyQuery(*DL)));
16564auto *I = dyn_cast<Instruction>(Ex);
16565 ScalarToEEs[Scalar].try_emplace(I ?I->getParent()
16566 : &F->getEntryBlock(),
16567 std::make_pair(Ex, ExV));
16568 }
16569// The then-branch of the previous if may produce constants, since operand
16570// 0 might be a constant.
16571if (auto *ExI = dyn_cast<Instruction>(Ex);
16572 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
16573 GatherShuffleExtractSeq.insert(ExI);
16574 CSEBlocks.insert(ExI->getParent());
16575 }
16576return ExV;
16577 }
16578assert(isa<FixedVectorType>(Scalar->getType()) &&
16579 isa<InsertElementInst>(Scalar) &&
16580"In-tree scalar of vector type is not insertelement?");
16581auto *IE = cast<InsertElementInst>(Scalar);
16582 VectorToInsertElement.try_emplace(Vec, IE);
16583return Vec;
16584 };
16585// If User == nullptr, the Scalar remains as a scalar in the vectorized
16586// instructions or is used as an extra arg. Generate an ExtractElement
16587// instruction and update the record for this scalar in ExternallyUsedValues.
16588if (!User) {
16589if (!ScalarsWithNullptrUser.insert(Scalar).second)
16590continue;
16591assert((ExternallyUsedValues.count(Scalar) ||
16592 Scalar->hasNUsesOrMore(UsesLimit) ||
16593 ExternalUsesAsOriginalScalar.contains(Scalar) ||
16594any_of(Scalar->users(),
16595 [&](llvm::User *U) {
16596 if (ExternalUsesAsOriginalScalar.contains(U))
16597 return true;
16598 TreeEntry *UseEntry = getTreeEntry(U);
16599 return UseEntry &&
16600 (UseEntry->State == TreeEntry::Vectorize ||
16601 UseEntry->State ==
16602 TreeEntry::StridedVectorize) &&
16603 (E->State == TreeEntry::Vectorize ||
16604 E->State == TreeEntry::StridedVectorize) &&
16605 doesInTreeUserNeedToExtract(
16606 Scalar, getRootEntryInstruction(*UseEntry),
16607 TLI, TTI);
16608 })) &&
16609"Scalar with nullptr User must be registered in "
16610"ExternallyUsedValues map or remain as scalar in vectorized "
16611"instructions");
16612if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16613if (auto *PHI = dyn_cast<PHINode>(VecI)) {
16614if (PHI->getParent()->isLandingPad())
16615 Builder.SetInsertPoint(
16616PHI->getParent(),
16617 std::next(
16618PHI->getParent()->getLandingPadInst()->getIterator()));
16619else
16620 Builder.SetInsertPoint(PHI->getParent(),
16621PHI->getParent()->getFirstNonPHIIt());
16622 }else {
16623 Builder.SetInsertPoint(VecI->getParent(),
16624 std::next(VecI->getIterator()));
16625 }
16626 }else {
16627 Builder.SetInsertPoint(&F->getEntryBlock(),F->getEntryBlock().begin());
16628 }
16629Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16630// Required to update internally referenced instructions.
16631if (Scalar != NewInst) {
16632assert((!isa<ExtractElementInst>(Scalar) ||
16633 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
16634"Extractelements should not be replaced.");
16635 Scalar->replaceAllUsesWith(NewInst);
16636 }
16637continue;
16638 }
16639
16640if (auto *VU = dyn_cast<InsertElementInst>(User);
16641 VU && VU->getOperand(1) == Scalar) {
16642// Skip if the scalar is another vector op or Vec is not an instruction.
16643if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16644if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
16645if (!UsedInserts.insert(VU).second)
16646continue;
16647// Need to use original vector, if the root is truncated.
16648auto BWIt = MinBWs.find(E);
16649if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
16650auto *ScalarTy = FTy->getElementType();
16651auto Key = std::make_pair(Vec, ScalarTy);
16652auto VecIt = VectorCasts.find(Key);
16653if (VecIt == VectorCasts.end()) {
16654IRBuilderBase::InsertPointGuard Guard(Builder);
16655if (auto *IVec = dyn_cast<PHINode>(Vec)) {
16656if (IVec->getParent()->isLandingPad())
16657 Builder.SetInsertPoint(IVec->getParent(),
16658 std::next(IVec->getParent()
16659 ->getLandingPadInst()
16660 ->getIterator()));
16661else
16662 Builder.SetInsertPoint(
16663 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16664 }elseif (auto *IVec = dyn_cast<Instruction>(Vec)) {
16665 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
16666 }
16667 Vec = Builder.CreateIntCast(
16668 Vec,
16669getWidenedType(
16670 ScalarTy,
16671 cast<FixedVectorType>(Vec->getType())->getNumElements()),
16672 BWIt->second.second);
16673 VectorCasts.try_emplace(Key, Vec);
16674 }else {
16675 Vec = VecIt->second;
16676 }
16677 }
16678
16679 std::optional<unsigned> InsertIdx =getElementIndex(VU);
16680if (InsertIdx) {
16681auto *It =find_if(
16682 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
16683// Checks if 2 insertelements are from the same buildvector.
16684InsertElementInst *VecInsert =Data.InsertElements.front();
16685returnareTwoInsertFromSameBuildVector(
16686 VU, VecInsert,
16687 [](InsertElementInst *II) {returnII->getOperand(0); });
16688 });
16689unsignedIdx = *InsertIdx;
16690if (It == ShuffledInserts.end()) {
16691 (void)ShuffledInserts.emplace_back();
16692 It = std::next(ShuffledInserts.begin(),
16693 ShuffledInserts.size() - 1);
16694 }
16695SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
16696if (Mask.empty())
16697 Mask.assign(FTy->getNumElements(),PoisonMaskElem);
16698 Mask[Idx] = ExternalUse.Lane;
16699 It->InsertElements.push_back(cast<InsertElementInst>(User));
16700continue;
16701 }
16702 }
16703 }
16704 }
16705
16706// Generate extracts for out-of-tree users.
16707// Find the insertion point for the extractelement lane.
16708if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16709if (PHINode *PH = dyn_cast<PHINode>(User)) {
16710for (unsignedI : seq<unsigned>(0, PH->getNumIncomingValues())) {
16711if (PH->getIncomingValue(I) == Scalar) {
16712Instruction *IncomingTerminator =
16713 PH->getIncomingBlock(I)->getTerminator();
16714if (isa<CatchSwitchInst>(IncomingTerminator)) {
16715 Builder.SetInsertPoint(VecI->getParent(),
16716 std::next(VecI->getIterator()));
16717 }else {
16718 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
16719 }
16720Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16721 PH->setOperand(I, NewInst);
16722 }
16723 }
16724 }else {
16725 Builder.SetInsertPoint(cast<Instruction>(User));
16726Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16727User->replaceUsesOfWith(Scalar, NewInst);
16728 }
16729 }else {
16730 Builder.SetInsertPoint(&F->getEntryBlock(),F->getEntryBlock().begin());
16731Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16732User->replaceUsesOfWith(Scalar, NewInst);
16733 }
16734
16735LLVM_DEBUG(dbgs() <<"SLP: Replaced:" << *User <<".\n");
16736 }
16737
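  // CreateShuffle splits a combined two-source mask into one mask per operand
  // (indices < VF pick from V1, indices >= VF pick from V2 after subtracting
  // VF) and lets ShuffleInstructionBuilder emit the resulting shuffle sequence.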
16738auto CreateShuffle = [&](Value *V1,Value *V2,ArrayRef<int> Mask) {
16739SmallVector<int> CombinedMask1(Mask.size(),PoisonMaskElem);
16740SmallVector<int> CombinedMask2(Mask.size(),PoisonMaskElem);
16741int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
16742for (intI = 0, E = Mask.size();I < E; ++I) {
16743if (Mask[I] < VF)
16744 CombinedMask1[I] = Mask[I];
16745else
16746 CombinedMask2[I] = Mask[I] - VF;
16747 }
16748ShuffleInstructionBuilder ShuffleBuilder(
16749 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
16750 ShuffleBuilder.add(V1, CombinedMask1);
16751if (V2)
16752 ShuffleBuilder.add(V2, CombinedMask2);
16753return ShuffleBuilder.finalize({}, {}, {});
16754 };
16755
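  // ResizeToVF adapts a vector whose element count differs from the mask size:
  // if the mask references lanes at or beyond the mask's VF, the vector is
  // reshuffled with the full mask; otherwise (unless a single common mask is
  // requested) an identity-style resize mask brings it to the expected VF.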
16756auto &&ResizeToVF = [&CreateShuffle](Value *Vec,ArrayRef<int> Mask,
16757bool ForSingleMask) {
16758unsigned VF = Mask.size();
16759unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
16760if (VF != VecVF) {
16761if (any_of(Mask, [VF](intIdx) {returnIdx >=static_cast<int>(VF); })) {
16762 Vec = CreateShuffle(Vec,nullptr, Mask);
16763return std::make_pair(Vec,true);
16764 }
16765if (!ForSingleMask) {
16766SmallVector<int> ResizeMask(VF,PoisonMaskElem);
16767for (unsignedI = 0;I < VF; ++I) {
16768if (Mask[I] !=PoisonMaskElem)
16769 ResizeMask[Mask[I]] = Mask[I];
16770 }
16771 Vec = CreateShuffle(Vec,nullptr, ResizeMask);
16772 }
16773 }
16774
16775return std::make_pair(Vec,false);
16776 };
16777// Perform shuffling of the vectorized tree entries for better handling of
16778// external extracts.
16779for (intI = 0, E = ShuffledInserts.size();I < E; ++I) {
16780// Find the first and the last instruction in the list of insertelements.
16781sort(ShuffledInserts[I].InsertElements,isFirstInsertElement);
16782InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
16783InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
16784 Builder.SetInsertPoint(LastInsert);
16785autoVector = ShuffledInserts[I].ValueMasks.takeVector();
16786Value *NewInst = performExtractsShuffleAction<Value>(
16787MutableArrayRef(Vector.data(),Vector.size()),
16788 FirstInsert->getOperand(0),
16789 [](Value *Vec) {
16790 return cast<VectorType>(Vec->getType())
16791 ->getElementCount()
16792 .getKnownMinValue();
16793 },
16794 ResizeToVF,
16795 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
16796ArrayRef<Value *> Vals) {
16797 assert((Vals.size() == 1 || Vals.size() == 2) &&
16798"Expected exactly 1 or 2 input values.");
16799 if (Vals.size() == 1) {
16800// Do not create shuffle if the mask is a simple identity
16801// non-resizing mask.
16802 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16803 ->getNumElements() ||
16804 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16805 return CreateShuffle(Vals.front(), nullptr, Mask);
16806 return Vals.front();
16807 }
16808return CreateShuffle(Vals.front() ? Vals.front()
16809 : FirstInsert->getOperand(0),
16810 Vals.back(), Mask);
16811 });
16812auto It = ShuffledInserts[I].InsertElements.rbegin();
16813// Rebuild buildvector chain.
16814InsertElementInst *II =nullptr;
16815if (It != ShuffledInserts[I].InsertElements.rend())
16816II = *It;
16817SmallVector<Instruction *> Inserts;
16818while (It != ShuffledInserts[I].InsertElements.rend()) {
16819assert(II &&"Must be an insertelement instruction.");
16820if (*It ==II)
16821 ++It;
16822else
16823 Inserts.push_back(cast<Instruction>(II));
16824II = dyn_cast<InsertElementInst>(II->getOperand(0));
16825 }
16826for (Instruction *II :reverse(Inserts)) {
16827II->replaceUsesOfWith(II->getOperand(0), NewInst);
16828if (auto *NewI = dyn_cast<Instruction>(NewInst))
16829if (II->getParent() == NewI->getParent() &&II->comesBefore(NewI))
16830II->moveAfter(NewI);
16831 NewInst =II;
16832 }
16833 LastInsert->replaceAllUsesWith(NewInst);
16834for (InsertElementInst *IE :reverse(ShuffledInserts[I].InsertElements)) {
16835 IE->replaceUsesOfWith(IE->getOperand(0),
16836PoisonValue::get(IE->getOperand(0)->getType()));
16837 IE->replaceUsesOfWith(IE->getOperand(1),
16838PoisonValue::get(IE->getOperand(1)->getType()));
16839eraseInstruction(IE);
16840 }
16841 CSEBlocks.insert(LastInsert->getParent());
16842 }
16843
16844SmallVector<Instruction *> RemovedInsts;
16845// For each vectorized value:
16846for (auto &TEPtr : VectorizableTree) {
16847 TreeEntry *Entry = TEPtr.get();
16848
16849// No need to handle users of gathered values.
16850if (Entry->isGather())
16851continue;
16852
16853assert(Entry->VectorizedValue &&"Can't find vectorizable value");
16854
16855// For each lane:
16856for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16857Value *Scalar = Entry->Scalars[Lane];
16858
16859if (Entry->getOpcode() == Instruction::GetElementPtr &&
16860 !isa<GetElementPtrInst>(Scalar))
16861continue;
16862if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16863 EE && IgnoredExtracts.contains(EE))
16864continue;
16865if (isa<PoisonValue>(Scalar))
16866continue;
16867#ifndef NDEBUG
16868Type *Ty = Scalar->getType();
16869if (!Ty->isVoidTy()) {
16870for (User *U : Scalar->users()) {
16871LLVM_DEBUG(dbgs() <<"SLP: \tvalidating user:" << *U <<".\n");
16872
16873// It is legal to delete users in the ignorelist.
16874assert((getTreeEntry(U) ||
16875 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16876 (isa_and_nonnull<Instruction>(U) &&
16877 isDeleted(cast<Instruction>(U)))) &&
16878"Deleting out-of-tree value");
16879 }
16880 }
16881#endif
16882LLVM_DEBUG(dbgs() <<"SLP: \tErasing scalar:" << *Scalar <<".\n");
16883auto *I = cast<Instruction>(Scalar);
16884 RemovedInsts.push_back(I);
16885 }
16886 }
16887
16888// Merge the DIAssignIDs from the about-to-be-deleted instructions into the
16889// new vector instruction.
16890if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16891V->mergeDIAssignID(RemovedInsts);
16892
16893// Clear up reduction references, if any.
16894if (UserIgnoreList) {
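    // Roughly: for each removed reduction scalar, uses inside the ignored
    // reduction list are replaced with poison; the condition operand of a
    // poisoning logical and/or select is handled separately below so that
    // poison is not fed into its short-circuit semantics.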
16895for (Instruction *I : RemovedInsts) {
16896const TreeEntry *IE = getTreeEntry(I);
16897if (IE->Idx != 0 &&
16898 !(VectorizableTree.front()->isGather() &&
16899 !IE->UserTreeIndices.empty() &&
16900 (ValueToGatherNodes.lookup(I).contains(
16901 VectorizableTree.front().get()) ||
16902any_of(IE->UserTreeIndices,
16903 [&](const EdgeInfo &EI) {
16904 return EI.UserTE == VectorizableTree.front().get() &&
16905 EI.EdgeIdx == UINT_MAX;
16906 }))) &&
16907 !(GatheredLoadsEntriesFirst.has_value() &&
16908IE->Idx >= *GatheredLoadsEntriesFirst &&
16909 VectorizableTree.front()->isGather() &&
16910is_contained(VectorizableTree.front()->Scalars,I)))
16911continue;
16912SmallVector<SelectInst *> LogicalOpSelects;
16913I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
16914// Do not replace condition of the logical op in form select <cond>.
16915 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16916 (match(U.getUser(), m_LogicalAnd()) ||
16917 match(U.getUser(), m_LogicalOr())) &&
16918 U.getOperandNo() == 0;
16919 if (IsPoisoningLogicalOp) {
16920 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16921 return false;
16922 }
16923return UserIgnoreList->contains(U.getUser());
16924 });
16925// Replace conditions of the poisoning logical ops with the non-poison
16926// constant value.
16927for (SelectInst *SI : LogicalOpSelects)
16928SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
16929 }
16930 }
16931// Retain to-be-deleted instructions for some debug-info bookkeeping and alias
16932// cache correctness.
16933// NOTE: removeInstructionsAndOperands only marks the instructions for deletion
16934// - instructions are not deleted until later.
16935 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
16936
16937 Builder.ClearInsertionPoint();
16938 InstrElementSize.clear();
16939
16940const TreeEntry &RootTE = *VectorizableTree.front();
16941Value *Vec = RootTE.VectorizedValue;
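  // If the root was narrowed (MinBWs) to a width different from the required
  // reduction bit width, cast the root vector back to the reduction element
  // width at the reduction root before returning it.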
16942if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16943 It != MinBWs.end() &&
16944 ReductionBitWidth != It->second.first) {
16945IRBuilder<>::InsertPointGuard Guard(Builder);
16946 Builder.SetInsertPoint(ReductionRoot->getParent(),
16947 ReductionRoot->getIterator());
16948 Vec = Builder.CreateIntCast(
16949 Vec,
16950VectorType::get(Builder.getIntNTy(ReductionBitWidth),
16951 cast<VectorType>(Vec->getType())->getElementCount()),
16952 It->second.second);
16953 }
16954return Vec;
16955}
16956
16957voidBoUpSLP::optimizeGatherSequence() {
16958LLVM_DEBUG(dbgs() <<"SLP: Optimizing " << GatherShuffleExtractSeq.size()
16959 <<" gather sequences instructions.\n");
16960// LICM InsertElementInst sequences.
16961for (Instruction *I : GatherShuffleExtractSeq) {
16962if (isDeleted(I))
16963continue;
16964
16965// Check if this block is inside a loop.
16966Loop *L = LI->getLoopFor(I->getParent());
16967if (!L)
16968continue;
16969
16970// Check if it has a preheader.
16971BasicBlock *PreHeader = L->getLoopPreheader();
16972if (!PreHeader)
16973continue;
16974
16975// If the vector or the element that we insert into it are
16976// instructions that are defined in this basic block then we can't
16977// hoist this instruction.
16978if (any_of(I->operands(), [L](Value *V) {
16979 auto *OpI = dyn_cast<Instruction>(V);
16980 return OpI && L->contains(OpI);
16981 }))
16982continue;
16983
16984// We can hoist this instruction. Move it to the pre-header.
16985I->moveBefore(PreHeader->getTerminator()->getIterator());
16986 CSEBlocks.insert(PreHeader);
16987 }
16988
16989// Make a list of all reachable blocks in our CSE queue.
16990SmallVector<const DomTreeNode *, 8> CSEWorkList;
16991 CSEWorkList.reserve(CSEBlocks.size());
16992for (BasicBlock *BB : CSEBlocks)
16993if (DomTreeNode *N = DT->getNode(BB)) {
16994assert(DT->isReachableFromEntry(N));
16995 CSEWorkList.push_back(N);
16996 }
16997
16998// Sort blocks by domination. This ensures we visit a block after all blocks
16999// dominating it are visited.
17000llvm::sort(CSEWorkList, [](constDomTreeNode *A,constDomTreeNode *B) {
17001assert((A ==B) == (A->getDFSNumIn() ==B->getDFSNumIn()) &&
17002"Different nodes should have different DFS numbers");
17003returnA->getDFSNumIn() <B->getDFSNumIn();
17004 });
17005
17006// Less defined shuffles can be replaced by the more defined copies.
17007// Between two shuffles one is less defined if it has the same vector operands
17008// and its mask indices are the same as in the first one or undefs. E.g.
17009// shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
17010// poison, <0, 0, 0, 0>.
17011auto &&IsIdenticalOrLessDefined = [TTI =TTI](Instruction *I1,
17012Instruction *I2,
17013SmallVectorImpl<int> &NewMask) {
17014if (I1->getType() != I2->getType())
17015returnfalse;
17016auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
17017auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
17018if (!SI1 || !SI2)
17019return I1->isIdenticalTo(I2);
17020if (SI1->isIdenticalTo(SI2))
17021returntrue;
17022for (intI = 0, E = SI1->getNumOperands();I < E; ++I)
17023if (SI1->getOperand(I) != SI2->getOperand(I))
17024returnfalse;
17025// Check if the second instruction is more defined than the first one.
17026 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
17027ArrayRef<int> SM1 = SI1->getShuffleMask();
17028// Count trailing undefs in the mask to check the final number of used
17029// registers.
17030unsigned LastUndefsCnt = 0;
17031for (intI = 0, E = NewMask.size();I < E; ++I) {
17032if (SM1[I] ==PoisonMaskElem)
17033 ++LastUndefsCnt;
17034else
17035 LastUndefsCnt = 0;
17036if (NewMask[I] !=PoisonMaskElem && SM1[I] !=PoisonMaskElem &&
17037 NewMask[I] != SM1[I])
17038returnfalse;
17039if (NewMask[I] ==PoisonMaskElem)
17040 NewMask[I] = SM1[I];
17041 }
17042// Check if the last undefs actually change the final number of used vector
17043// registers.
17044return SM1.size() - LastUndefsCnt > 1 &&
17045TTI->getNumberOfParts(SI1->getType()) ==
17046TTI->getNumberOfParts(
17047getWidenedType(SI1->getType()->getElementType(),
17048 SM1.size() - LastUndefsCnt));
17049 };
17050// Perform O(N^2) search over the gather/shuffle sequences and merge identical
17051// instructions. TODO: We can further optimize this scan if we split the
17052// instructions into different buckets based on the insert lane.
17053SmallVector<Instruction *, 16> Visited;
17054for (autoI = CSEWorkList.begin(), E = CSEWorkList.end();I != E; ++I) {
17055assert(*I &&
17056 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
17057"Worklist not sorted properly!");
17058BasicBlock *BB = (*I)->getBlock();
17059// For all instructions in blocks containing gather sequences:
17060for (Instruction &In :llvm::make_early_inc_range(*BB)) {
17061if (isDeleted(&In))
17062continue;
17063if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17064 !GatherShuffleExtractSeq.contains(&In))
17065continue;
17066
17067// Check if we can replace this instruction with any of the
17068// visited instructions.
17069bool Replaced =false;
17070for (Instruction *&V : Visited) {
17071SmallVector<int> NewMask;
17072if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17073 DT->dominates(V->getParent(), In.getParent())) {
17074 In.replaceAllUsesWith(V);
17075eraseInstruction(&In);
17076if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
17077if (!NewMask.empty())
17078 SI->setShuffleMask(NewMask);
17079 Replaced =true;
17080break;
17081 }
17082if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17083 GatherShuffleExtractSeq.contains(V) &&
17084 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17085 DT->dominates(In.getParent(), V->getParent())) {
17086 In.moveAfter(V);
17087 V->replaceAllUsesWith(&In);
17088eraseInstruction(V);
17089if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17090if (!NewMask.empty())
17091 SI->setShuffleMask(NewMask);
17092 V = &In;
17093 Replaced =true;
17094break;
17095 }
17096 }
17097if (!Replaced) {
17098assert(!is_contained(Visited, &In));
17099 Visited.push_back(&In);
17100 }
17101 }
17102 }
17103 CSEBlocks.clear();
17104 GatherShuffleExtractSeq.clear();
17105}
17106
17107BoUpSLP::ScheduleData *
17108BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
17109 ScheduleData *Bundle =nullptr;
17110 ScheduleData *PrevInBundle =nullptr;
17111for (Value *V : VL) {
17112if (doesNotNeedToBeScheduled(V))
17113continue;
17114 ScheduleData *BundleMember = getScheduleData(V);
17115assert(BundleMember &&
17116"no ScheduleData for bundle member "
17117"(maybe not in same basic block)");
17118assert(BundleMember->isSchedulingEntity() &&
17119"bundle member already part of other bundle");
17120if (PrevInBundle) {
17121 PrevInBundle->NextInBundle = BundleMember;
17122 }else {
17123 Bundle = BundleMember;
17124 }
17125
17126// Group the instructions to a bundle.
17127 BundleMember->FirstInBundle = Bundle;
17128 PrevInBundle = BundleMember;
17129 }
17130assert(Bundle &&"Failed to find schedule bundle");
17131return Bundle;
17132}
17133
17134// Groups the instructions to a bundle (which is then a single scheduling entity)
17135// and schedules instructions until the bundle gets ready.
17136std::optional<BoUpSLP::ScheduleData *>
17137BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,BoUpSLP *SLP,
17138const InstructionsState &S) {
17139// No need to schedule PHIs, insertelement, extractelement and extractvalue
17140// instructions.
17141if (isa<PHINode>(S.getMainOp()) ||
17142isVectorLikeInstWithConstOps(S.getMainOp()) ||doesNotNeedToSchedule(VL))
17143returnnullptr;
17144
17145// Initialize the instruction bundle.
17146Instruction *OldScheduleEnd = ScheduleEnd;
17147LLVM_DEBUG(dbgs() <<"SLP: bundle: " << *S.getMainOp() <<"\n");
17148
17149auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
17150 ScheduleData *Bundle) {
17151// The scheduling region got new instructions at the lower end (or it is a
17152// new region for the first bundle). This makes it necessary to
17153// recalculate all dependencies.
17154// It is seldom that this needs to be done a second time after adding the
17155// initial bundle to the region.
17156if (ScheduleEnd != OldScheduleEnd) {
17157for (auto *I = ScheduleStart;I != ScheduleEnd;I =I->getNextNode())
17158if (ScheduleData *SD = getScheduleData(I))
17159 SD->clearDependencies();
17160 ReSchedule =true;
17161 }
17162if (Bundle) {
17163LLVM_DEBUG(dbgs() <<"SLP: try schedule bundle " << *Bundle
17164 <<" in block " << BB->getName() <<"\n");
17165 calculateDependencies(Bundle,/*InsertInReadyList=*/true, SLP);
17166 }
17167
17168if (ReSchedule) {
17169 resetSchedule();
17170 initialFillReadyList(ReadyInsts);
17171 }
17172
17173// Now try to schedule the new bundle or (if no bundle) just calculate
17174// dependencies. As soon as the bundle is "ready" it means that there are no
17175// cyclic dependencies and we can schedule it. Note that it's important that we
17176// don't "schedule" the bundle yet (see cancelScheduling).
17177while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17178 !ReadyInsts.empty()) {
17179 ScheduleData *Picked = ReadyInsts.pop_back_val();
17180assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17181"must be ready to schedule");
17182 schedule(Picked, ReadyInsts);
17183 }
17184 };
17185
17186// Make sure that the scheduling region contains all
17187// instructions of the bundle.
17188for (Value *V : VL) {
17189if (doesNotNeedToBeScheduled(V))
17190continue;
17191if (!extendSchedulingRegion(V, S)) {
17192// If the scheduling region got new instructions at the lower end (or it
17193// is a new region for the first bundle), it becomes necessary to
17194// recalculate all dependencies.
17195// Otherwise the compiler may crash trying to incorrectly calculate
17196// dependencies and emit instructions in the wrong order during the actual
17197// scheduling.
17198 TryScheduleBundleImpl(/*ReSchedule=*/false,nullptr);
17199return std::nullopt;
17200 }
17201 }
17202
17203bool ReSchedule =false;
17204for (Value *V : VL) {
17205if (doesNotNeedToBeScheduled(V))
17206continue;
17207 ScheduleData *BundleMember = getScheduleData(V);
17208assert(BundleMember &&
17209"no ScheduleData for bundle member (maybe not in same basic block)");
17210
17211// Make sure we don't leave the pieces of the bundle in the ready list when
17212// whole bundle might not be ready.
17213 ReadyInsts.remove(BundleMember);
17214
17215if (!BundleMember->IsScheduled)
17216continue;
17217// A bundle member was scheduled as single instruction before and now
17218// needs to be scheduled as part of the bundle. We just get rid of the
17219// existing schedule.
17220LLVM_DEBUG(dbgs() <<"SLP: reset schedule because " << *BundleMember
17221 <<" was already scheduled\n");
17222 ReSchedule =true;
17223 }
17224
17225auto *Bundle = buildBundle(VL);
17226 TryScheduleBundleImpl(ReSchedule, Bundle);
17227if (!Bundle->isReady()) {
17228 cancelScheduling(VL, S.getMainOp());
17229return std::nullopt;
17230 }
17231return Bundle;
17232}
17233
17234void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
17235Value *OpValue) {
17236if (isa<PHINode>(OpValue) ||isVectorLikeInstWithConstOps(OpValue) ||
17237doesNotNeedToSchedule(VL))
17238return;
17239
17240if (doesNotNeedToBeScheduled(OpValue))
17241 OpValue = *find_if_not(VL,doesNotNeedToBeScheduled);
17242 ScheduleData *Bundle = getScheduleData(OpValue);
17243LLVM_DEBUG(dbgs() <<"SLP: cancel scheduling of " << *Bundle <<"\n");
17244assert(!Bundle->IsScheduled &&
17245"Can't cancel bundle which is already scheduled");
17246assert(Bundle->isSchedulingEntity() &&
17247 (Bundle->isPartOfBundle() ||needToScheduleSingleInstruction(VL)) &&
17248"tried to unbundle something which is not a bundle");
17249
17250// Remove the bundle from the ready list.
17251if (Bundle->isReady())
17252 ReadyInsts.remove(Bundle);
17253
17254// Un-bundle: make single instructions out of the bundle.
17255 ScheduleData *BundleMember = Bundle;
17256while (BundleMember) {
17257assert(BundleMember->FirstInBundle == Bundle &&"corrupt bundle links");
17258 BundleMember->FirstInBundle = BundleMember;
17259 ScheduleData *Next = BundleMember->NextInBundle;
17260 BundleMember->NextInBundle =nullptr;
17261 BundleMember->TE =nullptr;
17262if (BundleMember->unscheduledDepsInBundle() == 0) {
17263 ReadyInsts.insert(BundleMember);
17264 }
17265 BundleMember = Next;
17266 }
17267}
17268
17269BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
17270// Allocate a new ScheduleData for the instruction.
17271if (ChunkPos >= ChunkSize) {
17272 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
17273 ChunkPos = 0;
17274 }
17275return &(ScheduleDataChunks.back()[ChunkPos++]);
17276}
17277
17278bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17279Value *V,const InstructionsState &S) {
17280Instruction *I = dyn_cast<Instruction>(V);
17281assert(I &&"bundle member must be an instruction");
17282assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
17283 !doesNotNeedToBeScheduled(I) &&
17284"phi nodes/insertelements/extractelements/extractvalues don't need to "
17285"be scheduled");
17286if (getScheduleData(I))
17287returntrue;
17288if (!ScheduleStart) {
17289// It's the first instruction in the new region.
17290 initScheduleData(I,I->getNextNode(),nullptr,nullptr);
17291 ScheduleStart =I;
17292 ScheduleEnd =I->getNextNode();
17293assert(ScheduleEnd &&"tried to vectorize a terminator?");
17294LLVM_DEBUG(dbgs() <<"SLP: initialize schedule region to " << *I <<"\n");
17295returntrue;
17296 }
17297// Search up and down at the same time, because we don't know if the new
17298// instruction is above or below the existing scheduling region.
17299// Ignore debug info (and other "AssumeLike" intrinsics) so they are not counted
17300// against the budget. Otherwise debug info could affect codegen.
17301BasicBlock::reverse_iterator UpIter =
17302 ++ScheduleStart->getIterator().getReverse();
17303BasicBlock::reverse_iterator UpperEnd = BB->rend();
17304BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
17305BasicBlock::iterator LowerEnd = BB->end();
17306auto IsAssumeLikeIntr = [](constInstruction &I) {
17307if (auto *II = dyn_cast<IntrinsicInst>(&I))
17308returnII->isAssumeLikeIntrinsic();
17309returnfalse;
17310 };
17311 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17312 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17313while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=I &&
17314 &*DownIter !=I) {
17315if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17316LLVM_DEBUG(dbgs() <<"SLP: exceeded schedule region size limit\n");
17317returnfalse;
17318 }
17319
17320 ++UpIter;
17321 ++DownIter;
17322
17323 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17324 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17325 }
17326if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==I)) {
17327assert(I->getParent() == ScheduleStart->getParent() &&
17328"Instruction is in wrong basic block.");
17329 initScheduleData(I, ScheduleStart,nullptr, FirstLoadStoreInRegion);
17330 ScheduleStart =I;
17331LLVM_DEBUG(dbgs() <<"SLP: extend schedule region start to " << *I
17332 <<"\n");
17333returntrue;
17334 }
17335assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==I)) &&
17336"Expected to reach top of the basic block or instruction down the "
17337"lower end.");
17338assert(I->getParent() == ScheduleEnd->getParent() &&
17339"Instruction is in wrong basic block.");
17340 initScheduleData(ScheduleEnd,I->getNextNode(), LastLoadStoreInRegion,
17341nullptr);
17342 ScheduleEnd =I->getNextNode();
17343assert(ScheduleEnd &&"tried to vectorize a terminator?");
17344LLVM_DEBUG(dbgs() <<"SLP: extend schedule region end to " << *I <<"\n");
17345returntrue;
17346}
17347
17348void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
17349Instruction *ToI,
17350 ScheduleData *PrevLoadStore,
17351 ScheduleData *NextLoadStore) {
17352 ScheduleData *CurrentLoadStore = PrevLoadStore;
17353for (Instruction *I = FromI;I != ToI;I =I->getNextNode()) {
17354// No need to allocate data for non-schedulable instructions.
17355if (doesNotNeedToBeScheduled(I))
17356continue;
17357 ScheduleData *SD = ScheduleDataMap.lookup(I);
17358if (!SD) {
17359 SD = allocateScheduleDataChunks();
17360 ScheduleDataMap[I] = SD;
17361 }
17362assert(!isInSchedulingRegion(SD) &&
17363"new ScheduleData already in scheduling region");
17364 SD->init(SchedulingRegionID,I);
17365
17366if (I->mayReadOrWriteMemory() &&
17367 (!isa<IntrinsicInst>(I) ||
17368 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
17369 cast<IntrinsicInst>(I)->getIntrinsicID() !=
17370 Intrinsic::pseudoprobe))) {
17371// Update the linked list of memory accessing instructions.
17372if (CurrentLoadStore) {
17373 CurrentLoadStore->NextLoadStore = SD;
17374 }else {
17375 FirstLoadStoreInRegion = SD;
17376 }
17377 CurrentLoadStore = SD;
17378 }
17379
17380if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17381match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17382 RegionHasStackSave =true;
17383 }
17384if (NextLoadStore) {
17385if (CurrentLoadStore)
17386 CurrentLoadStore->NextLoadStore = NextLoadStore;
17387 }else {
17388 LastLoadStoreInRegion = CurrentLoadStore;
17389 }
17390}
17391
17392void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17393bool InsertInReadyList,
17394BoUpSLP *SLP) {
17395assert(SD->isSchedulingEntity());
17396
17397SmallVector<ScheduleData *, 10> WorkList;
17398 WorkList.push_back(SD);
17399
17400while (!WorkList.empty()) {
17401 ScheduleData *SD = WorkList.pop_back_val();
17402for (ScheduleData *BundleMember = SD; BundleMember;
17403 BundleMember = BundleMember->NextInBundle) {
17404assert(isInSchedulingRegion(BundleMember));
17405if (BundleMember->hasValidDependencies())
17406continue;
17407
17408LLVM_DEBUG(dbgs() <<"SLP: update deps of " << *BundleMember
17409 <<"\n");
17410 BundleMember->Dependencies = 0;
17411 BundleMember->resetUnscheduledDeps();
17412
17413// Handle def-use chain dependencies.
17414for (User *U : BundleMember->Inst->users()) {
17415if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17416 BundleMember->Dependencies++;
17417 ScheduleData *DestBundle = UseSD->FirstInBundle;
17418if (!DestBundle->IsScheduled)
17419 BundleMember->incrementUnscheduledDeps(1);
17420if (!DestBundle->hasValidDependencies())
17421 WorkList.push_back(DestBundle);
17422 }
17423 }
17424
17425auto MakeControlDependent = [&](Instruction *I) {
17426auto *DepDest = getScheduleData(I);
17427assert(DepDest &&"must be in schedule window");
17428 DepDest->ControlDependencies.push_back(BundleMember);
17429 BundleMember->Dependencies++;
17430 ScheduleData *DestBundle = DepDest->FirstInBundle;
17431if (!DestBundle->IsScheduled)
17432 BundleMember->incrementUnscheduledDeps(1);
17433if (!DestBundle->hasValidDependencies())
17434 WorkList.push_back(DestBundle);
17435 };
17436
      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
17440if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
17441for (Instruction *I = BundleMember->Inst->getNextNode();
17442I != ScheduleEnd;I =I->getNextNode()) {
17443if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
17444continue;
17445
17446// Add the dependency
17447 MakeControlDependent(I);
17448
17449if (!isGuaranteedToTransferExecutionToSuccessor(I))
17450// Everything past here must be control dependent on I.
17451break;
17452 }
17453 }
17454
      if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from reordering above a preceding stackrestore.
17459if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17460match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17461for (Instruction *I = BundleMember->Inst->getNextNode();
17462I != ScheduleEnd;I =I->getNextNode()) {
17463if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17464match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            // Any allocas past here must be control dependent on I, and I
            // must be memory dependent on BundleMember->Inst.
17467break;
17468
17469if (!isa<AllocaInst>(I))
17470continue;
17471
17472// Add the dependency
17473 MakeControlDependent(I);
17474 }
17475 }
17476
        // In addition to the cases handled just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Preventing allocas from moving below a stackrestore
        // is currently believed to be conservative; moving loads/stores below
        // a stackrestore can lead to incorrect code.
17482if (isa<AllocaInst>(BundleMember->Inst) ||
17483 BundleMember->Inst->mayReadOrWriteMemory()) {
17484for (Instruction *I = BundleMember->Inst->getNextNode();
17485I != ScheduleEnd;I =I->getNextNode()) {
17486if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
17487 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17488continue;
17489
17490// Add the dependency
17491 MakeControlDependent(I);
17492break;
17493 }
17494 }
17495 }
17496
17497// Handle the memory dependencies (if any).
17498 ScheduleData *DepDest = BundleMember->NextLoadStore;
17499if (!DepDest)
17500continue;
17501Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for a non-memory-affecting bundle?");
17504MemoryLocation SrcLoc =getLocation(SrcInst);
17505bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17506unsigned NumAliased = 0;
17507unsigned DistToSrc = 1;
17508
17509for (; DepDest; DepDest = DepDest->NextLoadStore) {
17510assert(isInSchedulingRegion(DepDest));
17511
17512// We have two limits to reduce the complexity:
17513// 1) AliasedCheckLimit: It's a small limit to reduce calls to
17514// SLP->isAliased (which is the expensive part in this loop).
17515// 2) MaxMemDepDistance: It's for very large blocks and it aborts
17516// the whole loop (even if the loop is fast, it's quadratic).
17517// It's important for the loop break condition (see below) to
17518// check this limit even between two read-only instructions.
17519if (DistToSrc >=MaxMemDepDistance ||
17520 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17521 (NumAliased >=AliasedCheckLimit ||
17522 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17523
17524// We increment the counter only if the locations are aliased
17525// (instead of counting all alias checks). This gives a better
17526// balance between reduced runtime and accurate dependencies.
17527 NumAliased++;
17528
17529 DepDest->MemoryDependencies.push_back(BundleMember);
17530 BundleMember->Dependencies++;
17531 ScheduleData *DestBundle = DepDest->FirstInBundle;
17532if (!DestBundle->IsScheduled) {
17533 BundleMember->incrementUnscheduledDeps(1);
17534 }
17535if (!DestBundle->hasValidDependencies()) {
17536 WorkList.push_back(DestBundle);
17537 }
17538 }
17539
17540// Example, explaining the loop break condition: Let's assume our
17541// starting instruction is i0 and MaxMemDepDistance = 3.
17542//
17543// +--------v--v--v
17544// i0,i1,i2,i3,i4,i5,i6,i7,i8
17545// +--------^--^--^
17546//
17547// MaxMemDepDistance let us stop alias-checking at i3 and we add
17548// dependencies from i0 to i3,i4,.. (even if they are not aliased).
17549// Previously we already added dependencies from i3 to i6,i7,i8
17550// (because of MaxMemDepDistance). As we added a dependency from
17551// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
17552// and we can abort this loop at i6.
17553if (DistToSrc >= 2 *MaxMemDepDistance)
17554break;
17555 DistToSrc++;
17556 }
17557 }
17558if (InsertInReadyList && SD->isReady()) {
17559 ReadyInsts.insert(SD);
17560LLVM_DEBUG(dbgs() <<"SLP: gets ready on update: " << *SD->Inst
17561 <<"\n");
17562 }
17563 }
17564}
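// Illustrative sketch (assumption, simplified): for
//
//   %v = load i32, ptr %p
//   store i32 0, ptr %q          ; may alias %p
//   %u = add i32 %v, 1
//
// the load's ScheduleData gets one def-use dependency for %u and, if %p and
// %q may alias, one memory dependency for the store. Each dependency whose
// destination bundle is not yet scheduled also bumps UnscheduledDeps, and the
// load's bundle only becomes ready once that counter drops back to zero.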
17565
17566void BoUpSLP::BlockScheduling::resetSchedule() {
17567assert(ScheduleStart &&
17568"tried to reset schedule on block which has not been scheduled");
17569for (Instruction *I = ScheduleStart;I != ScheduleEnd;I =I->getNextNode()) {
17570if (ScheduleData *SD = getScheduleData(I)) {
17571assert(isInSchedulingRegion(SD) &&
17572"ScheduleData not in scheduling region");
17573 SD->IsScheduled =false;
17574 SD->resetUnscheduledDeps();
17575 }
17576 }
17577 ReadyInsts.clear();
17578}
17579
17580void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17581if (!BS->ScheduleStart)
17582return;
17583
17584LLVM_DEBUG(dbgs() <<"SLP: schedule block " << BS->BB->getName() <<"\n");
17585
17586// A key point - if we got here, pre-scheduling was able to find a valid
17587// scheduling of the sub-graph of the scheduling window which consists
17588// of all vector bundles and their transitive users. As such, we do not
17589// need to reschedule anything *outside of* that subgraph.
17590
17591 BS->resetSchedule();
17592
17593// For the real scheduling we use a more sophisticated ready-list: it is
17594// sorted by the original instruction location. This lets the final schedule
17595// be as close as possible to the original instruction order.
17596// WARNING: If changing this order causes a correctness issue, that means
17597// there is some missing dependence edge in the schedule data graph.
17598structScheduleDataCompare {
17599bool operator()(ScheduleData *SD1, ScheduleData *SD2) const{
17600return SD2->SchedulingPriority < SD1->SchedulingPriority;
17601 }
17602 };
17603 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
17604
17605// Ensure that all dependency data is updated (for nodes in the sub-graph)
17606// and fill the ready-list with initial instructions.
17607intIdx = 0;
17608for (auto *I = BS->ScheduleStart;I != BS->ScheduleEnd;
17609I =I->getNextNode()) {
17610if (ScheduleData *SD = BS->getScheduleData(I)) {
17611 [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
17612assert((isVectorLikeInstWithConstOps(SD->Inst) ||
17613 SD->isPartOfBundle() ==
17614 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
17615"scheduler and vectorizer bundle mismatch");
17616 SD->FirstInBundle->SchedulingPriority =Idx++;
17617
17618if (SD->isSchedulingEntity() && SD->isPartOfBundle())
17619 BS->calculateDependencies(SD,false,this);
17620 }
17621 }
17622 BS->initialFillReadyList(ReadyInsts);
17623
17624Instruction *LastScheduledInst = BS->ScheduleEnd;
17625
17626// Do the "real" scheduling.
17627while (!ReadyInsts.empty()) {
17628 ScheduleData *Picked = *ReadyInsts.begin();
17629 ReadyInsts.erase(ReadyInsts.begin());
17630
17631// Move the scheduled instruction(s) to their dedicated places, if not
17632// there yet.
17633for (ScheduleData *BundleMember = Picked; BundleMember;
17634 BundleMember = BundleMember->NextInBundle) {
17635Instruction *PickedInst = BundleMember->Inst;
17636if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
17637 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
17638 LastScheduledInst = PickedInst;
17639 }
17640
17641 BS->schedule(Picked, ReadyInsts);
17642 }
17643
17644// Check that we didn't break any of our invariants.
17645#ifdef EXPENSIVE_CHECKS
17646 BS->verify();
17647#endif
17648
17649#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
17650// Check that all schedulable entities got scheduled
17651for (auto *I = BS->ScheduleStart;I != BS->ScheduleEnd;I =I->getNextNode()) {
17652 ScheduleData *SD = BS->getScheduleData(I);
17653if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
17654assert(SD->IsScheduled &&"must be scheduled at this point");
17655 }
17656#endif
17657
17658// Avoid duplicate scheduling of the block.
17659 BS->ScheduleStart =nullptr;
17660}
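// Illustrative sketch (assumption): the ready list is ordered by
// SchedulingPriority, which is just the original position in the block, and
// the bottom-most ready entry is picked first. Each picked instruction is
// placed directly in front of the previously placed one, so the region is
// rebuilt upwards from ScheduleEnd, e.g.
//
//   ready = {i2, i5, i7}  ->  pick i7, then i5, then i2
//   placement:            ... i2  i5  i7 | ScheduleEnd
//
// which keeps the final schedule as close as possible to the original order.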
17661
unsigned BoUpSLP::getVectorElementSize(Value *V) {
17663// If V is a store, just return the width of the stored value (or value
17664// truncated just before storing) without traversing the expression tree.
17665// This is the common case.
17666if (auto *Store = dyn_cast<StoreInst>(V))
17667returnDL->getTypeSizeInBits(Store->getValueOperand()->getType());
17668
17669if (auto *IEI = dyn_cast<InsertElementInst>(V))
17670returngetVectorElementSize(IEI->getOperand(1));
17671
17672auto E = InstrElementSize.find(V);
17673if (E != InstrElementSize.end())
17674return E->second;
17675
17676// If V is not a store, we can traverse the expression tree to find loads
17677// that feed it. The type of the loaded value may indicate a more suitable
17678// width than V's type. We want to base the vector element size on the width
17679// of memory operations where possible.
17680SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
17681SmallPtrSet<Instruction *, 16> Visited;
17682if (auto *I = dyn_cast<Instruction>(V)) {
17683 Worklist.emplace_back(I,I->getParent(), 0);
17684 Visited.insert(I);
17685 }
17686
17687// Traverse the expression tree in bottom-up order looking for loads. If we
17688// encounter an instruction we don't yet handle, we give up.
17689auto Width = 0u;
17690Value *FirstNonBool =nullptr;
17691while (!Worklist.empty()) {
17692auto [I, Parent, Level] = Worklist.pop_back_val();
17693
17694// We should only be looking at scalar instructions here. If the current
17695// instruction has a vector type, skip.
17696auto *Ty =I->getType();
17697if (isa<VectorType>(Ty))
17698continue;
17699if (Ty != Builder.getInt1Ty() && !FirstNonBool)
17700 FirstNonBool =I;
17701if (Level >RecursionMaxDepth)
17702continue;
17703
17704// If the current instruction is a load, update MaxWidth to reflect the
17705// width of the loaded value.
17706if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
17707 Width = std::max<unsigned>(Width,DL->getTypeSizeInBits(Ty));
17708
17709// Otherwise, we need to visit the operands of the instruction. We only
17710// handle the interesting cases from buildTree here. If an operand is an
17711// instruction we haven't yet visited and from the same basic block as the
17712// user or the use is a PHI node, we add it to the worklist.
17713elseif (isa<PHINode,CastInst,GetElementPtrInst,CmpInst,SelectInst,
17714BinaryOperator,UnaryOperator>(I)) {
17715for (Use &U :I->operands()) {
17716if (auto *J = dyn_cast<Instruction>(U.get()))
17717if (Visited.insert(J).second &&
17718 (isa<PHINode>(I) || J->getParent() == Parent)) {
17719 Worklist.emplace_back(J, J->getParent(), Level + 1);
17720continue;
17721 }
17722if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
17723 FirstNonBool = U.get();
17724 }
17725 }else {
17726break;
17727 }
17728 }
17729
17730// If we didn't encounter a memory access in the expression tree, or if we
17731// gave up for some reason, just return the width of V. Otherwise, return the
17732// maximum width we found.
17733if (!Width) {
17734if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
17735 V = FirstNonBool;
17736 Width =DL->getTypeSizeInBits(V->getType());
17737 }
17738
17739for (Instruction *I : Visited)
17740 InstrElementSize[I] = Width;
17741
17742return Width;
17743}
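// Illustrative sketch (assumption): the element size is based on memory
// operations feeding the expression where possible. For
//
//   %l = load i16, ptr %p
//   %e = sext i16 %l to i64
//   %a = add i64 %e, 1
//   store i64 %a, ptr %q
//
// a store seed reports 64 bits directly, while querying the add walks back
// through the sext to the 16-bit load and reports 16 bits, which in turn
// allows a wider vectorization factor for the same register width.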
17744
17745bool BoUpSLP::collectValuesToDemote(
17746const TreeEntry &E,bool IsProfitableToDemoteRoot,unsigned &BitWidth,
17747SmallVectorImpl<unsigned> &ToDemote,DenseSet<const TreeEntry *> &Visited,
17748constSmallDenseSet<unsigned, 8> &NodesToKeepBWs,unsigned &MaxDepthLevel,
17749bool &IsProfitableToDemote,bool IsTruncRoot) const{
17750// We can always demote constants.
17751if (all_of(E.Scalars, IsaPred<Constant>))
17752returntrue;
17753
17754unsigned OrigBitWidth =
17755DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17756if (OrigBitWidth ==BitWidth) {
17757 MaxDepthLevel = 1;
17758returntrue;
17759 }
17760
17761// Check if the node was analyzed already and must keep its original bitwidth.
17762if (NodesToKeepBWs.contains(E.Idx))
17763returnfalse;
17764
17765// If the value is not a vectorized instruction in the expression and not used
17766// by the insertelement instruction and not used in multiple vector nodes, it
17767// cannot be demoted.
17768bool IsSignedNode =any_of(E.Scalars, [&](Value *R) {
17769 if (isa<PoisonValue>(R))
17770 return false;
17771 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17772 });
17773auto IsPotentiallyTruncated = [&](Value *V,unsigned &BitWidth) ->bool {
17774if (isa<PoisonValue>(V))
17775returntrue;
17776if (MultiNodeScalars.contains(V))
17777returnfalse;
    // For a last shuffle of sext/zext with many uses we need to check the
    // extra bit for unsigned values, otherwise we may emit incorrect casts
    // for reused scalars.
17781bool IsSignedVal = !isKnownNonNegative(V,SimplifyQuery(*DL));
17782if ((!IsSignedNode || IsSignedVal) && OrigBitWidth >BitWidth) {
17783APInt Mask =APInt::getBitsSetFrom(OrigBitWidth,BitWidth);
17784if (MaskedValueIsZero(V, Mask,SimplifyQuery(*DL)))
17785returntrue;
17786 }
17787unsigned NumSignBits =ComputeNumSignBits(V, *DL, 0, AC,nullptr, DT);
17788unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17789if (IsSignedNode)
17790 ++BitWidth1;
17791if (auto *I = dyn_cast<Instruction>(V)) {
17792APInt Mask = DB->getDemandedBits(I);
17793unsigned BitWidth2 =
17794 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17795while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17796APInt Mask =APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
17797if (MaskedValueIsZero(V, Mask,SimplifyQuery(*DL)))
17798break;
17799 BitWidth2 *= 2;
17800 }
17801 BitWidth1 = std::min(BitWidth1, BitWidth2);
17802 }
17803BitWidth = std::max(BitWidth, BitWidth1);
17804returnBitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
17805 };
17806auto FinalAnalysis = [&,TTI =TTI]() {
17807if (!IsProfitableToDemote)
17808returnfalse;
17809bool Res =all_of(
17810 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
17811// Demote gathers.
17812if (Res && E.isGather()) {
17813// Check possible extractelement instructions bases and final vector
17814// length.
17815SmallPtrSet<Value *, 4> UniqueBases;
17816for (Value *V : E.Scalars) {
17817auto *EE = dyn_cast<ExtractElementInst>(V);
17818if (!EE)
17819continue;
17820 UniqueBases.insert(EE->getVectorOperand());
17821 }
17822constunsigned VF = E.Scalars.size();
17823Type *OrigScalarTy = E.Scalars.front()->getType();
17824if (UniqueBases.size() <= 2 ||
17825TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
17826TTI->getNumberOfParts(getWidenedType(
17827IntegerType::get(OrigScalarTy->getContext(),BitWidth), VF)))
17828 ToDemote.push_back(E.Idx);
17829 }
17830return Res;
17831 };
17832if (E.isGather() || !Visited.insert(&E).second ||
17833any_of(E.Scalars, [&](Value *V) {
17834 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
17835 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17836 });
17837 }))
17838return FinalAnalysis();
17839
17840if (any_of(E.Scalars, [&](Value *V) {
17841 return !all_of(V->users(), [=](User *U) {
17842 return getTreeEntry(U) ||
17843 (E.Idx == 0 && UserIgnoreList &&
17844 UserIgnoreList->contains(U)) ||
17845 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17846 !U->getType()->isScalableTy() &&
17847 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17848 }) && !IsPotentiallyTruncated(V,BitWidth);
17849 }))
17850returnfalse;
17851
17852auto ProcessOperands = [&](ArrayRef<const TreeEntry *>Operands,
17853bool &NeedToExit) {
17854 NeedToExit =false;
17855unsigned InitLevel = MaxDepthLevel;
17856for (const TreeEntry *Op :Operands) {
17857unsigned Level = InitLevel;
17858if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot,BitWidth,
17859 ToDemote, Visited, NodesToKeepBWs, Level,
17860 IsProfitableToDemote, IsTruncRoot)) {
17861if (!IsProfitableToDemote)
17862returnfalse;
17863 NeedToExit =true;
17864if (!FinalAnalysis())
17865returnfalse;
17866continue;
17867 }
17868 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17869 }
17870returntrue;
17871 };
17872auto AttemptCheckBitwidth =
17873 [&](function_ref<bool(unsigned,unsigned)> Checker,bool &NeedToExit) {
17874// Try all bitwidth < OrigBitWidth.
17875 NeedToExit =false;
17876unsigned BestFailBitwidth = 0;
17877for (;BitWidth < OrigBitWidth;BitWidth *= 2) {
17878if (Checker(BitWidth, OrigBitWidth))
17879returntrue;
17880if (BestFailBitwidth == 0 && FinalAnalysis())
17881 BestFailBitwidth =BitWidth;
17882 }
17883if (BitWidth >= OrigBitWidth) {
17884if (BestFailBitwidth == 0) {
17885BitWidth = OrigBitWidth;
17886returnfalse;
17887 }
17888 MaxDepthLevel = 1;
17889BitWidth = BestFailBitwidth;
17890 NeedToExit =true;
17891returntrue;
17892 }
17893returnfalse;
17894 };
17895auto TryProcessInstruction =
17896 [&](unsigned &BitWidth,ArrayRef<const TreeEntry *>Operands = {},
17897function_ref<bool(unsigned,unsigned)> Checker = {}) {
17898if (Operands.empty()) {
17899if (!IsTruncRoot)
17900 MaxDepthLevel = 1;
17901 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17902 std::ref(BitWidth)));
17903 }else {
17904// Several vectorized uses? Check if we can truncate it, otherwise -
17905// exit.
17906if (E.UserTreeIndices.size() > 1 &&
17907 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17908 std::ref(BitWidth))))
17909returnfalse;
17910bool NeedToExit =false;
17911if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17912returnfalse;
17913if (NeedToExit)
17914returntrue;
17915if (!ProcessOperands(Operands, NeedToExit))
17916returnfalse;
17917if (NeedToExit)
17918returntrue;
17919 }
17920
17921 ++MaxDepthLevel;
17922// Record the entry that we can demote.
17923 ToDemote.push_back(E.Idx);
17924return IsProfitableToDemote;
17925 };
17926switch (E.getOpcode()) {
17927
17928// We can always demote truncations and extensions. Since truncations can
17929// seed additional demotion, we save the truncated value.
17930case Instruction::Trunc:
17931if (IsProfitableToDemoteRoot)
17932 IsProfitableToDemote =true;
17933return TryProcessInstruction(BitWidth);
17934case Instruction::ZExt:
17935case Instruction::SExt:
17936 IsProfitableToDemote =true;
17937return TryProcessInstruction(BitWidth);
17938
17939// We can demote certain binary operations if we can demote both of their
17940// operands.
17941case Instruction::Add:
17942case Instruction::Sub:
17943case Instruction::Mul:
17944case Instruction::And:
17945case Instruction::Or:
17946case Instruction::Xor: {
17947return TryProcessInstruction(
17948BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17949 }
17950case Instruction::Freeze:
17951return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
17952case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
17955auto ShlChecker = [&](unsignedBitWidth,unsigned) {
17956returnall_of(E.Scalars, [&](Value *V) {
17957 if (isa<PoisonValue>(V))
17958 return true;
17959 auto *I = cast<Instruction>(V);
17960 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17961 return AmtKnownBits.getMaxValue().ult(BitWidth);
17962 });
17963 };
17964return TryProcessInstruction(
17965BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
17966 }
17967case Instruction::LShr: {
17968// If this is a truncate of a logical shr, we can truncate it to a smaller
17969// lshr iff we know that the bits we would otherwise be shifting in are
17970// already zeros.
17971auto LShrChecker = [&](unsignedBitWidth,unsigned OrigBitWidth) {
17972returnall_of(E.Scalars, [&](Value *V) {
17973 if (isa<PoisonValue>(V))
17974 return true;
17975 auto *I = cast<Instruction>(V);
17976 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17977 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17978 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17979 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
17980 SimplifyQuery(*DL));
17981 });
17982 };
17983return TryProcessInstruction(
17984BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17985 LShrChecker);
17986 }
17987case Instruction::AShr: {
17988// If this is a truncate of an arithmetic shr, we can truncate it to a
17989// smaller ashr iff we know that all the bits from the sign bit of the
17990// original type and the sign bit of the truncate type are similar.
17991auto AShrChecker = [&](unsignedBitWidth,unsigned OrigBitWidth) {
17992returnall_of(E.Scalars, [&](Value *V) {
17993 if (isa<PoisonValue>(V))
17994 return true;
17995 auto *I = cast<Instruction>(V);
17996 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17997 unsigned ShiftedBits = OrigBitWidth - BitWidth;
17998 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17999 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18000 nullptr, DT);
18001 });
18002 };
18003return TryProcessInstruction(
18004BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
18005 AShrChecker);
18006 }
18007case Instruction::UDiv:
18008case Instruction::URem: {
18009// UDiv and URem can be truncated if all the truncated bits are zero.
18010auto Checker = [&](unsignedBitWidth,unsigned OrigBitWidth) {
18011assert(BitWidth <= OrigBitWidth &&"Unexpected bitwidths!");
18012returnall_of(E.Scalars, [&](Value *V) {
18013 auto *I = cast<Instruction>(V);
18014 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18015 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
18016 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18017 });
18018 };
18019return TryProcessInstruction(
18020BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
18021 }
18022
18023// We can demote selects if we can demote their true and false values.
18024case Instruction::Select: {
18025return TryProcessInstruction(
18026BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18027 }
18028
18029// We can demote phis if we can demote all their incoming operands. Note that
18030// we don't need to worry about cycles since we ensure single use above.
18031case Instruction::PHI: {
18032constunsigned NumOps = E.getNumOperands();
18033SmallVector<const TreeEntry *> Ops(NumOps);
18034transform(seq<unsigned>(0, NumOps), Ops.begin(),
18035 std::bind(&BoUpSLP::getOperandEntry,this, &E, _1));
18036
18037return TryProcessInstruction(BitWidth, Ops);
18038 }
18039
18040case Instruction::Call: {
18041auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
18042if (!IC)
18043break;
18044Intrinsic::IDID =getVectorIntrinsicIDForCall(IC, TLI);
18045if (ID != Intrinsic::abs &&ID != Intrinsic::smin &&
18046ID != Intrinsic::smax &&ID != Intrinsic::umin &&ID != Intrinsic::umax)
18047break;
18048SmallVector<const TreeEntry *, 2>Operands(1, getOperandEntry(&E, 0));
18049function_ref<bool(unsigned,unsigned)> CallChecker;
18050auto CompChecker = [&](unsignedBitWidth,unsigned OrigBitWidth) {
18051assert(BitWidth <= OrigBitWidth &&"Unexpected bitwidths!");
18052returnall_of(E.Scalars, [&](Value *V) {
18053 auto *I = cast<Instruction>(V);
18054 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
18055 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18056 return MaskedValueIsZero(I->getOperand(0), Mask,
18057 SimplifyQuery(*DL)) &&
18058 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18059 }
18060assert((ID == Intrinsic::smin ||ID == Intrinsic::smax) &&
18061"Expected min/max intrinsics only.");
18062unsigned SignBits = OrigBitWidth -BitWidth;
18063APIntMask =APInt::getBitsSetFrom(OrigBitWidth,BitWidth - 1);
18064unsigned Op0SignBits =ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18065nullptr, DT);
18066unsigned Op1SignBits =ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
18067nullptr, DT);
18068return SignBits <= Op0SignBits &&
18069 ((SignBits != Op0SignBits &&
18070 !isKnownNonNegative(I->getOperand(0),SimplifyQuery(*DL))) ||
18071MaskedValueIsZero(I->getOperand(0),Mask,
18072SimplifyQuery(*DL))) &&
18073 SignBits <= Op1SignBits &&
18074 ((SignBits != Op1SignBits &&
18075 !isKnownNonNegative(I->getOperand(1),SimplifyQuery(*DL))) ||
18076MaskedValueIsZero(I->getOperand(1),Mask,SimplifyQuery(*DL)));
18077 });
18078 };
18079auto AbsChecker = [&](unsignedBitWidth,unsigned OrigBitWidth) {
18080assert(BitWidth <= OrigBitWidth &&"Unexpected bitwidths!");
18081returnall_of(E.Scalars, [&](Value *V) {
18082 auto *I = cast<Instruction>(V);
18083 unsigned SignBits = OrigBitWidth - BitWidth;
18084 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18085 unsigned Op0SignBits =
18086 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
18087 return SignBits <= Op0SignBits &&
18088 ((SignBits != Op0SignBits &&
18089 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18090 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
18091 });
18092 };
18093if (ID != Intrinsic::abs) {
18094Operands.push_back(getOperandEntry(&E, 1));
18095 CallChecker = CompChecker;
18096 }else {
18097 CallChecker = AbsChecker;
18098 }
18099InstructionCost BestCost =
18100 std::numeric_limits<InstructionCost::CostType>::max();
18101unsigned BestBitWidth =BitWidth;
18102unsigned VF = E.Scalars.size();
18103// Choose the best bitwidth based on cost estimations.
18104auto Checker = [&](unsignedBitWidth,unsigned) {
18105unsigned MinBW =PowerOf2Ceil(BitWidth);
18106SmallVector<Type *> ArgTys =
18107buildIntrinsicArgTypes(IC,ID, VF, MinBW,TTI);
18108auto VecCallCosts =getVectorCallCosts(
18109 IC,getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
18110TTI, TLI, ArgTys);
18111InstructionCostCost = std::min(VecCallCosts.first, VecCallCosts.second);
18112if (Cost < BestCost) {
18113 BestCost =Cost;
18114 BestBitWidth =BitWidth;
18115 }
18116returnfalse;
18117 };
18118 [[maybe_unused]]bool NeedToExit;
18119 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18120BitWidth = BestBitWidth;
18121return TryProcessInstruction(BitWidth,Operands, CallChecker);
18122 }
18123
18124// Otherwise, conservatively give up.
18125default:
18126break;
18127 }
18128 MaxDepthLevel = 1;
18129return FinalAnalysis();
18130}
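// Illustrative sketch (assumption, simplified): demotion looks for expression
// trees whose intermediate values do not need their full declared width. In
//
//   %x = zext i8 %a to i32
//   %y = zext i8 %b to i32
//   %s = add i32 %x, %y
//   %t = trunc i32 %s to i16
//
// the add only needs 16 bits to produce the truncated result, so the tree
// entry for the add (and the zexts feeding it) can be recorded in ToDemote
// and later re-emitted on i16 vectors instead of i32 ones.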
18131
static RecurKind getRdxKind(Value *V);

void BoUpSLP::computeMinimumValueSizes() {
18135// We only attempt to truncate integer expressions.
18136bool IsStoreOrInsertElt =
18137 VectorizableTree.front()->hasState() &&
18138 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
18139 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
18140if ((IsStoreOrInsertElt || UserIgnoreList) &&
18141 ExtraBitWidthNodes.size() <= 1 &&
18142 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
18143 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
18144return;
18145
18146unsigned NodeIdx = 0;
18147if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18148 NodeIdx = 1;
18149
18150// Ensure the roots of the vectorizable tree don't form a cycle.
18151if (VectorizableTree[NodeIdx]->isGather() ||
18152 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18153 (NodeIdx != 0 &&any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18154 [NodeIdx](constEdgeInfo &EI) {
18155return EI.UserTE->Idx > NodeIdx;
18156 })))
18157return;
18158
  // If the first value node for a store/insertelement is sext/zext/trunc,
  // skip it and resize to the final type.
18161bool IsTruncRoot =false;
18162bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18163SmallVector<unsigned> RootDemotes;
18164SmallDenseSet<unsigned, 8> NodesToKeepBWs;
18165if (NodeIdx != 0 &&
18166 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18167 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18168assert(IsStoreOrInsertElt &&"Expected store/insertelement seeded graph.");
18169 IsTruncRoot =true;
18170 RootDemotes.push_back(NodeIdx);
18171 IsProfitableToDemoteRoot =true;
18172 ++NodeIdx;
18173 }
18174
  // If the reduction was analyzed already and found not profitable, exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;
18178
18179SmallVector<unsigned> ToDemote;
18180auto ComputeMaxBitWidth =
18181 [&](const TreeEntry &E,bool IsTopRoot,bool IsProfitableToDemoteRoot,
18182unsigned Limit,bool IsTruncRoot,bool IsSignedCmp) ->unsigned {
18183 ToDemote.clear();
    // If the root is a trunc and the next node is a gather/buildvector, keep
    // the trunc in scalars, which is free in most cases.
18186if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18187 !NodesToKeepBWs.contains(E.Idx) &&
18188 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18189all_of(E.Scalars, [&](Value *V) {
18190return V->hasOneUse() || isa<Constant>(V) ||
18191 (!V->hasNUsesOrMore(UsesLimit) &&
18192none_of(V->users(), [&](User *U) {
18193 const TreeEntry *TE = getTreeEntry(U);
18194 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18195 if (TE == UserTE || !TE)
18196 return false;
18197 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18198 SelectInst>(U) ||
18199 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18200 SelectInst>(UserTE->getMainOp()))
18201 return true;
18202 unsigned UserTESz = DL->getTypeSizeInBits(
18203 UserTE->Scalars.front()->getType());
18204 auto It = MinBWs.find(TE);
18205 if (It != MinBWs.end() && It->second.first > UserTESz)
18206 return true;
18207 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18208 }));
18209 })) {
18210 ToDemote.push_back(E.Idx);
18211const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18212auto It = MinBWs.find(UserTE);
18213if (It != MinBWs.end())
18214return It->second.first;
18215unsigned MaxBitWidth =
18216DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18217 MaxBitWidth =bit_ceil(MaxBitWidth);
18218if (MaxBitWidth < 8 && MaxBitWidth > 1)
18219 MaxBitWidth = 8;
18220return MaxBitWidth;
18221 }
18222
18223if (!E.hasState())
18224return 0u;
18225
18226unsigned VF = E.getVectorFactor();
18227Type *ScalarTy = E.Scalars.front()->getType();
18228unsigned ScalarTyNumElements =getNumElements(ScalarTy);
18229auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
18230if (!TreeRootIT)
18231return 0u;
18232
18233if (any_of(E.Scalars,
18234 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
18235return 0u;
18236
18237unsigned NumParts =TTI->getNumberOfParts(
18238getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18239
18240// The maximum bit width required to represent all the values that can be
18241// demoted without loss of precision. It would be safe to truncate the roots
18242// of the expression to this width.
18243unsigned MaxBitWidth = 1u;
18244
18245// True if the roots can be zero-extended back to their original type,
18246// rather than sign-extended. We know that if the leading bits are not
18247// demanded, we can safely zero-extend. So we initialize IsKnownPositive to
18248// True.
18249// Determine if the sign bit of all the roots is known to be zero. If not,
18250// IsKnownPositive is set to False.
18251bool IsKnownPositive = !IsSignedCmp &&all_of(E.Scalars, [&](Value *R) {
18252 if (isa<PoisonValue>(R))
18253 return true;
18254 KnownBits Known = computeKnownBits(R, *DL);
18255 return Known.isNonNegative();
18256 });
18257
18258// We first check if all the bits of the roots are demanded. If they're not,
18259// we can truncate the roots to this narrower type.
18260for (Value *Root : E.Scalars) {
18261if (isa<PoisonValue>(Root))
18262continue;
18263unsigned NumSignBits =ComputeNumSignBits(Root, *DL, 0, AC,nullptr, DT);
18264TypeSize NumTypeBits =
18265DL->getTypeSizeInBits(Root->getType()->getScalarType());
18266unsigned BitWidth1 = NumTypeBits - NumSignBits;
18267// If we can't prove that the sign bit is zero, we must add one to the
18268// maximum bit width to account for the unknown sign bit. This preserves
18269// the existing sign bit so we can safely sign-extend the root back to the
18270// original type. Otherwise, if we know the sign bit is zero, we will
18271// zero-extend the root instead.
18272//
18273// FIXME: This is somewhat suboptimal, as there will be cases where adding
18274// one to the maximum bit width will yield a larger-than-necessary
18275// type. In general, we need to add an extra bit only if we can't
18276// prove that the upper bit of the original type is equal to the
18277// upper bit of the proposed smaller type. If these two bits are
18278// the same (either zero or one) we know that sign-extending from
18279// the smaller type will result in the same value. Here, since we
18280// can't yet prove this, we are just making the proposed smaller
18281// type larger to ensure correctness.
18282if (!IsKnownPositive)
18283 ++BitWidth1;
18284
18285APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
18286unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18287 MaxBitWidth =
18288 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
18289 }
18290
18291if (MaxBitWidth < 8 && MaxBitWidth > 1)
18292 MaxBitWidth = 8;
18293
    // If the original type is large but the reduced type does not improve
    // register usage, ignore it.
18296if (NumParts > 1 &&
18297 NumParts ==
18298TTI->getNumberOfParts(getWidenedType(
18299IntegerType::get(F->getContext(),bit_ceil(MaxBitWidth)), VF)))
18300return 0u;
18301
18302unsigned Opcode = E.getOpcode();
18303bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18304 Opcode == Instruction::SExt ||
18305 Opcode == Instruction::ZExt || NumParts > 1;
18306// Conservatively determine if we can actually truncate the roots of the
18307// expression. Collect the values that can be demoted in ToDemote and
18308// additional roots that require investigating in Roots.
18309DenseSet<const TreeEntry *> Visited;
18310unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18311bool NeedToDemote = IsProfitableToDemote;
18312
18313if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18314 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18315 NeedToDemote, IsTruncRoot) ||
18316 (MaxDepthLevel <= Limit &&
18317 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18318 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18319DL->getTypeSizeInBits(TreeRootIT) /
18320DL->getTypeSizeInBits(
18321 E.getMainOp()->getOperand(0)->getType()) >
18322 2)))))
18323return 0u;
18324// Round MaxBitWidth up to the next power-of-two.
18325 MaxBitWidth =bit_ceil(MaxBitWidth);
18326
18327return MaxBitWidth;
18328 };
18329
18330// If we can truncate the root, we must collect additional values that might
18331// be demoted as a result. That is, those seeded by truncations we will
18332// modify.
18333// Add reduction ops sizes, if any.
18334if (UserIgnoreList &&
18335 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
18338if (all_of(*UserIgnoreList,
18339 [](Value *V) {
18340return isa<PoisonValue>(V) ||
18341 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18342 }) &&
18343 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18344 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18345 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18346 Builder.getInt1Ty()) {
18347 ReductionBitWidth = 1;
18348 }else {
18349for (Value *V : *UserIgnoreList) {
18350if (isa<PoisonValue>(V))
18351continue;
18352unsigned NumSignBits =ComputeNumSignBits(V, *DL, 0, AC,nullptr, DT);
18353TypeSize NumTypeBits =DL->getTypeSizeInBits(V->getType());
18354unsigned BitWidth1 = NumTypeBits - NumSignBits;
18355if (!isKnownNonNegative(V,SimplifyQuery(*DL)))
18356 ++BitWidth1;
18357unsigned BitWidth2 = BitWidth1;
18358if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
18359APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
18360 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18361 }
18362 ReductionBitWidth =
18363 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18364 }
18365if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18366 ReductionBitWidth = 8;
18367
18368 ReductionBitWidth =bit_ceil(ReductionBitWidth);
18369 }
18370 }
18371bool IsTopRoot = NodeIdx == 0;
18372while (NodeIdx < VectorizableTree.size() &&
18373 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18374 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18375 RootDemotes.push_back(NodeIdx);
18376 ++NodeIdx;
18377 IsTruncRoot =true;
18378 }
18379bool IsSignedCmp =false;
18380while (NodeIdx < VectorizableTree.size()) {
18381ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
18382unsigned Limit = 2;
18383if (IsTopRoot &&
18384 ReductionBitWidth ==
18385DL->getTypeSizeInBits(
18386 VectorizableTree.front()->Scalars.front()->getType()))
18387 Limit = 3;
18388unsigned MaxBitWidth = ComputeMaxBitWidth(
18389 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
18390 IsTruncRoot, IsSignedCmp);
18391if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
18392if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18393 ReductionBitWidth =bit_ceil(MaxBitWidth);
18394elseif (MaxBitWidth == 0)
18395 ReductionBitWidth = 0;
18396 }
18397
18398for (unsignedIdx : RootDemotes) {
18399if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
18400uint32_t OrigBitWidth =
18401DL->getTypeSizeInBits(V->getType()->getScalarType());
18402if (OrigBitWidth > MaxBitWidth) {
18403APInt Mask =APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
18404returnMaskedValueIsZero(V, Mask,SimplifyQuery(*DL));
18405 }
18406returnfalse;
18407 }))
18408 ToDemote.push_back(Idx);
18409 }
18410 RootDemotes.clear();
18411 IsTopRoot =false;
18412 IsProfitableToDemoteRoot =true;
18413
18414if (ExtraBitWidthNodes.empty()) {
18415 NodeIdx = VectorizableTree.size();
18416 }else {
18417unsigned NewIdx = 0;
18418do {
18419 NewIdx = *ExtraBitWidthNodes.begin();
18420 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
18421 }while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
18422 NodeIdx = NewIdx;
18423 IsTruncRoot =
18424 NodeIdx < VectorizableTree.size() &&
18425any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18426 [](constEdgeInfo &EI) {
18427return EI.EdgeIdx == 0 &&
18428 EI.UserTE->getOpcode() == Instruction::Trunc &&
18429 !EI.UserTE->isAltShuffle();
18430 });
18431 IsSignedCmp =
18432 NodeIdx < VectorizableTree.size() &&
18433any_of(
18434 VectorizableTree[NodeIdx]->UserTreeIndices,
18435 [&](constEdgeInfo &EI) {
18436return (EI.UserTE->hasState() &&
18437 EI.UserTE->getOpcode() == Instruction::ICmp) &&
18438any_of(EI.UserTE->Scalars, [&](Value *V) {
18439 auto *IC = dyn_cast<ICmpInst>(V);
18440 return IC &&
18441 (IC->isSigned() ||
18442 !isKnownNonNegative(IC->getOperand(0),
18443 SimplifyQuery(*DL)) ||
18444 !isKnownNonNegative(IC->getOperand(1),
18445 SimplifyQuery(*DL)));
18446 });
18447 });
18448 }
18449
18450// If the maximum bit width we compute is less than the width of the roots'
18451// type, we can proceed with the narrowing. Otherwise, do nothing.
18452if (MaxBitWidth == 0 ||
18453 MaxBitWidth >=
18454 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
18455 ->getBitWidth()) {
18456if (UserIgnoreList)
18457 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
18458 NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
18459continue;
18460 }
18461
    // Finally, map the values we can demote to the maximum bit width we
    // computed.
18464for (unsignedIdx : ToDemote) {
18465 TreeEntry *TE = VectorizableTree[Idx].get();
18466if (MinBWs.contains(TE))
18467continue;
18468bool IsSigned =any_of(TE->Scalars, [&](Value *R) {
18469 if (isa<PoisonValue>(R))
18470 return false;
18471 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18472 });
18473 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
18474 }
18475 }
18476}
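// Illustrative sketch (assumption): if the tree is rooted at i32 stores but
// every scalar in it is known to fit in 8 bits (for example values
// zero-extended from i8 loads), the analysis above records the affected tree
// entries in MinBWs with a width of 8, and the vectorized tree is emitted on
// i8 vectors with a single extension back to i32 at the root.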
18477
PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
18479auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
18480auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
18481auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
18482auto *AA = &AM.getResult<AAManager>(F);
18483auto *LI = &AM.getResult<LoopAnalysis>(F);
18484auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
18485auto *AC = &AM.getResult<AssumptionAnalysis>(F);
18486auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
18487auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
18488
18489bool Changed =runImpl(F, SE,TTI, TLI, AA, LI, DT, AC, DB, ORE);
18490if (!Changed)
18491returnPreservedAnalyses::all();
18492
18493PreservedAnalyses PA;
18494 PA.preserveSet<CFGAnalyses>();
18495return PA;
18496}
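// For illustration (an assumption about typical usage, not part of this
// file's logic): the pass can be exercised in isolation with the new pass
// manager, e.g.
//
//   opt -passes=slp-vectorizer -S input.ll -o vectorized.ll
//
// When nothing is vectorized, all analyses are preserved; otherwise only the
// CFG analyses are, as returned above.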
18497
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
18504if (!RunSLPVectorization)
18505returnfalse;
18506 SE = SE_;
18507TTI = TTI_;
18508 TLI = TLI_;
18509 AA = AA_;
18510 LI = LI_;
18511 DT = DT_;
18512 AC = AC_;
18513 DB = DB_;
18514DL = &F.getDataLayout();
18515
18516 Stores.clear();
18517 GEPs.clear();
18518bool Changed =false;
18519
18520// If the target claims to have no vector registers don't attempt
18521// vectorization.
18522if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
18523LLVM_DEBUG(
18524dbgs() <<"SLP: Didn't find any vector registers for target, abort.\n");
18525returnfalse;
18526 }
18527
18528// Don't vectorize when the attribute NoImplicitFloat is used.
18529if (F.hasFnAttribute(Attribute::NoImplicitFloat))
18530returnfalse;
18531
18532LLVM_DEBUG(dbgs() <<"SLP: Analyzing blocks in " <<F.getName() <<".\n");
18533
18534// Use the bottom up slp vectorizer to construct chains that start with
18535// store instructions.
18536BoUpSLP R(&F, SE,TTI, TLI, AA, LI, DT, AC, DB,DL, ORE_);
18537
18538// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
18539// delete instructions.
18540
18541// Update DFS numbers now so that we can use them for ordering.
18542 DT->updateDFSNumbers();
18543
18544// Scan the blocks in the function in post order.
18545for (auto *BB :post_order(&F.getEntryBlock())) {
18546if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
18547continue;
18548
18549// Start new block - clear the list of reduction roots.
18550 R.clearReductionData();
18551 collectSeedInstructions(BB);
18552
18553// Vectorize trees that end at stores.
18554if (!Stores.empty()) {
18555LLVM_DEBUG(dbgs() <<"SLP: Found stores for " << Stores.size()
18556 <<" underlying objects.\n");
18557 Changed |= vectorizeStoreChains(R);
18558 }
18559
18560// Vectorize trees that end at reductions.
18561 Changed |= vectorizeChainsInBlock(BB, R);
18562
18563// Vectorize the index computations of getelementptr instructions. This
18564// is primarily intended to catch gather-like idioms ending at
18565// non-consecutive loads.
18566if (!GEPs.empty()) {
18567LLVM_DEBUG(dbgs() <<"SLP: Found GEPs for " << GEPs.size()
18568 <<" underlying objects.\n");
18569 Changed |= vectorizeGEPIndices(BB, R);
18570 }
18571 }
18572
18573if (Changed) {
18574 R.optimizeGatherSequence();
18575LLVM_DEBUG(dbgs() <<"SLP: vectorized \"" <<F.getName() <<"\"\n");
18576 }
18577return Changed;
18578}
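// Illustrative sketch (assumption): within each block visited in post order,
// the main seeds are store instructions grouped by their underlying pointer
// object, e.g. a pair of adjacent stores
//
//   store float %a, ptr %p
//   %p1 = getelementptr inbounds float, ptr %p, i64 1
//   store float %b, ptr %p1
//
// which feeds vectorizeStoreChains(), plus getelementptr index computations
// that are handed to vectorizeGEPIndices() to catch gather-like idioms.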
18579
18580std::optional<bool>
18581SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain,BoUpSLP &R,
18582unsignedIdx,unsigned MinVF,
18583unsigned &Size) {
18584Size = 0;
18585LLVM_DEBUG(dbgs() <<"SLP: Analyzing a store chain of length " << Chain.size()
18586 <<"\n");
18587constunsigned Sz = R.getVectorElementSize(Chain[0]);
18588unsigned VF = Chain.size();
18589
18590if (!has_single_bit(Sz) ||
18591 !hasFullVectorsOrPowerOf2(
18592 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
18593 VF) ||
18594 VF < 2 || VF < MinVF) {
18595// Check if vectorizing with a non-power-of-2 VF should be considered. At
18596// the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
18597// all vector lanes are used.
18598if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
18599returnfalse;
18600 }
18601
18602LLVM_DEBUG(dbgs() <<"SLP: Analyzing " << VF <<" stores at offset " <<Idx
18603 <<"\n");
18604
18605SetVector<Value *> ValOps;
18606for (Value *V : Chain)
18607 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // If the operands do not share the same/alternate opcode, or the number of
  // unique operands is not a power of 2, exit.
18609 InstructionsState S =getSameOpcode(ValOps.getArrayRef(), *TLI);
18610if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
18611DenseSet<Value *> Stores(Chain.begin(), Chain.end());
18612bool IsAllowedSize =
18613hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
18614 ValOps.size()) ||
18615 (VectorizeNonPowerOf2 &&has_single_bit(ValOps.size() + 1));
18616if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
18617 (!S.getMainOp()->isSafeToRemove() ||
18618any_of(ValOps.getArrayRef(),
18619 [&](Value *V) {
18620 return !isa<ExtractElementInst>(V) &&
18621 (V->getNumUses() > Chain.size() ||
18622 any_of(V->users(), [&](User *U) {
18623 return !Stores.contains(U);
18624 }));
18625 }))) ||
18626 (ValOps.size() > Chain.size() / 2 && !S)) {
18627Size = (!IsAllowedSize && S) ? 1 : 2;
18628returnfalse;
18629 }
18630 }
18631if (R.isLoadCombineCandidate(Chain))
18632returntrue;
18633R.buildTree(Chain);
  // Check if the tree is tiny and the store itself or its value is not
  // vectorized.
18635if (R.isTreeTinyAndNotFullyVectorizable()) {
18636if (R.isGathered(Chain.front()) ||
18637R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18638return std::nullopt;
18639Size =R.getCanonicalGraphSize();
18640returnfalse;
18641 }
18642R.reorderTopToBottom();
18643R.reorderBottomToTop();
18644R.transformNodes();
18645R.buildExternalUses();
18646
18647R.computeMinimumValueSizes();
18648
18649Size =R.getCanonicalGraphSize();
18650if (S && S.getOpcode() == Instruction::Load)
18651Size = 2;// cut off masked gather small trees
18652InstructionCostCost =R.getTreeCost();
18653
18654LLVM_DEBUG(dbgs() <<"SLP: Found cost = " <<Cost <<" for VF=" << VF <<"\n");
18655if (Cost < -SLPCostThreshold) {
18656LLVM_DEBUG(dbgs() <<"SLP: Decided to vectorize cost = " <<Cost <<"\n");
18657
18658using namespaceore;
18659
18660R.getORE()->emit(OptimizationRemark(SV_NAME,"StoresVectorized",
18661 cast<StoreInst>(Chain[0]))
18662 <<"Stores SLP vectorized with cost " <<NV("Cost",Cost)
18663 <<" and with tree size "
18664 <<NV("TreeSize",R.getTreeSize()));
18665
18666R.vectorizeTree();
18667returntrue;
18668 }
18669
18670returnfalse;
18671}
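// Illustrative sketch (assumption): a profitable chain of four consecutive
// stores
//
//   store i32 %a, ptr %p
//   store i32 %b, ptr %p1      ; %p + 4 bytes
//   store i32 %c, ptr %p2      ; %p + 8 bytes
//   store i32 %d, ptr %p3      ; %p + 12 bytes
//
// is rebuilt as a single <4 x i32> store of a vector built from %a..%d,
// provided the tree cost computed by getTreeCost() is negative enough to
// satisfy the threshold check above (Cost < -SLPCostThreshold).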
18672
18673/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
18675boolFirst) {
18676unsigned Num = 0;
18677uint64_t Sum = std::accumulate(
18678 Sizes.begin(), Sizes.end(),static_cast<uint64_t>(0),
18679 [&](uint64_t V,const std::pair<unsigned, unsigned> &Val) {
18680 unsigned Size = First ? Val.first : Val.second;
18681 if (Size == 1)
18682 return V;
18683 ++Num;
18684 return V + Size;
18685 });
18686if (Num == 0)
18687returntrue;
18688uint64_t Mean = Sum / Num;
18689if (Mean == 0)
18690returntrue;
18691uint64_t Dev = std::accumulate(
18692 Sizes.begin(), Sizes.end(),static_cast<uint64_t>(0),
18693 [&](uint64_t V,const std::pair<unsigned, unsigned> &Val) {
18694 unsigned P = First ? Val.first : Val.second;
18695 if (P == 1)
18696 return V;
18697 return V + (P - Mean) * (P - Mean);
18698 }) /
18699 Num;
18700return Dev * 81 / (Mean * Mean) == 0;
18701}
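// Worked example (following the arithmetic above): for tree sizes
// {4, 4, 4, 4} the mean is 4 and the deviation term is 0, so the check
// returns true; for {2, 10} the mean is 6 and the deviation term is
// ((2-6)^2 + (10-6)^2) / 2 = 16, and since 16 * 81 >= 6 * 6 the check
// returns false. Entries equal to 1 are ignored entirely.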
18702
18703bool SLPVectorizerPass::vectorizeStores(
18704ArrayRef<StoreInst *> Stores,BoUpSLP &R,
18705DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
18706 &Visited) {
18707// We may run into multiple chains that merge into a single chain. We mark the
18708// stores that we vectorized so that we don't visit the same store twice.
18709BoUpSLP::ValueSet VectorizedStores;
18710bool Changed =false;
18711
18712structStoreDistCompare {
18713bool operator()(const std::pair<unsigned, int> &Op1,
18714const std::pair<unsigned, int> &Op2) const{
18715return Op1.second < Op2.second;
18716 }
18717 };
18718// A set of pairs (index of store in Stores array ref, Distance of the store
18719// address relative to base store address in units).
18720usingStoreIndexToDistSet =
18721 std::set<std::pair<unsigned, int>, StoreDistCompare>;
18722auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18723int PrevDist = -1;
18724BoUpSLP::ValueListOperands;
18725// Collect the chain into a list.
18726for (auto [Idx,Data] :enumerate(Set)) {
18727if (Operands.empty() ||Data.second - PrevDist == 1) {
18728Operands.push_back(Stores[Data.first]);
18729 PrevDist =Data.second;
18730if (Idx !=Set.size() - 1)
18731continue;
18732 }
18733auto E =make_scope_exit([&, &DataVar =Data]() {
18734Operands.clear();
18735Operands.push_back(Stores[DataVar.first]);
18736 PrevDist = DataVar.second;
18737 });
18738
18739if (Operands.size() <= 1 ||
18740 !Visited
18741 .insert({Operands.front(),
18742 cast<StoreInst>(Operands.front())->getValueOperand(),
18743 Operands.back(),
18744 cast<StoreInst>(Operands.back())->getValueOperand(),
18745 Operands.size()})
18746 .second)
18747continue;
18748
18749unsigned MaxVecRegSize =R.getMaxVecRegSize();
18750unsigned EltSize =R.getVectorElementSize(Operands[0]);
18751unsigned MaxElts =llvm::bit_floor(MaxVecRegSize / EltSize);
18752
18753unsigned MaxVF =
18754 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18755auto *Store = cast<StoreInst>(Operands[0]);
18756Type *StoreTy =Store->getValueOperand()->getType();
18757Type *ValueTy = StoreTy;
18758if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
18759 ValueTy = Trunc->getSrcTy();
18760unsigned MinVF = std::max<unsigned>(
18761 2,PowerOf2Ceil(TTI->getStoreMinimumVF(
18762R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18763 ValueTy)));
18764
18765if (MaxVF < MinVF) {
18766LLVM_DEBUG(dbgs() <<"SLP: Vectorization infeasible as MaxVF (" << MaxVF
18767 <<") < "
18768 <<"MinVF (" << MinVF <<")\n");
18769continue;
18770 }
18771
18772unsigned NonPowerOf2VF = 0;
18773if (VectorizeNonPowerOf2) {
18774// First try vectorizing with a non-power-of-2 VF. At the moment, only
18775// consider cases where VF + 1 is a power-of-2, i.e. almost all vector
18776// lanes are used.
18777unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
18778if (has_single_bit(CandVF + 1)) {
18779 NonPowerOf2VF = CandVF;
18780assert(NonPowerOf2VF != MaxVF &&
18781"Non-power-of-2 VF should not be equal to MaxVF");
18782 }
18783 }
18784
18785unsigned MaxRegVF = MaxVF;
18786 MaxVF = std::min<unsigned>(MaxVF,bit_floor(Operands.size()));
18787if (MaxVF < MinVF) {
18788LLVM_DEBUG(dbgs() <<"SLP: Vectorization infeasible as MaxVF (" << MaxVF
18789 <<") < "
18790 <<"MinVF (" << MinVF <<")\n");
18791continue;
18792 }
18793
18794unsigned Sz = 1 +Log2_32(MaxVF) -Log2_32(MinVF);
18795SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
18796unsignedSize = MinVF;
18797for_each(reverse(CandidateVFs), [&](unsigned &VF) {
18798 VF =Size > MaxVF ? NonPowerOf2VF :Size;
18799Size *= 2;
18800 });
18801unsignedEnd =Operands.size();
18802unsigned Repeat = 0;
18803constexprunsigned MaxAttempts = 4;
18804OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
18805for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
18806P.first =P.second = 1;
18807 });
18808DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
18809auto IsNotVectorized = [](boolFirst,
18810const std::pair<unsigned, unsigned> &P) {
18811returnFirst ?P.first > 0 :P.second > 0;
18812 };
18813auto IsVectorized = [](boolFirst,
18814const std::pair<unsigned, unsigned> &P) {
18815returnFirst ?P.first == 0 :P.second == 0;
18816 };
18817auto VFIsProfitable = [](boolFirst,unsignedSize,
18818const std::pair<unsigned, unsigned> &P) {
18819returnFirst ?Size >=P.first :Size >=P.second;
18820 };
18821auto FirstSizeSame = [](unsignedSize,
18822const std::pair<unsigned, unsigned> &P) {
18823returnSize ==P.first;
18824 };
18825while (true) {
18826 ++Repeat;
18827bool RepeatChanged =false;
18828bool AnyProfitableGraph =false;
18829for (unsignedSize : CandidateVFs) {
18830 AnyProfitableGraph =false;
18831unsigned StartIdx = std::distance(
18832 RangeSizes.begin(),
18833find_if(RangeSizes, std::bind(IsNotVectorized,Size >= MaxRegVF,
18834 std::placeholders::_1)));
18835while (StartIdx <End) {
18836unsigned EndIdx =
18837 std::distance(RangeSizes.begin(),
18838find_if(RangeSizes.drop_front(StartIdx),
18839 std::bind(IsVectorized,Size >= MaxRegVF,
18840 std::placeholders::_1)));
18841unsigned Sz = EndIdx >=End ?End : EndIdx;
18842for (unsigned Cnt = StartIdx; Cnt +Size <= Sz;) {
18843if (!checkTreeSizes(RangeSizes.slice(Cnt,Size),
18844Size >= MaxRegVF)) {
18845 ++Cnt;
18846continue;
18847 }
18848ArrayRef<Value *> Slice =ArrayRef(Operands).slice(Cnt,Size);
18849assert(all_of(Slice,
18850 [&](Value *V) {
18851return cast<StoreInst>(V)
18852 ->getValueOperand()
18853 ->getType() ==
18854 cast<StoreInst>(Slice.front())
18855 ->getValueOperand()
18856 ->getType();
18857 }) &&
18858"Expected all operands of same type.");
18859if (!NonSchedulable.empty()) {
18860auto [NonSchedSizeMax, NonSchedSizeMin] =
18861 NonSchedulable.lookup(Slice.front());
18862if (NonSchedSizeMax > 0 && NonSchedSizeMin <=Size) {
18863 Cnt += NonSchedSizeMax;
18864continue;
18865 }
18866 }
18867unsigned TreeSize;
18868 std::optional<bool> Res =
18869 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18870if (!Res) {
18871 NonSchedulable
18872 .try_emplace(Slice.front(), std::make_pair(Size,Size))
18873 .first->getSecond()
18874 .second =Size;
18875 }elseif (*Res) {
18876// Mark the vectorized stores so that we don't vectorize them
18877// again.
18878 VectorizedStores.insert(Slice.begin(), Slice.end());
18881 AnyProfitableGraph = RepeatChanged = Changed =true;
18882// If we vectorized initial block, no need to try to vectorize
18883// it again.
18884for_each(RangeSizes.slice(Cnt,Size),
18885 [](std::pair<unsigned, unsigned> &P) {
18886 P.first = P.second = 0;
18887 });
18888if (Cnt < StartIdx + MinVF) {
18889for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18890 [](std::pair<unsigned, unsigned> &P) {
18891 P.first = P.second = 0;
18892 });
18893 StartIdx = Cnt +Size;
18894 }
18895if (Cnt > Sz -Size - MinVF) {
18896for_each(RangeSizes.slice(Cnt +Size, Sz - (Cnt +Size)),
18897 [](std::pair<unsigned, unsigned> &P) {
18898 P.first = P.second = 0;
18899 });
18900if (Sz ==End)
18901End = Cnt;
18902 Sz = Cnt;
18903 }
18904 Cnt +=Size;
18905continue;
18906 }
18907if (Size > 2 && Res &&
18908 !all_of(RangeSizes.slice(Cnt,Size),
18909 std::bind(VFIsProfitable,Size >= MaxRegVF, TreeSize,
18910 std::placeholders::_1))) {
18911 Cnt +=Size;
18912continue;
18913 }
            // For very big VFs, check that we are not rebuilding the same
            // trees, just with a larger number of elements.
18916if (Size > MaxRegVF && TreeSize > 1 &&
18917all_of(RangeSizes.slice(Cnt,Size),
18918 std::bind(FirstSizeSame, TreeSize,
18919 std::placeholders::_1))) {
18920 Cnt +=Size;
18921while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18922 ++Cnt;
18923continue;
18924 }
18925if (TreeSize > 1)
18926for_each(RangeSizes.slice(Cnt,Size),
18927 [&](std::pair<unsigned, unsigned> &P) {
18928 if (Size >= MaxRegVF)
18929 P.second = std::max(P.second, TreeSize);
18930 else
18931 P.first = std::max(P.first, TreeSize);
18932 });
18933 ++Cnt;
18934 AnyProfitableGraph =true;
18935 }
18936if (StartIdx >=End)
18937break;
18938if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18939 AnyProfitableGraph =true;
18940 StartIdx = std::distance(
18941 RangeSizes.begin(),
18942find_if(RangeSizes.drop_front(Sz),
18943 std::bind(IsNotVectorized,Size >= MaxRegVF,
18944 std::placeholders::_1)));
18945 }
18946if (!AnyProfitableGraph &&Size >= MaxRegVF &&has_single_bit(Size))
18947break;
18948 }
18949// All values vectorized - exit.
18950if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
18951returnP.first == 0 &&P.second == 0;
18952 }))
18953break;
      // Check if we have tried all attempts, or if there is no need for
      // further attempts at all.
18955if (Repeat >= MaxAttempts ||
18956 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18957break;
      constexpr unsigned StoresLimit = 64;
      const unsigned MaxTotalNum = std::min<unsigned>(
          Operands.size(),
          static_cast<unsigned>(
              End -
              std::distance(
                  RangeSizes.begin(),
                  find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                std::placeholders::_1))) +
              1));
18968unsigned VF =bit_ceil(CandidateVFs.front()) * 2;
18969unsigned Limit =
18970getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
18971 CandidateVFs.clear();
18972if (bit_floor(Limit) == VF)
18973 CandidateVFs.push_back(Limit);
18974if (VF > MaxTotalNum || VF >= StoresLimit)
18975break;
18976for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
18977if (P.first != 0)
18978P.first = std::max(P.second,P.first);
18979 });
      // Last attempt to vectorize the maximum number of elements, if all
      // previous attempts were unsuccessful because of cost issues.
18982 CandidateVFs.push_back(VF);
18983 }
18984 }
18985 };
18986
  // Stores a pair (first: index of the store into the Stores array ref, whose
  // address is taken as the base; second: sorted set of pairs {index, dist},
  // which are the indices of stores in the set and their store location
  // distances relative to the base address).

  // The index of the very first store has to be kept separately, since the set
  // may be reordered after an insertion and the first store may be moved. This
  // container allows us to reduce the number of calls to the getPointersDiff()
  // function.
18995SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
  // Inserts the specified store SI with the given index Idx into the set of
  // stores. If a store with the same distance is found already, stop the
  // insertion and try to vectorize the stores found so far. If some stores
  // from this sequence were not vectorized, try to vectorize them together
  // with the new store later. This logic is applied only to the stores that
  // come before the previous store with the same distance.
  // Example:
  // 1. store x, %p
  // 2. store y, %p+1
  // 3. store z, %p+2
  // 4. store a, %p
  // 5. store b, %p+3
  // - Scan this from the last store to the first. The very first bunch of
  //   stores is {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in the
  //   SortedStores vector).
  // - The next store in the list - #1 - has the same distance from store #5 as
  //   store #4.
  // - Try to vectorize the sequence of stores 4,2,3,5.
  // - If all these stores are vectorized - just drop them.
  // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
  // - Start a new stores sequence.
  //   The new bunch of stores is {1, {1, 0}}.
  // - Add the stores from the previous sequence that were not vectorized. Here
  //   we consider the stores in reversed order, rather than the order in which
  //   they appear in the IR (Stores are reversed already, see the
  //   vectorizeStoreChains() function).
  //   Store #3 can be added - it comes after store #4 with the same distance
  //   as store #1.
  //   Store #5 cannot be added - it comes before store #4.
  //   This logic helps improve compile time: we assume that the stores after
  //   the previous store with the same distance most likely have memory
  //   dependencies, so there is no need to waste compile time trying to
  //   vectorize them.
  // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
19028auto FillStoresSet = [&](unsignedIdx,StoreInst *SI) {
19029for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19030 std::optional<int> Diff =getPointersDiff(
19031 Stores[Set.first]->getValueOperand()->getType(),
19032 Stores[Set.first]->getPointerOperand(),
19033SI->getValueOperand()->getType(),SI->getPointerOperand(), *DL, *SE,
19034/*StrictCheck=*/true);
19035if (!Diff)
19036continue;
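      // The set is ordered by distance, so this lookup detects a store with
      // the same distance from the base as SI.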
19037auto It =Set.second.find(std::make_pair(Idx, *Diff));
19038if (It ==Set.second.end()) {
19039Set.second.emplace(Idx, *Diff);
19040return;
19041 }
19042// Try to vectorize the first found set to avoid duplicate analysis.
19043 TryToVectorize(Set.second);
19044unsigned ItIdx = It->first;
19045int ItDist = It->second;
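      // Keep the stores that were recorded after the duplicate store (index
      // greater than ItIdx); they may be re-added to the new sequence below.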
19046 StoreIndexToDistSet PrevSet;
19047copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
19048 [&](const std::pair<unsigned, int> &Pair) {
19049 return Pair.first > ItIdx;
19050 });
19051Set.second.clear();
19052Set.first =Idx;
19053Set.second.emplace(Idx, 0);
      // Insert the stores that followed the previous match, to try to
      // vectorize them together with this store.
19056unsigned StartIdx = ItIdx + 1;
19057SmallBitVector UsedStores(Idx - StartIdx);
      // Distances to the previously found duplicate store (or to this store,
      // since they store to the same addresses).
19060SmallVector<int> Dists(Idx - StartIdx, 0);
19061for (const std::pair<unsigned, int> &Pair :reverse(PrevSet)) {
        // Do not try to vectorize sequences we have already tried.
19063if (VectorizedStores.contains(Stores[Pair.first]))
19064break;
19065unsigned BI = Pair.first - StartIdx;
19066 UsedStores.set(BI);
19067 Dists[BI] = Pair.second - ItDist;
19068 }
19069for (unsignedI = StartIdx;I <Idx; ++I) {
19070unsigned BI =I - StartIdx;
19071if (UsedStores.test(BI))
19072Set.second.emplace(I, Dists[BI]);
19073 }
19074return;
19075 }
19076auto &Res = SortedStores.emplace_back();
19077 Res.first =Idx;
19078 Res.second.emplace(Idx, 0);
19079 };
19080Type *PrevValTy =nullptr;
19081for (auto [I, SI] :enumerate(Stores)) {
19082if (R.isDeleted(SI))
19083continue;
19084if (!PrevValTy)
19085 PrevValTy =SI->getValueOperand()->getType();
19086// Check that we do not try to vectorize stores of different types.
19087if (PrevValTy !=SI->getValueOperand()->getType()) {
19088for (auto &Set : SortedStores)
19089 TryToVectorize(Set.second);
19090 SortedStores.clear();
19091 PrevValTy =SI->getValueOperand()->getType();
19092 }
19093 FillStoresSet(I, SI);
19094 }
19095
19096// Final vectorization attempt.
19097for (auto &Set : SortedStores)
19098 TryToVectorize(Set.second);
19099
19100return Changed;
19101}
19102
19103void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
19104// Initialize the collections. We will make a single pass over the block.
19105 Stores.clear();
19106 GEPs.clear();
19107
19108// Visit the store and getelementptr instructions in BB and organize them in
19109// Stores and GEPs according to the underlying objects of their pointer
19110// operands.
19111for (Instruction &I : *BB) {
19112// Ignore store instructions that are volatile or have a pointer operand
19113// that doesn't point to a scalar type.
19114if (auto *SI = dyn_cast<StoreInst>(&I)) {
19115if (!SI->isSimple())
19116continue;
19117if (!isValidElementType(SI->getValueOperand()->getType()))
19118continue;
19119 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
19120 }
19121
19122// Ignore getelementptr instructions that have more than one index, a
19123// constant index, or a pointer operand that doesn't point to a scalar
19124// type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
19126if (GEP->getNumIndices() != 1)
19127continue;
19128Value *Idx =GEP->idx_begin()->get();
19129if (isa<Constant>(Idx))
19130continue;
19131if (!isValidElementType(Idx->getType()))
19132continue;
19133if (GEP->getType()->isVectorTy())
19134continue;
19135 GEPs[GEP->getPointerOperand()].push_back(GEP);
19136 }
19137 }
19138}
19139
19140bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL,BoUpSLP &R,
19141bool MaxVFOnly) {
19142if (VL.size() < 2)
19143returnfalse;
19144
19145LLVM_DEBUG(dbgs() <<"SLP: Trying to vectorize a list of length = "
19146 << VL.size() <<".\n");
19147
  // Check that all of the parts are instructions of the same type;
  // we permit an alternate opcode via InstructionsState.
19150 InstructionsState S =getSameOpcode(VL, *TLI);
19151if (!S)
19152returnfalse;
19153
19154Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector types) are rejected before
  // determining the vectorization factor for scalar instructions.
19157for (Value *V : VL) {
19158Type *Ty =V->getType();
19159if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give the user an internal LLVM type name,
      // which may not be useful.
19162R.getORE()->emit([&]() {
19163 std::string TypeStr;
19164llvm::raw_string_ostream rso(TypeStr);
19165 Ty->print(rso);
19166returnOptimizationRemarkMissed(SV_NAME,"UnsupportedType", I0)
19167 <<"Cannot SLP vectorize list: type "
19168 << TypeStr +" is unsupported by vectorizer";
19169 });
19170returnfalse;
19171 }
19172 }
19173
19174Type *ScalarTy =getValueType(VL[0]);
19175unsigned Sz =R.getVectorElementSize(I0);
19176unsigned MinVF =R.getMinVF(Sz);
19177unsigned MaxVF = std::max<unsigned>(
19178getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
19179 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19180if (MaxVF < 2) {
19181R.getORE()->emit([&]() {
19182returnOptimizationRemarkMissed(SV_NAME,"SmallVF", I0)
19183 <<"Cannot SLP vectorize list: vectorization factor "
19184 <<"less than 2 is not supported";
19185 });
19186returnfalse;
19187 }
19188
19189bool Changed =false;
19190bool CandidateFound =false;
19191InstructionCost MinCost =SLPCostThreshold.getValue();
19192
19193unsigned NextInst = 0, MaxInst = VL.size();
19194for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19195 VF =getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
    // No actual vectorization should happen if the number of parts is the same
    // as the provided vectorization factor (i.e. the scalar type is used for
    // the vector code during codegen).
19199auto *VecTy =getWidenedType(ScalarTy, VF);
19200if (TTI->getNumberOfParts(VecTy) == VF)
19201continue;
19202for (unsignedI = NextInst;I < MaxInst; ++I) {
19203unsigned ActualVF = std::min(MaxInst -I, VF);
19204
19205if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
19206continue;
19207
19208if (MaxVFOnly && ActualVF < MaxVF)
19209break;
19210if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
19211break;
19212
19213SmallVector<Value *> Ops(ActualVF,nullptr);
19214unsignedIdx = 0;
19215for (Value *V : VL.drop_front(I)) {
19216// Check that a previous iteration of this loop did not delete the
19217// Value.
19218if (auto *Inst = dyn_cast<Instruction>(V);
19219 !Inst || !R.isDeleted(Inst)) {
19220 Ops[Idx] =V;
19221 ++Idx;
19222if (Idx == ActualVF)
19223break;
19224 }
19225 }
19226// Not enough vectorizable instructions - exit.
19227if (Idx != ActualVF)
19228break;
19229
19230LLVM_DEBUG(dbgs() <<"SLP: Analyzing " << ActualVF <<" operations "
19231 <<"\n");
19232
19233R.buildTree(Ops);
19234if (R.isTreeTinyAndNotFullyVectorizable())
19235continue;
19236R.reorderTopToBottom();
19237R.reorderBottomToTop(
19238/*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
19239 !R.doesRootHaveInTreeUses());
19240R.transformNodes();
19241R.buildExternalUses();
19242
19243R.computeMinimumValueSizes();
19244InstructionCostCost =R.getTreeCost();
19245 CandidateFound =true;
19246 MinCost = std::min(MinCost,Cost);
19247
19248LLVM_DEBUG(dbgs() <<"SLP: Found cost = " <<Cost
19249 <<" for VF=" << ActualVF <<"\n");
19250if (Cost < -SLPCostThreshold) {
19251LLVM_DEBUG(dbgs() <<"SLP: Vectorizing list at cost:" <<Cost <<".\n");
19252R.getORE()->emit(OptimizationRemark(SV_NAME,"VectorizedList",
19253 cast<Instruction>(Ops[0]))
19254 <<"SLP vectorized with cost " <<ore::NV("Cost",Cost)
19255 <<" and with tree size "
19256 <<ore::NV("TreeSize",R.getTreeSize()));
19257
19258R.vectorizeTree();
19259// Move to the next bundle.
19260I += VF - 1;
19261 NextInst =I + 1;
19262 Changed =true;
19263 }
19264 }
19265 }
19266
19267if (!Changed && CandidateFound) {
19268R.getORE()->emit([&]() {
19269returnOptimizationRemarkMissed(SV_NAME,"NotBeneficial", I0)
19270 <<"List vectorization was possible but not beneficial with cost "
19271 <<ore::NV("Cost", MinCost) <<" >= "
19272 <<ore::NV("Treshold", -SLPCostThreshold);
19273 });
  } else if (!Changed) {
19275R.getORE()->emit([&]() {
19276returnOptimizationRemarkMissed(SV_NAME,"NotPossible", I0)
19277 <<"Cannot SLP vectorize list: vectorization was impossible"
19278 <<" with available vectorization factors";
19279 });
19280 }
19281return Changed;
19282}
19283
19284bool SLPVectorizerPass::tryToVectorize(Instruction *I,BoUpSLP &R) {
19285if (!I)
19286returnfalse;
19287
19288if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
19289returnfalse;
19290
19291Value *P =I->getParent();
19292
19293// Vectorize in current basic block only.
19294auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
19295auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
19296if (!Op0 || !Op1 || Op0->getParent() !=P || Op1->getParent() !=P ||
19297R.isDeleted(Op0) ||R.isDeleted(Op1))
19298returnfalse;
19299
19300// First collect all possible candidates
19301SmallVector<std::pair<Value *, Value *>, 4> Candidates;
19302 Candidates.emplace_back(Op0, Op1);
19303
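  // If one operand is itself a binary operator with a single use, also try
  // pairing the other operand with that operator's operands.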
19304auto *A = dyn_cast<BinaryOperator>(Op0);
19305auto *B = dyn_cast<BinaryOperator>(Op1);
19306// Try to skip B.
19307if (A &&B &&B->hasOneUse()) {
19308auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
19309auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
19310if (B0 && B0->getParent() ==P && !R.isDeleted(B0))
19311 Candidates.emplace_back(A, B0);
19312if (B1 && B1->getParent() ==P && !R.isDeleted(B1))
19313 Candidates.emplace_back(A, B1);
19314 }
19315// Try to skip A.
19316if (B &&A &&A->hasOneUse()) {
19317auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
19318auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
19319if (A0 && A0->getParent() ==P && !R.isDeleted(A0))
19320 Candidates.emplace_back(A0,B);
19321if (A1 && A1->getParent() ==P && !R.isDeleted(A1))
19322 Candidates.emplace_back(A1,B);
19323 }
19324
19325if (Candidates.size() == 1)
19326return tryToVectorizeList({Op0, Op1},R);
19327
19328// We have multiple options. Try to pick the single best.
19329 std::optional<int> BestCandidate =R.findBestRootPair(Candidates);
19330if (!BestCandidate)
19331returnfalse;
19332return tryToVectorizeList(
19333 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},R);
19334}
19335
19336namespace{
19337
19338/// Model horizontal reductions.
19339///
19340/// A horizontal reduction is a tree of reduction instructions that has values
19341/// that can be put into a vector as its leaves. For example:
19342///
19343/// mul mul mul mul
19344/// \ / \ /
19345/// + +
19346/// \ /
19347/// +
19348/// This tree has "mul" as its leaf values and "+" as its reduction
19349/// instructions. A reduction can feed into a store or a binary operation
19350/// feeding a phi.
19351/// ...
19352/// \ /
19353/// +
19354/// |
19355/// phi +=
19356///
19357/// Or:
19358/// ...
19359/// \ /
19360/// +
19361/// |
19362/// *p =
19363///
19364classHorizontalReduction {
19365usingReductionOpsType =SmallVector<Value *, 16>;
19366usingReductionOpsListType =SmallVector<ReductionOpsType, 2>;
19367 ReductionOpsListType ReductionOps;
19368 /// List of possibly reduced values.
19369SmallVector<SmallVector<Value *>> ReducedVals;
19370 /// Maps reduced value to the corresponding reduction operation.
19371SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
19372WeakTrackingVH ReductionRoot;
19373 /// The type of reduction operation.
19374RecurKind RdxKind;
19375 /// Checks if the optimization of original scalar identity operations on
19376 /// matched horizontal reductions is enabled and allowed.
19377bool IsSupportedHorRdxIdentityOp =false;
19378
19379staticbool isCmpSelMinMax(Instruction *I) {
19380returnmatch(I,m_Select(m_Cmp(),m_Value(),m_Value())) &&
19381RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
19382 }
19383
19384// And/or are potentially poison-safe logical patterns like:
19385// select x, y, false
19386// select x, true, y
19387staticbool isBoolLogicOp(Instruction *I) {
19388return isa<SelectInst>(I) &&
19389 (match(I,m_LogicalAnd()) ||match(I,m_LogicalOr()));
19390 }
19391
19392 /// Checks if instruction is associative and can be vectorized.
19393staticbool isVectorizable(RecurKind Kind,Instruction *I) {
19394if (Kind == RecurKind::None)
19395returnfalse;
19396
19397// Integer ops that map to select instructions or intrinsics are fine.
19398if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
19399 isBoolLogicOp(I))
19400returntrue;
19401
19402if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19403// FP min/max are associative except for NaN and -0.0. We do not
19404// have to rule out -0.0 here because the intrinsic semantics do not
19405// specify a fixed result for it.
19406returnI->getFastMathFlags().noNaNs();
19407 }
19408
19409if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19410returntrue;
19411
19412returnI->isAssociative();
19413 }
19414
19415staticValue *getRdxOperand(Instruction *I,unsignedIndex) {
19416// Poison-safe 'or' takes the form: select X, true, Y
19417// To make that work with the normal operand processing, we skip the
19418// true value operand.
19419// TODO: Change the code and data structures to handle this without a hack.
19420if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) &&Index == 1)
19421returnI->getOperand(2);
19422returnI->getOperand(Index);
19423 }
19424
19425 /// Creates reduction operation with the current opcode.
19426staticValue *createOp(IRBuilderBase &Builder,RecurKind Kind,Value *LHS,
19427Value *RHS,constTwine &Name,bool UseSelect) {
19428switch (Kind) {
19429case RecurKind::Or: {
19430if (UseSelect &&
19431LHS->getType() ==CmpInst::makeCmpResultType(LHS->getType()))
19432return Builder.CreateSelect(LHS, Builder.getTrue(),RHS,Name);
19433unsigned RdxOpcode =RecurrenceDescriptor::getOpcode(Kind);
19434return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode,LHS,RHS,
19435Name);
19436 }
19437case RecurKind::And: {
19438if (UseSelect &&
19439LHS->getType() ==CmpInst::makeCmpResultType(LHS->getType()))
19440return Builder.CreateSelect(LHS,RHS, Builder.getFalse(),Name);
19441unsigned RdxOpcode =RecurrenceDescriptor::getOpcode(Kind);
19442return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode,LHS,RHS,
19443Name);
19444 }
19445case RecurKind::Add:
19446case RecurKind::Mul:
19447case RecurKind::Xor:
19448case RecurKind::FAdd:
19449case RecurKind::FMul: {
19450unsigned RdxOpcode =RecurrenceDescriptor::getOpcode(Kind);
19451return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode,LHS,RHS,
19452Name);
19453 }
19454case RecurKind::SMax:
19455case RecurKind::SMin:
19456case RecurKind::UMax:
19457case RecurKind::UMin:
19458if (UseSelect) {
19459CmpInst::Predicate Pred =llvm::getMinMaxReductionPredicate(Kind);
19460Value *Cmp = Builder.CreateICmp(Pred,LHS,RHS,Name);
19461return Builder.CreateSelect(Cmp,LHS,RHS,Name);
19462 }
19463 [[fallthrough]];
19464case RecurKind::FMax:
19465case RecurKind::FMin:
19466case RecurKind::FMaximum:
19467case RecurKind::FMinimum: {
19468Intrinsic::IDId =llvm::getMinMaxReductionIntrinsicOp(Kind);
19469return Builder.CreateBinaryIntrinsic(Id,LHS,RHS);
19470 }
19471default:
19472llvm_unreachable("Unknown reduction operation.");
19473 }
19474 }
19475
19476 /// Creates reduction operation with the current opcode with the IR flags
19477 /// from \p ReductionOps, dropping nuw/nsw flags.
19478staticValue *createOp(IRBuilderBase &Builder,RecurKind RdxKind,Value *LHS,
19479Value *RHS,constTwine &Name,
19480const ReductionOpsListType &ReductionOps) {
19481bool UseSelect = ReductionOps.size() == 2 ||
19482// Logical or/and.
19483 (ReductionOps.size() == 1 &&
19484any_of(ReductionOps.front(), IsaPred<SelectInst>));
19485assert((!UseSelect || ReductionOps.size() != 2 ||
19486 isa<SelectInst>(ReductionOps[1][0])) &&
19487"Expected cmp + select pairs for reduction");
19488Value *Op = createOp(Builder, RdxKind,LHS,RHS,Name, UseSelect);
19489if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
19490if (auto *Sel = dyn_cast<SelectInst>(Op)) {
19491propagateIRFlags(Sel->getCondition(), ReductionOps[0],nullptr,
19492/*IncludeWrapFlags=*/false);
19493propagateIRFlags(Op, ReductionOps[1],nullptr,
19494/*IncludeWrapFlags=*/false);
19495returnOp;
19496 }
19497 }
19498propagateIRFlags(Op, ReductionOps[0],nullptr,/*IncludeWrapFlags=*/false);
19499returnOp;
19500 }
19501
19502public:
19503staticRecurKindgetRdxKind(Value *V) {
19504auto *I = dyn_cast<Instruction>(V);
19505if (!I)
19506return RecurKind::None;
19507if (match(I,m_Add(m_Value(),m_Value())))
19508return RecurKind::Add;
19509if (match(I,m_Mul(m_Value(),m_Value())))
19510return RecurKind::Mul;
19511if (match(I,m_And(m_Value(),m_Value())) ||
19512match(I,m_LogicalAnd(m_Value(),m_Value())))
19513return RecurKind::And;
19514if (match(I,m_Or(m_Value(),m_Value())) ||
19515match(I,m_LogicalOr(m_Value(),m_Value())))
19516return RecurKind::Or;
19517if (match(I,m_Xor(m_Value(),m_Value())))
19518return RecurKind::Xor;
19519if (match(I,m_FAdd(m_Value(),m_Value())))
19520return RecurKind::FAdd;
19521if (match(I,m_FMul(m_Value(),m_Value())))
19522return RecurKind::FMul;
19523
19524if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(),m_Value())))
19525return RecurKind::FMax;
19526if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(),m_Value())))
19527return RecurKind::FMin;
19528
19529if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(),m_Value())))
19530return RecurKind::FMaximum;
19531if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(),m_Value())))
19532return RecurKind::FMinimum;
19533// This matches either cmp+select or intrinsics. SLP is expected to handle
19534// either form.
19535// TODO: If we are canonicalizing to intrinsics, we can remove several
19536// special-case paths that deal with selects.
19537if (match(I,m_SMax(m_Value(),m_Value())))
19538return RecurKind::SMax;
19539if (match(I,m_SMin(m_Value(),m_Value())))
19540return RecurKind::SMin;
19541if (match(I,m_UMax(m_Value(),m_Value())))
19542return RecurKind::UMax;
19543if (match(I,m_UMin(m_Value(),m_Value())))
19544return RecurKind::UMin;
19545
19546if (auto *Select = dyn_cast<SelectInst>(I)) {
19547// Try harder: look for min/max pattern based on instructions producing
19548// same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
      // During the intermediate stages of SLP it is very common to have a
      // pattern like this (since optimizeGatherSequence is run only once
      // at the end):
19552// %1 = extractelement <2 x i32> %a, i32 0
19553// %2 = extractelement <2 x i32> %a, i32 1
19554// %cond = icmp sgt i32 %1, %2
19555// %3 = extractelement <2 x i32> %a, i32 0
19556// %4 = extractelement <2 x i32> %a, i32 1
19557// %select = select i1 %cond, i32 %3, i32 %4
19558CmpPredicate Pred;
19559Instruction *L1;
19560Instruction *L2;
19561
19562Value *LHS =Select->getTrueValue();
19563Value *RHS =Select->getFalseValue();
19564Value *Cond =Select->getCondition();
19565
19566// TODO: Support inverse predicates.
19567if (match(Cond,m_Cmp(Pred,m_Specific(LHS),m_Instruction(L2)))) {
19568if (!isa<ExtractElementInst>(RHS) ||
19569 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19570return RecurKind::None;
19571 }elseif (match(Cond,m_Cmp(Pred,m_Instruction(L1),m_Specific(RHS)))) {
19572if (!isa<ExtractElementInst>(LHS) ||
19573 !L1->isIdenticalTo(cast<Instruction>(LHS)))
19574return RecurKind::None;
19575 }else {
19576if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
19577return RecurKind::None;
19578if (!match(Cond,m_Cmp(Pred,m_Instruction(L1),m_Instruction(L2))) ||
19579 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
19580 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19581return RecurKind::None;
19582 }
19583
19584switch (Pred) {
19585default:
19586return RecurKind::None;
19587caseCmpInst::ICMP_SGT:
19588caseCmpInst::ICMP_SGE:
19589return RecurKind::SMax;
19590caseCmpInst::ICMP_SLT:
19591caseCmpInst::ICMP_SLE:
19592return RecurKind::SMin;
19593caseCmpInst::ICMP_UGT:
19594caseCmpInst::ICMP_UGE:
19595return RecurKind::UMax;
19596caseCmpInst::ICMP_ULT:
19597caseCmpInst::ICMP_ULE:
19598return RecurKind::UMin;
19599 }
19600 }
19601return RecurKind::None;
19602 }
19603
19604 /// Get the index of the first operand.
19605staticunsigned getFirstOperandIndex(Instruction *I) {
19606return isCmpSelMinMax(I) ? 1 : 0;
19607 }
19608
19609private:
19610 /// Total number of operands in the reduction operation.
19611staticunsigned getNumberOfOperands(Instruction *I) {
19612return isCmpSelMinMax(I) ? 3 : 2;
19613 }
19614
19615 /// Checks if the instruction is in basic block \p BB.
19616 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
19617staticbool hasSameParent(Instruction *I,BasicBlock *BB) {
19618if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
19619auto *Sel = cast<SelectInst>(I);
19620auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
19621return Sel->getParent() == BB &&Cmp &&Cmp->getParent() == BB;
19622 }
19623returnI->getParent() == BB;
19624 }
19625
19626 /// Expected number of uses for reduction operations/reduced values.
19627staticbool hasRequiredNumberOfUses(bool IsCmpSelMinMax,Instruction *I) {
19628if (IsCmpSelMinMax) {
      // The SelectInst must be used twice, while the condition op must have a
      // single use only.
19631if (auto *Sel = dyn_cast<SelectInst>(I))
19632return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
19633returnI->hasNUses(2);
19634 }
19635
19636// Arithmetic reduction operation must be used once only.
19637returnI->hasOneUse();
19638 }
19639
19640 /// Initializes the list of reduction operations.
19641void initReductionOps(Instruction *I) {
19642if (isCmpSelMinMax(I))
19643 ReductionOps.assign(2, ReductionOpsType());
19644else
19645 ReductionOps.assign(1, ReductionOpsType());
19646 }
19647
19648 /// Add all reduction operations for the reduction instruction \p I.
19649void addReductionOps(Instruction *I) {
19650if (isCmpSelMinMax(I)) {
19651 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
19652 ReductionOps[1].emplace_back(I);
19653 }else {
19654 ReductionOps[0].emplace_back(I);
19655 }
19656 }
19657
19658staticbool isGoodForReduction(ArrayRef<Value *> Data) {
19659int Sz = Data.size();
19660auto *I = dyn_cast<Instruction>(Data.front());
19661return Sz > 1 ||isConstant(Data.front()) ||
19662 (I && !isa<LoadInst>(I) &&isValidForAlternation(I->getOpcode()));
19663 }
19664
19665public:
19666HorizontalReduction() =default;
19667
19668 /// Try to find a reduction tree.
19669bool matchAssociativeReduction(BoUpSLP &R,Instruction *Root,
19670ScalarEvolution &SE,constDataLayout &DL,
19671constTargetLibraryInfo &TLI) {
19672 RdxKind = HorizontalReduction::getRdxKind(Root);
19673if (!isVectorizable(RdxKind, Root))
19674returnfalse;
19675
19676// Analyze "regular" integer/FP types for reductions - no target-specific
19677// types or pointers.
19678Type *Ty = Root->getType();
19679if (!isValidElementType(Ty) || Ty->isPointerTy())
19680returnfalse;
19681
    // Though the ultimate reduction may have multiple uses, its condition must
    // have only a single use.
19684if (auto *Sel = dyn_cast<SelectInst>(Root))
19685if (!Sel->getCondition()->hasOneUse())
19686returnfalse;
19687
19688 ReductionRoot = Root;
19689
19690// Iterate through all the operands of the possible reduction tree and
19691// gather all the reduced values, sorting them by their value id.
19692BasicBlock *BB = Root->getParent();
19693bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19694SmallVector<std::pair<Instruction *, unsigned>> Worklist(
19695 1, std::make_pair(Root, 0));
19696// Checks if the operands of the \p TreeN instruction are also reduction
19697// operations or should be treated as reduced values or an extra argument,
19698// which is not part of the reduction.
19699auto CheckOperands = [&](Instruction *TreeN,
19700SmallVectorImpl<Value *> &PossibleReducedVals,
19701SmallVectorImpl<Instruction *> &ReductionOps,
19702unsigned Level) {
19703for (intI :reverse(seq<int>(getFirstOperandIndex(TreeN),
19704 getNumberOfOperands(TreeN)))) {
19705Value *EdgeVal = getRdxOperand(TreeN,I);
19706 ReducedValsToOps[EdgeVal].push_back(TreeN);
19707auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it differs from the main
        // reduction opcode, or it has too many uses, it is a possible reduced
        // value. Also, do not try to reduce constant values if the operation
        // is not foldable.
19712if (!EdgeInst || Level >RecursionMaxDepth ||
19713getRdxKind(EdgeInst) != RdxKind ||
19714 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19715 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19716 !isVectorizable(RdxKind, EdgeInst) ||
19717 (R.isAnalyzedReductionRoot(EdgeInst) &&
19718all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19719 PossibleReducedVals.push_back(EdgeVal);
19720continue;
19721 }
19722 ReductionOps.push_back(EdgeInst);
19723 }
19724 };
    // Try to regroup the reduced values so that it becomes more profitable to
    // reduce them. Values are grouped by their value ids, instructions by
    // instruction opcode id and/or alternate opcode id, with extra analysis
    // for loads (grouping them by the distance between pointers) and cmp
    // instructions (grouping them by the predicate).
19730SmallMapVector<
19731 size_t,SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
19732 8>
19733 PossibleReducedVals;
19734 initReductionOps(Root);
19735DenseMap<std::pair<size_t, Value *>,SmallVector<LoadInst *>> LoadsMap;
19736SmallSet<size_t, 2> LoadKeyUsed;
19737
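    // Computes a hash subkey for a load: loads whose pointers are provably
    // adjacent or otherwise compatible share a subkey, so they end up grouped
    // into the same bucket of possible reduced values.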
19738auto GenerateLoadsSubkey = [&](size_tKey,LoadInst *LI) {
19739Key =hash_combine(hash_value(LI->getParent()), Key);
19740Value *Ptr =
19741getUnderlyingObject(LI->getPointerOperand(),RecursionMaxDepth);
19742if (!LoadKeyUsed.insert(Key).second) {
19743auto LIt = LoadsMap.find(std::make_pair(Key,Ptr));
19744if (LIt != LoadsMap.end()) {
19745for (LoadInst *RLI : LIt->second) {
19746if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
19747 LI->getType(), LI->getPointerOperand(),DL, SE,
19748/*StrictCheck=*/true))
19749returnhash_value(RLI->getPointerOperand());
19750 }
19751for (LoadInst *RLI : LIt->second) {
19752if (arePointersCompatible(RLI->getPointerOperand(),
19753 LI->getPointerOperand(), TLI)) {
19754hash_code SubKey =hash_value(RLI->getPointerOperand());
19755return SubKey;
19756 }
19757 }
19758if (LIt->second.size() > 2) {
19759hash_code SubKey =
19760hash_value(LIt->second.back()->getPointerOperand());
19761return SubKey;
19762 }
19763 }
19764 }
19765 LoadsMap.try_emplace(std::make_pair(Key,Ptr))
19766 .first->second.push_back(LI);
19767returnhash_value(LI->getPointerOperand());
19768 };
19769
19770while (!Worklist.empty()) {
19771auto [TreeN, Level] = Worklist.pop_back_val();
19772SmallVector<Value *> PossibleRedVals;
19773SmallVector<Instruction *> PossibleReductionOps;
19774 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19775 addReductionOps(TreeN);
19776// Add reduction values. The values are sorted for better vectorization
19777// results.
19778for (Value *V : PossibleRedVals) {
19779size_tKey,Idx;
19780 std::tie(Key,Idx) =generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
19781/*AllowAlternate=*/false);
19782 ++PossibleReducedVals[Key][Idx]
19783 .insert(std::make_pair(V, 0))
19784 .first->second;
19785 }
19786for (Instruction *I :reverse(PossibleReductionOps))
19787 Worklist.emplace_back(I,I->getParent() == BB ? 0 : Level + 1);
19788 }
19789auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds so that the reduction
    // starts from the longest possible sequences of reduced values.
19792for (auto &PossibleReducedVals : PossibleReducedValsVect) {
19793auto PossibleRedVals = PossibleReducedVals.second.takeVector();
19794SmallVector<SmallVector<Value *>> PossibleRedValsVect;
19795for (auto It = PossibleRedVals.begin(),E = PossibleRedVals.end();
19796 It !=E; ++It) {
19797 PossibleRedValsVect.emplace_back();
19798auto RedValsVect = It->second.takeVector();
19799stable_sort(RedValsVect,llvm::less_second());
19800for (const std::pair<Value *, unsigned> &Data : RedValsVect)
19801 PossibleRedValsVect.back().append(Data.second, Data.first);
19802 }
19803stable_sort(PossibleRedValsVect, [](constauto &P1,constauto &P2) {
19804returnP1.size() > P2.size();
19805 });
19806int NewIdx = -1;
19807for (ArrayRef<Value *> Data : PossibleRedValsVect) {
19808if (NewIdx < 0 ||
19809 (!isGoodForReduction(Data) &&
19810 (!isa<LoadInst>(Data.front()) ||
19811 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19812getUnderlyingObject(
19813 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19814getUnderlyingObject(
19815 cast<LoadInst>(ReducedVals[NewIdx].front())
19816 ->getPointerOperand())))) {
19817 NewIdx = ReducedVals.size();
19818 ReducedVals.emplace_back();
19819 }
19820 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
19821 }
19822 }
    // Sort the reduced values by the number of same/alternate opcodes and/or
    // pointer operands.
19825stable_sort(ReducedVals, [](ArrayRef<Value *> P1,ArrayRef<Value *> P2) {
19826returnP1.size() > P2.size();
19827 });
19828returntrue;
19829 }
19830
19831 /// Attempt to vectorize the tree found by matchAssociativeReduction.
19832Value *tryToReduce(BoUpSLP &V,constDataLayout &DL,TargetTransformInfo *TTI,
19833constTargetLibraryInfo &TLI,AssumptionCache *AC) {
19834constunsigned ReductionLimit =VectorizeNonPowerOf2 ? 3 : 4;
19835constexprunsigned RegMaxNumber = 4;
19836constexprunsigned RedValsMaxNumber = 128;
19837// If there are a sufficient number of reduction values, reduce
19838// to a nearby power-of-2. We can safely generate oversized
19839// vectors and rely on the backend to split them to legal sizes.
19840if (unsigned NumReducedVals = std::accumulate(
19841 ReducedVals.begin(), ReducedVals.end(), 0,
19842 [](unsigned Num,ArrayRef<Value *> Vals) ->unsigned {
19843 if (!isGoodForReduction(Vals))
19844 return Num;
19845 return Num + Vals.size();
19846 });
19847 NumReducedVals < ReductionLimit &&
19848all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
19849return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
19850 })) {
19851for (ReductionOpsType &RdxOps : ReductionOps)
19852for (Value *RdxOp : RdxOps)
19853V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19854returnnullptr;
19855 }
19856
19857IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
19858TargetFolder(DL));
19859 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
19860
    // Track the reduced values in case they are replaced by an extractelement
    // because of the vectorization.
19863DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
19864 ReducedVals.front().size());
19865
19866// The compare instruction of a min/max is the insertion point for new
19867// instructions and may be replaced with a new compare instruction.
19868auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
19869assert(isa<SelectInst>(RdxRootInst) &&
19870"Expected min/max reduction to have select root instruction");
19871Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19872assert(isa<Instruction>(ScalarCond) &&
19873"Expected min/max reduction to have compare condition");
19874return cast<Instruction>(ScalarCond);
19875 };
19876
19877bool AnyBoolLogicOp =any_of(ReductionOps.back(), [](Value *V) {
19878 return isBoolLogicOp(cast<Instruction>(V));
19879 });
19880// Return new VectorizedTree, based on previous value.
19881auto GetNewVectorizedTree = [&](Value *VectorizedTree,Value *Res) {
19882if (VectorizedTree) {
19883// Update the final value in the reduction.
19884 Builder.SetCurrentDebugLocation(
19885 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19886if (AnyBoolLogicOp) {
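          // For boolean logic ops decide how to avoid poison propagation: keep
          // the operand order if VectorizedTree is known to be safe, swap the
          // operands if only Res is known to be safe, and otherwise freeze
          // VectorizedTree.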
19887auto It = ReducedValsToOps.find(VectorizedTree);
19888auto It1 = ReducedValsToOps.find(Res);
19889if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
19890isGuaranteedNotToBePoison(VectorizedTree, AC) ||
19891 (It != ReducedValsToOps.end() &&
19892any_of(It->getSecond(), [&](Instruction *I) {
19893 return isBoolLogicOp(I) &&
19894 getRdxOperand(I, 0) == VectorizedTree;
19895 }))) {
19896 ;
19897 }elseif (isGuaranteedNotToBePoison(Res, AC) ||
19898 (It1 != ReducedValsToOps.end() &&
19899any_of(It1->getSecond(), [&](Instruction *I) {
19900 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19901 }))) {
19902std::swap(VectorizedTree, Res);
19903 }else {
19904 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
19905 }
19906 }
19907
19908return createOp(Builder, RdxKind, VectorizedTree, Res,"op.rdx",
19909 ReductionOps);
19910 }
19911// Initialize the final value in the reduction.
19912return Res;
19913 };
19914SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
19915 ReductionOps.front().size());
19916for (ReductionOpsType &RdxOps : ReductionOps)
19917for (Value *RdxOp : RdxOps) {
19918if (!RdxOp)
19919continue;
19920 IgnoreList.insert(RdxOp);
19921 }
19922// Intersect the fast-math-flags from all reduction operations.
19923FastMathFlags RdxFMF;
19924 RdxFMF.set();
19925for (Value *U : IgnoreList)
19926if (auto *FPMO = dyn_cast<FPMathOperator>(U))
19927 RdxFMF &= FPMO->getFastMathFlags();
19928bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19929
    // Need to track the reduced values; they may be changed during
    // vectorization of subvectors.
19932for (ArrayRef<Value *> Candidates : ReducedVals)
19933for (Value *V : Candidates)
19934 TrackedVals.try_emplace(V, V);
19935
19936auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
19937Value *V) ->unsigned & {
19938auto *It = MV.find(V);
19939assert(It != MV.end() &&"Unable to find given key.");
19940return It->second;
19941 };
19942
19943DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of the values that were reduced in other trees as part of gather
    // nodes and thus require an extract if fully vectorized in other trees.
19946SmallPtrSet<Value *, 4> RequiredExtract;
19947WeakTrackingVH VectorizedTree =nullptr;
19948bool CheckForReusedReductionOps =false;
19949// Try to vectorize elements based on their type.
19950SmallVector<InstructionsState> States;
19951for (ArrayRef<Value *> RV : ReducedVals)
19952 States.push_back(getSameOpcode(RV, TLI));
19953for (unsignedI = 0,E = ReducedVals.size();I <E; ++I) {
19954ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
19955 InstructionsState S = States[I];
19956SmallVector<Value *> Candidates;
19957 Candidates.reserve(2 * OrigReducedVals.size());
19958DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
19959for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19960Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
        // Check whether the reduction value was overridden by an
        // extractelement instruction because of the vectorization, and exclude
        // it if it is not compatible with the other values.
        // Also check whether the instruction was folded to a constant or to
        // another value.
19965auto *Inst = dyn_cast<Instruction>(RdxVal);
19966if ((Inst &&isVectorLikeInstWithConstOps(Inst) &&
19967 (!S || !S.isOpcodeOrAlt(Inst))) ||
19968 (S && !Inst))
19969continue;
19970 Candidates.push_back(RdxVal);
19971 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19972 }
19973bool ShuffledExtracts =false;
19974// Try to handle shuffled extractelements.
19975if (S && S.getOpcode() == Instruction::ExtractElement &&
19976 !S.isAltShuffle() &&I + 1 <E) {
19977SmallVector<Value *> CommonCandidates(Candidates);
19978for (Value *RV : ReducedVals[I + 1]) {
19979Value *RdxVal = TrackedVals.at(RV);
          // Check whether the reduction value was overridden by an
          // extractelement instruction because of the vectorization, and
          // exclude it if it is not compatible with the other values.
19983auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19984if (!Inst)
19985continue;
19986 CommonCandidates.push_back(RdxVal);
19987 TrackedToOrig.try_emplace(RdxVal, RV);
19988 }
19989SmallVector<int>Mask;
19990if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
19991 ++I;
19992 Candidates.swap(CommonCandidates);
19993 ShuffledExtracts =true;
19994 }
19995 }
19996
19997// Emit code for constant values.
19998if (Candidates.size() > 1 &&allConstant(Candidates)) {
19999Value *Res = Candidates.front();
20000Value *OrigV = TrackedToOrig.at(Candidates.front());
20001 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20002for (Value *VC :ArrayRef(Candidates).drop_front()) {
20003 Res = createOp(Builder, RdxKind, Res, VC,"const.rdx", ReductionOps);
20004Value *OrigV = TrackedToOrig.at(VC);
20005 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20006if (auto *ResI = dyn_cast<Instruction>(Res))
20007V.analyzedReductionRoot(ResI);
20008 }
20009 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
20010continue;
20011 }
20012
20013unsigned NumReducedVals = Candidates.size();
20014if (NumReducedVals < ReductionLimit &&
20015 (NumReducedVals < 2 || !isSplat(Candidates)))
20016continue;
20017
      // Check if we support processing of repeated scalar values (optimization
      // of the original scalar identity operations on matched horizontal
      // reductions).
20020 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20021 RdxKind != RecurKind::FMul &&
20022 RdxKind != RecurKind::FMulAdd;
20023// Gather same values.
20024SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
20025if (IsSupportedHorRdxIdentityOp)
20026for (Value *V : Candidates) {
20027Value *OrigV = TrackedToOrig.at(V);
20028 ++SameValuesCounter.try_emplace(OrigV).first->second;
20029 }
      // Used to check whether the reduced values are used the same number of
      // times. In that case the compiler may produce better code. E.g. if the
      // reduced values are aabbccdd (8 x values), then the first node of the
      // tree will have a node for 4 x abcd plus a shuffle
      // <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>, and the final reduction will be
      // performed on <8 x aabbccdd>. Instead the compiler may build the
      // <4 x abcd> tree immediately, followed by (reduction of 4 x abcd) * 2.
      // Currently this only handles add/fadd/xor; and/or/min/max do not
      // require this analysis, and other operations may require an extra
      // estimation of the profitability.
20040bool SameScaleFactor =false;
20041bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20042 SameValuesCounter.size() != Candidates.size();
20043BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
20044if (OptReusedScalars) {
20045 SameScaleFactor =
20046 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20047 RdxKind == RecurKind::Xor) &&
20048all_of(drop_begin(SameValuesCounter),
20049 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
20050returnP.second == SameValuesCounter.front().second;
20051 });
20052 Candidates.resize(SameValuesCounter.size());
20053transform(SameValuesCounter, Candidates.begin(),
20054 [&](constauto &P) { return TrackedVals.at(P.first); });
20055 NumReducedVals = Candidates.size();
20056// Have a reduction of the same element.
20057if (NumReducedVals == 1) {
20058Value *OrigV = TrackedToOrig.at(Candidates.front());
20059unsigned Cnt = At(SameValuesCounter, OrigV);
20060Value *RedVal =
20061 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20062 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20063 VectorizedVals.try_emplace(OrigV, Cnt);
20064 ExternallyUsedValues.insert(OrigV);
20065continue;
20066 }
20067 }
20068
20069unsigned MaxVecRegSize =V.getMaxVecRegSize();
20070unsigned EltSize =V.getVectorElementSize(Candidates[0]);
20071constunsigned MaxElts = std::clamp<unsigned>(
20072llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
20073 RegMaxNumber * RedValsMaxNumber);
20074
20075unsigned ReduxWidth = NumReducedVals;
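      // Shrinks the requested reduction width until the widened vector type
      // fits into the vector registers available on the target.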
20076auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
20077unsigned NumParts, NumRegs;
20078Type *ScalarTy = Candidates.front()->getType();
20079 ReduxWidth =
20080getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
20081VectorType *Tp =getWidenedType(ScalarTy, ReduxWidth);
20082 NumParts =TTI.getNumberOfParts(Tp);
20083 NumRegs =
20084TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20085while (NumParts > NumRegs) {
20086assert(ReduxWidth > 0 &&"ReduxWidth is unexpectedly 0.");
20087 ReduxWidth =bit_floor(ReduxWidth - 1);
20088VectorType *Tp =getWidenedType(ScalarTy, ReduxWidth);
20089 NumParts =TTI.getNumberOfParts(Tp);
20090 NumRegs =
20091TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20092 }
20093if (NumParts > NumRegs / 2)
20094 ReduxWidth =bit_floor(ReduxWidth);
20095return ReduxWidth;
20096 };
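      // A width of the form 2^k - 1 is kept as-is when non-power-of-2
      // vectorization is enabled; otherwise it is adjusted to a legal factor.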
20097if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
20098 ReduxWidth = GetVectorFactor(ReduxWidth);
20099 ReduxWidth = std::min(ReduxWidth, MaxElts);
20100
20101unsigned Start = 0;
20102unsigned Pos = Start;
      // Restarts the vectorization attempt with a lower vectorization factor.
20104unsigned PrevReduxWidth = ReduxWidth;
20105bool CheckForReusedReductionOpsLocal =false;
20106auto AdjustReducedVals = [&](bool IgnoreVL =false) {
20107bool IsAnyRedOpGathered = !IgnoreVL &&V.isAnyGathered(IgnoreList);
20108if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, it is worth
          // trying again with a smaller number of reduction ops.
20111 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20112 }
20113 ++Pos;
20114if (Pos < NumReducedVals - ReduxWidth + 1)
20115return IsAnyRedOpGathered;
20116 Pos = Start;
20117 --ReduxWidth;
20118if (ReduxWidth > 1)
20119 ReduxWidth = GetVectorFactor(ReduxWidth);
20120return IsAnyRedOpGathered;
20121 };
20122bool AnyVectorized =false;
20123SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
20124while (Pos < NumReducedVals - ReduxWidth + 1 &&
20125 ReduxWidth >= ReductionLimit) {
        // Dependency in the tree of the reduction ops - drop this attempt and
        // try later.
20128if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20129 Start == 0) {
20130 CheckForReusedReductionOps =true;
20131break;
20132 }
20133 PrevReduxWidth = ReduxWidth;
20134ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
20135// Been analyzed already - skip.
20136if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
20137 (!has_single_bit(ReduxWidth) &&
20138 (IgnoredCandidates.contains(
20139 std::make_pair(Pos,bit_floor(ReduxWidth))) ||
20140 IgnoredCandidates.contains(
20141 std::make_pair(Pos + (ReduxWidth -bit_floor(ReduxWidth)),
20142bit_floor(ReduxWidth))))) ||
20143V.areAnalyzedReductionVals(VL)) {
20144 (void)AdjustReducedVals(/*IgnoreVL=*/true);
20145continue;
20146 }
20147// Early exit if any of the reduction values were deleted during
20148// previous vectorization attempts.
20149if (any_of(VL, [&V](Value *RedVal) {
20150auto *RedValI = dyn_cast<Instruction>(RedVal);
20151if (!RedValI)
20152returnfalse;
20153returnV.isDeleted(RedValI);
20154 }))
20155break;
20156V.buildTree(VL, IgnoreList);
20157if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
20158if (!AdjustReducedVals())
20159V.analyzedReductionVals(VL);
20160continue;
20161 }
20162if (V.isLoadCombineReductionCandidate(RdxKind)) {
20163if (!AdjustReducedVals())
20164V.analyzedReductionVals(VL);
20165continue;
20166 }
20167V.reorderTopToBottom();
20168// No need to reorder the root node at all.
20169V.reorderBottomToTop(/*IgnoreReorder=*/true);
20170// Keep extracted other reduction values, if they are used in the
20171// vectorization trees.
20172BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
20173 ExternallyUsedValues);
20174// The reduction root is used as the insertion point for new
20175// instructions, so set it as externally used to prevent it from being
20176// deleted.
20177 LocalExternallyUsedValues.insert(ReductionRoot);
20178for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
20179if (Cnt ==I || (ShuffledExtracts && Cnt ==I - 1))
20180continue;
20181for (Value *V : ReducedVals[Cnt])
20182if (isa<Instruction>(V))
20183 LocalExternallyUsedValues.insert(TrackedVals[V]);
20184 }
20185if (!IsSupportedHorRdxIdentityOp) {
20186// Number of uses of the candidates in the vector of values.
20187assert(SameValuesCounter.empty() &&
20188"Reused values counter map is not empty");
20189for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20190if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20191continue;
20192Value *V = Candidates[Cnt];
20193Value *OrigV = TrackedToOrig.at(V);
20194 ++SameValuesCounter.try_emplace(OrigV).first->second;
20195 }
20196 }
20197V.transformNodes();
20198SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
20199// Gather externally used values.
20200SmallPtrSet<Value *, 4> Visited;
20201for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20202if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20203continue;
20204Value *RdxVal = Candidates[Cnt];
20205if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20206 RdxVal = It->second;
20207if (!Visited.insert(RdxVal).second)
20208continue;
20209// Check if the scalar was vectorized as part of the vectorization
20210// tree but not the top node.
20211if (!VLScalars.contains(RdxVal) &&V.isVectorized(RdxVal)) {
20212 LocalExternallyUsedValues.insert(RdxVal);
20213continue;
20214 }
20215Value *OrigV = TrackedToOrig.at(RdxVal);
20216unsigned NumOps =
20217 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20218if (NumOps != ReducedValsToOps.at(OrigV).size())
20219 LocalExternallyUsedValues.insert(RdxVal);
20220 }
20221// Do not need the list of reused scalars in regular mode anymore.
20222if (!IsSupportedHorRdxIdentityOp)
20223 SameValuesCounter.clear();
20224for (Value *RdxVal : VL)
20225if (RequiredExtract.contains(RdxVal))
20226 LocalExternallyUsedValues.insert(RdxVal);
20227V.buildExternalUses(LocalExternallyUsedValues);
20228
20229V.computeMinimumValueSizes();
20230
20231// Estimate cost.
20232InstructionCost TreeCost =V.getTreeCost(VL);
20233InstructionCost ReductionCost =
20234 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20235InstructionCostCost = TreeCost + ReductionCost;
20236LLVM_DEBUG(dbgs() <<"SLP: Found cost = " <<Cost
20237 <<" for reduction\n");
20238if (!Cost.isValid())
20239break;
20240if (Cost >= -SLPCostThreshold) {
20241V.getORE()->emit([&]() {
20242returnOptimizationRemarkMissed(SV_NAME,"HorSLPNotBeneficial",
20243 ReducedValsToOps.at(VL[0]).front())
20244 <<"Vectorizing horizontal reduction is possible "
20245 <<"but not beneficial with cost " <<ore::NV("Cost",Cost)
20246 <<" and threshold "
20247 <<ore::NV("Threshold", -SLPCostThreshold);
20248 });
20249if (!AdjustReducedVals()) {
20250V.analyzedReductionVals(VL);
20251unsignedOffset = Pos == Start ? Pos : Pos - 1;
20252if (ReduxWidth > ReductionLimit &&V.isTreeNotExtendable()) {
20253// Add subvectors of VL to the list of the analyzed values.
20254for (unsigned VF =getFloorFullVectorNumberOfElements(
20255 *TTI, VL.front()->getType(), ReduxWidth - 1);
20256 VF >= ReductionLimit;
20257 VF =getFloorFullVectorNumberOfElements(
20258 *TTI, VL.front()->getType(), VF - 1)) {
20259if (has_single_bit(VF) &&
20260V.getCanonicalGraphSize() !=V.getTreeSize())
20261continue;
20262for (unsignedIdx : seq<unsigned>(ReduxWidth - VF))
20263 IgnoredCandidates.insert(std::make_pair(Offset +Idx, VF));
20264 }
20265 }
20266 }
20267continue;
20268 }
20269
20270LLVM_DEBUG(dbgs() <<"SLP: Vectorizing horizontal reduction at cost:"
20271 <<Cost <<". (HorRdx)\n");
20272V.getORE()->emit([&]() {
20273returnOptimizationRemark(SV_NAME,"VectorizedHorizontalReduction",
20274 ReducedValsToOps.at(VL[0]).front())
20275 <<"Vectorized horizontal reduction with cost "
20276 <<ore::NV("Cost",Cost) <<" and with tree size "
20277 <<ore::NV("TreeSize",V.getTreeSize());
20278 });
20279
20280 Builder.setFastMathFlags(RdxFMF);
20281
20282// Emit a reduction. If the root is a select (min/max idiom), the insert
20283// point is the compare condition of that select.
20284Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20285Instruction *InsertPt = RdxRootInst;
20286if (IsCmpSelMinMax)
20287 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20288
20289// Vectorize a tree.
20290Value *VectorizedRoot =
20291V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20292// Update TrackedToOrig mapping, since the tracked values might be
20293// updated.
20294for (Value *RdxVal : Candidates) {
20295Value *OrigVal = TrackedToOrig.at(RdxVal);
20296Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20297if (TransformedRdxVal != RdxVal)
20298 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20299 }
20300
20301 Builder.SetInsertPoint(InsertPt);
20302
20303// To prevent poison from leaking across what used to be sequential,
20304// safe, scalar boolean logic operations, the reduction operand must be
20305// frozen.
20306if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
20307 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20308
20309// Emit code to correctly handle reused reduced values, if required.
20310if (OptReusedScalars && !SameScaleFactor) {
20311 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20312 SameValuesCounter, TrackedToOrig);
20313 }
20314
20315Value *ReducedSubTree;
20316Type *ScalarTy = VL.front()->getType();
20317if (isa<FixedVectorType>(ScalarTy)) {
20318assert(SLPReVec &&"FixedVectorType is not expected.");
20319unsigned ScalarTyNumElements =getNumElements(ScalarTy);
20320 ReducedSubTree =PoisonValue::get(FixedVectorType::get(
20321 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
20322for (unsignedI : seq<unsigned>(ScalarTyNumElements)) {
20323// Do reduction for each lane.
20324// e.g., do reduce add for
20325// VL[0] = <4 x Ty> <a, b, c, d>
20326// VL[1] = <4 x Ty> <e, f, g, h>
20327// Lane[0] = <2 x Ty> <a, e>
20328// Lane[1] = <2 x Ty> <b, f>
20329// Lane[2] = <2 x Ty> <c, g>
20330// Lane[3] = <2 x Ty> <d, h>
20331// result[0] = reduce add Lane[0]
20332// result[1] = reduce add Lane[1]
20333// result[2] = reduce add Lane[2]
20334// result[3] = reduce add Lane[3]
20335SmallVector<int, 16>Mask =
20336createStrideMask(I, ScalarTyNumElements, VL.size());
20337Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
20338 ReducedSubTree = Builder.CreateInsertElement(
20339 ReducedSubTree,
20340 emitReduction(Lane, Builder,TTI, RdxRootInst->getType()),I);
20341 }
20342 }else {
20343 ReducedSubTree = emitReduction(VectorizedRoot, Builder,TTI,
20344 RdxRootInst->getType());
20345 }
20346if (ReducedSubTree->getType() != VL.front()->getType()) {
20347assert(ReducedSubTree->getType() != VL.front()->getType() &&
20348"Expected different reduction type.");
20349 ReducedSubTree =
20350 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20351V.isSignedMinBitwidthRootNode());
20352 }
20353
        // Improved analysis for add/fadd/xor reductions with the same scale
        // factor for all operands of the reduction; we can emit scalar ops for
        // them instead.
20357if (OptReusedScalars && SameScaleFactor)
20358 ReducedSubTree = emitScaleForReusedOps(
20359 ReducedSubTree, Builder, SameValuesCounter.front().second);
20360
20361 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20362// Count vectorized reduced values to exclude them from final reduction.
20363for (Value *RdxVal : VL) {
20364Value *OrigV = TrackedToOrig.at(RdxVal);
20365if (IsSupportedHorRdxIdentityOp) {
20366 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20367continue;
20368 }
20369 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20370if (!V.isVectorized(RdxVal))
20371 RequiredExtract.insert(RdxVal);
20372 }
20373 Pos += ReduxWidth;
20374 Start = Pos;
20375 ReduxWidth = NumReducedVals - Pos;
20376if (ReduxWidth > 1)
20377 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20378 AnyVectorized =true;
20379 }
20380if (OptReusedScalars && !AnyVectorized) {
20381for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20382Value *RdxVal = TrackedVals.at(P.first);
20383Value *RedVal = emitScaleForReusedOps(RdxVal, Builder,P.second);
20384 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20385 VectorizedVals.try_emplace(P.first,P.second);
20386 }
20387continue;
20388 }
20389 }
20390if (VectorizedTree) {
20391// Reorder operands of bool logical op in the natural order to avoid
20392// possible problems with poison propagation. If it is not possible to reorder
20393// (both operands are originally RHS), emit an extra freeze instruction
20394// for the LHS operand.
20395// I.e., if we have original code like this:
20396// RedOp1 = select i1 ?, i1 LHS, i1 false
20397// RedOp2 = select i1 RHS, i1 ?, i1 false
20398
20399// Then, we swap LHS/RHS to create a new op that matches the poison
20400// semantics of the original code.
20401
20402// If we have original code like this and both values could be poison:
20403// RedOp1 = select i1 ?, i1 LHS, i1 false
20404// RedOp2 = select i1 ?, i1 RHS, i1 false
20405
20406// Then, we must freeze LHS in the new op.
20407auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS,Value *&RHS,
20408Instruction *RedOp1,
20409Instruction *RedOp2,
20410bool InitStep) {
20411if (!AnyBoolLogicOp)
20412return;
20413if (isBoolLogicOp(RedOp1) && ((!InitStep &&LHS == VectorizedTree) ||
20414 getRdxOperand(RedOp1, 0) ==LHS ||
20415isGuaranteedNotToBePoison(LHS, AC)))
20416return;
20417if (isBoolLogicOp(RedOp2) && ((!InitStep &&RHS == VectorizedTree) ||
20418 getRdxOperand(RedOp2, 0) ==RHS ||
20419isGuaranteedNotToBePoison(RHS, AC))) {
20420std::swap(LHS,RHS);
20421return;
20422 }
20423if (LHS != VectorizedTree)
20424LHS = Builder.CreateFreeze(LHS);
20425 };
20426// Finish the reduction.
20427// Need to add the extra arguments and the possible reduction values that
20428// were not vectorized.
20429// Try to avoid dependencies between the scalar remainders after
20430// reductions.
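        // Illustrative sketch (assumption, not part of the original comment):
        // with leftover scalars v0..v3 this pairing emits (v0 op v1) and
        // (v2 op v3) in one round and combines the two results in the next,
        // i.e. a balanced tree instead of the serial chain
        // ((v0 op v1) op v2) op v3.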
20431auto FinalGen =
20432 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
20433bool InitStep) {
20434unsigned Sz = InstVals.size();
20435SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
20436 Sz % 2);
20437for (unsignedI = 0,E = (Sz / 2) * 2;I <E;I += 2) {
20438Instruction *RedOp = InstVals[I + 1].first;
20439 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
20440Value *RdxVal1 = InstVals[I].second;
20441Value *StableRdxVal1 = RdxVal1;
20442auto It1 = TrackedVals.find(RdxVal1);
20443if (It1 != TrackedVals.end())
20444 StableRdxVal1 = It1->second;
20445Value *RdxVal2 = InstVals[I + 1].second;
20446Value *StableRdxVal2 = RdxVal2;
20447auto It2 = TrackedVals.find(RdxVal2);
20448if (It2 != TrackedVals.end())
20449 StableRdxVal2 = It2->second;
20450// To prevent poison from leaking across what used to be
20451// sequential, safe, scalar boolean logic operations, the
20452// reduction operand must be frozen.
20453 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20454 RedOp, InitStep);
20455Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20456 StableRdxVal2,"op.rdx", ReductionOps);
20457 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20458 }
20459if (Sz % 2 == 1)
20460 ExtraReds[Sz / 2] = InstVals.back();
20461return ExtraReds;
20462 };
20463SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
20464 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20465 VectorizedTree);
20466SmallPtrSet<Value *, 8> Visited;
20467for (ArrayRef<Value *> Candidates : ReducedVals) {
20468for (Value *RdxVal : Candidates) {
20469if (!Visited.insert(RdxVal).second)
20470continue;
20471unsigned NumOps = VectorizedVals.lookup(RdxVal);
20472for (Instruction *RedOp :
20473ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
20474 ExtraReductions.emplace_back(RedOp, RdxVal);
20475 }
20476 }
20477// Iterate through all not-vectorized reduction values/extra arguments.
20478bool InitStep =true;
20479while (ExtraReductions.size() > 1) {
20480SmallVector<std::pair<Instruction *, Value *>> NewReds =
20481 FinalGen(ExtraReductions, InitStep);
20482 ExtraReductions.swap(NewReds);
20483 InitStep =false;
20484 }
20485 VectorizedTree = ExtraReductions.front().second;
20486
20487 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20488
20489// The original scalar reduction is expected to have no remaining
20490// uses outside the reduction tree itself. Assert that we got this
20491// correct, replace internal uses with poison, and mark for eventual
20492// deletion.
20493#ifndef NDEBUG
20494SmallSet<Value *, 4> IgnoreSet;
20495for (ArrayRef<Value *> RdxOps : ReductionOps)
20496 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20497#endif
20498for (ArrayRef<Value *> RdxOps : ReductionOps) {
20499for (Value *Ignore : RdxOps) {
20500if (!Ignore)
20501continue;
20502#ifndef NDEBUG
20503for (auto *U :Ignore->users()) {
20504assert(IgnoreSet.count(U) &&
20505"All users must be either in the reduction ops list.");
20506 }
20507#endif
20508if (!Ignore->use_empty()) {
20509Value *P =PoisonValue::get(Ignore->getType());
20510Ignore->replaceAllUsesWith(P);
20511 }
20512 }
20513V.removeInstructionsAndOperands(RdxOps);
20514 }
20515 }elseif (!CheckForReusedReductionOps) {
20516for (ReductionOpsType &RdxOps : ReductionOps)
20517for (Value *RdxOp : RdxOps)
20518V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20519 }
20520return VectorizedTree;
20521 }
20522
20523private:
20524 /// Calculate the cost of a reduction.
20525InstructionCost getReductionCost(TargetTransformInfo *TTI,
20526ArrayRef<Value *> ReducedVals,
20527bool IsCmpSelMinMax,FastMathFlags FMF,
20528constBoUpSLP &R) {
20529TTI::TargetCostKindCostKind =TTI::TCK_RecipThroughput;
20530Type *ScalarTy = ReducedVals.front()->getType();
20531unsigned ReduxWidth = ReducedVals.size();
20532FixedVectorType *VectorTy =R.getReductionType();
20533InstructionCost VectorCost = 0, ScalarCost;
20534// If all of the reduced values are constant, the vector cost is 0, since
20535// the reduction value can be calculated at compile time.
20536bool AllConsts =allConstant(ReducedVals);
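    // Helper that sums the scalar cost of the reduction operations feeding
    // each reduced value; if some user cannot be costed directly, it falls
    // back to the generic per-operation cost returned by GenCostFn.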
20537auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
20538InstructionCostCost = 0;
20539// Scalar cost is repeated for N-1 elements.
20540int Cnt = ReducedVals.size();
20541for (Value *RdxVal : ReducedVals) {
20542if (Cnt == 1)
20543break;
20544 --Cnt;
20545if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
20546Cost += GenCostFn();
20547continue;
20548 }
20549InstructionCost ScalarCost = 0;
20550for (User *U : RdxVal->users()) {
20551auto *RdxOp = cast<Instruction>(U);
20552if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20553 ScalarCost +=TTI->getInstructionCost(RdxOp,CostKind);
20554continue;
20555 }
20556 ScalarCost =InstructionCost::getInvalid();
20557break;
20558 }
20559if (ScalarCost.isValid())
20560Cost += ScalarCost;
20561else
20562Cost += GenCostFn();
20563 }
20564returnCost;
20565 };
20566switch (RdxKind) {
20567case RecurKind::Add:
20568case RecurKind::Mul:
20569case RecurKind::Or:
20570case RecurKind::And:
20571case RecurKind::Xor:
20572case RecurKind::FAdd:
20573case RecurKind::FMul: {
20574unsigned RdxOpcode =RecurrenceDescriptor::getOpcode(RdxKind);
20575if (!AllConsts) {
20576if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20577assert(SLPReVec &&"FixedVectorType is not expected.");
20578unsigned ScalarTyNumElements = VecTy->getNumElements();
20579for (unsignedI : seq<unsigned>(ReducedVals.size())) {
20580 VectorCost +=TTI->getShuffleCost(
20581TTI::SK_PermuteSingleSrc, VectorTy,
20582createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
20583 VectorCost +=TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
20584CostKind);
20585 }
20586 VectorCost +=TTI->getScalarizationOverhead(
20587 VecTy,APInt::getAllOnes(ScalarTyNumElements),/*Insert*/true,
20588/*Extract*/false,TTI::TCK_RecipThroughput);
20589 }else {
20590Type *RedTy = VectorTy->getElementType();
20591auto [RType, IsSigned] =R.getRootNodeTypeWithNoCast().value_or(
20592 std::make_pair(RedTy,true));
20593if (RType == RedTy) {
20594 VectorCost =TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
20595 FMF,CostKind);
20596 }else {
20597 VectorCost =TTI->getExtendedReductionCost(
20598 RdxOpcode, !IsSigned, RedTy,getWidenedType(RType, ReduxWidth),
20599 FMF,CostKind);
20600 }
20601 }
20602 }
20603 ScalarCost = EvaluateScalarCost([&]() {
20604returnTTI->getArithmeticInstrCost(RdxOpcode, ScalarTy,CostKind);
20605 });
20606break;
20607 }
20608case RecurKind::FMax:
20609case RecurKind::FMin:
20610case RecurKind::FMaximum:
20611case RecurKind::FMinimum:
20612case RecurKind::SMax:
20613case RecurKind::SMin:
20614case RecurKind::UMax:
20615case RecurKind::UMin: {
20616Intrinsic::IDId =getMinMaxReductionIntrinsicOp(RdxKind);
20617if (!AllConsts)
20618 VectorCost =TTI->getMinMaxReductionCost(Id, VectorTy, FMF,CostKind);
20619 ScalarCost = EvaluateScalarCost([&]() {
20620IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
20621returnTTI->getIntrinsicInstrCost(ICA,CostKind);
20622 });
20623break;
20624 }
20625default:
20626llvm_unreachable("Expected arithmetic or min/max reduction operation");
20627 }
20628
20629LLVM_DEBUG(dbgs() <<"SLP: Adding cost " << VectorCost - ScalarCost
20630 <<" for reduction of " <<shortBundleName(ReducedVals)
20631 <<" (It is a splitting reduction)\n");
20632return VectorCost - ScalarCost;
20633 }
20634
20635 /// Emit a horizontal reduction of the vectorized value.
20636Value *emitReduction(Value *VectorizedValue,IRBuilderBase &Builder,
20637constTargetTransformInfo *TTI,Type *DestTy) {
20638assert(VectorizedValue &&"Need to have a vectorized tree node");
20639assert(RdxKind != RecurKind::FMulAdd &&
20640"A call to the llvm.fmuladd intrinsic is not handled yet");
20641
20642auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20643if (FTy->getScalarType() == Builder.getInt1Ty() &&
20644 RdxKind == RecurKind::Add &&
20645 DestTy->getScalarType() != FTy->getScalarType()) {
20646// Convert vector_reduce_add(ZExt(<n x i1>)) to
20647// ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
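      // Sketch of the emitted IR (illustrative, assuming a <4 x i1> input):
      //   %int = bitcast <4 x i1> %vec to i4
      //   %res = call i4 @llvm.ctpop.i4(i4 %int)
      // with any zext/trunc to the destination type handled by the caller.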
20648Value *V = Builder.CreateBitCast(
20649 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20650 ++NumVectorInstructions;
20651return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
20652 }
20653 ++NumVectorInstructions;
20654returncreateSimpleReduction(Builder, VectorizedValue, RdxKind);
20655 }
20656
20657 /// Emits optimized code for unique scalar value reused \p Cnt times.
20658Value *emitScaleForReusedOps(Value *VectorizedValue,IRBuilderBase &Builder,
20659unsigned Cnt) {
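    // Hedged illustration (not from the original source): for an add reduction
    // where one scalar x is reused Cnt = 4 times, the partial result becomes
    // x * 4; for xor, an even repeat count folds to 0 and an odd one to x.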
20660assert(IsSupportedHorRdxIdentityOp &&
20661"The optimization of matched scalar identity horizontal reductions "
20662"must be supported.");
20663if (Cnt == 1)
20664return VectorizedValue;
20665switch (RdxKind) {
20666case RecurKind::Add: {
20667// res = mul vv, n
20668Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20669LLVM_DEBUG(dbgs() <<"SLP: Add (to-mul) " << Cnt <<"of "
20670 << VectorizedValue <<". (HorRdx)\n");
20671return Builder.CreateMul(VectorizedValue, Scale);
20672 }
20673case RecurKind::Xor: {
20674// res = n % 2 ? 0 : vv
20675LLVM_DEBUG(dbgs() <<"SLP: Xor " << Cnt <<"of " << VectorizedValue
20676 <<". (HorRdx)\n");
20677if (Cnt % 2 == 0)
20678returnConstant::getNullValue(VectorizedValue->getType());
20679return VectorizedValue;
20680 }
20681case RecurKind::FAdd: {
20682// res = fmul v, n
20683Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20684LLVM_DEBUG(dbgs() <<"SLP: FAdd (to-fmul) " << Cnt <<"of "
20685 << VectorizedValue <<". (HorRdx)\n");
20686return Builder.CreateFMul(VectorizedValue, Scale);
20687 }
20688case RecurKind::And:
20689case RecurKind::Or:
20690case RecurKind::SMax:
20691case RecurKind::SMin:
20692case RecurKind::UMax:
20693case RecurKind::UMin:
20694case RecurKind::FMax:
20695case RecurKind::FMin:
20696case RecurKind::FMaximum:
20697case RecurKind::FMinimum:
20698// res = vv
20699return VectorizedValue;
20700case RecurKind::Mul:
20701case RecurKind::FMul:
20702case RecurKind::FMulAdd:
20703case RecurKind::IAnyOf:
20704case RecurKind::FAnyOf:
20705case RecurKind::IFindLastIV:
20706case RecurKind::FFindLastIV:
20707case RecurKind::None:
20708llvm_unreachable("Unexpected reduction kind for repeated scalar.");
20709 }
20710returnnullptr;
20711 }
20712
20713 /// Emits actual operation for the scalar identity values, found during
20714 /// horizontal reduction analysis.
20715Value *
20716 emitReusedOps(Value *VectorizedValue,IRBuilderBase &Builder,BoUpSLP &R,
20717constSmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
20718constDenseMap<Value *, Value *> &TrackedToOrig) {
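    // Rough example (assumption): for reduced scalars [a, b, b, c] the
    // vectorized root holds <a, b, c>, and an add reduction is fixed up by
    // multiplying element-wise with the repeat counts <1, 2, 1> before the
    // final horizontal add.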
20719assert(IsSupportedHorRdxIdentityOp &&
20720"The optimization of matched scalar identity horizontal reductions "
20721"must be supported.");
20722ArrayRef<Value *> VL =R.getRootNodeScalars();
20723auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20724if (VTy->getElementType() != VL.front()->getType()) {
20725 VectorizedValue = Builder.CreateIntCast(
20726 VectorizedValue,
20727getWidenedType(VL.front()->getType(), VTy->getNumElements()),
20728R.isSignedMinBitwidthRootNode());
20729 }
20730switch (RdxKind) {
20731case RecurKind::Add: {
20732// root = mul prev_root, <1, 1, n, 1>
20733SmallVector<Constant *> Vals;
20734for (Value *V : VL) {
20735unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20736 Vals.push_back(ConstantInt::get(V->getType(), Cnt,/*IsSigned=*/false));
20737 }
20738auto *Scale =ConstantVector::get(Vals);
20739LLVM_DEBUG(dbgs() <<"SLP: Add (to-mul) " << Scale <<"of "
20740 << VectorizedValue <<". (HorRdx)\n");
20741return Builder.CreateMul(VectorizedValue, Scale);
20742 }
20743case RecurKind::And:
20744case RecurKind::Or:
20745// No need for multiple or/and(s).
20746LLVM_DEBUG(dbgs() <<"SLP: And/or of same " << VectorizedValue
20747 <<". (HorRdx)\n");
20748return VectorizedValue;
20749case RecurKind::SMax:
20750case RecurKind::SMin:
20751case RecurKind::UMax:
20752case RecurKind::UMin:
20753case RecurKind::FMax:
20754case RecurKind::FMin:
20755case RecurKind::FMaximum:
20756case RecurKind::FMinimum:
20757// No need for multiple min/max(s) of the same value.
20758LLVM_DEBUG(dbgs() <<"SLP: Max/min of same " << VectorizedValue
20759 <<". (HorRdx)\n");
20760return VectorizedValue;
20761case RecurKind::Xor: {
20762// Replace values with even number of repeats with 0, since
20763// x xor x = 0.
20764// root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
20765// 7>, if the 4th and 6th elements have an even number of repeats.
20766SmallVector<int>Mask(
20767 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
20768PoisonMaskElem);
20769 std::iota(Mask.begin(),Mask.end(), 0);
20770bool NeedShuffle =false;
20771for (unsignedI = 0, VF = VL.size();I < VF; ++I) {
20772Value *V = VL[I];
20773unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20774if (Cnt % 2 == 0) {
20775Mask[I] = VF;
20776 NeedShuffle =true;
20777 }
20778 }
20779      LLVM_DEBUG(dbgs() << "SLP: Xor <";
20780                 for (int I : Mask)
20781                   dbgs() << I << " ";
20782                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
20783if (NeedShuffle)
20784 VectorizedValue = Builder.CreateShuffleVector(
20785 VectorizedValue,
20786 ConstantVector::getNullValue(VectorizedValue->getType()),Mask);
20787return VectorizedValue;
20788 }
20789case RecurKind::FAdd: {
20790// root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
20791SmallVector<Constant *> Vals;
20792for (Value *V : VL) {
20793unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20794 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20795 }
20796auto *Scale =ConstantVector::get(Vals);
20797return Builder.CreateFMul(VectorizedValue, Scale);
20798 }
20799case RecurKind::Mul:
20800case RecurKind::FMul:
20801case RecurKind::FMulAdd:
20802case RecurKind::IAnyOf:
20803case RecurKind::FAnyOf:
20804case RecurKind::IFindLastIV:
20805case RecurKind::FFindLastIV:
20806case RecurKind::None:
20807llvm_unreachable("Unexpected reduction kind for reused scalars.");
20808 }
20809returnnullptr;
20810 }
20811};
20812}// end anonymous namespace
20813
20814/// Gets recurrence kind from the specified value.
20815staticRecurKindgetRdxKind(Value *V) {
20816return HorizontalReduction::getRdxKind(V);
20817}
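/// Computes the total number of scalar elements produced by an
/// insertelement/insertvalue chain rooted at \p InsertInst, e.g. 4 for the
/// aggregate {<2 x float>, <2 x float>}. Returns std::nullopt for
/// non-homogeneous or otherwise unsupported aggregates.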
20818static std::optional<unsigned>getAggregateSize(Instruction *InsertInst) {
20819if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20820return cast<FixedVectorType>(IE->getType())->getNumElements();
20821
20822unsigned AggregateSize = 1;
20823auto *IV = cast<InsertValueInst>(InsertInst);
20824Type *CurrentType =IV->getType();
20825do {
20826if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20827for (auto *Elt : ST->elements())
20828if (Elt != ST->getElementType(0))// check homogeneity
20829return std::nullopt;
20830 AggregateSize *= ST->getNumElements();
20831 CurrentType = ST->getElementType(0);
20832    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20833      AggregateSize *= AT->getNumElements();
20834      CurrentType = AT->getElementType();
20835    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20836      AggregateSize *= VT->getNumElements();
20837      return AggregateSize;
20838    } else if (CurrentType->isSingleValueType()) {
20839      return AggregateSize;
20840    } else {
20841      return std::nullopt;
20842    }
20843  } while (true);
20844}
20845
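/// Walks an insertelement/insertvalue chain starting at \p LastInsertInst
/// towards its aggregate operand, recording each inserted scalar in
/// \p BuildVectorOpds and the corresponding insert instruction in
/// \p InsertElts at the decoded element index; nested inserts are followed
/// recursively.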
20846staticvoidfindBuildAggregate_rec(Instruction *LastInsertInst,
20847TargetTransformInfo *TTI,
20848SmallVectorImpl<Value *> &BuildVectorOpds,
20849SmallVectorImpl<Value *> &InsertElts,
20850unsigned OperandOffset,constBoUpSLP &R) {
20851do {
20852Value *InsertedOperand = LastInsertInst->getOperand(1);
20853 std::optional<unsigned> OperandIndex =
20854getElementIndex(LastInsertInst, OperandOffset);
20855if (!OperandIndex || R.isDeleted(LastInsertInst))
20856return;
20857if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20858findBuildAggregate_rec(cast<Instruction>(InsertedOperand),TTI,
20859 BuildVectorOpds, InsertElts, *OperandIndex, R);
20860
20861 }else {
20862 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20863 InsertElts[*OperandIndex] = LastInsertInst;
20864 }
20865 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20866 }while (LastInsertInst !=nullptr &&
20867 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20868 LastInsertInst->hasOneUse());
20869}
20870
20871/// Recognize construction of vectors like
20872/// %ra = insertelement <4 x float> poison, float %s0, i32 0
20873/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
20874/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
20875/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
20876/// starting from the last insertelement or insertvalue instruction.
20877///
20878/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
20879/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
20880/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
20881///
20882/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
20883///
20884/// \return true if it matches.
20885staticboolfindBuildAggregate(Instruction *LastInsertInst,
20886TargetTransformInfo *TTI,
20887SmallVectorImpl<Value *> &BuildVectorOpds,
20888SmallVectorImpl<Value *> &InsertElts,
20889constBoUpSLP &R) {
20890
20891assert((isa<InsertElementInst>(LastInsertInst) ||
20892 isa<InsertValueInst>(LastInsertInst)) &&
20893"Expected insertelement or insertvalue instruction!");
20894
20895assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
20896"Expected empty result vectors!");
20897
20898 std::optional<unsigned> AggregateSize =getAggregateSize(LastInsertInst);
20899if (!AggregateSize)
20900returnfalse;
20901 BuildVectorOpds.resize(*AggregateSize);
20902 InsertElts.resize(*AggregateSize);
20903
20904findBuildAggregate_rec(LastInsertInst,TTI, BuildVectorOpds, InsertElts, 0,
20905 R);
20906llvm::erase(BuildVectorOpds,nullptr);
20907llvm::erase(InsertElts,nullptr);
20908if (BuildVectorOpds.size() >= 2)
20909returntrue;
20910
20911returnfalse;
20912}
20913
20914/// Try and get a reduction instruction from a phi node.
20915///
20916/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
20917/// if they come from either \p ParentBB or a containing loop latch.
20918///
20919/// \returns A candidate reduction value if possible, or \code nullptr \endcode
20920/// if not possible.
20921staticInstruction *getReductionInstr(constDominatorTree *DT,PHINode *P,
20922BasicBlock *ParentBB,LoopInfo *LI) {
20923// There are situations where the reduction value is not dominated by the
20924// reduction phi. Vectorizing such cases has been reported to cause
20925// miscompiles. See PR25787.
20926auto DominatedReduxValue = [&](Value *R) {
20927return isa<Instruction>(R) &&
20928 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20929 };
20930
20931Instruction *Rdx =nullptr;
20932
20933// Return the incoming value if it comes from the same BB as the phi node.
20934if (P->getIncomingBlock(0) == ParentBB) {
20935 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20936 }elseif (P->getIncomingBlock(1) == ParentBB) {
20937 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20938 }
20939
20940if (Rdx && DominatedReduxValue(Rdx))
20941return Rdx;
20942
20943// Otherwise, check whether we have a loop latch to look at.
20944Loop *BBL = LI->getLoopFor(ParentBB);
20945if (!BBL)
20946returnnullptr;
20947BasicBlock *BBLatch = BBL->getLoopLatch();
20948if (!BBLatch)
20949returnnullptr;
20950
20951// There is a loop latch, return the incoming value if it comes from
20952// that. This reduction pattern occasionally turns up.
20953if (P->getIncomingBlock(0) == BBLatch) {
20954 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20955 }elseif (P->getIncomingBlock(1) == BBLatch) {
20956 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20957 }
20958
20959if (Rdx && DominatedReduxValue(Rdx))
20960return Rdx;
20961
20962returnnullptr;
20963}
20964
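/// Matches a two-operand reduction operation: either any binary operator or
/// one of the maxnum/minnum/maximum/minimum/smax/smin/umax/umin intrinsics,
/// extracting the two operands into \p V0 and \p V1.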
20965staticboolmatchRdxBop(Instruction *I,Value *&V0,Value *&V1) {
20966if (match(I,m_BinOp(m_Value(V0),m_Value(V1))))
20967returntrue;
20968if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0),m_Value(V1))))
20969returntrue;
20970if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0),m_Value(V1))))
20971returntrue;
20972if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0),m_Value(V1))))
20973returntrue;
20974if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0),m_Value(V1))))
20975returntrue;
20976if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0),m_Value(V1))))
20977returntrue;
20978if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0),m_Value(V1))))
20979returntrue;
20980if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0),m_Value(V1))))
20981returntrue;
20982if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0),m_Value(V1))))
20983returntrue;
20984returnfalse;
20985}
20986
20987/// We could have an initial reduction that is not an add.
20988/// r *= v1 + v2 + v3 + v4
20989/// In such a case start looking for a tree rooted in the first '+'.
20990/// \Returns the new root if found, which may be nullptr if not an instruction.
20991staticInstruction *tryGetSecondaryReductionRoot(PHINode *Phi,
20992Instruction *Root) {
20993assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
20994 isa<IntrinsicInst>(Root)) &&
20995"Expected binop, select, or intrinsic for reduction matching");
20996Value *LHS =
20997 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
20998Value *RHS =
20999 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
21000if (LHS == Phi)
21001return dyn_cast<Instruction>(RHS);
21002if (RHS == Phi)
21003return dyn_cast<Instruction>(LHS);
21004returnnullptr;
21005}
21006
21007/// \p Returns the first operand of \p I that does not match \p Phi. If
21008/// operand is not an instruction it returns nullptr.
21009staticInstruction *getNonPhiOperand(Instruction *I,PHINode *Phi) {
21010Value *Op0 =nullptr;
21011Value *Op1 =nullptr;
21012if (!matchRdxBop(I, Op0, Op1))
21013returnnullptr;
21014return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
21015}
21016
21017/// \Returns true if \p I is a candidate instruction for reduction vectorization.
21018staticboolisReductionCandidate(Instruction *I) {
21019boolIsSelect =match(I,m_Select(m_Value(),m_Value(),m_Value()));
21020Value *B0 =nullptr, *B1 =nullptr;
21021bool IsBinop =matchRdxBop(I, B0, B1);
21022return IsBinop ||IsSelect;
21023}
21024
21025bool SLPVectorizerPass::vectorizeHorReduction(
21026PHINode *P,Instruction *Root,BasicBlock *BB,BoUpSLP &R,
21027SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
21028if (!ShouldVectorizeHor)
21029returnfalse;
21030bool TryOperandsAsNewSeeds =P && isa<BinaryOperator>(Root);
21031
21032if (Root->getParent() != BB || isa<PHINode>(Root))
21033returnfalse;
21034
21035// If we can find a secondary reduction root, use that instead.
21036auto SelectRoot = [&]() {
21037if (TryOperandsAsNewSeeds &&isReductionCandidate(Root) &&
21038 HorizontalReduction::getRdxKind(Root) !=RecurKind::None)
21039if (Instruction *NewRoot =tryGetSecondaryReductionRoot(P, Root))
21040return NewRoot;
21041return Root;
21042 };
21043
21044// Start the analysis from the Root instruction. If a horizontal reduction
21045// is found, try to vectorize it. If it is not a horizontal reduction, or
21046// vectorization is not possible or not effective, and the currently
21047// analyzed instruction is a binary operation, try to vectorize the
21048// operands using pre-order DFS traversal order. If the operands were not
21049// vectorized, repeat the same procedure considering each operand as a
21050// possible root of a horizontal reduction.
21051// Interrupt the process if the Root instruction itself was vectorized or
21052// all sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
21053// If a horizontal reduction was not matched or vectorized, we collect
21054// instructions for possible later attempts at vectorization.
21055 std::queue<std::pair<Instruction *, unsigned>>Stack;
21056Stack.emplace(SelectRoot(), 0);
21057SmallPtrSet<Value *, 8> VisitedInstrs;
21058bool Res =false;
21059auto &&TryToReduce = [this, &R](Instruction *Inst) ->Value * {
21060if (R.isAnalyzedReductionRoot(Inst))
21061returnnullptr;
21062if (!isReductionCandidate(Inst))
21063returnnullptr;
21064HorizontalReduction HorRdx;
21065if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
21066returnnullptr;
21067return HorRdx.tryToReduce(R, *DL,TTI, *TLI, AC);
21068 };
21069auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
21070if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21071 FutureSeed =getNonPhiOperand(Root,P);
21072if (!FutureSeed)
21073returnfalse;
21074 }
21075// Do not collect CmpInst or InsertElementInst/InsertValueInst as their
21076// analysis is done separately.
21077if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21078 PostponedInsts.push_back(FutureSeed);
21079returntrue;
21080 };
21081
21082while (!Stack.empty()) {
21083Instruction *Inst;
21084unsigned Level;
21085 std::tie(Inst, Level) =Stack.front();
21086Stack.pop();
21087// Do not try to analyze an instruction that has already been vectorized.
21088// This may happen when we vectorize instruction operands on a previous
21089// iteration, while the stack was populated before that happened.
21090if (R.isDeleted(Inst))
21091continue;
21092if (Value *VectorizedV = TryToReduce(Inst)) {
21093 Res =true;
21094if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
21095// Try to find another reduction.
21096Stack.emplace(I, Level);
21097continue;
21098 }
21099if (R.isDeleted(Inst))
21100continue;
21101 }else {
21102// We could not vectorize `Inst` so try to use it as a future seed.
21103if (!TryAppendToPostponedInsts(Inst)) {
21104assert(Stack.empty() &&"Expected empty stack");
21105break;
21106 }
21107 }
21108
21109// Try to vectorize operands.
21110// Continue analysis for the instruction from the same basic block only to
21111// save compile time.
21112if (++Level <RecursionMaxDepth)
21113for (auto *Op : Inst->operand_values())
21114if (VisitedInstrs.insert(Op).second)
21115if (auto *I = dyn_cast<Instruction>(Op))
21116// Do not try to vectorize CmpInst operands, this is done
21117// separately.
21118if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
21119 !R.isDeleted(I) &&I->getParent() == BB)
21120Stack.emplace(I, Level);
21121 }
21122return Res;
21123}
21124
21125bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P,Instruction *Root,
21126BasicBlock *BB,BoUpSLP &R) {
21127SmallVector<WeakTrackingVH> PostponedInsts;
21128bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
21129 Res |= tryToVectorize(PostponedInsts, R);
21130return Res;
21131}
21132
21133bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
21134BoUpSLP &R) {
21135bool Res =false;
21136for (Value *V : Insts)
21137if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
21138 Res |= tryToVectorize(Inst, R);
21139return Res;
21140}
21141
21142bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
21143BasicBlock *BB,BoUpSLP &R,
21144bool MaxVFOnly) {
21145if (!R.canMapToVector(IVI->getType()))
21146returnfalse;
21147
21148SmallVector<Value *, 16> BuildVectorOpds;
21149SmallVector<Value *, 16> BuildVectorInsts;
21150if (!findBuildAggregate(IVI,TTI, BuildVectorOpds, BuildVectorInsts, R))
21151returnfalse;
21152
21153if (MaxVFOnly && BuildVectorOpds.size() == 2) {
21154R.getORE()->emit([&]() {
21155returnOptimizationRemarkMissed(SV_NAME,"NotPossible", IVI)
21156 <<"Cannot SLP vectorize list: only 2 elements of buildvalue, "
21157"trying reduction first.";
21158 });
21159returnfalse;
21160 }
21161LLVM_DEBUG(dbgs() <<"SLP: array mappable to vector: " << *IVI <<"\n");
21162// An aggregate value is unlikely to be processed in a vector register.
21163return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21164}
21165
21166bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
21167BasicBlock *BB,BoUpSLP &R,
21168bool MaxVFOnly) {
21169SmallVector<Value *, 16> BuildVectorInsts;
21170SmallVector<Value *, 16> BuildVectorOpds;
21171SmallVector<int>Mask;
21172if (!findBuildAggregate(IEI,TTI, BuildVectorOpds, BuildVectorInsts, R) ||
21173 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21174isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
21175returnfalse;
21176
21177if (MaxVFOnly && BuildVectorInsts.size() == 2) {
21178R.getORE()->emit([&]() {
21179returnOptimizationRemarkMissed(SV_NAME,"NotPossible", IEI)
21180 <<"Cannot SLP vectorize list: only 2 elements of buildvector, "
21181"trying reduction first.";
21182 });
21183returnfalse;
21184 }
21185LLVM_DEBUG(dbgs() <<"SLP: array mappable to vector: " << *IEI <<"\n");
21186return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21187}
21188
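/// Generic helper that sorts \p Incoming with \p Comparator, gathers runs of
/// values accepted by \p AreCompatible, and hands each run to
/// \p TryToVectorizeHelper; leftovers from unsuccessful runs are pooled and
/// retried in a final, type-only pass.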
21189template <typename T>
21190staticbooltryToVectorizeSequence(
21191SmallVectorImpl<T *> &Incoming,function_ref<bool(T *,T *)> Comparator,
21192function_ref<bool(T *,T *)> AreCompatible,
21193function_ref<bool(ArrayRef<T *>,bool)> TryToVectorizeHelper,
21194bool MaxVFOnly,BoUpSLP &R) {
21195bool Changed =false;
21196// Sort by type, parent, operands.
21197stable_sort(Incoming, Comparator);
21198
21200// Try to vectorize elements based on their type.
21200SmallVector<T *> Candidates;
21201SmallVector<T *> VL;
21202for (auto *IncIt =Incoming.begin(), *E =Incoming.end(); IncIt != E;
21203 VL.clear()) {
21204// Look for the next elements with the same type, parent and operand
21205// kinds.
21206auto *I = dyn_cast<Instruction>(*IncIt);
21207if (!I || R.isDeleted(I)) {
21208 ++IncIt;
21209continue;
21210 }
21211auto *SameTypeIt = IncIt;
21212while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21213 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21214 AreCompatible(*SameTypeIt, *IncIt))) {
21215auto *I = dyn_cast<Instruction>(*SameTypeIt);
21216 ++SameTypeIt;
21217if (I && !R.isDeleted(I))
21218 VL.push_back(cast<T>(I));
21219 }
21220
21221// Try to vectorize them.
21222unsigned NumElts = VL.size();
21223LLVM_DEBUG(dbgs() <<"SLP: Trying to vectorize starting at nodes ("
21224 << NumElts <<")\n");
21225// The vectorization is a 3-stage attempt:
21226// 1. Try to vectorize instructions with the same/alternate opcodes at the
21227// maximal register size first.
21228// 2. Try to vectorize the remaining instructions with the same type, if
21229// possible. This may give better vectorization results than trying to
21230// vectorize only instructions with the same/alternate opcodes.
21231// 3. Finally, try to vectorize all instructions with the same/alternate
21232// ops only; this may result in some extra final
21233// vectorization.
21234if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
21235// Success; start over because instructions might have been changed.
21236 Changed =true;
21237 VL.swap(Candidates);
21238 Candidates.clear();
21239for (T *V : VL) {
21240if (auto *I = dyn_cast<Instruction>(V);I && !R.isDeleted(I))
21241 Candidates.push_back(V);
21242 }
21243 }else {
21244 /// \Returns the minimum number of elements that we will attempt to
21245 /// vectorize.
21246auto GetMinNumElements = [&R](Value *V) {
21247unsigned EltSize = R.getVectorElementSize(V);
21248return std::max(2U, R.getMaxVecRegSize() / EltSize);
21249 };
21250if (NumElts < GetMinNumElements(*IncIt) &&
21251 (Candidates.empty() ||
21252 Candidates.front()->getType() == (*IncIt)->getType())) {
21253for (T *V : VL) {
21254if (auto *I = dyn_cast<Instruction>(V);I && !R.isDeleted(I))
21255 Candidates.push_back(V);
21256 }
21257 }
21258 }
21259// Final attempt to vectorize instructions with the same types.
21260if (Candidates.size() > 1 &&
21261 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21262if (TryToVectorizeHelper(Candidates,/*MaxVFOnly=*/false)) {
21263// Success; start over because instructions might have been changed.
21264 Changed =true;
21265 }elseif (MaxVFOnly) {
21266// Try to vectorize using small vectors.
21267SmallVector<T *> VL;
21268for (auto *It = Candidates.begin(), *End = Candidates.end(); It !=End;
21269 VL.clear()) {
21270auto *I = dyn_cast<Instruction>(*It);
21271if (!I || R.isDeleted(I)) {
21272 ++It;
21273continue;
21274 }
21275auto *SameTypeIt = It;
21276while (SameTypeIt !=End &&
21277 (!isa<Instruction>(*SameTypeIt) ||
21278 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21279 AreCompatible(*SameTypeIt, *It))) {
21280auto *I = dyn_cast<Instruction>(*SameTypeIt);
21281 ++SameTypeIt;
21282if (I && !R.isDeleted(I))
21283 VL.push_back(cast<T>(I));
21284 }
21285unsigned NumElts = VL.size();
21286if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21287/*MaxVFOnly=*/false))
21288 Changed =true;
21289 It = SameTypeIt;
21290 }
21291 }
21292 Candidates.clear();
21293 }
21294
21295// Start over at the next instruction of a different type (or the end).
21296 IncIt = SameTypeIt;
21297 }
21298return Changed;
21299}
21300
21301/// Compare two cmp instructions. If IsCompatibility is true, function returns
21302/// true if 2 cmps have same/swapped predicates and most compatible corresponding
21303/// operands. If IsCompatibility is false, function implements strict weak
21304/// ordering relation between two cmp instructions, returning true if the first
21305/// instruction is "less" than the second, i.e. its predicate is less than the
21306/// predicate of the second or the operands IDs are less than the operands IDs
21307/// of the second cmp instruction.
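/// For example, in the ordering mode two compares are ranked first by operand
/// type ID and bit width, then by their canonical (swap-normalized) predicate,
/// and finally operand by operand.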
21308template <bool IsCompatibility>
21309staticboolcompareCmp(Value *V,Value *V2,TargetLibraryInfo &TLI,
21310constDominatorTree &DT) {
21311assert(isValidElementType(V->getType()) &&
21312isValidElementType(V2->getType()) &&
21313"Expected valid element types only.");
21314if (V == V2)
21315return IsCompatibility;
21316auto *CI1 = cast<CmpInst>(V);
21317auto *CI2 = cast<CmpInst>(V2);
21318if (CI1->getOperand(0)->getType()->getTypeID() <
21319 CI2->getOperand(0)->getType()->getTypeID())
21320return !IsCompatibility;
21321if (CI1->getOperand(0)->getType()->getTypeID() >
21322 CI2->getOperand(0)->getType()->getTypeID())
21323returnfalse;
21324if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21325 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21326return !IsCompatibility;
21327if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21328 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21329returnfalse;
21330CmpInst::Predicate Pred1 = CI1->getPredicate();
21331CmpInst::Predicate Pred2 = CI2->getPredicate();
21332CmpInst::Predicate SwapPred1 =CmpInst::getSwappedPredicate(Pred1);
21333CmpInst::Predicate SwapPred2 =CmpInst::getSwappedPredicate(Pred2);
21334CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
21335CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
21336if (BasePred1 < BasePred2)
21337return !IsCompatibility;
21338if (BasePred1 > BasePred2)
21339returnfalse;
21340// Compare operands.
21341bool CI1Preds = Pred1 == BasePred1;
21342bool CI2Preds = Pred2 == BasePred1;
21343for (intI = 0, E = CI1->getNumOperands();I < E; ++I) {
21344auto *Op1 = CI1->getOperand(CI1Preds ?I : E -I - 1);
21345auto *Op2 = CI2->getOperand(CI2Preds ?I : E -I - 1);
21346if (Op1 == Op2)
21347continue;
21348if (Op1->getValueID() < Op2->getValueID())
21349return !IsCompatibility;
21350if (Op1->getValueID() > Op2->getValueID())
21351returnfalse;
21352if (auto *I1 = dyn_cast<Instruction>(Op1))
21353if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21354if (IsCompatibility) {
21355if (I1->getParent() != I2->getParent())
21356returnfalse;
21357 }else {
21358// Try to compare nodes with same parent.
21359DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
21360DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
21361if (!NodeI1)
21362return NodeI2 !=nullptr;
21363if (!NodeI2)
21364returnfalse;
21365assert((NodeI1 == NodeI2) ==
21366 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21367"Different nodes should have different DFS numbers");
21368if (NodeI1 != NodeI2)
21369return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21370 }
21371 InstructionsState S =getSameOpcode({I1, I2}, TLI);
21372if (S && (IsCompatibility || !S.isAltShuffle()))
21373continue;
21374if (IsCompatibility)
21375returnfalse;
21376if (I1->getOpcode() != I2->getOpcode())
21377return I1->getOpcode() < I2->getOpcode();
21378 }
21379 }
21380return IsCompatibility;
21381}
21382
21383template <typename ItT>
21384bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
21385BasicBlock *BB,BoUpSLP &R) {
21386bool Changed =false;
21387// Try to find reductions first.
21388for (CmpInst *I : CmpInsts) {
21389if (R.isDeleted(I))
21390continue;
21391for (Value *Op :I->operands())
21392if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21393 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21394if (R.isDeleted(I))
21395break;
21396 }
21397 }
21398// Try to vectorize operands as vector bundles.
21399for (CmpInst *I : CmpInsts) {
21400if (R.isDeleted(I))
21401continue;
21402 Changed |= tryToVectorize(I, R);
21403 }
21404// Try to vectorize list of compares.
21405// Sort by type, compare predicate, etc.
21406auto CompareSorter = [&](Value *V,Value *V2) {
21407if (V == V2)
21408returnfalse;
21409return compareCmp<false>(V, V2, *TLI, *DT);
21410 };
21411
21412auto AreCompatibleCompares = [&](Value *V1,Value *V2) {
21413if (V1 == V2)
21414returntrue;
21415return compareCmp<true>(V1, V2, *TLI, *DT);
21416 };
21417
21418SmallVector<Value *> Vals;
21419for (Instruction *V : CmpInsts)
21420if (!R.isDeleted(V) &&isValidElementType(getValueType(V)))
21421 Vals.push_back(V);
21422if (Vals.size() <= 1)
21423return Changed;
21424 Changed |= tryToVectorizeSequence<Value>(
21425 Vals, CompareSorter, AreCompatibleCompares,
21426 [this, &R](ArrayRef<Value *> Candidates,bool MaxVFOnly) {
21427// Exclude possible reductions from other blocks.
21428bool ArePossiblyReducedInOtherBlock =any_of(Candidates, [](Value *V) {
21429returnany_of(V->users(), [V](User *U) {
21430 auto *Select = dyn_cast<SelectInst>(U);
21431 return Select &&
21432 Select->getParent() != cast<Instruction>(V)->getParent();
21433 });
21434 });
21435if (ArePossiblyReducedInOtherBlock)
21436returnfalse;
21437return tryToVectorizeList(Candidates, R, MaxVFOnly);
21438 },
21439/*MaxVFOnly=*/true,R);
21440return Changed;
21441}
21442
21443bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21444BasicBlock *BB,BoUpSLP &R) {
21445assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21446"This function only accepts Insert instructions");
21447bool OpsChanged =false;
21448SmallVector<WeakTrackingVH> PostponedInsts;
21449for (auto *I :reverse(Instructions)) {
21450// pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
21451if (R.isDeleted(I) || isa<CmpInst>(I))
21452continue;
21453if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21454 OpsChanged |=
21455 vectorizeInsertValueInst(LastInsertValue, BB, R,/*MaxVFOnly=*/true);
21456 }elseif (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21457 OpsChanged |=
21458 vectorizeInsertElementInst(LastInsertElem, BB, R,/*MaxVFOnly=*/true);
21459 }
21460// pass2 - try to vectorize reductions only
21461if (R.isDeleted(I))
21462continue;
21463 OpsChanged |= vectorizeHorReduction(nullptr,I, BB, R, PostponedInsts);
21464if (R.isDeleted(I) || isa<CmpInst>(I))
21465continue;
21466// pass3 - try to match and vectorize a buildvector sequence.
21467if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21468 OpsChanged |=
21469 vectorizeInsertValueInst(LastInsertValue, BB, R,/*MaxVFOnly=*/false);
21470 }elseif (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21471 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21472/*MaxVFOnly=*/false);
21473 }
21474 }
21475// Now try to vectorize postponed instructions.
21476 OpsChanged |= tryToVectorize(PostponedInsts, R);
21477
21478Instructions.clear();
21479return OpsChanged;
21480}
21481
21482bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB,BoUpSLP &R) {
21483bool Changed =false;
21484SmallVector<Value *, 4>Incoming;
21485SmallPtrSet<Value *, 16> VisitedInstrs;
21486// Maps phi nodes to the non-phi nodes found in the use tree for each phi
21487// node. This makes it easier to identify the chains that can be
21488// vectorized in the best way.
21489DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
21490auto PHICompare = [this, &PHIToOpcodes](Value *V1,Value *V2) {
21491assert(isValidElementType(V1->getType()) &&
21492isValidElementType(V2->getType()) &&
21493"Expected vectorizable types only.");
21494// It is fine to compare type IDs here, since we expect only vectorizable
21495// types, like ints, floats and pointers; we don't care about other types.
21496if (V1->getType()->getTypeID() <V2->getType()->getTypeID())
21497returntrue;
21498if (V1->getType()->getTypeID() >V2->getType()->getTypeID())
21499returnfalse;
21500if (V1->getType()->getScalarSizeInBits() <
21501V2->getType()->getScalarSizeInBits())
21502returntrue;
21503if (V1->getType()->getScalarSizeInBits() >
21504V2->getType()->getScalarSizeInBits())
21505returnfalse;
21506ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21507ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21508if (Opcodes1.size() < Opcodes2.size())
21509returntrue;
21510if (Opcodes1.size() > Opcodes2.size())
21511returnfalse;
21512for (intI = 0, E = Opcodes1.size();I < E; ++I) {
21513 {
21514// Instructions come first.
21515auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21516auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21517if (I1 && I2) {
21518DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
21519DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
21520if (!NodeI1)
21521return NodeI2 !=nullptr;
21522if (!NodeI2)
21523returnfalse;
21524assert((NodeI1 == NodeI2) ==
21525 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21526"Different nodes should have different DFS numbers");
21527if (NodeI1 != NodeI2)
21528return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21529 InstructionsState S =getSameOpcode({I1, I2}, *TLI);
21530if (S && !S.isAltShuffle())
21531continue;
21532returnI1->getOpcode() < I2->getOpcode();
21533 }
21534if (I1)
21535returntrue;
21536if (I2)
21537returnfalse;
21538 }
21539 {
21540// Non-undef constants come next.
21541bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21542bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21543if (C1 && C2)
21544continue;
21545if (C1)
21546returntrue;
21547if (C2)
21548returnfalse;
21549 }
21550bool U1 = isa<UndefValue>(Opcodes1[I]);
21551bool U2 = isa<UndefValue>(Opcodes2[I]);
21552 {
21553// Non-constant non-instructions come next.
21554if (!U1 && !U2) {
21555auto ValID1 = Opcodes1[I]->getValueID();
21556auto ValID2 = Opcodes2[I]->getValueID();
21557if (ValID1 == ValID2)
21558continue;
21559if (ValID1 < ValID2)
21560returntrue;
21561if (ValID1 > ValID2)
21562returnfalse;
21563 }
21564if (!U1)
21565returntrue;
21566if (!U2)
21567returnfalse;
21568 }
21569// Undefs come last.
21570assert(U1 && U2 &&"The only thing left should be undef & undef.");
21571 }
21572returnfalse;
21573 };
21574auto AreCompatiblePHIs = [&PHIToOpcodes,this, &R](Value *V1,Value *V2) {
21575if (V1 == V2)
21576returntrue;
21577if (V1->getType() !=V2->getType())
21578returnfalse;
21579ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21580ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21581if (Opcodes1.size() != Opcodes2.size())
21582returnfalse;
21583for (intI = 0, E = Opcodes1.size();I < E; ++I) {
21584// Undefs are compatible with any other value.
21585if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21586continue;
21587if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21588if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21589if (R.isDeleted(I1) ||R.isDeleted(I2))
21590returnfalse;
21591if (I1->getParent() != I2->getParent())
21592returnfalse;
21593if (getSameOpcode({I1, I2}, *TLI))
21594continue;
21595returnfalse;
21596 }
21597if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21598continue;
21599if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21600returnfalse;
21601 }
21602returntrue;
21603 };
21604
21605bool HaveVectorizedPhiNodes =false;
21606do {
21607// Collect the incoming values from the PHIs.
21608Incoming.clear();
21609for (Instruction &I : *BB) {
21610auto *P = dyn_cast<PHINode>(&I);
21611if (!P ||P->getNumIncomingValues() >MaxPHINumOperands)
21612break;
21613
21614// No need to analyze deleted, vectorized and non-vectorizable
21615// instructions.
21616if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21617isValidElementType(P->getType()))
21618Incoming.push_back(P);
21619 }
21620
21621if (Incoming.size() <= 1)
21622break;
21623
21624// Find the corresponding non-phi nodes for better matching when trying to
21625// build the tree.
21626for (Value *V :Incoming) {
21627SmallVectorImpl<Value *> &Opcodes =
21628 PHIToOpcodes.try_emplace(V).first->getSecond();
21629if (!Opcodes.empty())
21630continue;
21631SmallVector<Value *, 4> Nodes(1, V);
21632SmallPtrSet<Value *, 4> Visited;
21633while (!Nodes.empty()) {
21634auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21635if (!Visited.insert(PHI).second)
21636continue;
21637for (Value *V :PHI->incoming_values()) {
21638if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21639 Nodes.push_back(PHI1);
21640continue;
21641 }
21642 Opcodes.emplace_back(V);
21643 }
21644 }
21645 }
21646
21647 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21648Incoming, PHICompare, AreCompatiblePHIs,
21649 [this, &R](ArrayRef<Value *> Candidates,bool MaxVFOnly) {
21650return tryToVectorizeList(Candidates, R, MaxVFOnly);
21651 },
21652/*MaxVFOnly=*/true,R);
21653 Changed |= HaveVectorizedPhiNodes;
21654if (HaveVectorizedPhiNodes &&any_of(PHIToOpcodes, [&](constauto &P) {
21655auto *PHI = dyn_cast<PHINode>(P.first);
21656return !PHI ||R.isDeleted(PHI);
21657 }))
21658 PHIToOpcodes.clear();
21659 VisitedInstrs.insert(Incoming.begin(),Incoming.end());
21660 }while (HaveVectorizedPhiNodes);
21661
21662 VisitedInstrs.clear();
21663
21664 InstSetVector PostProcessInserts;
21665SmallSetVector<CmpInst *, 8> PostProcessCmps;
21666// Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
21667// also vectorizes `PostProcessCmps`.
21668auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21669bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21670if (VectorizeCmps) {
21671 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21672 PostProcessCmps.clear();
21673 }
21674 PostProcessInserts.clear();
21675return Changed;
21676 };
21677// Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
21678auto IsInPostProcessInstrs = [&](Instruction *I) {
21679if (auto *Cmp = dyn_cast<CmpInst>(I))
21680return PostProcessCmps.contains(Cmp);
21681return isa<InsertElementInst, InsertValueInst>(I) &&
21682 PostProcessInserts.contains(I);
21683 };
21684// Returns true if `I` is an instruction without users, such as a terminator,
21685// a store, or a function call with an ignored return value. Unused-ness is
21686// determined from the instruction type, except for CallInst and InvokeInst.
21687auto HasNoUsers = [](Instruction *I) {
21688returnI->use_empty() &&
21689 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21690 };
21691for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
21692// Skip instructions with scalable types. The number of elements is unknown
21693// at compile time for scalable types.
21694if (isa<ScalableVectorType>(It->getType()))
21695continue;
21696
21697// Skip instructions marked for deletion.
21698if (R.isDeleted(&*It))
21699continue;
21700// We may go through BB multiple times so skip the ones we have already checked.
21701if (!VisitedInstrs.insert(&*It).second) {
21702if (HasNoUsers(&*It) &&
21703 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
21704// We would like to start over since some instructions are deleted
21705// and the iterator may become invalid.
21706 Changed =true;
21707 It = BB->begin();
21708 E = BB->end();
21709 }
21710continue;
21711 }
21712
21713if (isa<DbgInfoIntrinsic>(It))
21714continue;
21715
21716// Try to vectorize reductions that use PHINodes.
21717if (PHINode *P = dyn_cast<PHINode>(It)) {
21718// Check that the PHI is a reduction PHI.
21719if (P->getNumIncomingValues() == 2) {
21720// Try to match and vectorize a horizontal reduction.
21721Instruction *Root =getReductionInstr(DT,P, BB, LI);
21722if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21723 Changed =true;
21724 It = BB->begin();
21725 E = BB->end();
21726continue;
21727 }
21728 }
21729// Try to vectorize the incoming values of the PHI, to catch reductions
21730// that feed into PHIs.
21731for (unsignedI : seq<unsigned>(P->getNumIncomingValues())) {
21732// Skip if the incoming block is the current BB for now. Also, bypass
21733// unreachable IR for efficiency and to avoid crashing.
21734// TODO: Collect the skipped incoming values and try to vectorize them
21735// after processing BB.
21736if (BB ==P->getIncomingBlock(I) ||
21737 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
21738continue;
21739
21740// Postponed instructions should not be vectorized here, delay their
21741// vectorization.
21742if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21743 PI && !IsInPostProcessInstrs(PI)) {
21744bool Res =
21745 vectorizeRootInstruction(nullptr, PI,P->getIncomingBlock(I), R);
21746 Changed |= Res;
21747if (Res &&R.isDeleted(P)) {
21748 It = BB->begin();
21749 E = BB->end();
21750break;
21751 }
21752 }
21753 }
21754continue;
21755 }
21756
21757if (HasNoUsers(&*It)) {
21758bool OpsChanged =false;
21759auto *SI = dyn_cast<StoreInst>(It);
21760bool TryToVectorizeRoot =ShouldStartVectorizeHorAtStore || !SI;
21761if (SI) {
21762auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
21763// Try to vectorize chain in store, if this is the only store to the
21764// address in the block.
21765// TODO: This is just a temporary solution to save compile time. Need
21766// to investigate if we can safely turn on slp-vectorize-hor-store
21767// instead to allow lookup for reduction chains in all non-vectorized
21768// stores (need to check side effects and compile time).
21769 TryToVectorizeRoot |= (I == Stores.end() ||I->second.size() == 1) &&
21770SI->getValueOperand()->hasOneUse();
21771 }
21772if (TryToVectorizeRoot) {
21773for (auto *V : It->operand_values()) {
21774// Postponed instructions should not be vectorized here, delay their
21775// vectorization.
21776if (auto *VI = dyn_cast<Instruction>(V);
21777VI && !IsInPostProcessInstrs(VI))
21778// Try to match and vectorize a horizontal reduction.
21779 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21780 }
21781 }
21782// Start vectorization of post-process list of instructions from the
21783// top-tree instructions to try to vectorize as many instructions as
21784// possible.
21785 OpsChanged |=
21786 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
21787if (OpsChanged) {
21788// We would like to start over since some instructions are deleted
21789// and the iterator may become invalid.
21790 Changed =true;
21791 It = BB->begin();
21792 E = BB->end();
21793continue;
21794 }
21795 }
21796
21797if (isa<InsertElementInst, InsertValueInst>(It))
21798 PostProcessInserts.insert(&*It);
21799elseif (isa<CmpInst>(It))
21800 PostProcessCmps.insert(cast<CmpInst>(&*It));
21801 }
21802
21803return Changed;
21804}
21805
21806bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB,BoUpSLP &R) {
21807auto Changed =false;
21808for (auto &Entry : GEPs) {
21809// If the getelementptr list has fewer than two elements, there's nothing
21810// to do.
21811if (Entry.second.size() < 2)
21812continue;
21813
21814LLVM_DEBUG(dbgs() <<"SLP: Analyzing a getelementptr list of length "
21815 <<Entry.second.size() <<".\n");
21816
21817// Process the GEP list in chunks suitable for the target's supported
21818// vector size. If a vector register can't hold 1 element, we are done. We
21819// are trying to vectorize the index computations, so the maximum number of
21820// elements is based on the size of the index expression, rather than the
21821// size of the GEP itself (the target's pointer size).
21822auto *It =find_if(Entry.second, [&](GetElementPtrInst *GEP) {
21823 return !R.isDeleted(GEP);
21824 });
21825if (It ==Entry.second.end())
21826continue;
21827unsigned MaxVecRegSize =R.getMaxVecRegSize();
21828unsigned EltSize =R.getVectorElementSize(*(*It)->idx_begin());
21829if (MaxVecRegSize < EltSize)
21830continue;
21831
21832unsigned MaxElts = MaxVecRegSize / EltSize;
21833for (unsigned BI = 0, BE =Entry.second.size(); BI < BE; BI += MaxElts) {
21834autoLen = std::min<unsigned>(BE - BI, MaxElts);
21835ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
21836
21837// Initialize a set of candidate getelementptrs. Note that we use a
21838// SetVector here to preserve program order. If the index computations
21839// are vectorizable and begin with loads, we want to minimize the chance
21840// of having to reorder them later.
21841SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
21842
21843// Some of the candidates may have already been vectorized after we
21844// initially collected them, or their index has been optimized to a constant value.
21845// If so, they are marked as deleted, so remove them from the set of
21846// candidates.
21847 Candidates.remove_if([&R](Value *I) {
21848returnR.isDeleted(cast<Instruction>(I)) ||
21849 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21850 });
21851
21852// Remove from the set of candidates all pairs of getelementptrs with
21853// constant differences. Such getelementptrs are likely not good
21854// candidates for vectorization in a bottom-up phase since one can be
21855// computed from the other. We also ensure all candidate getelementptr
21856// indices are unique.
21857for (intI = 0, E = GEPList.size();I < E && Candidates.size() > 1; ++I) {
21858auto *GEPI = GEPList[I];
21859if (!Candidates.count(GEPI))
21860continue;
21861constSCEV *SCEVI = SE->getSCEV(GEPList[I]);
21862for (int J =I + 1; J < E && Candidates.size() > 1; ++J) {
21863auto *GEPJ = GEPList[J];
21864constSCEV *SCEVJ = SE->getSCEV(GEPList[J]);
21865if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21866 Candidates.remove(GEPI);
21867 Candidates.remove(GEPJ);
21868 }elseif (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21869 Candidates.remove(GEPJ);
21870 }
21871 }
21872 }
21873
21874// We break out of the above computation as soon as we know there are
21875// fewer than two candidates remaining.
21876if (Candidates.size() < 2)
21877continue;
21878
21879// Add the single, non-constant index of each candidate to the bundle. We
21880// ensured the indices met these constraints when we originally collected
21881// the getelementptrs.
21882SmallVector<Value *, 16> Bundle(Candidates.size());
21883auto BundleIndex = 0u;
21884for (auto *V : Candidates) {
21885auto *GEP = cast<GetElementPtrInst>(V);
21886auto *GEPIdx =GEP->idx_begin()->get();
21887assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21888 Bundle[BundleIndex++] = GEPIdx;
21889 }
21890
21891// Try and vectorize the indices. We are currently only interested in
21892// gather-like cases of the form:
21893//
21894// ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
21895//
21896// where the loads of "a", the loads of "b", and the subtractions can be
21897// performed in parallel. It's likely that detecting this pattern in a
21898// bottom-up phase will be simpler and less costly than building a
21899// full-blown top-down phase beginning at the consecutive loads.
21900 Changed |= tryToVectorizeList(Bundle, R);
21901 }
21902 }
21903return Changed;
21904}
21905
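At the source level, the gather-like shape named in the comment above corresponds to code like the following sketch. The array names g, a and b follow the comment and are otherwise arbitrary; whether the index computations are actually bundled on a given target is decided by the cost model.

int gatherLikeSum(const int *g, const int *a, const int *b) {
  // Each term indexes g with a[i] - b[i]; the loads of a, the loads of b and
  // the subtractions form the parallel index computations that the bundle
  // built above hands to tryToVectorizeList, while the loads from g remain
  // scalar gathers.
  return g[a[0] - b[0]] + g[a[1] - b[1]] +
         g[a[2] - b[2]] + g[a[3] - b[3]];
}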
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        return getSameOpcode({I1, I2}, *TLI).valid();
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stored to the same addresses several times, in which case we
    // need to follow the store order (reversed to meet the memory
    // dependencies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
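A small source-level sketch of the store groups this routine collects: four stores of the same element type to consecutive addresses whose value operands share an opcode and parent block pass both StoreSorter and AreCompatibleStores, so they land in one candidate set handed to vectorizeStores. Whether a vector store is actually emitted for them is up to the cost model and the target.

void storeChain(int *p, const int *q) {
  // Same value type, same pointer operand type, and all value operands are
  // adds in the same block, so the comparators above keep these four stores
  // together as one group of candidates.
  p[0] = q[0] + 1;
  p[1] = q[1] + 1;
  p[2] = q[2] + 1;
  p[3] = q[3] + 1;
}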
isConstant
static bool isConstant(const MachineInstr &MI)
Definition:AMDGPUInstructionSelector.cpp:2862
Select
AMDGPU Register Bank Select
Definition:AMDGPURegBankSelect.cpp:71
PHI
Rewrite undef for PHI
Definition:AMDGPURewriteUndefForPHI.cpp:100
Ignore
ReachingDefAnalysis InstSet InstSet & Ignore
Definition:ARMLowOverheadLoops.cpp:531
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition:ARMSLSHardening.cpp:73
Results
Function Alias Analysis Results
Definition:AliasAnalysis.cpp:731
AliasAnalysis.h
AssumptionCache.h
Attributes.h
This file contains the simple types necessary to represent the attributes associated with functions a...
getParent
static const Function * getParent(const Value *V)
Definition:BasicAliasAnalysis.cpp:863
true
basic Basic Alias true
Definition:BasicAliasAnalysis.cpp:1981
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
D
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Casting.h
CodeMetrics.h
CommandLine.h
Compiler.h
LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition:Compiler.h:622
ConstantFolding.h
Constants.h
This file contains the declarations for the subclasses of Constant, which represent the different fla...
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
DOTGraphTraits.h
getElementIndex
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition:DataLayout.cpp:920
DataLayout.h
Idx
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
Definition:DeadArgumentElimination.cpp:353
DebugCounter.h
This file provides an implementation of debug counters.
DEBUG_COUNTER
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition:DebugCounter.h:190
Debug.h
LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition:Debug.h:106
DemandedBits.h
DenseMap.h
This file defines the DenseMap class.
DenseSet.h
This file defines the DenseSet and SmallDenseSet classes.
DerivedTypes.h
Dominators.h
Name
std::string Name
Definition:ELFObjHandler.cpp:77
Index
uint32_t Index
Definition:ELFObjHandler.cpp:83
Size
uint64_t Size
Definition:ELFObjHandler.cpp:81
End
bool End
Definition:ELF_riscv.cpp:480
Blocks
DenseMap< Block *, BlockRelaxAux > Blocks
Definition:ELF_riscv.cpp:507
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
runImpl
static bool runImpl(Function &F, const TargetLowering &TLI)
Definition:ExpandLargeDivRem.cpp:79
GlobalsModRef.h
This is the interface for a simple mod/ref and alias analysis over globals.
GraphWriter.h
Cleanup
static const HTTPClientCleanup Cleanup
Definition:HTTPClient.cpp:42
GEP
Hexagon Common GEP
Definition:HexagonCommonGEP.cpp:170
_
#define _
Definition:HexagonMCCodeEmitter.cpp:46
IRBuilder.h
MI
IRTranslator LLVM IR MI
Definition:IRTranslator.cpp:112
BasicBlock.h
Constant.h
Function.h
Instruction.h
IntrinsicInst.h
Module.h
Module.h This file contains the declarations for the Module class.
Operator.h
Type.h
Use.h
This defines the Use class.
User.h
Value.h
IVDescriptors.h
Users
iv Induction Variable Users
Definition:IVUsers.cpp:48
InjectTLIMappings.h
InstrTypes.h
InstructionCost.h
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
Instructions.h
Intrinsics.h
KnownBits.h
LoopAccessAnalysis.h
LoopInfo.h
LoopUtils.h
isSplat
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
Definition:LowerMatrixIntrinsics.cpp:102
F
#define F(x, y, z)
Definition:MD5.cpp:55
I
#define I(x, y, z)
Definition:MD5.cpp:58
Operands
mir Rename Register Operands
Definition:MIRNamerPass.cpp:74
MathExtras.h
MemoryLocation.h
This file provides utility analysis objects describing memory locations.
Unknown
@ Unknown
Definition:NVPTXISelLowering.cpp:4791
II
uint64_t IntrinsicInst * II
Definition:NVVMIntrRange.cpp:51
Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
OptimizationRemarkEmitter.h
P
#define P(N)
verify
ppc ctr loops verify
Definition:PPCCTRLoopsVerify.cpp:72
IsSelect
static bool IsSelect(MachineInstr &MI)
Definition:PPCISelLowering.cpp:13186
if
if(PassOpts->AAPipeline)
Definition:PassBuilderBindings.cpp:64
Pass.h
PatternMatch.h
PriorityQueue.h
This file defines the PriorityQueue class.
Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition:RISCVRedundantCopyElimination.cpp:75
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
isLoadCombineCandidateImpl
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
Definition:SLPVectorizer.cpp:12010
RunSLPVectorization
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
getWidenedType
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
Definition:SLPVectorizer.cpp:263
isVectorLikeInstWithConstOps
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
Definition:SLPVectorizer.cpp:417
calculateRtStride
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
Definition:SLPVectorizer.cpp:4822
isRepeatedNonIdentityClusteredMask
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
Definition:SLPVectorizer.cpp:5864
MaxPHINumOperands
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
Definition:SLPVectorizer.cpp:222
MaxVectorRegSizeOption
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
MaxProfitableLoadStride
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
findBuildAggregate
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
Definition:SLPVectorizer.cpp:20885
needToScheduleSingleInstruction
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:7386
clusterSortPtrAccesses
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Definition:SLPVectorizer.cpp:5357
getNumElements
static unsigned getNumElements(Type *Ty)
Definition:SLPVectorizer.cpp:254
buildUseMask
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
Definition:SLPVectorizer.cpp:616
areCompatibleCmpOps
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
Definition:SLPVectorizer.cpp:873
createInsertVector
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
Definition:SLPVectorizer.cpp:4967
getNumElems
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
Definition:SLPVectorizer.cpp:442
getShuffleCost
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
Definition:SLPVectorizer.cpp:4943
findBuildAggregate_rec
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
Definition:SLPVectorizer.cpp:20846
isSimple
static bool isSimple(Instruction *I)
Definition:SLPVectorizer.cpp:1137
MinScheduleRegionSize
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
Definition:SLPVectorizer.cpp:219
MinProfitableStridedLoads
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
isFirstInsertElement
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
Definition:SLPVectorizer.cpp:12298
getAltInstrMask
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
Definition:SLPVectorizer.cpp:1211
LookAheadMaxDepth
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
MaxVFOption
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
reorderReuses
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
Definition:SLPVectorizer.cpp:4551
combineOrders
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
Definition:SLPVectorizer.cpp:5904
MaxMemDepDistance
static const unsigned MaxMemDepDistance
Definition:SLPVectorizer.cpp:215
ViewSLPTree
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
doesInTreeUserNeedToExtract
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
Definition:SLPVectorizer.cpp:1099
VectorizeNonPowerOf2
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
MinTreeSize
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
reorderOrder
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
Definition:SLPVectorizer.cpp:4565
getFullVectorNumberOfElements
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
Definition:SLPVectorizer.cpp:271
performExtractsShuffleAction
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
Definition:SLPVectorizer.cpp:12353
ShouldVectorizeHor
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
isConstant
static bool isConstant(Value *V)
Definition:SLPVectorizer.cpp:410
isSplat
static bool isSplat(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:493
SLPCostThreshold
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
getPartNumElems
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
Definition:SLPVectorizer.cpp:435
allConstant
static bool allConstant(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:485
UsesLimit
static constexpr int UsesLimit
Definition:SLPVectorizer.cpp:210
getElementIndex
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
Definition:SLPVectorizer.cpp:568
isReductionCandidate
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
Definition:SLPVectorizer.cpp:21018
checkTreeSizes
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
Definition:SLPVectorizer.cpp:18674
getShufflevectorNumGroups
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:339
isCmpSameOrSwapped
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
Definition:SLPVectorizer.cpp:887
SLPSkipEarlyProfitabilityCheck
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
generateKeySubkey
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
Definition:SLPVectorizer.cpp:7405
ShouldStartVectorizeHorAtStore
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
getVectorCallCosts
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Definition:SLPVectorizer.cpp:9027
transformScalarShuffleIndiciesToVector
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
Definition:SLPVectorizer.cpp:300
SLPReVec
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
isValidForAlternation
static bool isValidForAlternation(unsigned Opcode)
Definition:SLPVectorizer.cpp:861
buildIntrinsicArgTypes
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
Definition:SLPVectorizer.cpp:11075
getExtractIndex
static std::optional< unsigned > getExtractIndex(Instruction *E)
Definition:SLPVectorizer.cpp:794
RootLookAheadMaxDepth
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
AliasedCheckLimit
static const unsigned AliasedCheckLimit
Definition:SLPVectorizer.cpp:206
getValueType
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
Definition:SLPVectorizer.cpp:243
gatherPossiblyVectorizableLoads
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
Definition:SLPVectorizer.cpp:6777
shortBundleName
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
Definition:SLPVectorizer.cpp:449
dumpOrder
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
Definition:SLPVectorizer.cpp:6722
isValidElementType
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
Definition:SLPVectorizer.cpp:231
getReductionInstr
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
Definition:SLPVectorizer.cpp:20921
calculateShufflevectorMask
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:391
allSameType
static bool allSameType(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:1092
getLocation
static MemoryLocation getLocation(Instruction *I)
Definition:SLPVectorizer.cpp:1128
isCommutative
static bool isCommutative(Instruction *I)
Definition:SLPVectorizer.cpp:509
allSameBlock
static bool allSameBlock(ArrayRef< Value * > VL)
Definition:SLPVectorizer.cpp:461
getFloorFullVectorNumberOfElements
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
Definition:SLPVectorizer.cpp:286
areTwoInsertFromSameBuildVector
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
Definition:SLPVectorizer.cpp:5488
arePointersCompatible
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
Definition:SLPVectorizer.cpp:4778
getGEPCosts
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
Definition:SLPVectorizer.cpp:9509
isUndefVector
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
Definition:SLPVectorizer.cpp:637
tryToVectorizeSequence
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
Definition:SLPVectorizer.cpp:21190
getSameOpcode
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
Definition:SLPVectorizer.cpp:909
ScheduleRegionSizeBudget
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
tryGetSecondaryReductionRoot
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
Definition:SLPVectorizer.cpp:20991
getRdxKind
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
Definition:SLPVectorizer.cpp:20815
matchRdxBop
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
Definition:SLPVectorizer.cpp:20965
MinVectorRegSizeOption
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
isFixedVectorShuffle
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
Definition:SLPVectorizer.cpp:706
getAggregateSize
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
Definition:SLPVectorizer.cpp:20818
getInsertExtractIndex
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
Definition:SLPVectorizer.cpp:543
RecursionMaxDepth
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
computeCommonAlignment
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
Definition:SLPVectorizer.cpp:4797
addMask
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
Definition:SLPVectorizer.cpp:1150
fixupOrderingIndices
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
Definition:SLPVectorizer.cpp:1185
createExtractVector
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is multiple of the subvectors length.
Definition:SLPVectorizer.cpp:4998
getNonPhiOperand
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
Definition:SLPVectorizer.cpp:21009
compareCmp
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
Definition:SLPVectorizer.cpp:21309
isReverseOrder
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
Definition:SLPVectorizer.cpp:4805
isAlternateInstruction
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
Definition:SLPVectorizer.cpp:9089
SLPVectorizer.h
STLExtras.h
This file contains some templates that are useful if you are working with the STL at all.
OS
raw_pwrite_stream & OS
Definition:SampleProfWriter.cpp:51
SV_NAME
#define SV_NAME
Definition:SandboxVectorizer.cpp:17
ScalarEvolutionExpander.h
ScalarEvolutionExpressions.h
ScalarEvolution.h
ScopeExit.h
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
SetOperations.h
This file defines generic set operations that may be used on set's of different types,...
SetVector.h
This file implements a set that has insertion order iteration characteristics.
SmallBitVector.h
This file implements the SmallBitVector class.
SmallPtrSet.h
This file defines the SmallPtrSet class.
SmallSet.h
This file defines the SmallSet class.
SmallString.h
This file defines the SmallString class.
Statistic.h
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
STATISTIC
#define STATISTIC(VARNAME, DESC)
Definition:Statistic.h:166
getType
static SymbolRef::Type getType(const Symbol *Sym)
Definition:TapiFile.cpp:39
Ptr
@ Ptr
Definition:TargetLibraryInfo.cpp:77
TargetLibraryInfo.h
TargetTransformInfo.h
This pass exposes codegen information to IR-level passes.
Local.h
getOpcode
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition:VPlanSLP.cpp:191
getOperands
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition:VPlanSLP.cpp:154
ValueHandle.h
ValueTracking.h
VectorUtils.h
Verifier.h
RHS
Value * RHS
Definition:X86PartialReduction.cpp:74
LHS
Value * LHS
Definition:X86PartialReduction.cpp:73
IV
static const uint32_t IV[8]
Definition:blake3_impl.h:78
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator
Merges shuffle masks and emits final shuffle instruction, if required.
Definition:SLPVectorizer.cpp:10132
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::ShuffleCostEstimator
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
Definition:SLPVectorizer.cpp:10661
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::add
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Definition:SLPVectorizer.cpp:10830
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::add
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Definition:SLPVectorizer.cpp:10778
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::needToDelay
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Definition:SLPVectorizer.cpp:10773
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::gather
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
Definition:SLPVectorizer.cpp:10894
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::~ShuffleCostEstimator
~ShuffleCostEstimator()
Definition:SLPVectorizer.cpp:11035
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::finalize
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Definition:SLPVectorizer.cpp:10940
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::createFreeze
InstructionCost createFreeze(InstructionCost Cost)
Definition:SLPVectorizer.cpp:10937
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::add
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Definition:SLPVectorizer.cpp:10807
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::adjustExtracts
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Definition:SLPVectorizer.cpp:10667
llvm::slpvectorizer::BoUpSLP::ShuffleCostEstimator::add
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Definition:SLPVectorizer.cpp:10847
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder
Merges shuffle masks and emits final shuffle instruction, if required.
Definition:SLPVectorizer.cpp:14094
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::add
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
Definition:SLPVectorizer.cpp:14445
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::addOrdered
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
Definition:SLPVectorizer.cpp:14500
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::needToDelay
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Definition:SLPVectorizer.cpp:14360
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::add
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Definition:SLPVectorizer.cpp:14399
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::gather
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
Definition:SLPVectorizer.cpp:14505
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::add
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Definition:SLPVectorizer.cpp:14411
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::createFreeze
Value * createFreeze(Value *V)
Definition:SLPVectorizer.cpp:14512
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::ShuffleInstructionBuilder
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
Definition:SLPVectorizer.cpp:14220
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::add
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Definition:SLPVectorizer.cpp:14378
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::adjustExtracts
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Definition:SLPVectorizer.cpp:14224
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::finalize
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Definition:SLPVectorizer.cpp:14517
llvm::slpvectorizer::BoUpSLP::ShuffleInstructionBuilder::~ShuffleInstructionBuilder
~ShuffleInstructionBuilder()
Definition:SLPVectorizer.cpp:14616
T
VectorType
Definition:ItaniumDemangle.h:1173
bool
llvm::AAManager
A manager for alias analyses.
Definition:AliasAnalysis.h:933
llvm::AAResults
Definition:AliasAnalysis.h:314
llvm::APInt
Class for arbitrary precision integers.
Definition:APInt.h:78
llvm::APInt::getAllOnes
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition:APInt.h:234
llvm::APInt::clearBit
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition:APInt.h:1407
llvm::APInt::setBit
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition:APInt.h:1330
llvm::APInt::isAllOnes
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition:APInt.h:371
llvm::APInt::isZero
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition:APInt.h:380
llvm::APInt::urem
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition:APInt.cpp:1640
llvm::APInt::clearAllBits
void clearAllBits()
Set every bit to 0.
Definition:APInt.h:1397
llvm::APInt::setAllBits
void setAllBits()
Set every bit to 1.
Definition:APInt.h:1319
llvm::APInt::setBits
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition:APInt.h:1367
llvm::APInt::getZero
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition:APInt.h:200
llvm::APInt::getBitsSetFrom
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition:APInt.h:286
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition:APInt.h:239
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition:PassManager.h:253
llvm::AnalysisManager::getCachedResult
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition:PassManager.h:429
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition:PassManager.h:410
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition:ArrayRef.h:41
llvm::ArrayRef::equals
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition:ArrayRef.h:190
llvm::ArrayRef::back
const T & back() const
back - Get the last element.
Definition:ArrayRef.h:177
llvm::ArrayRef::take_front
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition:ArrayRef.h:231
llvm::ArrayRef::drop_front
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition:ArrayRef.h:207
llvm::ArrayRef::front
const T & front() const
front - Get the first element.
Definition:ArrayRef.h:171
llvm::ArrayRef::end
iterator end() const
Definition:ArrayRef.h:157
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition:ArrayRef.h:168
llvm::ArrayRef::drop_back
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition:ArrayRef.h:213
llvm::ArrayRef::begin
iterator begin() const
Definition:ArrayRef.h:156
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition:ArrayRef.h:163
llvm::ArrayRef::slice
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition:ArrayRef.h:198
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition:AssumptionCache.h:173
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition:AssumptionCache.h:42
llvm::Attribute::getWithAlignment
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition:Attributes.cpp:234
llvm::BasicBlock
LLVM Basic Block Representation.
Definition:BasicBlock.h:61
llvm::BasicBlock::end
iterator end()
Definition:BasicBlock.h:464
llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition:BasicBlock.h:451
llvm::BasicBlock::reverse_iterator
InstListType::reverse_iterator reverse_iterator
Definition:BasicBlock.h:179
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition:BasicBlock.h:220
llvm::BasicBlock::rend
reverse_iterator rend()
Definition:BasicBlock.h:469
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition:BasicBlock.h:177
llvm::BasicBlock::isEHPad
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition:BasicBlock.h:678
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition:BasicBlock.h:240
llvm::BatchAAResults
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
Definition:AliasAnalysis.h:630
llvm::BatchAAResults::getModRefInfo
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Definition:AliasAnalysis.h:653
llvm::BinaryOperator
Definition:InstrTypes.h:170
llvm::CFGAnalyses
Represents analyses that only rely on functions' control flow.
Definition:Analysis.h:72
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition:InstrTypes.h:1112
llvm::CallBase::getBundleOperandsEndIndex
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition:InstrTypes.h:1980
llvm::CallBase::getOperandBundlesAsDefs
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
Definition:Instructions.cpp:483
llvm::CallBase::isNoBuiltin
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition:InstrTypes.h:1875
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition:InstrTypes.h:1341
llvm::CallBase::hasIdenticalOperandBundleSchema
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition:InstrTypes.h:2117
llvm::CallBase::getBundleOperandsStartIndex
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition:InstrTypes.h:1974
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition:InstrTypes.h:1286
llvm::CallBase::getFunctionType
FunctionType * getFunctionType() const
Definition:InstrTypes.h:1199
llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition:InstrTypes.h:1277
llvm::CallBase::arg_size
unsigned arg_size() const
Definition:InstrTypes.h:1284
llvm::CallBase::addParamAttr
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition:InstrTypes.h:1494
llvm::CallBase::hasOperandBundles
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition:InstrTypes.h:1971
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition:Instructions.h:1479
llvm::CastInst
This is the base class for all instructions that perform data casts.
Definition:InstrTypes.h:444
llvm::CmpInst
This class is the base class for the comparison instructions.
Definition:InstrTypes.h:661
llvm::CmpInst::makeCmpResultType
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition:InstrTypes.h:980
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition:InstrTypes.h:673
llvm::CmpInst::BAD_ICMP_PREDICATE
@ BAD_ICMP_PREDICATE
Definition:InstrTypes.h:706
llvm::CmpInst::ICMP_SLT
@ ICMP_SLT
signed less than
Definition:InstrTypes.h:702
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition:InstrTypes.h:703
llvm::CmpInst::ICMP_UGE
@ ICMP_UGE
unsigned greater or equal
Definition:InstrTypes.h:697
llvm::CmpInst::ICMP_UGT
@ ICMP_UGT
unsigned greater than
Definition:InstrTypes.h:696
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition:InstrTypes.h:700
llvm::CmpInst::ICMP_ULT
@ ICMP_ULT
unsigned less than
Definition:InstrTypes.h:698
llvm::CmpInst::ICMP_SGE
@ ICMP_SGE
signed greater or equal
Definition:InstrTypes.h:701
llvm::CmpInst::ICMP_ULE
@ ICMP_ULE
unsigned less or equal
Definition:InstrTypes.h:699
llvm::CmpInst::BAD_FCMP_PREDICATE
@ BAD_FCMP_PREDICATE
Definition:InstrTypes.h:693
llvm::CmpInst::getSwappedPredicate
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition:InstrTypes.h:825
llvm::CmpInst::getInversePredicate
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition:InstrTypes.h:787
llvm::CmpInst::getPredicate
Predicate getPredicate() const
Return the predicate for this instruction.
Definition:InstrTypes.h:763
llvm::CmpPredicate
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition:CmpPredicate.h:22
llvm::ConstantExpr::getIntToPtr
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition:Constants.cpp:2307
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition:Constants.h:83
llvm::ConstantInt::getZExtValue
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition:Constants.h:157
llvm::ConstantVector::getSplat
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition:Constants.cpp:1472
llvm::ConstantVector::get
static Constant * get(ArrayRef< Constant * > V)
Definition:Constants.cpp:1421
llvm::Constant
This is an important base class in LLVM.
Definition:Constant.h:42
llvm::Constant::getAllOnesValue
static Constant * getAllOnesValue(Type *Ty)
Definition:Constants.cpp:420
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition:Constants.cpp:373
llvm::DWARFExpression::Operation
This class represents an Operation in the Expression.
Definition:DWARFExpression.h:32
llvm::DWARFExpression::Operation::getNumOperands
uint64_t getNumOperands() const
Definition:DWARFExpression.h:90
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition:DataLayout.h:63
llvm::DataLayout::getTypeStoreSizeInBits
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition:DataLayout.h:434
llvm::DataLayout::getIndexType
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition:DataLayout.cpp:878
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition:DataLayout.h:617
llvm::DebugCounter::shouldExecute
static bool shouldExecute(unsigned CounterName)
Definition:DebugCounter.h:87
llvm::DemandedBitsAnalysis
An analysis that produces DemandedBits for a function.
Definition:DemandedBits.h:103
llvm::DemandedBits
Definition:DemandedBits.h:40
llvm::DemandedBits::getDemandedBits
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
Definition:DemandedBits.cpp:399
llvm::DenseMapBase::lookup
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition:DenseMap.h:194
llvm::DenseMapBase::find
iterator find(const_arg_type_t< KeyT > Val)
Definition:DenseMap.h:156
llvm::DenseMapBase::try_emplace
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition:DenseMap.h:226
llvm::DenseMapBase::erase
bool erase(const KeyT &Val)
Definition:DenseMap.h:321
llvm::DenseMapBase::size
unsigned size() const
Definition:DenseMap.h:99
llvm::DenseMapBase::empty
bool empty() const
Definition:DenseMap.h:98
llvm::DenseMapBase::count
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition:DenseMap.h:152
llvm::DenseMapBase::end
iterator end()
Definition:DenseMap.h:84
llvm::DenseMapBase::at
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition:DenseMap.h:202
llvm::DenseMapBase::contains
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition:DenseMap.h:147
llvm::DenseMapBase::insert
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition:DenseMap.h:211
llvm::DenseMapBase::clear
void clear()
Definition:DenseMap.h:110
llvm::DenseMap
Definition:DenseMap.h:727
llvm::DenseSet
Implements a dense probed hash-table based set.
Definition:DenseSet.h:278
llvm::DomTreeNodeBase< BasicBlock >
llvm::DomTreeNodeBase::getDFSNumIn
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Definition:GenericDomTree.h:140
llvm::DominatorTreeAnalysis
Analysis pass which computes a DominatorTree.
Definition:Dominators.h:279
llvm::DominatorTreeBase::updateDFSNumbers
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
Definition:GenericDomTree.h:805
llvm::DominatorTreeBase::getNode
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Definition:GenericDomTree.h:401
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition:Dominators.h:162
llvm::DominatorTree::isReachableFromEntry
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition:Dominators.cpp:321
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition:Dominators.cpp:122
llvm::ElementCount::getFixed
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition:TypeSize.h:311
llvm::ExtractElementInst
This instruction extracts a single (scalar) element from a VectorType value.
Definition:Instructions.h:1775
llvm::ExtractValueInst
This instruction extracts a struct member or array element value from an aggregate value.
Definition:Instructions.h:2397
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition:FMF.h:20
llvm::FastMathFlags::set
void set()
Definition:FMF.h:62
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition:DerivedTypes.h:563
llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition:DerivedTypes.h:606
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition:Type.cpp:791
llvm::FunctionType::params
ArrayRef< Type * > params() const
Definition:DerivedTypes.h:132
llvm::FunctionType::getReturnType
Type * getReturnType() const
Definition:DerivedTypes.h:126
llvm::Function
Definition:Function.h:63
llvm::Function::empty
bool empty() const
Definition:Function.h:859
llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition:Instructions.h:933
llvm::GraphTraits< BoUpSLP * >::nodes_iterator::operator++
nodes_iterator operator++()
Definition:SLPVectorizer.cpp:4457
llvm::GraphTraits< BoUpSLP * >::nodes_iterator::nodes_iterator
nodes_iterator(const ItTy &It2)
Definition:SLPVectorizer.cpp:4455
llvm::GraphTraits< BoUpSLP * >::nodes_iterator::operator*
NodeRef operator*()
Definition:SLPVectorizer.cpp:4456
llvm::GraphTraits< BoUpSLP * >::nodes_iterator::operator!=
bool operator!=(const nodes_iterator &N2) const
Definition:SLPVectorizer.cpp:4461
llvm::IRBuilderBase::InsertPointGuard
Definition:IRBuilder.h:394
llvm::IRBuilderBase
Common base class shared among various IRBuilders.
Definition:IRBuilder.h:113
llvm::IRBuilderBase::CreateExtractVector
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition:IRBuilder.h:1072
llvm::IRBuilderBase::CreateInsertElement
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition:IRBuilder.h:2511
llvm::IRBuilderBase::getInt1Ty
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition:IRBuilder.h:530
llvm::IRBuilderBase::CreateInsertVector
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition:IRBuilder.h:1080
llvm::IRBuilderBase::CreateExtractElement
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition:IRBuilder.h:2499
llvm::IRBuilderBase::getIntNTy
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition:IRBuilder.h:558
llvm::IRBuilderBase::CreateAlignedLoad
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition:IRBuilder.h:1815
llvm::IRBuilderBase::getTrue
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition:IRBuilder.h:485
llvm::IRBuilderBase::CreateSelect
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition:IRBuilder.cpp:1053
llvm::IRBuilderBase::GetInsertPoint
BasicBlock::iterator GetInsertPoint() const
Definition:IRBuilder.h:194
llvm::IRBuilderBase::CreateFreeze
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition:IRBuilder.h:2574
llvm::IRBuilderBase::CreateCast
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition:IRBuilder.h:2186
llvm::IRBuilderBase::GetInsertBlock
BasicBlock * GetInsertBlock() const
Definition:IRBuilder.h:193
llvm::IRBuilderBase::setFastMathFlags
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition:IRBuilder.h:330
llvm::IRBuilderBase::SetCurrentDebugLocation
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition:IRBuilder.h:239
llvm::IRBuilderBase::CreateGEP
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition:IRBuilder.h:1874
llvm::IRBuilderBase::getInt64
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition:IRBuilder.h:510
llvm::IRBuilderBase::getAllOnesMask
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition:IRBuilder.h:867
llvm::IRBuilderBase::CreateUnOp
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition:IRBuilder.h:1761
llvm::IRBuilderBase::CreateBinaryIntrinsic
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition:IRBuilder.cpp:889
llvm::IRBuilderBase::CreateIntrinsic
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition:IRBuilder.cpp:900
llvm::IRBuilderBase::getInt32
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition:IRBuilder.h:505
llvm::IRBuilderBase::CreateCmp
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition:IRBuilder.h:2404
llvm::IRBuilderBase::CreatePHI
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition:IRBuilder.h:2435
llvm::IRBuilderBase::CreateBitCast
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition:IRBuilder.h:2152
llvm::IRBuilderBase::CreateUnaryIntrinsic
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition:IRBuilder.cpp:881
llvm::IRBuilderBase::CreateShuffleVector
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition:IRBuilder.h:2533
llvm::IRBuilderBase::getFalse
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition:IRBuilder.h:490
llvm::IRBuilderBase::CreateCall
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition:IRBuilder.h:2449
llvm::IRBuilderBase::CreateBinOp
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition:IRBuilder.h:1671
llvm::IRBuilderBase::ClearInsertionPoint
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition:IRBuilder.h:188
llvm::IRBuilderBase::CreateIntCast
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition:IRBuilder.h:2225
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition:IRBuilder.h:199
llvm::IRBuilderBase::CreateAlignedStore
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition:IRBuilder.h:1834
llvm::IRBuilderBase::CreateICmp
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition:IRBuilder.h:2380
llvm::IRBuilderBase::CreateFMul
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition:IRBuilder.h:1614
llvm::IRBuilderBase::CreateMul
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition:IRBuilder.h:1404
llvm::IRBuilderBase::CreateMaskedGather
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition:IRBuilder.cpp:596
llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition:IRBuilder.h:2705
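A minimal sketch, not taken from SLPVectorizer itself, of how the IRBuilder calls listed above are typically combined by a vectorizer: insert a scalar into lane 0 of a poison vector, then broadcast it with an all-zero shuffle mask. The helper name is illustrative; it uses the ArrayRef<int>-mask overload of CreateShuffleVector rather than the Value*-mask overload shown above.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Splat Scalar across NumElts lanes at the builder's current insertion point.
Value *emitSplat(IRBuilder<> &Builder, Value *Scalar, unsigned NumElts) {
  auto *VecTy = FixedVectorType::get(Scalar->getType(), NumElts);
  Value *Vec = PoisonValue::get(VecTy);
  Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(0));
  SmallVector<int> ZeroMask(NumElts, 0); // every result lane reads lane 0
  return Builder.CreateShuffleVector(Vec, PoisonValue::get(VecTy), ZeroMask);
}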
llvm::InsertElementInst
This instruction inserts a single (scalar) element into a VectorType value.
Definition:Instructions.h:1834
llvm::InsertElementInst::getType
VectorType * getType() const
Overload to return most specific vector type.
Definition:Instructions.h:1862
llvm::InsertValueInst
This instruction inserts a struct field of array element value into an aggregate value.
Definition:Instructions.h:2485
llvm::InstructionCost
Definition:InstructionCost.h:29
llvm::InstructionCost::getInvalid
static InstructionCost getInvalid(CostType Val=0)
Definition:InstructionCost.h:73
llvm::InstructionCost::isValid
bool isValid() const
Definition:InstructionCost.h:79
llvm::Instruction
Definition:Instruction.h:68
llvm::Instruction::isCast
bool isCast() const
Definition:Instruction.h:300
llvm::Instruction::mayReadOrWriteMemory
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition:Instruction.h:780
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition:Instruction.h:492
llvm::Instruction::moveAfter
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition:Instruction.cpp:191
llvm::Instruction::isBinaryOp
bool isBinaryOp() const
Definition:Instruction.h:296
llvm::Instruction::comesBefore
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
Definition:Instruction.cpp:334
llvm::Instruction::getNextNonDebugInstruction
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
Definition:Instruction.cpp:1226
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition:Instruction.h:291
llvm::Instruction::BinaryOps
BinaryOps
Definition:Instruction.h:989
llvm::Instruction::isIdenticalTo
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
Definition:Instruction.cpp:914
llvm::Instruction::isIntDivRem
bool isIntDivRem() const
Definition:Instruction.h:297
llvm::Instruction::UnaryOps
UnaryOps
Definition:Instruction.h:982
llvm::Instruction::CastOps
CastOps
Definition:Instruction.h:1003
llvm::IntegerType::get
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition:Type.cpp:311
llvm::IntrinsicCostAttributes
Definition:TargetTransformInfo.h:119
llvm::IntrinsicCostAttributes::getArgTypes
const SmallVectorImpl< Type * > & getArgTypes() const
Definition:TargetTransformInfo.h:156
llvm::LoadInst
An instruction for reading from memory.
Definition:Instructions.h:176
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition:Instructions.h:255
llvm::LoadInst::isSimple
bool isSimple() const
Definition:Instructions.h:247
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition:Instructions.h:211
llvm::LoopAnalysis
Analysis pass that exposes the LoopInfo for a function.
Definition:LoopInfo.h:566
llvm::LoopBase::getLoopLatch
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition:GenericLoopInfoImpl.h:256
llvm::LoopInfoBase::getLoopFor
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition:GenericLoopInfo.h:606
llvm::LoopInfo
Definition:LoopInfo.h:407
llvm::Loop
Represents a single loop in the control flow graph.
Definition:LoopInfo.h:39
llvm::MapVector
This class implements a map that also provides access to all stored values in a deterministic order.
Definition:MapVector.h:36
llvm::MapVector::end
iterator end()
Definition:MapVector.h:71
llvm::MapVector::takeVector
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition:MapVector.h:55
llvm::MapVector::find
iterator find(const KeyT &Key)
Definition:MapVector.h:167
llvm::MapVector::empty
bool empty() const
Definition:MapVector.h:79
llvm::MapVector::try_emplace
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition:MapVector.h:118
llvm::MapVector::insert
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition:MapVector.h:141
llvm::MapVector::lookup
ValueT lookup(const KeyT &Key) const
Definition:MapVector.h:110
llvm::MapVector::size
size_type size() const
Definition:MapVector.h:60
llvm::MapVector::front
std::pair< KeyT, ValueT > & front()
Definition:MapVector.h:83
llvm::MapVector::clear
void clear()
Definition:MapVector.h:88
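A tiny sketch of the MapVector calls listed above (try_emplace, takeVector): count occurrences per value while preserving deterministic insertion order, then drain the map into its underlying vector. The helper is hypothetical.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/IR/Value.h"

using namespace llvm;

auto countInOrder(ArrayRef<Value *> Vals) {
  MapVector<Value *, unsigned> Counts;
  for (Value *V : Vals)
    ++Counts.try_emplace(V, 0u).first->second; // insert-or-find, then bump
  return Counts.takeVector(); // pairs in insertion order; map is left empty
}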
llvm::MemIntrinsic
This is the common base class for memset/memcpy/memmove.
Definition:IntrinsicInst.h:1205
llvm::MemoryLocation
Representation for a specific memory location.
Definition:MemoryLocation.h:227
llvm::MemoryLocation::get
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
Definition:MemoryLocation.cpp:35
llvm::MemoryLocation::Ptr
const Value * Ptr
The address of the start of the location.
Definition:MemoryLocation.h:235
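A brief sketch (hypothetical helper, not code from this file) of the MemoryLocation::get pairing with alias analysis: ask whether two simple loads are known not to alias before treating them as freely reorderable.

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

bool loadsAreIndependent(AAResults &AA, LoadInst *L1, LoadInst *L2) {
  return L1->isSimple() && L2->isSimple() &&
         AA.isNoAlias(MemoryLocation::get(L1), MemoryLocation::get(L2));
}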
llvm::MutableArrayRef
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition:ArrayRef.h:310
llvm::MutableArrayRef::front
T & front() const
front - Get the first element.
Definition:ArrayRef.h:366
llvm::MutableArrayRef::end
iterator end() const
Definition:ArrayRef.h:360
llvm::MutableArrayRef::begin
iterator begin() const
Definition:ArrayRef.h:359
llvm::MutableArrayRef::slice
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition:ArrayRef.h:379
llvm::OptimizationRemarkEmitterAnalysis
Definition:OptimizationRemarkEmitter.h:164
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition:OptimizationRemarkEmitter.h:32
llvm::OptimizationRemarkMissed
Diagnostic information for missed-optimization remarks.
Definition:DiagnosticInfo.h:807
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition:DiagnosticInfo.h:762
llvm::OwningArrayRef
This is a MutableArrayRef that owns its array.
Definition:ArrayRef.h:452
llvm::PHINode
Definition:Instructions.h:2600
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition:Instructions.h:2735
llvm::PHINode::getIncomingValueForBlock
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Definition:Instructions.h:2775
llvm::PHINode::getIncomingBlock
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Definition:Instructions.h:2695
llvm::PHINode::getNumIncomingValues
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Definition:Instructions.h:2671
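A minimal sketch of the CreatePHI/addIncoming pairing documented above: a two-input phi merging values from two predecessor blocks. The helper and its arguments are assumptions supplied by the caller, not part of the pass.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

PHINode *mergeValues(BasicBlock *MergeBB, Value *ThenV, BasicBlock *ThenBB,
                     Value *ElseV, BasicBlock *ElseBB) {
  IRBuilder<> Builder(MergeBB, MergeBB->begin());
  PHINode *Phi = Builder.CreatePHI(ThenV->getType(), /*NumReservedValues=*/2);
  Phi->addIncoming(ThenV, ThenBB);
  Phi->addIncoming(ElseV, ElseBB);
  return Phi;
}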
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition:Pass.h:94
llvm::PointerType::getUnqual
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition:DerivedTypes.h:686
llvm::PointerUnion
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition:PointerUnion.h:118
llvm::PointerUnion::isNull
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition:PointerUnion.h:142
llvm::PointerUnion::dyn_cast
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition:PointerUnion.h:168
llvm::PoisonValue::get
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition:Constants.cpp:1878
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition:Analysis.h:111
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition:Analysis.h:117
llvm::PreservedAnalyses::preserveSet
void preserveSet()
Mark an analysis set as preserved.
Definition:Analysis.h:146
llvm::PriorityQueue
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition:PriorityQueue.h:28
llvm::RecurrenceDescriptor::getOpcode
unsigned getOpcode() const
Definition:IVDescriptors.h:212
llvm::RecurrenceDescriptor::isIntMinMaxRecurrenceKind
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
Definition:IVDescriptors.h:234
llvm::RecurrenceDescriptor::isMinMaxRecurrenceKind
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
Definition:IVDescriptors.h:246
llvm::SCEVExpander
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Definition:ScalarEvolutionExpander.h:63
llvm::SCEVExpander::expandCodeFor
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
Definition:ScalarEvolutionExpander.cpp:1443
llvm::SCEV
This class represents an analyzed expression in the program.
Definition:ScalarEvolution.h:71
llvm::SCEV::isZero
bool isZero() const
Return true if the expression is a constant zero.
Definition:ScalarEvolution.cpp:448
llvm::SCEV::isNonConstantNegative
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Definition:ScalarEvolution.cpp:454
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition:ScalarEvolution.cpp:386
llvm::ScalarEvolutionAnalysis
Analysis pass that exposes the ScalarEvolution for a function.
Definition:ScalarEvolution.h:2320
llvm::ScalarEvolution
The main scalar evolution driver.
Definition:ScalarEvolution.h:447
llvm::ScalarEvolution::getConstant
const SCEV * getConstant(ConstantInt *V)
Definition:ScalarEvolution.cpp:473
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition:ScalarEvolution.cpp:4547
llvm::ScalarEvolution::forgetValue
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
Definition:ScalarEvolution.cpp:8542
llvm::ScalarEvolution::getMinusSCEV
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
Definition:ScalarEvolution.cpp:4655
llvm::ScalarEvolution::getMulExpr
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
Definition:ScalarEvolution.cpp:3106
llvm::ScalarEvolution::getUDivExactExpr
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
Definition:ScalarEvolution.cpp:3587
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition:ScalarEvolution.cpp:2526
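A sketch of the getSCEV/getMinusSCEV pattern used when checking whether two addresses differ by a compile-time-constant number of bytes (the kind of question asked when grouping consecutive loads). Assumptions: SE is a valid ScalarEvolution instance and both pointers are SCEV-computable; the helper name is illustrative.

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include <optional>

using namespace llvm;

std::optional<int64_t> constantByteDistance(ScalarEvolution &SE, Value *PtrA,
                                            Value *PtrB) {
  const SCEV *A = SE.getSCEV(PtrA);
  const SCEV *B = SE.getSCEV(PtrB);
  const SCEV *Diff = SE.getMinusSCEV(B, A); // B - A
  if (const auto *C = dyn_cast<SCEVConstant>(Diff))
    return C->getAPInt().getSExtValue();
  return std::nullopt; // distance is not a known constant
}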
llvm::SelectInst
This class represents the LLVM 'select' instruction.
Definition:Instructions.h:1657
llvm::SetVector
A vector that has set insertion semantics.
Definition:SetVector.h:57
llvm::SetVector::getArrayRef
ArrayRef< value_type > getArrayRef() const
Definition:SetVector.h:84
llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition:SetVector.h:98
llvm::SetVector::front
const value_type & front() const
Return the first element of the SetVector.
Definition:SetVector.h:143
llvm::SetVector::clear
void clear()
Completely clear the SetVector.
Definition:SetVector.h:273
llvm::SetVector::empty
bool empty() const
Determine if the SetVector is empty or not.
Definition:SetVector.h:93
llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition:SetVector.h:162
llvm::SetVector::contains
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition:SetVector.h:254
llvm::ShuffleVectorInst
This instruction constructs a fixed permutation of two input vectors.
Definition:Instructions.h:1901
llvm::ShuffleVectorInst::isZeroEltSplatMask
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
Definition:Instructions.cpp:1911
llvm::ShuffleVectorInst::isOneUseSingleSourceMask
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
Definition:Instructions.cpp:2253
llvm::ShuffleVectorInst::isDeInterleaveMaskOfFactor
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
Definition:Instructions.cpp:2379
llvm::ShuffleVectorInst::isIdentityMask
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
Definition:Instructions.cpp:1883
llvm::ShuffleVectorInst::isExtractSubvectorMask
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
Definition:Instructions.cpp:2010
llvm::ShuffleVectorInst::isReverseMask
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
Definition:Instructions.cpp:1891
llvm::ShuffleVectorInst::isInsertSubvectorMask
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
Definition:Instructions.cpp:2039
llvm::ShuffleVectorInst::isInterleaveMask
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Definition:Instructions.cpp:2295
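A small sketch exercising the static mask predicates listed above on hand-written masks; the masks and the wrapper function are made up for illustration.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

void classifyMasks() {
  int Identity[] = {0, 1, 2, 3};
  int Reverse[] = {3, 2, 1, 0};
  int Extract[] = {4, 5, 6, 7};

  bool IsId = ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4);
  bool IsRev = ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4);

  int Index = 0;
  // With an 8-element source, {4,5,6,7} extracts the upper half (Index 4).
  bool IsExtract =
      ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/8, Index);
  (void)IsId; (void)IsRev; (void)IsExtract;
}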
llvm::SmallBitVector
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
Definition:SmallBitVector.h:35
llvm::SmallBitVector::find_first
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
Definition:SmallBitVector.h:230
llvm::SmallBitVector::set
SmallBitVector & set()
Definition:SmallBitVector.h:366
llvm::SmallBitVector::test
bool test(unsigned Idx) const
Definition:SmallBitVector.h:472
llvm::SmallBitVector::find_next
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
Definition:SmallBitVector.h:277
llvm::SmallBitVector::all
bool all() const
Returns true if all bits are set.
Definition:SmallBitVector.h:216
llvm::SmallBitVector::size
size_type size() const
Returns the number of bits in this bitvector.
Definition:SmallBitVector.h:195
llvm::SmallBitVector::any
bool any() const
Returns true if any bit is set.
Definition:SmallBitVector.h:209
llvm::SmallBitVector::count
size_type count() const
Returns the number of bits which are set.
Definition:SmallBitVector.h:200
llvm::SmallBitVector::reset
SmallBitVector & reset()
Definition:SmallBitVector.h:401
llvm::SmallBitVector::none
bool none() const
Returns true if none of the bits are set.
Definition:SmallBitVector.h:223
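A minimal sketch of the set-bit iteration idiom implied by find_first/find_next above, e.g. walking only the lanes selected by a mask. The helper is hypothetical.

#include "llvm/ADT/SmallBitVector.h"

using namespace llvm;

unsigned visitSetLanes(const SmallBitVector &Lanes) {
  unsigned Visited = 0;
  for (int I = Lanes.find_first(); I != -1; I = Lanes.find_next(I)) {
    // ... process lane I ...
    ++Visited;
  }
  return Visited; // equals Lanes.count()
}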
llvm::SmallDenseMap
Definition:DenseMap.h:883
llvm::SmallDenseSet
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition:DenseSet.h:298
llvm::SmallPtrSetImplBase::size
size_type size() const
Definition:SmallPtrSet.h:94
llvm::SmallPtrSetImplBase::clear
void clear()
Definition:SmallPtrSet.h:97
llvm::SmallPtrSetImplBase::empty
bool empty() const
Definition:SmallPtrSet.h:93
llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition:SmallPtrSet.h:363
llvm::SmallPtrSetImpl::erase
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition:SmallPtrSet.h:401
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition:SmallPtrSet.h:452
llvm::SmallPtrSetImpl::end
iterator end() const
Definition:SmallPtrSet.h:477
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition:SmallPtrSet.h:384
llvm::SmallPtrSetImpl::begin
iterator begin() const
Definition:SmallPtrSet.h:472
llvm::SmallPtrSetImpl::contains
bool contains(ConstPtrType Ptr) const
Definition:SmallPtrSet.h:458
llvm::SmallPtrSet< Value *, 16 >
llvm::SmallSetVector
A SetVector that performs no allocations if smaller than a certain size.
Definition:SetVector.h:370
llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition:SmallSet.h:132
llvm::SmallSet::count
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition:SmallSet.h:175
llvm::SmallSet::contains
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition:SmallSet.h:222
llvm::SmallSet::insert
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition:SmallSet.h:181
llvm::SmallSet::size
size_type size() const
Definition:SmallSet.h:170
llvm::SmallString
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition:SmallString.h:26
llvm::SmallVectorBase::empty
bool empty() const
Definition:SmallVector.h:81
llvm::SmallVectorBase::size
size_t size() const
Definition:SmallVector.h:78
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition:SmallVector.h:573
llvm::SmallVectorImpl::pop_back_val
T pop_back_val()
Definition:SmallVector.h:673
llvm::SmallVectorImpl::assign
void assign(size_type NumElts, ValueParamT Elt)
Definition:SmallVector.h:704
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition:SmallVector.h:937
llvm::SmallVectorImpl::reserve
void reserve(size_type N)
Definition:SmallVector.h:663
llvm::SmallVectorImpl::append
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition:SmallVector.h:683
llvm::SmallVectorImpl::clear
void clear()
Definition:SmallVector.h:610
llvm::SmallVectorImpl::swap
void swap(SmallVectorImpl &RHS)
Definition:SmallVector.h:968
llvm::SmallVectorImpl::resize
void resize(size_type N)
Definition:SmallVector.h:638
llvm::SmallVectorTemplateBase::pop_back
void pop_back()
Definition:SmallVector.h:425
llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition:SmallVector.h:413
llvm::SmallVectorTemplateCommon::end
iterator end()
Definition:SmallVector.h:269
llvm::SmallVectorTemplateCommon::rbegin
reverse_iterator rbegin()
Definition:SmallVector.h:273
llvm::SmallVectorTemplateCommon::front
reference front()
Definition:SmallVector.h:299
llvm::SmallVectorTemplateCommon::begin
iterator begin()
Definition:SmallVector.h:267
llvm::SmallVectorTemplateCommon::back
reference back()
Definition:SmallVector.h:308
llvm::SmallVectorTemplateCommon::rend
reverse_iterator rend()
Definition:SmallVector.h:275
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition:SmallVector.h:1196
llvm::StoreInst
An instruction for storing to memory.
Definition:Instructions.h:292
llvm::StoreInst::getPointerOperandType
Type * getPointerOperandType() const
Definition:Instructions.h:384
llvm::StoreInst::getValueOperand
Value * getValueOperand()
Definition:Instructions.h:378
llvm::StoreInst::getPointerOperand
Value * getPointerOperand()
Definition:Instructions.h:381
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition:StringRef.h:51
llvm::TargetFolder
TargetFolder - Create constants with target dependent folding.
Definition:TargetFolder.h:34
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition:TargetTransformInfo.h:3194
llvm::TargetLibraryAnalysis
Analysis pass providing the TargetLibraryInfo.
Definition:TargetLibraryInfo.h:614
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition:TargetLibraryInfo.h:280
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition:TargetTransformInfo.h:212
llvm::TargetTransformInfo::getCastContextHint
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
Definition:TargetTransformInfo.cpp:996
llvm::TargetTransformInfo::getStridedMemoryOpCost
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
Definition:TargetTransformInfo.cpp:1156
llvm::TargetTransformInfo::getCmpSelInstrCost
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
Definition:TargetTransformInfo.cpp:1067
llvm::TargetTransformInfo::getRegisterBitWidth
TypeSize getRegisterBitWidth(RegisterKind K) const
Definition:TargetTransformInfo.cpp:776
llvm::TargetTransformInfo::isLegalMaskedGather
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
Definition:TargetTransformInfo.cpp:490
llvm::TargetTransformInfo::getMemoryOpCost
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
Definition:TargetTransformInfo.cpp:1125
llvm::TargetTransformInfo::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
Definition:TargetTransformInfo.cpp:1165
llvm::TargetTransformInfo::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
Definition:TargetTransformInfo.cpp:1177
llvm::TargetTransformInfo::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
Definition:TargetTransformInfo.cpp:1215
llvm::TargetTransformInfo::getCastInstrCost
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
Definition:TargetTransformInfo.cpp:1039
llvm::TargetTransformInfo::getGEPCost
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
Definition:TargetTransformInfo.cpp:248
llvm::TargetTransformInfo::isLegalInterleavedAccessType
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
Definition:TargetTransformInfo.cpp:531
llvm::TargetTransformInfo::isLegalBroadcastLoad
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
Definition:TargetTransformInfo.cpp:485
llvm::TargetTransformInfo::getExtendedReductionCost
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
Definition:TargetTransformInfo.cpp:1233
llvm::TargetTransformInfo::getOperandInfo
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
Definition:TargetTransformInfo.cpp:880
llvm::TargetTransformInfo::getRegisterClassForType
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
Definition:TargetTransformInfo.cpp:767
llvm::TargetTransformInfo::forceScalarizeMaskedGather
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
Definition:TargetTransformInfo.cpp:506
llvm::TargetTransformInfo::isLegalStridedLoadStore
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
Definition:TargetTransformInfo.cpp:526
llvm::TargetTransformInfo::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Definition:TargetTransformInfo.cpp:1224
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition:TargetTransformInfo.h:263
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition:TargetTransformInfo.h:264
llvm::TargetTransformInfo::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
Definition:TargetTransformInfo.cpp:940
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition:TargetTransformInfo.h:1126
llvm::TargetTransformInfo::OP_NegatedPowerOf2
@ OP_NegatedPowerOf2
Definition:TargetTransformInfo.h:1129
llvm::TargetTransformInfo::OP_None
@ OP_None
Definition:TargetTransformInfo.h:1127
llvm::TargetTransformInfo::OP_PowerOf2
@ OP_PowerOf2
Definition:TargetTransformInfo.h:1128
llvm::TargetTransformInfo::getPointersChainCost
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
Definition:TargetTransformInfo.cpp:254
llvm::TargetTransformInfo::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
Definition:TargetTransformInfo.cpp:807
llvm::TargetTransformInfo::isTypeLegal
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
Definition:TargetTransformInfo.cpp:583
llvm::TargetTransformInfo::getCostOfKeepingLiveOverCall
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
Definition:TargetTransformInfo.cpp:1247
llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition:TargetTransformInfo.h:1180
llvm::TargetTransformInfo::getShuffleCost
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
Definition:TargetTransformInfo.cpp:976
llvm::TargetTransformInfo::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition:TargetTransformInfo.cpp:781
llvm::TargetTransformInfo::getGatherScatterOpCost
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
Definition:TargetTransformInfo.cpp:1146
llvm::TargetTransformInfo::getNumberOfRegisters
unsigned getNumberOfRegisters(unsigned ClassID) const
Definition:TargetTransformInfo.cpp:759
llvm::TargetTransformInfo::isLegalAltInstr
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
Definition:TargetTransformInfo.cpp:495
llvm::TargetTransformInfo::isFPVectorizationPotentiallyUnsafe
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
Definition:TargetTransformInfo.cpp:680
llvm::TargetTransformInfo::getStoreMinimumVF
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition:TargetTransformInfo.cpp:812
llvm::TargetTransformInfo::TCC_Expensive
@ TCC_Expensive
The cost of a 'div' instruction on x86.
Definition:TargetTransformInfo.h:291
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition:TargetTransformInfo.h:289
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition:TargetTransformInfo.h:290
llvm::TargetTransformInfo::getScalarizationOverhead
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
Definition:TargetTransformInfo.cpp:628
llvm::TargetTransformInfo::getInstructionCost
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
Definition:TargetTransformInfo.cpp:270
llvm::TargetTransformInfo::getExtractWithExtendCost
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
Definition:TargetTransformInfo.cpp:1050
llvm::TargetTransformInfo::getNumberOfParts
unsigned getNumberOfParts(Type *Tp) const
Definition:TargetTransformInfo.cpp:1193
llvm::TargetTransformInfo::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
Definition:TargetTransformInfo.cpp:1079
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition:TargetTransformInfo.h:1098
llvm::TargetTransformInfo::SK_InsertSubvector
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
Definition:TargetTransformInfo.h:1105
llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition:TargetTransformInfo.h:1101
llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition:TargetTransformInfo.h:1109
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition:TargetTransformInfo.h:1099
llvm::TargetTransformInfo::SK_PermuteTwoSrc
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
Definition:TargetTransformInfo.h:1107
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition:TargetTransformInfo.h:1100
llvm::TargetTransformInfo::SK_ExtractSubvector
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
Definition:TargetTransformInfo.h:1106
llvm::TargetTransformInfo::getCallInstrCost
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
Definition:TargetTransformInfo.cpp:1185
llvm::TargetTransformInfo::CastContextHint
CastContextHint
Represents a hint about the context in which a cast is used.
Definition:TargetTransformInfo.h:1389
llvm::TargetTransformInfo::CastContextHint::Reversed
@ Reversed
The cast is used with a reversed load/store.
llvm::TargetTransformInfo::CastContextHint::None
@ None
The cast is not used with a load/store of any kind.
llvm::TargetTransformInfo::CastContextHint::Normal
@ Normal
The cast is used with a normal load/store.
llvm::TargetTransformInfo::CastContextHint::GatherScatter
@ GatherScatter
The cast is used with a gather/scatter.
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition:TargetTransformInfo.h:1118
llvm::TargetTransformInfo::OK_UniformConstantValue
@ OK_UniformConstantValue
Definition:TargetTransformInfo.h:1121
llvm::TargetTransformInfo::OK_UniformValue
@ OK_UniformValue
Definition:TargetTransformInfo.h:1120
llvm::TargetTransformInfo::OK_AnyValue
@ OK_AnyValue
Definition:TargetTransformInfo.h:1119
llvm::TargetTransformInfo::OK_NonUniformConstantValue
@ OK_NonUniformConstantValue
Definition:TargetTransformInfo.h:1122
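A hedged sketch, not the pass's real cost logic, of how the TTI hooks above combine into a profitability test: compare the throughput cost of VF scalar adds against one vector add plus the scalarization overhead of building the vector from scalars. The function name and the simple comparison are assumptions for illustration.

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

bool vectorAddLooksProfitable(const TargetTransformInfo &TTI, Type *ScalarTy,
                              unsigned VF) {
  auto *VecTy = FixedVectorType::get(ScalarTy, VF);
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // VF independent scalar adds.
  InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind);
  ScalarCost *= VF;

  // One vector add plus the inserts needed to gather the operands.
  InstructionCost VecCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind) +
      TTI.getScalarizationOverhead(VecTy, APInt::getAllOnes(VF),
                                   /*Insert=*/true, /*Extract=*/false, CostKind);

  return VecCost.isValid() && VecCost < ScalarCost;
}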
llvm::Twine
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition:Twine.h:81
llvm::TypeSize
Definition:TypeSize.h:334
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition:Type.h:45
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition:Type.h:270
llvm::Type::isX86_FP80Ty
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition:Type.h:159
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition:Type.h:243
llvm::Type::isPointerTy
bool isPointerTy() const
True if this is an instance of PointerType.
Definition:Type.h:264
llvm::Type::isEmptyTy
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
llvm::Type::getStructNumElements
unsigned getStructNumElements() const
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
llvm::Type::isSingleValueType
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition:Type.h:295
llvm::Type::print
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
llvm::Type::isPPC_FP128Ty
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition:Type.h:165
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
llvm::Type::getWithNewType
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition:Type.h:128
llvm::Type::isFloatingPointTy
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition:Type.h:184
llvm::Type::isPtrOrPtrVectorTy
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition:Type.h:267
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition:Type.h:237
llvm::Type::getTypeID
TypeID getTypeID() const
Return the type id for the type.
Definition:Type.h:136
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition:Type.h:225
llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition:Type.h:139
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition:Type.h:355
llvm::UnaryOperator
Definition:InstrTypes.h:100
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition:Constants.cpp:1859
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition:Use.h:43
llvm::User
Definition:User.h:44
llvm::User::operands
op_range operands()
Definition:User.h:288
llvm::User::replaceUsesOfWith
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition:User.cpp:21
llvm::User::User
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition:User.h:115
llvm::User::op_begin
op_iterator op_begin()
Definition:User.h:280
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition:User.h:228
llvm::User::getNumOperands
unsigned getNumOperands() const
Definition:User.h:250
llvm::User::operand_values
iterator_range< value_op_iterator > operand_values()
Definition:User.h:312
llvm::VFDatabase
The Vector Function Database.
Definition:VectorUtils.h:31
llvm::VFDatabase::getMappings
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition:VectorUtils.h:72
llvm::Value
LLVM Value Representation.
Definition:Value.h:74
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition:Value.h:255
llvm::Value::user_begin
user_iterator user_begin()
Definition:Value.h:397
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition:Value.h:434
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition:Value.cpp:534
llvm::Value::users
iterator_range< user_iterator > users()
Definition:Value.h:421
llvm::Value::getValueID
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition:Value.h:532
llvm::Value::hasNUsesOrMore
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition:Value.cpp:153
llvm::Value::hasNUses
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition:Value.cpp:149
llvm::Value::use_empty
bool use_empty() const
Definition:Value.h:344
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition:Value.cpp:1075
llvm::Value::getNumUses
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition:Value.cpp:255
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition:Value.cpp:309
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition:Value.cpp:383
llvm::VectorType
Base class of all SIMD vector types.
Definition:DerivedTypes.h:427
llvm::VectorType::getElementCount
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition:DerivedTypes.h:665
llvm::VectorType::get
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
llvm::VectorType::getElementType
Type * getElementType() const
Definition:DerivedTypes.h:460
llvm::WeakTrackingVH
Value handle that is nullable, but tries to track the Value.
Definition:ValueHandle.h:204
llvm::cl::opt
Definition:CommandLine.h:1423
llvm::detail::DenseSetImpl::insert
std::pair< iterator, bool > insert(const ValueT &V)
Definition:DenseSet.h:213
llvm::detail::DenseSetImpl::clear
void clear()
Definition:DenseSet.h:92
llvm::detail::DenseSetImpl::find
iterator find(const_arg_type_t< ValueT > V)
Definition:DenseSet.h:187
llvm::detail::DenseSetImpl::end
iterator end()
Definition:DenseSet.h:182
llvm::detail::DenseSetImpl::size
size_type size() const
Definition:DenseSet.h:81
llvm::detail::DenseSetImpl::empty
bool empty() const
Definition:DenseSet.h:80
llvm::detail::DenseSetImpl::contains
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition:DenseSet.h:193
llvm::detail::DenseSetImpl::begin
iterator begin()
Definition:DenseSet.h:181
llvm::detail::DenseSetImpl::erase
bool erase(const ValueT &V)
Definition:DenseSet.h:97
llvm::detail::DenseSetImpl::count
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition:DenseSet.h:95
llvm::details::FixedOrScalableQuantity::getFixedValue
constexpr ScalarTy getFixedValue() const
Definition:TypeSize.h:202
llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition:STLFunctionalExtras.h:37
llvm::hash_code
An opaque object representing a hash code.
Definition:Hashing.h:75
llvm::ilist_detail::node_parent_access::getParent
const ParentTy * getParent() const
Definition:ilist_node.h:32
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition:ilist_node.h:132
llvm::ilist_node_with_parent::getPrevNode
NodeTy * getPrevNode()
Definition:ilist_node.h:339
llvm::iterator_adaptor_base
CRTP base class for adapting an iterator to a different type.
Definition:iterator.h:237
llvm::iterator_range
A range adaptor for a pair of iterators.
Definition:iterator_range.h:42
llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition:raw_ostream.h:52
llvm::raw_ostream::indent
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
Definition:raw_ostream.cpp:495
llvm::raw_string_ostream
A raw_ostream that writes to an std::string.
Definition:raw_ostream.h:661
llvm::raw_svector_ostream
A raw_ostream that writes to an SmallVector or SmallString.
Definition:raw_ostream.h:691
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics
A helper class used for scoring candidates for two consecutive lanes.
Definition:SLPVectorizer.cpp:1670
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreConsecutiveExtracts
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
Definition:SLPVectorizer.cpp:1708
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::getShallowScore
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
Definition:SLPVectorizer.cpp:1730
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreAllUserVectorized
static const int ScoreAllUserVectorized
Score if all users are vectorized.
Definition:SLPVectorizer.cpp:1724
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreSameOpcode
static const int ScoreSameOpcode
Instructions with the same opcode.
Definition:SLPVectorizer.cpp:1714
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreUndef
static const int ScoreUndef
Matching with an undef is preferable to failing.
Definition:SLPVectorizer.cpp:1720
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::getScoreAtLevelRec
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
Definition:SLPVectorizer.cpp:1902
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreFail
static const int ScoreFail
Score for failing to find a decent match.
Definition:SLPVectorizer.cpp:1722
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreMaskedGatherCandidate
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
Definition:SLPVectorizer.cpp:1706
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreSplat
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
Definition:SLPVectorizer.cpp:1718
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::LookAheadHeuristics
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
Definition:SLPVectorizer.cpp:1679
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreSplatLoads
static const int ScoreSplatLoads
The same load multiple times.
Definition:SLPVectorizer.cpp:1702
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreReversedLoads
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
Definition:SLPVectorizer.cpp:1704
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreConstants
static const int ScoreConstants
Constants.
Definition:SLPVectorizer.cpp:1712
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreAltOpcodes
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
Definition:SLPVectorizer.cpp:1716
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreConsecutiveLoads
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
Definition:SLPVectorizer.cpp:1697
llvm::slpvectorizer::BoUpSLP::LookAheadHeuristics::ScoreReversedExtracts
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
Definition:SLPVectorizer.cpp:1710
llvm::slpvectorizer::BoUpSLP::VLOperands
A helper data structure to hold the operands of a vector of instructions.
Definition:SLPVectorizer.cpp:1971
llvm::slpvectorizer::BoUpSLP::VLOperands::getVL
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
Definition:SLPVectorizer.cpp:2581
llvm::slpvectorizer::BoUpSLP::VLOperands::getModeStr
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
Definition:SLPVectorizer.cpp:2734
llvm::slpvectorizer::BoUpSLP::VLOperands::VLOperands
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Definition:SLPVectorizer.cpp:2571
llvm::slpvectorizer::BoUpSLP::VLOperands::dumpMode
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
Definition:SLPVectorizer.cpp:2756
llvm::slpvectorizer::BoUpSLP::VLOperands::dump
LLVM_DUMP_METHOD void dump() const
Debug print.
Definition:SLPVectorizer.cpp:2783
llvm::slpvectorizer::BoUpSLP::VLOperands::operator<<
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
Definition:SLPVectorizer.cpp:2760
llvm::slpvectorizer::BoUpSLP::VLOperands::reorder
void reorder()
Definition:SLPVectorizer.cpp:2593
llvm::slpvectorizer::BoUpSLP::VLOperands::printMode
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
Definition:SLPVectorizer.cpp:2750
llvm::slpvectorizer::BoUpSLP::VLOperands::print
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Definition:SLPVectorizer.cpp:2764
llvm::slpvectorizer::BoUpSLP
Bottom Up SLP Vectorizer.
Definition:SLPVectorizer.cpp:1319
llvm::slpvectorizer::BoUpSLP::OrdersType
SmallVector< unsigned, 4 > OrdersType
Definition:SLPVectorizer.cpp:1339
llvm::slpvectorizer::BoUpSLP::getRootNodeTypeWithNoCast
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
Definition:SLPVectorizer.cpp:1409
llvm::slpvectorizer::BoUpSLP::findPartiallyOrderedLoads
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
Definition:SLPVectorizer.cpp:5464
llvm::slpvectorizer::BoUpSLP::LoadsState
LoadsState
Tracks the state in which the loads in the given sequence can be represented.
Definition:SLPVectorizer.cpp:1327
llvm::slpvectorizer::BoUpSLP::LoadsState::ScatterVectorize
@ ScatterVectorize
llvm::slpvectorizer::BoUpSLP::LoadsState::Gather
@ Gather
llvm::slpvectorizer::BoUpSLP::LoadsState::Vectorize
@ Vectorize
llvm::slpvectorizer::BoUpSLP::LoadsState::StridedVectorize
@ StridedVectorize
llvm::slpvectorizer::BoUpSLP::operator<<
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
Definition:SLPVectorizer.cpp:4045
llvm::slpvectorizer::BoUpSLP::reorderTopToBottom
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
Definition:SLPVectorizer.cpp:5926
llvm::slpvectorizer::BoUpSLP::reorderBottomToTop
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
Definition:SLPVectorizer.cpp:6258
llvm::slpvectorizer::BoUpSLP::registerNonVectorizableLoads
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
Definition:SLPVectorizer.cpp:1627
llvm::slpvectorizer::BoUpSLP::getTreeSize
unsigned getTreeSize() const
Definition:SLPVectorizer.cpp:1487
llvm::slpvectorizer::BoUpSLP::~BoUpSLP
~BoUpSLP()
Definition:SLPVectorizer.cpp:4510
llvm::slpvectorizer::BoUpSLP::areKnownNonVectorizableLoads
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
Definition:SLPVectorizer.cpp:1633
llvm::slpvectorizer::BoUpSLP::getCanonicalGraphSize
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
Definition:SLPVectorizer.cpp:1490
llvm::slpvectorizer::BoUpSLP::areAnalyzedReductionVals
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
Definition:SLPVectorizer.cpp:2919
llvm::slpvectorizer::BoUpSLP::canVectorizeLoads
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
Definition:SLPVectorizer.cpp:5013
llvm::slpvectorizer::BoUpSLP::isLoadCombineCandidate
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
Definition:SLPVectorizer.cpp:12060
llvm::slpvectorizer::BoUpSLP::analyzedReductionVals
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
Definition:SLPVectorizer.cpp:2924
llvm::slpvectorizer::BoUpSLP::isLoadCombineReductionCandidate
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
Definition:SLPVectorizer.cpp:12050
llvm::slpvectorizer::BoUpSLP::getVectorElementSize
unsigned getVectorElementSize(Value *V)
Definition:SLPVectorizer.cpp:17662
llvm::slpvectorizer::BoUpSLP::isSignedMinBitwidthRootNode
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
Definition:SLPVectorizer.cpp:1428
llvm::slpvectorizer::BoUpSLP::analyzedReductionRoot
void analyzedReductionRoot(Instruction *I)
Register the given instruction as already analyzed for being a possible reduction root.
Definition:SLPVectorizer.cpp:2914
llvm::slpvectorizer::BoUpSLP::getRootNodeScalars
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
Definition:SLPVectorizer.cpp:1402
llvm::slpvectorizer::BoUpSLP::computeMinimumValueSizes
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
Definition:SLPVectorizer.cpp:18134
llvm::slpvectorizer::BoUpSLP::deleteTree
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
Definition:SLPVectorizer.cpp:1460
llvm::slpvectorizer::BoUpSLP::getTreeCost
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
Definition:SLPVectorizer.cpp:12459
llvm::slpvectorizer::BoUpSLP::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
Definition:SLPVectorizer.cpp:1568
llvm::slpvectorizer::BoUpSLP::ValueSet
SmallPtrSet< Value *, 16 > ValueSet
Definition:SLPVectorizer.cpp:1336
llvm::slpvectorizer::BoUpSLP::ExtraValueToDebugLocsMap
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
Definition:SLPVectorizer.cpp:1338
llvm::slpvectorizer::BoUpSLP::BoUpSLP
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
Definition:SLPVectorizer.cpp:1341
llvm::slpvectorizer::BoUpSLP::isNotScheduled
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Definition:SLPVectorizer.cpp:2942
llvm::slpvectorizer::BoUpSLP::transformNodes
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
Definition:SLPVectorizer.cpp:9761
llvm::slpvectorizer::BoUpSLP::isDeleted
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
Definition:SLPVectorizer.cpp:2813
llvm::slpvectorizer::BoUpSLP::buildExternalUses
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
Definition:SLPVectorizer.cpp:6535
llvm::slpvectorizer::BoUpSLP::isTreeTinyAndNotFullyVectorizable
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
Definition:SLPVectorizer.cpp:12073
llvm::slpvectorizer::BoUpSLP::removeInstructionsAndOperands
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
Definition:SLPVectorizer.cpp:2825
llvm::slpvectorizer::BoUpSLP::canMapToVector
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
Definition:SLPVectorizer.cpp:8898
llvm::slpvectorizer::BoUpSLP::getMinVF
unsigned getMinVF(unsigned Sz) const
Definition:SLPVectorizer.cpp:1564
llvm::slpvectorizer::BoUpSLP::isAnalyzedReductionRoot
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
Definition:SLPVectorizer.cpp:2909
llvm::slpvectorizer::BoUpSLP::getReorderingData
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
Definition:SLPVectorizer.cpp:5540
llvm::slpvectorizer::BoUpSLP::eraseInstruction
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
Definition:SLPVectorizer.cpp:2818
llvm::slpvectorizer::BoUpSLP::doesRootHaveInTreeUses
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
Definition:SLPVectorizer.cpp:1396
llvm::slpvectorizer::BoUpSLP::getORE
OptimizationRemarkEmitter * getORE()
Definition:SLPVectorizer.cpp:1637
llvm::slpvectorizer::BoUpSLP::isAnyGathered
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
Definition:SLPVectorizer.cpp:2934
llvm::slpvectorizer::BoUpSLP::ValueList
SmallVector< Value *, 8 > ValueList
Definition:SLPVectorizer.cpp:1334
llvm::slpvectorizer::BoUpSLP::buildTree
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
Definition:SLPVectorizer.cpp:6759
llvm::slpvectorizer::BoUpSLP::isTreeNotExtendable
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
Definition:SLPVectorizer.cpp:12157
llvm::slpvectorizer::BoUpSLP::getReductionType
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
Definition:SLPVectorizer.cpp:1433
llvm::slpvectorizer::BoUpSLP::getMaxVecRegSize
unsigned getMaxVecRegSize() const
Definition:SLPVectorizer.cpp:1555
llvm::slpvectorizer::BoUpSLP::isVectorized
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
Definition:SLPVectorizer.cpp:2947
llvm::slpvectorizer::BoUpSLP::isIdentityOrder
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
Definition:SLPVectorizer.cpp:1499
llvm::slpvectorizer::BoUpSLP::isGathered
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
Definition:SLPVectorizer.cpp:2938
llvm::slpvectorizer::BoUpSLP::getSpillCost
InstructionCost getSpillCost() const
Definition:SLPVectorizer.cpp:12185
llvm::slpvectorizer::BoUpSLP::getMinVecRegSize
unsigned getMinVecRegSize() const
Definition:SLPVectorizer.cpp:1560
llvm::slpvectorizer::BoUpSLP::vectorizeTree
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
Definition:SLPVectorizer.cpp:16301
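Taken together, the BoUpSLP entry points indexed here are normally driven in a fixed sequence by the pass itself. The following is only a sketch of that sequence, written against the signatures shown in this index; the function name and the cost-threshold parameter are illustrative, and BoUpSLP is local to SLPVectorizer.cpp, so this is not standalone code.

// Sketch only: the usual order in which the BoUpSLP API is exercised.
static bool trySLPVectorize(slpvectorizer::BoUpSLP &R, ArrayRef<Value *> VL,
                            InstructionCost CostThreshold) {
  SmallDenseSet<Value *> UserIgnoreLst;     // no externally ignored scalars here
  R.buildTree(VL, UserIgnoreLst);           // build the vectorizable tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                           // too small to be profitable
  R.transformNodes();                       // target-specific node rewrites
  R.buildExternalUses();                    // record out-of-tree users
  R.computeMinimumValueSizes();             // min-bitwidth analysis
  if (R.getTreeCost() >= CostThreshold)     // illustrative threshold check
    return false;                           // vector code is not cheaper
  R.vectorizeTree();                        // emit the vector code
  return true;
}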
llvm::slpvectorizer::BoUpSLP::findBestRootPair
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score...
Definition:SLPVectorizer.cpp:2793
llvm::slpvectorizer::BoUpSLP::findReusedOrderedScalars
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
Definition:SLPVectorizer.cpp:4610
llvm::slpvectorizer::BoUpSLP::clearReductionData
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
Definition:SLPVectorizer.cpp:2928
llvm::slpvectorizer::BoUpSLP::optimizeGatherSequence
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Definition:SLPVectorizer.cpp:16957
uint32_t
uint64_t
unsigned
llvm::VFDatabase::getVectorizedFunction
Function * getVectorizedFunction(const VFShape &Shape) const
Definition:VectorUtils.h:106
iterator.h
iterator_range.h
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
ErrorHandling.h
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition:ErrorHandling.h:143
llvm::AArch64CC::LS
@ LS
Definition:AArch64BaseInfo.h:264
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition:AMDGPUMetadata.h:395
llvm::AMDGPU::PALMD::Key
Key
PAL metadata keys.
Definition:AMDGPUMetadata.h:487
llvm::AMDGPU::VGPRIndexMode::Id
Id
Definition:SIDefines.h:310
llvm::AMDGPU::P1
@ P1
Definition:AMDGPURegBankLegalizeRules.h:53
llvm::ARMII::HorizontalReduction
@ HorizontalReduction
Definition:ARMBaseInfo.h:425
llvm::ARM_MB::ST
@ ST
Definition:ARMBaseInfo.h:73
llvm::ARM_PROC::IE
@ IE
Definition:ARMBaseInfo.h:27
llvm::ARM::PredBlockMask::TE
@ TE
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition:BitmaskEnum.h:125
llvm::COFF::Entry
@ Entry
Definition:COFF.h:844
llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition:CallingConv.h:34
llvm::Intrinsic::getOrInsertDeclaration
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition:Intrinsics.cpp:732
llvm::Intrinsic::not_intrinsic
@ not_intrinsic
Definition:Intrinsics.h:44
llvm::LegalityPredicates::all
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
Definition:LegalizerInfo.h:234
llvm::M68kBeads::Term
@ Term
Definition:M68kBaseInfo.h:116
llvm::M68k::MemAddrModeKind::U
@ U
llvm::M68k::MemAddrModeKind::V
@ V
llvm::M68k::MemAddrModeKind::u
@ u
llvm::M68k::MemAddrModeKind::K
@ K
llvm::M68k::MemAddrModeKind::L
@ L
llvm::MipsISD::Ext
@ Ext
Definition:MipsISelLowering.h:157
llvm::MipsISD::Ins
@ Ins
Definition:MipsISelLowering.h:158
llvm::NVPTX::PTXLdStInstCode::Scalar
@ Scalar
Definition:NVPTX.h:162
llvm::NVPTX::PTXLdStInstCode::V2
@ V2
Definition:NVPTX.h:163
llvm::PatternMatch
Definition:PatternMatch.h:47
llvm::PatternMatch::m_Store
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
Definition:PatternMatch.h:1930
llvm::PatternMatch::m_And
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1216
llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1102
llvm::PatternMatch::m_BinOp
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition:PatternMatch.h:100
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1228
llvm::PatternMatch::m_FMul
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1174
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition:PatternMatch.h:49
llvm::PatternMatch::m_Instruction
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition:PatternMatch.h:826
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition:PatternMatch.h:885
llvm::PatternMatch::m_ExtractElt
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
Definition:PatternMatch.h:1837
llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition:PatternMatch.h:168
llvm::PatternMatch::m_Select
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
Definition:PatternMatch.h:1799
llvm::PatternMatch::m_SMin
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
Definition:PatternMatch.h:2348
llvm::PatternMatch::m_FAdd
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1108
llvm::PatternMatch::m_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1168
llvm::PatternMatch::m_OneUse
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition:PatternMatch.h:67
llvm::PatternMatch::m_LogicalOr
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
Definition:PatternMatch.h:3099
llvm::PatternMatch::m_Load
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
Definition:PatternMatch.h:1923
llvm::PatternMatch::m_ZExt
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
Definition:PatternMatch.h:2107
llvm::PatternMatch::m_UMax
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
Definition:PatternMatch.h:2354
llvm::PatternMatch::m_Cmp
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition:PatternMatch.h:105
llvm::PatternMatch::m_SMax
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
Definition:PatternMatch.h:2342
llvm::PatternMatch::m_APInt
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition:PatternMatch.h:299
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition:PatternMatch.h:92
llvm::PatternMatch::m_ZExtOrSExt
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
Definition:PatternMatch.h:2138
llvm::PatternMatch::m_Shl
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1234
llvm::PatternMatch::m_LogicalAnd
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
Definition:PatternMatch.h:3081
llvm::PatternMatch::m_Undef
auto m_Undef()
Match an arbitrary undef constant.
Definition:PatternMatch.h:152
llvm::PatternMatch::m_Or
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
Definition:PatternMatch.h:1222
llvm::PatternMatch::m_UMin
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
Definition:PatternMatch.h:2360
llvm::PatternMatch::m_CombineOr
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition:PatternMatch.h:239
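The PatternMatch helpers indexed above compose declaratively through match(); the following is a small, generic illustration and not code taken from this file (the helper name is made up).

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Recognize `(X << C) | Y` where the shift has a single use, binding X and Y.
static bool matchShiftOr(Value *V, Value *&X, Value *&Y) {
  const APInt *ShAmt = nullptr;
  return match(V, m_Or(m_OneUse(m_Shl(m_Value(X), m_APInt(ShAmt))),
                       m_Value(Y)));
}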
llvm::RISCVFenceField::R
@ R
Definition:RISCVBaseInfo.h:373
llvm::SIEncodingFamily::VI
@ VI
Definition:SIDefines.h:37
llvm::SIEncodingFamily::SI
@ SI
Definition:SIDefines.h:36
llvm::SPII::Store
@ Store
Definition:SparcInstrInfo.h:33
llvm::X86AS::GS
@ GS
Definition:X86.h:210
llvm::X86::FirstMacroFusionInstKind::Cmp
@ Cmp
llvm::ZeroCallUsedRegs::ZeroCallUsedRegsKind::Used
@ Used
llvm::cl::Hidden
@ Hidden
Definition:CommandLine.h:137
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition:CommandLine.h:443
llvm::codeview::EncodedFramePtrReg::BasePtr
@ BasePtr
llvm::codeview::ExportFlags::IsConstant
@ IsConstant
llvm::dwarf::Index
Index
Definition:Dwarf.h:882
llvm::dxil::ElementType::I1
@ I1
llvm::logicalview::LVPrintKind::Instructions
@ Instructions
llvm::objcopy::AdjustKind::Set
@ Set
llvm::omp::RTLDependInfoFields::Len
@ Len
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition:OptimizationRemarkEmitter.h:135
llvm::pdb::PDB_MemoryType::Stack
@ Stack
llvm::sampleprof::Base
@ Base
Definition:Discriminator.h:58
llvm::sys::path::begin
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition:Path.cpp:226
llvm::tgtok::In
@ In
Definition:TGLexer.h:84
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition:AddressRanges.h:18
llvm::drop_begin
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition:STLExtras.h:329
llvm::getPointersDiff
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
Definition:LoopAccessAnalysis.cpp:1535
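getPointersDiff is the building block behind consecutive-access checks; below is a hedged sketch of such a check, with a hypothetical helper name and DL/SE assumed to come from the enclosing pass.

// Sketch: two loads of the same element type are consecutive if the pointer
// distance is exactly one element.
static bool areConsecutiveLoads(llvm::LoadInst *L0, llvm::LoadInst *L1,
                                const llvm::DataLayout &DL,
                                llvm::ScalarEvolution &SE) {
  std::optional<int> Diff =
      llvm::getPointersDiff(L0->getType(), L0->getPointerOperand(),
                            L1->getType(), L1->getPointerOperand(), DL, SE,
                            /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}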
llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition:SparseBitVector.h:877
llvm::createSimpleReduction
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition:LoopUtils.cpp:1278
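createSimpleReduction is how a widened reduction is finally collapsed back to a scalar; a minimal sketch, assuming the builder is already positioned at the desired insertion point.

#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

// Reduce all lanes of VecVal with an integer add reduction.
static llvm::Value *emitAddReduction(llvm::IRBuilderBase &Builder,
                                     llvm::Value *VecVal) {
  return llvm::createSimpleReduction(Builder, VecVal, llvm::RecurKind::Add);
}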
llvm::doesNotNeedToBeScheduled
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
Definition:SLPVectorizer.cpp:1287
llvm::Offset
@ Offset
Definition:DWP.cpp:480
llvm::zip
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition:STLExtras.h:854
llvm::stable_sort
void stable_sort(R &&Range)
Definition:STLExtras.h:2037
llvm::find
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1759
llvm::for_each
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1732
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1739
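The range wrappers indexed here (all_of, any_of, none_of, find_if, ...) take a container or range directly instead of an explicit begin/end pair; a generic illustration unrelated to this pass:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"

// True iff every element is positive and none of them equals Bad.
static bool allPositiveAndNot(llvm::ArrayRef<int> Vals, int Bad) {
  return llvm::all_of(Vals, [](int V) { return V > 0; }) &&
         llvm::none_of(Vals, [Bad](int V) { return V == Bad; });
}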
llvm::hash_value
hash_code hash_value(const FixedPointSemantics &Val)
Definition:APFixedPoint.h:136
llvm::getMinMaxReductionIntrinsicOp
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition:LoopUtils.cpp:989
llvm::isEqual
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
Definition:GCNRegPressure.cpp:22
llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition:STLExtras.h:1697
llvm::RecursivelyDeleteTriviallyDeadInstructions
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition:Local.cpp:546
llvm::getVectorIntrinsicIDForCall
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
Definition:VectorUtils.cpp:209
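getVectorIntrinsicIDForCall pairs naturally with isTriviallyVectorizable (listed further down in this index) when deciding whether a call bundle can be widened; a hedged sketch with a hypothetical helper name:

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"

// Does this call map to an intrinsic with a trivially vectorizable form?
static bool callLooksVectorizable(const llvm::CallInst *CI,
                                  const llvm::TargetLibraryInfo *TLI) {
  llvm::Intrinsic::ID ID = llvm::getVectorIntrinsicIDForCall(CI, TLI);
  return ID != llvm::Intrinsic::not_intrinsic &&
         llvm::isTriviallyVectorizable(ID);
}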
llvm::reorderScalars
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
Definition:SLPVectorizer.cpp:1238
llvm::make_scope_exit
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition:ScopeExit.h:59
llvm::Depth
@ Depth
Definition:SIMachineScheduler.h:36
llvm::enumerate
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition:STLExtras.h:2448
llvm::set_intersect
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
Definition:SetOperations.h:58
llvm::AlignStyle::Right
@ Right
llvm::AlignStyle::Left
@ Left
llvm::verifyFunction
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition:Verifier.cpp:7301
llvm::salvageDebugInfo
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition:Utils.cpp:1683
llvm::Failed
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition:Error.h:198
llvm::isUsedOutsideBlock
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
Definition:SLPVectorizer.cpp:1270
llvm::canConvertToMinOrMaxIntrinsic
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
Definition:ValueTracking.cpp:9167
llvm::set_is_subset
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
Definition:SetOperations.h:151
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
Definition:ValueTracking.cpp:6768
llvm::interleaveComma
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition:STLExtras.h:2207
llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition:STLExtras.h:657
llvm::alignDown
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
Definition:MathExtras.h:556
llvm::post_order
iterator_range< po_iterator< T > > post_order(const T &G)
Definition:PostOrderIterator.h:197
llvm::getAlign
MaybeAlign getAlign(const Function &F, unsigned Index)
Definition:NVPTXUtilities.cpp:323
llvm::propagateMetadata
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
Definition:VectorUtils.cpp:942
llvm::bit_ceil
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition:bit.h:342
llvm::copy_if
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1785
llvm::isGather
bool isGather(IntrinsicInst *IntInst)
Definition:ARMBaseInstrInfo.h:937
llvm::getPointerOperand
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
Definition:Instructions.h:4998
llvm::PowerOf2Ceil
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition:MathExtras.h:395
llvm::MaskedValueIsZero
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
Definition:ValueTracking.cpp:333
llvm::erase
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition:STLExtras.h:2107
llvm::transform
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition:STLExtras.h:1952
llvm::has_single_bit
constexpr bool has_single_bit(T Value) noexcept
Definition:bit.h:146
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1746
llvm::isInstructionTriviallyDead
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition:Local.cpp:406
llvm::Log2_32
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition:MathExtras.h:341
llvm::createStrideMask
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
Definition:VectorUtils.cpp:1032
llvm::reverse
auto reverse(ContainerTy &&C)
Definition:STLExtras.h:420
llvm::inversePermutation
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
Definition:SLPVectorizer.cpp:1228
llvm::get
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
Definition:PointerIntPair.h:270
llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition:STLExtras.h:1664
llvm::createReplicatedMask
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
Definition:VectorUtils.cpp:1012
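The two mask builders above are easiest to read from their output; a small hedged illustration of the masks they produce (values shown in comments).

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"

static void maskHelpersDemo() {
  // createStrideMask(Start, Stride, VF): lanes Start, Start+Stride, ...
  llvm::SmallVector<int, 16> Strided = llvm::createStrideMask(0, 2, 4);
  // Strided == {0, 2, 4, 6}

  // createReplicatedMask(ReplicationFactor, VF): repeat each lane RF times.
  llvm::SmallVector<int, 16> Replicated = llvm::createReplicatedMask(3, 2);
  // Replicated == {0, 0, 0, 1, 1, 1}
  (void)Strided; (void)Replicated;
}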
llvm::ComplexDeinterleavingOperation::Splat
@ Splat
llvm::find_if_not
auto find_if_not(R &&Range, UnaryPredicate P)
Definition:STLExtras.h:1771
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition:Debug.cpp:163
llvm::hasFullVectorsOrPowerOf2
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
Definition:SLPVectorizer.cpp:1303
llvm::isPointerTy
bool isPointerTy(const Type *T)
Definition:SPIRVUtils.h:256
llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1753
llvm::wouldInstructionBeTriviallyDead
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition:Local.cpp:425
llvm::isModOrRefSet
bool isModOrRefSet(const ModRefInfo MRI)
Definition:ModRef.h:42
llvm::isSafeToSpeculativelyExecute
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
Definition:ValueTracking.cpp:7043
llvm::sortPtrAccesses
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
Definition:LoopAccessAnalysis.cpp:1600
llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition:Casting.h:548
llvm::propagateIRFlags
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition:LoopUtils.cpp:1368
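propagateMetadata (listed earlier in this index) and propagateIRFlags are typically applied together right after a vector instruction is created for a scalar bundle; a hedged sketch with a hypothetical helper name:

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

// VecInst replaces the scalar bundle VL: keep only the metadata and the
// IR flags (nuw/nsw, fast-math, ...) that all scalars agree on.
static void copyScalarInfo(llvm::Instruction *VecInst,
                           llvm::ArrayRef<llvm::Value *> VL) {
  llvm::propagateMetadata(VecInst, VL);
  llvm::propagateIRFlags(VecInst, VL);
}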
llvm::PoisonMaskElem
constexpr int PoisonMaskElem
Definition:Instructions.h:1889
llvm::ModRefInfo::Ref
@ Ref
The access may reference the value stored in memory.
llvm::divideCeil
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition:MathExtras.h:404
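The integer helpers indexed around here (divideCeil, PowerOf2Ceil, Log2_32, bit_ceil, ...) recur throughout the cost and bitwidth logic; a few concrete values as an illustration:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"

static void mathHelpersDemo() {
  unsigned A = llvm::divideCeil(10u, 4u); // 3: ceil(10 / 4)
  uint64_t B = llvm::PowerOf2Ceil(17);    // 32: next power of two >= 17
  unsigned C = llvm::Log2_32(32);         // 5: floor(log2(32))
  unsigned D = llvm::bit_ceil(5u);        // 8: smallest power of two >= 5
  (void)A; (void)B; (void)C; (void)D;
}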
llvm::IRMemLocation::Other
@ Other
Any other memory.
llvm::IRMemLocation::First
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
llvm::TTI
TargetTransformInfo TTI
Definition:TargetTransformInfo.h:208
llvm::getMinMaxReductionPredicate
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
Definition:LoopUtils.cpp:1054
llvm::RecurKind
RecurKind
These are the kinds of recurrences that we support.
Definition:IVDescriptors.h:33
llvm::RecurKind::Or
@ Or
Bitwise or logical OR of integers.
llvm::RecurKind::None
@ None
Not a recurrence.
llvm::isVectorIntrinsicWithScalarOpAtArg
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
Definition:VectorUtils.cpp:134
llvm::areAllOperandsNonInsts
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
Definition:SLPVectorizer.cpp:1253
llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition:Alignment.h:155
llvm::count
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition:STLExtras.h:1938
llvm::Op
DWARFExpression::Operation Op
Definition:DWARFExpression.cpp:22
llvm::max_element
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition:STLExtras.h:2014
llvm::ViewGraph
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition:GraphWriter.h:427
llvm::copy
OutputIt copy(R &&Range, OutputIt Out)
Definition:STLExtras.h:1841
llvm::doesNotNeedToSchedule
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
Definition:SLPVectorizer.cpp:1295
llvm::BitWidth
constexpr unsigned BitWidth
Definition:BitmaskEnum.h:217
llvm::isGuaranteedToTransferExecutionToSuccessor
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
Definition:ValueTracking.cpp:7920
llvm::PseudoProbeReservedId::Last
@ Last
llvm::count_if
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition:STLExtras.h:1945
llvm::find_if
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1766
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition:STLExtras.h:1903
llvm::ComputeNumSignBits
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
Definition:ValueTracking.cpp:351
llvm::Cost
InstructionCost Cost
Definition:FunctionSpecialization.h:102
llvm::seq
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition:Sequence.h:305
llvm::VFParamKind::Vector
@ Vector
llvm::hash_combine
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition:Hashing.h:590
llvm::isGuaranteedNotToBePoison
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
Definition:ValueTracking.cpp:7849
llvm::bit_floor
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition:bit.h:327
llvm::ConstantFoldIntegerCast
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
Definition:ConstantFolding.cpp:1549
llvm::Data
@ Data
Definition:SIMachineScheduler.h:55
llvm::isKnownNonNegative
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
Definition:ValueTracking.cpp:292
llvm::mayHaveNonDefUseDependency
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
Definition:ValueTracking.cpp:7156
llvm::isTriviallyVectorizable
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition:VectorUtils.cpp:46
llvm::isVectorIntrinsicWithOverloadTypeAtArg
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
Definition:VectorUtils.cpp:162
llvm::hash_combine_range
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition:Hashing.h:468
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition:BitVector.h:860
raw_ostream.h
N
#define N
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition:Alignment.h:39
llvm::CallBase::BundleOpInfo
Used to keep track of an operand bundle.
Definition:InstrTypes.h:2138
llvm::CodeMetrics::collectEphemeralValues
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition:CodeMetrics.cpp:71
llvm::DOTGraphTraits< BoUpSLP * >::TreeEntry
BoUpSLP::TreeEntry TreeEntry
Definition:SLPVectorizer.cpp:4476
llvm::DOTGraphTraits< BoUpSLP * >::getNodeLabel
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
Definition:SLPVectorizer.cpp:4480
llvm::DOTGraphTraits< BoUpSLP * >::getNodeAttributes
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
Definition:SLPVectorizer.cpp:4497
llvm::DOTGraphTraits< BoUpSLP * >::DOTGraphTraits
DOTGraphTraits(bool IsSimple=false)
Definition:SLPVectorizer.cpp:4478
llvm::DOTGraphTraits
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
Definition:DOTGraphTraits.h:166
llvm::DefaultDOTGraphTraits
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Definition:DOTGraphTraits.h:28
llvm::DiagnosticInfoOptimizationBase::Argument
Used in the streaming interface as the general argument type.
Definition:DiagnosticInfo.h:499
llvm::GraphTraits< BoUpSLP * >::ChildIteratorType::operator*
NodeRef operator*()
Definition:SLPVectorizer.cpp:4433
llvm::GraphTraits< BoUpSLP * >::ChildIteratorType::ChildIteratorType
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
Definition:SLPVectorizer.cpp:4429
llvm::GraphTraits< BoUpSLP * >::ChildIteratorType::VectorizableTree
ContainerTy & VectorizableTree
Definition:SLPVectorizer.cpp:4427
llvm::GraphTraits< BoUpSLP * >::child_end
static ChildIteratorType child_end(NodeRef N)
Definition:SLPVectorizer.cpp:4444
llvm::GraphTraits< BoUpSLP * >::getEntryNode
static NodeRef getEntryNode(BoUpSLP &R)
Definition:SLPVectorizer.cpp:4436
llvm::GraphTraits< BoUpSLP * >::child_begin
static ChildIteratorType child_begin(NodeRef N)
Definition:SLPVectorizer.cpp:4440
llvm::GraphTraits< BoUpSLP * >::nodes_begin
static nodes_iterator nodes_begin(BoUpSLP *R)
Definition:SLPVectorizer.cpp:4464
llvm::GraphTraits< BoUpSLP * >::NodeRef
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
Definition:SLPVectorizer.cpp:4418
llvm::GraphTraits< BoUpSLP * >::size
static unsigned size(BoUpSLP *R)
Definition:SLPVectorizer.cpp:4472
llvm::GraphTraits< BoUpSLP * >::TreeEntry
BoUpSLP::TreeEntry TreeEntry
Definition:SLPVectorizer.cpp:4415
llvm::GraphTraits< BoUpSLP * >::nodes_end
static nodes_iterator nodes_end(BoUpSLP *R)
Definition:SLPVectorizer.cpp:4468
llvm::GraphTraits
Definition:GraphTraits.h:38
llvm::Incoming
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
Definition:SILowerI1Copies.h:25
llvm::Loop::LoopBounds::Direction
Direction
An enum for the direction of the loop.
Definition:LoopInfo.h:215
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition:Alignment.h:117
llvm::MinMax
Definition:AssumeBundleQueries.h:70
llvm::SLPVectorizerPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition:SLPVectorizer.cpp:18478
llvm::SLPVectorizerPass::runImpl
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Definition:SLPVectorizer.cpp:18498
llvm::SimplifyQuery
Definition:SimplifyQuery.h:70
llvm::SmallMapVector
A MapVector that performs no allocations if smaller than a certain size.
Definition:MapVector.h:254
llvm::TargetTransformInfo::OperandValueInfo
Definition:TargetTransformInfo.h:1135
llvm::TargetTransformInfo::PointersChainInfo
Describe known properties for a set of pointers.
Definition:TargetTransformInfo.h:311
llvm::VFShape
Contains the information about the kind of vectorization available.
Definition:VFABIDemangler.h:83
llvm::VFShape::get
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Definition:VFABIDemangler.h:108
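VFShape::get combined with VFDatabase::getVectorizedFunction (listed near the top of this index) maps a scalar call to a vector library variant; a hedged sketch, assuming the usual VFDatabase(CallInst &) constructor and a hypothetical helper name:

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"

// Look up an unmasked vector variant of CI's callee for VF lanes.
static llvm::Function *findVectorVariant(llvm::CallInst &CI,
                                         llvm::ElementCount VF) {
  llvm::VFDatabase DB(CI);
  llvm::VFShape Shape =
      llvm::VFShape::get(CI.getFunctionType(), VF, /*HasGlobalPred=*/false);
  return DB.getVectorizedFunction(Shape);
}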
llvm::cl::desc
Definition:CommandLine.h:409
llvm::less_first
Function object to check whether the first component of a container supported by std::get (like std::...
Definition:STLExtras.h:1467
llvm::less_second
Function object to check whether the second component of a container supported by std::get (like std:...
Definition:STLExtras.h:1476
llvm::slpvectorizer::BoUpSLP::EdgeInfo
This structure holds any data we need about the edges being traversed during buildTree_rec().
Definition:SLPVectorizer.cpp:1643
llvm::slpvectorizer::BoUpSLP::EdgeInfo::EdgeIdx
unsigned EdgeIdx
The operand index of the use.
Definition:SLPVectorizer.cpp:1650
llvm::slpvectorizer::BoUpSLP::EdgeInfo::EdgeInfo
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
Definition:SLPVectorizer.cpp:1645
llvm::slpvectorizer::BoUpSLP::EdgeInfo::dump
LLVM_DUMP_METHOD void dump() const
Definition:SLPVectorizer.cpp:1662
llvm::slpvectorizer::BoUpSLP::EdgeInfo::UserTE
TreeEntry * UserTE
The user TreeEntry.
Definition:SLPVectorizer.cpp:1648
llvm::slpvectorizer::BoUpSLP::EdgeInfo::EdgeInfo
EdgeInfo()=default
llvm::slpvectorizer::BoUpSLP::EdgeInfo::operator<<
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
Definition:SLPVectorizer.cpp:1652
llvm::slpvectorizer::BoUpSLP::EdgeInfo::dump
void dump(raw_ostream &OS) const
Debug print.
Definition:SLPVectorizer.cpp:1658
