//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115"Controls which SLP graphs should be vectorized.");
119cl::desc(
"Run the SLP vectorization passes"));
123cl::desc(
"Enable vectorization for wider vector utilization"));
127cl::desc(
"Only vectorize if you gain more than this " 132cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on " 133"heuristics and makes vectorization decision via cost modeling."));
137cl::desc(
"Attempt to vectorize horizontal reductions"));
142"Attempt to vectorize horizontal reductions feeding into a store"));
146cl::desc(
"Attempt to vectorize for this register size in bits"));
150cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
152/// Limits the size of scheduling regions in a block. 153/// It avoid long compile times for _very_ large blocks where vector 154/// instructions are spread over a wide range. 155/// This limit is way higher than needed by real-world functions. 158cl::desc(
"Limit the size of the SLP scheduling region per block"));
162cl::desc(
"Attempt to vectorize for this register size in bits"));
166cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
170cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
172// The maximum depth that the look-ahead score heuristic will explore. 173// The higher this value, the higher the compilation time overhead. 176cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
178// The maximum depth that the look-ahead score heuristic will explore 179// when it probing among candidates for vectorization tree roots. 180// The higher this value, the higher the compilation time overhead but unlike 181// similar limit for operands ordering this is less frequently used, hence 182// impact of higher value is less noticeable. 185cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
189cl::desc(
"The minimum number of loads, which should be considered strided, " 190"if the stride is > 1 or is runtime value"));
194cl::desc(
"The maximum stride, considered to be profitable."));
198cl::desc(
"Display the SLP trees with Graphviz"));
202cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.

/// Maximum allowed number of operands in the PHI nodes.

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();

/// Returns the type of the given value/instruction \p V. If it is a store,
/// returns the type of its value operand; for a Cmp - the type of the compare
/// operands; and for insertelement - the type of the inserted operand.
/// Otherwise, just the type of the value is returned.
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
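// For example (illustrative): for `store i32 %x, ptr %p` this helper returns
// i32, for `icmp ult i64 %a, %b` it returns i64, and for
// `insertelement <4 x float> %v, float %f, i32 0` it returns float.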
/// \returns the number of elements for Ty.
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();

/// \returns the vector type of ScalarTy based on vectorization factor.

/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms type, which splits by \p TTI into whole vector types during
/// legalization.
                                               Type *Ty, unsigned Sz) {
  // Find the number of elements which forms full vectors.
  if (NumParts == 0 || NumParts >= Sz)

/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, which forms type, which splits by \p TTI into whole vector types during
/// legalization.
  // Find the number of elements which forms full vectors.
  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;

  // The ShuffleBuilder implementation uses shufflevector to splat an
  // "element". But the element has a different meaning for SLP (scalar) and
  // REVEC (vector). We need to expand Mask into masks which shufflevector
  // can use.
  for (unsigned I : seq<unsigned>(Mask.size()))
                          I * VecTyNumElements, VecTyNumElements)))
                   : Mask[I] * VecTyNumElements + J;

/// \returns the number of groups of shufflevector
/// A group has the following features
/// 1. All of the values in a group are shufflevectors.
/// 2. The mask of all shufflevectors is isExtractSubvectorMask.
/// 3. The mask of all shufflevectors uses all of the elements of the source.
/// e.g., it is 1 group (%0)
///   %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
///        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
///   %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
///        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// it is 2 groups (%3 and %4)
///   %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
///   %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// it is 0 group
///   %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
///         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
///         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
      auto *SV = cast<ShuffleVectorInst>(V);
      // From the same source.
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
    if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");

/// \returns a shufflevector mask which is used to vectorize shufflevectors
/// e.g.,
///   %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
///   %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// the result is
///   <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned AccumulateLength = 0;
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
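// For example, `i32 7` or `float 1.0` counts as a constant here, while a
// GlobalValue such as `@g` or a constant `getelementptr` expression does not.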
/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
  if (isa<ExtractElementInst>(I))
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");

/// Returns power-of-2 number of elements in a single register (part), given
/// the total number of elements \p Size and number of registers (parts) \p
/// NumParts.

/// Returns correct remaining number of elements, considering total amount \p
/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
/// and current register (part) \p Part.
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
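// For example, with Size = 10 scalars split into parts of PartNumElems = 4,
// parts 0, 1 and 2 hold 4, 4 and 2 elements respectively.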
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
  auto *It = find_if(VL, IsaPred<Instruction>);
    if (isa<PoisonValue>(V))
    auto *II = dyn_cast<Instruction>(V);
    if (BB != II->getParent())

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
  // Constant expressions and globals can't be vectorized like normal
  // integer/FP constants.

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
  Value *FirstNonUndef = nullptr;
    if (isa<UndefValue>(V))
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
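// For example, {%x, undef, %x, %x} is treated as a splat of %x, while
// {undef, undef} is not, since there is no non-undef value to splat.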
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
              // Commutative, if icmp eq/ne sub, 0
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
              // Commutative, if abs(sub nsw, true) or abs(sub, false).
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
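// For example (illustrative), a non-commutative `sub` is still treated as
// commutative here when its only use is a zero-compare:
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0
// since icmp eq (sub %a, %b), 0 gives the same result as
// icmp eq (sub %b, %a), 0.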
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();

/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using Offset as base offset for index.
/// \returns std::nullopt if the index is not an immediate.
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();

/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
  FirstArg,    ///< The mask is expected to be for permutation of 1-2 vectors,
               ///< check for the mask elements for the first argument (mask
               ///< indices are in range [0:VF)).
  SecondArg,   ///< The mask is expected to be for permutation of 2 vectors,
               ///< check for the mask elements for the second argument (mask
               ///< indices are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused
               ///< since they're already marked as used in the mask.

/// Prepares a use bitset for the given mask either for the first argument or
/// the second one.
  if (MaskArg == UseMask::UndefsAsMask)
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  auto *C = dyn_cast<Constant>(V);
  if (!UseMask.empty()) {
    while (auto *II = dyn_cast<InsertElementInst>(Base)) {
      if (isa<T>(II->getOperand(1)))
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    // TODO: Add analysis for shuffles here too.
      Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
        (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
///                                                         i32 6>
/// %2 = mul <4 x i8> %1, %1
///
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
    auto *EE = dyn_cast<ExtractElementInst>(V);
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
    if (isa<UndefValue>(EI->getIndexOperand()))
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    // Undefined behavior if Idx is negative or >= Size.
    unsigned IntIdx = Idx->getValue().getZExtValue();
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    // If the extract index is not the same as the operation number, it is a
    // permutation.
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have permutation of 2 vectors.

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI->getZExtValue();
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
/// Main data required for vectorization of instructions.
class InstructionsState {
  /// The main/alternate instruction. MainOp is also VL0.
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;

  /// Checks if the current state is valid, i.e. has non-null MainOp
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};

} // end anonymous namespace

/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// Example of unsupported opcode is SDIV that can potentially cause UB if the
/// "shuffled out" lane would result in division by zero.

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||

/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
all_of(VL, IsaPred<Instruction, PoisonValue>))
913return InstructionsState::invalid();
915auto *It =
find_if(VL, IsaPred<Instruction>);
917return InstructionsState::invalid();
920unsigned InstCnt = std::count_if(It, VL.
end(), IsaPred<Instruction>);
921if ((VL.
size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.
size() / 2) ||
922 (VL.
size() == 2 && InstCnt < 2))
923return InstructionsState::invalid();
925bool IsCastOp = isa<CastInst>(MainOp);
926bool IsBinOp = isa<BinaryOperator>(MainOp);
927bool IsCmpOp = isa<CmpInst>(MainOp);
932unsigned AltOpcode = Opcode;
934bool SwappedPredsCompatible = IsCmpOp && [&]() {
936 UniquePreds.
insert(BasePred);
937 UniqueNonSwappedPreds.
insert(BasePred);
939auto *
I = dyn_cast<CmpInst>(V);
945 UniqueNonSwappedPreds.
insert(CurrentPred);
946if (!UniquePreds.
contains(CurrentPred) &&
947 !UniquePreds.
contains(SwappedCurrentPred))
948 UniquePreds.
insert(CurrentPred);
950// Total number of predicates > 2, but if consider swapped predicates 951// compatible only 2, consider swappable predicates as compatible opcodes, 953return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
955// Check for one alternate opcode from another BinaryOperator. 956// TODO - generalize to support all operators (types, calls etc.). 959if (
auto *
CallBase = dyn_cast<CallInst>(MainOp)) {
963return InstructionsState::invalid();
965bool AnyPoison = InstCnt != VL.
size();
968auto *
I = dyn_cast<Instruction>(V);
972// Cannot combine poison and divisions. 973// TODO: do some smart analysis of the CallInsts to exclude divide-like 974// intrinsics/functions only. 975if (AnyPoison && (
I->isIntDivRem() ||
I->isFPDivRem() || isa<CallInst>(
I)))
976return InstructionsState::invalid();
977unsigned InstOpcode =
I->getOpcode();
978if (IsBinOp && isa<BinaryOperator>(
I)) {
979if (InstOpcode == Opcode || InstOpcode == AltOpcode)
983 AltOpcode = InstOpcode;
987 }
elseif (IsCastOp && isa<CastInst>(
I)) {
990Value *Op1 =
I->getOperand(0);
993if (InstOpcode == Opcode || InstOpcode == AltOpcode)
995if (Opcode == AltOpcode) {
998"Cast isn't safe for alternation, logic needs to be updated!");
999 AltOpcode = InstOpcode;
1004 }
elseif (
auto *Inst = dyn_cast<CmpInst>(
I); Inst && IsCmpOp) {
1005auto *BaseInst = cast<CmpInst>(MainOp);
1006Type *Ty0 = BaseInst->getOperand(0)->getType();
1007Type *Ty1 = Inst->getOperand(0)->getType();
1009assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
1010assert(InstOpcode == AltOpcode &&
1011"Alternate instructions are only supported by BinaryOperator " 1013// Check for compatible operands. If the corresponding operands are not 1014// compatible - need to perform alternate vectorization. 1019if ((VL.
size() == 2 || SwappedPredsCompatible) &&
1020 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1025auto *AltInst = cast<CmpInst>(AltOp);
1026if (MainOp != AltOp) {
1029 }
elseif (BasePred != CurrentPred) {
1032"CmpInst isn't safe for alternation, logic needs to be updated!");
1037if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1038 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1041 }
elseif (InstOpcode == Opcode) {
1042assert(InstOpcode == AltOpcode &&
1043"Alternate instructions are only supported by BinaryOperator and " 1045if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
1046if (Gep->getNumOperands() != 2 ||
1048return InstructionsState::invalid();
1049 }
elseif (
auto *EI = dyn_cast<ExtractElementInst>(
I)) {
1051return InstructionsState::invalid();
1052 }
elseif (
auto *LI = dyn_cast<LoadInst>(
I)) {
1053auto *BaseLI = cast<LoadInst>(MainOp);
1054if (!LI->isSimple() || !BaseLI->isSimple())
1055return InstructionsState::invalid();
1056 }
elseif (
auto *Call = dyn_cast<CallInst>(
I)) {
1057auto *
CallBase = cast<CallInst>(MainOp);
1059return InstructionsState::invalid();
1060if (Call->hasOperandBundles() &&
1062 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1063 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1066return InstructionsState::invalid();
1069return InstructionsState::invalid();
1072if (Mappings.
size() != BaseMappings.
size() ||
1073 Mappings.
front().ISA != BaseMappings.
front().ISA ||
1074 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
1075 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
1076 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
1077 Mappings.
front().Shape.Parameters !=
1078 BaseMappings.
front().Shape.Parameters)
1079return InstructionsState::invalid();
1084return InstructionsState::invalid();
1087return InstructionsState(MainOp, AltOp);
1090/// \returns true if all of the values in \p VL have the same type or false 1097/// \returns True if in-tree use also needs extract. This refers to 1098/// possible scalar operand in vectorized instruction. 1106case Instruction::Load: {
1107LoadInst *LI = cast<LoadInst>(UserInst);
1110case Instruction::Store: {
1111StoreInst *SI = cast<StoreInst>(UserInst);
1112return (SI->getPointerOperand() == Scalar);
1114case Instruction::Call: {
1115CallInst *CI = cast<CallInst>(UserInst);
1118 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1119 Arg.value().get() == Scalar;
1127/// \returns the AA location that is being access by the instruction. 1131if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
1136/// \returns True if the instruction is not a volatile or atomic load/store. 1138if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
1139return LI->isSimple();
1141return SI->isSimple();
1143return !
MI->isVolatile();
1147/// Shuffles \p Mask in accordance with the given \p SubMask. 1148/// \param ExtendingManyInputs Supports reshuffling of the mask with not only 1149/// one but two input vectors. 1151bool ExtendingManyInputs =
false) {
1155 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
1156// Check if input scalars were extended to match the size of other node. 1158"SubMask with many inputs support must be larger than the mask.");
1160 Mask.append(SubMask.
begin(), SubMask.
end());
1164int TermValue = std::min(Mask.size(), SubMask.
size());
1165for (
intI = 0, E = SubMask.
size();
I < E; ++
I) {
1167 (!ExtendingManyInputs &&
1168 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
1170 NewMask[
I] = Mask[SubMask[
I]];
1175/// Order may have elements assigned special value (size) which is out of 1176/// bounds. Such indices only appear on places which correspond to undef values 1177/// (see canReuseExtract for details) and used in order to avoid undef values 1178/// have effect on operands ordering. 1179/// The first loop below simply finds all unused indices and then the next loop 1180/// nest assigns these indices for undef values positions. 1181/// As an example below Order has two undef positions and they have assigned 1182/// values 3 and 7 respectively: 1183/// before: 6 9 5 4 9 2 1 0 1184/// after: 6 3 5 4 7 2 1 0 1186constunsigned Sz = Order.
size();
1189for (
unsignedI = 0;
I < Sz; ++
I) {
1191 UnusedIndices.
reset(Order[
I]);
1193 MaskedIndices.
set(
I);
1195if (MaskedIndices.
none())
1198"Non-synced masked/available indices.");
1202assert(
Idx >= 0 &&
"Indices must be synced.");
1209/// \returns a bitset for selecting opcodes. false for Opcode0 and true for 1213Type *ScalarTy = VL[0]->getType();
1216for (
unsigned Lane : seq<unsigned>(VL.
size())) {
1217if (isa<PoisonValue>(VL[Lane]))
1219if (cast<Instruction>(VL[Lane])->
getOpcode() == Opcode1)
1220 OpcodeMask.
set(Lane * ScalarTyNumElements,
1221 Lane * ScalarTyNumElements + ScalarTyNumElements);
1231constunsigned E = Indices.
size();
1233for (
unsignedI = 0;
I < E; ++
I)
1234 Mask[Indices[
I]] =
I;
1237/// Reorders the list of scalars in accordance with the given \p Mask. 1240assert(!Mask.empty() &&
"Expected non-empty mask.");
1244for (
unsignedI = 0, E = Prev.
size();
I < E; ++
I)
1246 Scalars[Mask[
I]] = Prev[
I];
1249/// Checks if the provided value does not require scheduling. It does not 1250/// require scheduling if this is not an instruction or it is an instruction 1251/// that does not read/write memory and all operands are either not instructions 1252/// or phi nodes or instructions from different blocks. 1254auto *
I = dyn_cast<Instruction>(V);
1259 auto *IO = dyn_cast<Instruction>(V);
1262 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1266/// Checks if the provided value does not require scheduling. It does not 1267/// require scheduling if this is not an instruction or it is an instruction 1268/// that does not read/write memory and all users are phi nodes or instructions 1269/// from the different blocks. 1271auto *
I = dyn_cast<Instruction>(V);
1274// Limits the number of uses to save compile time. 1275return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1277 auto *IU = dyn_cast<Instruction>(U);
1280 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1284/// Checks if the specified value does not require scheduling. It does not 1285/// require scheduling if all operands and all users do not need to be scheduled 1286/// in the current basic block. 1291/// Checks if the specified array of instructions does not require scheduling. 1292/// It is so if all either instructions have operands that do not require 1293/// scheduling or their users do not require scheduling since they are phis or 1294/// in other basic blocks. 1296return !VL.
empty() &&
1300/// Returns true if widened type of \p Ty elements with size \p Sz represents 1301/// full vector type, i.e. adding extra element results in extra parts upon type 1312return NumParts > 0 && NumParts < Sz &&
has_single_bit(Sz / NumParts) &&
1316namespaceslpvectorizer {
1318/// Bottom Up SLP Vectorizer. 1326 /// Tracks the state we can represent the loads in the given sequence. 1345 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1346 AC(AC), DB(DB),
DL(
DL), ORE(ORE),
1349// Use the vector register size specified by the target unless overridden 1350// by a command-line option. 1351// TODO: It would be better to limit the vectorization factor based on 1352// data type rather than just register size. For example, x86 AVX has 1353// 256-bit registers, but it does not support integer operations 1354// at that width (that requires AVX2). 1368 /// Vectorize the tree that starts with the elements in \p VL. 1369 /// Returns the vectorized root. 1372 /// Vectorize the tree but with the list of externally used values \p 1373 /// ExternallyUsedValues. Values in this MapVector can be replaced but the 1374 /// generated extractvalue instructions. 1379 /// \returns the cost incurred by unwanted spills and fills, caused by 1380 /// holding live values over call sites. 1383 /// \returns the vectorization cost of the subtree that starts at \p VL. 1384 /// A negative number means that this is profitable. 1387 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for 1388 /// the purpose of scheduling and extraction in the \p UserIgnoreLst. 1392 /// Construct a vectorizable tree that starts at \p Roots. 1395 /// Returns whether the root node has in-tree uses. 1397return !VectorizableTree.
empty() &&
1398 !VectorizableTree.
front()->UserTreeIndices.empty();
1401 /// Return the scalars of the root node. 1403assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1404return VectorizableTree.
front()->Scalars;
1407 /// Returns the type/is-signed info for the root node in the graph without 1410const TreeEntry &Root = *VectorizableTree.
front().get();
1411if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1412 !Root.Scalars.front()->getType()->isIntegerTy())
1414auto It = MinBWs.
find(&Root);
1415if (It != MinBWs.
end())
1419if (Root.getOpcode() == Instruction::ZExt ||
1420 Root.getOpcode() == Instruction::SExt)
1421return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1422 Root.getOpcode() == Instruction::SExt);
1426 /// Checks if the root graph node can be emitted with narrower bitwidth at 1427 /// codegen and returns it signedness, if so. 1429return MinBWs.
at(VectorizableTree.
front().get()).second;
1432 /// Returns reduction type after minbitdth analysis. 1434if (ReductionBitWidth == 0 ||
1435 !VectorizableTree.
front()->Scalars.front()->getType()->isIntegerTy() ||
1436 ReductionBitWidth >=
1437DL->getTypeSizeInBits(
1438 VectorizableTree.
front()->Scalars.front()->getType()))
1440 VectorizableTree.
front()->Scalars.front()->getType(),
1441 VectorizableTree.
front()->getVectorFactor());
1444 VectorizableTree.
front()->Scalars.front()->getContext(),
1446 VectorizableTree.
front()->getVectorFactor());
1449 /// Builds external uses of the vectorized scalars, i.e. the list of 1450 /// vectorized scalars to be extracted, their lanes and their scalar users. \p 1451 /// ExternallyUsedValues contains additional list of external uses to handle 1452 /// vectorization of reductions. 1456 /// Transforms graph nodes to target specific representations, if profitable. 1459 /// Clear the internal data structures that are created by 'buildTree'. 1461 VectorizableTree.
clear();
1462 ScalarToTreeEntry.clear();
1463 MultiNodeScalars.clear();
1465 NonScheduledFirst.
clear();
1466 EntryToLastInstruction.clear();
1467 LoadEntriesToVectorize.
clear();
1468 IsGraphTransformMode =
false;
1469 GatheredLoadsEntriesFirst.reset();
1470 ExternalUses.
clear();
1471 ExternalUsesAsOriginalScalar.clear();
1472for (
auto &Iter : BlocksSchedules) {
1473 BlockScheduling *BS = Iter.second.get();
1477 ReductionBitWidth = 0;
1479 CastMaxMinBWSizes.reset();
1480 ExtraBitWidthNodes.
clear();
1481 InstrElementSize.clear();
1482 UserIgnoreList =
nullptr;
1483 PostponedGathers.
clear();
1484 ValueToGatherNodes.
clear();
1489 /// Returns the base graph size, before any transformations. 1492 /// Perform LICM and CSE on the newly generated gather sequences. 1495 /// Does this non-empty order represent an identity order? Identity 1496 /// should be represented as an empty order, so this is used to 1497 /// decide if we can canonicalize a computed order. Undef elements 1498 /// (represented as size) are ignored. 1500assert(!Order.
empty() &&
"expected non-empty order");
1501constunsigned Sz = Order.
size();
1503returnP.value() ==
P.index() ||
P.value() == Sz;
1507 /// Checks if the specified gather tree entry \p TE can be represented as a 1508 /// shuffled vector entry + (possibly) permutation with other gathers. It 1509 /// implements the checks only for possibly ordered scalars (Loads, 1510 /// ExtractElement, ExtractValue), which can be part of the graph. 1513 /// Sort loads into increasing pointers offsets to allow greater clustering. 1516 /// Gets reordering data for the given tree entry. If the entry is vectorized 1517 /// - just return ReorderIndices, otherwise check if the scalars can be 1518 /// reordered and return the most optimal order. 1519 /// \return std::nullopt if ordering is not important, empty order, if 1520 /// identity order is important, or the actual order. 1521 /// \param TopToBottom If true, include the order of vectorized stores and 1522 /// insertelement nodes, otherwise skip them. 1526 /// Reorders the current graph to the most profitable order starting from the 1527 /// root node to the leaf nodes. The best order is chosen only from the nodes 1528 /// of the same size (vectorization factor). Smaller nodes are considered 1529 /// parts of subgraph with smaller VF and they are reordered independently. We 1530 /// can make it because we still need to extend smaller nodes to the wider VF 1531 /// and we can merge reordering shuffles with the widening shuffles. 1534 /// Reorders the current graph to the most profitable order starting from 1535 /// leaves to the root. It allows to rotate small subgraphs and reduce the 1536 /// number of reshuffles if the leaf nodes use the same order. In this case we 1537 /// can merge the orders and just shuffle user node instead of shuffling its 1538 /// operands. Plus, even the leaf nodes have different orders, it allows to 1539 /// sink reordering in the graph closer to the root node and merge it later 1540 /// during analysis. 1543 /// \return The vector element size in bits to use when vectorizing the 1544 /// expression tree ending at \p V. If V is a store, the size is the width of 1545 /// the stored value. Otherwise, the size is the width of the largest loaded 1546 /// value reaching V. This method is used by the vectorizer to calculate 1547 /// vectorization factors. 1550 /// Compute the minimum type sizes required to represent the entries in a 1551 /// vectorizable tree. 1554// \returns maximum vector register size as set by TTI or overridden by cl::opt. 1556return MaxVecRegSize;
1559// \returns minimum vector register size as set by cl::opt. 1561return MinVecRegSize;
1571return MaxVF ? MaxVF : UINT_MAX;
1574 /// Check if homogeneous aggregate is isomorphic to some VectorType. 1575 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like 1576 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, 1577 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. 1579 /// \returns number of elements in vector if isomorphism exists, 0 otherwise. 1582 /// \returns True if the VectorizableTree is both tiny and not fully 1583 /// vectorizable. We do not vectorize such trees. 1586 /// Checks if the graph and all its subgraphs cannot be better vectorized. 1587 /// It may happen, if all gather nodes are loads and they cannot be 1588 /// "clusterized". In this case even subgraphs cannot be vectorized more 1589 /// effectively than the base graph. 1592 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values 1593 /// can be load combined in the backend. Load combining may not be allowed in 1594 /// the IR optimizer, so we do not want to alter the pattern. For example, 1595 /// partially transforming a scalar bswap() pattern into vector code is 1596 /// effectively impossible for the backend to undo. 1597 /// TODO: If load combining is allowed in the IR optimizer, this analysis 1598 /// may not be necessary. 1601 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values 1602 /// can be load combined in the backend. Load combining may not be allowed in 1603 /// the IR optimizer, so we do not want to alter the pattern. For example, 1604 /// partially transforming a scalar bswap() pattern into vector code is 1605 /// effectively impossible for the backend to undo. 1606 /// TODO: If load combining is allowed in the IR optimizer, this analysis 1607 /// may not be necessary. 1610 /// Checks if the given array of loads can be represented as a vectorized, 1611 /// scatter or just simple gather. 1612 /// \param VL list of loads. 1613 /// \param VL0 main load value. 1614 /// \param Order returned order of load instructions. 1615 /// \param PointerOps returned list of pointer operands. 1616 /// \param BestVF return best vector factor, if recursive check found better 1617 /// vectorization sequences rather than masked gather. 1618 /// \param TryRecursiveCheck used to check if long masked gather can be 1619 /// represented as a serie of loads/insert subvector, if profitable. 1623unsigned *BestVF =
nullptr,
1624bool TryRecursiveCheck =
true)
const;
1626 /// Registers non-vectorizable sequence of loads 1631 /// Checks if the given loads sequence is known as not vectorizable 1632template <
typename T>
1639 /// This structure holds any data we need about the edges being traversed 1640 /// during buildTree_rec(). We keep track of: 1641 /// (i) the user TreeEntry index, and 1642 /// (ii) the index of the edge. 1647 /// The user TreeEntry. 1649 /// The operand index of the use. 1660 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1669 /// A helper class used for scoring candidates for two consecutive lanes. 1675int NumLanes;
// Total number of lanes (aka vectorization factor). 1676int MaxLevel;
// The maximum recursion depth for accumulating score. 1682 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1683 MaxLevel(MaxLevel) {}
1685// The hard-coded scores listed here are not very important, though it shall 1686// be higher for better matches to improve the resulting cost. When 1687// computing the scores of matching one sub-tree with another, we are 1688// basically counting the number of values that are matching. So even if all 1689// scores are set to 1, we would still get a decent matching result. 1690// However, sometimes we have to break ties. For example we may have to 1691// choose between matching loads vs matching opcodes. This is what these 1692// scores are helping us with: they provide the order of preference. Also, 1693// this is important if the scalar is externally used or used in another 1694// tree entry node in the different lane. 1696 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). 1698 /// The same load multiple times. This should have a better score than 1699 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it 1700 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for 1701 /// a vector load and 1.0 for a broadcast. 1703 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). 1705 /// A load candidate for masked gather. 1707 /// ExtractElementInst from same vector and consecutive indexes. 1709 /// ExtractElementInst from same vector and reversed indices. 1713 /// Instructions with the same opcode. 1715 /// Instructions with alt opcodes (e.g, add + sub). 1717 /// Identical instructions (a.k.a. splat or broadcast). 1719 /// Matching with an undef is preferable to failing. 1721 /// Score for failing to find a decent match. 1723 /// Score if all users are vectorized. 1726 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. 1727 /// \p U1 and \p U2 are the users of \p V1 and \p V2. 1728 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p 1737if (isa<LoadInst>(V1)) {
1738// Retruns true if the users of V1 and V2 won't need to be extracted. 1739auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1740// Bail out if we have too many uses to save compilation time. 1744auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1746 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1749return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1751// A broadcast of a load can be cheaper on some targets. 1752if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1754 ((
int)V1->getNumUses() == NumLanes ||
1755 AllUsersAreInternal(V1, V2)))
1761auto CheckSameEntryOrFail = [&]() {
1762if (
const TreeEntry *TE1 = R.getTreeEntry(V1);
1763 TE1 && TE1 == R.getTreeEntry(V2))
1768auto *LI1 = dyn_cast<LoadInst>(V1);
1769auto *LI2 = dyn_cast<LoadInst>(V2);
1771if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1773return CheckSameEntryOrFail();
1776 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1777 LI2->getPointerOperand(),
DL, SE,
/*StrictCheck=*/true);
1778if (!Dist || *Dist == 0) {
1781 R.TTI->isLegalMaskedGather(
1784return CheckSameEntryOrFail();
1786// The distance is too large - still may be profitable to use masked 1788if (std::abs(*Dist) > NumLanes / 2)
1790// This still will detect consecutive loads, but we might have "holes" 1791// in some cases. It is ok for non-power-2 vectorization and may produce 1792// better results. It should not affect current vectorization. 1797auto *C1 = dyn_cast<Constant>(V1);
1798auto *C2 = dyn_cast<Constant>(V2);
1802// Extracts from consecutive indexes of the same vector better score as 1803// the extracts could be optimized away. 1807// Undefs are always profitable for extractelements. 1808// Compiler can easily combine poison and extractelement <non-poison> or 1809// undef and extractelement <poison>. But combining undef + 1810// extractelement <non-poison-but-may-produce-poison> requires some 1812if (isa<UndefValue>(V2))
1821// Undefs are always profitable for extractelements. 1829int Dist = Idx2 - Idx1;
1830// The distance is too large - still may be profitable to use 1832if (std::abs(Dist) == 0)
1834if (std::abs(Dist) > NumLanes / 2)
1841return CheckSameEntryOrFail();
1844auto *I1 = dyn_cast<Instruction>(V1);
1845auto *I2 = dyn_cast<Instruction>(V2);
1847if (I1->getParent() != I2->getParent())
1848return CheckSameEntryOrFail();
1853// Note: Only consider instructions with <= 2 operands to avoid 1854// complexity explosion. 1856 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.
empty() ||
1857 !S.isAltShuffle()) &&
1859return isa<PoisonValue>(V) ||
1860 cast<Instruction>(V)->getNumOperands() ==
1861 S.getMainOp()->getNumOperands();
1867if (I1 && isa<PoisonValue>(V2))
1870if (isa<UndefValue>(V2))
1873return CheckSameEntryOrFail();
1876 /// Go through the operands of \p LHS and \p RHS recursively until 1877 /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are 1878 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands 1879 /// of \p U1 and \p U2), except at the beginning of the recursion where 1880 /// these are set to nullptr. 1884 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] 1889 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at 1890 /// each level recursively, accumulating the score. It starts from matching 1891 /// the additions at level 0, then moves on to the loads (level 1). The 1892 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and 1893 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while 1894 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail. 1895 /// Please note that the order of the operands does not matter, as we 1896 /// evaluate the score of all profitable combinations of operands. In 1897 /// other words the score of G1 and G4 is the same as G1 and G2. This 1898 /// heuristic is based on ideas described in: 1899 /// Look-ahead SLP: Auto-vectorization in the presence of commutative 1900 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, 1906// Get the shallow score of V1 and V2. 1907int ShallowScoreAtThisLevel =
1910// If reached MaxLevel, 1911// or if V1 and V2 are not instructions, 1912// or if they are SPLAT, 1913// or if they are not consecutive, 1914// or if profitable to vectorize loads or extractelements, early return 1916auto *I1 = dyn_cast<Instruction>(
LHS);
1917auto *I2 = dyn_cast<Instruction>(
RHS);
1918if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1920 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1921 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1922 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1923 ShallowScoreAtThisLevel))
1924return ShallowScoreAtThisLevel;
1925assert(I1 && I2 &&
"Should have early exited.");
1927// Contains the I2 operand indexes that got matched with I1 operands. 1930// Recursion towards the operands of I1 and I2. We are trying all possible 1931// operand pairs, and keeping track of the best score. 1932for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1933 OpIdx1 != NumOperands1; ++OpIdx1) {
1934// Try to pair op1I with the best operand of I2. 1936unsigned MaxOpIdx2 = 0;
1937bool FoundBest =
false;
1938// If I2 is commutative try all combinations. 1941 ? I2->getNumOperands()
1942 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1943assert(FromIdx <= ToIdx &&
"Bad index");
1944for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1945// Skip operands already paired with OpIdx1. 1946if (Op2Used.
count(OpIdx2))
1948// Recursively calculate the cost at each level 1951 I1, I2, CurrLevel + 1, {});
1952// Look for the best score. 1954 TmpScore > MaxTmpScore) {
1955 MaxTmpScore = TmpScore;
1961// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. 1962 Op2Used.
insert(MaxOpIdx2);
1963 ShallowScoreAtThisLevel += MaxTmpScore;
1966return ShallowScoreAtThisLevel;
1969 /// A helper data structure to hold the operands of a vector of instructions. 1970 /// This supports a fixed vector length for all operand vectors. 1972 /// For each operand we need (i) the value, and (ii) the opcode that it 1973 /// would be attached to if the expression was in a left-linearized form. 1974 /// This is required to avoid illegal operand reordering. 1979 /// Op1 Op2 Linearized + Op2 1980 /// \ / ----------> |/ 1983 /// Op1 - Op2 (0 + Op1) - Op2 1986 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. 1988 /// Another way to think of this is to track all the operations across the 1989 /// path from the operand all the way to the root of the tree and to 1990 /// calculate the operation that corresponds to this path. For example, the 1991 /// path from Op2 to the root crosses the RHS of the '-', therefore the 1992 /// corresponding operation is a '-' (which matches the one in the 1993 /// linearized tree, as shown above). 1995 /// For lack of a better term, we refer to this operation as Accumulated 1996 /// Path Operation (APO). 1998 OperandData() =
default;
1999 OperandData(
Value *V,
bool APO,
bool IsUsed)
2000 : V(V), APO(APO), IsUsed(IsUsed) {}
2001 /// The operand value. 2003 /// TreeEntries only allow a single opcode, or an alternate sequence of 2004 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the 2005 /// APO. It is set to 'true' if 'V' is attached to an inverse operation 2006 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise 2009 /// Helper data for the reordering function. 2013 /// During operand reordering, we are trying to select the operand at lane 2014 /// that matches best with the operand at the neighboring lane. Our 2015 /// selection is based on the type of value we are looking for. For example, 2016 /// if the neighboring lane has a load, we need to look for a load that is 2017 /// accessing a consecutive address. These strategies are summarized in the 2018 /// 'ReorderingMode' enumerator. 2019enum class ReorderingMode {
2020 Load,
///< Matching loads to consecutive memory addresses 2021 Opcode,
///< Matching instructions based on opcode (same or alternate) 2023Splat,
///< Matching the same instruction multiple times (broadcast) 2024Failed,
///< We failed to create a vectorizable group 2029 /// A vector of operand vectors. 2031 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0] 2032 /// is not IntrinsicInst, ArgSize is User::getNumOperands. 2033unsigned ArgSize = 0;
2039constLoop *L =
nullptr;
2041 /// \returns the operand data at \p OpIdx and \p Lane. 2042 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
2043return OpsVec[OpIdx][Lane];
2046 /// \returns the operand data at \p OpIdx and \p Lane. Const version. 2047const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const{
2048return OpsVec[OpIdx][Lane];
2051 /// Clears the used flag for all entries. 2053for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
2054 OpIdx != NumOperands; ++OpIdx)
2055for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2057 OpsVec[OpIdx][Lane].IsUsed =
false;
2060 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. 2061void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
2062std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2065 /// \param Lane lane of the operands under analysis. 2066 /// \param OpIdx operand index in \p Lane lane we're looking the best 2068 /// \param Idx operand index of the current candidate value. 2069 /// \returns The additional score due to possible broadcasting of the 2070 /// elements in the lane. It is more profitable to have power-of-2 unique 2071 /// elements in the lane, it will be vectorized with higher probability 2072 /// after removing duplicates. Currently the SLP vectorizer supports only 2073 /// vectorization of the power-of-2 number of unique scalars. 2074int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsignedIdx,
2076Value *IdxLaneV = getData(
Idx, Lane).V;
2077if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2078 isa<ExtractElementInst>(IdxLaneV))
2081for (
unsigned Ln : seq<unsigned>(getNumLanes())) {
2084Value *OpIdxLnV = getData(OpIdx, Ln).V;
2085if (!isa<Instruction>(OpIdxLnV))
2089unsigned UniquesCount = Uniques.
size();
2090auto IdxIt = Uniques.
find(IdxLaneV);
2091unsigned UniquesCntWithIdxLaneV =
2092 IdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2093Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2094auto OpIdxIt = Uniques.
find(OpIdxLaneV);
2095unsigned UniquesCntWithOpIdxLaneV =
2096 OpIdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2097if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2099return std::min(
bit_ceil(UniquesCntWithOpIdxLaneV) -
2100 UniquesCntWithOpIdxLaneV,
2101 UniquesCntWithOpIdxLaneV -
2103 ((IdxIt != Uniques.
end() && UsedLanes.
test(IdxIt->second))
2104 ? UniquesCntWithIdxLaneV -
bit_floor(UniquesCntWithIdxLaneV)
2105 :
bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2108 /// \param Lane lane of the operands under analysis. 2109 /// \param OpIdx operand index in \p Lane lane we're looking the best 2111 /// \param Idx operand index of the current candidate value. 2112 /// \returns The additional score for the scalar which users are all 2114int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsignedIdx)
const{
2115Value *IdxLaneV = getData(
Idx, Lane).V;
2116Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2117// Do not care about number of uses for vector-like instructions 2118// (extractelement/extractvalue with constant indices), they are extracts 2119// themselves and already externally used. Vectorization of such 2120// instructions does not add extra extractelement instruction, just may 2125auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2126if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2128return R.areAllUsersVectorized(IdxLaneI)
2133 /// Score scaling factor for fully compatible instructions but with 2134 /// different number of external uses. Allows better selection of the 2135 /// instructions with less external uses. 2136staticconstint ScoreScaleFactor = 10;
2138 /// \Returns the look-ahead score, which tells us how much the sub-trees 2139 /// rooted at \p LHS and \p RHS match, the more they match the higher the 2140 /// score. This helps break ties in an informed way when we cannot decide on 2141 /// the order of the operands by just considering the immediate 2144int Lane,
unsigned OpIdx,
unsignedIdx,
2148// Keep track of the instruction stack as we recurse into the operands 2149// during the look-ahead score exploration. 2152/*CurrLevel=*/1, MainAltOps);
2154int SplatScore = getSplatScore(Lane, OpIdx,
Idx, UsedLanes);
2155if (Score <= -SplatScore) {
2159 Score += SplatScore;
2160// Scale score to see the difference between different operands 2161// and similar operands but all vectorized/not all vectorized 2162// uses. It does not affect actual selection of the best 2163// compatible operand in general, just allows to select the 2164// operand with all vectorized uses. 2165 Score *= ScoreScaleFactor;
2166 Score += getExternalUseScore(Lane, OpIdx,
Idx);
2173 /// Best defined scores per lanes between the passes. Used to choose the 2174 /// best operand (with the highest score) between the passes. 2175 /// The key - {Operand Index, Lane}. 2176 /// The value - the best score between the passes for the lane and the 2181// Search all operands in Ops[*][Lane] for the one that matches best 2182// Ops[OpIdx][LastLane] and return its opreand index. 2183// If no good match can be found, return std::nullopt. 2184 std::optional<unsigned>
2185 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
2189unsigned NumOperands = getNumOperands();
2191// The operand of the previous lane at OpIdx. 2192Value *OpLastLane = getData(OpIdx, LastLane).V;
2194// Our strategy mode for OpIdx. 2195 ReorderingMode RMode = ReorderingModes[OpIdx];
2196if (RMode == ReorderingMode::Failed)
2199// The linearized opcode of the operand at OpIdx, Lane. 2200bool OpIdxAPO = getData(OpIdx, Lane).APO;
2202// The best operand index and its score. 2203// Sometimes we have more than one option (e.g., Opcode and Undefs), so we 2204// are using the score to differentiate between the two. 2206 std::optional<unsigned>
Idx;
2210 BestScoresPerLanes.
try_emplace(std::make_pair(OpIdx, Lane), 0)
2213// Track if the operand must be marked as used. If the operand is set to 2214// Score 1 explicitly (because of non power-of-2 unique scalars, we may 2215// want to reestimate the operands again on the following iterations). 2216bool IsUsed = RMode == ReorderingMode::Splat ||
2217 RMode == ReorderingMode::Constant ||
2218 RMode == ReorderingMode::Load;
2219// Iterate through all unused operands and look for the best. 2220for (
unsignedIdx = 0;
Idx != NumOperands; ++
Idx) {
2221// Get the operand at Idx and Lane. 2222 OperandData &OpData = getData(
Idx, Lane);
2224bool OpAPO = OpData.APO;
2226// Skip already selected operands. 2230// Skip if we are trying to move the operand to a position with a 2231// different opcode in the linearized tree form. This would break the 2233if (OpAPO != OpIdxAPO)
2236// Look for an operand that matches the current mode. 2238case ReorderingMode::Load:
2239case ReorderingMode::Opcode: {
2240bool LeftToRight = Lane > LastLane;
2241Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
2242Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
2243int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2244 OpIdx,
Idx, IsUsed, UsedLanes);
2245if (Score >
static_cast<int>(BestOp.Score) ||
2246 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
2249 BestOp.Score = Score;
2250 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2254case ReorderingMode::Constant:
2255if (isa<Constant>(
Op) ||
2256 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
2258if (isa<Constant>(
Op)) {
2260 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2263if (isa<UndefValue>(
Op) || !isa<Constant>(
Op))
2267case ReorderingMode::Splat:
2268if (
Op == OpLastLane || (!BestOp.Score && isa<Constant>(
Op))) {
2269 IsUsed =
Op == OpLastLane;
2270if (
Op == OpLastLane) {
2272 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2278case ReorderingMode::Failed:
2284 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2287// If we could not find a good match return std::nullopt. 2291 /// Helper for reorderOperandVecs. 2292 /// \returns the lane that we should start reordering from. This is the one 2293 /// which has the least number of operands that can freely move about or 2294 /// less profitable because it already has the most optimal set of operands. 2295unsigned getBestLaneToStartReordering()
const{
2296unsigned Min = UINT_MAX;
2297unsigned SameOpNumber = 0;
2298// std::pair<unsigned, unsigned> is used to implement a simple voting 2299// algorithm and choose the lane with the least number of operands that 2300// can freely move about or less profitable because it already has the 2301// most optimal set of operands. The first unsigned is a counter for 2302// voting, the second unsigned is the counter of lanes with instructions 2303// with same/alternate opcodes and same parent basic block. 2305// Try to be closer to the original results, if we have multiple lanes 2306// with same cost. If 2 lanes have the same cost, use the one with the 2308for (
intI = getNumLanes();
I > 0; --
I) {
2309unsigned Lane =
I - 1;
2310 OperandsOrderData NumFreeOpsHash =
2311 getMaxNumOperandsThatCanBeReordered(Lane);
2312// Compare the number of operands that can move and choose the one with 2314if (NumFreeOpsHash.NumOfAPOs < Min) {
2315 Min = NumFreeOpsHash.NumOfAPOs;
2316 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2318 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2319 }
elseif (NumFreeOpsHash.NumOfAPOs == Min &&
2320 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2321// Select the most optimal lane in terms of number of operands that 2322// should be moved around. 2323 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2324 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2325 }
elseif (NumFreeOpsHash.NumOfAPOs == Min &&
2326 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2327auto [It, Inserted] =
2328 HashMap.
try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2333// Select the lane with the minimum counter. 2334unsigned BestLane = 0;
2335unsigned CntMin = UINT_MAX;
2337if (
Data.second.first < CntMin) {
2338 CntMin =
Data.second.first;
2339 BestLane =
Data.second.second;
2345 /// Data structure that helps to reorder operands. 2346structOperandsOrderData {
2347 /// The best number of operands with the same APOs, which can be 2349unsigned NumOfAPOs = UINT_MAX;
2350 /// Number of operands with the same/alternate instruction opcode and 2352unsigned NumOpsWithSameOpcodeParent = 0;
2353 /// Hash for the actual operands ordering. 2354 /// Used to count operands, actually their position id and opcode 2355 /// value. It is used in the voting mechanism to find the lane with the 2356 /// least number of operands that can freely move about or less profitable 2357 /// because it already has the most optimal set of operands. Can be 2358 /// replaced with SmallVector<unsigned> instead but hash code is faster 2359 /// and requires less memory. 2362 /// \returns the maximum number of operands that are allowed to be reordered 2363 /// for \p Lane and the number of compatible instructions(with the same 2364 /// parent/opcode). This is used as a heuristic for selecting the first lane 2365 /// to start operand reordering. 2366 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const{
2367unsigned CntTrue = 0;
2368unsigned NumOperands = getNumOperands();
2369// Operands with the same APO can be reordered. We therefore need to count 2370// how many of them we have for each APO, like this: Cnt[APO] = x. 2371// Since we only have two APOs, namely true and false, we can avoid using 2372// a map. Instead we can simply count the number of operands that 2373// correspond to one of them (in this case the 'true' APO), and calculate 2374// the other by subtracting it from the total number of operands. 2375// Operands with the same instruction opcode and parent are more 2376// profitable since we don't need to move them in many cases, with a high 2377// probability such lane already can be vectorized effectively. 2378bool AllUndefs =
true;
2379unsigned NumOpsWithSameOpcodeParent = 0;
2383for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384const OperandData &OpData = getData(OpIdx, Lane);
2387// Use Boyer-Moore majority voting for finding the majority opcode and 2388// the number of times it occurs. 2389if (
auto *
I = dyn_cast<Instruction>(OpData.V)) {
2391I->getParent() != Parent) {
2392if (NumOpsWithSameOpcodeParent == 0) {
2393 NumOpsWithSameOpcodeParent = 1;
2395 Parent =
I->getParent();
2397 --NumOpsWithSameOpcodeParent;
2400 ++NumOpsWithSameOpcodeParent;
2404 Hash,
hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2405 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2409 OperandsOrderData
Data;
2410Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2411Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2416 /// Go through the instructions in VL and append their operands. 2419assert((empty() || VL.
size() == getNumLanes()) &&
2420"Expected same number of lanes");
2421assert(S.valid() &&
"InstructionsState is invalid.");
2422// IntrinsicInst::isCommutative returns true if swapping the first "two" 2423// arguments to the intrinsic produces the same result. 2424constexprunsigned IntrinsicNumOperands = 2;
2427 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
2428 OpsVec.
resize(NumOperands);
2429unsigned NumLanes = VL.
size();
2430for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2431 OpsVec[OpIdx].
resize(NumLanes);
2432for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2433assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2434"Expected instruction or poison value");
2435// Our tree has just 3 nodes: the root and two operands. 2436// It is therefore trivial to get the APO. We only need to check the 2437// opcode of VL[Lane] and whether the operand at OpIdx is the LHS or 2438// RHS operand. The LHS operand of both add and sub is never attached 2439// to an inversese operation in the linearized form, therefore its APO 2440// is false. The RHS is true only if VL[Lane] is an inverse operation. 2442// Since operand reordering is performed on groups of commutative 2443// operations or alternating sequences (e.g., +, -), we can safely 2444// tell the inverse operations by checking commutativity. 2445if (isa<PoisonValue>(VL[Lane])) {
2446if (
auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
2448 OpsVec[OpIdx][Lane] = {EI->getVectorOperand(),
true,
false};
2451 }
elseif (
auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
2453 OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(),
true,
false};
2457 OpsVec[OpIdx][Lane] = {
2462bool IsInverseOperation = !
isCommutative(cast<Instruction>(VL[Lane]));
2463bool APO = (OpIdx == 0) ?
false : IsInverseOperation;
2464 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2470 /// \returns the number of operands. 2471unsigned getNumOperands()
const{
return ArgSize; }
2473 /// \returns the number of lanes. 2474unsigned getNumLanes()
const{
return OpsVec[0].
size(); }
2476 /// \returns the operand value at \p OpIdx and \p Lane. 2477Value *getValue(
unsigned OpIdx,
unsigned Lane)
const{
2478return getData(OpIdx, Lane).V;
2481 /// \returns true if the data structure is empty. 2482bool empty()
const{
return OpsVec.
empty(); }
2484 /// Clears the data. 2485void clear() { OpsVec.
clear(); }
2487 /// \Returns true if there are enough operands identical to \p Op to fill 2488 /// the whole vector (it is mixed with constants or loop invariant values). 2489 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow. 2490bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
2491assert(
Op == getValue(OpIdx, Lane) &&
2492"Op is expected to be getValue(OpIdx, Lane).");
2493// Small number of loads - try load matching. 2494if (isa<LoadInst>(
Op) && getNumLanes() == 2 && getNumOperands() == 2)
2496bool OpAPO = getData(OpIdx, Lane).APO;
2497bool IsInvariant = L && L->isLoopInvariant(
Op);
2499for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2502// This is set to true if we found a candidate for broadcast at Lane. 2503bool FoundCandidate =
false;
2504for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2505 OperandData &
Data = getData(OpI, Ln);
2506if (
Data.APO != OpAPO ||
Data.IsUsed)
2508Value *OpILane = getValue(OpI, Lane);
2509bool IsConstantOp = isa<Constant>(OpILane);
2510// Consider the broadcast candidate if: 2511// 1. Same value is found in one of the operands. 2513// 2. The operand in the given lane is not constant but there is a 2514// constant operand in another lane (which can be moved to the 2515// given lane). In this case we can represent it as a simple 2516// permutation of constant and broadcast. 2518 ((Lns > 2 && isa<Constant>(
Data.V)) ||
2519// 2.1. If we have only 2 lanes, need to check that value in the 2520// next lane does not build same opcode sequence. 2523 isa<Constant>(
Data.V)))) ||
2524// 3. The operand in the current lane is loop invariant (can be 2525// hoisted out) and another operand is also a loop invariant 2526// (though not a constant). In this case the whole vector can be 2528// FIXME: need to teach the cost model about this case for better 2530 (IsInvariant && !isa<Constant>(
Data.V) &&
2532 L->isLoopInvariant(
Data.V))) {
2533 FoundCandidate =
true;
2543return getNumLanes() == 2 || Cnt > 1;
2546 /// Checks if there is at least single compatible operand in lanes other 2547 /// than \p Lane, compatible with the operand \p Op. 2548bool canBeVectorized(
Instruction *
Op,
unsigned OpIdx,
unsigned Lane)
const{
2549assert(
Op == getValue(OpIdx, Lane) &&
2550"Op is expected to be getValue(OpIdx, Lane).");
2551bool OpAPO = getData(OpIdx, Lane).APO;
2552for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2555if (
any_of(seq<unsigned>(getNumOperands()), [&](
unsigned OpI) {
2556const OperandData &
Data = getData(OpI, Ln);
2557if (
Data.APO != OpAPO ||
Data.IsUsed)
2559Value *OpILn = getValue(OpI, Ln);
2560return (L && L->isLoopInvariant(OpILn)) ||
2570 /// Initialize with all the operands of the instruction vector \p RootVL. 2573 : TLI(*R.TLI),
DL(*R.
DL), SE(*R.SE), R(R),
2574 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
2575// Append all the operands of RootVL. 2576 appendOperandsOfVL(RootVL, S);
2579 /// \Returns a value vector with the operands across all lanes for the 2580 /// opearnd at \p OpIdx. 2583assert(OpsVec[OpIdx].
size() == getNumLanes() &&
2584"Expected same num of lanes across all operands");
2585for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2586 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2590// Performs operand reordering for 2 or more operands. 2591// The original operands are in OrigOps[OpIdx][Lane]. 2592// The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. 2594unsigned NumOperands = getNumOperands();
2595unsigned NumLanes = getNumLanes();
2596// Each operand has its own mode. We are using this mode to help us select 2597// the instructions for each lane, so that they match best with the ones 2598// we have selected so far. 2601// This is a greedy single-pass algorithm. We are going over each lane 2602// once and deciding on the best order right away with no back-tracking. 2603// However, in order to increase its effectiveness, we start with the lane 2604// that has operands that can move the least. For example, given the 2606// Lane 0 : A[0] = B[0] + C[0] // Visited 3rd 2607// Lane 1 : A[1] = C[1] - B[1] // Visited 1st 2608// Lane 2 : A[2] = B[2] + C[2] // Visited 2nd 2609// Lane 3 : A[3] = C[3] - B[3] // Visited 4th 2610// we will start at Lane 1, since the operands of the subtraction cannot 2611// be reordered. Then we will visit the rest of the lanes in a circular 2612// fashion. That is, Lanes 2, then Lane 0, and finally Lane 3. 2614// Find the first lane that we will start our search from. 2615unsigned FirstLane = getBestLaneToStartReordering();
2617// Initialize the modes. 2618for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2619Value *OpLane0 = getValue(OpIdx, FirstLane);
2620// Keep track if we have instructions with all the same opcode on one 2622if (
auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2623// Check if OpLane0 should be broadcast. 2624if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2625 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2626 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2627elseif (isa<LoadInst>(OpILane0))
2628 ReorderingModes[OpIdx] = ReorderingMode::Load;
2630 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2631 }
elseif (isa<Constant>(OpLane0)) {
2632 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2633 }
elseif (isa<Argument>(OpLane0)) {
2634// Our best hope is a Splat. It may save some cost in some cases. 2635 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2641// Check that we don't have same operands. No need to reorder if operands 2642// are just perfect diamond or shuffled diamond match. Do not do it only 2643// for possible broadcasts or non-power of 2 number of scalars (just for 2645auto &&SkipReordering = [
this]() {
2648for (
const OperandData &
Data : Op0)
2652if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
2657// TODO: Check if we can remove a check for non-power-2 number of 2658// scalars after full support of non-power-2 vectorization. 2659return UniqueValues.
size() != 2 &&
2661 UniqueValues.
size());
2664// If the initial strategy fails for any of the operand indexes, then we 2665// perform reordering again in a second pass. This helps avoid assigning 2666// high priority to the failed strategy, and should improve reordering for 2667// the non-failed operand indexes. 2669// Check if no need to reorder operands since they're are perfect or 2670// shuffled diamond match. 2671// Need to do it to avoid extra external use cost counting for 2672// shuffled matches, which may cause regressions. 2673if (SkipReordering())
2675// Skip the second pass if the first pass did not fail. 2676bool StrategyFailed =
false;
2677// Mark all operand data as free to use. 2679// We keep the original operand order for the FirstLane, so reorder the 2680// rest of the lanes. We are visiting the nodes in a circular fashion, 2681// using FirstLane as the center point and increasing the radius 2684for (
unsignedI = 0;
I < NumOperands; ++
I)
2685 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
2688 UsedLanes.
set(FirstLane);
2689for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2690// Visit the lane on the right and then the lane on the left. 2692int Lane = FirstLane +
Direction * Distance;
2693if (Lane < 0 || Lane >= (
int)NumLanes)
2695 UsedLanes.
set(Lane);
2697assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
2699// Look for a good match for each operand. 2700for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2701// Search for the operand that matches SortedOps[OpIdx][Lane-1]. 2702 std::optional<unsigned> BestIdx =
2703 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2704 MainAltOps[OpIdx], UsedLanes);
2705// By not selecting a value, we allow the operands that follow to 2706// select a better matching value. We will get a non-null value in 2707// the next run of getBestOperand(). 2709// Swap the current operand with the one returned by 2711 swap(OpIdx, *BestIdx, Lane);
2713// Enable the second pass. 2714 StrategyFailed =
true;
2716// Try to get the alternate opcode and follow it during analysis. 2717if (MainAltOps[OpIdx].
size() != 2) {
2718 OperandData &AltOp = getData(OpIdx, Lane);
2719 InstructionsState OpS =
2721if (OpS && OpS.isAltShuffle())
2727// Skip second pass if the strategy did not fail. 2733#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2736case ReorderingMode::Load:
2738case ReorderingMode::Opcode:
2740case ReorderingMode::Constant:
2742case ReorderingMode::Splat:
2744case ReorderingMode::Failed:
2765constunsigned Indent = 2;
2768OS <<
"Operand " << Cnt++ <<
"\n";
2769for (
const OperandData &OpData : OpDataVec) {
2771if (
Value *V = OpData.V)
2775OS <<
", APO:" << OpData.APO <<
"}\n";
2787 /// Evaluate each pair in \p Candidates and return index into \p Candidates 2788 /// for a pair which have highest score deemed to have best chance to form 2789 /// root of profitable tree to vectorize. Return std::nullopt if no candidate 2790 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit 2791 /// of the cost, considered to be good enough score. 2797int BestScore = Limit;
2798 std::optional<int> Index;
2799for (
intI : seq<int>(0, Candidates.size())) {
2801 Candidates[
I].second,
2802/*U1=*/nullptr,
/*U2=*/nullptr,
2803/*CurrLevel=*/1, {});
2804if (Score > BestScore) {
2812 /// Checks if the instruction is marked for deletion. 2815 /// Removes an instruction from its block and eventually deletes it. 2816 /// It's like Instruction::eraseFromParent() except that the actual deletion 2817 /// is delayed until BoUpSLP is destructed. 2819 DeletedInstructions.insert(
I);
2822 /// Remove instructions from the parent function and clear the operands of \p 2823 /// DeadVals instructions, marking for deletion trivially dead operands. 2824template <
typename T>
2827for (
T *V : DeadVals) {
2828auto *
I = cast<Instruction>(V);
2829 DeletedInstructions.insert(
I);
2832for (
T *V : DeadVals) {
2833if (!V || !Processed.
insert(V).second)
2835auto *
I = cast<Instruction>(V);
2838if (
const TreeEntry *Entry = getTreeEntry(
I)) {
2839 Entries.push_back(Entry);
2840auto It = MultiNodeScalars.find(
I);
2841if (It != MultiNodeScalars.end())
2842 Entries.append(It->second.begin(), It->second.end());
2844for (
Use &U :
I->operands()) {
2845if (
auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2846 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2848 (Entries.empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
2849return Entry->VectorizedValue == OpI;
2853I->dropAllReferences();
2855for (
T *V : DeadVals) {
2856auto *
I = cast<Instruction>(V);
2862 cast<Instruction>(U.getUser()));
2864"trying to erase instruction with users.");
2865I->removeFromParent();
2868// Process the dead instruction list until empty. 2869while (!DeadInsts.
empty()) {
2872if (!VI || !VI->getParent())
2875"Live instruction found in dead worklist!");
2876assert(VI->use_empty() &&
"Instructions with uses are not dead.");
2878// Don't lose the debug info while deleting the instructions. 2881// Null out all of the instruction's operands to see if any operand 2882// becomes dead as we go. 2883for (
Use &OpU : VI->operands()) {
2884Value *OpV = OpU.get();
2892// If the operand is an instruction that became dead as we nulled out 2893// the operand, and if it is 'trivially' dead, delete it in a future 2895if (
auto *OpI = dyn_cast<Instruction>(OpV))
2896if (!DeletedInstructions.contains(OpI) &&
2901 VI->removeFromParent();
2902 DeletedInstructions.insert(VI);
2907 /// Checks if the instruction was already analyzed for being possible 2910return AnalyzedReductionsRoots.count(
I);
2912 /// Register given instruction as already analyzed for being possible 2915 AnalyzedReductionsRoots.insert(
I);
2917 /// Checks if the provided list of reduced values was checked already for 2922 /// Adds the list of reduced values to list of already checked values for the 2927 /// Clear the list of the analyzed reduction root instructions. 2929 AnalyzedReductionsRoots.clear();
2930 AnalyzedReductionVals.
clear();
2931 AnalyzedMinBWVals.
clear();
2933 /// Checks if the given value is gathered in one of the nodes. 2937 /// Checks if the given value is gathered in one of the nodes. 2941 /// Checks if the specified value was not schedule. 2943return NonScheduledFirst.
contains(V);
2946 /// Check if the value is vectorized in the tree. 2952 /// Determine if a node \p E in can be demoted to a smaller type with a 2953 /// truncation. We collect the entries that will be demoted in ToDemote. 2954 /// \param E Node for analysis 2955 /// \param ToDemote indices of the nodes to be demoted. 2956bool collectValuesToDemote(
2957const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
2960bool &IsProfitableToDemote,
bool IsTruncRoot)
const;
2962 /// Check if the operands on the edges \p Edges of the \p UserTE allows 2963 /// reordering (i.e. the operands can be reordered because they have only one 2964 /// user and reordarable). 2965 /// \param ReorderableGathers List of all gather nodes that require reordering 2966 /// (e.g., gather of extractlements or partially vectorizable loads). 2967 /// \param GatherOps List of gather operand nodes for \p UserTE that require 2968 /// reordering, subset of \p NonVectorized. 2970 canReorderOperands(TreeEntry *UserTE,
2975 /// Checks if the given \p TE is a gather node with clustered reused scalars 2976 /// and reorders it per given \p Mask. 2977void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2979 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, 2980 /// if any. If it is not vectorized (gather node), returns nullptr. 2981 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
2983 TreeEntry *TE =
nullptr;
2985 TE = getTreeEntry(V);
2986if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2988auto It = MultiNodeScalars.find(V);
2989if (It != MultiNodeScalars.end()) {
2990for (TreeEntry *E : It->second) {
2991if (
is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2999if (It != VL.
end()) {
3000assert(
TE->isSame(VL) &&
"Expected same scalars.");
3006 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, 3007 /// if any. If it is not vectorized (gather node), returns nullptr. 3008const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
3009unsigned OpIdx)
const{
3010returnconst_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
3011const_cast<TreeEntry *
>(UserTE), OpIdx);
3014 /// Checks if all users of \p I are the part of the vectorization tree. 3015bool areAllUsersVectorized(
3019 /// Return information about the vector formed for the specified index 3020 /// of a vector of (the same) instruction. 3023 /// \ returns the graph entry for the \p Idx operand of the \p E entry. 3024const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsignedIdx)
const;
3026 /// Gets the root instruction for the given node. If the node is a strided 3027 /// load/store node with the reverse order, the root instruction is the last 3029Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
3031 /// \returns Cast context for the given graph node. 3033 getCastContextHint(
const TreeEntry &TE)
const;
3035 /// \returns the cost of the vectorizable entry. 3040 /// This is the recursive part of buildTree. 3042const EdgeInfo &EI,
unsigned InterleaveFactor = 0);
3044 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can 3045 /// be vectorized to use the original vector (or aggregate "bitcast" to a 3046 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise 3047 /// returns false, setting \p CurrentOrder to either an empty vector or a 3048 /// non-identity permutation that allows to reuse extract instructions. 3049 /// \param ResizeAllowed indicates whether it is allowed to handle subvector 3053bool ResizeAllowed =
false)
const;
3055 /// Vectorize a single entry in the tree. 3056 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to 3057 /// avoid issues with def-use order. 3060 /// Returns vectorized operand node, that matches the order of the scalars 3061 /// operand number \p NodeIdx in entry \p E. 3062 TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
unsigned NodeIdx);
3063const TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
3064unsigned NodeIdx)
const{
3065returnconst_cast<BoUpSLP *
>(
this)->getMatchedVectorizedOperand(E, NodeIdx);
3068 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry 3070 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to 3071 /// avoid issues with def-use order. 3072Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
3074 /// Create a new vector from a list of scalar values. Produces a sequence 3075 /// which exploits values reused across lanes, and arranges the inserts 3076 /// for ease of later optimization. 3077template <
typename BVTy,
typename ResTy,
typename...
Args>
3078 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
3080 /// Create a new vector from a list of scalar values. Produces a sequence 3081 /// which exploits values reused across lanes, and arranges the inserts 3082 /// for ease of later optimization. 3083Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy,
3086 /// Returns the instruction in the bundle, which can be used as a base point 3087 /// for scheduling. Usually it is the last instruction in the bundle, except 3088 /// for the case when all operands are external (in this case, it is the first 3089 /// instruction in the list). 3090Instruction &getLastInstructionInBundle(
const TreeEntry *E);
3092 /// Tries to find extractelement instructions with constant indices from fixed 3093 /// vector type and gather such instructions into a bunch, which highly likely 3094 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt 3095 /// was successful, the matched scalars are replaced by poison values in \p VL 3096 /// for future analysis. 3097 std::optional<TargetTransformInfo::ShuffleKind>
3101 /// Tries to find extractelement instructions with constant indices from fixed 3102 /// vector type and gather such instructions into a bunch, which highly likely 3103 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt 3104 /// was successful, the matched scalars are replaced by poison values in \p VL 3105 /// for future analysis. 3109unsigned NumParts)
const;
3111 /// Checks if the gathered \p VL can be represented as a single register 3112 /// shuffle(s) of previous tree entries. 3113 /// \param TE Tree entry checked for permutation. 3114 /// \param VL List of scalars (a subset of the TE scalar), checked for 3115 /// permutations. Must form single-register vector. 3116 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also 3117 /// commands to build the mask using the original vector value, without 3118 /// relying on the potential reordering. 3119 /// \returns ShuffleKind, if gathered values can be represented as shuffles of 3120 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. 3121 std::optional<TargetTransformInfo::ShuffleKind>
3122 isGatherShuffledSingleRegisterEntry(
3127 /// Checks if the gathered \p VL can be represented as multi-register 3128 /// shuffle(s) of previous tree entries. 3129 /// \param TE Tree entry checked for permutation. 3130 /// \param VL List of scalars (a subset of the TE scalar), checked for 3132 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also 3133 /// commands to build the mask using the original vector value, without 3134 /// relying on the potential reordering. 3135 /// \returns per-register series of ShuffleKind, if gathered values can be 3136 /// represented as shuffles of previous tree entries. \p Mask is filled with 3137 /// the shuffle mask (also on per-register base). 3139 isGatherShuffledEntry(
3142unsigned NumParts,
bool ForOrder =
false);
3144 /// \returns the cost of gathering (inserting) the values in \p VL into a 3146 /// \param ForPoisonSrc true if initial vector is poison, false otherwise. 3148Type *ScalarTy)
const;
3150 /// Set the Builder insert point to one after the last instruction in 3152void setInsertPointAfterBundle(
const TreeEntry *E);
3154 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not 3155 /// specified, the starting vector value is poison. 3160 /// \returns whether the VectorizableTree is fully vectorizable and will 3161 /// be beneficial even the tree height is tiny. 3162bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
3164 /// Run through the list of all gathered loads in the graph and try to find 3165 /// vector loads/masked gathers instead of regular gathers. Later these loads 3166 /// are reshufled to build final gathered nodes. 3167void tryToVectorizeGatheredLoads(
3172 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the 3173 /// users of \p TE and collects the stores. It returns the map from the store 3174 /// pointers to the collected stores. 3176 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
3178 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the 3179 /// stores in \p StoresVec can form a vector instruction. If so it returns 3180 /// true and populates \p ReorderIndices with the shuffle indices of the 3181 /// stores when compared to the sorted vector. 3185 /// Iterates through the users of \p TE, looking for scalar stores that can be 3186 /// potentially vectorized in a future SLP-tree. If found, it keeps track of 3187 /// their order and builds an order index vector for each store bundle. It 3188 /// returns all these order vectors found. 3189 /// We run this after the tree has formed, otherwise we may come across user 3190 /// instructions that are not yet in the tree. 3192 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
3194 /// Tries to reorder the gathering node for better vectorization 3196void reorderGatherNode(TreeEntry &TE);
3200 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3202 /// \returns Common mask for reorder indices and reused scalars. 3210 /// \returns true if the scalars in VL are equal to this entry. 3217 [Scalars](
Value *V,
int Idx) {
3218 return (isa<UndefValue>(V) &&
3219 Idx == PoisonMaskElem) ||
3220 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3223if (!ReorderIndices.empty()) {
3224// TODO: implement matching if the nodes are just reordered, still can 3225// treat the vector as the same if the list of scalars matches VL 3226// directly, without reordering. 3230return IsSame(Scalars, Mask);
3231if (VL.
size() == ReuseShuffleIndices.size()) {
3233return IsSame(Scalars, Mask);
3237return IsSame(Scalars, ReuseShuffleIndices);
3240bool isOperandGatherNode(
const EdgeInfo &UserEI)
const{
3241returnisGather() && !UserTreeIndices.empty() &&
3242 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3243 UserTreeIndices.front().UserTE == UserEI.UserTE;
3246 /// \returns true if current entry has same operands as \p TE. 3247bool hasEqualOperands(
const TreeEntry &TE)
const{
3248if (
TE.getNumOperands() != getNumOperands())
3251for (
unsignedI = 0, E = getNumOperands();
I < E; ++
I) {
3252unsigned PrevCount =
Used.count();
3253for (
unsigned K = 0;
K < E; ++
K) {
3256if (getOperand(K) ==
TE.getOperand(
I)) {
3261// Check if we actually found the matching operand. 3262if (PrevCount ==
Used.count())
3268 /// \return Final vectorization factor for the node. Defined by the total 3269 /// number of vectorized scalars, including those, used several times in the 3270 /// entry and counted in the \a ReuseShuffleIndices, if any. 3271unsigned getVectorFactor()
const{
3272if (!ReuseShuffleIndices.empty())
3273return ReuseShuffleIndices.size();
3274return Scalars.
size();
3277 /// Checks if the current node is a gather node. 3278boolisGather()
const{
return State == NeedToGather; }
3280 /// A vector of scalars. 3283 /// The Scalars are vectorized into this value. It is initialized to Null. 3286 /// New vector phi instructions emitted for the vectorized phi nodes. 3289 /// Do we need to gather this sequence or vectorize it 3290 /// (either with vector instruction or with scatter/gather 3291 /// intrinsics for store/load)? 3293 Vectorize,
///< The node is regularly vectorized. 3294 ScatterVectorize,
///< Masked scatter/gather node. 3295 StridedVectorize,
///< Strided loads (and stores) 3296 NeedToGather,
///< Gather/buildvector node. 3297 CombinedVectorize,
///< Vectorized node, combined with its user into more 3298 ///< complex node like select/cmp to minmax, mul/add to 3299 ///< fma, etc. Must be used for the following nodes in 3300 ///< the pattern, not the very first one. 3304 /// List of combined opcodes supported by the vectorizer. 3305enum CombinedOpcode {
3307MinMax = Instruction::OtherOpsEnd + 1,
3309 CombinedOpcode CombinedOp = NotCombinedOp;
3311 /// Does this sequence require some shuffling? 3314 /// Does this entry require reordering? 3317 /// Points back to the VectorizableTree. 3319 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has 3320 /// to be a pointer and needs to be able to initialize the child iterator. 3321 /// Thus we need a reference back to the container to translate the indices 3323 VecTreeTy &Container;
3325 /// The TreeEntry index containing the user of this entry. We can actually 3326 /// have multiple users so the data structure is not truly a tree. 3329 /// The index of this treeEntry in VectorizableTree. 3332 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from 3333 /// other nodes as a series of insertvector instructions. 3337 /// The operands of each instruction in each lane Operands[op_index][lane]. 3338 /// Note: This helps avoid the replication of the code that performs the 3339 /// reordering of operands during buildTree_rec() and vectorizeTree(). 3342 /// MainOp and AltOp are recorded inside. S should be obtained from 3344 InstructionsState S = InstructionsState::invalid();
3346 /// Interleaving factor for interleaved loads Vectorize nodes. 3347unsigned InterleaveFactor = 0;
3350 /// Returns interleave factor for interleave nodes. 3351unsigned getInterleaveFactor()
const{
return InterleaveFactor; }
3352 /// Sets interleaving factor for the interleaving nodes. 3353void setInterleave(
unsigned Factor) { InterleaveFactor = Factor; }
3355 /// Set this bundle's \p OpIdx'th operand to \p OpVL. 3359assert(Operands[OpIdx].empty() &&
"Already resized?");
3361"Number of operands is greater than the number of scalars.");
3366 /// Set this bundle's operand from Scalars. 3367void setOperand(
constBoUpSLP &R,
bool RequireReorder =
false) {
3368 VLOperands Ops(Scalars, S, R);
3371for (
unsignedI : seq<unsigned>(S.getMainOp()->getNumOperands()))
3372 setOperand(
I, Ops.getVL(
I));
3375 /// Reorders operands of the node to the given mask \p Mask. 3381 /// \returns the \p OpIdx operand of this TreeEntry. 3387 /// \returns the \p OpIdx operand of this TreeEntry. 3393 /// \returns the number of operands. 3394unsigned getNumOperands()
const{
returnOperands.size(); }
3396 /// \return the single \p OpIdx operand. 3397Value *getSingleOperand(
unsigned OpIdx)
const{
3399assert(!Operands[OpIdx].empty() &&
"No operand available");
3403 /// Some of the instructions in the list have alternate opcodes. 3404bool isAltShuffle()
const{
return S.isAltShuffle(); }
3406bool isOpcodeOrAlt(
Instruction *
I)
const{
return S.isOpcodeOrAlt(
I); }
3408 /// Chooses the correct key for scheduling data. If \p Op has the same (or 3409 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is 3412auto *
I = dyn_cast<Instruction>(
Op);
3413if (
I && isOpcodeOrAlt(
I))
3415return S.getMainOp();
3418void setOperations(
const InstructionsState &S) {
3419assert(S &&
"InstructionsState is invalid.");
3423Instruction *getMainOp()
const{
return S.getMainOp(); }
3425Instruction *getAltOp()
const{
return S.getAltOp(); }
3427 /// The main/alternate opcodes for the list of instructions. 3428unsigned getOpcode()
const{
return S.
getOpcode(); }
3430unsigned getAltOpcode()
const{
return S.getAltOpcode(); }
3432bool hasState()
const{
return S.valid(); }
3434 /// When ReuseReorderShuffleIndices is empty it just returns position of \p 3435 /// V within vector of Scalars. Otherwise, try to remap on its reuse index. 3436int findLaneForValue(
Value *V)
const{
3437unsigned FoundLane = getVectorFactor();
3438for (
auto *It =
find(Scalars, V), *
End = Scalars.end(); It !=
End;
3439 std::advance(It, 1)) {
3442 FoundLane = std::distance(Scalars.begin(), It);
3443assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
3444if (!ReorderIndices.
empty())
3445 FoundLane = ReorderIndices[FoundLane];
3446assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
3447if (ReuseShuffleIndices.
empty())
3449if (
auto *RIt =
find(ReuseShuffleIndices, FoundLane);
3450 RIt != ReuseShuffleIndices.
end()) {
3451 FoundLane = std::distance(ReuseShuffleIndices.
begin(), RIt);
3455assert(FoundLane < getVectorFactor() &&
"Unable to find given value.");
3459 /// Build a shuffle mask for graph entry which represents a merge of main 3460 /// and alternate operations. 3467 /// Return true if this is a non-power-of-2 node. 3468bool isNonPowOf2Vec()
const{
3470return IsNonPowerOf2;
3473 /// Return true if this is a node, which tries to vectorize number of 3474 /// elements, forming whole vectors. 3479assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
3480"Reshuffling not supported with non-power-of-2 vectors yet.");
3481return IsNonPowerOf2;
3484Value *getOrdered(
unsigned Idx)
const{
3485assert(
isGather() &&
"Must be used only for buildvectors/gathers.");
3486if (ReorderIndices.
empty())
3497for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
3498dbgs() <<
"Operand " << OpI <<
":\n";
3499for (
constValue *V : Operands[OpI])
3502dbgs() <<
"Scalars: \n";
3503for (
Value *V : Scalars)
3508if (InterleaveFactor > 0) {
3509dbgs() <<
"Vectorize with interleave factor " << InterleaveFactor
3512dbgs() <<
"Vectorize\n";
3515case ScatterVectorize:
3516dbgs() <<
"ScatterVectorize\n";
3518case StridedVectorize:
3519dbgs() <<
"StridedVectorize\n";
3522dbgs() <<
"NeedToGather\n";
3524case CombinedVectorize:
3525dbgs() <<
"CombinedVectorize\n";
3529dbgs() <<
"MainOp: " << *S.getMainOp() <<
"\n";
3530dbgs() <<
"AltOp: " << *S.getAltOp() <<
"\n";
3532dbgs() <<
"MainOp: NULL\n";
3533dbgs() <<
"AltOp: NULL\n";
3535dbgs() <<
"VectorizedValue: ";
3537dbgs() << *VectorizedValue <<
"\n";
3540dbgs() <<
"ReuseShuffleIndices: ";
3541if (ReuseShuffleIndices.
empty())
3544for (
int ReuseIdx : ReuseShuffleIndices)
3545dbgs() << ReuseIdx <<
", ";
3547dbgs() <<
"ReorderIndices: ";
3548for (
unsigned ReorderIdx : ReorderIndices)
3549dbgs() << ReorderIdx <<
", ";
3551dbgs() <<
"UserTreeIndices: ";
3552for (
constauto &EInfo : UserTreeIndices)
3553dbgs() << EInfo <<
", ";
3555if (!CombinedEntriesWithIndices.
empty()) {
3556dbgs() <<
"Combined entries: ";
3558dbgs() <<
"Entry index " <<
P.first <<
" with offset " <<
P.second;
3567void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
3570dbgs() <<
"SLP: " << Banner <<
":\n";
3572dbgs() <<
"SLP: Costs:\n";
3573dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
3574dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
3575dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
3576dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = " 3577 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
3581 /// Create a new VectorizableTree entry. 3583 std::optional<ScheduleData *> Bundle,
3584const InstructionsState &S,
3585const EdgeInfo &UserTreeIdx,
3588unsigned InterleaveFactor = 0) {
3589 TreeEntry::EntryState EntryState =
3590 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3591 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3592 ReuseShuffleIndices, ReorderIndices);
3593if (E && InterleaveFactor > 0)
3594 E->setInterleave(InterleaveFactor);
3599 TreeEntry::EntryState EntryState,
3600 std::optional<ScheduleData *> Bundle,
3601const InstructionsState &S,
3602const EdgeInfo &UserTreeIdx,
3605assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3606 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3607"Need to vectorize gather entry?");
3608// Gathered loads still gathered? Do not create entry, use the original one. 3609if (GatheredLoadsEntriesFirst.has_value() &&
3610 EntryState == TreeEntry::NeedToGather && S &&
3611 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3612 !UserTreeIdx.UserTE)
3614 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
3615 TreeEntry *
Last = VectorizableTree.
back().get();
3616Last->Idx = VectorizableTree.
size() - 1;
3617Last->State = EntryState;
3618// FIXME: Remove once support for ReuseShuffleIndices has been implemented 3619// for non-power-of-two vectors. 3622 ReuseShuffleIndices.empty()) &&
3623"Reshuffling scalars not yet supported for nodes with padding");
3624Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3625 ReuseShuffleIndices.end());
3626if (ReorderIndices.
empty()) {
3629Last->setOperations(S);
3631// Reorder scalars and build final mask. 3632Last->Scalars.assign(VL.
size(),
nullptr);
3635 if (Idx >= VL.size())
3636 return UndefValue::get(VL.front()->getType());
3641Last->setOperations(S);
3642Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
3644if (!
Last->isGather()) {
3645for (
Value *V : VL) {
3646if (isa<PoisonValue>(V))
3648const TreeEntry *
TE = getTreeEntry(V);
3650"Scalar already in tree!");
3653 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
3656 ScalarToTreeEntry[
V] =
Last;
3658// Update the scheduler bundle to point to this TreeEntry. 3659 ScheduleData *BundleMember = *Bundle;
3660assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3663"Bundle and VL out of sync");
3665for (
Value *V : VL) {
3670 BundleMember->TE =
Last;
3671 BundleMember = BundleMember->NextInBundle;
3674assert(!BundleMember &&
"Bundle and VL out of sync");
3676// Build a map for gathered scalars to the nodes where they are used. 3677bool AllConstsOrCasts =
true;
3680auto *
I = dyn_cast<CastInst>(V);
3681 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
3682if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3683 !UserTreeIdx.UserTE->isGather())
3686if (AllConstsOrCasts)
3688 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3689 MustGather.
insert(VL.begin(), VL.end());
3692if (UserTreeIdx.UserTE)
3693Last->UserTreeIndices.push_back(UserTreeIdx);
3697 /// -- Vectorization State -- 3698 /// Holds all of the tree entries. 3699 TreeEntry::VecTreeTy VectorizableTree;
3704for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3705 VectorizableTree[
Id]->dump();
3711 TreeEntry *getTreeEntry(
Value *V) {
3712assert(V &&
"V cannot be nullptr.");
3713return ScalarToTreeEntry.lookup(V);
3716const TreeEntry *getTreeEntry(
Value *V)
const{
3717assert(V &&
"V cannot be nullptr.");
3718return ScalarToTreeEntry.lookup(V);
3721 /// Check that the operand node of alternate node does not generate 3722 /// buildvector sequence. If it is, then probably not worth it to build 3723 /// alternate shuffle, if number of buildvector operands + alternate 3724 /// instruction > than the number of buildvector instructions. 3725 /// \param S the instructions state of the analyzed values. 3726 /// \param VL list of the instructions with alternate opcodes. 3727bool areAltOperandsProfitable(
const InstructionsState &S,
3730 /// Checks if the specified list of the instructions/values can be vectorized 3731 /// and fills required data before actual scheduling of the instructions. 3732 TreeEntry::EntryState
3734bool IsScatterVectorizeUserTE,
3738 /// Maps a specific scalar to its tree entry. 3741 /// List of scalars, used in several vectorize nodes, and the list of the 3745 /// Maps a value to the proposed vectorizable size. 3748 /// A list of scalars that we found that we need to keep as scalars. 3751 /// A set of first non-schedulable values. 3754 /// A map between the vectorized entries and the last instructions in the 3755 /// bundles. The bundles are built in use order, not in the def order of the 3756 /// instructions. So, we cannot rely directly on the last instruction in the 3757 /// bundle being the last instruction in the program order during 3758 /// vectorization process since the basic blocks are affected, need to 3759 /// pre-gather them before. 3762 /// List of gather nodes, depending on other gather/vector nodes, which should 3763 /// be emitted after the vector instruction emission process to correctly 3764 /// handle order of the vector instructions and shuffles. 3767usingValueToGatherNodesMap =
3769 ValueToGatherNodesMap ValueToGatherNodes;
3771 /// A list of the load entries (node indices), which can be vectorized using 3772 /// strided or masked gather approach, but attempted to be represented as 3773 /// contiguous loads. 3776 /// true if graph nodes transforming mode is on. 3777bool IsGraphTransformMode =
false;
3779 /// The index of the first gathered load entry in the VectorizeTree. 3780 std::optional<unsigned> GatheredLoadsEntriesFirst;
3782 /// This POD struct describes one external user in the vectorized tree. 3787// Which scalar in our function. 3790// Which user that uses the scalar. 3793// Which lane does the scalar belong to. 3798 /// Checks if two instructions may access the same memory. 3800 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it 3801 /// is invariant in the calling loop. 3806// First check if the result is already in the cache. 3807 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3808auto It = AliasCache.
find(Key);
3809if (It != AliasCache.
end())
3812// Store the result in the cache. 3814 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3818usingAliasCacheKey = std::pair<Instruction *, Instruction *>;
3820 /// Cache for alias results. 3821 /// TODO: consider moving this to the AliasAnalysis itself. 3824// Cache for pointerMayBeCaptured calls inside AA. This is preserved 3825// globally through SLP because we don't perform any action which 3826// invalidates capture results. 3829 /// Temporary store for deleted instructions. Instructions will be deleted 3830 /// eventually when the BoUpSLP is destructed. The deferral is required to 3831 /// ensure that there are no incorrect collisions in the AliasCache, which 3832 /// can happen if a new instruction is allocated at the same address as a 3833 /// previously deleted instruction. 3836 /// Set of the instruction, being analyzed already for reductions. 3839 /// Set of hashes for the list of reduction values already being analyzed. 3842 /// Values, already been analyzed for mininmal bitwidth and found to be 3846 /// A list of values that need to extracted out of the tree. 3847 /// This list holds pairs of (Internal Scalar : External User). External User 3848 /// can be nullptr, it means that this Internal Scalar will be used later, 3849 /// after vectorization. 3850 UserList ExternalUses;
3852 /// A list of GEPs which can be reaplced by scalar GEPs instead of 3853 /// extractelement instructions. 3856 /// Values used only by @llvm.assume calls. 3859 /// Holds all of the instructions that we gathered, shuffle instructions and 3860 /// extractelements. 3863 /// A list of blocks that we are going to CSE. 3866 /// List of hashes of vector of loads, which are known to be non vectorizable. 3869 /// Contains all scheduling relevant data for an instruction. 3870 /// A ScheduleData either represents a single instruction or a member of an 3871 /// instruction bundle (= a group of instructions which is combined into a 3872 /// vector instruction). 3874// The initial value for the dependency counters. It means that the 3875// dependencies are not calculated yet. 3876enum { InvalidDeps = -1 };
3878 ScheduleData() =
default;
3881 FirstInBundle =
this;
3882 NextInBundle =
nullptr;
3883 NextLoadStore =
nullptr;
3885 SchedulingRegionID = BlockSchedulingRegionID;
3886 clearDependencies();
3891 /// Verify basic self consistency properties 3893if (hasValidDependencies()) {
3894assert(UnscheduledDeps <= Dependencies &&
"invariant");
3896assert(UnscheduledDeps == Dependencies &&
"invariant");
3900assert(isSchedulingEntity() &&
3901"unexpected scheduled state");
3902for (
const ScheduleData *BundleMember =
this; BundleMember;
3903 BundleMember = BundleMember->NextInBundle) {
3904assert(BundleMember->hasValidDependencies() &&
3905 BundleMember->UnscheduledDeps == 0 &&
3906"unexpected scheduled state");
3907assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
3908"only bundle is marked scheduled");
3912assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3913"all bundle members must be in same basic block");
3916 /// Returns true if the dependency information has been calculated. 3917 /// Note that depenendency validity can vary between instructions within 3918 /// a single bundle. 3919bool hasValidDependencies()
const{
return Dependencies != InvalidDeps; }
3921 /// Returns true for single instructions and for bundle representatives 3922 /// (= the head of a bundle). 3923bool isSchedulingEntity()
const{
return FirstInBundle ==
this; }
3925 /// Returns true if it represents an instruction bundle and not only a 3926 /// single instruction. 3927bool isPartOfBundle()
const{
3928return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
3931 /// Returns true if it is ready for scheduling, i.e. it has no more 3932 /// unscheduled depending instructions/bundles. 3933bool isReady()
const{
3934assert(isSchedulingEntity() &&
3935"can't consider non-scheduling entity for ready list");
3936return unscheduledDepsInBundle() == 0 && !IsScheduled;
3939 /// Modifies the number of unscheduled dependencies for this instruction, 3940 /// and returns the number of remaining dependencies for the containing 3942int incrementUnscheduledDeps(
int Incr) {
3943assert(hasValidDependencies() &&
3944"increment of unscheduled deps would be meaningless");
3945 UnscheduledDeps += Incr;
3946return FirstInBundle->unscheduledDepsInBundle();
3949 /// Sets the number of unscheduled dependencies to the number of 3951void resetUnscheduledDeps() {
3952 UnscheduledDeps = Dependencies;
3955 /// Clears all dependency information. 3956void clearDependencies() {
3957 Dependencies = InvalidDeps;
3958 resetUnscheduledDeps();
3959 MemoryDependencies.clear();
3960 ControlDependencies.clear();
3963int unscheduledDepsInBundle()
const{
3964assert(isSchedulingEntity() &&
"only meaningful on the bundle");
3966for (
const ScheduleData *BundleMember =
this; BundleMember;
3967 BundleMember = BundleMember->NextInBundle) {
3968if (BundleMember->UnscheduledDeps == InvalidDeps)
3970 Sum += BundleMember->UnscheduledDeps;
3976if (!isSchedulingEntity()) {
3978 }
elseif (NextInBundle) {
3980 ScheduleData *SD = NextInBundle;
3982 os <<
';' << *SD->Inst;
3983 SD = SD->NextInBundle;
3995 /// The TreeEntry that this instruction corresponds to. 3996 TreeEntry *
TE =
nullptr;
3998 /// Points to the head in an instruction bundle (and always to this for 3999 /// single instructions). 4000 ScheduleData *FirstInBundle =
nullptr;
4002 /// Single linked list of all instructions in a bundle. Null if it is a 4003 /// single instruction. 4004 ScheduleData *NextInBundle =
nullptr;
4006 /// Single linked list of all memory instructions (e.g. load, store, call) 4007 /// in the block - until the end of the scheduling region. 4008 ScheduleData *NextLoadStore =
nullptr;
4010 /// The dependent memory instructions. 4011 /// This list is derived on demand in calculateDependencies(). 4014 /// List of instructions which this instruction could be control dependent 4015 /// on. Allowing such nodes to be scheduled below this one could introduce 4016 /// a runtime fault which didn't exist in the original program. 4017 /// ex: this is a load or udiv following a readonly call which inf loops 4020 /// This ScheduleData is in the current scheduling region if this matches 4021 /// the current SchedulingRegionID of BlockScheduling. 4022int SchedulingRegionID = 0;
4024 /// Used for getting a "good" final ordering of instructions. 4025int SchedulingPriority = 0;
4027 /// The number of dependencies. Constitutes of the number of users of the 4028 /// instruction plus the number of dependent memory instructions (if any). 4029 /// This value is calculated on demand. 4030 /// If InvalidDeps, the number of dependencies is not calculated yet. 4031int Dependencies = InvalidDeps;
4033 /// The number of dependencies minus the number of dependencies of scheduled 4034 /// instructions. As soon as this is zero, the instruction/bundle gets ready 4036 /// Note that this is negative as long as Dependencies is not calculated. 4037int UnscheduledDeps = InvalidDeps;
4039 /// True if this instruction is scheduled (or considered as scheduled in the 4041bool IsScheduled =
false;
4046const BoUpSLP::ScheduleData &SD) {
4055 /// Contains all scheduling data for a basic block. 4056 /// It does not schedules instructions, which are not memory read/write 4057 /// instructions and their operands are either constants, or arguments, or 4058 /// phis, or instructions from others blocks, or their users are phis or from 4059 /// the other blocks. The resulting vector instructions can be placed at the 4060 /// beginning of the basic block without scheduling (if operands does not need 4061 /// to be scheduled) or at the end of the block (if users are outside of the 4062 /// block). It allows to save some compile time and memory used by the 4064 /// ScheduleData is assigned for each instruction in between the boundaries of 4065 /// the tree entry, even for those, which are not part of the graph. It is 4066 /// required to correctly follow the dependencies between the instructions and 4067 /// their correct scheduling. The ScheduleData is not allocated for the 4068 /// instructions, which do not require scheduling, like phis, nodes with 4069 /// extractelements/insertelements only or nodes with instructions, with 4070 /// uses/operands outside of the block. 4071structBlockScheduling {
4073 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
4077 ScheduleStart =
nullptr;
4078 ScheduleEnd =
nullptr;
4079 FirstLoadStoreInRegion =
nullptr;
4080 LastLoadStoreInRegion =
nullptr;
4081 RegionHasStackSave =
false;
4083// Reduce the maximum schedule region size by the size of the 4084// previous scheduling run. 4085 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4088 ScheduleRegionSize = 0;
4090// Make a new scheduling region, i.e. all existing ScheduleData is not 4091// in the new region yet. 4092 ++SchedulingRegionID;
4096if (BB !=
I->getParent())
4097// Avoid lookup if can't possibly be in map. 4099 ScheduleData *SD = ScheduleDataMap.lookup(
I);
4100if (SD && isInSchedulingRegion(SD))
4105 ScheduleData *getScheduleData(
Value *V) {
4106if (
auto *
I = dyn_cast<Instruction>(V))
4107return getScheduleData(
I);
4111bool isInSchedulingRegion(ScheduleData *SD)
const{
4112return SD->SchedulingRegionID == SchedulingRegionID;
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {

        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after
            // decrementing, so we can put the dependent instruction
            // into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be reordered.
          auto *In = BundleMember->Inst;
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, In));
          assert(Lane >= 0 && "Lane not set");

          // Since the vectorization tree is being built recursively, this
          // assertion ensures that the tree entry has all operands set before
          // reaching this code. A couple of exceptions known at the moment are
          // extracts where their second (immediate) operand is not added.
          // Since immediates do not affect scheduler behavior this is
          // considered okay.
          assert(
              In &&
              (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
               In->getNumOperands() == TE->getNumOperands()) &&
              "Missed TreeEntry operands?");

          for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand reordering
          // has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }

        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }
    /// Verify basic self consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        SD->verify();
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
      }
    }
    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            SD->isReady()) {
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP: initially in ready list: " << *SD << "\n");
        }
      }
    }
    /// Build a bundle from the ScheduleData nodes corresponding to the
    /// scalar instruction for each lane.
    ScheduleData *buildBundle(ArrayRef<Value *> VL);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleData *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

    /// Allocates schedule data chunk.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();
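
    // Illustrative flow (sketch based on the declarations above): during tree
    // construction, tryScheduleBundle() extends the scheduling region to cover
    // the bundle, initializes ScheduleData for newly covered instructions,
    // calculates dependencies, and dry-runs the scheduling. If a bundle member
    // can never become ready (a cyclic dependency), the bundle is taken apart
    // again via cancelScheduling().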
    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleData *> ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the
    /// common case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    /// Make sure that the initial SchedulingRegionID is greater than the
    /// initial SchedulingRegionID in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };
  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;

  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
  /// sorted SmallVectors of unsigned.
  struct OrdersTypeDenseMapInfo {
4361staticunsigned getHashValue(
constOrdersType &V) {
4370// Analysis and block reference. 4382unsigned MaxVecRegSize;
// This is set by TTI or overridden by cl::opt. 4383unsigned MinVecRegSize;
// Set by cl::opt (default: 128). 4385 /// Instruction builder to construct the vectorized tree. 4388 /// A map of scalar integer values to the smallest bit width with which they 4389 /// can legally be represented. The values map to (width, signed) pairs, 4390 /// where "width" indicates the minimum bit width and "signed" is True if the 4391 /// value must be signed-extended, rather than zero-extended, back to its 4395 /// Final size of the reduced vector, if the current graph represents the 4396 /// input for the reduction and it was possible to narrow the size of the 4398unsigned ReductionBitWidth = 0;
4400 /// Canonical graph size before the transformations. 4401unsigned BaseGraphSize = 1;
4403 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of 4404 /// type sizes, used in the tree. 4405 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4407 /// Indices of the vectorized nodes, which supposed to be the roots of the new 4408 /// bitwidth analysis attempt, like trunc, IToFP or ICmp. 4412}
// end namespace slpvectorizer 4417 /// NodeRef has to be a pointer per the GraphWriter. 4422 /// Add the VectorizableTree to the index iterator to be able to return 4423 /// TreeEntry pointers. 4424structChildIteratorType
4426 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4437return R.VectorizableTree[0].get();
4441return {
N->UserTreeIndices.begin(),
N->Container};
4445return {
N->UserTreeIndices.end(),
N->Container};
4448 /// For the node iterator we just need to turn the TreeEntry iterator into a 4449 /// TreeEntry* iterator so that it dereferences to NodeRef. 4450classnodes_iterator {
4461booloperator!=(
const nodes_iterator &N2)
const{
return N2.It != It; }
4465return nodes_iterator(R->VectorizableTree.begin());
4469return nodes_iterator(R->VectorizableTree.end());
4483OS << Entry->Idx <<
".\n";
4486for (
auto *V : Entry->Scalars) {
4488if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
4489 return EU.Scalar == V;
4499if (Entry->isGather())
4501if (Entry->State == TreeEntry::ScatterVectorize ||
4502 Entry->State == TreeEntry::StridedVectorize)
4508}
// end namespace llvm 4512for (
auto *
I : DeletedInstructions) {
4513if (!
I->getParent()) {
4514// Temporarily insert instruction back to erase them from parent and 4517// Phi nodes must be the very first instructions in the block. 4518I->insertBefore(
F->getEntryBlock(),
4519F->getEntryBlock().getFirstNonPHIIt());
4521I->insertBefore(
F->getEntryBlock().getTerminator()->getIterator());
4524for (
Use &U :
I->operands()) {
4525auto *
Op = dyn_cast<Instruction>(U.get());
4526if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
4530I->dropAllReferences();
4532for (
auto *
I : DeletedInstructions) {
4534"trying to erase instruction with users.");
4535I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  verifyFunction(*F);
#endif
}

/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node. The
/// procedure transforms this mask in accordance with the given \p Mask.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
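
// For example, with Reuses = {0, 1, 2, 3} and Mask = {2, 3, 0, 1} the loop
// above yields Reuses = {2, 3, 0, 1}: the previous value at position I is
// moved to position Mask[I].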
4561/// Reorders the given \p Order according to the given \p Mask. \p Order - is 4562/// the original order of the scalars. Procedure transforms the provided order 4563/// in accordance with the given \p Mask. If the resulting \p Order is just an 4564/// identity order, \p Order is cleared. 4566bool BottomOrder =
false) {
4567assert(!Mask.empty() &&
"Expected non-empty mask.");
4568unsigned Sz = Mask.size();
4573 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
4575 PrevOrder.
swap(Order);
4578for (
unsignedI = 0;
I < Sz; ++
I)
4580 Order[
I] = PrevOrder[Mask[
I]];
4593 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
4603for (
unsignedI = 0;
I < Sz; ++
I)
4605 Order[MaskOrder[
I]] =
I;
4609std::optional<BoUpSLP::OrdersType>
4611assert(TE.isGather() &&
"Expected gather node only.");
4612// Try to find subvector extract/insert patterns and reorder only such 4615Type *ScalarTy = GatheredScalars.
front()->getType();
4616int NumScalars = GatheredScalars.
size();
4621if (NumParts == 0 || NumParts >= NumScalars ||
4622 VecTy->getNumElements() % NumParts != 0 ||
4624 VecTy->getNumElements() / NumParts))
4630 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4632 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4634// No shuffled operands - ignore. 4635if (GatherShuffles.
empty() && ExtractShuffles.
empty())
4637OrdersType CurrentOrder(NumScalars, NumScalars);
4638if (GatherShuffles.
size() == 1 &&
4640 Entries.front().front()->isSame(TE.Scalars)) {
4641// Perfect match in the graph, will reuse the previously vectorized 4643 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
4654// Exclusive broadcast mask - ignore. 4655if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
4656 (Entries.size() != 1 ||
4657 Entries.front().front()->ReorderIndices.empty())) ||
4658 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
4664for (
intI : seq<int>(0, NumParts)) {
4665if (ShuffledSubMasks.
test(
I))
4667constint VF = GetVF(
I);
4672// Shuffle of at least 2 vectors - ignore. 4673if (
any_of(Slice, [&](
intI) {
returnI != NumScalars; })) {
4674 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4675 ShuffledSubMasks.
set(
I);
4678// Try to include as much elements from the mask as possible. 4679int FirstMin = INT_MAX;
4680int SecondVecFound =
false;
4681for (
int K : seq<int>(Limit)) {
4682intIdx = Mask[
I * PartSz + K];
4684Value *V = GatheredScalars[
I * PartSz + K];
4686 SecondVecFound =
true;
4695 SecondVecFound =
true;
4699 FirstMin = (FirstMin / PartSz) * PartSz;
4700// Shuffle of at least 2 vectors - ignore. 4701if (SecondVecFound) {
4702 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4703 ShuffledSubMasks.
set(
I);
4706for (
int K : seq<int>(Limit)) {
4707intIdx = Mask[
I * PartSz + K];
4712 SecondVecFound =
true;
4715if (CurrentOrder[
I * PartSz +
Idx] >
4716static_cast<unsigned>(
I * PartSz + K) &&
4717 CurrentOrder[
I * PartSz +
Idx] !=
4718static_cast<unsigned>(
I * PartSz +
Idx))
4719 CurrentOrder[
I * PartSz +
Idx] =
I * PartSz + K;
4721// Shuffle of at least 2 vectors - ignore. 4722if (SecondVecFound) {
4723 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4724 ShuffledSubMasks.
set(
I);
4730if (!ExtractShuffles.
empty())
4731 TransformMaskToOrder(
4732 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsignedI) {
4733if (!ExtractShuffles[
I])
4736unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
4737for (
unsignedIdx : seq<unsigned>(Sz)) {
4738int K =
I * PartSz +
Idx;
4741if (!TE.ReuseShuffleIndices.empty())
4742 K = TE.ReuseShuffleIndices[K];
4745if (!TE.ReorderIndices.empty())
4746 K = std::distance(TE.ReorderIndices.begin(),
4747find(TE.ReorderIndices, K));
4748auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4751 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4753 .getKnownMinValue());
4757// Check special corner case - single shuffle of the same entry. 4758if (GatherShuffles.
size() == 1 && NumParts != 1) {
4759if (ShuffledSubMasks.
any())
4761 PartSz = NumScalars;
4764if (!Entries.empty())
4765 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsignedI) {
4766if (!GatherShuffles[
I])
4768return std::max(Entries[
I].front()->getVectorFactor(),
4769 Entries[
I].back()->getVectorFactor());
4773if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4775return std::move(CurrentOrder);
}

static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}
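
// Example (illustrative): two single-index GEPs off the same base pointer,
// such as "gep i32, ptr %base, i64 1" and "gep i32, ptr %base, i64 2", are
// compatible; pointers derived from different underlying objects are not.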
/// Calculates minimal alignment as a common alignment.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}

/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  assert(!Order.empty() &&
         "Order is empty. Please check it before using isReverseOrder.");
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}
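
// For example, Order = {3, 2, 1, 0} is a reverse order for Sz == 4; elements
// equal to Sz act as "don't care" slots, so {3, 4, 1, 0} also counts as
// reversed.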
4814/// Checks if the provided list of pointers \p Pointers represents the strided 4815/// pointers for type ElemTy. If they are not, std::nullopt is returned. 4816/// Otherwise, if \p Inst is not specified, just initialized optional value is 4817/// returned to show that the pointers represent strided pointers. If \p Inst 4818/// specified, the runtime stride is materialized before the given \p Inst. 4819/// \returns std::nullopt if the pointers are not pointers with the runtime 4820/// stride, nullptr or actual stride value, otherwise. 4821static std::optional<Value *>
4827constSCEV *PtrSCEVLowest =
nullptr;
4828constSCEV *PtrSCEVHighest =
nullptr;
4829// Find lower/upper pointers from the PointerOps (i.e. with lowest and highest 4836if (!PtrSCEVLowest && !PtrSCEVHighest) {
4837 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4841if (isa<SCEVCouldNotCompute>(Diff))
4844 PtrSCEVLowest = PtrSCEV;
4848if (isa<SCEVCouldNotCompute>(Diff1))
4851 PtrSCEVHighest = PtrSCEV;
4855// Dist = PtrSCEVHighest - PtrSCEVLowest; 4857if (isa<SCEVCouldNotCompute>(Dist))
4859intSize =
DL.getTypeStoreSize(ElemTy);
4860auto TryGetStride = [&](
constSCEV *Dist,
4861constSCEV *Multiplier) ->
constSCEV * {
4862if (
constauto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4863if (M->getOperand(0) == Multiplier)
4864return M->getOperand(1);
4865if (M->getOperand(1) == Multiplier)
4866return M->getOperand(0);
4869if (Multiplier == Dist)
4873// Stride_in_elements = Dist / element_size * (num_elems - 1). 4874constSCEV *Stride =
nullptr;
4875if (
Size != 1 || SCEVs.
size() > 2) {
4877 Stride = TryGetStride(Dist, Sz);
4881if (!Stride || isa<SCEVConstant>(Stride))
4883// Iterate through all pointers and check if all distances are 4884// unique multiple of Stride. 4885usingDistOrdPair = std::pair<int64_t, int>;
4887 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
4889bool IsConsecutive =
true;
4890for (
constSCEV *PtrSCEV : SCEVs) {
4892if (PtrSCEV != PtrSCEVLowest) {
4894constSCEV *Coeff = TryGetStride(Diff, Stride);
4897constauto *SC = dyn_cast<SCEVConstant>(Coeff);
4898if (!SC || isa<SCEVCouldNotCompute>(SC))
4904 Dist = SC->getAPInt().getZExtValue();
4906// If the strides are not the same or repeated, we can't vectorize. 4909auto Res = Offsets.emplace(Dist, Cnt);
4912// Consecutive order if the inserted element is the last one. 4913 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4916if (Offsets.size() != SCEVs.
size())
4918 SortedIndices.
clear();
4919if (!IsConsecutive) {
4920// Fill SortedIndices array only if it is non-consecutive. 4923for (
const std::pair<int64_t, int> &Pair : Offsets) {
4924 SortedIndices[Cnt] = Pair.second;
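      // Illustrative example (sketch): for pointers p, p + 3*S, p + S,
      // p + 2*S with a runtime stride S, the offsets from the lowest pointer
      // are the multiples 0, 3, 1, 2 of S; sorted by multiple, the
      // (multiple, original index) pairs are (0,0), (1,2), (2,3), (3,1), so
      // the loop above fills SortedIndices with {0, 2, 3, 1}.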
4934static std::pair<InstructionCost, InstructionCost>
/// Returns the cost of the shuffle instructions with the given \p Kind, vector
/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for
/// insert subvector pattern.
int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4953 Mask, NumSrcElts, NumSubElts,
Index)) {
4954if (
Index + NumSubElts > NumSrcElts &&
4955Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
/// Correctly creates insert_subvector, checking that the index is a multiple
/// of the subvector's length. Otherwise, generates a shuffle using
/// \p Generator or the default shuffle.
if (
Index % SubVecVF == 0) {
4975// Create shuffle, insertvector requires that index is multiple of 4976// the subvector length. 4979 std::iota(
Mask.begin(),
Mask.end(), 0);
4980for (
unsignedI : seq<unsigned>(SubVecVF))
4983 Vec = Generator(Vec, V, Mask);
4985// 1. Resize V to the size of Vec. 4987 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
/// Correctly creates extract_subvector, checking that the index is a multiple
/// of the subvector's length. Otherwise, generates a shuffle using
/// \p Generator or the default shuffle.
unsigned SubVecVF,
unsignedIndex) {
5000if (
Index % SubVecVF == 0) {
5005// Create shuffle, extract_subvector requires that index is multiple of 5006// the subvector length. 5008 std::iota(Mask.begin(), Mask.end(),
Index);
5016unsigned *BestVF,
bool TryRecursiveCheck)
const{
5017// Check that a vectorized load would load the same memory as a scalar 5018// load. For example, we don't want to vectorize loads that are smaller 5019// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 5020// treats loading/storing it as an i8 struct. If we vectorize loads/stores 5021// from such a struct, we read/write packed bits disagreeing with the 5022// unvectorized version. 5029if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
5032// Make sure all loads in the bundle are simple - we can't vectorize 5033// atomic or volatile loads. 5035constunsigned Sz = VL.
size();
5037auto *POIter = PointerOps.
begin();
5038for (
Value *V : VL) {
5039auto *L = dyn_cast<LoadInst>(V);
5040if (!L || !L->isSimple())
5042 *POIter = L->getPointerOperand();
5047// Check the order of pointer operands or that all pointers are the same. 5051Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5072 Ptr0 = PointerOps.
front();
5073 PtrN = PointerOps.
back();
5075 Ptr0 = PointerOps[Order.
front()];
5076 PtrN = PointerOps[Order.
back()];
5078 std::optional<int> Diff =
5080// Check that the sorted loads are consecutive. 5081if (
static_cast<unsigned>(*Diff) == Sz - 1)
5086// Simple check if not a strided access - clear order. 5087bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5088// Try to generate strided load node if: 5089// 1. Target with strided load support is detected. 5090// 2. The number of loads is greater than MinProfitableStridedLoads, 5091// or the potential stride <= MaxProfitableLoadStride and the 5092// potential stride is power-of-2 (to avoid perf regressions for the very 5093// small number of loads) and max distance > number of loads, or potential 5095// 3. The loads are ordered, or number of unordered loads <= 5096// MaxProfitableUnorderedLoads, or loads are in reversed order. 5097// (this check is to avoid extra costs for very expensive shuffles). 5098// 4. Any pointer operand is an instruction with the users outside of the 5099// current graph (for masked gathers extra extractelement instructions 5100// might be required). 5101auto IsAnyPointerUsedOutGraph =
5102 IsPossibleStrided &&
any_of(PointerOps, [&](
Value *V) {
5103return isa<Instruction>(V) &&
any_of(V->users(), [&](
User *U) {
5104 return !getTreeEntry(U) && !MustGather.contains(U);
5107constunsigned AbsoluteDiff = std::abs(*Diff);
5108if (IsPossibleStrided &&
5109 (IsAnyPointerUsedOutGraph ||
5110 (AbsoluteDiff > Sz &&
5114 *Diff == -(
static_cast<int>(Sz) - 1))) {
5115int Stride = *Diff /
static_cast<int>(Sz - 1);
5116if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
5121// Iterate through all pointers and check if all distances are 5122// unique multiple of Dist. 5130// If the strides are not the same or repeated, we can't 5132if (((Dist / Stride) * Stride) != Dist ||
5133 !Dists.
insert(Dist).second)
5136if (Dists.
size() == Sz)
  // Compare the cost of loads + shuffles rather than strided/masked gather
  // loads. Returns true if the vectorized + shuffles representation is better
  // than just gather.
  auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment,
5147bool ProfitableGatherPointers) {
5150// Compare masked gather cost and loads + insert subvector costs. 5152auto [ScalarGEPCost, VectorGEPCost] =
5154 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
5155// Estimate the cost of masked gather GEP. If not a splat, roughly 5156// estimate as a buildvector, otherwise estimate as splat. 5160 VecTy->getNumElements());
5162 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.
size() - 1 ||
5168 PtrVecTy, DemandedElts,
/*Insert=*/true,
/*Extract=*/false,
CostKind);
5173/*Insert=*/true,
/*Extract=*/false,
CostKind) +
5175// The cost of scalar loads. 5183// The cost of masked gather. 5187/*VariableMask=*/false, CommonAlignment,
CostKind) +
5188 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5193// The list of loads is small or perform partial check already - directly 5194// compare masked gather cost and gather cost. 5195constexprunsigned ListLimit = 4;
5196if (!TryRecursiveCheck || VL.
size() < ListLimit)
5199// FIXME: The following code has not been updated for non-power-of-2 5200// vectors (and not whole registers). The splitting logic here does not 5201// cover the original vector if the vector factor is not a power of two. 5205unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
5208// Iterate through possible vectorization factors and check if vectorized + 5209// shuffles is better than just gather. 5215for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End; Cnt += VF) {
5221/*TryRecursiveCheck=*/false);
5222// Check that the sorted loads are consecutive. 5228 DemandedElts.
setBits(Cnt, Cnt + VF);
        // If the reorder is needed - consider as a high-cost masked gather
        // for now.
        // All loads gathered - try smaller VF.
        // Can be vectorized later as a series of loads/insertelements.
        if (!DemandedElts.
isZero()) {
5248for (
unsignedIdx : seq<unsigned>(VL.
size()))
5249if (DemandedElts[
Idx])
5256auto *LI0 = cast<LoadInst>(VL[
I * VF]);
5261 LI0->getPointerOperand(),
5262 Instruction::GetElementPtr,
CostKind, ScalarTy,
5266if (
static_cast<unsigned>(
5267count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5268 PointerOps.
size() - 1 ||
5275/*Insert=*/true,
/*Extract=*/false,
CostKind);
5280/*Insert=*/true,
/*Extract=*/false,
CostKind) +
5288 LI0->getPointerAddressSpace(),
CostKind,
5294 LI0->getPointerOperand(),
5295/*VariableMask=*/false,
5301 LI0->getPointerOperand(),
5302/*VariableMask=*/false,
5307// Gathers are already calculated - ignore. 5311for (
intIdx : seq<int>(0, VL.
size()))
5318// If masked gather cost is higher - better to vectorize, so 5319// consider it as a gather node. It will be better estimated 5321if (MaskedGatherCost >= VecLdCost &&
5330// TODO: need to improve analysis of the pointers, if not all of them are 5331// GEPs or have > 2 operands, we end up with a gather node, which just 5332// increases the cost. 5334bool ProfitableGatherPointers =
5335 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
5336return L->isLoopInvariant(V);
5338if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
5339auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
5341 (
GEP &&
GEP->getNumOperands() == 2 &&
5342 isa<Constant, Instruction>(
GEP->getOperand(1)));
5344// Check if potential masked gather can be represented as series 5345// of loads + insertsubvectors. 5346// If masked gather cost is higher - better to vectorize, so 5347// consider it as a gather node. It will be better estimated 5349if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5350 ProfitableGatherPointers))
5363"Expected list of pointer operands.");
5364// Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each 5365// Ptr into, sort and return the sorted indices with values next to one 5373 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5375 SortedIndices.
clear();
5377auto Key = std::make_pair(BBs[Cnt + 1],
5381 std::optional<int> Diff = getPointersDiff(
5382 ElemTy, std::get<0>(Base.front()), ElemTy,
5384/*StrictCheck=*/true);
5388 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5393// If we haven't found enough to usefully cluster, return early. 5394if (Bases.
size() > VL.
size() / 2 - 1)
5397// Not found already - add a new Base 5398 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5405if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5406 Bases.
front().second.size() == VL.
size()))
5409// For each of the bases sort the pointers by Offset and check if any of the 5410// base become consecutively allocated. 5411auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5420 FirstPointers.
insert(P1);
5421 SecondPointers.
insert(P2);
5427"Unable to find matching root.");
5430for (
auto &
Base : Bases) {
5431for (
auto &Vec :
Base.second) {
5432if (Vec.size() > 1) {
5433stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5434const std::tuple<Value *, int, unsigned> &
Y) {
5435return std::get<1>(
X) < std::get<1>(
Y);
5437int InitialOffset = std::get<1>(Vec[0]);
5438bool AnyConsecutive =
5440return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5442// Fill SortedIndices array only if it looks worth-while to sort the 5449 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5453for (
auto &
T : Bases)
5454for (
constauto &Vec :
T.second)
5455for (
constauto &
P : Vec)
5459"Expected SortedIndices to be the size of VL");
5463std::optional<BoUpSLP::OrdersType>
5465assert(TE.isGather() &&
"Expected gather node only.");
5466Type *ScalarTy = TE.Scalars[0]->getType();
5469 Ptrs.
reserve(TE.Scalars.size());
5471 BBs.
reserve(TE.Scalars.size());
5472for (
Value *V : TE.Scalars) {
5473auto *L = dyn_cast<LoadInst>(V);
5474if (!L || !L->isSimple())
5481if (!LoadEntriesToVectorize.
contains(TE.Idx) &&
5483return std::move(Order);
5487/// Check if two insertelement instructions are from the same buildvector. 5491// Instructions must be from the same basic blocks. 5494// Checks if 2 insertelements are from the same buildvector. 5495if (VU->
getType() != V->getType())
5497// Multiple used inserts are separate nodes. 5504if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5506// Go through the vector operand of insertelement instructions trying to find 5507// either VU as the original vector for IE2 or V as the original vector for 5510 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
5511bool IsReusedIdx =
false;
5513if (IE2 == VU && !IE1)
5515if (IE1 == V && !IE2)
5516return V->hasOneUse();
5517if (IE1 && IE1 != V) {
5519 IsReusedIdx |= ReusedIdx.
test(Idx1);
5520 ReusedIdx.
set(Idx1);
5521if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
5524 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5526if (IE2 && IE2 != VU) {
5528 IsReusedIdx |= ReusedIdx.
test(Idx2);
5529 ReusedIdx.
set(Idx2);
5530if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5533 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5535 }
while (!IsReusedIdx && (IE1 || IE2));
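// Illustrative example:
//   %v0 = insertelement <4 x float> poison, float %a, i64 0
//   %v1 = insertelement <4 x float> %v0, float %b, i64 1
// The walk above recognizes %v0 and %v1 as parts of the same buildvector
// sequence; inserting into an already used index sets IsReusedIdx and the
// pair is treated as separate buildvectors.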
5539std::optional<BoUpSLP::OrdersType>
5541// No need to reorder if need to shuffle reuses, still need to shuffle the 5543if (!TE.ReuseShuffleIndices.empty()) {
5544// FIXME: Support ReuseShuffleIndices for non-power-of-two vectors. 5545assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI) &&
5546"Reshuffling scalars not yet supported for nodes with padding");
5550// Check if reuse shuffle indices can be improved by reordering. 5551// For this, check that reuse mask is "clustered", i.e. each scalar values 5552// is used once in each submask of size <number_of_scalars>. 5553// Example: 4 scalar values. 5554// ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered. 5555// 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because 5556// element 3 is used twice in the second submask. 5557unsigned Sz = TE.Scalars.size();
5559if (std::optional<OrdersType> CurrentOrder =
5565OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5566unsigned Sz = TE.Scalars.size();
5567for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
5570 Res[
Idx + K * Sz] =
I + K * Sz;
5572return std::move(Res);
5575if (Sz == 2 && TE.getVectorFactor() == 4 &&
5577 2 * TE.getVectorFactor())) == 1)
5582if (TE.ReorderIndices.empty())
5583 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5586::addMask(ReorderMask, TE.ReuseShuffleIndices);
5587unsigned VF = ReorderMask.
size();
5591for (
unsignedI = 0;
I < VF;
I += Sz) {
5593unsigned UndefCnt = 0;
5594unsigned Limit = std::min(Sz, VF -
I);
5603 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
5607for (
unsigned K = 0; K < NumParts; ++K) {
5608unsignedIdx = Val + Sz * K;
5610 ResOrder[
Idx] =
I + K;
5613return std::move(ResOrder);
5615unsigned VF = TE.getVectorFactor();
5616// Try build correct order for extractelement instructions. 5618 TE.ReuseShuffleIndices.end());
5619if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
5621 if (isa<PoisonValue>(V))
5623 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5624 return Idx && *Idx < Sz;
5626assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported " 5627"by BinaryOperator and CastInst.");
5629if (TE.ReorderIndices.empty())
5630 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5633for (
unsignedI = 0;
I < VF; ++
I) {
5634int &
Idx = ReusedMask[
I];
5637Value *V = TE.Scalars[ReorderMask[
Idx]];
5639Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
5642// Build the order of the VF size, need to reorder reuses shuffles, they are 5643// always of VF size. 5645 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
5646auto *It = ResOrder.
begin();
5647for (
unsigned K = 0; K < VF; K += Sz) {
5651 std::iota(SubMask.begin(), SubMask.end(), 0);
5653transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
5654 std::advance(It, Sz);
5659return std::nullopt;
// No need to reorder. 5660return std::move(ResOrder);
5662if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5663any_of(TE.UserTreeIndices,
5665 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5669if ((TE.State == TreeEntry::Vectorize ||
5670 TE.State == TreeEntry::StridedVectorize) &&
5671 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5672 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5673assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported by " 5674"BinaryOperator and CastInst.");
5675return TE.ReorderIndices;
5677if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5678if (!TE.ReorderIndices.empty())
5679return TE.ReorderIndices;
5682for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
5683if (!V->hasNUsesOrMore(1))
5685auto *
II = dyn_cast<InsertElementInst>(*V->user_begin());
5690while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
5692II = dyn_cast<InsertElementInst>(
II->getOperand(0));
5698assert(BB1 != BB2 &&
"Expected different basic blocks.");
5699auto *NodeA = DT->
getNode(BB1);
5700auto *NodeB = DT->
getNode(BB2);
5701assert(NodeA &&
"Should only process reachable instructions");
5702assert(NodeB &&
"Should only process reachable instructions");
5703assert((NodeA == NodeB) ==
5704 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5705"Different nodes should have different DFS numbers");
5706return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5708auto PHICompare = [&](
unsigned I1,
unsigned I2) {
5709Value *V1 = TE.Scalars[I1];
5710Value *V2 = TE.Scalars[I2];
5711if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
5713if (isa<PoisonValue>(V1))
5715if (isa<PoisonValue>(V2))
5721auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
5722auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5723if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5724return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5725 FirstUserOfPhi2->getParent());
5726auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5727auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5728auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5729auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5735if (UserBVHead[I1] && !UserBVHead[I2])
5739if (UserBVHead[I1] == UserBVHead[I2])
5742return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
5744return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5751auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5752auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5753auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5754auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5757if (EE1->getOperand(0) == EE2->getOperand(0))
5761if (Inst1 && Inst2) {
5769"Expected either instructions or arguments vector operands.");
5770return P1->getArgNo() < P2->getArgNo();
5775 std::iota(Phis.
begin(), Phis.
end(), 0);
5778return std::nullopt;
// No need to reorder. 5779return std::move(Phis);
5781if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
5783// TODO: add analysis of other gather nodes with extractelement 5784// instructions and other values/instructions, not only undefs. 5785if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
5786 (
all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5787any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5789 auto *EE = dyn_cast<ExtractElementInst>(V);
5790 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5792// Check that gather of extractelements can be represented as 5793// just a shuffle of a single vector. 5796 canReuseExtract(TE.Scalars, CurrentOrder,
/*ResizeAllowed=*/true);
5797if (Reuse || !CurrentOrder.
empty())
5798return std::move(CurrentOrder);
5800// If the gather node is <undef, v, .., poison> and 5801// insertelement poison, v, 0 [+ permute] 5803// insertelement poison, v, n - try to reorder. 5804// If rotating the whole graph, exclude the permute cost, the whole graph 5805// might be transformed. 5806int Sz = TE.Scalars.size();
5808count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5810find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
5811if (It == TE.Scalars.begin())
5814if (It != TE.Scalars.end()) {
5816unsignedIdx = std::distance(TE.Scalars.begin(), It);
5831if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5834return std::move(Order);
5840if (TE.Scalars.size() >= 3)
5843// Check if can include the order of vectorized loads. For masked gathers do 5844// extra analysis later, so include such nodes into a special list. 5845if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
5849 CurrentOrder, PointerOps);
5851return std::move(CurrentOrder);
  // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
  // has been audited for correctness with non-power-of-two vectors.

/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz, which are not identity submasks.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (
unsignedI = Sz,
E = Mask.size();
I <
E;
I += Sz) {
5871if (Cluster != FirstCluster)
5877void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const{
5878// Reorder reuses mask. 5880constunsigned Sz =
TE.Scalars.size();
  // For vectorized and non-clustered reuses no need to do anything else.
  if (!
TE.isGather() ||
5890// Clear reorder since it is going to be applied to the new mask. 5891TE.ReorderIndices.clear();
5892// Try to improve gathered nodes with clustered reuses, if possible. 5897// Fill the reuses mask with the identity submasks. 5898for (
auto *It =
TE.ReuseShuffleIndices.begin(),
5899 *
End =
TE.ReuseShuffleIndices.end();
5900 It !=
End; std::advance(It, Sz))
5901 std::iota(It, std::next(It, Sz), 0);
5907"Expected same size of orders");
5908unsigned Sz = Order.
size();
5910for (
unsignedIdx : seq<unsigned>(0, Sz)) {
5911if (Order[
Idx] != Sz)
5912 UsedIndices.
set(Order[
Idx]);
5914if (SecondaryOrder.
empty()) {
5915for (
unsignedIdx : seq<unsigned>(0, Sz))
5916if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5919for (
unsignedIdx : seq<unsigned>(0, Sz))
5920if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5921 !UsedIndices.
test(SecondaryOrder[
Idx]))
5922 Order[
Idx] = SecondaryOrder[
Idx];
5927// Maps VF to the graph nodes. 5929// ExtractElement gather nodes which can be vectorized and need to handle 5933// Phi nodes can have preferred ordering based on their result users 5936// AltShuffles can also have a preferred ordering that leads to fewer 5937// instructions, e.g., the addsub instruction in x86. 5940// Maps a TreeEntry to the reorder indices of external users. 5942 ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores, loads, extracts + some gathering
  // of extracts.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
5948// Look for external users that will probably be vectorized. 5950 findExternalStoreUsersReorderIndices(TE.get());
5951if (!ExternalUserReorderIndices.
empty()) {
5952 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5954 std::move(ExternalUserReorderIndices));
5957// Patterns like [fadd,fsub] can be combined into a single instruction in 5958// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need 5959// to take into account their order when looking for the most used order. 5960if (TE->hasState() && TE->isAltShuffle()) {
5963unsigned Opcode0 = TE->getOpcode();
5964unsigned Opcode1 = TE->getAltOpcode();
5966// If this pattern is supported by the target then we consider the order. 5967if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5968 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5971// TODO: Check the reverse order too. 5974if (std::optional<OrdersType> CurrentOrder =
5976// Do not include ordering for nodes used in the alt opcode vectorization, 5977// better to reorder them during bottom-to-top stage. If follow the order 5978// here, it causes reordering of the whole graph though actually it is 5979// profitable just to reorder the subgraph that starts from the alternate 5980// opcode vectorization node. Such nodes already end-up with the shuffle 5981// instruction and it is just enough to change this shuffle rather than 5982// rotate the scalars for the whole graph. 5984const TreeEntry *UserTE = TE.get();
5986if (UserTE->UserTreeIndices.size() != 1)
5989 return EI.UserTE->State == TreeEntry::Vectorize &&
5990 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5993 UserTE = UserTE->UserTreeIndices.back().UserTE;
5996 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5997if (!(TE->State == TreeEntry::Vectorize ||
5998 TE->State == TreeEntry::StridedVectorize) ||
5999 !TE->ReuseShuffleIndices.empty())
6000 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
6001if (TE->State == TreeEntry::Vectorize &&
6002 TE->getOpcode() == Instruction::PHI)
6003 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
6007// Reorder the graph nodes according to their vectorization factor. 6008for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
6009 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
6010auto It = VFToOrderedEntries.
find(VF);
6011if (It == VFToOrderedEntries.
end())
6013// Try to find the most profitable order. We just are looking for the most 6014// used order and reorder scalar elements in the nodes according to this 6015// mostly used order. 6017// Delete VF entry upon exit. 6020// All operands are reordered and used only in this node - propagate the 6021// most used order to the user node. 6026for (
const TreeEntry *OpTE : OrderedEntries) {
6027// No need to reorder this nodes, still need to extend and to use shuffle, 6028// just need to merge reordering shuffle and the reuse shuffle. 6029if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6031// Count number of orders uses. 6032constauto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6034if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6035auto It = GathersToOrders.find(OpTE);
6036if (It != GathersToOrders.end())
6039if (OpTE->hasState() && OpTE->isAltShuffle()) {
6040auto It = AltShufflesToOrders.find(OpTE);
6041if (It != AltShufflesToOrders.end())
6044if (OpTE->State == TreeEntry::Vectorize &&
6045 OpTE->getOpcode() == Instruction::PHI) {
6046auto It = PhisToOrders.
find(OpTE);
6047if (It != PhisToOrders.
end())
6050return OpTE->ReorderIndices;
6052// First consider the order of the external scalar users. 6053auto It = ExternalUserReorderMap.
find(OpTE);
6054if (It != ExternalUserReorderMap.
end()) {
6055constauto &ExternalUserReorderIndices = It->second;
6056// If the OpTE vector factor != number of scalars - use natural order, 6057// it is an attempt to reorder node with reused scalars but with 6059if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6060 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
6061 ExternalUserReorderIndices.size();
6063for (
constOrdersType &ExtOrder : ExternalUserReorderIndices)
6064 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6066// No other useful reorder data in this entry. 6070// Stores actually store the mask, not the order, need to invert. 6071if (OpTE->State == TreeEntry::Vectorize &&
6072 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6073assert(!OpTE->isAltShuffle() &&
6074"Alternate instructions are only supported by BinaryOperator " 6078unsignedE = Order.size();
6081 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6084 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6086 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6089if (OrdersUses.empty())
6091// Choose the most used order. 6092unsigned IdentityCnt = 0;
6093unsigned FilledIdentityCnt = 0;
6095for (
auto &Pair : OrdersUses) {
6097if (!Pair.first.empty())
6098 FilledIdentityCnt += Pair.second;
6099 IdentityCnt += Pair.second;
6104unsigned Cnt = IdentityCnt;
6105for (
auto &Pair : OrdersUses) {
6106// Prefer identity order. But, if filled identity found (non-empty order) 6107// with same number of uses, as the new candidate order, we can choose 6108// this candidate order. 6109if (Cnt < Pair.second ||
6110 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6111 Cnt == Pair.second && !BestOrder.
empty() &&
6114 BestOrder = Pair.first;
6120// Set order of the user node. 6127unsignedE = BestOrder.
size();
6129 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6131// Do an actual reordering, if profitable. 6132for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6133// Just do the reordering for the nodes with the given VF. 6134if (TE->Scalars.size() != VF) {
6135if (TE->ReuseShuffleIndices.size() == VF) {
6136// Need to reorder the reuses masks of the operands with smaller VF to 6137// be able to find the match between the graph nodes and scalar 6138// operands of the given node during vectorization/cost estimation. 6141 return EI.UserTE->Scalars.size() == VF ||
6142 EI.UserTE->Scalars.size() ==
6145"All users must be of VF size.");
6148// ShuffleVectorInst does not do reorderOperands (and it should not 6149// because ShuffleVectorInst supports only a limited set of 6150// patterns). Only do reorderNodeWithReuses if all of the users are 6151// not ShuffleVectorInst. 6153 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6158 return isa<ShuffleVectorInst>(
6159 EI.UserTE->getMainOp());
6161"Does not know how to reorder.");
6163// Update ordering of the operands with the smaller VF than the given 6165 reorderNodeWithReuses(*TE, Mask);
6169if ((TE->State == TreeEntry::Vectorize ||
6170 TE->State == TreeEntry::StridedVectorize) &&
6173 (
SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6174assert(!TE->isAltShuffle() &&
6175"Alternate instructions are only supported by BinaryOperator " 6177// Build correct orders for extract{element,value}, loads and 6180if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6181 TE->reorderOperands(Mask);
6183// Reorder the node and its operands. 6184 TE->reorderOperands(Mask);
6185assert(TE->ReorderIndices.empty() &&
6186"Expected empty reorder sequence.");
6189if (!TE->ReuseShuffleIndices.empty()) {
6190// Apply reversed order to keep the original ordering of the reused 6191// elements to avoid extra reorder indices shuffling. 6196addMask(NewReuses, TE->ReuseShuffleIndices);
6197 TE->ReuseShuffleIndices.swap(NewReuses);
6203bool BoUpSLP::canReorderOperands(
6204 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6207for (
unsignedI = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
6208if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
6209return OpData.first ==
I &&
6210 (OpData.second->State == TreeEntry::Vectorize ||
6211 OpData.second->State == TreeEntry::StridedVectorize);
6214if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
6215// Do not reorder if operand node is used by many user nodes. 6216if (
any_of(TE->UserTreeIndices,
6217 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6219// Add the node to the list of the ordered nodes with the identity 6221 Edges.emplace_back(
I, TE);
6222// Add ScatterVectorize nodes to the list of operands, where just 6223// reordering of the scalars is required. Similar to the gathers, so 6224// simply add to the list of gathered ops. 6225// If there are reused scalars, process this node as a regular vectorize 6226// node, just reorder reuses mask. 6227if (TE->State != TreeEntry::Vectorize &&
6228 TE->State != TreeEntry::StridedVectorize &&
6229 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6233 TreeEntry *
Gather =
nullptr;
6235 [&
Gather, UserTE,
I](TreeEntry *TE) {
6236assert(TE->State != TreeEntry::Vectorize &&
6237 TE->State != TreeEntry::StridedVectorize &&
6238"Only non-vectorized nodes are expected.");
6239if (
any_of(TE->UserTreeIndices,
6240 [UserTE,
I](
const EdgeInfo &EI) {
6241 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6243assert(TE->isSame(UserTE->getOperand(
I)) &&
6244"Operand entry does not match operands.");
  // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads, extracts without alternate operands
  // + some gathering of extracts.
  for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6266if (TE->State != TreeEntry::Vectorize &&
6267 TE->State != TreeEntry::StridedVectorize)
6269if (std::optional<OrdersType> CurrentOrder =
6271 OrderedEntries.
insert(TE.get());
6272if (!(TE->State == TreeEntry::Vectorize ||
6273 TE->State == TreeEntry::StridedVectorize) ||
6274 !TE->ReuseShuffleIndices.empty())
6275 GathersToOrders.
insert(TE.get());
6279// 1. Propagate order to the graph nodes, which use only reordered nodes. 6280// I.e., if the node has operands, that are reordered, try to make at least 6281// one operand order in the natural order and reorder others + reorder the 6284while (!OrderedEntries.
empty()) {
6285// 1. Filter out only reordered nodes. 6286// 2. If the entry has multiple uses - skip it and jump to the next node. 6289for (TreeEntry *TE : OrderedEntries) {
6290if (!(TE->State == TreeEntry::Vectorize ||
6291 TE->State == TreeEntry::StridedVectorize ||
6292 (TE->isGather() && GathersToOrders.
contains(TE))) ||
6293 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6296 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6298 !Visited.
insert(TE).second) {
6302// Build a map between user nodes and their operands order to speedup 6303// search. The graph currently does not provide this dependency directly. 6304for (
EdgeInfo &EI : TE->UserTreeIndices)
6307// Erase filtered entries. 6308for (TreeEntry *TE : Filtered)
6309 OrderedEntries.remove(TE);
6311 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6313sort(UsersVec, [](
constauto &Data1,
constauto &Data2) {
6314return Data1.first->Idx > Data2.first->Idx;
6316for (
auto &
Data : UsersVec) {
6317// Check that operands are used only in the User node. 6319if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
6321for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6322 OrderedEntries.remove(
Op.second);
6325// All operands are reordered and used only in this node - propagate the 6326// most used order to the user node. 6330// Do the analysis for each tree entry only once, otherwise the order of 6331// the same node my be considered several times, though might be not 6335for (
constauto &
Op :
Data.second) {
6336 TreeEntry *OpTE =
Op.second;
6337if (!VisitedOps.
insert(OpTE).second)
6339if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6342if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6345return OpTE->ReorderIndices;
6347// The order is partially ordered, skip it in favor of fully non-ordered 6349if (Order.size() == 1)
6352Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
6353 return P.second == OpTE;
6355// Stores actually store the mask, not the order, need to invert. 6356if (OpTE->State == TreeEntry::Vectorize &&
6357 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6358assert(!OpTE->isAltShuffle() &&
6359"Alternate instructions are only supported by BinaryOperator " 6363unsignedE = Order.size();
6366 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6369 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6372 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6374auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
6375constauto AllowsReordering = [&](
const TreeEntry *TE) {
6376if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6377 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6378 (IgnoreReorder && TE->Idx == 0))
6380if (TE->isGather()) {
6389for (
constEdgeInfo &EI : OpTE->UserTreeIndices) {
6390 TreeEntry *UserTE = EI.
UserTE;
6391if (!VisitedUsers.
insert(UserTE).second)
6393// May reorder user node if it requires reordering, has reused 6394// scalars, is an alternate op vectorize node or its op nodes require 6396if (AllowsReordering(UserTE))
6398// Check if users allow reordering. 6399// Currently look up just 1 level of operands to avoid increase of 6401// Profitable to reorder if definitely more operands allow 6402// reordering rather than those with natural order. 6405 Ops, [UserTE, &AllowsReordering](
6406const std::pair<unsigned, TreeEntry *> &
Op) {
6407return AllowsReordering(
Op.second) &&
6410 return EI.UserTE == UserTE;
6412 })) <= Ops.
size() / 2)
6413 ++Res.first->second;
6416if (OrdersUses.empty()) {
6417for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6418 OrderedEntries.remove(
Op.second);
6421// Choose the most used order. 6422unsigned IdentityCnt = 0;
6423unsigned VF =
Data.second.front().second->getVectorFactor();
6425for (
auto &Pair : OrdersUses) {
6427 IdentityCnt += Pair.second;
6432unsigned Cnt = IdentityCnt;
6433for (
auto &Pair : OrdersUses) {
6434// Prefer identity order. But, if filled identity found (non-empty 6435// order) with same number of uses, as the new candidate order, we can 6436// choose this candidate order. 6437if (Cnt < Pair.second) {
6439 BestOrder = Pair.first;
6445// Set order of the user node. 6447for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6448 OrderedEntries.remove(
Op.second);
6452// Erase operands from OrderedEntries list and adjust their orders. 6457unsignedE = BestOrder.
size();
6459 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6461for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
6462 TreeEntry *TE =
Op.second;
6463 OrderedEntries.remove(TE);
6464if (!VisitedOps.
insert(TE).second)
6466if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
6467 reorderNodeWithReuses(*TE, Mask);
6470// Gathers are processed separately. 6471if (TE->State != TreeEntry::Vectorize &&
6472 TE->State != TreeEntry::StridedVectorize &&
6473 (TE->State != TreeEntry::ScatterVectorize ||
6474 TE->ReorderIndices.empty()))
6476assert((BestOrder.
size() == TE->ReorderIndices.size() ||
6477 TE->ReorderIndices.empty()) &&
6478"Non-matching sizes of user/operand entries.");
6480if (IgnoreReorder && TE == VectorizableTree.front().get())
6481 IgnoreReorder =
false;
6483// For gathers just need to reorder its scalars. 6484for (TreeEntry *
Gather : GatherOps) {
6486"Unexpected reordering of gathers.");
6487if (!
Gather->ReuseShuffleIndices.empty()) {
6488// Just reorder reuses indices. 6493 OrderedEntries.remove(
Gather);
6495// Reorder operands of the user node and set the ordering for the user 6497if (
Data.first->State != TreeEntry::Vectorize ||
6498 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6499Data.first->getMainOp()) ||
6500Data.first->isAltShuffle())
6501Data.first->reorderOperands(Mask);
6502if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
6503Data.first->isAltShuffle() ||
6504Data.first->State == TreeEntry::StridedVectorize) {
6507/*BottomOrder=*/true);
6508if (
Data.first->ReuseShuffleIndices.empty() &&
6509 !
Data.first->ReorderIndices.empty() &&
6510 !
Data.first->isAltShuffle()) {
6511// Insert user node to the list to try to sink reordering deeper in 6513 OrderedEntries.insert(
Data.first);
6520// If the reordering is unnecessary, just remove the reorder. 6521if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6522 VectorizableTree.front()->ReuseShuffleIndices.empty())
6523 VectorizableTree.front()->ReorderIndices.clear();
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
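// Note: with a reversed strided order such as {3, 2, 1, 0},
// ReorderIndices.front() names the lane the reordering places first, so the
// entry's root instruction is taken from that lane rather than from
// Scalars.front().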
6538// Collect the values that we need to extract from the tree. 6539for (
auto &TEPtr : VectorizableTree) {
6540 TreeEntry *Entry = TEPtr.get();
6542// No need to handle users of gathered values. 6543if (Entry->isGather())
6547for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6548Value *Scalar = Entry->Scalars[Lane];
6549if (!isa<Instruction>(Scalar))
6551// All uses must be replaced already? No need to do it again. 6552auto It = ScalarToExtUses.
find(Scalar);
6553if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
6556// Check if the scalar is externally used as an extra arg. 6557constauto ExtI = ExternallyUsedValues.
find(Scalar);
6558if (ExtI != ExternallyUsedValues.
end()) {
6559int FoundLane = Entry->findLaneForValue(Scalar);
6561 << FoundLane <<
" from " << *Scalar <<
".\n");
6562 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
6563 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
6566for (
User *U : Scalar->users()) {
6573// Ignore users in the user ignore list. 6574if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6577// Skip in-tree scalars that become vectors 6578if (TreeEntry *UseEntry = getTreeEntry(U)) {
6579// Some in-tree scalars will remain as scalar in vectorized 6580// instructions. If that is the case, the one in FoundLane will 6582if (UseEntry->State == TreeEntry::ScatterVectorize ||
6584 Scalar, getRootEntryInstruction(*UseEntry), TLI,
TTI)) {
6587assert(!UseEntry->isGather() &&
"Bad state");
6591if (It != ScalarToExtUses.
end()) {
6592 ExternalUses[It->second].User =
nullptr;
6597if (U && Scalar->hasNUsesOrMore(
UsesLimit))
6599int FoundLane = Entry->findLaneForValue(Scalar);
6601 <<
" from lane " << FoundLane <<
" from " << *Scalar
6603 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
6604 ExternalUses.emplace_back(Scalar, U, FoundLane);
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    if (V->hasNUsesOrMore(UsesLimit))
      continue;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // Test whether we can handle the store. V might be a global, which could
      // be used in a different function.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entry if already vectorized.
      if (getTreeEntry(U))
        continue;

      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(), Ptr}];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap)
    Res[I++].swap(P.second);
  return Res;
}
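// The result is one candidate store group per (basic block, stored type,
// underlying pointer object) key, with at most one store contributed per lane
// of the tree entry; these groups feed findExternalStoreUsersReorderIndices()
// below.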
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoreVec can form a vector by sorting them
  // and checking whether they are consecutive.

  // To avoid calling getPointersDiff() while sorting we create a vector of
  // pairs {store, offset from first} and sort this instead.
  SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
  StoreInst *S0 = StoresVec.front();
  StoreOffsetVec.emplace_back(0, 0);
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0->getValueOperand()->getType(),
                        S0->getPointerOperand(),
                        SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    if (Diff)
      StoreOffsetVec.emplace_back(*Diff, Idx);
  }

  // Check if the stores are consecutive by checking if their difference is 1.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec, [](const std::pair<int, unsigned> &L,
                          const std::pair<int, unsigned> &R) {
    return L.first < R.first;
  });
  unsigned Idx = 0;
  int PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }

  // Calculate the shuffle indices according to their offset against the sorted
  // StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  unsigned I = 0;
  for (const auto &P : StoreOffsetVec) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
    ++I;
  }
  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(), so we are following the
  // same convention here.
  if (IsIdentity)
    ReorderIndices.clear();

  return true;
}
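// Note: the offsets of a consecutive group are N consecutive integers (they
// may be negative when StoresVec[0] is not the lowest-addressed store), so
// ReorderIndices[Lane] is the rank of that lane's store by address. E.g. lane
// offsets {1, -1, 2, 0} give ReorderIndices = {2, 0, 3, 1}, while already
// sorted stores give the identity, encoded as the empty order.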
  for (unsigned Idx : Order)
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and push
  // it into `ExternalReorderIndices`
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;

    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;

    // We now know that the scalars in StoresVec can form a vector instruction,
    // so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
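// Each returned order describes one external store group that could consume
// this entry as a whole vector; identity orders are encoded as empty, matching
// the convention used by the in-tree reordering (reorderTopToBottom() /
// reorderBottomToTop()).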
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
6775/// Tries to find subvector of loads and builds new vector of only loads if can 6789for (
Value *V : VL) {
6790auto *LI = dyn_cast<LoadInst>(V);
6793if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6796for (
auto [Map,
Data] :
zip(ClusteredDistToLoad, ClusteredLoads)) {
6797assert(LI->getParent() ==
Data.front().first->getParent() &&
6798 LI->getType() ==
Data.front().first->getType() &&
6802"Expected loads with the same type, same parent and same " 6803"underlying pointer.");
6805 LI->getType(), LI->getPointerOperand(),
Data.front().first->getType(),
6806Data.front().first->getPointerOperand(),
DL, SE,
6807/*StrictCheck=*/true);
6810auto It = Map.find(*Dist);
6811if (It != Map.end() && It->second != LI)
6813if (It == Map.end()) {
6814Data.emplace_back(LI, *Dist);
6815 Map.try_emplace(*Dist, LI);
6825auto FindMatchingLoads =
6830int &
Offset,
unsigned &Start) {
6832return GatheredLoads.
end();
6842 std::optional<int> Dist =
6844Data.front().first->getType(),
6845Data.front().first->getPointerOperand(),
DL, SE,
6846/*StrictCheck=*/true);
6851for (std::pair<LoadInst *, int>
P :
Data) {
6855// Found matching gathered loads - check if all loads are unique or 6856// can be effectively vectorized. 6857unsigned NumUniques = 0;
6858for (
auto [Cnt, Pair] :
enumerate(Loads)) {
6859bool Used = DataLoads.
contains(Pair.first);
6860if (!Used && !DataDists.
contains(*Dist + Pair.second)) {
6864 Repeated.insert(Cnt);
6867if (NumUniques > 0 &&
6868 (Loads.
size() == NumUniques ||
6869 (Loads.
size() - NumUniques >= 2 &&
6870 Loads.
size() - NumUniques >= Loads.
size() / 2 &&
6876return std::next(GatheredLoads.
begin(),
Idx);
6880return GatheredLoads.
end();
6882for (
ArrayRef<std::pair<LoadInst *, int>>
Data : ClusteredLoads) {
6886auto *It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
6888while (It != GatheredLoads.
end()) {
6889assert(!LocalToAdd.
empty() &&
"Expected some elements to add.");
6890for (
unsignedIdx : LocalToAdd)
6892 ToAdd.
insert(LocalToAdd.begin(), LocalToAdd.end());
6893 It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
Offset,
6897 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6901for (
unsignedIdx : seq<unsigned>(
Data.size())) {
6910 GatheredLoads, [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6911return PD.front().first->getParent() == LI->
getParent() &&
6912 PD.front().first->getType() == LI->
getType();
6914while (It != GatheredLoads.
end()) {
6917 std::next(It), GatheredLoads.
end(),
6918 [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6919 return PD.front().first->getParent() == LI->getParent() &&
6920 PD.front().first->getType() == LI->getType();
6924 GatheredLoads.emplace_back().append(
Data.begin(),
Data.end());
6925 AddNewLoads(GatheredLoads.emplace_back());
6930void BoUpSLP::tryToVectorizeGatheredLoads(
6933 8> &GatheredLoads) {
6934 GatheredLoadsEntriesFirst = VectorizableTree.size();
6937 LoadEntriesToVectorize.
size());
6938for (
auto [
Idx, Set] :
zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6939Set.insert(VectorizableTree[
Idx]->Scalars.begin(),
6940 VectorizableTree[
Idx]->Scalars.end());
6942// Sort loads by distance. 6943auto LoadSorter = [](
const std::pair<LoadInst *, int> &L1,
6944const std::pair<LoadInst *, int> &L2) {
6945return L1.second > L2.second;
6951Align Alignment = computeCommonAlignment<LoadInst>(Values);
6960bool Final,
unsigned MaxVF) {
6962unsigned StartIdx = 0;
6967 *
TTI, Loads.
front()->getType(), MaxVF);
6969 *
TTI, Loads.
front()->getType(), NumElts - 1)) {
6975if (Final && CandidateVFs.
empty())
6978unsigned BestVF = Final ? CandidateVFs.
back() : 0;
6979for (
unsigned NumElts : CandidateVFs) {
6980if (Final && NumElts > BestVF)
6983for (
unsigned Cnt = StartIdx,
E = Loads.
size(); Cnt <
E;
6987if (VectorizedLoads.count(Slice.
front()) ||
6988 VectorizedLoads.count(Slice.
back()) ||
6991// Check if it is profitable to try vectorizing gathered loads. It is 6992// profitable if we have more than 3 consecutive loads or if we have 6993// less but all users are vectorized or deleted. 6994bool AllowToVectorize =
false;
6995// Check if it is profitable to vectorize 2-elements loads. 7001// If single use/user - allow to vectorize. 7004// 1. Check if number of uses equals number of users. 7005// 2. All users are deleted. 7006// 3. The load broadcasts are not allowed or the load is not 7008if (
static_cast<unsignedint>(std::distance(
7009 LI->user_begin(), LI->user_end())) != LI->getNumUses())
7011if (!IsLegalBroadcastLoad)
7015for (
User *U : LI->users()) {
7016if (
auto *UI = dyn_cast<Instruction>(U); UI &&
isDeleted(UI))
7018if (
const TreeEntry *UTE = getTreeEntry(U)) {
7019for (
intI : seq<int>(UTE->getNumOperands())) {
7021 [LI](
Value *V) { return V == LI; }))
7022// Found legal broadcast - do not vectorize. 7030 AllowToVectorize = CheckIfAllowed(Slice);
7034any_of(ValueToGatherNodes.at(Slice.front()),
7035 [=](
const TreeEntry *TE) {
7036 return TE->Scalars.size() == 2 &&
7037 ((TE->Scalars.front() == Slice.front() &&
7038 TE->Scalars.back() == Slice.back()) ||
7039 (TE->Scalars.front() == Slice.back() &&
7040 TE->Scalars.back() == Slice.front()));
7045if (AllowToVectorize) {
7048// Try to build vector load. 7050reinterpret_cast<Value *
const*
>(Slice.begin()), Slice.size());
7052 PointerOps, &BestVF);
7054 (BestVF > 1 &&
static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7056if (MaskedGatherVectorized.
empty() ||
7057 Cnt >= MaskedGatherVectorized.
back() + NumElts)
7062Results.emplace_back(Values, LS);
7063 VectorizedLoads.insert(Slice.begin(), Slice.end());
7064// If we vectorized initial block, no need to try to vectorize it 7067 StartIdx += NumElts;
7069// Check if the whole array was vectorized already - exit. 7070if (StartIdx >= Loads.
size())
7072// Erase last masked gather candidate, if another candidate within 7073// the range is found to be better. 7074if (!MaskedGatherVectorized.
empty() &&
7075 Cnt < MaskedGatherVectorized.
back() + NumElts)
7081if (!AllowToVectorize || BestVF == 0)
7084// Mark masked gathers candidates as vectorized, if any. 7085for (
unsigned Cnt : MaskedGatherVectorized) {
7087 Cnt, std::min<unsigned>(NumElts, Loads.
size() - Cnt));
7091 VectorizedLoads.insert(Slice.
begin(), Slice.
end());
7092// If we vectorized initial block, no need to try to vectorize it again. 7094 StartIdx += NumElts;
7098if (!VectorizedLoads.contains(LI))
7099 NonVectorized.push_back(LI);
7103auto ProcessGatheredLoads =
7108for (
ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7109if (LoadsDists.size() <= 1) {
7110 NonVectorized.
push_back(LoadsDists.back().first);
7115transform(LoadsDists, OriginalLoads.begin(),
7116 [](
const std::pair<LoadInst *, int> &L) ->
LoadInst * {
7121unsigned MaxConsecutiveDistance = 0;
7122unsigned CurrentConsecutiveDist = 1;
7123int LastDist = LocalLoadsDists.
front().second;
7124bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7125for (
const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7126if (getTreeEntry(
L.first))
7129"Expected first distance always not less than second");
7130if (
static_cast<unsigned>(LastDist -
L.second) ==
7131 CurrentConsecutiveDist) {
7132 ++CurrentConsecutiveDist;
7133 MaxConsecutiveDistance =
7134 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7138if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7141 CurrentConsecutiveDist = 1;
7145if (Loads.
size() <= 1)
7147if (AllowMaskedGather)
7148 MaxConsecutiveDistance = Loads.
size();
7149elseif (MaxConsecutiveDistance < 2)
7154 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7155 Final, MaxConsecutiveDistance);
7157 OriginalLoads.size() == Loads.
size() &&
7158 MaxConsecutiveDistance == Loads.
size() &&
7163 VectorizedLoads.
clear();
7167 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7168 UnsortedNonVectorized, Final,
7169 OriginalLoads.size());
7170if (SortedNonVectorized.
size() >= UnsortedNonVectorized.
size()) {
7171 SortedNonVectorized.
swap(UnsortedNonVectorized);
7177 << Slice.
size() <<
")\n");
7178if (
any_of(Slice, [&](
Value *V) {
return getTreeEntry(V); })) {
7179for (
Value *L : Slice)
7180if (!getTreeEntry(L))
7181 SortedNonVectorized.
push_back(cast<LoadInst>(L));
7185// Select maximum VF as a maximum of user gathered nodes and 7186// distance between scalar loads in these nodes. 7187unsigned MaxVF = Slice.size();
7188unsigned UserMaxVF = 0;
7189unsigned InterleaveFactor = 0;
7193// Found distance between segments of the interleaved loads. 7194 std::optional<unsigned> InterleavedLoadsDistance = 0;
7196 std::optional<unsigned> CommonVF = 0;
7200for (
const TreeEntry *
E : ValueToGatherNodes.at(V)) {
7201 UserMaxVF = std::max<unsigned>(UserMaxVF,
E->Scalars.size());
7204 UserMaxVF = std::max<unsigned>(UserMaxVF,
Idx - Pos + 1);
7206if (*CommonVF == 0) {
7207 CommonVF =
E->Scalars.size();
7210if (*CommonVF !=
E->Scalars.size())
7213// Check if the load is the part of the interleaved load. 7214if (Pos !=
Idx && InterleavedLoadsDistance) {
7217 if (isa<Constant>(V))
7219 if (getTreeEntry(V))
7221 const auto &Nodes = ValueToGatherNodes.at(V);
7222 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7223 !is_contained(Slice, V);
7225 InterleavedLoadsDistance.reset();
7229if (*InterleavedLoadsDistance == 0) {
7230 InterleavedLoadsDistance =
Idx - Pos;
7233if ((
Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7234 (
Idx - Pos) / *InterleavedLoadsDistance < Order)
7235 InterleavedLoadsDistance.reset();
7236 Order = (
Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7240 DeinterleavedNodes.
clear();
7241// Check if the large load represents interleaved load operation. 7242if (InterleavedLoadsDistance.value_or(0) > 1 &&
7243 CommonVF.value_or(0) != 0) {
7244 InterleaveFactor =
bit_ceil(*InterleavedLoadsDistance);
7245unsigned VF = *CommonVF;
7248// Segmented load detected - vectorize at maximum vector factor. 7249if (InterleaveFactor <= Slice.size() &&
7253 cast<LoadInst>(Slice.front())->getAlign(),
7254 cast<LoadInst>(Slice.front())
7258 UserMaxVF = InterleaveFactor * VF;
7260 InterleaveFactor = 0;
7263// Cannot represent the loads as consecutive vectorizable nodes - 7265unsigned ConsecutiveNodesSize = 0;
7266if (!LoadEntriesToVectorize.
empty() && InterleaveFactor == 0 &&
7267any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7268 [&, Slice = Slice](
constauto &
P) {
7270return std::get<1>(
P).contains(V);
7272if (It == Slice.end())
7275 VectorizableTree[std::get<0>(
P)]->Scalars;
7276 ConsecutiveNodesSize += VL.
size();
7277unsigned Start = std::distance(Slice.begin(), It);
7278unsigned Sz = Slice.size() - Start;
7279return Sz < VL.
size() ||
7280 Slice.slice(std::distance(Slice.begin(), It),
7284// Try to build long masked gather loads. 7286if (InterleaveFactor == 0 &&
7287any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7288 [&, Slice = Slice](
unsignedIdx) {
7290 SmallVector<Value *> PointerOps;
7291 return canVectorizeLoads(
7292 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7293 Slice[Idx * UserMaxVF], Order,
7295 LoadsState::ScatterVectorize;
7298if (Slice.size() != ConsecutiveNodesSize)
7299 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7301for (
unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7302bool IsVectorized =
true;
7303for (
unsignedI = 0,
E = Slice.size();
I <
E;
I += VF) {
7306if (getTreeEntry(SubSlice.
front()))
7308// Check if the subslice is to be-vectorized entry, which is not 7310if (
any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7313 VectorizableTree[std::get<0>(
P)]
7318unsigned Sz = VectorizableTree.size();
7319 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7320if (Sz == VectorizableTree.size()) {
7321 IsVectorized =
false;
7322// Try non-interleaved vectorization with smaller vector 7324if (InterleaveFactor > 0) {
7325 VF = 2 * (MaxVF / InterleaveFactor);
7326 InterleaveFactor = 0;
7335 NonVectorized.
append(SortedNonVectorized);
7337return NonVectorized;
7339for (
constauto &GLs : GatheredLoads) {
7340constauto &
Ref = GLs.second;
7342if (!
Ref.empty() && !NonVectorized.
empty() &&
7344Ref.begin(),
Ref.end(), 0u,
7346ArrayRef<std::pair<LoadInst *, int>> LoadsDists) ->
unsigned {
7347 return S + LoadsDists.size();
7348 }) != NonVectorized.
size() &&
7349 IsMaskedGatherSupported(NonVectorized)) {
7351for (
LoadInst *LI : NonVectorized) {
7352// Reinsert non-vectorized loads to other list of loads with the same 7358// Final attempt to vectorize non-vectorized loads. 7359 (void)ProcessGatheredLoads(FinalGatheredLoads,
/*Final=*/true);
7362// Try to vectorize postponed load entries, previously marked as gathered. 7363for (
unsignedIdx : LoadEntriesToVectorize) {
7364const TreeEntry &
E = *VectorizableTree[
Idx];
7366// Avoid reordering, if possible. 7367if (!
E.ReorderIndices.empty()) {
7368// Build a mask out of the reorder indices and reorder scalars per this 7374 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7376// If no new entries created, consider it as no gathered loads entries must be 7378if (
static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7379 VectorizableTree.size())
7380 GatheredLoadsEntriesFirst.reset();
/// \return true if the specified list of values has only one instruction that
/// requires scheduling, false otherwise.
static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return false;
  }
  return NeedsScheduling;
}
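// E.g. in a bundle {phi, phi, add, phi} only the add requires scheduling, so
// this helper returns true; if two or more values need scheduling it returns
// false.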
7401/// Generates key/subkey pair for the given value to provide effective sorting 7402/// of the values and better detection of the vectorizable values sequences. The 7403/// keys/subkeys can be used for better sorting of the values themselves (keys) 7404/// and in values subgroups (subkeys). 7408bool AllowAlternate) {
7411// Sort the loads by the distance between the pointers. 7412if (
auto *LI = dyn_cast<LoadInst>(V)) {
7415 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
7419// Sort extracts by the vector operands. 7420if (isa<ExtractElementInst, UndefValue>(V))
7422if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
7424 !isa<UndefValue>(EI->getIndexOperand()))
7427 }
elseif (
auto *
I = dyn_cast<Instruction>(V)) {
7428// Sort other instructions just by the opcodes except for CMPInst. 7429// For CMP also sort by the predicate kind. 7430if ((isa<BinaryOperator, CastInst>(
I)) &&
7440 : cast<CastInst>(
I)->getOperand(0)->getType()));
7441// For casts, look through the only operand to improve compile time. 7442if (isa<CastInst>(
I)) {
7443 std::pair<size_t, size_t> OpVals =
7445/*AllowAlternate=*/true);
7449 }
elseif (
auto *CI = dyn_cast<CmpInst>(
I)) {
7451if (CI->isCommutative())
7457 }
elseif (
auto *Call = dyn_cast<CallInst>(
I)) {
7471 }
elseif (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
7472if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7473 SubKey =
hash_value(Gep->getPointerOperand());
7477 !isa<ConstantInt>(
I->getOperand(1))) {
7478// Do not try to vectorize instructions with potentially high cost. 7485return std::make_pair(Key, SubKey);
7488/// Checks if the specified instruction \p I is an alternate operation for 7489/// the given \p MainOp and \p AltOp instructions. 7495bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
7497unsigned Opcode0 = S.getOpcode();
7498unsigned Opcode1 = S.getAltOpcode();
7500// If this pattern is supported by the target then consider it profitable. 7502 Opcode0, Opcode1, OpcodeMask))
7505for (
unsignedI : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7507// Prepare the operand vector. 7508for (
Value *V : VL) {
7509if (isa<PoisonValue>(V)) {
7514Operands.back().push_back(cast<Instruction>(V)->getOperand(
I));
7518// Try find best operands candidates. 7519for (
unsignedI : seq<unsigned>(0, VL.size() - 1)) {
7525switch (Res.value_or(0)) {
7540constexprunsigned NumAltInsts = 3;
// main + alt + shuffle. 7541unsigned NonInstCnt = 0;
7542// Estimate number of instructions, required for the vectorized node and for 7543// the buildvector node. 7544unsigned UndefCnt = 0;
7545// Count the number of extra shuffles, required for vector nodes. 7546unsigned ExtraShuffleInsts = 0;
7547// Check that operands do not contain same values and create either perfect 7548// diamond match or shuffled match. 7550// Do not count same operands twice. 7555 return is_contained(Operands.back(), V);
7558 ++ExtraShuffleInsts;
7562// Vectorize node, if: 7563// 1. at least single operand is constant or splat. 7564// 2. Operands have many loop invariants (the instructions are not loop 7566// 3. At least single unique operands is supposed to vectorized. 7575if (isa<Constant, ExtractElementInst>(V) ||
7576 getTreeEntry(V) || (L &&
L->isLoopInvariant(V))) {
7577 if (isa<UndefValue>(V))
7582// Found first duplicate - need to add shuffle. 7583if (!Res.second && Res.first->second == 1)
7584 ++ExtraShuffleInsts;
7585 ++Res.first->getSecond();
7586if (
auto *
I = dyn_cast<Instruction>(V))
7587 UniqueOpcodes.
insert(
I->getOpcode());
7591returnnone_of(Uniques, [&](
constauto &
P) {
7592returnP.first->hasNUsesOrMore(
P.second + 1) &&
7594 return getTreeEntry(U) || Uniques.contains(U);
         // Do not vectorize node, if estimated number of vector instructions is
         // more than estimated number of buildvector instructions. Number of
         // vector operands is number of vector instructions + number of vector
         // instructions for operands (buildvectors). Number of buildvector
         // instructions is just number_of_operands * number_of_scalars.
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
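// Worked example of the estimate above, assuming a 2-element alternate node of
// binary operators: the vectorized form is charged NumAltInsts == 3 (main op,
// alternate op, blending shuffle) plus the unique operand opcodes and any
// extra shuffles, while the buildvector form is charged
// getNumOperands() * VL.size() == 2 * 2 == 4 buildvector instructions; the
// node is kept as a vectorizable alternate node only if the vector-side
// estimate stays below that.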
7608BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7610bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7613"Expected instructions with same/alternate opcodes only.");
7615unsigned ShuffleOrOp =
7616 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }

    return TreeEntry::Vectorize;
  }
7640case Instruction::ExtractValue:
7641case Instruction::ExtractElement: {
7642bool Reuse = canReuseExtract(VL, CurrentOrder);
7643// FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and 7644// non-full registers). 7646return TreeEntry::NeedToGather;
7647if (Reuse || !CurrentOrder.empty())
7648return TreeEntry::Vectorize;
7650return TreeEntry::NeedToGather;
7652case Instruction::InsertElement: {
7653// Check that we have a buildvector and not a shuffle of 2 or more 7654// different vectors. 7656for (
Value *V : VL) {
7657 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
7659"Non-constant or undef index?");
7663return !SourceVectors.contains(V);
7665// Found 2nd source vector - cancel. 7667"different source vectors.\n");
7668return TreeEntry::NeedToGather;
7672// The last InsertElement can have multiple uses. 7673return SourceVectors.contains(V) && !
V->hasOneUse();
7678return TreeEntry::NeedToGather;
7681return TreeEntry::Vectorize;
7683case Instruction::Load: {
7684// Check that a vectorized load would load the same memory as a scalar 7685// load. For example, we don't want to vectorize loads that are smaller 7686// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 7687// treats loading/storing it as an i8 struct. If we vectorize loads/stores 7688// from such a struct, we read/write packed bits disagreeing with the 7689// unvectorized version. 7692return TreeEntry::Vectorize;
7694if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7695// Delay slow vectorized nodes for better vectorization attempts. 7696 LoadEntriesToVectorize.insert(VectorizableTree.size());
7697return TreeEntry::NeedToGather;
7699return TreeEntry::ScatterVectorize;
7701if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7702// Delay slow vectorized nodes for better vectorization attempts. 7703 LoadEntriesToVectorize.insert(VectorizableTree.size());
7704return TreeEntry::NeedToGather;
7706return TreeEntry::StridedVectorize;
7710if (
DL->getTypeSizeInBits(ScalarTy) !=
7711DL->getTypeAllocSizeInBits(ScalarTy))
7712LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
7714auto *LI = dyn_cast<LoadInst>(V);
7715return !LI || !LI->isSimple();
7722return TreeEntry::NeedToGather;
7726case Instruction::ZExt:
7727case Instruction::SExt:
7728case Instruction::FPToUI:
7729case Instruction::FPToSI:
7730case Instruction::FPExt:
7731case Instruction::PtrToInt:
7732case Instruction::IntToPtr:
7733case Instruction::SIToFP:
7734case Instruction::UIToFP:
7735case Instruction::Trunc:
7736case Instruction::FPTrunc:
7737case Instruction::BitCast: {
7739for (
Value *V : VL) {
7740if (isa<PoisonValue>(V))
7742Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7745dbgs() <<
"SLP: Gathering casts with different src types.\n");
7746return TreeEntry::NeedToGather;
7749return TreeEntry::Vectorize;
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
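  // E.g. {icmp slt %a, %b, icmp sgt %c, %d} is accepted here because sgt is
  // the swapped form of slt; operands of the compares that use the swapped
  // predicate are commuted later, when the node's operands are collected.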
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
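  // E.g. a pair of fadd instructions without the 'fast' flag is gathered here
  // when the target-specific FP safety check above also fires; with fast-math
  // on every scalar the bundle stays vectorizable.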
7797case Instruction::GetElementPtr: {
7798// We don't combine GEPs with complicated (nested) indexing. 7799for (
Value *V : VL) {
7800auto *
I = dyn_cast<GetElementPtrInst>(V);
7803if (
I->getNumOperands() != 2) {
7804LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
7805return TreeEntry::NeedToGather;
7809// We can't combine several GEPs into one vector if they operate on 7811Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7812for (
Value *V : VL) {
7813auto *
GEP = dyn_cast<GEPOperator>(V);
7816Type *CurTy =
GEP->getSourceElementType();
7818LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
7819return TreeEntry::NeedToGather;
7823// We don't combine GEPs with non-constant indexes. 7825for (
Value *V : VL) {
7826auto *
I = dyn_cast<GetElementPtrInst>(V);
7829auto *
Op =
I->getOperand(1);
7830if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7831 (
Op->getType() != Ty1 &&
7832 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7833Op->getType()->getScalarSizeInBits() >
7834DL->getIndexSizeInBits(
7835V->getType()->getPointerAddressSpace())))) {
7837dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
7838return TreeEntry::NeedToGather;
7842return TreeEntry::Vectorize;
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }

    // Check the order of pointer operands.
    if (sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    return TreeEntry::NeedToGather;
  }
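  // E.g. four i32 stores to p+2, p+0, p+3, p+1 form a single consecutive run:
  // Ptr0 = p+0, PtrN = p+3, so *Dist == 3 == VL.size() - 1 and the bundle is
  // vectorized with a non-empty CurrentOrder describing the required swizzle.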
7886case Instruction::Call: {
7887if (S.getMainOp()->getType()->isFloatingPointTy() &&
7889auto *
I = dyn_cast<Instruction>(V);
7890returnI && !
I->isFast();
7892return TreeEntry::NeedToGather;
7893// Check if the calls are all to the same vectorizable intrinsic or 7901false/*HasGlobalPred*/);
7906return TreeEntry::NeedToGather;
7911for (
unsigned J = 0; J != NumArgs; ++J)
7914for (
Value *V : VL) {
7915CallInst *CI2 = dyn_cast<CallInst>(V);
7923return TreeEntry::NeedToGather;
7925// Some intrinsics have scalar arguments and should be same in order for 7926// them to be vectorized. 7927for (
unsigned J = 0; J != NumArgs; ++J) {
7930if (ScalarArgs[J] != A1J) {
7932 <<
"SLP: mismatched arguments in call:" << *CI
7933 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
7934return TreeEntry::NeedToGather;
7938// Verify that the bundle operands are identical between the two calls. 7943LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
7944 <<
"!=" << *V <<
'\n');
7945return TreeEntry::NeedToGather;
7949return TreeEntry::Vectorize;
7951case Instruction::ShuffleVector: {
7952if (!S.isAltShuffle()) {
7953// REVEC can support non alternate shuffle. 7955return TreeEntry::Vectorize;
7956// If this is not an alternate sequence of opcode like add-sub 7957// then do not vectorize this instruction. 7959return TreeEntry::NeedToGather;
7964 <<
"SLP: ShuffleVector not vectorized, operands are buildvector and " 7965"the whole alt sequence is not profitable.\n");
7966return TreeEntry::NeedToGather;
7969return TreeEntry::Vectorize;
7973return TreeEntry::NeedToGather;
7978/// Allows to correctly handle operands of the phi nodes based on the \p Main 7979/// PHINode order of incoming basic blocks/values. 7987 PHIHandler() =
delete;
7989 : DT(DT), Main(Main), Phis(Phis),
7990Operands(Main->getNumIncomingValues(),
7992void buildOperands() {
7993constexprunsigned FastLimit = 4;
8001// Prepare the operand vector. 8003auto *
P = dyn_cast<PHINode>(V);
8005assert(isa<PoisonValue>(V) &&
8006"Expected isa instruction or poison value.");
8010if (
P->getIncomingBlock(
I) == InBB)
8025Blocks.try_emplace(InBB).first->second.push_back(
I);
8028if (isa<PoisonValue>(V)) {
8033auto *
P = cast<PHINode>(V);
8034for (
unsignedI : seq<unsigned>(0,
P->getNumIncomingValues())) {
8042auto It =
Blocks.find(InBB);
8049if (
P.getSecond().size() <= 1)
8051unsigned BasicI =
P.getSecond().front();
8054 [&](
constauto &Data) {
8055return !Data.value() ||
8056 Data.value() ==
Operands[BasicI][Data.index()];
8058"Expected empty operands list.");
8068const EdgeInfo &UserTreeIdx,
8069unsigned InterleaveFactor) {
8075auto TryToFindDuplicates = [&](
const InstructionsState &S,
8076bool DoNotFail =
false) {
8077// Check that every instruction appears once in this bundle. 8079for (
Value *V : VL) {
8086auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
8091size_t NumUniqueScalarValues = UniqueValues.
size();
8094if (NumUniqueScalarValues == VL.size() &&
8096 ReuseShuffleIndices.
clear();
8098// FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. 8099if ((UserTreeIdx.UserTE &&
8100 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI)) ||
8103"for nodes with padding.\n");
8104 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8108if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8109 (UniquePositions.size() == 1 &&
all_of(UniqueValues, [](
Value *V) {
8112if (DoNotFail && UniquePositions.size() > 1 &&
8113 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8114all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8115// Find the number of elements, which forms full vectors. 8117 *
TTI, UniqueValues.
front()->getType(), UniqueValues.
size());
8118if (PWSz == VL.size()) {
8119 ReuseShuffleIndices.
clear();
8121 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
8123 PWSz - UniqueValues.
size(),
8125// Check that extended with poisons operations are still valid for 8126// vectorization (div/rem are not allowed). 8129 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8132 VL = NonUniqueValueVL;
8137 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8147// Don't go into catchswitch blocks, which can happen with PHIs. 8148// Such blocks can only have PHIs and the catchswitch. There is no 8149// place to insert a shuffle if we need to, so just avoid that issue. 8150if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8152 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8156// Check if this is a duplicate of another entry. 8158if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8161if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8162auto It = MultiNodeScalars.
find(S.getMainOp());
8163if (It != MultiNodeScalars.
end()) {
8164auto *TEIt =
find_if(It->getSecond(),
8165 [&](TreeEntry *ME) { return ME->isSame(VL); });
8166if (TEIt != It->getSecond().end())
8177if (TryToFindDuplicates(S))
8178 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8179 ReuseShuffleIndices);
8183 Nodes.
insert(getTreeEntry(S.getMainOp()));
8184for (
const TreeEntry *E : MultiNodeScalars.
lookup(S.getMainOp()))
8187if (
any_of(Nodes, [&](
const TreeEntry *E) {
8189 [&](
Value *V) { return Values.contains(V); }))
8194all_of(VL, [&](
Value *V) {
return EValues.contains(V); }));
8197if (TryToFindDuplicates(S))
8198 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8199 ReuseShuffleIndices);
8203// Record the reuse of the tree node. FIXME, currently this is only 8204// used to properly draw the graph rather than for the actual 8206 E->UserTreeIndices.push_back(UserTreeIdx);
8207LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.getMainOp()
8214// Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of 8215// a load), in which case peek through to include it in the tree, without 8216// ballooning over-budget. 8218 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8223 cast<Instruction>(
I)->getOpcode() == S.getOpcode();
8225LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
8226if (TryToFindDuplicates(S))
8227 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8228 ReuseShuffleIndices);
8232// Don't handle scalable vectors 8233if (S && S.getOpcode() == Instruction::ExtractElement &&
8234 isa<ScalableVectorType>(
8235 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8236LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
8237if (TryToFindDuplicates(S))
8238 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8239 ReuseShuffleIndices);
8243// Don't handle vectors. 8246 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8250// If all of the operands are identical or constant we have a simple solution. 8251// If we deal with insert/extract instructions, they all must have constant 8252// indices, otherwise we should gather them, not try to vectorize. 8253// If alternate op node with 2 elements with gathered operands - do not 8255auto &&NotProfitableForVectorization = [&S,
this,
8257if (!S || !S.isAltShuffle() || VL.size() > 2)
8263// Check if all operands are extracts, part of vector node or can build a 8264// regular vectorize node. 8266for (
Value *V : VL) {
8267auto *
I = cast<Instruction>(V);
8269 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8274if ((IsCommutative &&
8275 std::accumulate(InstsCount.
begin(), InstsCount.
end(), 0) < 2) ||
8277all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
8279assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
8281auto *
I1 = cast<Instruction>(VL.front());
8282auto *I2 = cast<Instruction>(VL.back());
8283for (
intOp : seq<int>(S.getMainOp()->getNumOperands()))
8285 I2->getOperand(
Op));
8287 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
8289 })) >= S.getMainOp()->getNumOperands() / 2)
8291if (S.getMainOp()->getNumOperands() > 2)
8294// Check permuted operands. 8298 I2->getOperand((
Op + 1) % E));
8300 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
8309bool IsScatterVectorizeUserTE =
8310 UserTreeIdx.UserTE &&
8311 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8313bool AreScatterAllGEPSameBlock =
8314 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8318auto *
I = dyn_cast<GetElementPtrInst>(V);
8323return BB ==
I->getParent() &&
I->getNumOperands() == 2;
8326sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8328bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8331 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8334 NotProfitableForVectorization(VL)) {
8335LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
8336if (TryToFindDuplicates(S))
8337 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8338 ReuseShuffleIndices);
8342// Don't vectorize ephemeral values. 8343if (S && !EphValues.
empty()) {
8344for (
Value *V : VL) {
8345if (EphValues.
count(V)) {
8347 <<
") is ephemeral.\n");
8348 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8354// We now know that this is a vector of instructions of the same type from 8357// Check that none of the instructions in the bundle are already in the tree. 8358for (
Value *V : VL) {
8359if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8362if (getTreeEntry(V)) {
8364 <<
") is already in tree.\n");
8365if (TryToFindDuplicates(S))
8366 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8367 ReuseShuffleIndices);
8372// The reduction nodes (stored in UserIgnoreList) also should stay scalar. 8373if (UserIgnoreList && !UserIgnoreList->empty()) {
8374for (
Value *V : VL) {
8375if (UserIgnoreList->contains(V)) {
8377if (TryToFindDuplicates(S))
8378 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8379 ReuseShuffleIndices);
8385// Special processing for sorted pointers for ScatterVectorize node with 8386// constant indeces only. 8387if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8388assert(VL.front()->getType()->isPointerTy() &&
8389count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8390"Expected pointers only.");
8391// Reset S to make it GetElementPtr kind of node. 8392constauto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
8393assert(It != VL.end() &&
"Expected at least one GEP.");
8397// Check that all of the users of the scalars that we want to vectorize are 8405// Don't go into unreachable blocks. They may contain instructions with 8406// dependency cycles which confuse the final scheduling. 8407// Do not vectorize EH and non-returning blocks, not profitable in most 8410 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8414// Check that every instruction appears once in this bundle. 8415if (!TryToFindDuplicates(S,
/*DoNotFail=*/true))
8418// Perform specific checks for each particular instruction kind. 8421 TreeEntry::EntryState State = getScalarsVectorizationState(
8422 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8423if (State == TreeEntry::NeedToGather) {
8424 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8425 ReuseShuffleIndices);
8429auto &BSRef = BlocksSchedules[BB];
8431 BSRef = std::make_unique<BlockScheduling>(BB);
8433 BlockScheduling &BS = *BSRef;
8435 std::optional<ScheduleData *> Bundle =
8436 BS.tryScheduleBundle(UniqueValues,
this, S);
8437#ifdef EXPENSIVE_CHECKS 8438// Make sure we didn't break any internal invariants 8442LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
8443assert((!BS.getScheduleData(VL0) ||
8444 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8445"tryScheduleBundle should cancelScheduling on failure");
8446 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8447 ReuseShuffleIndices);
8448 NonScheduledFirst.insert(VL.front());
8449if (S.getOpcode() == Instruction::Load &&
8450 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8454LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
8456unsigned ShuffleOrOp =
8457 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
8458auto CreateOperandNodes = [&](TreeEntry *
TE,
constauto &
Operands) {
8459// Postpone PHI nodes creation 8461for (
unsignedI : seq<unsigned>(
Operands.size())) {
8466if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8471for (
unsignedI : PHIOps)
8474switch (ShuffleOrOp) {
8475case Instruction::PHI: {
8476auto *PH = cast<PHINode>(VL0);
8479 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8483// Keeps the reordered operands to avoid code duplication. 8484 PHIHandler Handler(*DT, PH, VL);
8485 Handler.buildOperands();
8486for (
unsignedI : seq<unsigned>(PH->getNumOperands()))
8487TE->setOperand(
I, Handler.getOperands(
I));
8489for (
unsignedI : seq<unsigned>(PH->getNumOperands()))
8494case Instruction::ExtractValue:
8495case Instruction::ExtractElement: {
8496if (CurrentOrder.empty()) {
8497LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
8500dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence " 8502for (
unsignedIdx : CurrentOrder)
8508// Insert new order with initial value 0, if it does not exist, 8509// otherwise return the iterator to the existing one. 8510 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8511 ReuseShuffleIndices, CurrentOrder);
8513"(ExtractValueInst/ExtractElementInst).\n";
8515// This is a special case, as it does not gather, but at the same time 8516// we are not extending buildTree_rec() towards the operands. 8517TE->setOperand(*
this);
8520case Instruction::InsertElement: {
8521assert(ReuseShuffleIndices.
empty() &&
"All inserts should be unique");
8523auto OrdCompare = [](
const std::pair<int, int> &
P1,
8524const std::pair<int, int> &P2) {
8525returnP1.first > P2.first;
8528decltype(OrdCompare)>
8529 Indices(OrdCompare);
8530for (
intI = 0, E = VL.size();
I < E; ++
I) {
8532 Indices.emplace(
Idx,
I);
8534OrdersType CurrentOrder(VL.size(), VL.size());
8535bool IsIdentity =
true;
8536for (
intI = 0, E = VL.size();
I < E; ++
I) {
8537 CurrentOrder[Indices.top().second] =
I;
8538 IsIdentity &= Indices.top().second ==
I;
8542 CurrentOrder.clear();
8543 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8545LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (InsertElementInst).\n";
8548TE->setOperand(*
this);
8549 buildTree_rec(
TE->getOperand(1),
Depth + 1, {TE, 1});
8552case Instruction::Load: {
8553// Check that a vectorized load would load the same memory as a scalar 8554// load. For example, we don't want to vectorize loads that are smaller 8555// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 8556// treats loading/storing it as an i8 struct. If we vectorize loads/stores 8557// from such a struct, we read/write packed bits disagreeing with the 8558// unvectorized version. 8559 TreeEntry *
TE =
nullptr;
8562case TreeEntry::Vectorize:
8563TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8564 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8565if (CurrentOrder.empty())
8570 <<
"SLP: added a new TreeEntry (jumbled LoadInst).\n";
8573case TreeEntry::StridedVectorize:
8574// Vectorizing non-consecutive loads with `llvm.masked.gather`. 8575TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8576 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8577LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (strided LoadInst).\n";
8580case TreeEntry::ScatterVectorize:
8581// Vectorizing non-consecutive loads with `llvm.masked.gather`. 8582TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8583 UserTreeIdx, ReuseShuffleIndices);
8586 <<
"SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8589case TreeEntry::CombinedVectorize:
8590case TreeEntry::NeedToGather:
8593TE->setOperand(*
this);
8594if (State == TreeEntry::ScatterVectorize)
8595 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
8598case Instruction::ZExt:
8599case Instruction::SExt:
8600case Instruction::FPToUI:
8601case Instruction::FPToSI:
8602case Instruction::FPExt:
8603case Instruction::PtrToInt:
8604case Instruction::IntToPtr:
8605case Instruction::SIToFP:
8606case Instruction::UIToFP:
8607case Instruction::Trunc:
8608case Instruction::FPTrunc:
8609case Instruction::BitCast: {
8610auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8611 std::make_pair(std::numeric_limits<unsigned>::min(),
8612 std::numeric_limits<unsigned>::max()));
8613if (ShuffleOrOp == Instruction::ZExt ||
8614 ShuffleOrOp == Instruction::SExt) {
8615 CastMaxMinBWSizes = std::make_pair(
8621 }
elseif (ShuffleOrOp == Instruction::Trunc) {
8622 CastMaxMinBWSizes = std::make_pair(
8629 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8630 ReuseShuffleIndices);
8634TE->setOperand(*
this);
8636 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8637if (ShuffleOrOp == Instruction::Trunc) {
8638 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8639 }
elseif (ShuffleOrOp == Instruction::SIToFP ||
8640 ShuffleOrOp == Instruction::UIToFP) {
8641unsigned NumSignBits =
8643if (
auto *OpI = dyn_cast<Instruction>(VL0->
getOperand(0))) {
8645 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
8647if (NumSignBits * 2 >=
8649 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8653case Instruction::ICmp:
8654case Instruction::FCmp: {
8655// Check that all of the compares have the same predicate. 8657 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8658 ReuseShuffleIndices);
8663 VLOperands Ops(VL, S, *
this);
8665// Commutative predicate - collect + sort operands of the instructions 8666// so that each side is more likely to have the same opcode. 8668"Commutative Predicate mismatch");
8671Right = Ops.getVL(1);
8673// Collect operands - commute if it uses the swapped predicate. 8674for (
Value *V : VL) {
8675if (isa<PoisonValue>(V)) {
8680auto *
Cmp = cast<CmpInst>(V);
8683if (
Cmp->getPredicate() != P0)
8686Right.push_back(RHS);
8693if (ShuffleOrOp == Instruction::ICmp) {
8694unsigned NumSignBits0 =
8696if (NumSignBits0 * 2 >=
8698 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8699unsigned NumSignBits1 =
8701if (NumSignBits1 * 2 >=
8703 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->
Idx);
8707case Instruction::Select:
8708case Instruction::FNeg:
8709case Instruction::Add:
8710case Instruction::FAdd:
8711case Instruction::Sub:
8712case Instruction::FSub:
8713case Instruction::Mul:
8714case Instruction::FMul:
8715case Instruction::UDiv:
8716case Instruction::SDiv:
8717case Instruction::FDiv:
8718case Instruction::URem:
8719case Instruction::SRem:
8720case Instruction::FRem:
8721case Instruction::Shl:
8722case Instruction::LShr:
8723case Instruction::AShr:
8724case Instruction::And:
8725case Instruction::Or:
8726case Instruction::Xor:
8727case Instruction::Freeze: {
8728 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8729 ReuseShuffleIndices);
8731dbgs() <<
"SLP: added a new TreeEntry " 8732"(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8737 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8740case Instruction::GetElementPtr: {
8741 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8742 ReuseShuffleIndices);
8743LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (GetElementPtrInst).\n";
8746// Prepare the operand vector for pointer operands. 8747for (
Value *V : VL) {
8748auto *
GEP = dyn_cast<GetElementPtrInst>(V);
8753Operands.front().push_back(
GEP->getPointerOperand());
8756// Need to cast all indices to the same type before vectorization to 8758// Required to be able to find correct matches between different gather 8759// nodes and reuse the vectorized values rather than trying to gather them 8764 [VL0Ty, IndexIdx](
Value *V) {
8765auto *
GEP = dyn_cast<GetElementPtrInst>(V);
8768return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
8772 ->getPointerOperandType()
8774// Prepare the operand vector. 8775for (
Value *V : VL) {
8776auto *
I = dyn_cast<GetElementPtrInst>(V);
8779 ConstantInt::get(Ty, 0,
/*isSigned=*/false));
8782auto *
Op =
I->getOperand(IndexIdx);
8783auto *CI = dyn_cast<ConstantInt>(
Op);
8788 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8792for (
unsignedI = 0, Ops =
Operands.size();
I < Ops; ++
I)
8796case Instruction::Store: {
8797bool Consecutive = CurrentOrder.empty();
8800 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8801 ReuseShuffleIndices, CurrentOrder);
8807dbgs() <<
"SLP: added a new TreeEntry (jumbled StoreInst).\n";
8809TE->setOperand(*
this);
8810 buildTree_rec(
TE->getOperand(0),
Depth + 1, {TE, 0});
8813case Instruction::Call: {
8814// Check if the calls are all to the same vectorizable intrinsic or 8819 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8820 ReuseShuffleIndices);
8824for (
unsignedI : seq<unsigned>(CI->
arg_size())) {
8825// For scalar operands no need to create an entry since no need to 8829 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8833case Instruction::ShuffleVector: {
8834 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8835 ReuseShuffleIndices);
8836if (S.isAltShuffle()) {
8837LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (isAltShuffle).\n";
8842dbgs() <<
"SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8846// Reorder operands if reordering would enable vectorization. 8847auto *CI = dyn_cast<CmpInst>(VL0);
8849return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8851auto *MainCI = cast<CmpInst>(S.getMainOp());
8852auto *AltCI = cast<CmpInst>(S.getAltOp());
8856"Expected different main/alternate predicates.");
8858// Collect operands - commute if it uses the swapped predicate or 8859// alternate operation. 8860for (
Value *V : VL) {
8861if (isa<PoisonValue>(V)) {
8866auto *
Cmp = cast<CmpInst>(V);
8878Right.push_back(RHS);
8887TE->setOperand(*
this, isa<BinaryOperator>(VL0) || CI);
8889 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8902while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8905if (
auto *ST = dyn_cast<StructType>(EltTy)) {
8906// Check that struct is homogeneous. 8907for (
constauto *Ty : ST->elements())
8908if (Ty != *ST->element_begin())
8910N *= ST->getNumElements();
8911 EltTy = *ST->element_begin();
8912 }
elseif (
auto *AT = dyn_cast<ArrayType>(EltTy)) {
8913N *= AT->getNumElements();
8914 EltTy = AT->getElementType();
8916auto *VT = cast<FixedVectorType>(EltTy);
8917N *= VT->getNumElements();
8918 EltTy = VT->getElementType();
8925if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8933bool ResizeAllowed)
const{
8934constauto *It =
find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8935assert(It != VL.
end() &&
"Expected at least one extract instruction.");
8936auto *E0 = cast<Instruction>(*It);
8938all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8940// Check if all of the extracts come from the same vector and from the 8942Value *Vec = E0->getOperand(0);
8944 CurrentOrder.
clear();
8946// We have to extract from a vector/aggregate with the same number of elements. 8948if (E0->getOpcode() == Instruction::ExtractValue) {
8952// Check if load can be rewritten as load of vector. 8953LoadInst *LI = dyn_cast<LoadInst>(Vec);
8957 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
8960unsigned E = VL.
size();
8961if (!ResizeAllowed && NElts != E)
8964unsigned MinIdx = NElts, MaxIdx = 0;
8966auto *Inst = dyn_cast<Instruction>(V);
8969if (Inst->getOperand(0) != Vec)
8971if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
8972if (isa<UndefValue>(EE->getIndexOperand()))
8977constunsigned ExtIdx = *
Idx;
8980 Indices[
I] = ExtIdx;
8986if (MaxIdx - MinIdx + 1 > E)
8991// Check that all of the indices extract from the correct offset. 8992bool ShouldKeepOrder =
true;
8993// Assign to all items the initial value E + 1 so we can check if the extract 8994// instruction index was used already. 8995// Also, later we can check that all the indices are used and we have a 8996// consecutive access in the extract instructions, by checking that no 8997// element of CurrentOrder still has value E + 1. 8998 CurrentOrder.
assign(E, E);
8999for (
unsignedI = 0;
I < E; ++
I) {
9002constunsigned ExtIdx = Indices[
I] - MinIdx;
9003if (CurrentOrder[ExtIdx] != E) {
9004 CurrentOrder.
clear();
9007 ShouldKeepOrder &= ExtIdx ==
I;
9008 CurrentOrder[ExtIdx] =
I;
9011 CurrentOrder.
clear();
9013return ShouldKeepOrder;
9016bool BoUpSLP::areAllUsersVectorized(
9018return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
9020 return ScalarToTreeEntry.contains(U) ||
9021 isVectorLikeInstWithConstOps(U) ||
9022 (isa<ExtractElementInst>(U) && MustGather.contains(U));
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);

  auto Shape = VFShape::get(CI->getFunctionType(),
                            ElementCount::getFixed(VecTy->getNumElements()),
                            false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
  auto LibCost = IntrinsicCost;
  if (!CI->isNoBuiltin() && VecFunc) {
    // Calculate the cost of the vector library call.
    // If the corresponding vector call is cheaper, return its cost.
    LibCost =
        TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
  }
  return {IntrinsicCost, LibCost};
}
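// Illustrative example: for a call widened to VF = 4 that maps to
// llvm.sqrt.f32, the first member is the cost of the <4 x float> llvm.sqrt
// intrinsic and the second is the cost of a matching vector-library routine
// (if TLI provides one, e.g. an SVML/SLEEF-style binding); callers then pick
// whichever of the two turns out cheaper.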
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
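// Example (illustrative): for Scalars = {add, sub, add, sub} with IsAltOp
// matching the subs and no reordering, the resulting mask is
// <0, Sz + 1, 2, Sz + 3> with Sz == 4, i.e. <0, 5, 2, 7>: even lanes come
// from the "main" (add) vector and odd lanes from the "alternate" (sub)
// vector when the final select-like shuffle is emitted.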
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    // ... (exact or swapped matches against MainCI/AltCI are resolved first)
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    // Classify as the alternate operation when the predicate does not match
    // the main predicate, directly or swapped.
    return MainP != P && MainP != SwappedP;
  }
  const auto *Op0 = Ops.front();

  const bool IsConstant = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // TODO: We should allow undef elements here
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
/// The base class for shuffle instruction emission and shuffle cost estimation.
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements.
  /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
  /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
  /// of 8.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
9183 /// Checks if the mask is an identity mask. 9184 /// \param IsStrict if is true the function returns false if mask size does 9185 /// not match vector size. 9188int Limit =
Mask.size();
9194// Consider extract subvector starting from index 0. 9198// All VF-size submasks are identity (e.g. 9199// <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4). 9200if (Limit % VF == 0 &&
all_of(seq<int>(0, Limit / VF), [=](
intIdx) {
9210 /// Tries to combine 2 different masks into single one. 9211 /// \param LocalVF Vector length of the permuted input vector. \p Mask may 9212 /// change the size of the vector, \p LocalVF is the original size of the 9213 /// shuffled vector. 9216unsigned VF =
Mask.size();
9218for (
intI = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
9221int MaskedIdx =
Mask[ExtMask[
I] % VF];
9228 /// Looks through shuffles trying to reduce final number of shuffles in the 9229 /// code. The function looks through the previously emitted shuffle 9230 /// instructions and properly mark indices in mask as undef. 9231 /// For example, given the code 9233 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0> 9234 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0> 9236 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will 9237 /// look through %s1 and %s2 and select vectors %0 and %1 with mask 9238 /// <0, 1, 2, 3> for the shuffle. 9239 /// If 2 operands are of different size, the smallest one will be resized and 9240 /// the mask recalculated properly. 9241 /// For example, given the code 9243 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0> 9244 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0> 9246 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will 9247 /// look through %s1 and %s2 and select vectors %0 and %1 with mask 9248 /// <0, 1, 2, 3> for the shuffle. 9249 /// So, it tries to transform permutations to simple vector merge, if 9251 /// \param V The input vector which must be shuffled using the given \p Mask. 9252 /// If the better candidate is found, \p V is set to this best candidate 9254 /// \param Mask The input mask for the shuffle. If the best candidate is found 9255 /// during looking-through-shuffles attempt, it is updated accordingly. 9256 /// \param SinglePermute true if the shuffle operation is originally a 9257 /// single-value-permutation. In this case the look-through-shuffles procedure 9258 /// may look for resizing shuffles as the best candidates. 9259 /// \return true if the shuffle results in the non-resizing identity shuffle 9260 /// (and thus can be ignored), false - otherwise. 9262bool SinglePermute) {
9266while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
9267// Exit if not a fixed vector type or changing size shuffle. 9268auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9271// Remember the identity or broadcast mask, if it is not a resizing 9272// shuffle. If no better candidates are found, this Op and Mask will be 9273// used in the final shuffle. 9274if (isIdentityMask(Mask, SVTy,
/*IsStrict=*/false)) {
9275if (!IdentityOp || !SinglePermute ||
9276 (isIdentityMask(Mask, SVTy,
/*IsStrict=*/true) &&
9278 IdentityMask.
size()))) {
9280// Store current mask in the IdentityMask so later we did not lost 9281// this info if IdentityOp is selected as the best candidate for the 9283 IdentityMask.
assign(Mask);
9286// Remember the broadcast mask. If no better candidates are found, this Op 9287// and Mask will be used in the final shuffle. 9288// Zero splat can be used as identity too, since it might be used with 9289// mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling. 9290// E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is 9291// expensive, the analysis founds out, that the source vector is just a 9292// broadcast, this original mask can be transformed to identity mask <0, 9295// %0 = shuffle %v, poison, zeroinitalizer 9296// %res = shuffle %0, poison, <3, 1, 2, 0> 9298// may be transformed to 9300// %0 = shuffle %v, poison, zeroinitalizer 9301// %res = shuffle %0, poison, <0, 1, 2, 3> 9303if (SV->isZeroEltSplat()) {
9305 IdentityMask.
assign(Mask);
9307int LocalVF =
Mask.size();
9309 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9310 LocalVF = SVOpTy->getNumElements();
9314static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
9316 ExtMask[
Idx] = SV->getMaskValue(
I);
9326if (!IsOp1Undef && !IsOp2Undef) {
9327// Update mask and mark undef elems. 9328for (
int &
I : Mask) {
9331if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
9338 combineMasks(LocalVF, ShuffleMask, Mask);
9339Mask.swap(ShuffleMask);
9341Op = SV->getOperand(0);
9343Op = SV->getOperand(1);
9345if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
9346 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9351"Expected masks of same sizes.");
9352// Clear known poison elements. 9356Mask.swap(IdentityMask);
9357auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9358return SinglePermute &&
9359 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
9360/*IsStrict=*/true) ||
9361 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
9362 Shuffle->isZeroEltSplat() &&
9372 /// Smart shuffle instruction emission, walks through shuffles trees and 9373 /// tries to find the best matching vector for the actual shuffle 9375template <
typename T,
typename ShuffleBuilderTy>
9377 ShuffleBuilderTy &Builder,
Type *ScalarTy) {
9378assert(V1 &&
"Expected at least one vector value.");
9381if (ScalarTyNumElements != 1) {
9387 Builder.resizeToMatch(V1, V2);
9389if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
9390 VF = FTy->getNumElements();
9391if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9394// Peek through shuffles. 9398 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
9401for (
intI = 0,
E =
Mask.size();
I <
E; ++
I) {
9403 CombinedMask1[
I] =
Mask[
I];
9405 CombinedMask2[
I] =
Mask[
I] - VF;
9412 (void)peekThroughShuffles(Op1, CombinedMask1,
/*SinglePermute=*/false);
9413 (void)peekThroughShuffles(Op2, CombinedMask2,
/*SinglePermute=*/false);
9414// Check if we have 2 resizing shuffles - need to peek through operands 9416if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9417if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9422 ExtMask1[
Idx] = SV1->getMaskValue(
I);
9425 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9427 ExtMask1, UseMask::SecondArg);
9432 ExtMask2[
Idx] = SV2->getMaskValue(
I);
9435 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9437 ExtMask2, UseMask::SecondArg);
9438if (SV1->getOperand(0)->getType() ==
9439 SV2->getOperand(0)->getType() &&
9440 SV1->getOperand(0)->getType() != SV1->getType() &&
9443 Op1 = SV1->getOperand(0);
9444 Op2 = SV2->getOperand(0);
9446int LocalVF = ShuffleMask1.size();
9447if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
9448 LocalVF = FTy->getNumElements();
9449 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9450 CombinedMask1.swap(ShuffleMask1);
9452 LocalVF = ShuffleMask2.size();
9453if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
9454 LocalVF = FTy->getNumElements();
9455 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9456 CombinedMask2.swap(ShuffleMask2);
9459 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
9460 Builder.resizeToMatch(Op1, Op2);
9461 VF = std::max(cast<VectorType>(Op1->
getType())
9463 .getKnownMinValue(),
9464 cast<VectorType>(Op2->
getType())
9466 .getKnownMinValue());
9467for (
intI = 0,
E =
Mask.size();
I <
E; ++
I) {
9470"Expected undefined mask element");
9471 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
9477 isa<ShuffleVectorInst>(Op1) &&
9478 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9480return Builder.createIdentity(Op1);
9481return Builder.createShuffleVector(
9485if (isa<PoisonValue>(V1))
9486return Builder.createPoison(
9487 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
9488bool IsIdentity = peekThroughShuffles(V1, NewMask,
/*SinglePermute=*/true);
9489assert(V1 &&
"Expected non-null value after looking through shuffles.");
9492return Builder.createShuffleVector(V1, NewMask);
9493return Builder.createIdentity(V1);
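  // Illustrative walk-through of the look-through logic above (IR sketch):
  //
  //   %s1 = shufflevector <2 x ty> %a, <2 x ty> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  //   %s2 = shufflevector <2 x ty> %b, <2 x ty> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  //
  // A requested shuffle of %s1/%s2 with mask <0, 1, 4, 5> peeks through both
  // resizing shuffles, picks %a and %b directly and emits the single
  //   shufflevector <2 x ty> %a, <2 x ty> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  // instead of three shuffle instructions.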
  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned I : seq<unsigned>(CommonMask.size()))
      if (Mask[I] != PoisonMaskElem)
        CommonMask[I] = I;
  }
};
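// Note (illustrative): this helper is called right after a shuffle with
// \p Mask has been emitted or its cost accounted for. Every lane I the
// shuffle actually produced (Mask[I] != poison) now lives at position I of
// the new vector, so the common mask is rewritten to the identity index for
// those lanes. E.g. CommonMask = Mask = <3, 1, poison, 0> becomes
// <0, 1, poison, 3>.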
/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as plane wide unit-stride load/store since all the
  // loads/stores are known to be from/to adjacent locations.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: estimate costs for pointer related costs when vectorizing to
    // a wide load/store.
    // Scalar cost is estimated as a set of pointers with known relationship
    // between them.
    // For vector code we will use BasePtr as argument for the wide load/store
    // but we also need to account all the instructions which are going to
    // stay in vectorized code due to uses outside of these scalar
    // loads/stores.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity assume Ptr to stay in vectorized code if it's not a
      // GEP instruction. We don't care since it's cost considered free.
      // TODO: We should check for any uses outside of vectorizable tree
      // rather than just single use.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have
      // any savings on that.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Case 1: Ptrs are the arguments of loads that we are going to transform
    // into masked gather load intrinsic.
    // All the scalar GEPs will be removed as a result of vectorization.
    // For any external uses of some lanes extract element instructions will
    // be generated (which cost is estimated separately).
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP)
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(),
                               SmallVector<const Value *>(BaseGEP->indices()),
                               VecTy, CostKind);
  }

  return std::make_pair(ScalarCost, VecCost);
}
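// Illustrative example for case (2) above: four adjacent scalar loads fed by
// GEPs %p, %p+1, %p+2, %p+3 are costed on the scalar side as a unit-stride
// pointer chain rooted at %p, while the vector side only needs %p itself as
// the address of the wide load; any GEP that must survive (e.g. because it
// has users outside this bundle) is added back into the vector-side cost.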
9589void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9590assert(
TE.isGather() &&
TE.ReorderIndices.empty() &&
9591"Expected gather node without reordering.");
9595// Do not reorder nodes if it small (just 2 elements), all-constant or all 9596// instructions have same opcode already. 9597if (
TE.Scalars.size() == 2 || (
TE.hasState() && !
TE.isAltShuffle()) ||
9601if (
any_of(seq<unsigned>(
TE.Idx), [&](
unsignedIdx) {
9602 return VectorizableTree[Idx]->isSame(TE.Scalars);
9606auto GenerateLoadsSubkey = [&](
size_tKey,
LoadInst *LI) {
9611auto LIt = LoadsMap.
find(std::make_pair(Key,
Ptr));
9612if (LIt != LoadsMap.
end()) {
9616/*StrictCheck=*/true))
9626if (LIt->second.size() > 2) {
9628hash_value(LIt->second.back()->getPointerOperand());
9634 LoadsMap.
try_emplace(std::make_pair(Key,
Ptr)).first->second.push_back(LI);
9639bool IsOrdered =
true;
9640unsigned NumInstructions = 0;
9641// Try to "cluster" scalar instructions, to be able to build extra vectorized 9645if (
auto *Inst = dyn_cast<Instruction>(V);
9646 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9649/*AllowAlternate=*/false);
9652auto &Container = SortedValues[
Key];
9653if (IsOrdered && !KeyToIndex.
contains(V) &&
9654 !(isa<Constant, ExtractElementInst>(V) ||
9656 ((Container.contains(
Idx) &&
9657 KeyToIndex.
at(Container[
Idx].back()).back() !=
I - 1) ||
9658 (!Container.empty() && !Container.contains(
Idx) &&
9659 KeyToIndex.
at(Container.back().second.back()).back() !=
I - 1)))
9661auto &KTI = KeyToIndex[
V];
9663 Container[
Idx].push_back(V);
9668if (!IsOrdered && NumInstructions > 1) {
9670TE.ReorderIndices.resize(
TE.Scalars.size(),
TE.Scalars.size());
9671for (
constauto &
D : SortedValues) {
9672for (
constauto &
P :
D.second) {
9674for (
Value *V :
P.second) {
9677TE.ReorderIndices[Cnt +
K] =
Idx;
9678TE.Scalars[Cnt +
K] =
V;
9680 Sz += Indices.
size();
9681 Cnt += Indices.
size();
9683if (Sz > 1 && isa<Instruction>(
P.second.front())) {
9685 *
TTI,
TE.Scalars.front()->getType(), Sz);
9687for (
unsignedI : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9689 }
elseif (!
P.second.empty() &&
isConstant(
P.second.front())) {
9690for (
unsignedI : seq<unsigned>(Cnt - Sz, Cnt))
9696// Reuses always require shuffles, so consider it as profitable. 9697if (!
TE.ReuseShuffleIndices.empty() ||
TE.ReorderIndices.empty())
9699// Do simple cost estimation. 9702auto *ScalarTy =
TE.Scalars.front()->getType();
9704for (
auto [
Idx, Sz] : SubVectors) {
9708if (
auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9710// If ScalarTy is FixedVectorType, we should use CreateInsertVector instead 9711// of CreateInsertElement. 9713for (
unsignedI : seq<unsigned>(
TE.Scalars.size()))
9722int Sz =
TE.Scalars.size();
9724TE.ReorderIndices.end());
9725for (
unsignedI : seq<unsigned>(Sz)) {
9727if (isa<PoisonValue>(V)) {
9730 ReorderMask[
I] =
I +
TE.ReorderIndices.size();
9734any_of(ReorderMask, [&](
intI) {
returnI >= Sz; })
9737 VecTy, ReorderMask);
9740for (
unsignedI : seq<unsigned>(Sz)) {
9744if (!isa<PoisonValue>(V))
9747 ReorderMask[
I] =
I + Sz;
9751 VecTy, DemandedElts,
/*Insert=*/true,
/*Extract=*/false,
CostKind);
9754if (
Cost >= BVCost) {
9757TE.ReorderIndices.clear();
9763 BaseGraphSize = VectorizableTree.size();
9764// Turn graph transforming mode on and off, when done. 9765classGraphTransformModeRAAI {
9766bool &SavedIsGraphTransformMode;
9769 GraphTransformModeRAAI(
bool &IsGraphTransformMode)
9770 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9771 IsGraphTransformMode =
true;
9773 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode =
false; }
9774 } TransformContext(IsGraphTransformMode);
9775// Operands are profitable if they are: 9776// 1. At least one constant 9780// 3. Results in good vectorization opportunity, i.e. may generate vector 9781// nodes and reduce cost of the graph. 9783const InstructionsState &S) {
9785for (
unsignedOp : seq<unsigned>(S.getMainOp()->getNumOperands()))
9787 I2->getOperand(
Op));
9789 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
9791 [](
const std::pair<Value *, Value *> &
P) {
9792return isa<Constant>(
P.first) ||
9793 isa<Constant>(
P.second) ||
P.first ==
P.second;
9799// Try to reorder gather nodes for better vectorization opportunities. 9800for (
unsignedIdx : seq<unsigned>(BaseGraphSize)) {
9801 TreeEntry &E = *VectorizableTree[
Idx];
9803 reorderGatherNode(E);
9806// The tree may grow here, so iterate over nodes, built before. 9807for (
unsignedIdx : seq<unsigned>(BaseGraphSize)) {
9808 TreeEntry &E = *VectorizableTree[
Idx];
9813// Do not try partial vectorization for small nodes (<= 2), nodes with the 9814// same opcode and same parent block or all constants. 9815if (VL.
size() <= 2 || LoadEntriesToVectorize.contains(
Idx) ||
9816 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
9820// Try to find vectorizable sequences and transform them into a series of 9821// insertvector instructions. 9822unsigned StartIdx = 0;
9827 *
TTI, VL.
front()->getType(), VF - 1)) {
9828if (StartIdx + VF >
End)
9831for (
unsigned Cnt = StartIdx; Cnt + VF <=
End; Cnt += VF) {
9833// If any instruction is vectorized already - do not try again. 9834// Reuse the existing node, if it fully matches the slice. 9835if (
const TreeEntry *SE = getTreeEntry(Slice.
front());
9836 SE || getTreeEntry(Slice.
back())) {
9839if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9842// Constant already handled effectively - skip. 9845// Do not try to vectorize small splats (less than vector register and 9846// only with the single non-undef element). 9848if (Slices.
empty() || !IsSplat ||
9850 Slice.
front()->getType(), VF)),
9853 Slice.
front()->getType(), 2 * VF)),
9856static_cast<long>(isa<UndefValue>(Slice.
front()) ? VF - 1
9862 (S.getOpcode() == Instruction::Load &&
9864 (S.getOpcode() != Instruction::Load &&
9868// Try to vectorize reduced values or if all users are vectorized. 9869// For expensive instructions extra extracts might be profitable. 9870if ((!UserIgnoreList || E.Idx != 0) &&
9874if (isa<PoisonValue>(V))
9876return areAllUsersVectorized(cast<Instruction>(V),
9880if (S.getOpcode() == Instruction::Load) {
9885// Do not vectorize gathers. 9890// If reductions and the scalars from the root node are 9891// analyzed - mark as non-vectorizable reduction. 9892if (UserIgnoreList && E.Idx == 0)
9897 }
elseif (S.getOpcode() == Instruction::ExtractElement ||
9900 !CheckOperandsProfitability(
9903 IsaPred<Instruction>)),
9905// Do not vectorize extractelements (handled effectively 9906// alread). Do not vectorize non-profitable instructions (with 9907// low cost and non-vectorizable operands.) 9914auto AddCombinedNode = [&](
unsignedIdx,
unsigned Cnt,
unsigned Sz) {
9915 E.CombinedEntriesWithIndices.emplace_back(
Idx, Cnt);
9917 StartIdx = Cnt + Sz;
9921for (
auto [Cnt, Sz] : Slices) {
9923// If any instruction is vectorized already - do not try again. 9924if (TreeEntry *SE = getTreeEntry(Slice.
front());
9925 SE || getTreeEntry(Slice.
back())) {
9928if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9930 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9931 AddCombinedNode(SE->Idx, Cnt, Sz);
9934unsigned PrevSize = VectorizableTree.size();
9935 [[maybe_unused]]
unsigned PrevEntriesSize =
9936 LoadEntriesToVectorize.size();
9937 buildTree_rec(Slice, 0,
EdgeInfo(&E, UINT_MAX));
9938if (PrevSize + 1 == VectorizableTree.size() &&
9939 VectorizableTree[PrevSize]->isGather() &&
9940 VectorizableTree[PrevSize]->hasState() &&
9941 VectorizableTree[PrevSize]->getOpcode() !=
9942 Instruction::ExtractElement &&
9944if (UserIgnoreList && E.Idx == 0 && VF == 2)
9946 VectorizableTree.pop_back();
9947assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9948"LoadEntriesToVectorize expected to remain the same");
9951 AddCombinedNode(PrevSize, Cnt, Sz);
9954// Restore ordering, if no extra vectorization happened. 9955if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9958 E.ReorderIndices.clear();
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
      // operands.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as strided
      // load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind) +
            TTI->getShuffleCost(TTI::SK_Reverse, VecTy, {}, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
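        // Illustrative IR sketch of the trade-off being costed here:
        //   %v = load <4 x i32>, ptr %p                                  ; consecutive
        //   %r = shufflevector <4 x i32> %v, poison, <i32 3, i32 2, i32 1, i32 0>
        // may instead be emitted as a single strided load that starts at the
        // last element and walks backwards with stride -1, saving the explicit
        // reverse shuffle when the target makes that cheaper.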
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as
      // strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind) +
            TTI->getShuffleCost(TTI::SK_Reverse, VecTy, {}, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive store -
          // transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) -> unsigned {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(Mask, Factor, Mask.size()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }
          return 0;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
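        // For example, a reorder mask following the canonical factor-2
        // interleave pattern for 8 elements, <0, 4, 1, 5, 2, 6, 3, 7>, is
        // recognized here (assuming the target reports such interleaved
        // accesses as legal), so codegen can later use an interleaved store
        // instead of a shuffle followed by a plain wide store.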
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
      if (MinMaxID == Intrinsic::not_intrinsic)
        break;
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
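        // Illustrative pattern handled here (IR sketch): when every scalar in
        // the node looks like
        //   %c = icmp slt i32 %a, %b
        //   %m = select i1 %c, i32 %a, i32 %b
        // the select node is costed and emitted as a combined smin-style
        // MinMax operation, and the feeding compare node is folded into it
        // rather than being vectorized as a separate tree entry.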
10064if (LoadEntriesToVectorize.empty()) {
10065// Single load node - exit. 10066if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
10067 VectorizableTree.front()->getOpcode() == Instruction::Load)
10069// Small graph with small VF - exit. 10070constexprunsigned SmallTree = 3;
10071constexprunsigned SmallVF = 2;
10072if ((VectorizableTree.size() <= SmallTree &&
10073 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10074 (VectorizableTree.size() <= 2 && UserIgnoreList))
10077if (VectorizableTree.front()->isNonPowOf2Vec() &&
10081 [](
const std::unique_ptr<TreeEntry> &TE) {
10082return TE->isGather() && TE->hasState() &&
10083 TE->getOpcode() == Instruction::Load &&
10089// A list of loads to be gathered during the vectorization process. We can 10090// try to vectorize them at the end, if profitable. 10095for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10096 TreeEntry &E = *TE;
10098 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
10099 (!E.hasState() &&
any_of(E.Scalars,
10101 return isa<LoadInst>(V) &&
10102 !isVectorized(V) &&
10103 !isDeleted(cast<Instruction>(V));
10106for (
Value *V : E.Scalars) {
10107auto *LI = dyn_cast<LoadInst>(V);
10113 *
this, V, *DL, *SE, *
TTI,
10114 GatheredLoads[std::make_tuple(
10121// Try to vectorize gathered loads if this is not just a gather of loads. 10122if (!GatheredLoads.
empty())
10123 tryToVectorizeGatheredLoads(GatheredLoads);
10126/// Merges shuffle masks and emits final shuffle instruction, if required. It 10127/// supports shuffling of 2 input vectors. It implements lazy shuffles emission, 10128/// when the actual shuffle instruction is generated only if this is actually 10129/// required. Otherwise, the shuffle instruction emission is delayed till the 10130/// end of the process, to reduce the number of emitted instructions and further 10131/// analysis/transformations. 10133bool IsFinalized =
false;
10142 /// While set, still trying to estimate the cost for the same nodes and we 10143 /// can delay actual cost estimation (virtual shuffle instruction emission). 10144 /// May help better estimate the cost if same nodes must be permuted + allows 10145 /// to move most of the long shuffles cost estimation to TTI. 10146bool SameNodesEstimated =
true;
10155if (
auto *VTy = dyn_cast<VectorType>(Ty))
10169// Found the broadcasting of the single scalar, calculate the cost as 10171constauto *It =
find_if_not(VL, IsaPred<UndefValue>);
10172assert(It != VL.
end() &&
"Expected at least one non-undef value.");
10173// Add broadcast for non-identity shuffle only. 10175count(VL, *It) > 1 &&
10178if (isa<FixedVectorType>(ScalarTy)) {
10183 cast<FixedVectorType>(ScalarTy));
10186 CostKind, std::distance(VL.
begin(), It),
10192 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10199 VecTy, ShuffleMask, CostKind,
10200/*Index=*/0,
/*SubTp=*/nullptr,
10204 (
all_of(Gathers, IsaPred<UndefValue>)
10206 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
10210 /// Compute the cost of creating a vector containing the extracted values from 10214ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10215unsigned NumParts) {
10216assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
10218 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
10219 auto *EE = dyn_cast<ExtractElementInst>(V);
10222 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10225 return std::max(Sz, VecTy->getNumElements());
10227// FIXME: this must be moved to TTI for better estimation. 10231 -> std::optional<TTI::ShuffleKind> {
10232if (NumElts <= EltsPerVector)
10233return std::nullopt;
10235alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10237 if (I == PoisonMaskElem)
10239 return std::min(S, I);
10242int OffsetReg1 = OffsetReg0;
10244// Check that if trying to permute same single/2 input vectors. 10246int FirstRegId = -1;
10247 Indices.assign(1, OffsetReg0);
10251intIdx =
I - OffsetReg0;
10253 (
Idx / NumElts) * NumParts + (
Idx % NumElts) / EltsPerVector;
10255 FirstRegId = RegId;
10256 RegIndices.
insert(RegId);
10257if (RegIndices.
size() > 2)
10258return std::nullopt;
10259if (RegIndices.
size() == 2) {
10261if (Indices.
size() == 1) {
10264 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10266 if (I == PoisonMaskElem)
10268 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10269 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10270 if (RegId == FirstRegId)
10272 return std::min(S, I);
10275 Indices.push_back(OffsetReg1 % NumElts);
10277Idx =
I - OffsetReg1;
10279I = (
Idx % NumElts) % EltsPerVector +
10280 (RegId == FirstRegId ? 0 : EltsPerVector);
10286// Process extracts in blocks of EltsPerVector to check if the source vector 10287// operand can be re-used directly. If not, add the cost of creating a 10288// shuffle to extract the values into a vector register. 10289for (
unsigned Part : seq<unsigned>(NumParts)) {
10290if (!ShuffleKinds[Part])
10293 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
10297 std::optional<TTI::ShuffleKind> RegShuffleKind =
10298 CheckPerRegistersShuffle(SubMask, Indices);
10299if (!RegShuffleKind) {
10302 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
10315 *R.TTI, VL.
front()->getType(),
alignTo(NumElts, EltsPerVector));
10316for (
unsignedIdx : Indices) {
10317assert((
Idx + EltsPerVector) <= BaseVF &&
10318"SK_ExtractSubvector index out of range");
10323// Second attempt to check, if just a permute is better estimated than 10324// subvector extract. 10329if (OriginalCost <
Cost)
10330Cost = OriginalCost;
10334 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given 10335 /// mask \p Mask, register number \p Part, that includes \p SliceSize 10337void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
10339unsigned SliceSize) {
10340if (SameNodesEstimated) {
10341// Delay the cost estimation if the same nodes are reshuffling. 10342// If we already requested the cost of reshuffling of E1 and E2 before, no 10343// need to estimate another cost with the sub-Mask, instead include this 10344// sub-Mask into the CommonMask to estimate it later and avoid double cost 10346if ((InVectors.
size() == 2 &&
10347 cast<const TreeEntry *>(InVectors.
front()) == &E1 &&
10348 cast<const TreeEntry *>(InVectors.
back()) == E2) ||
10349 (!E2 && cast<const TreeEntry *>(InVectors.
front()) == &E1)) {
10350unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
10353"Expected all poisoned elements.");
10355copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
10358// Found non-matching nodes - need to estimate the cost for the matched 10359// and transform mask. 10360Cost += createShuffle(InVectors.
front(),
10361 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
10363 transformMaskAfterShuffle(CommonMask, CommonMask);
10364 }
elseif (InVectors.
size() == 2) {
10365Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10366 transformMaskAfterShuffle(CommonMask, CommonMask);
10368 SameNodesEstimated =
false;
10369if (!E2 && InVectors.
size() == 1) {
10370unsigned VF = E1.getVectorFactor();
10373 cast<FixedVectorType>(V1->
getType())->getNumElements());
10375constauto *E = cast<const TreeEntry *>(InVectors.
front());
10376 VF = std::max(VF, E->getVectorFactor());
10378for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10380 CommonMask[
Idx] = Mask[
Idx] + VF;
10381Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
10382 transformMaskAfterShuffle(CommonMask, CommonMask);
10384autoP = InVectors.
front();
10385Cost += createShuffle(&E1, E2, Mask);
10386unsigned VF = Mask.size();
10391constauto *E = cast<const TreeEntry *>(
P);
10392 VF = std::max(VF, E->getVectorFactor());
10394for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10396 CommonMask[
Idx] =
Idx + (InVectors.
empty() ? 0 : VF);
10397Cost += createShuffle(
P, InVectors.
front(), CommonMask);
10398 transformMaskAfterShuffle(CommonMask, CommonMask);
10402classShuffleCostBuilder {
10405staticbool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
10407return Mask.empty() ||
10408 (VF == Mask.size() &&
10416 ~ShuffleCostBuilder() =
default;
10419// Empty mask or identity mask are free. 10421 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10422if (isEmptyOrIdentity(Mask, VF))
10425 cast<VectorType>(V1->
getType()), Mask);
10428// Empty mask or identity mask are free. 10430 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10431if (isEmptyOrIdentity(Mask, VF))
10434 cast<VectorType>(V1->
getType()), Mask);
10440void resizeToMatch(
Value *&,
Value *&)
const{}
10443 /// Smart shuffle instruction emission, walks through shuffles trees and 10444 /// tries to find the best matching vector for the actual shuffle 10450 ShuffleCostBuilder Builder(
TTI);
10453unsigned CommonVF = Mask.size();
10455auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
10459Type *EScalarTy = E.Scalars.front()->getType();
10460bool IsSigned =
true;
10461if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10463 IsSigned = It->second.second;
10465if (EScalarTy != ScalarTy) {
10466unsigned CastOpcode = Instruction::Trunc;
10467unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10468unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10470 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10478if (isa<Constant>(V))
10480auto *VecTy = cast<VectorType>(V->getType());
10482if (EScalarTy != ScalarTy) {
10484unsigned CastOpcode = Instruction::Trunc;
10485unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10486unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10488 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10495if (!V1 && !V2 && !P2.
isNull()) {
10496// Shuffle 2 entry nodes. 10497const TreeEntry *E = cast<const TreeEntry *>(P1);
10498unsigned VF = E->getVectorFactor();
10499const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10500 CommonVF = std::max(VF, E2->getVectorFactor());
10503return Idx < 2 * static_cast<int>(CommonVF);
10505"All elements in mask must be less than 2 * CommonVF.");
10506if (E->Scalars.size() == E2->Scalars.size()) {
10510for (
int &
Idx : CommonMask) {
10513if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
10515elseif (
Idx >=
static_cast<int>(CommonVF))
10516Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
10520 CommonVF = E->Scalars.size();
10521 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10522 GetNodeMinBWAffectedCost(*E2, CommonVF);
10524 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10525 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10529 }
elseif (!V1 && P2.
isNull()) {
10530// Shuffle single entry node. 10531const TreeEntry *E = cast<const TreeEntry *>(P1);
10532unsigned VF = E->getVectorFactor();
10536 [=](
intIdx) {
return Idx < static_cast<int>(CommonVF); }) &&
10537"All elements in mask must be less than CommonVF.");
10538if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10540assert(!EMask.
empty() &&
"Expected non-empty common mask.");
10541for (
int &
Idx : CommonMask) {
10545 CommonVF = E->Scalars.size();
10546 }
elseif (
unsigned Factor = E->getInterleaveFactor();
10547 Factor > 0 && E->Scalars.size() != Mask.size() &&
10550// Deinterleaved nodes are free. 10551 std::iota(CommonMask.
begin(), CommonMask.
end(), 0);
10553 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10555// Not identity/broadcast? Try to see if the original vector is better. 10556if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10557 CommonVF == CommonMask.
size() &&
10559 [](
constauto &&
P) {
10561static_cast<unsigned>(
P.value()) !=
P.index();
10569 }
elseif (V1 && P2.
isNull()) {
10570// Shuffle single vector. 10571 ExtraCost += GetValueMinBWAffectedCost(V1);
10572 CommonVF = getVF(V1);
10575 [=](
intIdx) {
return Idx < static_cast<int>(CommonVF); }) &&
10576"All elements in mask must be less than CommonVF.");
10577 }
elseif (V1 && !V2) {
10578// Shuffle vector and tree node. 10579unsigned VF = getVF(V1);
10580const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10581 CommonVF = std::max(VF, E2->getVectorFactor());
10584return Idx < 2 * static_cast<int>(CommonVF);
10586"All elements in mask must be less than 2 * CommonVF.");
10587if (E2->Scalars.size() == VF && VF != CommonVF) {
10589assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
10590for (
int &
Idx : CommonMask) {
10593if (
Idx >=
static_cast<int>(CommonVF))
10594Idx = E2Mask[
Idx - CommonVF] + VF;
10598 ExtraCost += GetValueMinBWAffectedCost(V1);
10600 ExtraCost += GetNodeMinBWAffectedCost(
10601 *E2, std::min(CommonVF, E2->getVectorFactor()));
10603 }
elseif (!V1 && V2) {
10604// Shuffle vector and tree node. 10605unsigned VF = getVF(V2);
10606const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10607 CommonVF = std::max(VF, E1->getVectorFactor());
10610return Idx < 2 * static_cast<int>(CommonVF);
10612"All elements in mask must be less than 2 * CommonVF.");
10613if (E1->Scalars.size() == VF && VF != CommonVF) {
10615assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
10616for (
int &
Idx : CommonMask) {
10619if (
Idx >=
static_cast<int>(CommonVF))
10620Idx = E1Mask[
Idx - CommonVF] + VF;
10626 ExtraCost += GetNodeMinBWAffectedCost(
10627 *E1, std::min(CommonVF, E1->getVectorFactor()));
10629 ExtraCost += GetValueMinBWAffectedCost(V2);
10632assert(V1 && V2 &&
"Expected both vectors.");
10633unsigned VF = getVF(V1);
10634 CommonVF = std::max(VF, getVF(V2));
10637return Idx < 2 * static_cast<int>(CommonVF);
10639"All elements in mask must be less than 2 * CommonVF.");
10641 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10642if (V1->
getType() != V2->getType()) {
10646if (cast<VectorType>(V1->
getType())->getElementType() != ScalarTy)
10648if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10652 InVectors.
front() =
10654if (InVectors.
size() == 2)
10656return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10657 V1, V2, CommonMask, Builder, ScalarTy);
10664 : BaseShuffleAnalysis(ScalarTy),
TTI(
TTI),
10665 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10666 CheckedExtracts(CheckedExtracts) {}
10668ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10669unsigned NumParts,
bool &UseVecBaseAsInput) {
10670 UseVecBaseAsInput =
false;
10673Value *VecBase =
nullptr;
10675if (!E->ReorderIndices.empty()) {
10677 E->ReorderIndices.end());
10680// Check if it can be considered reused if same extractelements were 10681// vectorized already. 10682bool PrevNodeFound =
any_of(
10684 [&](
const std::unique_ptr<TreeEntry> &TE) {
10685 return ((TE->hasState() && !TE->isAltShuffle() &&
10686 TE->getOpcode() == Instruction::ExtractElement) ||
10688 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10689 return VL.size() > Data.index() &&
10690 (Mask[Data.index()] == PoisonMaskElem ||
10691 isa<UndefValue>(VL[Data.index()]) ||
10692 Data.value() == VL[Data.index()]);
10697for (
unsigned Part : seq<unsigned>(NumParts)) {
10699ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10702// Ignore non-extractelement scalars. 10703if (isa<UndefValue>(V) ||
10706// If all users of instruction are going to be vectorized and this 10707// instruction itself is not going to be vectorized, consider this 10708// instruction as dead and remove its cost from the final cost of the 10710// Also, avoid adjusting the cost for extractelements with multiple uses 10711// in different graph entries. 10712auto *EE = cast<ExtractElementInst>(V);
10713 VecBase = EE->getVectorOperand();
10714 UniqueBases.
insert(VecBase);
10715const TreeEntry *VE = R.getTreeEntry(V);
10716if (!CheckedExtracts.
insert(V).second ||
10717 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10720 return isa<GetElementPtrInst>(U) &&
10721 !R.areAllUsersVectorized(cast<Instruction>(U),
10729unsignedIdx = *EEIdx;
10730// Take credit for instruction that will become dead. 10731if (EE->hasOneUse() || !PrevNodeFound) {
10733if (isa<SExtInst, ZExtInst>(Ext) &&
10734all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10735// Use getExtractWithExtendCost() to calculate the cost of 10736// extractelement/ext pair. 10739 EE->getVectorOperandType(),
Idx);
10740// Add back the cost of s|zext which is subtracted separately. 10742 Ext->getOpcode(), Ext->getType(), EE->getType(),
10751// Check that gather of extractelements can be represented as just a 10752// shuffle of a single/two vectors the scalars are extracted from. 10753// Found the bunch of extractelement instructions that must be gathered 10754// into a vector and can be represented as a permutation elements in a 10755// single input vector or of 2 input vectors. 10756// Done for reused if same extractelements were vectorized already. 10758Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10761 transformMaskAfterShuffle(CommonMask, CommonMask);
10762 SameNodesEstimated =
false;
10763if (NumParts != 1 && UniqueBases.
size() != 1) {
10764 UseVecBaseAsInput =
true;
10770 /// Checks if the specified entry \p E needs to be delayed because of its 10771 /// dependency nodes. 10772 std::optional<InstructionCost>
10775// No need to delay the cost estimation during analysis. 10776return std::nullopt;
10782return Idx < static_cast<int>(E1.getVectorFactor());
10784"Expected single vector shuffle mask.");
10788if (InVectors.
empty()) {
10789 CommonMask.
assign(Mask.begin(), Mask.end());
10790 InVectors.
assign({&E1, &E2});
10793assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10796if (NumParts == 0 || NumParts >= Mask.size() ||
10797 MaskVecTy->getNumElements() % NumParts != 0 ||
10799 MaskVecTy->getNumElements() / NumParts))
10804unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10805 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10808if (InVectors.
empty()) {
10809 CommonMask.
assign(Mask.begin(), Mask.end());
10810 InVectors.
assign(1, &E1);
10813assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10816if (NumParts == 0 || NumParts >= Mask.size() ||
10817 MaskVecTy->getNumElements() % NumParts != 0 ||
10819 MaskVecTy->getNumElements() / NumParts))
10824unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10825 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
10826if (!SameNodesEstimated && InVectors.
size() == 1)
10829 /// Adds 2 input vectors and the mask for their shuffling. 10831// May come only for shuffling of 2 vectors with extractelements, already 10832// handled in adjustExtracts. 10838auto *EI = cast<ExtractElementInst>(
10839 cast<const TreeEntry *>(InVectors.
front())
10840 ->getOrdered(
P.index()));
10841return EI->getVectorOperand() == V1 ||
10842 EI->getVectorOperand() == V2;
10844"Expected extractelement vectors.");
10846 /// Adds another one input vector and the mask for the shuffling. 10848if (InVectors.
empty()) {
10850"Expected empty input mask/vectors.");
10851 CommonMask.
assign(Mask.begin(), Mask.end());
10852 InVectors.
assign(1, V1);
10856// No need to add vectors here, already handled them in adjustExtracts. 10857assert(InVectors.
size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10858 !CommonMask.
empty() &&
10861Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10862 ->getOrdered(
P.index());
10864returnP.value() == Mask[
P.index()] ||
10865 isa<UndefValue>(Scalar);
10866if (isa<Constant>(V1))
10868auto *EI = cast<ExtractElementInst>(Scalar);
10869return EI->getVectorOperand() == V1;
10871"Expected only tree entry for extractelement vectors.");
10875"Expected only tree entries from extracts/reused buildvectors.");
10876unsigned VF = getVF(V1);
10877if (InVectors.
size() == 2) {
10878Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10879 transformMaskAfterShuffle(CommonMask, CommonMask);
10880 VF = std::max<unsigned>(VF, CommonMask.
size());
10881 }
elseif (
constauto *InTE =
10882 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
10883 VF = std::max(VF, InTE->getVectorFactor());
10886 VF, cast<FixedVectorType>(cast<Value *>(InVectors.
front())->getType())
10887 ->getNumElements());
10890for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10892 CommonMask[
Idx] = Mask[
Idx] + VF;
10895Value *Root =
nullptr) {
10896Cost += getBuildVectorCost(VL, Root);
10898// FIXME: Need to find a way to avoid use of getNullValue here. 10900unsigned VF = VL.
size();
10902 VF = std::min(VF, MaskVF);
10904if (isa<UndefValue>(V)) {
10910if (
auto *VecTy = dyn_cast<FixedVectorType>(Vals.
front()->getType())) {
10912// When REVEC is enabled, we need to expand vector types into scalar 10917Type *ScalarTy = V->getType()->getScalarType();
10919if (isa<PoisonValue>(V))
10921elseif (isa<UndefValue>(V))
10925 std::fill_n(NewVals.
begin() +
I * VecTyNumElements, VecTyNumElements,
10928 Vals.
swap(NewVals);
10934 cast<FixedVectorType>(Root->
getType())->getNumElements()),
10938 /// Finalize emission of the shuffles. 10941ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10947if (InVectors.
size() == 2)
10948Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10950Cost += createShuffle(Vec,
nullptr, CommonMask);
10951 transformMaskAfterShuffle(CommonMask, CommonMask);
10953"Expected vector length for the final value before action.");
10954Value *V = cast<Value *>(Vec);
10955 Action(V, CommonMask);
10956 InVectors.
front() = V;
10958if (!SubVectors.empty()) {
10960if (InVectors.
size() == 2)
10961Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10963Cost += createShuffle(Vec,
nullptr, CommonMask);
10964 transformMaskAfterShuffle(CommonMask, CommonMask);
10965// Add subvectors permutation cost. 10966if (!SubVectorsMask.
empty()) {
10968"Expected same size of masks for subvectors and common mask.");
10970copy(SubVectorsMask, SVMask.begin());
10971for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
10974 I1 = I2 + CommonMask.
size();
10981for (
auto [E,
Idx] : SubVectors) {
10982Type *EScalarTy = E->Scalars.front()->getType();
10983bool IsSigned =
true;
10984if (
auto It =
R.MinBWs.find(E); It !=
R.MinBWs.end()) {
10987 IsSigned = It->second.second;
10989if (ScalarTy != EScalarTy) {
10990unsigned CastOpcode = Instruction::Trunc;
10991unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
10992unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
10994 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
11004if (!CommonMask.
empty()) {
11005 std::iota(std::next(CommonMask.
begin(),
Idx),
11006 std::next(CommonMask.
begin(),
Idx + E->getVectorFactor()),
11012if (!ExtMask.
empty()) {
11013if (CommonMask.
empty()) {
11017for (
intI = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
11020 NewMask[
I] = CommonMask[ExtMask[
I]];
11022 CommonMask.
swap(NewMask);
11025if (CommonMask.
empty()) {
11026assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
11030 createShuffle(InVectors.
front(),
11031 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
11037"Shuffle construction must be finalized.");
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
    return VE;
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}
TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}

/// Builds the arguments types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
static SmallVector<Type *>
buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
                       const unsigned VF, unsigned MinBW,
unsigned MinBW,
11106// If we have computed a smaller type for the expression, update VecTy so 11107// that the costs will be accurate. 11108auto It = MinBWs.
find(E);
11109Type *OrigScalarTy = ScalarTy;
11110if (It != MinBWs.
end()) {
11111auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11117unsigned EntryVF = E->getVectorFactor();
11120if (E->isGather()) {
11123if (isa<InsertElementInst>(VL[0]))
11125if (isa<CmpInst>(VL.
front()))
11126 ScalarTy = VL.
front()->getType();
11127return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11128 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
11132if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11135if (E->getOpcode() == Instruction::Store) {
11136// For stores the order is actually a mask. 11137 NewMask.
resize(E->ReorderIndices.size());
11138copy(E->ReorderIndices, NewMask.
begin());
11144if (!E->ReuseShuffleIndices.empty())
11145::addMask(Mask, E->ReuseShuffleIndices);
11149assert((E->State == TreeEntry::Vectorize ||
11150 E->State == TreeEntry::ScatterVectorize ||
11151 E->State == TreeEntry::StridedVectorize) &&
11155 (E->getOpcode() == Instruction::GetElementPtr &&
11156 E->getMainOp()->getType()->isPointerTy())) &&
11159unsigned ShuffleOrOp =
11160 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
11161if (E->CombinedOp != TreeEntry::NotCombinedOp)
11162 ShuffleOrOp = E->CombinedOp;
11164constunsigned Sz = UniqueValues.
size();
11166for (
unsignedI = 0;
I < Sz; ++
I) {
11167if (isa<Instruction>(UniqueValues[
I]) && getTreeEntry(UniqueValues[
I]) == E)
11169 UsedScalars.set(
I);
11171auto GetCastContextHint = [&](
Value *
V) {
11172if (
const TreeEntry *OpTE = getTreeEntry(V))
11173return getCastContextHint(*OpTE);
11174 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
11175if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11176 !SrcState.isAltShuffle())
11183// Calculate the cost of this instruction. 11185if (isa<CastInst, CallInst>(VL0)) {
11186// For some of the instructions no need to calculate cost for each 11187// particular instruction, we can use the cost of the single 11188// instruction x total number of scalar instructions. 11189 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11191for (
unsignedI = 0;
I < Sz; ++
I) {
11192if (UsedScalars.test(
I))
11194 ScalarCost += ScalarEltCost(
I);
11199// Check if the current node must be resized, if the parent node is not 11203 (E->getOpcode() != Instruction::Load ||
11204 !E->UserTreeIndices.empty())) {
11205const EdgeInfo &EI =
11206 *
find_if(E->UserTreeIndices, [](
const EdgeInfo &EI) {
11207 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11209if (EI.UserTE->getOpcode() != Instruction::Select ||
11211auto UserBWIt = MinBWs.
find(EI.UserTE);
11212Type *UserScalarTy =
11213 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11214if (UserBWIt != MinBWs.
end())
11216 UserBWIt->second.first);
11217if (ScalarTy != UserScalarTy) {
11218unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
11219unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
11221auto *UserVecTy =
getWidenedType(UserScalarTy, E->Scalars.size());
11223 VecOpcode = Instruction::Trunc;
11226 It->second.second ? Instruction::SExt : Instruction::ZExt;
11233LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11234 ScalarCost,
"Calculated costs for Tree"));
11235return VecCost - ScalarCost;
11237// Calculate cost difference from vectorizing set of GEPs. 11238// Negative value means vectorizing is profitable. 11240assert((E->State == TreeEntry::Vectorize ||
11241 E->State == TreeEntry::StridedVectorize) &&
11242"Entry state expected to be Vectorize or StridedVectorize here.");
11246 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
11247LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11248"Calculated GEPs cost for Tree"));
11250return VecCost - ScalarCost;
11257Type *CanonicalType = Ty;
11264 {CanonicalType, CanonicalType});
11267// If the selects are the only uses of the compares, they will be 11268// dead and we can adjust the cost by removing their cost. 11269if (VI && SelectOnly) {
11271"Expected only for scalar type.");
11272auto *CI = cast<CmpInst>(
VI->getOperand(0));
11274 CI->
getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11275CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11276 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11278return IntrinsicCost;
11280switch (ShuffleOrOp) {
11281case Instruction::PHI: {
11282// Count reused scalars. 11285for (
Value *V : UniqueValues) {
11286auto *
PHI = dyn_cast<PHINode>(V);
11291for (
unsignedI = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
11295if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
11297if (!OpTE->ReuseShuffleIndices.empty())
11298 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11299 OpTE->Scalars.size());
11302return CommonCost - ScalarCost;
    case Instruction::ExtractValue:
    case Instruction::ExtractElement: {
11306auto GetScalarCost = [&](
unsignedIdx) {
11307if (isa<PoisonValue>(UniqueValues[
Idx]))
11310auto *
I = cast<Instruction>(UniqueValues[
Idx]);
11312if (ShuffleOrOp == Instruction::ExtractElement) {
11313auto *EE = cast<ExtractElementInst>(
I);
11314 SrcVecTy = EE->getVectorOperandType();
11316auto *EV = cast<ExtractValueInst>(
I);
11317Type *AggregateTy = EV->getAggregateOperand()->getType();
11319if (
auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11320 NumElts = ATy->getNumElements();
11325if (
I->hasOneUse()) {
11327if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11328all_of(
Ext->users(), IsaPred<GetElementPtrInst>)) {
11329// Use getExtractWithExtendCost() to calculate the cost of 11330// extractelement/ext pair. 11333// Subtract the cost of s|zext which is subtracted separately. 11335Ext->getOpcode(),
Ext->getType(),
I->getType(),
11343auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
11344return GetCostDiff(GetScalarCost, GetVectorCost);
11346case Instruction::InsertElement: {
11347assert(E->ReuseShuffleIndices.empty() &&
11348"Unique insertelements only are expected.");
11349auto *SrcVecTy = cast<FixedVectorType>(VL0->
getType());
11350unsignedconst NumElts = SrcVecTy->getNumElements();
11351unsignedconst NumScalars = VL.
size();
11357unsigned OffsetEnd = OffsetBeg;
11358 InsertMask[OffsetBeg] = 0;
11363elseif (OffsetEnd <
Idx)
11365 InsertMask[
Idx] =
I + 1;
11368if (NumOfParts > 0 && NumOfParts < NumElts)
11369 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11370unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11372unsignedOffset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11373unsigned InsertVecSz = std::min<unsigned>(
11375 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11376bool IsWholeSubvector =
11377 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11378// Check if we can safely insert a subvector. If it is not possible, just 11379// generate a whole-sized vector and shuffle the source vector and the new 11381if (OffsetBeg + InsertVecSz > VecSz) {
11382// Align OffsetBeg to generate correct mask. 11384 InsertVecSz = VecSz;
11388// TODO: Add support for Instruction::InsertValue. 11390if (!E->ReorderIndices.empty()) {
11395 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
11397bool IsIdentity =
true;
11399Mask.swap(PrevMask);
11400for (
unsignedI = 0;
I < NumScalars; ++
I) {
11402 DemandedElts.
setBit(InsertIdx);
11403 IsIdentity &= InsertIdx - OffsetBeg ==
I;
11404Mask[InsertIdx - OffsetBeg] =
I;
11406assert(
Offset < NumElts &&
"Failed to find vector index offset");
11410/*Insert*/true,
/*Extract*/false,
11413// First cost - resize to actual vector size if not identity shuffle or 11414// need to shift the vector. 11415// Do not calculate the cost if the actual size is the register size and 11416// we can merge this shuffle with the following SK_Select. 11420 InsertVecTy, Mask);
11421auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
11422 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11424// Second cost - permutation with subvector, if some elements are from the 11425// initial vector or inserting a subvector. 11426// TODO: Implement the analysis of the FirstInsert->getOperand(0) 11427// subvector of ActualVecTy. 11430buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11431if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
11432if (InsertVecSz != VecSz) {
11443for (
unsignedI = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
      unsigned Opcode = ShuffleOrOp;
      unsigned VecOpcode = Opcode;
          (SrcIt != MinBWs.end() || It != MinBWs.end())) {
        // Check if the values are candidates to demote.
        unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
        if (BWSz == SrcBWSz) {
          VecOpcode = Instruction::BitCast;
        } else if (BWSz < SrcBWSz) {
          VecOpcode = Instruction::Trunc;
        } else if (It != MinBWs.end()) {
          assert(BWSz > SrcBWSz && "Invalid cast!");
          VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
        } else if (SrcIt != MinBWs.end()) {
          assert(BWSz > SrcBWSz && "Invalid cast!");
          VecOpcode =
              SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
        } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
                   !SrcIt->second.second) {
          VecOpcode = Instruction::UIToFP;
        assert(Idx == 0 && "Expected 0 index only");
      // Do not count cost here if minimum bitwidth is in effect and it is just
      // a bitcast (here it is just a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
        bool IsArithmeticExtendedReduction =
            E->Idx == 0 && UserIgnoreList &&
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
        if (IsArithmeticExtendedReduction &&
            (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
            VecOpcode == Opcode ? VI : nullptr);
      return GetCostDiff(GetScalarCost, GetVectorCost);
11530case Instruction::FCmp:
11531case Instruction::ICmp:
11532case Instruction::Select: {
11536match(VL0, MatchCmp))
11542auto GetScalarCost = [&](
unsignedIdx) {
11543if (isa<PoisonValue>(UniqueValues[
Idx]))
11546auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
11552 !
match(VI, MatchCmp)) ||
11560 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11562 getOperandInfo(
VI->getOperand(1)), VI);
11565 ScalarCost = IntrinsicCost;
11574CostKind, getOperandInfo(E->getOperand(0)),
11575 getOperandInfo(E->getOperand(1)), VL0);
11576if (
auto *SI = dyn_cast<SelectInst>(VL0)) {
11579unsigned CondNumElements = CondType->getNumElements();
11581assert(VecTyNumElements >= CondNumElements &&
11582 VecTyNumElements % CondNumElements == 0 &&
11583"Cannot vectorize Instruction::Select");
11584if (CondNumElements != VecTyNumElements) {
11585// When the return type is i1 but the source is fixed vector type, we 11586// need to duplicate the condition value. 11593return VecCost + CommonCost;
11595return GetCostDiff(GetScalarCost, GetVectorCost);
11597case TreeEntry::MinMax: {
11598auto GetScalarCost = [&](
unsignedIdx) {
11599return GetMinMaxCost(OrigScalarTy);
11603return VecCost + CommonCost;
11605return GetCostDiff(GetScalarCost, GetVectorCost);
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      auto GetScalarCost = [&](unsigned Idx) {
        if (isa<PoisonValue>(UniqueValues[Idx]))
        auto *VI = cast<Instruction>(UniqueValues[Idx]);
        unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
        if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
          for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
        unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
                                       Op2Info, {}, nullptr, TLI) +
      return GetCostDiff(GetScalarCost, GetVectorCost);
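      // Illustrative sketch (not taken from the source): the ConstantInt check
      // above treats an 'and' with a low-bit mask as free once the lanes are
      // being demoted to a narrower type. For example, assuming a node demoted
      // to i8 (MinBWs width 8):
      //
      //   %m = and i32 %x, 255   ; countr_one(255) = 8 >= 8, so after
      //                          ; truncation to i8 the 'and' is a no-op.
      //
      // A mask such as 127 (countr_one = 7 < 8) would not qualify.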
    case Instruction::GetElementPtr: {
      return CommonCost + GetGEPCostDiff(VL, VL0);
11662case Instruction::Load: {
11663auto GetScalarCost = [&](
unsignedIdx) {
11664auto *
VI = cast<LoadInst>(UniqueValues[
Idx]);
11666VI->getAlign(),
VI->getPointerAddressSpace(),
11669auto *LI0 = cast<LoadInst>(VL0);
11673case TreeEntry::Vectorize:
11674if (
unsigned Factor = E->getInterleaveFactor()) {
11676 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11677 LI0->getPointerAddressSpace(),
CostKind);
11681 Instruction::Load, VecTy, LI0->getAlign(),
11685case TreeEntry::StridedVectorize: {
11686Align CommonAlignment =
11687 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11689 Instruction::Load, VecTy, LI0->getPointerOperand(),
11690/*VariableMask=*/false, CommonAlignment,
CostKind);
11693case TreeEntry::ScatterVectorize: {
11694Align CommonAlignment =
11695 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11697 Instruction::Load, VecTy, LI0->getPointerOperand(),
11698/*VariableMask=*/false, CommonAlignment,
CostKind);
11701case TreeEntry::CombinedVectorize:
11702case TreeEntry::NeedToGather:
11705return VecLdCost + CommonCost;
11709// If this node generates masked gather load then it is not a terminal node. 11710// Hence address operand cost is estimated separately. 11711if (E->State == TreeEntry::ScatterVectorize)
11714// Estimate cost of GEPs since this tree node is a terminator. 11717 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
11718returnCost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11720case Instruction::Store: {
11721bool IsReorder = !E->ReorderIndices.empty();
11722auto GetScalarCost = [=](
unsignedIdx) {
11723auto *
VI = cast<StoreInst>(VL[
Idx]);
11726VI->getAlign(),
VI->getPointerAddressSpace(),
11730 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11732// We know that we can merge the stores. Calculate the cost. 11734if (E->State == TreeEntry::StridedVectorize) {
11735Align CommonAlignment =
11736 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11738 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11739/*VariableMask=*/false, CommonAlignment,
CostKind);
11741assert(E->State == TreeEntry::Vectorize &&
11742"Expected either strided or consecutive stores.");
11743if (
unsigned Factor = E->getInterleaveFactor()) {
11744assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11745"No reused shuffles expected");
11748 Instruction::Store, VecTy, Factor, std::nullopt,
11749 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(),
CostKind);
11753 Instruction::Store, VecTy, BaseSI->getAlign(),
11754 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
11757return VecStCost + CommonCost;
11761unsignedIdx = IsReorder ? E->ReorderIndices[
I] :
I;
11762 PointerOps[
Idx] = cast<StoreInst>(V)->getPointerOperand();
11765return GetCostDiff(GetScalarCost, GetVectorCost) +
11766 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11768case Instruction::Call: {
11769auto GetScalarCost = [&](
unsignedIdx) {
11770auto *CI = cast<CallInst>(UniqueValues[
Idx]);
11781auto *CI = cast<CallInst>(VL0);
11785 It != MinBWs.
end() ? It->second.first : 0,
TTI);
11787return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11789return GetCostDiff(GetScalarCost, GetVectorCost);
11791case Instruction::ShuffleVector: {
11792if (!
SLPReVec || E->isAltShuffle())
11793assert(E->isAltShuffle() &&
11798 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11799"Invalid Shuffle Vector Operand");
11800// Try to find the previous shuffle node with the same operands and same 11801// main/alternate ops. 11802auto TryFindNodeWithEqualOperands = [=]() {
11803for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11806if (
TE->hasState() &&
TE->isAltShuffle() &&
11807 ((
TE->getOpcode() == E->getOpcode() &&
11808TE->getAltOpcode() == E->getAltOpcode()) ||
11809 (
TE->getOpcode() == E->getAltOpcode() &&
11810TE->getAltOpcode() == E->getOpcode())) &&
11811TE->hasEqualOperands(*E))
11816auto GetScalarCost = [&](
unsignedIdx) {
11817if (isa<PoisonValue>(UniqueValues[
Idx]))
11820auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
11821assert(E->isOpcodeOrAlt(VI) &&
"Unexpected main/alternate opcode");
11825// Need to clear CommonCost since the final shuffle cost is included into 11828// VecCost is equal to sum of the cost of creating 2 vectors 11829// and the cost of creating shuffle. 11831if (TryFindNodeWithEqualOperands()) {
11833dbgs() <<
"SLP: diamond match for alternate node found.\n";
11836// No need to add new vector costs here since we're going to reuse 11837// same main/alternate vector ops, just do different shuffling. 11840 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy,
CostKind);
11842 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
11843 }
elseif (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11845 VecCost = TTIRef.getCmpSelInstrCost(
11846 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(),
CostKind,
11847 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11849 VecCost += TTIRef.getCmpSelInstrCost(
11850 E->getOpcode(), VecTy, MaskTy,
11851 cast<CmpInst>(E->getAltOp())->getPredicate(),
CostKind,
11852 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11855Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11858auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
11859unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
11861DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11862if (SrcIt != MinBWs.
end()) {
11863 SrcBWSz = SrcIt->second.first;
11867if (BWSz <= SrcBWSz) {
11870 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11874 <<
"SLP: alternate extension, which should be truncated.\n";
11880 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11883 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11887 E->buildAltOpShuffleMask(
11889assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
        // Patterns like [fadd,fsub] can be combined into a single instruction
        // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
        // need to take into account their order when looking for the most used
        // order.
        unsigned Opcode0 = E->getOpcode();
        unsigned Opcode1 = E->getAltOpcode();
        // If this pattern is supported by the target then we consider the
        // order.
        if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
              VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
          return AltVecCost < VecCost ? AltVecCost : VecCost;
        // TODO: Check the reverse order too.
      if (SLPReVec && !E->isAltShuffle())
11916// If a group uses mask in order, the shufflevector can be 11917// eliminated by instcombine. Then the cost is 0. 11919"Not supported shufflevector usage.");
11920auto *SV = cast<ShuffleVectorInst>(VL.
front());
11921unsigned SVNumElements =
11922 cast<FixedVectorType>(SV->getOperand(0)->getType())
11923 ->getNumElements();
11924unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11929assert(isa<ShuffleVectorInst>(V) &&
11930"Not supported shufflevector usage.");
11931auto *SV = cast<ShuffleVectorInst>(V);
11933 [[maybe_unused]]
bool IsExtractSubvectorMask =
11934 SV->isExtractSubvectorMask(Index);
11935assert(IsExtractSubvectorMask &&
11936"Not supported shufflevector usage.");
11937if (NextIndex != Index)
11939 NextIndex += SV->getShuffleMask().size();
11942 return ::getShuffleCost(
11948return GetCostDiff(GetScalarCost, GetVectorCost);
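      // Illustrative sketch (not from the source): an alternate-opcode node
      // interleaves two opcodes across lanes and is emitted as two vector ops
      // plus one blend. For scalars a0+b0, a1-b1, a2+b2, a3-b3 (assuming fadd
      // as the main op and fsub as the alternate op):
      //
      //   %add = fadd <4 x float> %a, %b
      //   %sub = fsub <4 x float> %a, %b
      //   %res = shufflevector <4 x float> %add, <4 x float> %sub,
      //                        <4 x i32> <i32 0, i32 5, i32 2, i32 7>
      //
      // This is the shape whose cost the VecCost/AltVecCost computation above
      // approximates; buildAltOpShuffleMask produces the blend mask.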
    case Instruction::Freeze:

bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
                    << VectorizableTree.size() << " is fully vectorizable.\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    return TE->isGather() &&
                   [this](Value *V) { return EphValues.contains(V); }) &&
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       (AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))

  if (VectorizableTree.size() != 2)

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather nodes if they have fewer scalar operands than the
  // initial tree element (may be profitable to shuffle the second gather) or
  // they are extractelements, which form shuffle.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
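// Illustrative sketch (not from the source): a typical "tiny" tree of height 2
// that the check above accepts is a vectorizable store node whose single
// operand node gathers the stored values, e.g.:
//
//   store i32 %x, ptr %p
//   store i32 %y, ptr %q      ; with %q contiguous after %p
//
// Here VectorizableTree[0] is the 2-wide store bundle and VectorizableTree[1]
// holds its operands; if that second node is a cheap gather (splat, constants,
// loads or extractelements), the tree is still considered fully vectorizable.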
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the
  // path through operand 0 of any 'or'. Also, peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
                            ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)

  // Check if the input is an extended load of the required or/shift expression.
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||

  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();

  // Everything matched - assume that we can fold the whole sequence using
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
                                    /* MatchOr */ false);
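// Illustrative sketch (not from the source): the or/shl/zext-of-load pattern
// that this load-combine check looks for, in LLVM IR:
//
//   %b0 = zext i8 %l0 to i32
//   %b1 = zext i8 %l1 to i32
//   %s1 = shl i32 %b1, 8          ; shift amount is a multiple of 8
//   %or = or i32 %b0, %s1         ; ... continued for the remaining bytes
//
// The backend is expected to fold such a chain into a single wide load, so the
// SLP cost model conservatively assumes load combining will happen and skips
// vectorizing the tree rooted at this reduction.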
12061// Peek through a final sequence of stores and check if all operations are 12062// likely to be load-combined. 12063unsigned NumElts = Stores.
size();
12064for (
Value *Scalar : Stores) {
  // Graph is empty - do nothing.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");

  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for vectorization; we can skip it if the cost threshold is the
  // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
  // gathers/buildvectors.
  constexpr int Limit = 4;
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
12119// Check if any of the gather node forms an insertelement buildvector 12121bool IsAllowedSingleBVNode =
12122 VectorizableTree.size() > 1 ||
12123 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
12124 !VectorizableTree.front()->isAltShuffle() &&
12125 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12126 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12128if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
12129return TE->isGather() &&
all_of(TE->Scalars, [&](
Value *V) {
12130 return isa<ExtractElementInst, UndefValue>(V) ||
12131 (IsAllowedSingleBVNode &&
12132 !V->hasNUsesOrMore(UsesLimit) &&
12133 any_of(V->users(), IsaPred<InsertElementInst>));
12138if (VectorizableTree.back()->isGather() &&
12139 VectorizableTree.back()->hasState() &&
12140 VectorizableTree.back()->isAltShuffle() &&
12141 VectorizableTree.back()->getVectorFactor() > 2 &&
12143 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12145getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12146 VectorizableTree.back()->getVectorFactor()),
12148/*Insert=*/true,
/*Extract=*/false,
12152// Otherwise, we can't vectorize the tree. It is both tiny and not fully 12159constexprunsigned SmallTree = 3;
12160if (VectorizableTree.front()->isNonPowOf2Vec() &&
12163 [](
const std::unique_ptr<TreeEntry> &TE) {
12164return TE->isGather() && TE->hasState() &&
12165 TE->getOpcode() == Instruction::Load &&
12173 TreeEntry &E = *VectorizableTree[
Idx];
12176if (E.hasState() && E.getOpcode() != Instruction::Load)
12186// Walk from the bottom of the tree to the top, tracking which values are 12187// live. When we see a call instruction that is not part of our tree, 12188// query TTI to see if there is a cost to keeping values live over it 12189// (for example, if spills and fills are required). 12190unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12196// The entries in VectorizableTree are not necessarily ordered by their 12197// position in basic blocks. Collect them and order them by dominance so later 12198// instructions are guaranteed to be visited first. For instructions in 12199// different basic blocks, we only scan to the beginning of the block, so 12200// their order does not matter, as long as all instructions in a basic block 12201// are grouped together. Using dominance ensures a deterministic order. 12203for (
constauto &TEPtr : VectorizableTree) {
12204if (TEPtr->State != TreeEntry::Vectorize)
12206Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12212auto *NodeA = DT->
getNode(
A->getParent());
12213auto *NodeB = DT->
getNode(
B->getParent());
12214assert(NodeA &&
"Should only process reachable instructions");
12215assert(NodeB &&
"Should only process reachable instructions");
12216assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12217"Different nodes should have different DFS numbers");
12219return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12220returnB->comesBefore(
A);
12229// Update LiveValues. 12230 LiveValues.
erase(PrevInst);
12231for (
auto &J : PrevInst->
operands()) {
12232if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12233 LiveValues.
insert(cast<Instruction>(&*J));
12237dbgs() <<
"SLP: #LV: " << LiveValues.
size();
12238for (
auto *
X : LiveValues)
12239dbgs() <<
" " <<
X->getName();
12240dbgs() <<
", Looking at ";
12244// Now find the sequence of instructions between PrevInst and Inst. 12245unsigned NumCalls = 0;
12249while (InstIt != PrevInstIt) {
12250if (PrevInstIt == PrevInst->
getParent()->rend()) {
12251 PrevInstIt = Inst->getParent()->rbegin();
12256if (
auto *
II = dyn_cast<IntrinsicInst>(
I)) {
12257if (
II->isAssumeLikeIntrinsic())
12265if (IntrCost < CallCost)
12271// Debug information does not impact spill cost. 12272if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12273 &*PrevInstIt != PrevInst)
12281for (
auto *
II : LiveValues) {
12282auto *ScalarTy =
II->getType();
12283if (
auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12284 ScalarTy = VectorTy->getElementType();
12296/// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the 12297/// buildvector sequence. 12302constauto *I1 = IE1;
12303constauto *I2 = IE2;
12315if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12317 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12318if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
12320 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12321 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12326/// Returns incoming Value *, if the requested type is Value * too, or a default 12327/// value, otherwise. 12329template <
typename U>
12330static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
12333template <
typename U>
12334static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps:
/// 1. If the Base vector is not an undef vector, resize the very first mask to
/// have a common VF and perform the action for 2 input vectors (including the
/// non-undef Base). Other shuffle masks are combined with the result of the
/// first stage and processed as a shuffle of 2 elements.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combining the masks properly between the steps.
template <typename T>
12358assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
12360auto VMIt = std::next(ShuffleMask.begin());
12363buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12365if (!IsBaseUndef.
all()) {
12366// Base is not undef, need to combine it with the next subvectors. 12367 std::pair<T *, bool> Res =
12368 ResizeAction(ShuffleMask.begin()->first, Mask,
/*ForSingleMask=*/false);
12370for (
unsignedIdx = 0, VF = Mask.size();
Idx < VF; ++
Idx) {
12374 Mask[
Idx] = (Res.second ?
Idx : Mask[
Idx]) + VF;
12376 [[maybe_unused]]
auto *V = ValueSelect::get<T *>(
Base);
12377assert((!V || GetVF(V) == Mask.size()) &&
12378"Expected base vector of VF number of elements.");
12379 Prev = Action(Mask, {
nullptr, Res.first});
12380 }
elseif (ShuffleMask.size() == 1) {
12381// Base is undef and only 1 vector is shuffled - perform the action only for 12382// single vector, if the mask is not the identity mask. 12383 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12384/*ForSingleMask=*/true);
12386// Identity mask is found. 12389 Prev = Action(Mask, {ShuffleMask.begin()->first});
12391// Base is undef and at least 2 input vectors shuffled - perform 2 vectors 12392// shuffles step by step, combining shuffle between the steps. 12393unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12394unsigned Vec2VF = GetVF(VMIt->first);
12395if (Vec1VF == Vec2VF) {
12396// No need to resize the input vectors since they are of the same size, we 12397// can shuffle them directly. 12399for (
unsignedI = 0, VF = Mask.size();
I < VF; ++
I) {
12402 Mask[
I] = SecMask[
I] + Vec1VF;
12405 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12407// Vectors of different sizes - resize and reshuffle. 12408 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12409/*ForSingleMask=*/false);
12410 std::pair<T *, bool> Res2 =
12411 ResizeAction(VMIt->first, VMIt->second,
/*ForSingleMask=*/false);
12413for (
unsignedI = 0, VF = Mask.size();
I < VF; ++
I) {
12420 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
12423 Prev = Action(Mask, {Res1.first, Res2.first});
12425 VMIt = std::next(VMIt);
12427 [[maybe_unused]]
bool IsBaseNotUndef = !IsBaseUndef.
all();
12428// Perform requested actions for the remaining masks/vectors. 12429for (
auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12430// Shuffle other input vectors, if any. 12431 std::pair<T *, bool> Res =
12432 ResizeAction(VMIt->first, VMIt->second,
/*ForSingleMask=*/false);
12434for (
unsignedI = 0, VF = Mask.size();
I < VF; ++
I) {
12437"Multiple uses of scalars.");
12438 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
12443 Prev = Action(Mask, {Prev, Res.first});
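// Illustrative sketch (not from the source): how two shuffle masks are merged
// in the two-vector step above. With Mask selecting from the first vector and
// SecMask from the second (both of VF 4), lanes taken from the second vector
// are rebased by VF before the combined Action is invoked:
//
//   Mask    = {0, 1, poison, poison}
//   SecMask = {poison, poison, 2, 3}
//   merged  = {0, 1, 2 + VF, 3 + VF}   // i.e. {0, 1, 6, 7}
//
// which is exactly the operand layout a two-source shufflevector expects.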
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  /// The parent vectors and shuffle mask for the given list of inserts.

                    << VectorizableTree.size() << ".\n");
  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12467for (
unsignedI = 0, E = VectorizableTree.size();
I < E; ++
I) {
12468 TreeEntry &TE = *VectorizableTree[
I];
12469// No need to count the cost for combined entries, they are combined and 12470// just skip their cost. 12471if (TE.State == TreeEntry::CombinedVectorize) {
12473dbgs() <<
"SLP: Skipping cost for combined node that starts with " 12474 << *TE.Scalars[0] <<
".\n";
12475 TE.dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12478if (TE.isGather() && TE.hasState()) {
12479if (
const TreeEntry *E = getTreeEntry(TE.getMainOp());
12480 E && E->getVectorFactor() == TE.getVectorFactor() &&
12481 E->isSame(TE.Scalars)) {
12482// Some gather nodes might be absolutely the same as some vectorizable 12483// nodes after reordering, need to handle it. 12486 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12491// Exclude cost of gather loads nodes which are not used. These nodes were 12492// built as part of the final attempt to vectorize gathered loads. 12493assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12494"Expected gather nodes with users only.");
12500 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12509 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12512// Keep track {Scalar, Index, User} tuple. 12513// On AArch64, this helps in fusing a mov instruction, associated with 12514// extractelement, with fmul in the backend so that extractelement is free. 12516for (ExternalUser &EU : ExternalUses) {
12517 ScalarUserAndIdx.
emplace_back(EU.Scalar, EU.User, EU.Lane);
12519for (ExternalUser &EU : ExternalUses) {
12520// Uses by ephemeral values are free (because the ephemeral value will be 12521// removed prior to code generation, and so the extraction will be 12522// removed as well). 12523if (EphValues.
count(EU.User))
12526// Used in unreachable blocks or in EH pads (rarely executed) or is 12527// terminated with unreachable instruction. 12529 EU.User ? cast<Instruction>(EU.User)->
getParent() :
nullptr;
12532 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12535// We only add extract cost once for the same scalar. 12536if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12537 !ExtractCostCalculated.
insert(EU.Scalar).second)
12540// No extract cost for vector "scalar" 12541if (isa<FixedVectorType>(EU.Scalar->getType()))
12544// If found user is an insertelement, do not calculate extract cost but try 12545// to detect it as a final shuffled/identity match. 12546if (
auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12548if (
auto *FTy = dyn_cast<FixedVectorType>(VU->
getType())) {
12549if (!UsedInserts.
insert(VU).second)
12553const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12556 [
this, VU](
const ShuffledInsertData<const TreeEntry *> &
Data) {
12557// Checks if 2 insertelements are from the same buildvector. 12561Value *Op0 =
II->getOperand(0);
12562if (getTreeEntry(
II) && !getTreeEntry(Op0))
12568if (It == ShuffledInserts.
end()) {
12570Data.InsertElements.emplace_back(VU);
12572 VecId = ShuffledInserts.
size() - 1;
12573auto It = MinBWs.
find(ScalarTE);
12574if (It != MinBWs.
end() &&
12576 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
12578unsigned BWSz = It->second.first;
12579unsigned DstBWSz =
DL->getTypeSizeInBits(FTy->getElementType());
12582 VecOpcode = Instruction::Trunc;
12585 It->second.second ? Instruction::SExt : Instruction::ZExt;
12590 FTy->getNumElements()),
12593 <<
" for extending externally used vector with " 12594"non-equal minimum bitwidth.\n");
12599 It->InsertElements.front() = VU;
12600 VecId = std::distance(ShuffledInserts.
begin(), It);
12602int InIdx = *InsertIdx;
12604 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12607 Mask[InIdx] = EU.Lane;
12608 DemandedElts[VecId].setBit(InIdx);
12615// If we plan to rewrite the tree in a smaller type, we will need to sign 12616// extend the extracted value back to the original type. Here, we account 12617// for the extract and the added cost of the sign extend if needed. 12620const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12621auto It = MinBWs.
find(Entry);
12622if (It != MinBWs.
end()) {
12625 ? Instruction::ZExt
12626 : Instruction::SExt;
12633 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12635// Leave the scalar instructions as is if they are cheaper than extracts. 12636if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12637 Entry->getOpcode() == Instruction::Load) {
12638// Checks if the user of the external scalar is phi in loop body. 12639auto IsPhiInLoop = [&](
const ExternalUser &U) {
12640if (
auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12641auto *
I = cast<Instruction>(U.Scalar);
12642constLoop *L = LI->getLoopFor(Phi->getParent());
12643return L && (Phi->getParent() ==
I->getParent() ||
12644 L == LI->getLoopFor(
I->getParent()));
12648if (!ValueToExtUses) {
12649 ValueToExtUses.emplace();
12651// Ignore phis in loops. 12652if (IsPhiInLoop(
P.value()))
12655 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
12658// Can use original instruction, if no operands vectorized or they are 12659// marked as externally used already. 12660auto *Inst = cast<Instruction>(EU.Scalar);
12662auto OperandIsScalar = [&](
Value *V) {
12663if (!getTreeEntry(V)) {
12664// Some extractelements might be not vectorized, but 12665// transformed into shuffle and removed from the function, 12666// consider it here. 12667if (
auto *EE = dyn_cast<ExtractElementInst>(V))
12668return !EE->hasOneUse() || !MustGather.contains(EE);
12671return ValueToExtUses->contains(V);
12673bool CanBeUsedAsScalar =
all_of(Inst->operands(), OperandIsScalar);
12674bool CanBeUsedAsScalarCast =
false;
12675if (
auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12676if (
auto *
Op = dyn_cast<Instruction>(CI->
getOperand(0));
12677Op &&
all_of(
Op->operands(), OperandIsScalar)) {
12679 (getTreeEntry(
Op) && !ValueToExtUses->contains(
Op))
12682if (ScalarCost + OpCost <= ExtraCost) {
12683 CanBeUsedAsScalar = CanBeUsedAsScalarCast =
true;
12684 ScalarCost += OpCost;
12688if (CanBeUsedAsScalar) {
12689bool KeepScalar = ScalarCost <= ExtraCost;
12690// Try to keep original scalar if the user is the phi node from the same 12691// block as the root phis, currently vectorized. It allows to keep 12692// better ordering info of PHIs, being vectorized currently. 12693bool IsProfitablePHIUser =
12695 VectorizableTree.front()->Scalars.size() > 2)) &&
12696 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12700 auto *PHIUser = dyn_cast<PHINode>(U);
12701 return (!PHIUser ||
12702 PHIUser->getParent() !=
12704 VectorizableTree.front()->getMainOp())
12709 return ValueToExtUses->contains(V);
12711if (IsProfitablePHIUser) {
12715 (!GatheredLoadsEntriesFirst.has_value() ||
12716 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12717unsigned ScalarUsesCount =
count_if(Entry->Scalars, [&](
Value *V) {
12718 return ValueToExtUses->contains(V);
12720auto It = ExtractsCount.
find(Entry);
12721if (It != ExtractsCount.
end()) {
12722assert(ScalarUsesCount >= It->getSecond().size() &&
12723"Expected total number of external uses not less than " 12724"number of scalar uses.");
12725 ScalarUsesCount -= It->getSecond().size();
12727// Keep original scalar if number of externally used instructions in 12728// the same entry is not power of 2. It may help to do some extra 12729// vectorization for now. 12730 KeepScalar = ScalarUsesCount <= 1 || !
has_single_bit(ScalarUsesCount);
12733 ExternalUsesAsOriginalScalar.
insert(EU.Scalar);
12735 auto It = ValueToExtUses->find(V);
12736 if (It != ValueToExtUses->end()) {
12737// Replace all uses to avoid compiler crash. 12738 ExternalUses[It->second].User = nullptr;
12741 ExtraCost = ScalarCost;
12742if (!IsPhiInLoop(EU))
12743 ExtractsCount[Entry].
insert(Inst);
12744if (CanBeUsedAsScalarCast) {
12745 ScalarOpsFromCasts.
insert(Inst->getOperand(0));
12746// Update the users of the operands of the cast operand to avoid 12748if (
auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12750 auto It = ValueToExtUses->find(V);
12751 if (It != ValueToExtUses->end()) {
12752// Replace all uses to avoid compiler crash. 12753 ExternalUses[It->second].User = nullptr;
12762 ExtractCost += ExtraCost;
12764// Insert externals for extract of operands of casts to be emitted as scalars 12765// instead of extractelement. 12766for (
Value *V : ScalarOpsFromCasts) {
12767 ExternalUsesAsOriginalScalar.
insert(V);
12768if (
const TreeEntry *E = getTreeEntry(V)) {
12769 ExternalUses.emplace_back(V,
nullptr, E->findLaneForValue(V));
12772// Add reduced value cost, if resized. 12773if (!VectorizedVals.
empty()) {
12774const TreeEntry &Root = *VectorizableTree.front();
12775auto BWIt = MinBWs.find(&Root);
12776if (BWIt != MinBWs.end()) {
12777Type *DstTy = Root.Scalars.front()->getType();
12780 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12781if (OriginalSz != SrcSz) {
12782unsigned Opcode = Instruction::Trunc;
12783if (OriginalSz > SrcSz)
12784 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12786if (
auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12798Cost += SpillCost + ExtractCost;
12802unsigned VF =
Mask.size();
12803unsigned VecVF =
TE->getVectorFactor();
12805 (
any_of(Mask, [VF](
intIdx) {
returnIdx >=
static_cast<int>(VF); }) ||
12808 std::copy(
Mask.begin(), std::next(
Mask.begin(), std::min(VF, VecVF)),
12814dbgs() <<
"SLP: Adding cost " <<
C 12815 <<
" for final shuffle of insertelement external users.\n";
12816TE->dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12818return std::make_pair(TE,
true);
12820return std::make_pair(TE,
false);
12822// Calculate the cost of the reshuffled vectors, if any. 12823for (
intI = 0, E = ShuffledInserts.size();
I < E; ++
I) {
12824Value *
Base = ShuffledInserts[
I].InsertElements.front()->getOperand(0);
12825autoVector = ShuffledInserts[
I].ValueMasks.takeVector();
12829assert((TEs.size() == 1 || TEs.size() == 2) &&
12830"Expected exactly 1 or 2 tree entries.");
12831if (TEs.size() == 1) {
12833 VF = TEs.front()->getVectorFactor();
12834auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12838 (
Data.index() < VF &&
12839static_cast<int>(
Data.index()) ==
Data.value());
12844 <<
" for final shuffle of insertelement " 12845"external users.\n";
12846 TEs.front()->
dump();
12847dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12853 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12854 VF = TEs.front()->getVectorFactor();
12858auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12862 <<
" for final shuffle of vector node and external " 12863"insertelement users.\n";
12864if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12865dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12871 (void)performExtractsShuffleAction<const TreeEntry>(
12873 [](
const TreeEntry *E) {
return E->getVectorFactor(); }, ResizeToVF,
12874 EstimateShufflesCost);
12876 cast<FixedVectorType>(
12877 ShuffledInserts[
I].InsertElements.front()->getType()),
12883// Add the cost for reduced value resize (if required). 12884if (ReductionBitWidth != 0) {
12885assert(UserIgnoreList &&
"Expected reduction tree.");
12886const TreeEntry &E = *VectorizableTree.front();
12887auto It = MinBWs.find(&E);
12888if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12889unsigned SrcSize = It->second.first;
12890unsigned DstSize = ReductionBitWidth;
12891unsigned Opcode = Instruction::Trunc;
12892if (SrcSize < DstSize) {
12893bool IsArithmeticExtendedReduction =
12895auto *
I = cast<Instruction>(V);
12896returnis_contained({Instruction::Add, Instruction::FAdd,
12897 Instruction::Mul, Instruction::FMul,
12898 Instruction::And, Instruction::Or,
12902if (IsArithmeticExtendedReduction)
12904 Instruction::BitCast;
// Handle it by getExtendedReductionCost 12906 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12908if (Opcode != Instruction::BitCast) {
12915switch (E.getOpcode()) {
12916case Instruction::SExt:
12917case Instruction::ZExt:
12918case Instruction::Trunc: {
12919const TreeEntry *OpTE = getOperandEntry(&E, 0);
12920 CCH = getCastContextHint(*OpTE);
12930 <<
" for final resize for reduction from " << SrcVecTy
12931 <<
" to " << DstVecTy <<
"\n";
12932dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12941OS <<
"SLP: Spill Cost = " << SpillCost <<
".\n" 12942 <<
"SLP: Extract Cost = " << ExtractCost <<
".\n" 12943 <<
"SLP: Total Cost = " <<
Cost <<
".\n";
12947ViewGraph(
this,
"SLP" +
F->getName(),
false, Str);
/// Tries to find extractelement instructions with constant indices from fixed
/// vector type and gather such instructions into a bunch, which highly likely
/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
12961// Scan list of gathered scalars for extractelements that can be represented 12965for (
intI = 0, E = VL.
size();
I < E; ++
I) {
12966auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
12968if (isa<UndefValue>(VL[
I]))
12972auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12973if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12986 ExtractMask.reset(*
Idx);
12991 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
12993// Sort the vector operands by the maximum number of uses in extractelements. 12996stable_sort(Vectors, [](
constauto &P1,
constauto &P2) {
12997returnP1.second.size() > P2.second.size();
12999// Find the best pair of the vectors or a single vector. 13000constint UndefSz = UndefVectorExtracts.
size();
13001unsigned SingleMax = 0;
13002unsigned PairMax = 0;
13003if (!Vectors.
empty()) {
13004 SingleMax = Vectors.
front().second.size() + UndefSz;
13005if (Vectors.
size() > 1) {
13006auto *ItNext = std::next(Vectors.
begin());
13007 PairMax = SingleMax + ItNext->second.size();
13010if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
13011return std::nullopt;
13012// Check if better to perform a shuffle of 2 vectors or just of a single 13017if (SingleMax >= PairMax && SingleMax) {
13018for (
intIdx : Vectors.
front().second)
13020 }
elseif (!Vectors.
empty()) {
13021for (
unsignedIdx : {0, 1})
13022for (
intIdx : Vectors[
Idx].second)
13025// Add extracts from undefs too. 13026for (
intIdx : UndefVectorExtracts)
13028// Check that gather of extractelements can be represented as just a 13029// shuffle of a single/two vectors the scalars are extracted from. 13030 std::optional<TTI::ShuffleKind> Res =
13033// TODO: try to check other subsets if possible. 13034// Restore the original VL if attempt was not successful. 13036return std::nullopt;
13038// Restore unused scalars from mask, if some of the extractelements were not 13039// selected for shuffle. 13040for (
intI = 0, E = GatheredExtracts.size();
I < E; ++
I) {
13042 isa<UndefValue>(GatheredExtracts[
I])) {
13046auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
13047if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
13048 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
13055/// Tries to find extractelement instructions with constant indices from fixed 13056/// vector type and gather such instructions into a bunch, which highly likely 13057/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was 13058/// successful, the matched scalars are replaced by poison values in \p VL for 13059/// future analysis. 13063unsigned NumParts)
const{
13064assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
13068for (
unsigned Part : seq<unsigned>(NumParts)) {
13069// Scan list of gathered scalars for extractelements that can be represented 13074 std::optional<TTI::ShuffleKind> Res =
13075 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
13076 ShufflesRes[Part] = Res;
13077copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
13079if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
13080return Res.has_value();
13082 ShufflesRes.clear();
13086std::optional<TargetTransformInfo::ShuffleKind>
13087BoUpSLP::isGatherShuffledSingleRegisterEntry(
13091// TODO: currently checking only for Scalars in the tree entry, need to count 13092// reused elements too for better cost estimation. 13093const EdgeInfo &TEUseEI =
TE == VectorizableTree.front().get()
13094 ? EdgeInfo(
const_cast<TreeEntry *
>(TE), 0)
13095 :
TE->UserTreeIndices.front();
13096constInstruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13098// Main node of PHI entries keeps the correct order of operands/incoming 13100if (
auto *
PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13101 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13104 TEInsertBlock = TEInsertPt->
getParent();
13107return std::nullopt;
13108auto *NodeUI = DT->
getNode(TEInsertBlock);
13109assert(NodeUI &&
"Should only process reachable instructions");
13111auto CheckOrdering = [&](
constInstruction *InsertPt) {
13112// Argument InsertPt is an instruction where vector code for some other 13113// tree entry (one that shares one or more scalars with TE) is going to be 13114// generated. This lambda returns true if insertion point of vector code 13115// for the TE dominates that point (otherwise dependency is the other way 13116// around). The other node is not limited to be of a gather kind. Gather 13117// nodes are not scheduled and their vector code is inserted before their 13118// first user. If user is PHI, that is supposed to be at the end of a 13119// predecessor block. Otherwise it is the last instruction among scalars of 13120// the user node. So, instead of checking dependency between instructions 13121// themselves, we check dependency between their insertion points for vector 13122// code (since each scalar instruction ends up as a lane of a vector 13125auto *NodeEUI = DT->
getNode(InsertBlock);
13128assert((NodeUI == NodeEUI) ==
13129 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13130"Different nodes should have different DFS numbers");
13131// Check the order of the gather nodes users. 13132if (TEInsertPt->
getParent() != InsertBlock &&
13135if (TEInsertPt->
getParent() == InsertBlock &&
13140// Find all tree entries used by the gathered values. If no common entries 13141// found - not a shuffle. 13142// Here we build a set of tree nodes for each gathered value and trying to 13143// find the intersection between these sets. If we have at least one common 13144// tree node for each gathered value - we have just a permutation of the 13145// single vector. If we have 2 different sets, we're in situation where we 13146// have a permutation of 2 input vectors. 13149for (
Value *V : VL) {
13152// Build a list of tree entries where V is used. 13154for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13155if (TEPtr == TE || TEPtr->Idx == 0)
13158 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
13159"Must contain at least single gathered value.");
13160assert(TEPtr->UserTreeIndices.size() == 1 &&
13161"Expected only single user of a gather node.");
13162const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13164PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13167 : &getLastInstructionInBundle(UseEI.UserTE);
13168if (TEInsertPt == InsertPt) {
13169// If 2 gathers are operands of the same entry (regardless of whether 13170// user is PHI or else), compare operands indices, use the earlier one 13172if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13174// If the user instruction is used for some reason in different 13175// vectorized nodes - make it depend on index. 13176if (TEUseEI.UserTE != UseEI.UserTE &&
13177 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13181// Check if the user node of the TE comes after user node of TEPtr, 13182// otherwise TEPtr depends on TE. 13183if ((TEInsertBlock != InsertPt->
getParent() ||
13184 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13185 !CheckOrdering(InsertPt))
13189if (
const TreeEntry *VTE = getTreeEntry(V)) {
13190if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13191if (VTE->State != TreeEntry::Vectorize) {
13192auto It = MultiNodeScalars.
find(V);
13193if (It == MultiNodeScalars.
end())
13195 VTE = *It->getSecond().begin();
13196// Iterate through all vectorized nodes. 13197auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
13198 return MTE->State == TreeEntry::Vectorize;
13200if (MIt == It->getSecond().end())
13205if (
none_of(
TE->CombinedEntriesWithIndices,
13206 [&](
constauto &
P) { return P.first == VTE->Idx; })) {
13207Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13208if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13215if (UsedTEs.
empty()) {
13216// The first iteration, just insert the list of nodes to vector. 13220// Need to check if there are any previously used tree nodes which use V. 13221// If there are no such nodes, consider that we have another one input 13226// Do we have a non-empty intersection of previously listed tree entries 13227// and tree entries using current V? 13229if (!VToTEs.
empty()) {
13230// Yes, write the new subset and continue analysis for the next 13235 VToTEs = SavedVToTEs;
13238// No non-empty intersection found - need to add a second set of possible 13241// If the number of input vectors is greater than 2 - not a permutation, 13242// fallback to the regular gather. 13243// TODO: support multiple reshuffled nodes. 13244if (UsedTEs.
size() == 2)
13246 UsedTEs.push_back(SavedVToTEs);
13253if (UsedTEs.
empty()) {
13255return std::nullopt;
13259if (UsedTEs.
size() == 1) {
13260// Keep the order to avoid non-determinism. 13262 UsedTEs.front().
end());
13263sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
13264return TE1->Idx < TE2->Idx;
13266// Try to find the perfect match in another gather node at first. 13267auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
13268return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
13270if (It != FirstEntries.end() &&
13271 ((*It)->getVectorFactor() == VL.size() ||
13272 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
13273TE->ReuseShuffleIndices.size() == VL.size() &&
13274 (*It)->isSame(
TE->Scalars)))) {
13275 Entries.push_back(*It);
13276if ((*It)->getVectorFactor() == VL.size()) {
13277 std::iota(std::next(
Mask.begin(), Part * VL.size()),
13278 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
13283// Clear undef scalars. 13284for (
unsignedI : seq<unsigned>(VL.size()))
13285if (isa<PoisonValue>(VL[
I]))
13289// No perfect match, just shuffle, so choose the first tree node from the 13291 Entries.push_back(FirstEntries.front());
13292 VF = FirstEntries.front()->getVectorFactor();
13294// Try to find nodes with the same vector factor. 13295assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
13296// Keep the order of tree nodes to avoid non-determinism. 13298for (
const TreeEntry *TE : UsedTEs.front()) {
13299unsigned VF =
TE->getVectorFactor();
13300auto It = VFToTE.
find(VF);
13301if (It != VFToTE.
end()) {
13302if (It->second->Idx >
TE->Idx)
13303 It->getSecond() =
TE;
13308// Same, keep the order to avoid non-determinism. 13310 UsedTEs.back().
end());
13311sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
13312return TE1->Idx < TE2->Idx;
13314for (
const TreeEntry *TE : SecondEntries) {
13315auto It = VFToTE.
find(
TE->getVectorFactor());
13316if (It != VFToTE.
end()) {
13318 Entries.push_back(It->second);
13319 Entries.push_back(TE);
13323// No 2 source vectors with the same vector factor - just choose 2 with max 13325if (Entries.empty()) {
13327 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
13328 return TE1->Idx < TE2->Idx;
13330 Entries.push_back(SecondEntries.front());
13331 VF = std::max(Entries.front()->getVectorFactor(),
13332 Entries.back()->getVectorFactor());
13334 VF = Entries.front()->getVectorFactor();
13338bool IsSplatOrUndefs =
isSplat(VL) ||
all_of(VL, IsaPred<UndefValue>);
13339// Checks if the 2 PHIs are compatible in terms of high possibility to be 13341auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
13342auto *
PHI = cast<PHINode>(V);
13343auto *PHI1 = cast<PHINode>(V1);
13344// Check that all incoming values are compatible/from same parent (if they 13345// are instructions). 13346// The incoming values are compatible if they all are constants, or 13347// instruction with the same/alternate opcodes from the same basic block. 13348for (
intI = 0, E =
PHI->getNumIncomingValues();
I < E; ++
I) {
13350Value *In1 = PHI1->getIncomingValue(
I);
13355if (cast<Instruction>(In)->
getParent() !=
13361// Check if the value can be ignored during analysis for shuffled gathers. 13362// We suppose it is better to ignore instruction, which do not form splats, 13363// are not vectorized/not extractelements (these instructions will be handled 13364// by extractelements processing) or may form vector node in future. 13365auto MightBeIgnored = [=](
Value *
V) {
13366auto *
I = dyn_cast<Instruction>(V);
13367returnI && !IsSplatOrUndefs && !ScalarToTreeEntry.
count(
I) &&
13369 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
13371// Check that the neighbor instruction may form a full vector node with the 13372// current instruction V. It is possible, if they have same/alternate opcode 13373// and same parent basic block. 13374auto NeighborMightBeIgnored = [&](
Value *
V,
intIdx) {
13376bool UsedInSameVTE =
false;
13377auto It = UsedValuesEntry.
find(V1);
13378if (It != UsedValuesEntry.
end())
13379 UsedInSameVTE = It->second == UsedValuesEntry.
find(V)->second;
13380returnV != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13382 cast<Instruction>(V)->getParent() ==
13383 cast<Instruction>(V1)->getParent() &&
13384 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13386// Build a shuffle mask for better cost estimation and vector emission. 13389for (
intI = 0, E = VL.size();
I < E; ++
I) {
13391auto It = UsedValuesEntry.
find(V);
13392if (It == UsedValuesEntry.
end())
13394// Do not try to shuffle scalars, if they are constants, or instructions 13395// that can be vectorized as a result of the following vector build 13398 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
13399 (
I != E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
13401unsignedIdx = It->second;
13405// Iterate through all shuffled scalars and select entries, which can be used 13406// for final shuffle. 13408for (
unsignedI = 0, Sz = Entries.size();
I < Sz; ++
I) {
13409if (!UsedIdxs.test(
I))
13411// Fix the entry number for the given scalar. If it is the first entry, set 13412// Pair.first to 0, otherwise to 1 (currently select at max 2 nodes). 13413// These indices are used when calculating final shuffle mask as the vector 13415for (std::pair<unsigned, int> &Pair : EntryLanes)
13417 Pair.first = TempEntries.
size();
13420 Entries.swap(TempEntries);
13421if (EntryLanes.size() == Entries.size() &&
13423 .
slice(Part * VL.size(),
13424 std::min<int>(VL.size(),
TE->Scalars.size())))) {
13425// We may have here 1 or 2 entries only. If the number of scalars is equal 13426// to the number of entries, no need to do the analysis, it is not very 13427// profitable. Since VL is not the same as TE->Scalars, it means we already 13428// have some shuffles before. Cut off not profitable case. 13430return std::nullopt;
13432// Build the final mask, check for the identity shuffle, if possible. 13433bool IsIdentity = Entries.size() == 1;
13434// Pair.first is the offset to the vector, while Pair.second is the index of 13435// scalar in the list. 13436for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
13437unsignedIdx = Part * VL.size() + Pair.second;
13440 (ForOrder ? std::distance(
13441 Entries[Pair.first]->Scalars.begin(),
13442find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13443 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13444 IsIdentity &=
Mask[
Idx] == Pair.second;
13446if (ForOrder || IsIdentity || Entries.empty()) {
13447switch (Entries.size()) {
13449if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13453if (EntryLanes.size() > 2 || VL.size() <= 2)
13459 }
elseif (!isa<VectorType>(VL.front()->getType()) &&
13460 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13461// Do the cost estimation if shuffle beneficial than buildvector. 13463 std::next(
Mask.begin(), (Part + 1) * VL.size()));
13464int MinElement = SubMask.
front(), MaxElement = SubMask.
front();
13465for (
intIdx : SubMask) {
13473assert(MaxElement >= 0 && MinElement >= 0 &&
13474 MaxElement % VF >= MinElement % VF &&
13475"Expected at least single element.");
13476unsigned NewVF = std::max<unsigned>(
13478 (MaxElement % VF) -
13479 (MinElement % VF) + 1));
13484Idx = ((
Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
13485 (
Idx >=
static_cast<int>(VF) ? NewVF : 0);
13493auto *MaskVecTy =
getWidenedType(VL.front()->getType(), SubMask.size());
13494auto GetShuffleCost = [&,
13498if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13500 Mask, Entries.front()->getInterleaveFactor()))
13502 return ::getShuffleCost(
TTI,
13507InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13510if (Entries.size() == 1 || !Entries[0]->isGather()) {
13511 FirstShuffleCost = ShuffleCost;
13513// Transform mask to include only first entry. 13515bool IsIdentity =
true;
13517if (
Idx >=
static_cast<int>(NewVF)) {
13522 IsIdentity &=
static_cast<int>(
I) ==
Idx;
13526 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13528 MaskVecTy, DemandedElts,
/*Insert=*/true,
13533if (Entries.size() == 1 || !Entries[1]->isGather()) {
13534 SecondShuffleCost = ShuffleCost;
13536// Transform mask to include only first entry. 13538bool IsIdentity =
true;
13540if (
Idx <
static_cast<int>(NewVF) &&
Idx >= 0) {
13546 IsIdentity &=
static_cast<int>(
I) ==
Idx;
13551 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13553 MaskVecTy, DemandedElts,
/*Insert=*/true,
13563const TreeEntry *BestEntry =
nullptr;
13564if (FirstShuffleCost < ShuffleCost) {
13565 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
13566 std::next(
Mask.begin(), (Part + 1) * VL.size()),
13568 if (Idx >= static_cast<int>(VF))
13569 Idx = PoisonMaskElem;
13571 BestEntry = Entries.front();
13572 ShuffleCost = FirstShuffleCost;
13574if (SecondShuffleCost < ShuffleCost) {
13575 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
13576 std::next(
Mask.begin(), (Part + 1) * VL.size()),
13578 if (Idx < static_cast<int>(VF))
13579 Idx = PoisonMaskElem;
13583 BestEntry = Entries[1];
13584 ShuffleCost = SecondShuffleCost;
13586if (BuildVectorCost >= ShuffleCost) {
13589 Entries.push_back(BestEntry);
13596// Clear the corresponding mask elements. 13597 std::fill(std::next(
Mask.begin(), Part * VL.size()),
13599return std::nullopt;
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(VectorizableTree,
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndices.size() == 1 ||
          TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (!TE->UserTreeIndices.empty() &&
      TE->UserTreeIndices.front().UserTE->isGather() &&
      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
    assert(((TE->hasState() &&
             TE->getOpcode() == Instruction::ExtractElement) ||
            isSplat(TE->Scalars)) &&
           "Expected splat or extractelements only node.");
    return {};
  }
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TTI::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res, [](const std::optional<TTI::ShuffleKind> &SK) {
        return !SK;
      })) {
    Entries.clear();
    return {};
  }
  return Res;
}
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  APInt ShuffledElements = APInt::getZero(VL.size());
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
    }
    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                    I, Constant::getNullValue(VecTy), V);
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBit(I);
      continue;
    }
    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }
    DuplicateNonConst = true;
    ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
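  // Added illustrative example (not from the upstream sources): for
  // VL = {a, b, a, c} the repeated 'a' is not inserted a second time; lane 2
  // of ShuffleMask points back at lane 0, so the estimate becomes three
  // insertelements plus one final single-source permute rather than four
  // insertelements.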
  if (ForPoisonSrc) {
    if (isa<FixedVectorType>(ScalarTy)) {
      // We don't need to insert elements one by one. Instead, we can insert
      // the entire vector into the destination.
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      for (unsigned I : seq<unsigned>(VL.size()))
        if (!ShuffledElements[I])
          Cost += TTI->getShuffleCost(
              TTI::SK_InsertSubvector, VecTy, {}, CostKind,
              I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
    } else {
      Cost += TTI->getScalarizationOverhead(VecTy,
                                            /*DemandedElts*/ ~ShuffledElements,
                                            /*Insert*/ true,
                                            /*Extract*/ false, CostKind);
    }
  }
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);
  return Cost;
}
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
  if (Res)
    return *Res;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices or gathered loads).
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           E->getOpcode() == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (E->getOpcode() == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (GatheredLoadsEntriesFirst.has_value() &&
               E->getOpcode() == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };
  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              isVectorLikeInstWithConstOps(I)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  // Set insertpoint for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      E->getOpcode() == Instruction::Load) {
    Res = FindFirstInst();
    return *Res;
  }

  // Set the insert point to the beginning of the basic block if the entry
  // should not be scheduled.
  if ((E->getOpcode() == Instruction::GetElementPtr &&
       any_of(E->Scalars,
              [](Value *V) {
                return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
              })) ||
      all_of(E->Scalars,
             [](Value *V) {
               return isa<PoisonValue>(V) ||
                      (!isVectorLikeInstWithConstOps(V) &&
                       isUsedOutsideBlock(V));
             }) ||
      (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
         return isa<ExtractElementInst, UndefValue>(V) ||
                areAllOperandsNonInsts(V);
       })))
    Res = FindLastInst();
  else
    Res = FindFirstInst();

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (BlocksSchedules.count(BB) && !E->isGather()) {
    Value *V = E->isOneOf(E->Scalars.back());
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        Res = Bundle->Inst;
  }

  // LastInst can still be null at this point if there's either not an entry
  // for BB in BlocksSchedules or there's no ScheduleData available for
  // VL.back(). This can be the case if buildTree_rec aborts for various
  // reasons (e.g., the maximum recursion depth is reached, the maximum region
  // size is reached, etc.). ScheduleData is initialized in the scheduling
  // "dry-run".
  //
  // If this happens, we can still find the last instruction by brute force. We
  // iterate forwards from Front (inclusive) until we either see all
  // instructions in the bundle or reach the end of the block. If Front is the
  // last instruction in program order, LastInst will be set to Front, and we
  // will visit all the remaining instructions in the block.
  //
  // One of the reasons we exit early from buildTree_rec is to place an upper
  // bound on compile-time. Thus, taking an additional compile-time hit here is
  // not ideal. However, this should be exceedingly rare since it requires that
  // we both exit early from buildTree_rec and that the bundle be out-of-order
  // (causing us to iterate all the way to the end of the block).
  if (!Res)
    Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  return *Res;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || doesNotNeedToSchedule(E->Scalars)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(LastInst->getParent(),
                           std::next(LastInst->getIterator()));
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from current block and/or the blocks which are
  // part of the current loop. These instructions will be inserted at the end to
  // make it possible to optimize loops and hoist invariant instructions out of
  // the loops body with better chances for success.
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }
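  // Added illustrative note (not from the upstream sources): if the insertion
  // point is inside a loop and VL mixes loop-invariant scalars with a value
  // defined in that same loop, the in-loop lane is recorded in
  // PostponedIndices and its insertelement is emitted last, so the chain of
  // invariant insertelements built before it can still be hoisted out of the
  // loop.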
  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        Value *Op = CI->getOperand(0);
        if (auto *IOp = dyn_cast<Instruction>(Op);
            !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
          Scalar = Op;
      }
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }
    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      auto *II = dyn_cast<IntrinsicInst>(Vec);
      if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
        return Vec;
      InsElt = II;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (auto *SI = dyn_cast<Instruction>(Scalar))
          UserOp = SI;
        if (UserOp) {
          unsigned FoundLane = Entry->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, FoundLane);
        }
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->hasNUses(0) &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, in the end to make
  // it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  return Vec;
}

/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
/// when the actual shuffle instruction is generated only if this is actually
/// required. Otherwise, the shuffle instruction emission is delayed till the
/// end of the process, to reduce the number of emitted instructions and further
/// analysis/transformations.
/// The class also will look through the previously emitted shuffle instructions
/// and properly mark indices in mask as undef.
/// For example, given the code
///
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
///
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
/// will look through %s1 and %s2 and emit
///
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
///
/// If 2 operands are of different size, the smallest one will be resized and
/// the mask recalculated properly.
/// For example, given the code
///
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
///
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
/// will look through %s1 and %s2 and emit
///
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
class ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks. It is built during
  /// analysis and actual emission of shuffle vector instructions.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
  /// operands; if the 3rd is going to be added, the first 2 are combined into a
  /// shuffle with \p CommonMask mask, the first operand is set to be the
  /// resulting shuffle and the second operand is set to be the newly added
  /// operand. The \p CommonMask is transformed in the proper way after that.
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
14110 /// Holds all of the instructions that we gathered. 14112 /// A list of blocks that we are going to CSE. 14121 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14122 CSEBlocks(CSEBlocks),
DL(
DL) {}
14123 ~ShuffleIRBuilder() =
default;
14124 /// Creates shufflevector for the 2 operands with the given mask. 14126if (V1->
getType() != V2->getType()) {
14129"Expected integer vector types only.");
14130if (V1->
getType() != V2->getType()) {
14131if (cast<VectorType>(V2->getType())
14133 ->getIntegerBitWidth() < cast<VectorType>(V1->
getType())
14135 ->getIntegerBitWidth())
14144if (
auto *
I = dyn_cast<Instruction>(Vec)) {
14145 GatherShuffleExtractSeq.
insert(
I);
14146 CSEBlocks.
insert(
I->getParent());
14150 /// Creates permutation of the single vector operand with the given mask, if 14151 /// it is not identity mask. 14155unsigned VF = Mask.size();
14156unsigned LocalVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
14160if (
auto *
I = dyn_cast<Instruction>(Vec)) {
14161 GatherShuffleExtractSeq.
insert(
I);
14162 CSEBlocks.
insert(
I->getParent());
14167Value *createPoison(
Type *Ty,
unsigned VF) {
14170 /// Resizes 2 input vector to match the sizes, if the they are not equal 14171 /// yet. The smallest vector is resized to the size of the larger vector. 14173if (V1->
getType() == V2->getType())
14175int V1VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
14176int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14177int VF = std::max(V1VF, V2VF);
14178int MinVF = std::min(V1VF, V2VF);
14180 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
14182Value *&
Op = MinVF == V1VF ? V1 : V2;
14184if (
auto *
I = dyn_cast<Instruction>(
Op)) {
14185 GatherShuffleExtractSeq.
insert(
I);
14186 CSEBlocks.
insert(
I->getParent());
14195 /// Smart shuffle instruction emission, walks through shuffles trees and 14196 /// tries to find the best matching vector for the actual shuffle 14199assert(V1 &&
"Expected at least one vector value.");
14200 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14201 R.CSEBlocks, *R.DL);
14202return BaseShuffleAnalysis::createShuffle<Value *>(
14203 V1, V2, Mask, ShuffleBuilder, ScalarTy);
14206 /// Cast value \p V to the vector type with the same number of elements, but 14207 /// the base type \p ScalarTy. 14209 std::optional<bool> IsSigned = std::nullopt) {
14210auto *VecTy = cast<VectorType>(V->getType());
14221 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14223 /// Adjusts extractelements after reusing them. 14225ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14226unsigned NumParts,
bool &UseVecBaseAsInput) {
14227 UseVecBaseAsInput =
false;
14229Value *VecBase =
nullptr;
14231if (!E->ReorderIndices.empty()) {
14233 E->ReorderIndices.end());
14236for (
intI = 0, Sz = Mask.size();
I < Sz; ++
I) {
14240auto *EI = cast<ExtractElementInst>(VL[
I]);
14241 VecBase = EI->getVectorOperand();
14242if (
const TreeEntry *TE = R.getTreeEntry(VecBase))
14243 VecBase = TE->VectorizedValue;
14244assert(VecBase &&
"Expected vectorized value.");
14245 UniqueBases.
insert(VecBase);
14246// If the only one use is vectorized - can delete the extractelement 14248if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14249 (NumParts != 1 &&
count(VL, EI) > 1) ||
14251 const TreeEntry *UTE = R.getTreeEntry(U);
14252 return !UTE || R.MultiNodeScalars.contains(U) ||
14253 (isa<GetElementPtrInst>(U) &&
14254 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14255 count_if(R.VectorizableTree,
14256 [&](const std::unique_ptr<TreeEntry> &TE) {
14257 return any_of(TE->UserTreeIndices,
14258 [&](const EdgeInfo &Edge) {
14259 return Edge.UserTE == UTE;
14261 is_contained(VL, EI);
14265 R.eraseInstruction(EI);
14267if (NumParts == 1 || UniqueBases.
size() == 1) {
14268assert(VecBase &&
"Expected vectorized value.");
14269return castToScalarTyElem(VecBase);
14271 UseVecBaseAsInput =
true;
14277// Perform multi-register vector shuffle, joining them into a single virtual 14279// Need to shuffle each part independently and then insert all this parts 14280// into a long virtual vector register, forming the original vector. 14281Value *Vec =
nullptr;
14284for (
unsigned Part : seq<unsigned>(NumParts)) {
14288constexprint MaxBases = 2;
14290auto VLMask =
zip(SubVL, SubMask);
14291constunsigned VF = std::accumulate(
14292 VLMask.begin(), VLMask.end(), 0U, [&](
unsigned S,
constauto &
D) {
14293 if (std::get<1>(D) == PoisonMaskElem)
14296 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14297 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14298 VecOp = TE->VectorizedValue;
14299 assert(VecOp &&
"Expected vectorized value.");
14300 const unsigned Size =
14301 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14302 return std::max(S, Size);
14304for (
constauto [V,
I] : VLMask) {
14307Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14308if (
const TreeEntry *TE = R.getTreeEntry(VecOp))
14309 VecOp = TE->VectorizedValue;
14310assert(VecOp &&
"Expected vectorized value.");
14311 VecOp = castToScalarTyElem(VecOp);
14312 Bases[
I / VF] = VecOp;
14318 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14319 TransformToIdentity(SubMask);
14321 SubVec = Bases.front();
14328Mask.slice(
P * SliceSize,
14335"Expected first part or all previous parts masked.");
14336copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14339 cast<FixedVectorType>(Vec->
getType())->getNumElements();
14342 cast<FixedVectorType>(SubVec->
getType())->getNumElements();
14343 NewVF = std::max(NewVF, SubVecVF);
14346for (
int &
Idx : SubMask)
14349copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14350 Vec = createShuffle(Vec, SubVec, VecMask);
14351 TransformToIdentity(VecMask);
14357 /// Checks if the specified entry \p E needs to be delayed because of its 14358 /// dependency nodes. 14359 std::optional<Value *>
14362// No need to delay emission if all deps are ready. 14365 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
14367return std::nullopt;
14368// Postpone gather emission, will be emitted after the end of the 14369// process to keep correct order. 14376 /// Adds 2 input vectors (in form of tree entries) and the mask for their 14379Value *V1 = E1.VectorizedValue;
14381 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
14382 if (isa<PoisonValue>(V))
14384 return !isKnownNonNegative(
14385 V, SimplifyQuery(*R.DL));
14387Value *V2 = E2.VectorizedValue;
14388if (V2->getType()->isIntOrIntVectorTy())
14389 V2 = castToScalarTyElem(V2,
any_of(E2.Scalars, [&](
Value *V) {
14390 if (isa<PoisonValue>(V))
14392 return !isKnownNonNegative(
14393 V, SimplifyQuery(*R.DL));
14397 /// Adds single input vector (in form of tree entry) and the mask for its 14400Value *V1 = E1.VectorizedValue;
14402 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
14403 if (isa<PoisonValue>(V))
14405 return !isKnownNonNegative(
14406 V, SimplifyQuery(*R.DL));
14410 /// Adds 2 input vectors and the mask for their shuffling. 14412assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
14414 isa<FixedVectorType>(V2->getType()) &&
14415"castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14416 V1 = castToScalarTyElem(V1);
14417 V2 = castToScalarTyElem(V2);
14418if (InVectors.
empty()) {
14421 CommonMask.
assign(Mask.begin(), Mask.end());
14425if (InVectors.
size() == 2) {
14426 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14427 transformMaskAfterShuffle(CommonMask, CommonMask);
14428 }
elseif (cast<FixedVectorType>(Vec->
getType())->getNumElements() !=
14430 Vec = createShuffle(Vec,
nullptr, CommonMask);
14431 transformMaskAfterShuffle(CommonMask, CommonMask);
14433 V1 = createShuffle(V1, V2, Mask);
14434unsigned VF = std::max(getVF(V1), getVF(Vec));
14435for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14437 CommonMask[
Idx] =
Idx + VF;
14438 InVectors.
front() = Vec;
14439if (InVectors.
size() == 2)
14440 InVectors.
back() = V1;
14444 /// Adds another one input vector and the mask for the shuffling. 14447"castToScalarTyElem expects V1 to be FixedVectorType");
14448 V1 = castToScalarTyElem(V1);
14449if (InVectors.
empty()) {
14451 CommonMask.
assign(Mask.begin(), Mask.end());
14454constauto *It =
find(InVectors, V1);
14455if (It == InVectors.
end()) {
14456if (InVectors.
size() == 2 ||
14459if (InVectors.
size() == 2) {
14460 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
14461 transformMaskAfterShuffle(CommonMask, CommonMask);
14462 }
elseif (cast<FixedVectorType>(V->getType())->getNumElements() !=
14463 CommonMask.
size()) {
14464 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
14465 transformMaskAfterShuffle(CommonMask, CommonMask);
14467unsigned VF = std::max(CommonMask.
size(), Mask.size());
14468for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14471 V->getType() != V1->
getType()
14473 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
14474 ->getNumElements();
14475if (V->getType() != V1->
getType())
14476 V1 = createShuffle(V1,
nullptr, Mask);
14477 InVectors.
front() = V;
14478if (InVectors.
size() == 2)
14479 InVectors.
back() = V1;
14484// Check if second vector is required if the used elements are already 14485// used from the first one. 14486for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14493for (
Value *V : InVectors)
14494 VF = std::max(VF, getVF(V));
14495for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14497 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.begin() ? 0 : VF);
14499 /// Adds another one input vector and the mask for the shuffling. 14506Value *Root =
nullptr) {
14507return R.gather(VL, Root, ScalarTy,
14509return createShuffle(V1, V2, Mask);
14513 /// Finalize emission of the shuffles. 14514 /// \param Action the action (if any) to be performed before final applying of 14515 /// the \p ExtMask mask. 14518ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14524if (InVectors.
size() == 2) {
14525 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14528 Vec = createShuffle(Vec,
nullptr, CommonMask);
14530 transformMaskAfterShuffle(CommonMask, CommonMask);
14532"Expected vector length for the final value before action.");
14533unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
14536 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14537 Vec = createShuffle(Vec,
nullptr, ResizeMask);
14539 Action(Vec, CommonMask);
14540 InVectors.
front() = Vec;
14542if (!SubVectors.empty()) {
14544if (InVectors.
size() == 2) {
14545 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14548 Vec = createShuffle(Vec,
nullptr, CommonMask);
14550 transformMaskAfterShuffle(CommonMask, CommonMask);
14551auto CreateSubVectors = [&](
Value *Vec,
14553for (
auto [E,
Idx] : SubVectors) {
14554Value *
V = E->VectorizedValue;
14555if (
V->getType()->isIntOrIntVectorTy())
14556 V = castToScalarTyElem(V,
any_of(E->Scalars, [&](
Value *V) {
14557 if (isa<PoisonValue>(V))
14559 return !isKnownNonNegative(
14560 V, SimplifyQuery(*R.DL));
14564 Builder, Vec, V, InsertionIndex,
14565 std::bind(&ShuffleInstructionBuilder::createShuffle,
this, _1, _2,
14567if (!CommonMask.
empty()) {
14568 std::iota(std::next(CommonMask.
begin(),
Idx),
14569 std::next(CommonMask.
begin(),
Idx + E->getVectorFactor()),
14575if (SubVectorsMask.
empty()) {
14576 Vec = CreateSubVectors(Vec, CommonMask);
14579copy(SubVectorsMask, SVMask.begin());
14580for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
14583I1 = I2 + CommonMask.
size();
14588 Vec = createShuffle(InsertVec, Vec, SVMask);
14589 transformMaskAfterShuffle(CommonMask, SVMask);
14591 InVectors.
front() = Vec;
14594if (!ExtMask.
empty()) {
14595if (CommonMask.
empty()) {
14599for (
intI = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
14602 NewMask[
I] = CommonMask[ExtMask[
I]];
14604 CommonMask.
swap(NewMask);
14607if (CommonMask.
empty()) {
14608assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
14609return InVectors.
front();
14611if (InVectors.
size() == 2)
14612return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
14613return createShuffle(InVectors.
front(),
nullptr, CommonMask);
14618"Shuffle construction must be finalized.");
14622BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(
const TreeEntry *E,
14626// Special processing for GEPs bundle, which may include non-gep values. 14627if (!S && VL.
front()->getType()->isPointerTy()) {
14628constauto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
14634auto CheckSameVE = [&](
const TreeEntry *VE) {
14635return VE->isSame(VL) &&
14636 (
any_of(VE->UserTreeIndices,
14637 [E, NodeIdx](
const EdgeInfo &EI) {
14638 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14641 [E, NodeIdx, VE](
const std::unique_ptr<TreeEntry> &TE) {
14642return TE->isOperandGatherNode(
14643 {
const_cast<TreeEntry *
>(E), NodeIdx}) &&
14644 VE->isSame(TE->Scalars);
14647 TreeEntry *VE = getTreeEntry(S.getMainOp());
14648if (VE && CheckSameVE(VE))
14650auto It = MultiNodeScalars.
find(S.getMainOp());
14651if (It != MultiNodeScalars.
end()) {
14652auto *
I =
find_if(It->getSecond(), [&](
const TreeEntry *TE) {
14653 return TE != VE && CheckSameVE(TE);
14655if (
I != It->getSecond().end())
14661Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
14662bool PostponedPHIs) {
14664constunsigned VF = VL.size();
14665if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14667// V may be affected by MinBWs. 14668// We want ShuffleInstructionBuilder to correctly support REVEC. The key 14669// factor is the number of elements, not their type. 14670Type *ScalarTy = cast<VectorType>(
V->getType())->getElementType();
14672 ShuffleInstructionBuilder ShuffleBuilder(
14676 ShuffleBuilder.add(V, Mask);
14678 E->CombinedEntriesWithIndices.size());
14679transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14680 [&](
constauto &
P) {
14681 return std::make_pair(VectorizableTree[P.first].get(),
14684assert((E->CombinedEntriesWithIndices.empty() ||
14685 E->ReorderIndices.empty()) &&
14686"Expected either combined subnodes or reordering");
14687return ShuffleBuilder.finalize({}, SubVectors, {});
14691 cast<FixedVectorType>(
V->getType())->getNumElements()) {
    if (!VE->ReuseShuffleIndices.empty()) {
      // Reshuffle to get only unique values.
      // If some of the scalars are duplicated in the vectorization
      // tree entry, we do not vectorize them but instead generate a
      // mask for the reuses. But if there are several users of the
      // same entry, they may have different vectorization factors.
      // This is especially important for PHI nodes. In this case, we
      // need to adapt the resulting instruction for the user
      // vectorization factor and have to reshuffle it again to take
      // only unique elements of the vector. Without this code the
      // function incorrectly returns reduced vector instruction with
      // the same elements, not with the unique ones.
      //
      // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
      // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
      //
      // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
      SmallVector<int> Mask(VF, PoisonMaskElem);
      for (auto [I, V] : enumerate(VL)) {
        if (isa<PoisonValue>(V))
          continue;
        Mask[I] = VE->findLaneForValue(V);
      }
      V = FinalShuffle(V, Mask);
14719assert(VF < cast<FixedVectorType>(
V->getType())->getNumElements() &&
14720"Expected vectorization factor less " 14721"than original vector size.");
14723 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14724V = FinalShuffle(V, UniformMask);
14727// Need to update the operand gather node, if actually the operand is not a 14728// vectorized node, but the buildvector/gather node, which matches one of 14729// the vectorized nodes. 14730if (
find_if(VE->UserTreeIndices, [&](
const EdgeInfo &EI) {
14731 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14732 }) == VE->UserTreeIndices.end()) {
14734find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
14735returnTE->isGather() &&
TE->UserTreeIndices.front().UserTE == E &&
14736TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14738assert(It != VectorizableTree.end() &&
"Expected gather node operand.");
14739 (*It)->VectorizedValue =
V;
14744// Find the corresponding gather entry and vectorize it. 14745// Allows to be more accurate with tree/graph transformations, checks for the 14746// correctness of the transformations in many cases. 14748 [E, NodeIdx](
const std::unique_ptr<TreeEntry> &TE) {
14749returnTE->isOperandGatherNode({E, NodeIdx});
14751assert(
I != VectorizableTree.end() &&
"Gather node is not in the graph.");
14752assert(
I->get()->UserTreeIndices.size() == 1 &&
14753"Expected only single user for the gather node.");
14754assert(
I->get()->isSame(VL) &&
"Expected same list of scalars.");
14758template <
typename BVTy,
typename ResTy,
typename...
Args>
14759ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
14761assert(E->isGather() &&
"Expected gather node.");
14762unsigned VF = E->getVectorFactor();
14764bool NeedFreeze =
false;
14766 E->ReuseShuffleIndices.end());
14768// Clear values, to be replaced by insertvector instructions. 14769for (
auto [EIdx,
Idx] : E->CombinedEntriesWithIndices)
14771 .slice(
Idx, VectorizableTree[EIdx]->getVectorFactor()),
14774 E->CombinedEntriesWithIndices.size());
14775transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14776 [&](
constauto &
P) {
14777 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14779// Build a mask out of the reorder indices and reorder scalars per this 14782 E->ReorderIndices.end());
14783if (!ReorderMask.empty())
14787// Transform non-clustered elements in the mask to poison (-1). 14788// "Clustered" operations will be reordered using this mask later. 14789if (!SubVectors.empty() && !SubVectorsMask.
empty()) {
14790for (
unsignedI : seq<unsigned>(GatheredScalars.size()))
14791if (E->Scalars[
I] == GatheredScalars[ReorderMask[
I]])
14794 SubVectorsMask.
clear();
14798unsignedI,
unsigned SliceSize,
14799bool IsNotPoisonous) {
14801 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14804 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14805unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14806if (UserTE->getNumOperands() != 2)
14808if (!IsNotPoisonous) {
14810find_if(VectorizableTree, [=](
const std::unique_ptr<TreeEntry> &TE) {
14811returnfind_if(
TE->UserTreeIndices, [=](
const EdgeInfo &EI) {
14812 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14813 }) !=
TE->UserTreeIndices.end();
14815if (It == VectorizableTree.end())
14818if (!(*It)->ReorderIndices.empty()) {
14822if (!
all_of(
zip(GatheredScalars, GS), [&](
constauto &
P) {
14823Value *V0 = std::get<0>(
P);
14824Value *V1 = std::get<1>(
P);
14825return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14826 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14832if ((
Mask.size() < InputVF &&
14835 (
Mask.size() == InputVF &&
14838 std::next(
Mask.begin(),
I * SliceSize),
14839 std::next(
Mask.begin(),
14846 std::next(
Mask.begin(),
I * SliceSize),
14847 std::next(
Mask.begin(),
14853 BVTy ShuffleBuilder(ScalarTy, Params...);
14854 ResTy Res = ResTy();
14858Value *ExtractVecBase =
nullptr;
14859bool UseVecBaseAsInput =
false;
14862Type *OrigScalarTy = GatheredScalars.front()->getType();
14865if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14870if (!
all_of(GatheredScalars, IsaPred<UndefValue>)) {
14871// Check for gathered extracts. 14872bool Resized =
false;
14874 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14875if (!ExtractShuffles.
empty()) {
14880if (
constauto *TE = getTreeEntry(
14881 cast<ExtractElementInst>(StoredGS[
Idx])->getVectorOperand()))
14884if (std::optional<ResTy> Delayed =
14885 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14886// Delay emission of gathers which are not ready yet. 14887 PostponedGathers.
insert(E);
14888// Postpone gather emission, will be emitted after the end of the 14889// process to keep correct order. 14892if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
14893 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14894 ExtractVecBase = VecBase;
14895if (
auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14896if (VF == VecBaseTy->getNumElements() &&
14897 GatheredScalars.size() != VF) {
14899 GatheredScalars.append(VF - GatheredScalars.size(),
14904// Gather extracts after we check for full matched gathers only. 14905if (!ExtractShuffles.
empty() || !E->hasState() ||
14906 E->getOpcode() != Instruction::Load ||
14907 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
14908any_of(E->Scalars, IsaPred<LoadInst>)) &&
14911 return isa<LoadInst>(V) && getTreeEntry(V);
14913 (E->hasState() && E->isAltShuffle()) ||
14914all_of(E->Scalars, [
this](
Value *V) { return getTreeEntry(V); }) ||
14916 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14918 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14920if (!GatherShuffles.
empty()) {
14921if (std::optional<ResTy> Delayed =
14922 ShuffleBuilder.needToDelay(E, Entries)) {
14923// Delay emission of gathers which are not ready yet. 14924 PostponedGathers.
insert(E);
14925// Postpone gather emission, will be emitted after the end of the 14926// process to keep correct order. 14929if (GatherShuffles.
size() == 1 &&
14931 Entries.front().front()->isSame(E->Scalars)) {
14932// Perfect match in the graph, will reuse the previously vectorized 14934LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle " 14936// Restore the mask for previous partially matched values. 14937Mask.resize(E->Scalars.size());
14938const TreeEntry *FrontTE = Entries.front().front();
14939if (FrontTE->ReorderIndices.empty() &&
14940 ((FrontTE->ReuseShuffleIndices.empty() &&
14941 E->Scalars.size() == FrontTE->Scalars.size()) ||
14942 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14943 std::iota(
Mask.begin(),
Mask.end(), 0);
14946if (isa<PoisonValue>(V)) {
14950Mask[
I] = FrontTE->findLaneForValue(V);
14953 ShuffleBuilder.add(*FrontTE, Mask);
14954// Full matched entry found, no need to insert subvectors. 14955 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
14959if (GatheredScalars.size() != VF &&
14961returnany_of(TEs, [&](
const TreeEntry *TE) {
14962returnTE->getVectorFactor() == VF;
14965 GatheredScalars.append(VF - GatheredScalars.size(),
14968// Remove shuffled elements from list of gathers. 14969for (
intI = 0, Sz =
Mask.size();
I < Sz; ++
I) {
14977bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such scalars.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
14985// Gather unique non-const values and all constant values. 14986// For repeated values, just shuffle them. 14987int NumNonConsts = 0;
14990if (isa<UndefValue>(V)) {
14991if (!isa<PoisonValue>(V)) {
15006 Scalars.
front() = OrigV;
15010 Scalars[Res.first->second] = OrigV;
15011 ReuseMask[
I] = Res.first->second;
15014if (NumNonConsts == 1) {
15015// Restore single insert element. 15019if (!UndefPos.
empty() && UndefPos.
front() == 0)
15022 ReuseMask[SinglePos] = SinglePos;
15023 }
elseif (!UndefPos.
empty() && IsSplat) {
15024// For undef values, try to replace them with the simple broadcast. 15025// We can do it if the broadcasted value is guaranteed to be 15026// non-poisonous, or by freezing the incoming scalar value first. 15028return !isa<UndefValue>(V) &&
15030 (E->UserTreeIndices.size() == 1 &&
15032// Check if the value already used in the same operation in 15033// one of the nodes already. 15034 return E->UserTreeIndices.front().EdgeIdx !=
15035 U.getOperandNo() &&
15037 E->UserTreeIndices.front().UserTE->Scalars,
15041if (It != Scalars.
end()) {
15042// Replace undefs by the non-poisoned scalars and emit broadcast. 15043int Pos = std::distance(Scalars.
begin(), It);
15044for (
intI : UndefPos) {
15045// Set the undef position to the non-poisoned scalar. 15046 ReuseMask[
I] = Pos;
15047// Replace the undef by the poison, in the mask it is replaced by 15048// non-poisoned scalar already. 15053// Replace undefs by the poisons, emit broadcast and then emit 15055for (
intI : UndefPos) {
15057if (isa<UndefValue>(Scalars[
I]))
15064if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
15065bool IsNonPoisoned =
true;
15066bool IsUsedInExpr =
true;
15067Value *Vec1 =
nullptr;
15068if (!ExtractShuffles.
empty()) {
15069// Gather of extractelements can be represented as just a shuffle of 15070// a single/two vectors the scalars are extracted from. 15071// Find input vectors. 15072Value *Vec2 =
nullptr;
15073for (
unsignedI = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
15077if (UseVecBaseAsInput) {
15078 Vec1 = ExtractVecBase;
15080for (
unsignedI = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
15083if (isa<UndefValue>(E->Scalars[
I]))
15085auto *EI = cast<ExtractElementInst>(StoredGS[
I]);
15086Value *VecOp = EI->getVectorOperand();
15087if (
constauto *TE = getTreeEntry(VecOp))
15088if (
TE->VectorizedValue)
15089 VecOp =
TE->VectorizedValue;
15092 }
elseif (Vec1 != VecOp) {
15093assert((!Vec2 || Vec2 == VecOp) &&
15094"Expected only 1 or 2 vectors shuffle.");
15100 IsUsedInExpr =
false;
15103 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15106 IsUsedInExpr &= FindReusedSplat(
15108 cast<FixedVectorType>(Vec1->
getType())->getNumElements(), 0,
15109 ExtractMask.size(), IsNotPoisonedVec);
15110 ShuffleBuilder.add(Vec1, ExtractMask,
/*ForExtracts=*/true);
15111 IsNonPoisoned &= IsNotPoisonedVec;
15113 IsUsedInExpr =
false;
15115/*ForExtracts=*/true);
15118if (!GatherShuffles.
empty()) {
15121for (
constauto [
I, TEs] :
enumerate(Entries)) {
15124"No shuffles with empty entries list expected.");
15128"Expected shuffle of 1 or 2 entries.");
15132copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
15133if (TEs.
size() == 1) {
15134bool IsNotPoisonedVec =
15135 TEs.
front()->VectorizedValue
15139 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I,
15140 SliceSize, IsNotPoisonedVec);
15141 ShuffleBuilder.add(*TEs.
front(), VecMask);
15142 IsNonPoisoned &= IsNotPoisonedVec;
15144 IsUsedInExpr =
false;
15145 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
15146if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
15153// Try to figure out best way to combine values: build a shuffle and insert 15154// elements or just build several shuffles. 15155// Insert non-constant scalars. 15157int EMSz = ExtractMask.size();
15158int MSz =
Mask.size();
15159// Try to build constant vector and shuffle with it only if currently we 15160// have a single permutation and more than 1 scalar constants. 15161bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
15162bool IsIdentityShuffle =
15163 ((UseVecBaseAsInput ||
15165 [](
const std::optional<TTI::ShuffleKind> &SK) {
15169none_of(ExtractMask, [&](
intI) {
returnI >= EMSz; }) &&
15171 (!GatherShuffles.
empty() &&
15173 [](
const std::optional<TTI::ShuffleKind> &SK) {
15177none_of(Mask, [&](
intI) {
returnI >= MSz; }) &&
15179bool EnoughConstsForShuffle =
15183return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15187return isa<Constant>(V) && !isa<UndefValue>(V);
15189 (!IsIdentityShuffle ||
15190 (GatheredScalars.size() == 2 &&
15192 [](
Value *V) {
return !isa<UndefValue>(V); })) ||
15194return isa<Constant>(V) && !isa<PoisonValue>(V);
15196// NonConstants array contains just non-constant values, GatheredScalars 15197// contains only constant to build final vector and then shuffle. 15198for (
intI = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
15199if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[
I]))
15204// Generate constants for final shuffle and build a mask for them. 15205if (!
all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15207 TryPackScalars(GatheredScalars, BVMask,
/*IsRootPoison=*/true);
15208Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15209 ShuffleBuilder.add(BV, BVMask);
15212return isa<PoisonValue>(V) ||
15213 (IsSingleShuffle && ((IsIdentityShuffle &&
15214 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15216 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15219 Res = ShuffleBuilder.finalize(
15220 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15222 TryPackScalars(NonConstants, Mask,
/*IsRootPoison=*/false);
15223 Vec = ShuffleBuilder.gather(NonConstants,
Mask.size(), Vec);
15226// Gather unique scalars and all constants. 15228 TryPackScalars(GatheredScalars, ReuseMask,
/*IsRootPoison=*/true);
15229Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
15230 ShuffleBuilder.add(BV, ReuseMask);
15231 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15234// Gather all constants. 15236for (
auto [
I, V] :
enumerate(GatheredScalars)) {
15237if (!isa<PoisonValue>(V))
15240Value *BV = ShuffleBuilder.gather(GatheredScalars);
15241 ShuffleBuilder.add(BV, Mask);
15242 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15247 Res = ShuffleBuilder.createFreeze(Res);
15251Value *BoUpSLP::createBuildVector(
const TreeEntry *E,
Type *ScalarTy,
15252bool PostponedPHIs) {
15253for (
auto [EIdx,
_] : E->CombinedEntriesWithIndices)
15255return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15259/// \returns \p I after propagating metadata from \p VL only for instructions in 15264if (isa<Instruction>(V))
15272if (E->VectorizedValue &&
15273 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15274 E->isAltShuffle())) {
15275LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *E->Scalars[0] <<
".\n");
15276return E->VectorizedValue;
15279Value *
V = E->Scalars.front();
15280Type *ScalarTy =
V->getType();
15281if (!isa<CmpInst>(V))
15283auto It = MinBWs.
find(E);
15284if (It != MinBWs.
end()) {
15285auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15291if (E->isGather()) {
15292// Set insert point for non-reduction initial nodes. 15293if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
15294 setInsertPointAfterBundle(E);
15295Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15296 E->VectorizedValue = Vec;
15300bool IsReverseOrder =
15302auto FinalShuffle = [&](
Value *
V,
const TreeEntry *E) {
15303 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *
this);
15304if (E->getOpcode() == Instruction::Store &&
15305 E->State == TreeEntry::Vectorize) {
15307ArrayRef(
reinterpret_cast<constint *
>(E->ReorderIndices.begin()),
15308 E->ReorderIndices.size());
15309 ShuffleBuilder.add(V, Mask);
15310 }
elseif (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15311 ShuffleBuilder.addOrdered(V, {});
15313 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15316 E->CombinedEntriesWithIndices.size());
15318 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
constauto &
P) {
15319 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15322 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15323"Expected either combined subnodes or reordering");
15324return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15327assert(!E->isGather() &&
"Unhandled state");
15328unsigned ShuffleOrOp =
15329 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
15331auto GetOperandSignedness = [&](
unsignedIdx) {
15332const TreeEntry *OpE = getOperandEntry(E,
Idx);
15333bool IsSigned =
false;
15334auto It = MinBWs.
find(OpE);
15335if (It != MinBWs.
end())
15336 IsSigned = It->second.second;
15339 if (isa<PoisonValue>(V))
15341 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15345switch (ShuffleOrOp) {
15346case Instruction::PHI: {
15347assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15348 E != VectorizableTree.front().get() ||
15349 !E->UserTreeIndices.empty()) &&
15350"PHI reordering is free.");
15351if (PostponedPHIs && E->VectorizedValue)
15352return E->VectorizedValue;
15353auto *PH = cast<PHINode>(VL0);
15355 PH->getParent()->getFirstNonPHIIt());
15357if (PostponedPHIs || !E->VectorizedValue) {
15362// Adjust insertion point once all PHI's have been generated. 15364 PH->getParent()->getFirstInsertionPt());
15367V = FinalShuffle(V, E);
15369 E->VectorizedValue =
V;
15373PHINode *NewPhi = cast<PHINode>(E->PHI);
15374// If phi node is fully emitted - exit. 15378// PHINodes may have multiple entries from the same block. We want to 15379// visit every block once. 15382for (
unsignedI : seq<unsigned>(0, PH->getNumIncomingValues())) {
15386// Stop emission if all incoming values are generated. 15392if (!VisitedBBs.
insert(IBB).second) {
15399Value *Vec = vectorizeOperand(E,
I,
/*PostponedPHIs=*/true);
15400if (VecTy != Vec->
getType()) {
15402 MinBWs.
contains(getOperandEntry(E,
I))) &&
15403"Expected item in MinBWs.");
15404 Vec = Builder.
CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
15410"Invalid number of incoming values");
15411assert(E->VectorizedValue &&
"Expected vectorized value.");
15412return E->VectorizedValue;
15415case Instruction::ExtractElement: {
15416Value *
V = E->getSingleOperand(0);
15417if (
const TreeEntry *TE = getTreeEntry(V))
15418V =
TE->VectorizedValue;
15419 setInsertPointAfterBundle(E);
15420V = FinalShuffle(V, E);
15421 E->VectorizedValue =
V;
15424case Instruction::ExtractValue: {
15425auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15430 NewV = FinalShuffle(NewV, E);
15431 E->VectorizedValue = NewV;
15434case Instruction::InsertElement: {
15435assert(E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
15437Value *
V = vectorizeOperand(E, 1, PostponedPHIs);
15439Type *ScalarTy =
Op.front()->getType();
15440if (cast<VectorType>(
V->getType())->getElementType() != ScalarTy) {
15442 std::pair<unsigned, bool> Res = MinBWs.
lookup(getOperandEntry(E, 1));
15443assert(Res.first > 0 &&
"Expected item in MinBWs.");
15448 cast<FixedVectorType>(
V->getType())->getNumElements()),
15452// Create InsertVector shuffle if necessary 15453auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
15454 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15456constunsigned NumElts =
15457 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15458constunsigned NumScalars = E->Scalars.size();
15461assert(
Offset < NumElts &&
"Failed to find vector index offset");
15463// Create shuffle to resize vector 15465if (!E->ReorderIndices.empty()) {
15470 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
15472// Create InsertVector shuffle if necessary 15473bool IsIdentity =
true;
15475Mask.swap(PrevMask);
15476for (
unsignedI = 0;
I < NumScalars; ++
I) {
15479 IsIdentity &= InsertIdx -
Offset ==
I;
15482if (!IsIdentity || NumElts != NumScalars) {
15484bool IsVNonPoisonous =
15487if (NumElts != NumScalars &&
Offset == 0) {
15488// Follow all insert element instructions from the current buildvector 15496 InsertMask[*InsertIdx] = *InsertIdx;
15497if (!
Ins->hasOneUse())
15499Ins = dyn_cast_or_null<InsertElementInst>(
15500Ins->getUniqueUndroppableUser());
15503buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15505 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15508if (!IsFirstPoison.
all()) {
15510for (
unsignedI = 0;
I < NumElts;
I++) {
15512 IsFirstUndef.
test(
I)) {
15513if (IsVNonPoisonous) {
15514 InsertMask[
I] =
I < NumScalars ?
I : 0;
15519if (
Idx >= NumScalars)
15520Idx = NumScalars - 1;
15521 InsertMask[
I] = NumScalars +
Idx;
15535if (
auto *
I = dyn_cast<Instruction>(V)) {
15536 GatherShuffleExtractSeq.
insert(
I);
15537 CSEBlocks.
insert(
I->getParent());
15542for (
unsignedI = 0;
I < NumElts;
I++) {
15547buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15550if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
15551 NumElts != NumScalars) {
15552if (IsFirstUndef.
all()) {
15555 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15556if (!IsFirstPoison.
all()) {
15557for (
unsignedI = 0;
I < NumElts;
I++) {
15559 InsertMask[
I] =
I + NumElts;
15566 InsertMask, cast<Instruction>(E->Scalars.back())->
getName());
15567if (
auto *
I = dyn_cast<Instruction>(V)) {
15568 GatherShuffleExtractSeq.
insert(
I);
15569 CSEBlocks.
insert(
I->getParent());
15574 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15575for (
unsignedI = 0;
I < NumElts;
I++) {
15579 InsertMask[
I] += NumElts;
15582 FirstInsert->getOperand(0), V, InsertMask,
15583 cast<Instruction>(E->Scalars.back())->getName());
15584if (
auto *
I = dyn_cast<Instruction>(V)) {
15585 GatherShuffleExtractSeq.
insert(
I);
15586 CSEBlocks.
insert(
I->getParent());
15591 ++NumVectorInstructions;
15592 E->VectorizedValue =
V;
15595case Instruction::ZExt:
15596case Instruction::SExt:
15597case Instruction::FPToUI:
15598case Instruction::FPToSI:
15599case Instruction::FPExt:
15600case Instruction::PtrToInt:
15601case Instruction::IntToPtr:
15602case Instruction::SIToFP:
15603case Instruction::UIToFP:
15604case Instruction::Trunc:
15605case Instruction::FPTrunc:
15606case Instruction::BitCast: {
15607 setInsertPointAfterBundle(E);
15609Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15610if (E->VectorizedValue) {
15612return E->VectorizedValue;
15615auto *CI = cast<CastInst>(VL0);
15617Type *SrcScalarTy = cast<VectorType>(InVec->
getType())->getElementType();
15618auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
15620 (SrcIt != MinBWs.
end() || It != MinBWs.
end() ||
15622// Check if the values are candidates to demote. 15623unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
15624if (SrcIt != MinBWs.
end())
15625 SrcBWSz = SrcIt->second.first;
15627if (BWSz == SrcBWSz) {
15628 VecOpcode = Instruction::BitCast;
15629 }
elseif (BWSz < SrcBWSz) {
15630 VecOpcode = Instruction::Trunc;
15631 }
elseif (It != MinBWs.
end()) {
15632assert(BWSz > SrcBWSz &&
"Invalid cast!");
15633 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15634 }
elseif (SrcIt != MinBWs.
end()) {
15635assert(BWSz > SrcBWSz &&
"Invalid cast!");
15637 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15639 }
elseif (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
15640 !SrcIt->second.second) {
15641 VecOpcode = Instruction::UIToFP;
15643Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15645 : Builder.
CreateCast(VecOpcode, InVec, VecTy);
15646V = FinalShuffle(V, E);
15648 E->VectorizedValue =
V;
15649 ++NumVectorInstructions;
15652case Instruction::FCmp:
15653case Instruction::ICmp: {
15654 setInsertPointAfterBundle(E);
15656Value *
L = vectorizeOperand(E, 0, PostponedPHIs);
15657if (E->VectorizedValue) {
15659return E->VectorizedValue;
15661Value *
R = vectorizeOperand(E, 1, PostponedPHIs);
15662if (E->VectorizedValue) {
15664return E->VectorizedValue;
15666if (
L->getType() !=
R->getType()) {
15668 getOperandEntry(E, 1)->
isGather() ||
15669 MinBWs.
contains(getOperandEntry(E, 0)) ||
15670 MinBWs.
contains(getOperandEntry(E, 1))) &&
15671"Expected item in MinBWs.");
15672if (cast<VectorType>(
L->getType())
15674 ->getIntegerBitWidth() < cast<VectorType>(
R->getType())
15676 ->getIntegerBitWidth()) {
15677Type *CastTy =
R->getType();
15680Type *CastTy =
L->getType();
15688if (
auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.
end())
15689 ICmp->setSameSign(
/*B=*/false);
15690// Do not cast for cmps. 15691 VecTy = cast<FixedVectorType>(
V->getType());
15692V = FinalShuffle(V, E);
15694 E->VectorizedValue =
V;
15695 ++NumVectorInstructions;
15698case Instruction::Select: {
15699 setInsertPointAfterBundle(E);
15701Value *
Cond = vectorizeOperand(E, 0, PostponedPHIs);
15702if (E->VectorizedValue) {
15704return E->VectorizedValue;
15706Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15707if (E->VectorizedValue) {
15709return E->VectorizedValue;
15711Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15712if (E->VectorizedValue) {
15714return E->VectorizedValue;
15718 getOperandEntry(E, 2)->
isGather() ||
15719 MinBWs.
contains(getOperandEntry(E, 1)) ||
15720 MinBWs.
contains(getOperandEntry(E, 2))) &&
15721"Expected item in MinBWs.");
15723 True = Builder.
CreateIntCast(True, VecTy, GetOperandSignedness(1));
15724if (False->
getType() != VecTy)
15725 False = Builder.
CreateIntCast(False, VecTy, GetOperandSignedness(2));
15730assert(TrueNumElements >= CondNumElements &&
15731 TrueNumElements % CondNumElements == 0 &&
15732"Cannot vectorize Instruction::Select");
15734"Cannot vectorize Instruction::Select");
15735if (CondNumElements != TrueNumElements) {
15736// When the return type is i1 but the source is fixed vector type, we 15737// need to duplicate the condition value. 15743"Cannot vectorize Instruction::Select");
15745V = FinalShuffle(V, E);
15747 E->VectorizedValue =
V;
15748 ++NumVectorInstructions;
15751case Instruction::FNeg: {
15752 setInsertPointAfterBundle(E);
15754Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
15756if (E->VectorizedValue) {
15758return E->VectorizedValue;
15764if (
auto *
I = dyn_cast<Instruction>(V))
15767V = FinalShuffle(V, E);
15769 E->VectorizedValue =
V;
15770 ++NumVectorInstructions;
15774case Instruction::Freeze: {
15775 setInsertPointAfterBundle(E);
15777Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
15779if (E->VectorizedValue) {
15781return E->VectorizedValue;
15784if (
Op->getType() != VecTy) {
15786 MinBWs.
contains(getOperandEntry(E, 0))) &&
15787"Expected item in MinBWs.");
15791V = FinalShuffle(V, E);
15793 E->VectorizedValue =
V;
15794 ++NumVectorInstructions;
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      setInsertPointAfterBundle(E);

      Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        return E->VectorizedValue;
      }
      Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
      if (E->VectorizedValue) {
        return E->VectorizedValue;
      }
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          ArrayRef<Value *> Ops = E->getOperand(I);
          if (all_of(Ops, [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              })) {
            Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
            E->VectorizedValue = V;
            ++NumVectorInstructions;
            return V;
          }
        }
      }
      if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
        assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
                getOperandEntry(E, 1)->isGather() ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        if (LHS->getType() != VecTy)
          LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
        if (RHS->getType() != VecTy)
          RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
      }

      Value *V = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        // Drop nuw flags for abs(sub(commutative), true).
        if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
            any_of(E->Scalars, [](Value *V) {
              return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      }

      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
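    // Loads are emitted according to the tree entry state: consecutive
    // (Vectorize) loads become a single wide load, StridedVectorize loads are
    // lowered to llvm.experimental.vp.strided.load, and ScatterVectorize loads
    // become a masked gather over a vector of pointers.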
    case Instruction::Load: {
      // Loads are inserted at the head of the tree because we don't want to
      // sink them all the way down past store instructions.
      setInsertPointAfterBundle(E);

      LoadInst *LI = cast<LoadInst>(VL0);
      Instruction *NewLI;
      Value *PO = LI->getPointerOperand();
      if (E->State == TreeEntry::Vectorize) {
        NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
      } else if (E->State == TreeEntry::StridedVectorize) {
        Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
        Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
        PO = IsReverseOrder ? PtrN : Ptr0;
        Type *StrideTy = DL->getIndexType(PO->getType());
        Value *StrideVal;
        if (Diff) {
          int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
          StrideVal =
              ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                             DL->getTypeAllocSize(ScalarTy));
        } else {
          SmallVector<Value *> PointerOps(E->Scalars.size());
          transform(E->Scalars, PointerOps.begin(), [](Value *V) {
            return cast<LoadInst>(V)->getPointerOperand();
          });
          std::optional<Value *> Stride =
              calculateRtStride(PointerOps, ScalarTy, *DL, *SE,
                                &*Builder.GetInsertPoint());
          Value *NewStride =
              Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
          StrideVal = Builder.CreateMul(
              NewStride,
              ConstantInt::get(
                  StrideTy,
                  (IsReverseOrder ? -1 : 1) *
                      static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
        }
        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
        NewLI = Builder.CreateIntrinsic(
            Intrinsic::experimental_vp_strided_load,
            {VecTy, PO->getType(), StrideTy},
            {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
             Builder.getInt32(E->Scalars.size())});
      } else {
        assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
        Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          return E->VectorizedValue;
        }
        if (isa<FixedVectorType>(ScalarTy)) {
          // CreateMaskedGather expects VecTy and VecPtr have the same size. We
          // need to expand VecPtr if ScalarTy is a vector type.
          unsigned ScalarTyNumElements =
              cast<FixedVectorType>(ScalarTy)->getNumElements();
          unsigned VecTyNumElements =
              cast<FixedVectorType>(VecTy)->getNumElements();
          assert(VecTyNumElements % ScalarTyNumElements == 0 &&
                 "Cannot expand getelementptr.");
          unsigned VF = VecTyNumElements / ScalarTyNumElements;
          SmallVector<Constant *> Indices(VecTyNumElements);
          transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
            return Builder.getInt64(I % ScalarTyNumElements);
          });
          VecPtr = Builder.CreateGEP(
              VecTy->getElementType(),
              Builder.CreateShuffleVector(
                  VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
              ConstantVector::get(Indices));
        }
        // Use the minimum alignment of the gathered loads.
        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
        NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
      }
      Value *V = ::propagateMetadata(NewLI, E->Scalars);

      V = FinalShuffle(V, E);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
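    // Stores mirror the load handling: consecutive stores become one wide
    // store, while strided stores are lowered to
    // llvm.experimental.vp.strided.store with an element stride derived from
    // the scalar type's allocation size.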
    case Instruction::Store: {
      auto *SI = cast<StoreInst>(VL0);

      setInsertPointAfterBundle(E);

      Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
      if (VecValue->getType() != VecTy)
        VecValue =
            Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
      VecValue = FinalShuffle(VecValue, E);

      Value *Ptr = SI->getPointerOperand();
      Instruction *ST;
      if (E->State == TreeEntry::Vectorize) {
        ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
      } else {
        assert(E->State == TreeEntry::StridedVectorize &&
               "Expected either strided or consecutive stores.");
        if (!E->ReorderIndices.empty()) {
          SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
          Ptr = SI->getPointerOperand();
        }
        Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
        Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
        auto *Inst = Builder.CreateIntrinsic(
            Intrinsic::experimental_vp_strided_store,
            {VecTy, Ptr->getType(), StrideTy},
            {VecValue, Ptr,
             ConstantInt::get(
                 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
             Builder.getAllOnesMask(VecTy->getElementCount()),
             Builder.getInt32(E->Scalars.size())});
        Inst->addParamAttr(
            /*ArgNo=*/1,
            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
        ST = Inst;
      }

      Value *V = ::propagateMetadata(ST, E->Scalars);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
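    // GEPs are vectorized operand by operand; the resulting vector GEP keeps
    // the source element type of the first scalar GEP, and metadata is only
    // propagated from the scalars that really are getelementptr instructions.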
    case Instruction::GetElementPtr: {
      auto *GEP0 = cast<GetElementPtrInst>(VL0);
      setInsertPointAfterBundle(E);

      Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        return E->VectorizedValue;
      }

      SmallVector<Value *> OpVecs;
      for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
        Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
        if (E->VectorizedValue) {
          return E->VectorizedValue;
        }
        OpVecs.push_back(OpVec);
      }

      Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
      if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
        SmallVector<Value *> GEPs;
        for (Value *V : E->Scalars) {
          if (isa<GetElementPtrInst>(V))
            GEPs.push_back(V);
        }
        V = ::propagateMetadata(I, GEPs);
      }

      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
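    // Calls are emitted either as a vector intrinsic or as a vector library
    // function (found via VFDatabase), whichever the cost model considered
    // cheaper; intrinsic operands that must stay scalar are passed through
    // unvectorized.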
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      setInsertPointAfterBundle(E);

      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                          VecCallCosts.first <= VecCallCosts.second;

      Value *ScalarArg = nullptr;
      SmallVector<Value *> OpVecs;
      SmallVector<Type *, 2> TysForDecl;
      // Add return type if intrinsic is overloaded on it.
      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
        TysForDecl.push_back(VecTy);
      auto *CEI = cast<CallInst>(VL0);
      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
        // Some intrinsics have scalar arguments. This argument should not be
        // vectorized.
        if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
          ScalarArg = CEI->getArgOperand(I);
          // If we decided to reduce the bitwidth of the abs intrinsic, its
          // second argument must be set to false (do not return poison if the
          // value is the signed minimum).
          if (ID == Intrinsic::abs && It != MinBWs.end() &&
              It->second.first < DL->getTypeSizeInBits(CEI->getType()))
            ScalarArg = Builder.getFalse();
          OpVecs.push_back(ScalarArg);
          continue;
        }

        Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
        if (E->VectorizedValue) {
          return E->VectorizedValue;
        }
        ScalarArg = CEI->getArgOperand(I);
        if (cast<VectorType>(OpVec->getType())->getElementType() !=
                ScalarArg->getType()->getScalarType() &&
            It == MinBWs.end()) {
          auto *CastTy =
              getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
          OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
        } else if (It != MinBWs.end()) {
          OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
        }
        OpVecs.push_back(OpVec);
      }

      Function *CF;
      if (!UseIntrinsic) {
        VFShape Shape = VFShape::get(
            CI->getFunctionType(),
            ElementCount::getFixed(VecTy->getNumElements()),
            false /*HasGlobalPred*/);
        CF = VFDatabase(*CI).getVectorizedFunction(Shape);
      } else {
        CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
      }

      Value *V = Builder.CreateCall(CF, OpVecs);
      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
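    // ShuffleVector entries cover two cases: plain shufflevector instructions
    // (when REVEC is enabled) and alternate-opcode nodes, where two vector
    // instructions are emitted and then blended with a mask produced by
    // buildAltOpShuffleMask.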
    case Instruction::ShuffleVector: {
      Value *V;
      if (SLPReVec && !E->isAltShuffle()) {
        setInsertPointAfterBundle(E);
        Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          return E->VectorizedValue;
        }
        SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
        if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
          assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
                 "Not supported shufflevector usage.");
          SmallVector<int> NewMask(ThisMask.size());
          transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
            return SVSrc->getShuffleMask()[Mask];
          });
          V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
        } else {
          V = Builder.CreateShuffleVector(Src, ThisMask);
        }
        if (auto *I = dyn_cast<Instruction>(V))
          V = ::propagateMetadata(I, E->Scalars);
        V = FinalShuffle(V, E);
      } else {
        assert(E->isAltShuffle() &&
               ((Instruction::isBinaryOp(E->getOpcode()) &&
                 Instruction::isBinaryOp(E->getAltOpcode())) ||
                (Instruction::isCast(E->getOpcode()) &&
                 Instruction::isCast(E->getAltOpcode())) ||
                (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
               "Invalid Shuffle Vector Operand");
        Value *LHS = nullptr, *RHS = nullptr;
        if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0, PostponedPHIs);
          if (E->VectorizedValue) {
            return E->VectorizedValue;
          }
          RHS = vectorizeOperand(E, 1, PostponedPHIs);
        } else {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0, PostponedPHIs);
        }
        if (E->VectorizedValue) {
          return E->VectorizedValue;
        }
        if (LHS && RHS &&
            ((Instruction::isBinaryOp(E->getOpcode()) &&
              (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
             (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
          assert((It != MinBWs.end() ||
                  getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                  getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                  MinBWs.contains(getOperandEntry(E, 0)) ||
                  MinBWs.contains(getOperandEntry(E, 1))) &&
                 "Expected item in MinBWs.");
          Type *CastTy = VecTy;
          if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
            if (cast<VectorType>(LHS->getType())
                    ->getElementType()
                    ->getIntegerBitWidth() <
                cast<VectorType>(RHS->getType())
                    ->getElementType()
                    ->getIntegerBitWidth())
              CastTy = RHS->getType();
            else
              CastTy = LHS->getType();
          }
          if (LHS->getType() != CastTy)
            LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
          if (RHS->getType() != CastTy)
            RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
        }

        Value *V0, *V1;
        if (Instruction::isBinaryOp(E->getOpcode())) {
          V0 = Builder.CreateBinOp(
              static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
          V1 = Builder.CreateBinOp(
              static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
        } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
          V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
          auto *AltCI = cast<CmpInst>(E->getAltOp());
          CmpInst::Predicate AltPred = AltCI->getPredicate();
          V1 = Builder.CreateCmp(AltPred, LHS, RHS);
        } else {
          if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
            unsigned SrcBWSz = DL->getTypeSizeInBits(
                cast<VectorType>(LHS->getType())->getElementType());
            unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
            if (BWSz <= SrcBWSz) {
              if (BWSz < SrcBWSz)
                LHS = Builder.CreateIntCast(LHS, VecTy, It->second.second);
              assert(LHS->getType() == VecTy &&
                     "Expected same type as operand.");
              if (auto *I = dyn_cast<Instruction>(LHS))
                LHS = ::propagateMetadata(I, E->Scalars);
              LHS = FinalShuffle(LHS, E);
              E->VectorizedValue = LHS;
              ++NumVectorInstructions;
              return LHS;
            }
          }
          V0 = Builder.CreateCast(
              static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
          V1 = Builder.CreateCast(
              static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
        }
        // Add V0 and V1 to later analysis to try to find and remove matching
        // instruction, if any.
        for (Value *V : {V0, V1}) {
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }

        // Create shuffle to take alternate operations from the vector.
        // Also, gather up main and alt scalar ops to propagate IR flags to
        // each vector operation.
        SmallVector<int> Mask;
        SmallVector<Value *> OpScalars, AltScalars;
        E->buildAltOpShuffleMask(
            [E, this](Instruction *I) {
              assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
              return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                            *TLI);
            },
            Mask, &OpScalars, &AltScalars);

        auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
          // Drop nuw flags for abs(sub(commutative), true).
          if (auto *I = dyn_cast<Instruction>(Vec);
              I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
              any_of(E->Scalars, [](Value *V) {
                if (isa<PoisonValue>(V))
                  return false;
                auto *IV = cast<Instruction>(V);
                return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
              }))
            I->setHasNoUnsignedWrap(/*b=*/false);
        };
        DropNuwFlag(V0, E->getOpcode());
        DropNuwFlag(V1, E->getAltOpcode());

        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
        }
        V = Builder.CreateShuffleVector(V0, V1, Mask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          V = ::propagateMetadata(I, E->Scalars);
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
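  // What follows drives the per-entry emission above for the whole graph:
  // schedule all blocks, emit gathered loads and postponed PHI operands,
  // materialize postponed gather nodes, extract externally used scalars and
  // finally erase the now-dead scalar instructions.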
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
  // need to rebuild it.
  EntryToLastInstruction.clear();
  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst &&
        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
      assert((!TE->UserTreeIndices.empty() ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
    }
  }
  // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
  (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the temp
  // emitted allocas with actual vector instructions.
  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found gather node which is absolutely the same as one of the
        // vectorized nodes. It may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were some
    // unresolved dependencies at the moment when the stub instruction was
    // emitted. In a case when any of these dependencies turn out to be an
    // operand of another PHI, coming from this same block, the position of the
    // stub instruction would become invalid. This is because the source vector
    // that was supposed to feed this gather node was inserted at the end of
    // the block [after the stub instruction]. So we need to adjust the
    // insertion point again, to the end of the block.
    if (isa<PHINode>(UserI)) {
      // Insert before all users.
      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
16385if (
auto *VecI = dyn_cast<Instruction>(Vec);
16390if (Vec->
getType() != PrevVec->getType()) {
16392 PrevVec->getType()->isIntOrIntVectorTy() &&
16393"Expected integer vector types only.");
16394 std::optional<bool> IsSigned;
16395for (
Value *V : TE->Scalars) {
16396if (
const TreeEntry *BaseTE = getTreeEntry(V)) {
16397auto It = MinBWs.
find(BaseTE);
16398if (It != MinBWs.
end()) {
16399 IsSigned = IsSigned.value_or(
false) || It->second.second;
16403for (
const TreeEntry *MNTE : MultiNodeScalars.
lookup(V)) {
16404auto It = MinBWs.
find(MNTE);
16405if (It != MinBWs.
end()) {
16406 IsSigned = IsSigned.value_or(
false) || It->second.second;
16411if (IsSigned.value_or(
false))
16413// Scan through gather nodes. 16414for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16415auto It = MinBWs.
find(BVE);
16416if (It != MinBWs.
end()) {
16417 IsSigned = IsSigned.value_or(
false) || It->second.second;
16422if (IsSigned.value_or(
false))
16424if (
auto *EE = dyn_cast<ExtractElementInst>(V)) {
16426 IsSigned.value_or(
false) ||
16430if (IsSigned.value_or(
false))
16434if (IsSigned.value_or(
false)) {
16435// Final attempt - check user node. 16436auto It = MinBWs.
find(TE->UserTreeIndices.front().UserTE);
16437if (It != MinBWs.
end())
16438 IsSigned = It->second.second;
16441"Expected user node or perfect diamond match in MinBWs.");
16445 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
16446// Replace the stub vector node, if it was used before for one of the 16447// buildvector nodes already. 16448auto It = PostponedValues.
find(PrevVec);
16449if (It != PostponedValues.
end()) {
16450for (TreeEntry *VTE : It->getSecond())
16451 VTE->VectorizedValue = Vec;
  // Maps vector instruction to original insertelement instruction.
  // Maps extract Scalar to the corresponding extractelement instruction in the
  // basic block. Only one extractelement per block should be emitted.
  // Extract all of the elements with the external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUW. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
16482// Non-instruction pointers are not deleted, just skip them. 16483if (E->getOpcode() == Instruction::GetElementPtr &&
16484 !isa<GetElementPtrInst>(Scalar))
16487Value *Vec = E->VectorizedValue;
16488assert(Vec &&
"Can't find vectorizable value");
16491auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
16492if (Scalar->getType() != Vec->
getType()) {
16494Value *ExV =
nullptr;
16495auto *Inst = dyn_cast<Instruction>(Scalar);
16496bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.
contains(Inst);
16497auto It = ScalarToEEs.
find(Scalar);
16498if (It != ScalarToEEs.
end()) {
16499// No need to emit many extracts, just move the only one in the 16501auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16503if (EEIt != It->second.end()) {
16504Value *PrevV = EEIt->second.first;
16505if (
auto *
I = dyn_cast<Instruction>(PrevV);
16506I && !ReplaceInst &&
16511if (
auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16515 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16519// "Reuse" the existing extract to improve final codegen. 16521// Leave the instruction as is, if it cheaper extracts and all 16522// operands are scalar. 16523if (
auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16524 IgnoredExtracts.
insert(EE);
16527auto *CloneInst = Inst->clone();
16528 CloneInst->insertBefore(Inst->getIterator());
16529if (Inst->hasName())
        } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                   ES && isa<Instruction>(Vec)) {
16535Value *V = ES->getVectorOperand();
16536auto *IVec = cast<Instruction>(Vec);
16537if (
const TreeEntry *ETE = getTreeEntry(V))
16538 V = ETE->VectorizedValue;
16539if (
auto *
IV = dyn_cast<Instruction>(V);
16540 !
IV ||
IV == Vec ||
IV->getParent() != IVec->getParent() ||
16541IV->comesBefore(IVec))
        } else if (auto *VecTy =
                       dyn_cast<FixedVectorType>(Scalar->getType())) {
          // When REVEC is enabled, we need to extract a vector.
          // Note: The element size of Scalar may be different from the
          // element size of Vec.
                                   ExternalUse.Lane * VecTyNumElements);
16557// If necessary, sign-extend or zero-extend ScalarRoot 16558// to the larger type. 16560if (Scalar->getType() != Ex->
getType())
16562 Ex, Scalar->getType(),
16564auto *
I = dyn_cast<Instruction>(Ex);
16566 : &
F->getEntryBlock(),
16567 std::make_pair(Ex, ExV));
16569// The then branch of the previous if may produce constants, since 0 16570// operand might be a constant. 16571if (
auto *ExI = dyn_cast<Instruction>(Ex);
16573 GatherShuffleExtractSeq.
insert(ExI);
16574 CSEBlocks.
insert(ExI->getParent());
16578assert(isa<FixedVectorType>(Scalar->getType()) &&
16579 isa<InsertElementInst>(Scalar) &&
16580"In-tree scalar of vector type is not insertelement?");
16581auto *IE = cast<InsertElementInst>(Scalar);
16585// If User == nullptr, the Scalar remains as scalar in vectorized 16586// instructions or is used as extra arg. Generate ExtractElement instruction 16587// and update the record for this scalar in ExternallyUsedValues. 16589if (!ScalarsWithNullptrUser.
insert(Scalar).second)
16593 ExternalUsesAsOriginalScalar.
contains(Scalar) ||
16596 if (ExternalUsesAsOriginalScalar.contains(U))
16598 TreeEntry *UseEntry = getTreeEntry(U);
16600 (UseEntry->State == TreeEntry::Vectorize ||
16602 TreeEntry::StridedVectorize) &&
16603 (E->State == TreeEntry::Vectorize ||
16604 E->State == TreeEntry::StridedVectorize) &&
16605 doesInTreeUserNeedToExtract(
16606 Scalar, getRootEntryInstruction(*UseEntry),
16609"Scalar with nullptr User must be registered in " 16610"ExternallyUsedValues map or remain as scalar in vectorized " 16612if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
16613if (
auto *
PHI = dyn_cast<PHINode>(VecI)) {
16614if (
PHI->getParent()->isLandingPad())
16618PHI->getParent()->getLandingPadInst()->getIterator()));
16621PHI->getParent()->getFirstNonPHIIt());
16624 std::next(VecI->getIterator()));
16629Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16630// Required to update internally referenced instructions. 16631if (Scalar != NewInst) {
16632assert((!isa<ExtractElementInst>(Scalar) ||
16633 !IgnoredExtracts.
contains(cast<ExtractElementInst>(Scalar))) &&
16634"Extractelements should not be replaced.");
16635 Scalar->replaceAllUsesWith(NewInst);
16640if (
auto *VU = dyn_cast<InsertElementInst>(
User);
16642// Skip if the scalar is another vector op or Vec is not an instruction. 16643if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16644if (
auto *FTy = dyn_cast<FixedVectorType>(
User->
getType())) {
16645if (!UsedInserts.
insert(VU).second)
16647// Need to use original vector, if the root is truncated. 16648auto BWIt = MinBWs.
find(E);
16650auto *ScalarTy = FTy->getElementType();
16651auto Key = std::make_pair(Vec, ScalarTy);
16652auto VecIt = VectorCasts.
find(Key);
16653if (VecIt == VectorCasts.
end()) {
16655if (
auto *IVec = dyn_cast<PHINode>(Vec)) {
16656if (IVec->getParent()->isLandingPad())
16658 std::next(IVec->getParent()
16659 ->getLandingPadInst()
16663 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
16671 cast<FixedVectorType>(Vec->
getType())->getNumElements()),
16672 BWIt->second.second);
16675 Vec = VecIt->second;
16682 ShuffledInserts, [VU](
const ShuffledInsertData<Value *> &
Data) {
16683// Checks if 2 insertelements are from the same buildvector. 16689unsignedIdx = *InsertIdx;
16690if (It == ShuffledInserts.
end()) {
16692 It = std::next(ShuffledInserts.
begin(),
16693 ShuffledInserts.
size() - 1);
16698 Mask[
Idx] = ExternalUse.Lane;
16699 It->InsertElements.push_back(cast<InsertElementInst>(
User));
16706// Generate extracts for out-of-tree users. 16707// Find the insertion point for the extractelement lane. 16708if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
16710for (
unsignedI : seq<unsigned>(0, PH->getNumIncomingValues())) {
16711if (PH->getIncomingValue(
I) == Scalar) {
16713 PH->getIncomingBlock(
I)->getTerminator();
16714if (isa<CatchSwitchInst>(IncomingTerminator)) {
16716 std::next(VecI->getIterator()));
16720Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16721 PH->setOperand(
I, NewInst);
16726Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16731Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16741int VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
16742for (
intI = 0, E = Mask.size();
I < E; ++
I) {
16744 CombinedMask1[
I] = Mask[
I];
16746 CombinedMask2[
I] = Mask[
I] - VF;
16749 cast<VectorType>(V1->
getType())->getElementType(), Builder, *
this);
16750 ShuffleBuilder.
add(V1, CombinedMask1);
16752 ShuffleBuilder.
add(V2, CombinedMask2);
16753return ShuffleBuilder.
finalize({}, {}, {});
16757bool ForSingleMask) {
16758unsigned VF = Mask.size();
16759unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
16761if (
any_of(Mask, [VF](
intIdx) {
returnIdx >=
static_cast<int>(VF); })) {
16762 Vec = CreateShuffle(Vec,
nullptr, Mask);
16763return std::make_pair(Vec,
true);
16765if (!ForSingleMask) {
16767for (
unsignedI = 0;
I < VF; ++
I) {
16769 ResizeMask[Mask[
I]] = Mask[
I];
16771 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
16775return std::make_pair(Vec,
false);
16777// Perform shuffling of the vectorize tree entries for better handling of 16778// external extracts. 16779for (
intI = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
16780// Find the first and the last instruction in the list of insertelements. 16785autoVector = ShuffledInserts[
I].ValueMasks.takeVector();
16786Value *NewInst = performExtractsShuffleAction<Value>(
16790 return cast<VectorType>(Vec->getType())
16791 ->getElementCount()
16792 .getKnownMinValue();
16797 assert((Vals.size() == 1 || Vals.size() == 2) &&
16798"Expected exactly 1 or 2 input values.");
16799 if (Vals.size() == 1) {
16800// Do not create shuffle if the mask is a simple identity 16801// non-resizing mask. 16802 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16803 ->getNumElements() ||
16804 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16805 return CreateShuffle(Vals.front(), nullptr, Mask);
16806 return Vals.front();
16808return CreateShuffle(Vals.
front() ? Vals.
front()
16810 Vals.
back(), Mask);
16812auto It = ShuffledInserts[
I].InsertElements.
rbegin();
16813// Rebuild buildvector chain. 16815if (It != ShuffledInserts[
I].InsertElements.
rend())
16818while (It != ShuffledInserts[
I].InsertElements.
rend()) {
16819assert(
II &&
"Must be an insertelement instruction.");
16824II = dyn_cast<InsertElementInst>(
II->getOperand(0));
16827II->replaceUsesOfWith(
II->getOperand(0), NewInst);
16828if (
auto *NewI = dyn_cast<Instruction>(NewInst))
16829if (
II->getParent() == NewI->getParent() &&
II->comesBefore(NewI))
16830II->moveAfter(NewI);
16833 LastInsert->replaceAllUsesWith(NewInst);
16835 IE->replaceUsesOfWith(IE->getOperand(0),
16837 IE->replaceUsesOfWith(IE->getOperand(1),
16841 CSEBlocks.
insert(LastInsert->getParent());
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (isa<PoisonValue>(Scalar))
        continue;
#ifndef NDEBUG
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          // It is legal to delete users in the ignorelist.
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      RemovedInsts.push_back(I);
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);
16893// Clear up reduction references, if any. 16894if (UserIgnoreList) {
16896const TreeEntry *
IE = getTreeEntry(
I);
16898 !(VectorizableTree.front()->isGather() &&
16899 !
IE->UserTreeIndices.empty() &&
16900 (ValueToGatherNodes.lookup(
I).contains(
16901 VectorizableTree.front().get()) ||
16903 [&](
const EdgeInfo &EI) {
16904 return EI.UserTE == VectorizableTree.front().get() &&
16905 EI.EdgeIdx == UINT_MAX;
16907 !(GatheredLoadsEntriesFirst.has_value() &&
16908IE->Idx >= *GatheredLoadsEntriesFirst &&
16909 VectorizableTree.front()->isGather() &&
16914// Do not replace condition of the logical op in form select <cond>. 16915 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16916 (match(U.getUser(), m_LogicalAnd()) ||
16917 match(U.getUser(), m_LogicalOr())) &&
16918 U.getOperandNo() == 0;
16919 if (IsPoisoningLogicalOp) {
16920 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16923return UserIgnoreList->contains(
U.getUser());
16925// Replace conditions of the poisoning logical ops with the non-poison 16931// Retain to-be-deleted instructions for some debug-info bookkeeping and alias 16932// cache correctness. 16933// NOTE: removeInstructionAndOperands only marks the instruction for deletion 16934// - instructions are not deleted until later. 16935 removeInstructionsAndOperands(
ArrayRef(RemovedInsts));
16938 InstrElementSize.
clear();
16940const TreeEntry &RootTE = *VectorizableTree.front();
16941Value *Vec = RootTE.VectorizedValue;
16942if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16943 It != MinBWs.end() &&
16944 ReductionBitWidth != It->second.first) {
16947 ReductionRoot->getIterator());
16951 cast<VectorType>(Vec->
getType())->getElementCount()),
16952 It->second.second);
16959 <<
" gather sequences instructions.\n");
16960// LICM InsertElementInst sequences. 16965// Check if this block is inside a loop. 16966Loop *L = LI->getLoopFor(
I->getParent());
16970// Check if it has a preheader. 16971BasicBlock *PreHeader = L->getLoopPreheader();
16975// If the vector or the element that we insert into it are 16976// instructions that are defined in this basic block then we can't 16977// hoist this instruction. 16979 auto *OpI = dyn_cast<Instruction>(V);
16980 return OpI && L->contains(OpI);
16984// We can hoist this instruction. Move it to the pre-header. 16986 CSEBlocks.
insert(PreHeader);
16989// Make a list of all reachable blocks in our CSE queue. 16998// Sort blocks by domination. This ensures we visit a block after all blocks 16999// dominating it are visited. 17001assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
17002"Different nodes should have different DFS numbers");
17003returnA->getDFSNumIn() <
B->getDFSNumIn();
  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector operands
  // and its mask indices are the same as in the first one or undefs. E.g.
  // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
  // poison, <0, 0, 0, 0>.
  auto &&IsIdenticalOrLessDefined = [&](Instruction *I1, Instruction *I2,
                                        SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used vector
    // registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(getWidenedType(
                   SI1->getType()->getElementType(),
                   SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge identical
  // instructions. TODO: We can further optimize this scan if we split the
  // instructions into different buckets based on the insert lane.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the
      // visited instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced)
        Visited.push_back(&In);
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
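// buildBundle links the ScheduleData of every value in VL into a single
// chained bundle; the first member acts as the scheduling entity for the
// whole group.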
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}

// Groups the instructions to a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                           ScheduleData *Bundle) {
17151// The scheduling region got new instructions at the lower end (or it is a 17152// new region for the first bundle). This makes it necessary to 17153// recalculate all dependencies. 17154// It is seldom that this needs to be done a second time after adding the 17155// initial bundle to the region. 17156if (ScheduleEnd != OldScheduleEnd) {
17157for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode())
17158if (ScheduleData *SD = getScheduleData(
I))
17159 SD->clearDependencies();
17164 <<
" in block " << BB->
getName() <<
"\n");
17165 calculateDependencies(Bundle,
/*InsertInReadyList=*/true, SLP);
17170 initialFillReadyList(ReadyInsts);
17173// Now try to schedule the new bundle or (if no bundle) just calculate 17174// dependencies. As soon as the bundle is "ready" it means that there are no 17175// cyclic dependencies and we can schedule it. Note that's important that we 17176// don't "schedule" the bundle yet (see cancelScheduling). 17177while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17178 !ReadyInsts.empty()) {
17179 ScheduleData *Picked = ReadyInsts.pop_back_val();
17180assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17181"must be ready to schedule");
17182 schedule(Picked, ReadyInsts);
17186// Make sure that the scheduling region contains all 17187// instructions of the bundle. 17188for (
Value *V : VL) {
17191if (!extendSchedulingRegion(V, S)) {
17192// If the scheduling region got new instructions at the lower end (or it 17193// is a new region for the first bundle). This makes it necessary to 17194// recalculate all dependencies. 17195// Otherwise the compiler may crash trying to incorrectly calculate 17196// dependencies and emit instruction in the wrong order at the actual 17198 TryScheduleBundleImpl(
/*ReSchedule=*/false,
nullptr);
17199return std::nullopt;
17203bool ReSchedule =
false;
17204for (
Value *V : VL) {
17207 ScheduleData *BundleMember = getScheduleData(V);
17209"no ScheduleData for bundle member (maybe not in same basic block)");
17211// Make sure we don't leave the pieces of the bundle in the ready list when 17212// whole bundle might not be ready. 17213 ReadyInsts.remove(BundleMember);
    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
17226 TryScheduleBundleImpl(ReSchedule, Bundle);
17227if (!Bundle->isReady()) {
17228 cancelScheduling(VL, S.getMainOp());
17229return std::nullopt;
17242 ScheduleData *Bundle = getScheduleData(OpValue);
17243LLVM_DEBUG(
dbgs() <<
"SLP: cancel scheduling of " << *Bundle <<
"\n");
17244assert(!Bundle->IsScheduled &&
17245"Can't cancel bundle which is already scheduled");
17246assert(Bundle->isSchedulingEntity() &&
17248"tried to unbundle something which is not a bundle");
17250// Remove the bundle from the ready list. 17251if (Bundle->isReady())
17252 ReadyInsts.remove(Bundle);
17254// Un-bundle: make single instructions out of the bundle. 17255 ScheduleData *BundleMember = Bundle;
17256while (BundleMember) {
17257assert(BundleMember->FirstInBundle == Bundle &&
"corrupt bundle links");
17258 BundleMember->FirstInBundle = BundleMember;
17259 ScheduleData *Next = BundleMember->NextInBundle;
17260 BundleMember->NextInBundle =
nullptr;
17261 BundleMember->TE =
nullptr;
17262if (BundleMember->unscheduledDepsInBundle() == 0) {
17263 ReadyInsts.insert(BundleMember);
17265 BundleMember = Next;
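// ScheduleData objects are allocated in fixed-size chunks that are never freed
// individually; a new chunk is only allocated when the current one is full.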
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
17278bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17279Value *V,
const InstructionsState &S) {
17281assert(
I &&
"bundle member must be an instruction");
17284"phi nodes/insertelements/extractelements/extractvalues don't need to " 17286if (getScheduleData(
I))
17288if (!ScheduleStart) {
17289// It's the first instruction in the new region. 17290 initScheduleData(
I,
I->getNextNode(),
nullptr,
nullptr);
17292 ScheduleEnd =
I->getNextNode();
17293assert(ScheduleEnd &&
"tried to vectorize a terminator?");
17294LLVM_DEBUG(
dbgs() <<
"SLP: initialize schedule region to " << *
I <<
"\n");
17297// Search up and down at the same time, because we don't know if the new 17298// instruction is above or below the existing scheduling region. 17299// Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted 17300// against the budget. Otherwise debug info could affect codegen. 17302 ++ScheduleStart->getIterator().getReverse();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
17311 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17312 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17313while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=
I &&
17315if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17316LLVM_DEBUG(
dbgs() <<
"SLP: exceeded schedule region size limit\n");
17323 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17324 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17326if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==
I)) {
17327assert(
I->getParent() == ScheduleStart->getParent() &&
17328"Instruction is in wrong basic block.");
17329 initScheduleData(
I, ScheduleStart,
nullptr, FirstLoadStoreInRegion);
17335assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==
I)) &&
17336"Expected to reach top of the basic block or instruction down the " 17338assert(
I->getParent() == ScheduleEnd->getParent() &&
17339"Instruction is in wrong basic block.");
17340 initScheduleData(ScheduleEnd,
I->getNextNode(), LastLoadStoreInRegion,
17342 ScheduleEnd =
I->getNextNode();
17343assert(ScheduleEnd &&
"tried to vectorize a terminator?");
17344LLVM_DEBUG(
dbgs() <<
"SLP: extend schedule region end to " << *
I <<
"\n");
17348void BoUpSLP::BlockScheduling::initScheduleData(
Instruction *FromI,
17350 ScheduleData *PrevLoadStore,
17351 ScheduleData *NextLoadStore) {
17352 ScheduleData *CurrentLoadStore = PrevLoadStore;
17354// No need to allocate data for non-schedulable instructions. 17357 ScheduleData *SD = ScheduleDataMap.lookup(
I);
17359 SD = allocateScheduleDataChunks();
17360 ScheduleDataMap[
I] = SD;
17362assert(!isInSchedulingRegion(SD) &&
17363"new ScheduleData already in scheduling region");
17364 SD->init(SchedulingRegionID,
I);
17366if (
I->mayReadOrWriteMemory() &&
17367 (!isa<IntrinsicInst>(
I) ||
17368 (cast<IntrinsicInst>(
I)->getIntrinsicID() != Intrinsic::sideeffect &&
17369 cast<IntrinsicInst>(
I)->getIntrinsicID() !=
17370 Intrinsic::pseudoprobe))) {
17371// Update the linked list of memory accessing instructions. 17372if (CurrentLoadStore) {
17373 CurrentLoadStore->NextLoadStore = SD;
17375 FirstLoadStoreInRegion = SD;
17377 CurrentLoadStore = SD;
17380if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
17381match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
17382 RegionHasStackSave =
true;
17384if (NextLoadStore) {
17385if (CurrentLoadStore)
17386 CurrentLoadStore->NextLoadStore = NextLoadStore;
17388 LastLoadStoreInRegion = CurrentLoadStore;
17392void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17393bool InsertInReadyList,
17395assert(SD->isSchedulingEntity());
17400while (!WorkList.
empty()) {
17402for (ScheduleData *BundleMember = SD; BundleMember;
17403 BundleMember = BundleMember->NextInBundle) {
17404assert(isInSchedulingRegion(BundleMember));
17405if (BundleMember->hasValidDependencies())
17410 BundleMember->Dependencies = 0;
17411 BundleMember->resetUnscheduledDeps();
17413// Handle def-use chain dependencies. 17414for (
User *U : BundleMember->Inst->
users()) {
17415if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17416 BundleMember->Dependencies++;
17417 ScheduleData *DestBundle = UseSD->FirstInBundle;
17418if (!DestBundle->IsScheduled)
17419 BundleMember->incrementUnscheduledDeps(1);
17420if (!DestBundle->hasValidDependencies())
17426auto *DepDest = getScheduleData(
I);
17427assert(DepDest &&
"must be in schedule window");
17428 DepDest->ControlDependencies.push_back(BundleMember);
17429 BundleMember->Dependencies++;
17430 ScheduleData *DestBundle = DepDest->FirstInBundle;
17431if (!DestBundle->IsScheduled)
17432 BundleMember->incrementUnscheduledDeps(1);
17433if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (isSafeToSpeculativelyExecute(I))
            continue;

          // Add the dependency
          MakeControlDependent(I);

          if (!isGuaranteedToTransferExecutionToSuccessor(I))
            // Everything past here must be control dependent on I.
            break;
        }
      }

      if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from reordering above a preceding stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              // Any allocas past here must be control dependent on I, and I
              // must be memory dependent on BundleMember->Inst.
              break;

            if (!isa<AllocaInst>(I))
              continue;

            // Add the dependency
            MakeControlDependent(I);
          }
        }

        // In addition to the cases handled just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Avoiding moving allocas below stackrestore is currently
        // thought to be conservatism. Moving loads/stores below a stackrestore
        // can lead to incorrect code.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;

            // Add the dependency
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
17503"NextLoadStore list for non memory effecting bundle?");
17505bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17506unsigned NumAliased = 0;
17507unsigned DistToSrc = 1;
17509for (; DepDest; DepDest = DepDest->NextLoadStore) {
17510assert(isInSchedulingRegion(DepDest));
17512// We have two limits to reduce the complexity: 17513// 1) AliasedCheckLimit: It's a small limit to reduce calls to 17514// SLP->isAliased (which is the expensive part in this loop). 17515// 2) MaxMemDepDistance: It's for very large blocks and it aborts 17516// the whole loop (even if the loop is fast, it's quadratic). 17517// It's important for the loop break condition (see below) to 17518// check this limit even between two read-only instructions. 17520 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17522 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17524// We increment the counter only if the locations are aliased 17525// (instead of counting all alias checks). This gives a better 17526// balance between reduced runtime and accurate dependencies. 17529 DepDest->MemoryDependencies.push_back(BundleMember);
17530 BundleMember->Dependencies++;
17531 ScheduleData *DestBundle = DepDest->FirstInBundle;
17532if (!DestBundle->IsScheduled) {
17533 BundleMember->incrementUnscheduledDeps(1);
17535if (!DestBundle->hasValidDependencies()) {
17540// Example, explaining the loop break condition: Let's assume our 17541// starting instruction is i0 and MaxMemDepDistance = 3. 17544// i0,i1,i2,i3,i4,i5,i6,i7,i8 17547// MaxMemDepDistance let us stop alias-checking at i3 and we add 17548// dependencies from i0 to i3,i4,.. (even if they are not aliased). 17549// Previously we already added dependencies from i3 to i6,i7,i8 17550// (because of MaxMemDepDistance). As we added a dependency from 17551// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 17552// and we can abort this loop at i6. 17558if (InsertInReadyList && SD->isReady()) {
17559 ReadyInsts.insert(SD);
17566void BoUpSLP::BlockScheduling::resetSchedule() {
17568"tried to reset schedule on block which has not been scheduled");
17569for (
Instruction *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
17570if (ScheduleData *SD = getScheduleData(
I)) {
17571assert(isInSchedulingRegion(SD) &&
17572"ScheduleData not in scheduling region");
17573 SD->IsScheduled =
false;
17574 SD->resetUnscheduledDeps();
17577 ReadyInsts.clear();
17580void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17581if (!BS->ScheduleStart)
17584LLVM_DEBUG(
dbgs() <<
"SLP: schedule block " << BS->BB->getName() <<
"\n");
17586// A key point - if we got here, pre-scheduling was able to find a valid 17587// scheduling of the sub-graph of the scheduling window which consists 17588// of all vector bundles and their transitive users. As such, we do not 17589// need to reschedule anything *outside of* that subgraph. 17591 BS->resetSchedule();
17593// For the real scheduling we use a more sophisticated ready-list: it is 17594// sorted by the original instruction location. This lets the final schedule 17595// be as close as possible to the original instruction order. 17596// WARNING: If changing this order causes a correctness issue, that means 17597// there is some missing dependence edge in the schedule data graph. 17598structScheduleDataCompare {
17599bool operator()(ScheduleData *SD1, ScheduleData *SD2)
const{
17600return SD2->SchedulingPriority < SD1->SchedulingPriority;
17603 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);
17626// Do the "real" scheduling. 17627while (!ReadyInsts.empty()) {
17628 ScheduleData *Picked = *ReadyInsts.begin();
17629 ReadyInsts.erase(ReadyInsts.begin());
17631// Move the scheduled instruction(s) to their dedicated places, if not 17633for (ScheduleData *BundleMember = Picked; BundleMember;
17634 BundleMember = BundleMember->NextInBundle) {
17638 LastScheduledInst = PickedInst;
17641 BS->schedule(Picked, ReadyInsts);
  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto Width = 0u;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load, update MaxWidth to reflect the
    // width of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and from the same basic block as the
    // user or the use is a PHI node, we add it to the worklist.
    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                 BinaryOperator, UnaryOperator>(I)) {
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent());
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V. Otherwise, return the
  // maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
17750// We can always demote constants. 17751if (
all_of(E.Scalars, IsaPred<Constant>))
17754unsigned OrigBitWidth =
17755DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17761// Check if the node was analyzed already and must keep its original bitwidth. 17762if (NodesToKeepBWs.
contains(E.Idx))
17765// If the value is not a vectorized instruction in the expression and not used 17766// by the insertelement instruction and not used in multiple vector nodes, it 17767// cannot be demoted. 17768bool IsSignedNode =
any_of(E.Scalars, [&](
Value *R) {
17769 if (isa<PoisonValue>(R))
17771 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17773auto IsPotentiallyTruncated = [&](
Value *V,
unsigned &
BitWidth) ->
bool {
17774if (isa<PoisonValue>(V))
17778// For lat shuffle of sext/zext with many uses need to check the extra bit 17779// for unsigned values, otherwise may have incorrect casting for reused 17782if ((!IsSignedNode || IsSignedVal) && OrigBitWidth >
BitWidth) {
17788unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17791if (
auto *
I = dyn_cast<Instruction>(V)) {
17793unsigned BitWidth2 =
17794 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17795while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17801 BitWidth1 = std::min(BitWidth1, BitWidth2);
17806auto FinalAnalysis = [&,
TTI =
TTI]() {
17807if (!IsProfitableToDemote)
17810 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(
BitWidth)));
17812if (Res && E.isGather()) {
17813// Check possible extractelement instructions bases and final vector 17816for (
Value *V : E.Scalars) {
17817auto *EE = dyn_cast<ExtractElementInst>(V);
17820 UniqueBases.
insert(EE->getVectorOperand());
17822constunsigned VF = E.Scalars.size();
17823Type *OrigScalarTy = E.Scalars.front()->getType();
17824if (UniqueBases.
size() <= 2 ||
17832if (E.isGather() || !Visited.
insert(&E).second ||
17834 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
17835 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17838return FinalAnalysis();
17841 return !all_of(V->users(), [=](User *U) {
17842 return getTreeEntry(U) ||
17843 (E.Idx == 0 && UserIgnoreList &&
17844 UserIgnoreList->contains(U)) ||
17845 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17846 !U->getType()->isScalableTy() &&
17847 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17848 }) && !IsPotentiallyTruncated(V,
BitWidth);
17855unsigned InitLevel = MaxDepthLevel;
17857unsigned Level = InitLevel;
17858if (!collectValuesToDemote(*
Op, IsProfitableToDemoteRoot,
BitWidth,
17859 ToDemote, Visited, NodesToKeepBWs, Level,
17860 IsProfitableToDemote, IsTruncRoot)) {
17861if (!IsProfitableToDemote)
17864if (!FinalAnalysis())
17868 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17872auto AttemptCheckBitwidth =
17874// Try all bitwidth < OrigBitWidth. 17876unsigned BestFailBitwidth = 0;
17878if (Checker(
BitWidth, OrigBitWidth))
17880if (BestFailBitwidth == 0 && FinalAnalysis())
17884if (BestFailBitwidth == 0) {
17895auto TryProcessInstruction =
17901 (void)
for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17904// Several vectorized uses? Check if we can truncate it, otherwise - 17906if (E.UserTreeIndices.size() > 1 &&
17907 !
all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17910bool NeedToExit =
false;
17911if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17915if (!ProcessOperands(
Operands, NeedToExit))
17922// Record the entry that we can demote. 17924return IsProfitableToDemote;
17926switch (E.getOpcode()) {
17928// We can always demote truncations and extensions. Since truncations can 17929// seed additional demotion, we save the truncated value. 17930case Instruction::Trunc:
17931if (IsProfitableToDemoteRoot)
17932 IsProfitableToDemote =
true;
17933return TryProcessInstruction(
BitWidth);
17934case Instruction::ZExt:
17935case Instruction::SExt:
17936 IsProfitableToDemote =
true;
17937return TryProcessInstruction(
BitWidth);
17939// We can demote certain binary operations if we can demote both of their 17941case Instruction::Add:
17942case Instruction::Sub:
17943case Instruction::Mul:
17944case Instruction::And:
17945case Instruction::Or:
17946case Instruction::Xor: {
17947return TryProcessInstruction(
17948BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17950case Instruction::Freeze:
17951return TryProcessInstruction(
BitWidth, getOperandEntry(&E, 0));
17952case Instruction::Shl: {
17953// If we are truncating the result of this SHL, and if it's a shift of an 17954// inrange amount, we can always perform a SHL in a smaller type. 17957 if (isa<PoisonValue>(V))
17959 auto *I = cast<Instruction>(V);
17960 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17961 return AmtKnownBits.getMaxValue().ult(BitWidth);
17964return TryProcessInstruction(
17965BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
17967case Instruction::LShr: {
17968// If this is a truncate of a logical shr, we can truncate it to a smaller 17969// lshr iff we know that the bits we would otherwise be shifting in are 17971auto LShrChecker = [&](
unsignedBitWidth,
unsigned OrigBitWidth) {
17973 if (isa<PoisonValue>(V))
17975 auto *I = cast<Instruction>(V);
17976 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17977 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17978 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17979 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
17980 SimplifyQuery(*DL));
17983return TryProcessInstruction(
17984BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
17987case Instruction::AShr: {
17988// If this is a truncate of an arithmetic shr, we can truncate it to a 17989// smaller ashr iff we know that all the bits from the sign bit of the 17990// original type and the sign bit of the truncate type are similar. 17991auto AShrChecker = [&](
unsignedBitWidth,
unsigned OrigBitWidth) {
17993 if (isa<PoisonValue>(V))
17995 auto *I = cast<Instruction>(V);
17996 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17997 unsigned ShiftedBits = OrigBitWidth - BitWidth;
17998 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
17999 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18003return TryProcessInstruction(
18004BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
18007case Instruction::UDiv:
18008case Instruction::URem: {
18009// UDiv and URem can be truncated if all the truncated bits are zero. 18010auto Checker = [&](
unsignedBitWidth,
unsigned OrigBitWidth) {
18013 auto *I = cast<Instruction>(V);
18014 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18015 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
18016 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18019return TryProcessInstruction(
18020BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
18023// We can demote selects if we can demote their true and false values. 18024case Instruction::Select: {
18025return TryProcessInstruction(
18026BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18029// We can demote phis if we can demote all their incoming operands. Note that 18030// we don't need to worry about cycles since we ensure single use above. 18031case Instruction::PHI: {
18032constunsigned NumOps = E.getNumOperands();
18035 std::bind(&BoUpSLP::getOperandEntry,
this, &E, _1));
18037return TryProcessInstruction(
BitWidth, Ops);
  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    SmallVector<const TreeEntry *> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations. The checker is
    // invoked for every legal candidate width; whenever the estimated
    // intrinsic cost improves, the candidate is remembered:
    //
    //   if (Cost < BestCost) {
    //     BestCost = Cost;
    //     BestBitWidth = BitWidth;
    //   }
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }
  // Otherwise, conservatively give up.
  default:
    break;
  }

  return FinalAnalysis();
}
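// Illustrative sketch (not from the original source) of what a successful
// demotion enables: in a bundle of
//   %t = trunc i32 (add (zext i8 %a to i32), (zext i8 %b to i32)) to i16
// only 9 bits of the add are significant, so the add can be performed
// directly on <VF x i16> lanes and the surrounding casts collapse.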
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 &&
       any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
              [NodeIdx](const EdgeInfo &EI) {
                return EI.UserTE->Idx > NodeIdx;
              })))
    return;

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // resize to the final type.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> ToDemote;
  SmallVector<unsigned> RootDemotes;
  DenseSet<unsigned> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    // Check if the root is trunc and the next node is gather/buildvector, then
    // keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    const TreeEntry *TE = getTreeEntry(U);
                    const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                    if (TE == UserTE || !TE)
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(UserTE->getMainOp()))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    auto It = MinBWs.find(TE);
                    if (It != MinBWs.end() && It->second.first > UserTESz)
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision. It would be safe to truncate the
    // roots of the expression to this width.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bits are not
    // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
    // True.
    // Determine if the sign bit of all the roots is known to be zero. If not,
    // IsKnownPositive is set to False.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type. Otherwise, if we know the sign bit is zero, we will
      // zero-extend the root instead.
      //
      // FIXME: This is somewhat suboptimal, as there will be cases where
      // adding one to the maximum bit width will yield a larger-than-necessary
      // type. In general, we need to add an extra bit only if we can't prove
      // that the upper bit of the original type is equal to the upper bit of
      // the proposed smaller type. If these two bits are the same (either zero
      // or one) we know that sign-extending from the smaller type will result
      // in the same value. Here, since we can't yet prove this, we are just
      // making the proposed smaller type larger to ensure correctness.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;
    // If the original type is large, but the reduced type does not improve the
    // register usage - ignore it.
    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote and
    // additional roots that require investigating in Roots.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
           (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
            DL->getTypeSizeInBits(TreeRootIT) /
                    DL->getTypeSizeInBits(
                        E.getMainOp()->getOperand(0)->getType()) >
                2))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };
  // If we can truncate the root, we must collect additional values that might
  // be demoted as a result. That is, those seeded by truncations we will
  // modify.
  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        unsigned BitWidth2 = BitWidth1;
        if (APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
            !Mask.isAllOnes())
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
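  // Illustrative sketch (not from the original source): for a reduction such
  // as
  //   %v = zext <8 x i1> %m to <8 x i32>
  //   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v)
  // only one significant bit per lane is reduced, so ReductionBitWidth can be
  // set to 1 and the whole reduction later lowered via ctpop on the bitcast
  // mask, as the comment above describes.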
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }
    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            unsigned OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return (EI.UserTE->hasState() &&
                           EI.UserTE->getOpcode() == Instruction::ICmp) &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC &&
                                   (IC->isSigned() ||
                                    !isKnownNonNegative(IC->getOperand(0),
                                                        SimplifyQuery(*DL)) ||
                                    !isKnownNonNegative(IC->getOperand(1),
                                                        SimplifyQuery(*DL)));
                          });
                 });
    }
    // If the maximum bit width we compute is less than the width of the roots'
    // type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
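// Illustrative sketch (not from the original source): after this analysis,
// MinBWs might map the tree entry for a bundle of i32 adds to {16, /*IsSigned*/
// false}, meaning codegen is allowed to build the bundle as <VF x i16> and
// zero-extend once at the root instead of operating on <VF x i32>.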
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  DL = &F.getDataLayout();

  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
  // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
  // delete instructions.

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // Start new block - clear the list of reduction roots.
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed)
    R.optimizeGatherSequence();
  return Changed;
}
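// Usage note (assumption, not stated in this file): the per-block seeding and
// vectorization above is what runs for a standalone invocation such as
//   opt -passes=slp-vectorizer -S input.ll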
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e.
    // almost all vector lanes are used.
    return false;
  }

  SmallPtrSet<Value *, 16> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  if (Cost < -SLPCostThreshold) {
    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  const unsigned Mean =
      std::accumulate(Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                      [&](uint64_t V,
                          const std::pair<unsigned, unsigned> &Val) {
                        unsigned Size = First ? Val.first : Val.second;
                        return V + Size;
                      }) /
      Sizes.size();
  if (Mean == 0)
    return true;
  const uint64_t Dev =
      std::accumulate(Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                      [&](uint64_t V,
                          const std::pair<unsigned, unsigned> &Val) {
                        unsigned P = First ? Val.first : Val.second;
                        if (P == 0)
                          return V;
                        return V + (P - Mean) * (P - Mean);
                      }) /
      Sizes.size();
  return Dev * 81 / (Mean * Mean) == 0;
}
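// Note on the integer arithmetic above: with unsigned division, the condition
// "Dev * 81 / (Mean * Mean) == 0" holds exactly when Dev * 81 < Mean * Mean,
// i.e. the average squared deviation is below (Mean / 9)^2. For example, sizes
// {4, 4, 5} give Mean = 4 and Dev = 0 after integer division, so the check
// passes.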
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store twice.
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, Distance of the store
  // address relative to base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18725// Collect the chain into a list. 18729 PrevDist =
Data.second;
18730if (
Idx !=
Set.size() - 1)
18735Operands.push_back(Stores[DataVar.first]);
18736 PrevDist = DataVar.second;
18741 .
insert({Operands.front(),
18742 cast<StoreInst>(Operands.front())->getValueOperand(),
18744 cast<StoreInst>(Operands.back())->getValueOperand(),
18749unsigned MaxVecRegSize =
R.getMaxVecRegSize();
18750unsigned EltSize =
R.getVectorElementSize(
Operands[0]);
18754 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18756Type *StoreTy =
Store->getValueOperand()->getType();
18757Type *ValueTy = StoreTy;
18758if (
auto *Trunc = dyn_cast<TruncInst>(
Store->getValueOperand()))
18759 ValueTy = Trunc->getSrcTy();
18760unsigned MinVF = std::max<unsigned>(
18762R.getMinVF(
DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18765if (MaxVF < MinVF) {
18766LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
18768 <<
"MinVF (" << MinVF <<
")\n");
18772unsigned NonPowerOf2VF = 0;
18774// First try vectorizing with a non-power-of-2 VF. At the moment, only 18775// consider cases where VF + 1 is a power-of-2, i.e. almost all vector 18777unsigned CandVF = std::clamp<unsigned>(
Operands.size(), MinVF, MaxVF);
18779 NonPowerOf2VF = CandVF;
18780assert(NonPowerOf2VF != MaxVF &&
18781"Non-power-of-2 VF should not be equal to MaxVF");
18785unsigned MaxRegVF = MaxVF;
18787if (MaxVF < MinVF) {
18788LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
18790 <<
"MinVF (" << MinVF <<
")\n");
18796unsignedSize = MinVF;
18798 VF =
Size > MaxVF ? NonPowerOf2VF :
Size;
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return Size > (First ? P.first : P.second);
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
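      // Note (inferred from the helpers above, not stated in the original):
      // each RangeSizes entry keeps two per-store states, with .first consulted
      // for attempts whose VF is at least MaxRegVF and .second for smaller VFs;
      // 0 means "already vectorized", while a non-zero value records the best
      // tree size seen so far for that store.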
18827bool RepeatChanged =
false;
18828bool AnyProfitableGraph =
false;
18829for (
unsignedSize : CandidateVFs) {
18830 AnyProfitableGraph =
false;
18831unsigned StartIdx = std::distance(
18832 RangeSizes.begin(),
18833find_if(RangeSizes, std::bind(IsNotVectorized,
Size >= MaxRegVF,
18834 std::placeholders::_1)));
18835while (StartIdx <
End) {
18837 std::distance(RangeSizes.begin(),
18838find_if(RangeSizes.drop_front(StartIdx),
18839 std::bind(IsVectorized,
Size >= MaxRegVF,
18840 std::placeholders::_1)));
18841unsigned Sz = EndIdx >=
End ?
End : EndIdx;
18842for (
unsigned Cnt = StartIdx; Cnt +
Size <= Sz;) {
18844Size >= MaxRegVF)) {
18851return cast<StoreInst>(V)
18852 ->getValueOperand()
18854 cast<StoreInst>(Slice.
front())
18855 ->getValueOperand()
18858"Expected all operands of same type.");
18859if (!NonSchedulable.empty()) {
18860auto [NonSchedSizeMax, NonSchedSizeMin] =
18861 NonSchedulable.lookup(Slice.
front());
18862if (NonSchedSizeMax > 0 && NonSchedSizeMin <=
Size) {
18863 Cnt += NonSchedSizeMax;
18868 std::optional<bool> Res =
18869 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18873 .first->getSecond()
18876// Mark the vectorized stores so that we don't vectorize them 18879// Mark the vectorized stores so that we don't vectorize them 18881 AnyProfitableGraph = RepeatChanged = Changed =
true;
18882// If we vectorized initial block, no need to try to vectorize 18885 [](std::pair<unsigned, unsigned> &
P) {
18886 P.first = P.second = 0;
18888if (Cnt < StartIdx + MinVF) {
18889for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18890 [](std::pair<unsigned, unsigned> &
P) {
18891 P.first = P.second = 0;
18893 StartIdx = Cnt +
Size;
18895if (Cnt > Sz -
Size - MinVF) {
18897 [](std::pair<unsigned, unsigned> &
P) {
18898 P.first = P.second = 0;
18907if (
Size > 2 && Res &&
18909 std::bind(VFIsProfitable,
Size >= MaxRegVF, TreeSize,
18910 std::placeholders::_1))) {
18914// Check for the very big VFs that we're not rebuilding same 18915// trees, just with larger number of elements. 18916if (
Size > MaxRegVF && TreeSize > 1 &&
18918 std::bind(FirstSizeSame, TreeSize,
18919 std::placeholders::_1))) {
18921while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18927 [&](std::pair<unsigned, unsigned> &
P) {
18928 if (Size >= MaxRegVF)
18929 P.second = std::max(P.second, TreeSize);
18931 P.first = std::max(P.first, TreeSize);
18934 AnyProfitableGraph =
true;
18938if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18939 AnyProfitableGraph =
true;
18940 StartIdx = std::distance(
18941 RangeSizes.begin(),
18942find_if(RangeSizes.drop_front(Sz),
18943 std::bind(IsNotVectorized,
Size >= MaxRegVF,
18944 std::placeholders::_1)));
18949// All values vectorized - exit. 18950if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
18951returnP.first == 0 &&
P.second == 0;
18954// Check if tried all attempts or no need for the last attempts at all. 18955if (Repeat >= MaxAttempts ||
18956 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18958constexprunsigned StoresLimit = 64;
18959constunsigned MaxTotalNum = std::min<unsigned>(
18961static_cast<unsigned>(
18964 RangeSizes.begin(),
18965find_if(RangeSizes, std::bind(IsNotVectorized,
true,
18966 std::placeholders::_1))) +
18968unsigned VF =
bit_ceil(CandidateVFs.front()) * 2;
18971 CandidateVFs.clear();
18973 CandidateVFs.push_back(Limit);
18974if (VF > MaxTotalNum || VF >= StoresLimit)
18976for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &
P) {
18978P.first = std::max(
P.second,
P.first);
18980// Last attempt to vectorize max number of elements, if all previous 18981// attempts were unsuccessful because of the cost issues. 18982 CandidateVFs.push_back(VF);
18987// Stores pair (first: index of the store into Stores array ref, address of 18988// which taken as base, second: sorted set of pairs {index, dist}, which are 18989// indices of stores in the set and their store location distances relative to 18990// the base address). 18992// Need to store the index of the very first store separately, since the set 18993// may be reordered after the insertion and the first store may be moved. This 18994// container allows to reduce number of calls of getPointersDiff() function. 18996// Inserts the specified store SI with the given index Idx to the set of the 18997// stores. If the store with the same distance is found already - stop 18998// insertion, try to vectorize already found stores. If some stores from this 18999// sequence were not vectorized - try to vectorize them with the new store 19000// later. But this logic is applied only to the stores, that come before the 19001// previous store with the same distance. 19008// - Scan this from the last to first store. The very first bunch of stores is 19009// {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores 19011// - The next store in the list - #1 - has the same distance from store #5 as 19013// - Try to vectorize sequence of stores 4,2,3,5. 19014// - If all these stores are vectorized - just drop them. 19015// - If some of them are not vectorized (say, #3 and #5), do extra analysis. 19016// - Start new stores sequence. 19017// The new bunch of stores is {1, {1, 0}}. 19018// - Add the stores from previous sequence, that were not vectorized. 19019// Here we consider the stores in the reversed order, rather they are used in 19020// the IR (Stores are reversed already, see vectorizeStoreChains() function). 19021// Store #3 can be added -> comes after store #4 with the same distance as 19023// Store #5 cannot be added - comes before store #4. 19024// This logic allows to improve the compile time, we assume that the stores 19025// after previous store with the same distance most likely have memory 19026// dependencies and no need to waste compile time to try to vectorize them. 19027// - Try to vectorize the sequence {1, {1, 0}, {3, 2}}. 19029for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19031 Stores[
Set.first]->getValueOperand()->getType(),
19032 Stores[
Set.first]->getPointerOperand(),
19033SI->getValueOperand()->getType(),
SI->getPointerOperand(), *
DL, *SE,
19034/*StrictCheck=*/true);
19037auto It =
Set.second.find(std::make_pair(
Idx, *Diff));
19038if (It ==
Set.second.end()) {
19039Set.second.emplace(
Idx, *Diff);
19042// Try to vectorize the first found set to avoid duplicate analysis. 19043 TryToVectorize(
Set.second);
19044unsigned ItIdx = It->first;
19045int ItDist = It->second;
19046 StoreIndexToDistSet PrevSet;
19047copy_if(
Set.second, std::inserter(PrevSet, PrevSet.end()),
19048 [&](
const std::pair<unsigned, int> &Pair) {
19049 return Pair.first > ItIdx;
19053Set.second.emplace(
Idx, 0);
19054// Insert stores that followed previous match to try to vectorize them 19056unsigned StartIdx = ItIdx + 1;
19058// Distances to previously found dup store (or this store, since they 19059// store to the same addresses). 19061for (
const std::pair<unsigned, int> &Pair :
reverse(PrevSet)) {
19062// Do not try to vectorize sequences, we already tried. 19063if (VectorizedStores.
contains(Stores[Pair.first]))
19065unsigned BI = Pair.first - StartIdx;
19066 UsedStores.set(BI);
19067 Dists[BI] = Pair.second - ItDist;
19069for (
unsignedI = StartIdx;
I <
Idx; ++
I) {
19070unsigned BI =
I - StartIdx;
19071if (UsedStores.test(BI))
19072Set.second.emplace(
I, Dists[BI]);
19076auto &Res = SortedStores.emplace_back();
19078 Res.second.emplace(
Idx, 0);
19080Type *PrevValTy =
nullptr;
19082if (
R.isDeleted(SI))
19085 PrevValTy =
SI->getValueOperand()->getType();
19086// Check that we do not try to vectorize stores of different types. 19087if (PrevValTy !=
SI->getValueOperand()->getType()) {
19088for (
auto &Set : SortedStores)
19089 TryToVectorize(
Set.second);
19090 SortedStores.clear();
19091 PrevValTy =
SI->getValueOperand()->getType();
19093 FillStoresSet(
I, SI);
19096// Final vectorization attempt. 19097for (
auto &Set : SortedStores)
19098 TryToVectorize(
Set.second);
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
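// Illustrative sketch (not from the original source): for a block containing
//   store i32 %a, ptr %p
//   store i32 %b, ptr %q   ; %q is a gep of %p
// both stores land in the same Stores bucket (same underlying object), which
// is exactly the seed shape vectorizeStoreChains() later tries to turn into a
// single <2 x i32> store.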
19145LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = " 19146 << VL.
size() <<
".\n");
19148// Check that all of the parts are instructions of the same type, 19149// we permit an alternate opcode via InstructionsState. 19155// Make sure invalid types (including vector type) are rejected before 19156// determining vectorization factor for scalar instructions. 19157for (
Value *V : VL) {
19158Type *Ty =
V->getType();
19160// NOTE: the following will give user internal llvm type name, which may 19162R.getORE()->emit([&]() {
19163 std::string TypeStr;
19167 <<
"Cannot SLP vectorize list: type " 19168 << TypeStr +
" is unsupported by vectorizer";
19175unsigned Sz =
R.getVectorElementSize(I0);
19176unsigned MinVF =
R.getMinVF(Sz);
19177unsigned MaxVF = std::max<unsigned>(
19179 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19181R.getORE()->emit([&]() {
19183 <<
"Cannot SLP vectorize list: vectorization factor " 19184 <<
"less than 2 is not supported";
19189bool Changed =
false;
19190bool CandidateFound =
false;
19193unsigned NextInst = 0, MaxInst = VL.size();
19194for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19196// No actual vectorization should happen, if number of parts is the same as 19197// provided vectorization factor (i.e. the scalar type is used for vector 19198// code during codegen). 19202for (
unsignedI = NextInst;
I < MaxInst; ++
I) {
19203unsigned ActualVF = std::min(MaxInst -
I, VF);
19208if (MaxVFOnly && ActualVF < MaxVF)
19210if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
19215for (
Value *V : VL.drop_front(
I)) {
19216// Check that a previous iteration of this loop did not delete the 19218if (
auto *Inst = dyn_cast<Instruction>(V);
19219 !Inst || !
R.isDeleted(Inst)) {
19222if (
Idx == ActualVF)
19226// Not enough vectorizable instructions - exit. 19227if (
Idx != ActualVF)
19230LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations " 19234if (
R.isTreeTinyAndNotFullyVectorizable())
19236R.reorderTopToBottom();
19237R.reorderBottomToTop(
19238/*IgnoreReorder=*/!isa<InsertElementInst>(Ops.
front()) &&
19239 !
R.doesRootHaveInTreeUses());
19241R.buildExternalUses();
19243R.computeMinimumValueSizes();
19245 CandidateFound =
true;
19246 MinCost = std::min(MinCost,
Cost);
19249 <<
" for VF=" << ActualVF <<
"\n");
19253 cast<Instruction>(Ops[0]))
19254 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
19255 <<
" and with tree size " 19256 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
19259// Move to the next bundle. 19267if (!Changed && CandidateFound) {
19268R.getORE()->emit([&]() {
19270 <<
"List vectorization was possible but not beneficial with cost " 19271 <<
ore::NV(
"Cost", MinCost) <<
" >= " 19274 }
elseif (!Changed) {
19275R.getORE()->emit([&]() {
19277 <<
"Cannot SLP vectorize list: vectorization was impossible" 19278 <<
" with available vectorization factors";
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
19338/// Model horizontal reductions. 19340/// A horizontal reduction is a tree of reduction instructions that has values 19341/// that can be put into a vector as its leaves. For example: 19348/// This tree has "mul" as its leaf values and "+" as its reduction 19349/// instructions. A reduction can feed into a store or a binary operation 19367 ReductionOpsListType ReductionOps;
19368 /// List of possibly reduced values. 19370 /// Maps reduced value to the corresponding reduction operation. 19373 /// The type of reduction operation. 19375 /// Checks if the optimization of original scalar identity operations on 19376 /// matched horizontal reductions is enabled and allowed. 19377bool IsSupportedHorRdxIdentityOp =
false;
19384// And/or are potentially poison-safe logical patterns like: 19385// select x, y, false 19386// select x, true, y 19388return isa<SelectInst>(
I) &&
19392 /// Checks if instruction is associative and can be vectorized. 19394if (Kind == RecurKind::None)
19397// Integer ops that map to select instructions or intrinsics are fine. 19402if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19403// FP min/max are associative except for NaN and -0.0. We do not 19404// have to rule out -0.0 here because the intrinsic semantics do not 19405// specify a fixed result for it. 19406returnI->getFastMathFlags().noNaNs();
19409if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19412returnI->isAssociative();
19416// Poison-safe 'or' takes the form: select X, true, Y 19417// To make that work with the normal operand processing, we skip the 19418// true value operand. 19419// TODO: Change the code and data structures to handle this without a hack. 19421returnI->getOperand(2);
19422returnI->getOperand(
Index);
19425 /// Creates reduction operation with the current opcode. 19429case RecurKind::Or: {
19437case RecurKind::And: {
19445case RecurKind::Add:
19446case RecurKind::Mul:
19447case RecurKind::Xor:
19448case RecurKind::FAdd:
19449case RecurKind::FMul: {
19454case RecurKind::SMax:
19455case RecurKind::SMin:
19456case RecurKind::UMax:
19457case RecurKind::UMin:
19464case RecurKind::FMax:
19465case RecurKind::FMin:
19466case RecurKind::FMaximum:
19467case RecurKind::FMinimum: {
19476 /// Creates reduction operation with the current opcode with the IR flags 19477 /// from \p ReductionOps, dropping nuw/nsw flags. 19480const ReductionOpsListType &ReductionOps) {
19481bool UseSelect = ReductionOps.size() == 2 ||
19483 (ReductionOps.size() == 1 &&
19484any_of(ReductionOps.front(), IsaPred<SelectInst>));
19485assert((!UseSelect || ReductionOps.size() != 2 ||
19486 isa<SelectInst>(ReductionOps[1][0])) &&
19487"Expected cmp + select pairs for reduction");
19490if (
auto *Sel = dyn_cast<SelectInst>(
Op)) {
19492/*IncludeWrapFlags=*/false);
19494/*IncludeWrapFlags=*/false);
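  // Illustrative note (not from the original source): createOp() above emits
  // either a plain binary operator (e.g. "add %lhs, %rhs" for RecurKind::Add)
  // or the matching min/max intrinsic / cmp+select pair for the integer and
  // FP min/max kinds, so callers never special-case the reduction kind when
  // stitching partial results together.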
  /// Determines the recurrence kind implemented by the given instruction.
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    // This matches either cmp+select or intrinsics. SLP is expected to handle
    // either form.
    // TODO: If we are canonicalizing to intrinsics, we can remove several
    // special-case paths that deal with selects.
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for min/max pattern based on instructions producing
      // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
      // During the intermediate stages of SLP, it's very common to have
      // pattern like this (since optimizeGatherSequence is run only once
      // at the end):
      // %1 = extractelement <2 x i32> %a, i32 0
      // %2 = extractelement <2 x i32> %a, i32 1
      // %cond = icmp sgt i32 %1, %2
      // %3 = extractelement <2 x i32> %a, i32 0
      // %4 = extractelement <2 x i32> %a, i32 1
      // %select = select i1 %cond, i32 %3, i32 %4
      CmpInst::Predicate Pred;
      Instruction *L1;
      Instruction *L2;

      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      Value *Cond = Select->getCondition();

      // TODO: Support inverse predicates.
      if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
        if (!isa<ExtractElementInst>(RHS) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
        if (!isa<ExtractElementInst>(LHS) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)))
          return RecurKind::None;
      } else {
        if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
          return RecurKind::None;
        if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      }

      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
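  // Illustrative sketch (not from the original source): scalar code such as
  //   %m = select i1 (icmp sgt i32 %x, %y), i32 %x, i32 %y
  // is classified as RecurKind::SMax, while a plain chain of adds
  // (%a + %b + %c + %d) is classified as RecurKind::Add.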
  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
  }
19668 /// Try to find a reduction tree. 19672 RdxKind = HorizontalReduction::getRdxKind(Root);
19673if (!isVectorizable(RdxKind, Root))
19676// Analyze "regular" integer/FP types for reductions - no target-specific 19677// types or pointers. 19682// Though the ultimate reduction may have multiple uses, its condition must 19683// have only single use. 19684if (
auto *Sel = dyn_cast<SelectInst>(Root))
19685if (!Sel->getCondition()->hasOneUse())
19688 ReductionRoot = Root;
19690// Iterate through all the operands of the possible reduction tree and 19691// gather all the reduced values, sorting them by their value id. 19693bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19695 1, std::make_pair(Root, 0));
19696// Checks if the operands of the \p TreeN instruction are also reduction 19697// operations or should be treated as reduced values or an extra argument, 19698// which is not part of the reduction. 19703for (
intI :
reverse(seq<int>(getFirstOperandIndex(TreeN),
19704 getNumberOfOperands(TreeN)))) {
19705Value *EdgeVal = getRdxOperand(TreeN,
I);
19706 ReducedValsToOps[EdgeVal].push_back(TreeN);
19707auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19708// If the edge is not an instruction, or it is different from the main 19709// reduction opcode or has too many uses - possible reduced value. 19710// Also, do not try to reduce const values, if the operation is not 19714 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19715 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19716 !isVectorizable(RdxKind, EdgeInst) ||
19717 (
R.isAnalyzedReductionRoot(EdgeInst) &&
19718all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19719 PossibleReducedVals.push_back(EdgeVal);
19722 ReductionOps.push_back(EdgeInst);
19725// Try to regroup reduced values so that it gets more profitable to try to 19726// reduce them. Values are grouped by their value ids, instructions - by 19727// instruction op id and/or alternate op id, plus do extra analysis for 19728// loads (grouping them by the distabce between pointers) and cmp 19729// instructions (grouping them by the predicate). 19733 PossibleReducedVals;
19734 initReductionOps(Root);
19738auto GenerateLoadsSubkey = [&](
size_tKey,
LoadInst *LI) {
19742if (!LoadKeyUsed.
insert(Key).second) {
19743auto LIt = LoadsMap.
find(std::make_pair(Key,
Ptr));
19744if (LIt != LoadsMap.
end()) {
19745for (
LoadInst *RLI : LIt->second) {
19748/*StrictCheck=*/true))
19751for (
LoadInst *RLI : LIt->second) {
19758if (LIt->second.size() > 2) {
19760hash_value(LIt->second.back()->getPointerOperand());
19766 .first->second.push_back(LI);
19770while (!Worklist.empty()) {
19771auto [TreeN, Level] = Worklist.pop_back_val();
19774 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19775 addReductionOps(TreeN);
19776// Add reduction values. The values are sorted for better vectorization 19778for (
Value *V : PossibleRedVals) {
19781/*AllowAlternate=*/false);
19782 ++PossibleReducedVals[
Key][
Idx]
19783 .
insert(std::make_pair(V, 0))
19787 Worklist.emplace_back(
I,
I->getParent() == BB ? 0 : Level + 1);
19789auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
19790// Sort values by the total number of values kinds to start the reduction 19791// from the longest possible reduced values sequences. 19792for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
19793auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
19795for (
auto It = PossibleRedVals.begin(),
E = PossibleRedVals.end();
19798auto RedValsVect = It->second.takeVector();
19800for (
const std::pair<Value *, unsigned> &Data : RedValsVect)
19801 PossibleRedValsVect.
back().append(Data.second, Data.first);
19803stable_sort(PossibleRedValsVect, [](
constauto &P1,
constauto &P2) {
19804returnP1.size() > P2.size();
19809 (!isGoodForReduction(Data) &&
19810 (!isa<LoadInst>(Data.front()) ||
19811 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19813 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19815 cast<LoadInst>(ReducedVals[NewIdx].front())
19817 NewIdx = ReducedVals.
size();
19820 ReducedVals[NewIdx].
append(Data.rbegin(), Data.rend());
19823// Sort the reduced values by number of same/alternate opcode and/or pointer 19826returnP1.size() > P2.
size();
19831 /// Attempt to vectorize the tree found by matchAssociativeReduction. 19835constexprunsigned RegMaxNumber = 4;
19836constexprunsigned RedValsMaxNumber = 128;
19837// If there are a sufficient number of reduction values, reduce 19838// to a nearby power-of-2. We can safely generate oversized 19839// vectors and rely on the backend to split them to legal sizes. 19840if (
unsigned NumReducedVals = std::accumulate(
19841 ReducedVals.
begin(), ReducedVals.
end(), 0,
19843 if (!isGoodForReduction(Vals))
19845 return Num + Vals.size();
19847 NumReducedVals < ReductionLimit &&
19851for (ReductionOpsType &RdxOps : ReductionOps)
19852for (
Value *RdxOp : RdxOps)
19853V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19861// Track the reduced values in case if they are replaced by extractelement 19862// because of the vectorization. 19864 ReducedVals.
front().size());
19866// The compare instruction of a min/max is the insertion point for new 19867// instructions and may be replaced with a new compare instruction. 19868auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
19869assert(isa<SelectInst>(RdxRootInst) &&
19870"Expected min/max reduction to have select root instruction");
19871Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19872assert(isa<Instruction>(ScalarCond) &&
19873"Expected min/max reduction to have compare condition");
19874return cast<Instruction>(ScalarCond);
19877bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
19878 return isBoolLogicOp(cast<Instruction>(V));
19880// Return new VectorizedTree, based on previous value. 19881auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
19882if (VectorizedTree) {
19883// Update the final value in the reduction. 19885 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19886if (AnyBoolLogicOp) {
19887auto It = ReducedValsToOps.
find(VectorizedTree);
19888auto It1 = ReducedValsToOps.
find(Res);
19889if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
19891 (It != ReducedValsToOps.
end() &&
19893 return isBoolLogicOp(I) &&
19894 getRdxOperand(I, 0) == VectorizedTree;
19898 (It1 != ReducedValsToOps.
end() &&
19900 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19904 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
19908return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
19911// Initialize the final value in the reduction. 19915 ReductionOps.front().size());
19916for (ReductionOpsType &RdxOps : ReductionOps)
19917for (
Value *RdxOp : RdxOps) {
19920 IgnoreList.insert(RdxOp);
19922// Intersect the fast-math-flags from all reduction operations. 19925for (
Value *U : IgnoreList)
19926if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
19927 RdxFMF &= FPMO->getFastMathFlags();
19928bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19930// Need to track reduced vals, they may be changed during vectorization of 19933for (
Value *V : Candidates)
19934 TrackedVals.try_emplace(V, V);
19938auto *It = MV.
find(V);
19939assert(It != MV.
end() &&
"Unable to find given key.");
19944// List of the values that were reduced in other trees as part of gather 19945// nodes and thus requiring extract if fully vectorized in other trees. 19948bool CheckForReusedReductionOps =
false;
19949// Try to vectorize elements based on their type. 19953for (
unsignedI = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
19955 InstructionsState S = States[
I];
19959for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
19960Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19961// Check if the reduction value was not overriden by the extractelement 19962// instruction because of the vectorization and exclude it, if it is not 19963// compatible with other values. 19964// Also check if the instruction was folded to constant/other value. 19965auto *Inst = dyn_cast<Instruction>(RdxVal);
19967 (!S || !S.isOpcodeOrAlt(Inst))) ||
19971 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19973bool ShuffledExtracts =
false;
19974// Try to handle shuffled extractelements. 19975if (S && S.getOpcode() == Instruction::ExtractElement &&
19976 !S.isAltShuffle() &&
I + 1 <
E) {
19978for (
Value *RV : ReducedVals[
I + 1]) {
19979Value *RdxVal = TrackedVals.at(RV);
19980// Check if the reduction value was not overriden by the 19981// extractelement instruction because of the vectorization and 19982// exclude it, if it is not compatible with other values. 19983auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19986 CommonCandidates.push_back(RdxVal);
19987 TrackedToOrig.try_emplace(RdxVal, RV);
19992 Candidates.
swap(CommonCandidates);
19993 ShuffledExtracts =
true;
19997// Emit code for constant values. 20000Value *OrigV = TrackedToOrig.at(Candidates.
front());
20001 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20003 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
20004Value *OrigV = TrackedToOrig.at(VC);
20005 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20006if (
auto *ResI = dyn_cast<Instruction>(Res))
20007V.analyzedReductionRoot(ResI);
20009 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
20013unsigned NumReducedVals = Candidates.
size();
20014if (NumReducedVals < ReductionLimit &&
20015 (NumReducedVals < 2 || !
isSplat(Candidates)))
20018// Check if we support repeated scalar values processing (optimization of 20019// original scalar identity operations on matched horizontal reductions). 20020 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20021 RdxKind != RecurKind::FMul &&
20022 RdxKind != RecurKind::FMulAdd;
20023// Gather same values. 20025if (IsSupportedHorRdxIdentityOp)
20026for (
Value *V : Candidates) {
20027Value *OrigV = TrackedToOrig.at(V);
20028 ++SameValuesCounter.
try_emplace(OrigV).first->second;
20030// Used to check if the reduced values used same number of times. In this 20031// case the compiler may produce better code. E.g. if reduced values are 20032// aabbccdd (8 x values), then the first node of the tree will have a node 20033// for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>. 20034// Plus, the final reduction will be performed on <8 x aabbccdd>. 20035// Instead compiler may build <4 x abcd> tree immediately, + reduction (4 20037// Currently it only handles add/fadd/xor. and/or/min/max do not require 20038// this analysis, other operations may require an extra estimation of 20039// the profitability. 20040bool SameScaleFactor =
false;
20041bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20042 SameValuesCounter.
size() != Candidates.size();
20044if (OptReusedScalars) {
20046 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20047 RdxKind == RecurKind::Xor) &&
20049 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
20050returnP.second == SameValuesCounter.
front().second;
20052 Candidates.resize(SameValuesCounter.
size());
20053transform(SameValuesCounter, Candidates.begin(),
20054 [&](
constauto &
P) { return TrackedVals.at(P.first); });
20055 NumReducedVals = Candidates.size();
20056// Have a reduction of the same element. 20057if (NumReducedVals == 1) {
20058Value *OrigV = TrackedToOrig.at(Candidates.front());
20059unsigned Cnt = At(SameValuesCounter, OrigV);
20061 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20062 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20063 VectorizedVals.try_emplace(OrigV, Cnt);
20064 ExternallyUsedValues.
insert(OrigV);
20069unsigned MaxVecRegSize =
V.getMaxVecRegSize();
20070unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
20071constunsigned MaxElts = std::clamp<unsigned>(
20073 RegMaxNumber * RedValsMaxNumber);
20075unsigned ReduxWidth = NumReducedVals;
20076auto GetVectorFactor = [&, &
TTI = *
TTI](
unsigned ReduxWidth) {
20077unsigned NumParts, NumRegs;
20078Type *ScalarTy = Candidates.front()->getType();
20085while (NumParts > NumRegs) {
20086assert(ReduxWidth > 0 &&
"ReduxWidth is unexpectedly 0.");
20093if (NumParts > NumRegs / 2)
20098 ReduxWidth = GetVectorFactor(ReduxWidth);
20099 ReduxWidth = std::min(ReduxWidth, MaxElts);
20102unsigned Pos = Start;
20103// Restarts vectorization attempt with lower vector factor. 20104unsigned PrevReduxWidth = ReduxWidth;
20105bool CheckForReusedReductionOpsLocal =
false;
20106auto AdjustReducedVals = [&](
bool IgnoreVL =
false) {
20107bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
20108if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20109// Check if any of the reduction ops are gathered. If so, worth 20110// trying again with less number of reduction ops. 20111 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20114if (Pos < NumReducedVals - ReduxWidth + 1)
20115return IsAnyRedOpGathered;
20119 ReduxWidth = GetVectorFactor(ReduxWidth);
20120return IsAnyRedOpGathered;
20122bool AnyVectorized =
false;
20124while (Pos < NumReducedVals - ReduxWidth + 1 &&
20125 ReduxWidth >= ReductionLimit) {
20126// Dependency in tree of the reduction ops - drop this attempt, try 20128if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20130 CheckForReusedReductionOps =
true;
20133 PrevReduxWidth = ReduxWidth;
20135// Been analyzed already - skip. 20136if (IgnoredCandidates.
contains(std::make_pair(Pos, ReduxWidth)) ||
20139 std::make_pair(Pos,
bit_floor(ReduxWidth))) ||
20141 std::make_pair(Pos + (ReduxWidth -
bit_floor(ReduxWidth)),
20143V.areAnalyzedReductionVals(VL)) {
20144 (void)AdjustReducedVals(
/*IgnoreVL=*/true);
20147// Early exit if any of the reduction values were deleted during 20148// previous vectorization attempts. 20150auto *RedValI = dyn_cast<Instruction>(RedVal);
20153returnV.isDeleted(RedValI);
20156V.buildTree(VL, IgnoreList);
20157if (
V.isTreeTinyAndNotFullyVectorizable(
/*ForReduction=*/true)) {
20158if (!AdjustReducedVals())
20159V.analyzedReductionVals(VL);
20162if (
V.isLoadCombineReductionCandidate(RdxKind)) {
20163if (!AdjustReducedVals())
20164V.analyzedReductionVals(VL);
20167V.reorderTopToBottom();
20168// No need to reorder the root node at all. 20169V.reorderBottomToTop(
/*IgnoreReorder=*/true);
20170// Keep extracted other reduction values, if they are used in the 20171// vectorization trees. 20173 ExternallyUsedValues);
20174// The reduction root is used as the insertion point for new 20175// instructions, so set it as externally used to prevent it from being 20177 LocalExternallyUsedValues.insert(ReductionRoot);
20178for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
20179if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
20181for (
Value *V : ReducedVals[Cnt])
20182if (isa<Instruction>(V))
20183 LocalExternallyUsedValues.insert(TrackedVals[V]);
20185if (!IsSupportedHorRdxIdentityOp) {
20186// Number of uses of the candidates in the vector of values. 20188"Reused values counter map is not empty");
20189for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20190if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20192Value *
V = Candidates[Cnt];
20193Value *OrigV = TrackedToOrig.at(V);
20194 ++SameValuesCounter.
try_emplace(OrigV).first->second;
20199// Gather externally used values. 20201for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20202if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20204Value *RdxVal = Candidates[Cnt];
20205if (
auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20206 RdxVal = It->second;
20207if (!Visited.
insert(RdxVal).second)
20209// Check if the scalar was vectorized as part of the vectorization 20210// tree but not the top node. 20211if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
20212 LocalExternallyUsedValues.insert(RdxVal);
20215Value *OrigV = TrackedToOrig.at(RdxVal);
20217 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20218if (NumOps != ReducedValsToOps.
at(OrigV).size())
20219 LocalExternallyUsedValues.insert(RdxVal);
20221// Do not need the list of reused scalars in regular mode anymore. 20222if (!IsSupportedHorRdxIdentityOp)
20223 SameValuesCounter.
clear();
20224for (
Value *RdxVal : VL)
20225if (RequiredExtract.
contains(RdxVal))
20226 LocalExternallyUsedValues.insert(RdxVal);
20227V.buildExternalUses(LocalExternallyUsedValues);
20229V.computeMinimumValueSizes();
20234 getReductionCost(
TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20237 <<
" for reduction\n");
20241V.getORE()->emit([&]() {
20243 ReducedValsToOps.
at(VL[0]).front())
20244 <<
"Vectorizing horizontal reduction is possible " 20245 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
20246 <<
" and threshold " 20249if (!AdjustReducedVals()) {
20250V.analyzedReductionVals(VL);
20251unsignedOffset = Pos == Start ? Pos : Pos - 1;
20252if (ReduxWidth > ReductionLimit &&
V.isTreeNotExtendable()) {
20253// Add subvectors of VL to the list of the analyzed values. 20255 *
TTI, VL.front()->getType(), ReduxWidth - 1);
20256 VF >= ReductionLimit;
20258 *
TTI, VL.front()->getType(), VF - 1)) {
20260V.getCanonicalGraphSize() !=
V.getTreeSize())
20262for (
unsignedIdx : seq<unsigned>(ReduxWidth - VF))
20270LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:" 20271 <<
Cost <<
". (HorRdx)\n");
20272V.getORE()->emit([&]() {
20274 ReducedValsToOps.
at(VL[0]).front())
20275 <<
"Vectorized horizontal reduction with cost " 20277 <<
ore::NV(
"TreeSize",
V.getTreeSize());
20282// Emit a reduction. If the root is a select (min/max idiom), the insert 20283// point is the compare condition of that select. 20284Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20287 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20289// Vectorize a tree. 20290Value *VectorizedRoot =
20291V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20292// Update TrackedToOrig mapping, since the tracked values might be 20294for (
Value *RdxVal : Candidates) {
20295Value *OrigVal = TrackedToOrig.at(RdxVal);
20296Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20297if (TransformedRdxVal != RdxVal)
20298 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20303// To prevent poison from leaking across what used to be sequential, 20304// safe, scalar boolean logic operations, the reduction operand must be 20307 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }

        Value *ReducedSubTree;
        Type *ScalarTy = VL.front()->getType();
        if (isa<FixedVectorType>(ScalarTy)) {
          unsigned ScalarTyNumElements = getNumElements(ScalarTy);
          ReducedSubTree = PoisonValue::get(getWidenedType(
              VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
          for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
            // Do reduction for each lane.
            // e.g., do reduce add for
            //   VL[0] = <4 x Ty> <a, b, c, d>
            //   VL[1] = <4 x Ty> <e, f, g, h>
            //   Lane[0] = <2 x Ty> <a, e>
            //   Lane[1] = <2 x Ty> <b, f>
            //   Lane[2] = <2 x Ty> <c, g>
            //   Lane[3] = <2 x Ty> <d, h>
            //   result[0] = reduce add Lane[0]
            //   result[1] = reduce add Lane[1]
            //   result[2] = reduce add Lane[2]
            //   result[3] = reduce add Lane[3]
            SmallVector<int, 16> Mask =
                createStrideMask(I, ScalarTyNumElements, VL.size());
            Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
            ReducedSubTree = Builder.CreateInsertElement(
                ReducedSubTree,
                emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
          }
        } else {
          ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
                                         RdxRootInst->getType());
        }
        if (ReducedSubTree->getType() != VL.front()->getType()) {
          assert(ReducedSubTree->getType() != VL.front()->getType() &&
                 "Expected different reduction type.");
          ReducedSubTree =
              Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
                                    V.isSignedMinBitwidthRootNode());
        }
        // Improved analysis for add/fadd/xor reductions with same scale factor
        // for all operands of reductions. We can emit scalar ops for them
        // instead.
        if (OptReusedScalars && SameScaleFactor)
          ReducedSubTree = emitScaleForReusedOps(
              ReducedSubTree, Builder, SameValuesCounter.front().second);

        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
        // Count vectorized reduced values to exclude them from final reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
    }
    if (OptReusedScalars && !AnyVectorized) {
      for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
        Value *RdxVal = TrackedVals.at(P.first);
        Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
        VectorizedVals.try_emplace(P.first, P.second);
      }
    }
    if (VectorizedTree) {
      // Reorder operands of bool logical op in the natural order to avoid
      // possible problem with poison propagation. If not possible to reorder
      // (both operands are originally RHS), emit an extra freeze instruction
      // for the LHS operand.
      // I.e., if we have original code like this:
      //   RedOp1 = select i1 ?, i1 LHS, i1 false
      //   RedOp2 = select i1 RHS, i1 ?, i1 false
      //
      // Then, we swap LHS/RHS to create a new op that matches the poison
      // semantics of the original code.
      //
      // If we have original code like this and both values could be poison:
      //   RedOp1 = select i1 ?, i1 LHS, i1 false
      //   RedOp2 = select i1 ?, i1 RHS, i1 false
      //
      // Then, we must freeze LHS in the new op.
      auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                   Instruction *RedOp1,
                                                   Instruction *RedOp2,
                                                   bool InitStep) {
        if (!AnyBoolLogicOp)
          return;
        if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                      getRdxOperand(RedOp1, 0) == LHS ||
                                      isGuaranteedNotToBePoison(LHS, AC)))
          return;
        if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                      getRdxOperand(RedOp2, 0) == RHS ||
                                      isGuaranteedNotToBePoison(RHS, AC))) {
          std::swap(LHS, RHS);
          return;
        }
        if (LHS != VectorizedTree)
          LHS = Builder.CreateFreeze(LHS);
      };
      // Finish the reduction.
      // Need to add extra arguments and not vectorized possible reduction
      // values.
      // Try to avoid dependencies between the scalar remainders after
      // reductions.
      auto FinalGen =
          [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
              bool InitStep) {
            unsigned Sz = InstVals.size();
            SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                                     Sz % 2);
            for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
              Value *RdxVal1 = InstVals[I].second;
              Value *StableRdxVal1 = RdxVal1;
              auto It1 = TrackedVals.find(RdxVal1);
              if (It1 != TrackedVals.end())
                StableRdxVal1 = It1->second;
              Value *RdxVal2 = InstVals[I + 1].second;
              Value *StableRdxVal2 = RdxVal2;
              auto It2 = TrackedVals.find(RdxVal2);
              if (It2 != TrackedVals.end())
                StableRdxVal2 = It2->second;
              // To prevent poison from leaking across what used to be
              // sequential, safe, scalar boolean logic operations, the
              // reduction operand must be frozen.
              FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                                InstVals[I + 1].first, InitStep);
              Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                         StableRdxVal2, "op.rdx", ReductionOps);
              ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
            }
            if (Sz % 2 == 1)
              ExtraReds[Sz / 2] = InstVals.back();
            return ExtraReds;
          };
      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                   VectorizedTree);
      SmallPtrSet<Value *, 8> Visited;
      for (Value *RdxVal : Candidates) {
        if (!Visited.insert(RdxVal).second)
          continue;
        unsigned NumOps = VectorizedVals.lookup(RdxVal);
        for (Instruction *RedOp :
             ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
          ExtraReductions.emplace_back(RedOp, RdxVal);
      }
      // Iterate through all not-vectorized reduction values/extra arguments.
      bool InitStep = true;
      while (ExtraReductions.size() > 1) {
        SmallVector<std::pair<Instruction *, Value *>> NewReds =
            FinalGen(ExtraReductions, InitStep);
        ExtraReductions.swap(NewReds);
        InitStep = false;
      }
      VectorizedTree = ExtraReductions.front().second;
      ReductionRoot->replaceAllUsesWith(VectorizedTree);

      // The original scalar reduction is expected to have no remaining
      // uses outside the reduction tree itself. Assert that we got this
      // correct, replace internal uses with undef, and mark for eventual
      // deletion.
#ifndef NDEBUG
      SmallSet<Value *, 4> IgnoreSet;
      for (ArrayRef<Value *> RdxOps : ReductionOps)
        IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
#endif
      for (ArrayRef<Value *> RdxOps : ReductionOps) {
        for (Value *Ignore : RdxOps) {
          if (!Ignore)
            continue;
#ifndef NDEBUG
          for (auto *U : Ignore->users()) {
            assert(IgnoreSet.count(U) &&
                   "All users must be either in the reduction ops list.");
          }
#endif
          if (!Ignore->use_empty()) {
            Value *P = PoisonValue::get(Ignore->getType());
            Ignore->replaceAllUsesWith(P);
          }
        }
        V.removeInstructionsAndOperands(RdxOps);
      }
    } else if (!CheckForReusedReductionOps) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
    }
    return VectorizedTree;
  }
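  // Illustrative standalone sketch (not part of this pass): why the boolean
  // reduction operands above are reordered or frozen. The original select-based
  // form never evaluates (and never propagates poison from) the guarded
  // operand, while a reassociated plain `and` would use both sides
  // unconditionally. The C++ analogue below shows the same ordering concern
  // with short-circuit evaluation; `guardedDivide` is a made-up example.
  //
  //   #include <cassert>
  //
  //   bool guardedDivide(int X) {
  //     // Safe: the division only happens when X != 0 (short-circuit &&).
  //     return X != 0 && 10 / X > 1;
  //     // Rewriting this as `(10 / X > 1) & (X != 0)` would evaluate 10 / X
  //     // even for X == 0, which is why the rewritten reduction freezes or
  //     // swaps such operands.
  //   }
  //
  //   int main() {
  //     assert(guardedDivide(0) == false); // never divides by zero
  //     assert(guardedDivide(2) == true);  // 10 / 2 = 5 > 1
  //     return 0;
  //   }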
  /// Calculate the cost of a reduction.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R) {
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at the compile time.
    bool AllConsts = allConstant(ReducedVals);
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
            continue;
          }
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      if (!AllConsts) {
        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
          // ...
          for (unsigned I : seq<unsigned>(ReducedVals.size())) {
            // ...
          }
        } else {
          Type *RedTy = ScalarTy;
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          if (RType == RedTy) {
            // ...
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        // ...
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      // ...
      ScalarCost = EvaluateScalarCost([&]() {
        // ...
      });
      break;
    }
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
  /// Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        DestTy->getScalarType() != FTy->getScalarType()) {
      // Convert vector_reduce_add(ZExt(<n x i1>)) to
      // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
      Value *V = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
    }
    ++NumVectorInstructions;
    return createSimpleReduction(Builder, VectorizedValue, RdxKind);
  }
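  // Illustrative standalone sketch (independent of LLVM): summing an <n x i1>
  // vector equals the popcount of the packed bits, which is the identity the
  // ctpop rewrite above relies on. `sumBools`/`packAndPopcount` are made-up
  // helper names; the sketch assumes at most 64 elements.
  //
  //   #include <bit>
  //   #include <cassert>
  //   #include <cstdint>
  //   #include <vector>
  //
  //   unsigned sumBools(const std::vector<bool> &Bits) {
  //     unsigned Sum = 0;
  //     for (bool B : Bits)
  //       Sum += B;                              // vector_reduce_add(zext(<n x i1>))
  //     return Sum;
  //   }
  //
  //   unsigned packAndPopcount(const std::vector<bool> &Bits) {
  //     std::uint64_t Packed = 0;
  //     for (std::size_t I = 0; I < Bits.size(); ++I)
  //       Packed |= static_cast<std::uint64_t>(Bits[I]) << I; // bitcast <n x i1> to iN
  //     return std::popcount(Packed);                         // ctpop(iN)
  //   }
  //
  //   int main() {
  //     std::vector<bool> Bits = {true, false, true, true, false, true};
  //     assert(sumBools(Bits) == packAndPopcount(Bits));
  //     return 0;
  //   }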
  /// Emits optimized code for unique scalar value reused \p Cnt times.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::IAnyOf:
    case RecurKind::FAnyOf:
    case RecurKind::IFindLastIV:
    case RecurKind::FFindLastIV:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
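  // Illustrative standalone sketch (not LLVM code): the scalar identities used
  // above for a value repeated Cnt times inside a reduction. Function names
  // are made up; the checks simply replay each identity against a loop.
  //
  //   #include <algorithm>
  //   #include <cassert>
  //   #include <climits>
  //
  //   int addRepeated(int X, unsigned Cnt) { return X * static_cast<int>(Cnt); } // add: x*n
  //   int xorRepeated(int X, unsigned Cnt) { return (Cnt % 2 == 0) ? 0 : X; }    // xor: parity
  //   int minRepeated(int X, unsigned /*Cnt*/) { return X; }                     // min/max/and/or: x
  //
  //   int main() {
  //     for (unsigned Cnt = 1; Cnt <= 5; ++Cnt) {
  //       int Add = 0, Xor = 0, Min = INT_MAX;
  //       for (unsigned I = 0; I < Cnt; ++I) {
  //         Add += 7;
  //         Xor ^= 7;
  //         Min = std::min(Min, 7);
  //       }
  //       assert(Add == addRepeated(7, Cnt));
  //       assert(Xor == xorRepeated(7, Cnt));
  //       assert(Min == minRepeated(7, Cnt));
  //     }
  //     return 0;
  //   }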
  /// Emits actual operation for the scalar identity values, found during
  /// horizontal reduction analysis.
  Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                       BoUpSLP &R,
                       const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                       const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          VectorizedValue,
          getWidenedType(VL.front()->getType(), VTy->getNumElements()),
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x xor x = 0.
      // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if elements 4th and 6th elements have even number of repeats.
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::IAnyOf:
    case RecurKind::FAnyOf:
    case RecurKind::IFindLastIV:
    case RecurKind::FFindLastIV:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
};
} // end anonymous namespace

/// Gets recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}
static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0)) // check homogeneity
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
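// Illustrative standalone sketch (separate from the helper above): the total
// element count of a homogeneous nested aggregate is the product of the counts
// at each level, e.g. [2 x {<2 x float>, <2 x float>}] -> 2 * 2 * 2 = 8. The
// TypeDesc representation below is made up and assumes homogeneity.
//
//   #include <cassert>
//   #include <cstdint>
//   #include <optional>
//   #include <vector>
//
//   struct TypeDesc {
//     unsigned Count = 1;             // elements at this level
//     std::vector<TypeDesc> Elements; // assumed homogeneous (all same shape)
//   };
//
//   std::optional<std::uint64_t> aggregateSize(const TypeDesc &T) {
//     if (T.Elements.empty())
//       return T.Count;               // leaf: a scalar (1) or fixed vector (N)
//     auto Inner = aggregateSize(T.Elements.front());
//     if (!Inner)
//       return std::nullopt;
//     return std::uint64_t{T.Count} * *Inner;
//   }
//
//   int main() {
//     TypeDesc Vec2{2, {}};                     // <2 x float>
//     TypeDesc Struct2{2, {Vec2, Vec2}};        // {<2 x float>, <2 x float>}
//     TypeDesc Array2{2, {Struct2, Struct2}};   // [2 x {...}]
//     assert(aggregateSize(Array2) == 8u);
//     return 0;
//   }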
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

/// Recognize construction of vectors like
///  %ra = insertelement <4 x float> poison, float %s0, i32 0
///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
/// starting from the last insertelement or insertvalue instruction.
///
/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
///
/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
///
/// \return true if it matches.
static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0, R);
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}
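// Illustrative standalone sketch (not the walker above): walking a chain of
// "insert(value, index)" nodes backwards from the last insert fills one slot
// per insert, which is the shape of a buildvector sequence. InsertNode and
// collectBuildVector are made-up names.
//
//   #include <cassert>
//   #include <cstddef>
//   #include <vector>
//
//   struct InsertNode {
//     const InsertNode *Prev = nullptr; // operand 0: the vector being built
//     int Value = 0;                    // operand 1: the inserted scalar
//     std::size_t Index = 0;            // constant insertion index
//   };
//
//   bool collectBuildVector(const InsertNode *Last, std::vector<int> &Opds) {
//     for (; Last; Last = Last->Prev) {
//       if (Last->Index >= Opds.size())
//         return false;                 // index out of range: no match
//       Opds[Last->Index] = Last->Value;
//     }
//     return true;
//   }
//
//   int main() {
//     // %a = insert poison, 10, 0 ; %b = insert %a, 20, 1 ; %c = insert %b, 30, 2
//     InsertNode A{nullptr, 10, 0}, B{&A, 20, 1}, C{&B, 30, 2};
//     std::vector<int> Opds(3, 0);
//     assert(collectBuildVector(&C, Opds));
//     assert(Opds[0] == 10 && Opds[1] == 20 && Opds[2] == 30);
//     return 0;
//   }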
/// Try and get a reduction instruction from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBL = LI->getLoopFor(ParentBB);
  if (!BBL)
    return nullptr;
  BasicBlock *BBLatch = BBL->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from
  // that. This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
/// We could have an initial reduction that is not an add.
///  r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
/// \Returns the new root if found, which may be nullptr if not an instruction.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// \p Returns the first operand of \p I that does not match \p Phi. If
/// operand is not an instruction it returns nullptr.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \Returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
21030bool TryOperandsAsNewSeeds =
P && isa<BinaryOperator>(Root);
21032if (Root->
getParent() != BB || isa<PHINode>(Root))
21035// If we can find a secondary reduction root, use that instead. 21036auto SelectRoot = [&]() {
21044// Start analysis starting from Root instruction. If horizontal reduction is 21045// found, try to vectorize it. If it is not a horizontal reduction or 21046// vectorization is not possible or not effective, and currently analyzed 21047// instruction is a binary operation, try to vectorize the operands, using 21048// pre-order DFS traversal order. If the operands were not vectorized, repeat 21049// the same procedure considering each operand as a possible root of the 21050// horizontal reduction. 21051// Interrupt the process if the Root instruction itself was vectorized or all 21052// sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. 21053// If a horizintal reduction was not matched or vectorized we collect 21054// instructions for possible later attempts for vectorization. 21055 std::queue<std::pair<Instruction *, unsigned>>
Stack;
21056Stack.emplace(SelectRoot(), 0);
21060if (
R.isAnalyzedReductionRoot(Inst))
21065if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *
DL, *TLI))
21067return HorRdx.tryToReduce(R, *
DL,
TTI, *TLI, AC);
21069auto TryAppendToPostponedInsts = [&](
Instruction *FutureSeed) {
21070if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21075// Do not collect CmpInst or InsertElementInst/InsertValueInst as their 21076// analysis is done separately. 21077if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21082while (!
Stack.empty()) {
21085 std::tie(Inst, Level) =
Stack.front();
21087// Do not try to analyze instruction that has already been vectorized. 21088// This may happen when we vectorize instruction operands on a previous 21089// iteration while stack was populated before that happened. 21090if (
R.isDeleted(Inst))
21092if (
Value *VectorizedV = TryToReduce(Inst)) {
21094if (
auto *
I = dyn_cast<Instruction>(VectorizedV)) {
21095// Try to find another reduction. 21099if (
R.isDeleted(Inst))
21102// We could not vectorize `Inst` so try to use it as a future seed. 21103if (!TryAppendToPostponedInsts(Inst)) {
21109// Try to vectorize operands. 21110// Continue analysis for the instruction from the same basic block only to 21111// save compile time. 21114if (VisitedInstrs.
insert(
Op).second)
21115if (
auto *
I = dyn_cast<Instruction>(
Op))
21116// Do not try to vectorize CmpInst operands, this is done 21118if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
I) &&
21119 !
R.isDeleted(
I) &&
I->getParent() == BB)
21128bool Res = vectorizeHorReduction(
P, Root, BB, R, PostponedInsts);
21129 Res |= tryToVectorize(PostponedInsts, R);
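// Illustrative standalone sketch of the traversal shape above: a worklist is
// seeded with the root; when a node cannot be reduced, its operands are
// re-queued as new candidate roots, up to a fixed recursion depth. The Node
// type and countReduced are made up for illustration.
//
//   #include <cassert>
//   #include <queue>
//   #include <utility>
//   #include <vector>
//
//   struct Node {
//     bool Reducible = false;
//     std::vector<Node *> Operands;
//   };
//
//   unsigned countReduced(Node *Root, unsigned MaxDepth) {
//     unsigned Reduced = 0;
//     std::queue<std::pair<Node *, unsigned>> Stack; // (node, depth), FIFO
//     Stack.emplace(Root, 0u);
//     while (!Stack.empty()) {
//       auto [N, Level] = Stack.front();
//       Stack.pop();
//       if (N->Reducible) {
//         ++Reduced;            // a reduction was matched and "vectorized"
//         continue;
//       }
//       if (++Level < MaxDepth) // otherwise descend into operands as new seeds
//         for (Node *Op : N->Operands)
//           Stack.emplace(Op, Level);
//     }
//     return Reduced;
//   }
//
//   int main() {
//     Node Leaf1{true, {}}, Leaf2{true, {}}, Mid{false, {&Leaf1, &Leaf2}};
//     Node Root{false, {&Mid}};
//     assert(countReduced(&Root, /*MaxDepth=*/4) == 2);
//     assert(countReduced(&Root, /*MaxDepth=*/1) == 0);
//     return 0;
//   }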
bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements base on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(*SameTypeIt, *IncIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-state attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with the
    // size of maximal register at first.
    // 2. Try to vectorize remaining instructions with the same type, if
    // possible. This may result in the better vectorization results rather than
    // if we try just to vectorize instructions with the same/alternate opcodes.
    // 3. Final attempt to try to vectorize all instructions with the
    // same/alternate ops only, this may result in some extra final
    // vectorization.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      /// \Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
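// Illustrative standalone sketch of the scan pattern used above: after
// sorting, walk the list and collect maximal runs of mutually "compatible"
// elements, handing each run to an action. groupRuns is a made-up name and the
// demo groups plain ints by parity.
//
//   #include <cassert>
//   #include <functional>
//   #include <vector>
//
//   template <typename T>
//   unsigned groupRuns(std::vector<T> Sorted,
//                      std::function<bool(const T &, const T &)> AreCompatible,
//                      std::function<void(const std::vector<T> &)> Action) {
//     unsigned NumRuns = 0;
//     for (std::size_t I = 0; I < Sorted.size();) {
//       std::vector<T> Run{Sorted[I]};
//       std::size_t J = I + 1;
//       for (; J < Sorted.size() && AreCompatible(Sorted[I], Sorted[J]); ++J)
//         Run.push_back(Sorted[J]);
//       Action(Run);
//       ++NumRuns;
//       I = J; // start over at the next element of a different "kind"
//     }
//     return NumRuns;
//   }
//
//   int main() {
//     auto SameParity = [](const int &A, const int &B) { return A % 2 == B % 2; };
//     std::vector<int> RunSizes;
//     unsigned N = groupRuns<int>({2, 4, 6, 1, 3, 8}, SameParity,
//                                 [&](const std::vector<int> &Run) {
//                                   RunSizes.push_back(static_cast<int>(Run.size()));
//                                 });
//     assert(N == 3 && RunSizes[0] == 3 && RunSizes[1] == 2 && RunSizes[2] == 1);
//     return 0;
//   }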
/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps have same/swapped predicates and most compatible
/// corresponding operands. If IsCompatibility is false, function implements
/// strict weak ordering relation between two cmp instructions, returning true
/// if the first instruction is "less" than the second, i.e. its predicate is
/// less than the predicate of the second or the operands IDs are less than the
/// operands IDs of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
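// Illustrative standalone sketch of the compareCmp<IsCompatibility> idea: one
// template serves both as a strict weak ordering (IsCompatibility = false) and
// as an equivalence/compatibility check (IsCompatibility = true), by flipping
// what "keys are equal" and "first key is smaller" return. The Key struct and
// compareKeys are made up.
//
//   #include <cassert>
//
//   struct Key { int TypeID; int Bits; };
//
//   template <bool IsCompatibility>
//   bool compareKeys(const Key &A, const Key &B) {
//     if (A.TypeID < B.TypeID)
//       return !IsCompatibility; // "less" for ordering, "not compatible" otherwise
//     if (A.TypeID > B.TypeID)
//       return false;
//     if (A.Bits < B.Bits)
//       return !IsCompatibility;
//     if (A.Bits > B.Bits)
//       return false;
//     return IsCompatibility;    // equal keys: not "less", but compatible
//   }
//
//   int main() {
//     Key A{1, 32}, B{1, 64}, C{1, 32};
//     assert(compareKeys<false>(A, B));  // A sorts before B
//     assert(!compareKeys<false>(A, C)); // equal keys are not "less"
//     assert(compareKeys<true>(A, C));   // ...but they are compatible
//     assert(!compareKeys<true>(A, B));
//     return 0;
//   }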
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |=
          vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;

  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. Allows better to identify the chains that can be vectorized in the
  // better way.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers, we don't care about other type.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode({I1, I2}, *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();
  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
  // also vectorizes `PostProcessCmps`.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if `I` is an instruction without users, like terminator, or
  // function call with ignored return value, store. Ignore unused instructions
  // (basing on instruction type, except for CallInst and InvokeInst).
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable type. The num of elements is unknown at
    // compile-time for scalable type.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for the deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        // TODO: Collect the skipped incoming values and try to vectorize them
        // after processing BB.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize chain in store, if this is the only store to the
        // address in the block.
        // TODO: This is just a temporarily solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number of
    // elements is based on the size of the index expression, rather than the
    // size of the GEP itself (the target's pointer size).
    auto It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set a candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them or their index is optimized to constant value.
      // If so, they are marked as deleted, so remove them from the set of
      // candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
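// Illustrative standalone sketch of the candidate filtering above: pairs whose
// addresses differ by a compile-time constant are dropped, because one can be
// computed from the other and makes a poor bottom-up seed. Plain integer
// offsets stand in for SCEV expressions; filterConstantDiffPairs is made up.
//
//   #include <cassert>
//   #include <optional>
//   #include <set>
//   #include <vector>
//
//   // Stand-in for SE->getMinusSCEV(): same base + offsets gives a constant diff.
//   struct Addr { int BaseID; int Offset; };
//   std::optional<int> constantDiff(const Addr &A, const Addr &B) {
//     if (A.BaseID != B.BaseID)
//       return std::nullopt;
//     return A.Offset - B.Offset;
//   }
//
//   std::set<std::size_t> filterConstantDiffPairs(const std::vector<Addr> &GEPs) {
//     std::set<std::size_t> Candidates;
//     for (std::size_t I = 0; I < GEPs.size(); ++I)
//       Candidates.insert(I);
//     for (std::size_t I = 0; I < GEPs.size(); ++I)
//       for (std::size_t J = I + 1; J < GEPs.size(); ++J)
//         if (constantDiff(GEPs[I], GEPs[J])) {
//           Candidates.erase(I);
//           Candidates.erase(J);
//         }
//     return Candidates;
//   }
//
//   int main() {
//     // a+0 and a+4 differ by a constant -> both removed; b+8 survives.
//     std::vector<Addr> GEPs = {{/*a*/ 1, 0}, {/*a*/ 1, 4}, {/*b*/ 2, 8}};
//     auto Kept = filterConstantDiffPairs(GEPs);
//     assert(Kept.size() == 1 && *Kept.begin() == 2);
//     return 0;
//   }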
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
        DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        return static_cast<bool>(getSameOpcode({I1, I2}, *TLI));
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stores to the same addresses several times, in this case need
    // to follow the stores order (reversed to meet the memory dependecies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
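As a rough illustration of the mask composition such a helper performs (a sketch only, assuming the usual convention that each SubMask element indexes into the existing Mask and that PoisonMaskElem marks undefined lanes; the function name is hypothetical and the snippet relies on the LLVM ADT headers this file already includes):
static void composeMaskSketch(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
  // Each defined SubMask lane picks a lane of the previously accumulated Mask.
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  for (unsigned I = 0, E = SubMask.size(); I != E; ++I)
    if (SubMask[I] != PoisonMaskElem)
      NewMask[I] = Mask.empty() ? SubMask[I] : Mask[SubMask[I]];
  Mask.swap(NewMask);
}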
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is a multiple of the subvector's length.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
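A minimal sketch of such a check, assuming an order of size N counts as reversed when every defined element satisfies Order[I] == N - 1 - I (elements equal to N treated as unspecified); the helper name is hypothetical:
static bool isReverseOrderSketch(ArrayRef<unsigned> Order) {
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [Sz](const auto &P) {
    // An element equal to Sz is an "undefined" placeholder and is ignored.
    return P.value() == Sz || P.value() == Sz - 1 - P.index();
  });
}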
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
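A small usage example for the APInt bit-manipulation calls listed above, of the kind used to describe demanded vector elements (values are illustrative):
APInt Demanded = APInt::getZero(/*numBits=*/8); // one bit per vector element
Demanded.setBits(/*loBit=*/0, /*hiBit=*/4);     // elements 0..3 are demanded
Demanded.clearBit(2);                           // element 2 turned out to be dead
assert(!Demanded.isZero() && !Demanded.isAllOnes());
APInt AllDemanded = APInt::getAllOnes(/*numBits=*/8);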
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
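An illustrative use of the ArrayRef views listed above (Vals is a placeholder array; the same operations are applied to bundles of scalars in the pass):
int Vals[] = {0, 1, 2, 3, 4, 5};
ArrayRef<int> A(Vals);
ArrayRef<int> Head = A.take_front(3); // {0, 1, 2}
ArrayRef<int> Tail = A.drop_front(2); // {2, 3, 4, 5}
ArrayRef<int> Mid  = A.slice(1, 3);   // {1, 2, 3}
bool Same = Head.equals(A.drop_back(3)); // true: both views are {0, 1, 2}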
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
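A short example of the predicate helpers listed above (Cmp is a hypothetical CmpInst pointer):
CmpInst::Predicate Pred    = Cmp->getPredicate();        // e.g. ICMP_SGT
CmpInst::Predicate Swapped = Cmp->getSwappedPredicate(); // ICMP_SLT: same test with operands swapped
CmpInst::Predicate Inverse = Cmp->getInversePredicate(); // ICMP_SLE: logical negation of the test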
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
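An illustrative IRBuilder sequence, not taken from this pass, showing how four hypothetical scalars S0..S3 could be gathered into a <4 x i32> vector (InsertPt is a hypothetical insertion-point instruction):
IRBuilder<> Builder(InsertPt);
Type *I32 = Builder.getIntNTy(32);
Value *Vec = PoisonValue::get(FixedVectorType::get(I32, /*NumElts=*/4));
Value *Scalars[] = {S0, S1, S2, S3};
for (unsigned I = 0; I != 4; ++I)
  Vec = Builder.CreateInsertElement(Vec, Scalars[I], Builder.getInt32(I));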
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory referenced by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
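Example calls to the static mask queries listed above on small constant masks, with the expected results per their documented semantics:
int Rev[]   = {3, 2, 1, 0};
int Splat[] = {0, 0, 0, 0};
int Ext[]   = {2, 3};
int ExtractIndex = 0;
bool IsRev     = ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4);        // true
bool IsSplat   = ShuffleVectorInst::isZeroEltSplatMask(Splat, /*NumSrcElts=*/4); // true
bool IsExtract = ShuffleVectorInst::isExtractSubvectorMask(Ext, /*NumSrcElts=*/4,
                                                           ExtractIndex);        // true, ExtractIndex == 2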
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
@ OK_UniformConstantValue
@ OK_NonUniformConstantValue
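An illustrative cost query against the TargetTransformInfo interface listed above (TTIRef and Ctx are hypothetical; the SLP cost model sums numbers of exactly this kind when deciding whether a tree is profitable):
auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), /*NumElts=*/4);
InstructionCost AddCost = TTIRef.getArithmeticInstrCost(
    Instruction::Add, VecTy, TargetTransformInfo::TCK_RecipThroughput);
InstructionCost RevCost = TTIRef.getShuffleCost(
    TargetTransformInfo::SK_Reverse, VecTy, /*Mask=*/{},
    TargetTransformInfo::TCK_RecipThroughput);
bool ShuffleDominates = RevCost > AddCost;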
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state in which we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
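A small example of the PatternMatch helpers listed above, recognizing the shape (A * B) + C rooted at a hypothetical Value *V (operand order as written; a commutative matcher would also accept C + A*B):
using namespace llvm::PatternMatch;
Value *A = nullptr, *B = nullptr, *C = nullptr;
bool IsMulAdd = match(V, m_Add(m_Mul(m_Value(A), m_Value(B)), m_Value(C)));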
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
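Example use (L0 and L1 are hypothetical LoadInst pointers; DL and SE are the usual analyses): two loads of the same element type are consecutive when their pointers differ by exactly one element, which is the check behind consecutive-load detection.
std::optional<int> Diff =
    getPointersDiff(L0->getType(), L0->getPointerOperand(),
                    L1->getType(), L1->getPointerOperand(), DL, SE,
                    /*StrictCheck=*/true);
bool Consecutive = Diff && *Diff == 1;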
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff every element of A is also an element of B.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Propagate metadata common to all scalars in VL onto I. Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
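A hedged sketch of the typical call site (names assumed): after emitting one wide load for a bundle of scalar loads, the metadata common to all scalars is intersected onto the new instruction.
  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  static Instruction *attachCommonMetadata(Instruction *VecLoad,
                                           ArrayRef<Value *> ScalarLoads) {
    // Keeps only the metadata (tbaa, alias.scope, noalias, fpmath,
    // nontemporal, access_group) that holds for every scalar in the bundle.
    return propagateMetadata(VecLoad, ScalarLoads);
  }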
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
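A sketch of the semantics, assuming Indices is a permutation of 0..N-1: element I of the reordered sequence comes from position Indices[I], so the inverse shuffle mask sends Indices[I] back to lane I (the helper name below is illustrative, not this file's implementation).
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"

  static void invertPermutation(llvm::ArrayRef<unsigned> Indices,
                                llvm::SmallVectorImpl<int> &Mask) {
    Mask.assign(Indices.size(), /*PoisonMaskElem*/ -1);
    for (unsigned I = 0, E = Indices.size(); I != E; ++I)
      Mask[Indices[I]] = I;
  }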
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
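Concrete values make these two mask helpers easier to read; the numbers below follow directly from their definitions, with VF = 4 chosen arbitrarily.
  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  void buildExampleMasks() {
    // Elements 0, 2, 4 and 6 of the source vector: <0, 2, 4, 6>.
    SmallVector<int, 16> Strided =
        createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
    // Each of the 4 lanes repeated twice: <0, 0, 1, 1, 2, 2, 3, 3>.
    SmallVector<int, 16> Replicated =
        createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/4);
    (void)Strided;
    (void)Replicated;
  }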
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type with Sz elements of type Ty represents a full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
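A hedged sketch (builder and names assumed): the widened arithmetic instruction may only carry flags such as nsw/nuw or fast-math flags if every scalar it replaces had them, which is the intersection this helper computes.
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Transforms/Utils/LoopUtils.h"
  using namespace llvm;

  static Value *buildVectorAdd(IRBuilderBase &B, Value *VecL, Value *VecR,
                               ArrayRef<Value *> ScalarAdds) {
    Value *VecAdd = B.CreateAdd(VecL, VecR);
    // nsw/nuw survive only if every scalar add in the bundle carried them.
    propagateIRFlags(VecAdd, ScalarAdds);
    return VecAdd;
  }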
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
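A small sketch (function name assumed) of how this predicate mapping is used when a min/max recurrence is expanded to compare + select.
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Transforms/Utils/LoopUtils.h"
  using namespace llvm;

  static Value *expandSignedMax(IRBuilderBase &B, Value *A, Value *C) {
    // For RecurKind::SMax this returns ICmpInst::ICMP_SGT.
    CmpInst::Predicate Pred = getMinMaxReductionPredicate(RecurKind::SMax);
    Value *Cmp = B.CreateICmp(Pred, A, C);
    return B.CreateSelect(Cmp, A, C);
  }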
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
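A hedged sketch of the kind of narrowing question this answers (helper name and the i32-to-i16 choice are assumptions): a 32-bit value can be truncated to 16 bits and sign-extended back unchanged when its top 17 bits are all copies of the sign bit.
  #include "llvm/Analysis/ValueTracking.h"
  using namespace llvm;

  // Assumes V has type i32.
  static bool fitsInSigned16(const Value *V, const DataLayout &DL) {
    // The value round-trips through i16 iff it has more than 32 - 16 sign bits.
    return ComputeNumSignBits(V, DL) > 32u - 16u;
  }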
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
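These specializations exist so the generic GraphWriter machinery can draw the vectorizable tree; a minimal sketch of the effect (the wrapper name and graph title are assumptions, and BoUpSLP is only visible inside this file).
  #include "llvm/Support/GraphWriter.h"

  // Emits a DOT graph of R's vectorizable tree and runs the viewer, driven
  // by the GraphTraits/DOTGraphTraits specializations above.
  static void viewSLPTree(BoUpSLP &R) {
    llvm::ViewGraph(&R, "slp-tree");
  }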
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
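A short sketch of how an edge is described (variable names assumed): the pair (UserTE, EdgeIdx) says "this bundle feeds operand EdgeIdx of the user tree entry", and dump()/operator<< exist purely for debug output.
  // Only meaningful inside this file, where BoUpSLP::TreeEntry is visible
  // and DEBUG_TYPE is defined.
  static void describeUse(BoUpSLP::TreeEntry *UserTE, unsigned OperandIdx) {
    BoUpSLP::EdgeInfo EI(UserTE, OperandIdx);
    LLVM_DEBUG(EI.dump()); // Debug-print the user entry and operand index.
  }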