//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;
using namespace std::placeholders;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115"Controls which SLP graphs should be vectorized.");
119cl::desc(
"Run the SLP vectorization passes"));
123cl::desc(
"Enable vectorization for wider vector utilization"));
127cl::desc(
"Only vectorize if you gain more than this " 132cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on " 133"heuristics and makes vectorization decision via cost modeling."));
137cl::desc(
"Attempt to vectorize horizontal reductions"));
142"Attempt to vectorize horizontal reductions feeding into a store"));
146cl::desc(
"Attempt to vectorize for this register size in bits"));
150cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
    cl::desc("Limit the size of the SLP scheduling region per block"));
    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Limit the recursion depth when building a vectorizable tree"));

    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering this one is less frequently used,
// hence the impact of a higher value is less noticeable.
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is a runtime value"));

    cl::desc("The maximum stride, considered to be profitable."));

    cl::desc("Display the SLP trees with Graphviz"));

    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.

/// Maximum allowed number of operands in the PHI nodes.

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}
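// For example, i32 and float are accepted, while x86_fp80 and ppc_fp128 are
// rejected up front; with SLPReVec enabled, a fixed vector such as <4 x i8>
// is checked via its scalar element type.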
/// Returns the type of the given value/instruction \p V. If it is store,
/// returns the type of its value operand, for Cmp - the types of the compare
/// operands and for insertelement - the type of the inserted operand.
/// Otherwise, just the type of the value is returned.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}
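// E.g., for `store i32 %x, ptr %p` this yields i32, for `icmp ult i64 %a, %b`
// it yields i64 (the compared type, not i1), and for
// `insertelement <4 x float> %v, float %s, i32 1` it yields float.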
/// \returns the number of elements for Ty.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}
/// \returns the vector type of ScalarTy based on vectorization factor.
static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
  return FixedVectorType::get(ScalarTy->getScalarType(),
                              VF * getNumElements(ScalarTy));
}

/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// instruction selection.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_ceil(Sz);
  // Find the number of elements, which forms full vectors.
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}

/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// instruction selection.
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_floor(Sz);
  // Find the number of elements, which forms full vectors.
  unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}
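// Worked example (assuming a target where <6 x i32> legalizes into two
// 4-element registers): for Ty = i32 and Sz = 6, the "full" variant rounds up
// to 8 elements (two whole registers), while the "floor" variant rounds down
// to 4 elements (one whole register).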
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  // The ShuffleBuilder implementation uses shufflevector to splat an
  // "element". But the element has a different meaning for SLP (scalar) and
  // REVEC (vector). We need to expand Mask into masks which shufflevector can
  // use directly.
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}
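// E.g., with VecTyNumElements == 2, the SLP-level mask {1, 0} expands to the
// shufflevector mask {2, 3, 0, 1}; a PoisonMaskElem entry stays poison for the
// whole expanded group.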
/// \returns the number of groups of shufflevector
/// A group has the following features:
/// 1. All of the values in a group are shufflevector instructions.
/// 2. The mask of every shufflevector is an extract-subvector mask
///    (isExtractSubvectorMask).
/// 3. Together, the masks of the shufflevectors use all of the elements of the
///    source.
/// e.g., it is 1 group (%0)
/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
///     <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
///     <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// it is 2 groups (%3 and %4)
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// it is 0 groups
/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(VL.slice(I, GroupSize), [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}
/// \returns a shufflevector mask which is used to vectorize shufflevectors
/// e.g.,
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// the result is
/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}
/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}
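// E.g., `i32 7` and `zeroinitializer` count as constants here, while a global
// `@g` or a constant expression such as `ptrtoint (ptr @g to i64)` does not.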
/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}
432/// Returns power-of-2 number of elements in a single register (part), given the 433/// total number of elements \p Size and number of registers (parts) \p 439/// Returns correct remaining number of elements, considering total amount \p 440/// Size, (power-of-2 number) of elements in a single register \p PartNumElems 441/// and current register (part) \p Part. 444return std::min<unsigned>(PartNumElems,
Size - Part * PartNumElems);
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  BasicBlock *BB = cast<Instruction>(*It)->getParent();
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}
/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal
  // integer/FP constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
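// E.g., {%x, undef, %x} is treated as a splat of %x, while {undef, undef} is
// not, since there is no defined value to splat.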
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
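// Rationale: a 'sub' whose only users are 'icmp eq/ne ..., 0' behaves
// commutatively, because swapping the operands only negates the difference,
// which does not change an equality comparison with zero. Similarly, 'sub'
// feeding only 'abs' (and 'fsub' feeding only 'fabs') is insensitive to the
// operand order up to the sign of the result, subject to the nsw/poison flag
// checks above.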
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  }
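  // E.g., with a base Offset of 1 and an insertelement into <4 x float> at
  // constant index 2, the combined index becomes 1 * 4 + 2 == 6 (Index starts
  // from Offset, as described in the doc comment below).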
/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using Offset as base offset for index.
/// \returns std::nullopt if the index is not an immediate.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;
  unsigned Index = Offset;
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
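// E.g., for `insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0` the
// aggregate indices {1, 0} are linearized row-major into the flat index
// 1 * 2 + 0 == 2.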
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg,    ///< The mask is expected to be for permutation of 1-2 vectors,
               ///< check for the mask elements for the first argument (mask
               ///< indices are in range [0:VF)).
  SecondArg,   ///< The mask is expected to be for permutation of 2 vectors,
               ///< check for the mask elements for the second argument (mask
               ///< indices are in range [VF:2*VF)).
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};

/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
      if (MaskArg == UseMask::UndefsAsMask)
625if (MaskArg == UseMask::FirstArg &&
Value < VF)
626 UseMask.reset(
Value);
627elseif (MaskArg == UseMask::SecondArg &&
Value >= VF)
628 UseMask.reset(
Value - VF);
633/// Checks if the given value is actually an undefined constant vector. 634/// Also, if the \p UseMask is not empty, tries to check if the non-masked 635/// elements actually mask the insertelement buildvector, if any. 636template <
bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
643auto *VecTy = dyn_cast<FixedVectorType>(
V->getType());
646auto *
C = dyn_cast<Constant>(V);
648if (!UseMask.empty()) {
650while (
auto *
II = dyn_cast<InsertElementInst>(
Base)) {
652if (isa<T>(
II->getOperand(1)))
659if (*
Idx < UseMask.size() && !UseMask.test(*
Idx))
662// TODO: Add analysis for shuffles here too. 667 Res &= isUndefVector<IsPoisonOnly>(
Base, SubMask);
674for (
unsignedI = 0, E = VecTy->getNumElements();
I != E; ++
I) {
677 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
683/// Checks if the vector of instructions can be represented as a shuffle, like: 684/// %x0 = extractelement <4 x i8> %x, i32 0 685/// %x3 = extractelement <4 x i8> %x, i32 3 686/// %y1 = extractelement <4 x i8> %y, i32 1 687/// %y2 = extractelement <4 x i8> %y, i32 2 688/// %x0x0 = mul i8 %x0, %x0 689/// %x3x3 = mul i8 %x3, %x3 690/// %y1y1 = mul i8 %y1, %y1 691/// %y2y2 = mul i8 %y2, %y2 692/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0 693/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 694/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 695/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 696/// ret <4 x i8> %ins4 697/// can be transformed into: 698/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5, 700/// %2 = mul <4 x i8> %1, %1 702/// Mask will return the Shuffle Mask equivalent to the extracted elements. 703/// TODO: Can we split off and reuse the shuffle mask detection from 704/// ShuffleVectorInst/getShuffleCost? 705static std::optional<TargetTransformInfo::ShuffleKind>
708constauto *It =
find_if(VL, IsaPred<ExtractElementInst>);
712 std::accumulate(VL.
begin(), VL.
end(), 0u, [](
unsigned S,
Value *V) {
713 auto *EI = dyn_cast<ExtractElementInst>(V);
716 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
719 return std::max(S, VTy->getNumElements());
725auto *EE = dyn_cast<ExtractElementInst>(V);
728Value *Vec = EE->getVectorOperand();
729if (isa<UndefValue>(Vec))
734 ShuffleMode CommonShuffleMode =
Unknown;
736for (
unsignedI = 0, E = VL.
size();
I < E; ++
I) {
737// Undef can be represented as an undef element in a vector. 738if (isa<UndefValue>(VL[
I]))
740auto *EI = cast<ExtractElementInst>(VL[
I]);
741if (isa<ScalableVectorType>(EI->getVectorOperandType()))
743auto *Vec = EI->getVectorOperand();
744// We can extractelement from undef or poison vector. 745if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
747// All vector operands must have the same number of vector elements. 748if (isa<UndefValue>(Vec)) {
751if (isa<UndefValue>(EI->getIndexOperand()))
753auto *
Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
756// Undefined behavior if Idx is negative or >= Size. 759unsigned IntIdx =
Idx->getValue().getZExtValue();
764// For correct shuffling we have to have at most 2 different vector operands 765// in all extractelement instructions. 766if (!Vec1 || Vec1 == Vec) {
768 }
elseif (!Vec2 || Vec2 == Vec) {
774if (CommonShuffleMode == Permute)
776// If the extract index is not the same as the operation number, it is a 779 CommonShuffleMode = Permute;
782 CommonShuffleMode =
Select;
784// If we're not crossing lanes in different vectors, consider it as blending. 785if (CommonShuffleMode ==
Select && Vec2)
787// If Vec2 was never used, we have a permutation of a single vector, otherwise 788// we have permutation of 2 vectors. 793/// \returns True if Extract{Value,Element} instruction extracts element Idx. 796assert((Opcode == Instruction::ExtractElement ||
797 Opcode == Instruction::ExtractValue) &&
798"Expected extractelement or extractvalue instruction.");
799if (Opcode == Instruction::ExtractElement) {
800auto *CI = dyn_cast<ConstantInt>(E->
getOperand(1));
803return CI->getZExtValue();
805auto *EI = cast<ExtractValueInst>(E);
806if (EI->getNumIndices() != 1)
808return *EI->idx_begin();
813/// Main data required for vectorization of instructions. 814classInstructionsState {
815 /// The main/alternate instruction. MainOp is also VL0. 821assert(valid() &&
"InstructionsState is invalid.");
826assert(valid() &&
"InstructionsState is invalid.");
830 /// The main/alternate opcodes for the list of instructions. 831unsignedgetOpcode()
const{
return getMainOp()->getOpcode(); }
833unsigned getAltOpcode()
const{
return getAltOp()->getOpcode(); }
835 /// Some of the instructions in the list have alternate opcodes. 836bool isAltShuffle()
const{
return getMainOp() != getAltOp(); }
839unsigned CheckedOpcode =
I->getOpcode();
840returngetOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
843 /// Checks if the current state is valid, i.e. has non-null MainOp 844bool valid()
const{
return MainOp && AltOp; }
846explicitoperatorbool()
const{
return valid(); }
848 InstructionsState() =
delete;
850 : MainOp(MainOp), AltOp(AltOp) {}
851static InstructionsState invalid() {
return {
nullptr,
nullptr}; }
854}
// end anonymous namespace 856/// \returns true if \p Opcode is allowed as part of the main/alternate 857/// instruction for SLP vectorization. 859/// Example of unsupported opcode is SDIV that can potentially cause UB if the 860/// "shuffled out" lane would result in division by zero. 871/// Checks if the provided operands of 2 cmp instructions are compatible, i.e. 872/// compatible instructions or constants, or just some other regular values. 877 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
878 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
879 BaseOp0 == Op0 || BaseOp1 == Op1 ||
884/// \returns true if a compare instruction \p CI has similar "look" and 885/// same predicate as \p BaseCI, "as is" or with its operands and predicate 886/// swapped, false otherwise. 890"Assessing comparisons of different types?");
900return (BasePred == Pred &&
902 (BasePred == SwappedPred &&
906/// \returns analysis of the Instructions in \p VL described in 907/// InstructionsState, the Opcode that we suppose the whole list 908/// could be vectorized even if its structure is diverse. 911// Make sure these are all Instructions. 912if (!
all_of(VL, IsaPred<Instruction, PoisonValue>))
913return InstructionsState::invalid();
915auto *It =
find_if(VL, IsaPred<Instruction>);
917return InstructionsState::invalid();
920unsigned InstCnt = std::count_if(It, VL.
end(), IsaPred<Instruction>);
921if ((VL.
size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.
size() / 2) ||
922 (VL.
size() == 2 && InstCnt < 2))
923return InstructionsState::invalid();
925bool IsCastOp = isa<CastInst>(MainOp);
926bool IsBinOp = isa<BinaryOperator>(MainOp);
927bool IsCmpOp = isa<CmpInst>(MainOp);
932unsigned AltOpcode = Opcode;
934bool SwappedPredsCompatible = IsCmpOp && [&]() {
936 UniquePreds.
insert(BasePred);
937 UniqueNonSwappedPreds.
insert(BasePred);
939auto *
I = dyn_cast<CmpInst>(V);
945 UniqueNonSwappedPreds.
insert(CurrentPred);
946if (!UniquePreds.
contains(CurrentPred) &&
947 !UniquePreds.
contains(SwappedCurrentPred))
948 UniquePreds.
insert(CurrentPred);
950// Total number of predicates > 2, but if consider swapped predicates 951// compatible only 2, consider swappable predicates as compatible opcodes, 953return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
955// Check for one alternate opcode from another BinaryOperator. 956// TODO - generalize to support all operators (types, calls etc.). 959if (
auto *
CallBase = dyn_cast<CallInst>(MainOp)) {
963return InstructionsState::invalid();
965bool AnyPoison = InstCnt != VL.
size();
966// Check MainOp too to be sure that it matches the requirements for the 969auto *
I = dyn_cast<Instruction>(V);
973// Cannot combine poison and divisions. 974// TODO: do some smart analysis of the CallInsts to exclude divide-like 975// intrinsics/functions only. 976if (AnyPoison && (
I->isIntDivRem() ||
I->isFPDivRem() || isa<CallInst>(
I)))
977return InstructionsState::invalid();
978unsigned InstOpcode =
I->getOpcode();
979if (IsBinOp && isa<BinaryOperator>(
I)) {
980if (InstOpcode == Opcode || InstOpcode == AltOpcode)
984 AltOpcode = InstOpcode;
988 }
elseif (IsCastOp && isa<CastInst>(
I)) {
991Value *Op1 =
I->getOperand(0);
994if (InstOpcode == Opcode || InstOpcode == AltOpcode)
996if (Opcode == AltOpcode) {
999"Cast isn't safe for alternation, logic needs to be updated!");
1000 AltOpcode = InstOpcode;
1005 }
elseif (
auto *Inst = dyn_cast<CmpInst>(
I); Inst && IsCmpOp) {
1006auto *BaseInst = cast<CmpInst>(MainOp);
1007Type *Ty0 = BaseInst->getOperand(0)->getType();
1008Type *Ty1 = Inst->getOperand(0)->getType();
1010assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
1011assert(InstOpcode == AltOpcode &&
1012"Alternate instructions are only supported by BinaryOperator " 1014// Check for compatible operands. If the corresponding operands are not 1015// compatible - need to perform alternate vectorization. 1020if ((VL.
size() == 2 || SwappedPredsCompatible) &&
1021 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1026auto *AltInst = cast<CmpInst>(AltOp);
1027if (MainOp != AltOp) {
1030 }
elseif (BasePred != CurrentPred) {
1033"CmpInst isn't safe for alternation, logic needs to be updated!");
1038if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1039 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1042 }
elseif (InstOpcode == Opcode) {
1043assert(InstOpcode == AltOpcode &&
1044"Alternate instructions are only supported by BinaryOperator and " 1046if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
1047if (Gep->getNumOperands() != 2 ||
1049return InstructionsState::invalid();
1050 }
elseif (
auto *EI = dyn_cast<ExtractElementInst>(
I)) {
1052return InstructionsState::invalid();
1053 }
elseif (
auto *LI = dyn_cast<LoadInst>(
I)) {
1054auto *BaseLI = cast<LoadInst>(MainOp);
1055if (!LI->isSimple() || !BaseLI->isSimple())
1056return InstructionsState::invalid();
1057 }
elseif (
auto *Call = dyn_cast<CallInst>(
I)) {
1058auto *
CallBase = cast<CallInst>(MainOp);
1060return InstructionsState::invalid();
1061if (Call->hasOperandBundles() &&
1063 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1064 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1067return InstructionsState::invalid();
1070return InstructionsState::invalid();
1073if (Mappings.
size() != BaseMappings.
size() ||
1074 Mappings.
front().ISA != BaseMappings.
front().ISA ||
1075 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
1076 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
1077 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
1078 Mappings.
front().Shape.Parameters !=
1079 BaseMappings.
front().Shape.Parameters)
1080return InstructionsState::invalid();
1085return InstructionsState::invalid();
1088return InstructionsState(MainOp, AltOp);
1091/// \returns true if all of the values in \p VL have the same type or false 1098/// \returns True if in-tree use also needs extract. This refers to 1099/// possible scalar operand in vectorized instruction. 1107case Instruction::Load: {
1108LoadInst *LI = cast<LoadInst>(UserInst);
1111case Instruction::Store: {
1112StoreInst *SI = cast<StoreInst>(UserInst);
1113return (SI->getPointerOperand() == Scalar);
1115case Instruction::Call: {
1116CallInst *CI = cast<CallInst>(UserInst);
1119 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1120 Arg.value().get() == Scalar;
1128/// \returns the AA location that is being access by the instruction. 1132if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
1137/// \returns True if the instruction is not a volatile or atomic load/store. 1139if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
1140return LI->isSimple();
1142return SI->isSimple();
1144return !
MI->isVolatile();
1148/// Shuffles \p Mask in accordance with the given \p SubMask. 1149/// \param ExtendingManyInputs Supports reshuffling of the mask with not only 1150/// one but two input vectors. 1152bool ExtendingManyInputs =
false) {
1156 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
1157// Check if input scalars were extended to match the size of other node. 1159"SubMask with many inputs support must be larger than the mask.");
1161 Mask.append(SubMask.
begin(), SubMask.
end());
1165int TermValue = std::min(Mask.size(), SubMask.
size());
1166for (
intI = 0, E = SubMask.
size();
I < E; ++
I) {
1168 (!ExtendingManyInputs &&
1169 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
1171 NewMask[
I] = Mask[SubMask[
I]];
1176/// Order may have elements assigned special value (size) which is out of 1177/// bounds. Such indices only appear on places which correspond to undef values 1178/// (see canReuseExtract for details) and used in order to avoid undef values 1179/// have effect on operands ordering. 1180/// The first loop below simply finds all unused indices and then the next loop 1181/// nest assigns these indices for undef values positions. 1182/// As an example below Order has two undef positions and they have assigned 1183/// values 3 and 7 respectively: 1184/// before: 6 9 5 4 9 2 1 0 1185/// after: 6 3 5 4 7 2 1 0 1187constunsigned Sz = Order.
size();
1190for (
unsignedI = 0;
I < Sz; ++
I) {
1192 UnusedIndices.
reset(Order[
I]);
1194 MaskedIndices.
set(
I);
1196if (MaskedIndices.
none())
1199"Non-synced masked/available indices.");
1203assert(
Idx >= 0 &&
"Indices must be synced.");
1210/// \returns a bitset for selecting opcodes. false for Opcode0 and true for 1214Type *ScalarTy = VL[0]->getType();
1217for (
unsigned Lane : seq<unsigned>(VL.
size())) {
1218if (isa<PoisonValue>(VL[Lane]))
1220if (cast<Instruction>(VL[Lane])->
getOpcode() == Opcode1)
1221 OpcodeMask.
set(Lane * ScalarTyNumElements,
1222 Lane * ScalarTyNumElements + ScalarTyNumElements);
1232constunsigned E = Indices.
size();
1234for (
unsignedI = 0;
I < E; ++
I)
1235 Mask[Indices[
I]] =
I;
1238/// Reorders the list of scalars in accordance with the given \p Mask. 1241assert(!Mask.empty() &&
"Expected non-empty mask.");
1245for (
unsignedI = 0, E = Prev.
size();
I < E; ++
I)
1247 Scalars[Mask[
I]] = Prev[
I];
1250/// Checks if the provided value does not require scheduling. It does not 1251/// require scheduling if this is not an instruction or it is an instruction 1252/// that does not read/write memory and all operands are either not instructions 1253/// or phi nodes or instructions from different blocks. 1255auto *
I = dyn_cast<Instruction>(V);
1260 auto *IO = dyn_cast<Instruction>(V);
1263 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1267/// Checks if the provided value does not require scheduling. It does not 1268/// require scheduling if this is not an instruction or it is an instruction 1269/// that does not read/write memory and all users are phi nodes or instructions 1270/// from the different blocks. 1272auto *
I = dyn_cast<Instruction>(V);
1275// Limits the number of uses to save compile time. 1276return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1278 auto *IU = dyn_cast<Instruction>(U);
1281 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1285/// Checks if the specified value does not require scheduling. It does not 1286/// require scheduling if all operands and all users do not need to be scheduled 1287/// in the current basic block. 1292/// Checks if the specified array of instructions does not require scheduling. 1293/// It is so if all either instructions have operands that do not require 1294/// scheduling or their users do not require scheduling since they are phis or 1295/// in other basic blocks. 1297return !VL.
empty() &&
1301/// Returns true if widened type of \p Ty elements with size \p Sz represents 1302/// full vector type, i.e. adding extra element results in extra parts upon type 1313return NumParts > 0 && NumParts < Sz &&
has_single_bit(Sz / NumParts) &&
1317/// Returns number of parts, the type \p VecTy will be split at the codegen 1318/// phase. If the type is going to be scalarized or does not uses whole 1319/// registers, returns 1. 1322constunsigned Limit = std::numeric_limits<unsigned>::max()) {
1324if (NumParts == 0 || NumParts >= Limit)
1327if (NumParts >= Sz || Sz % NumParts != 0 ||
1333namespaceslpvectorizer {
1335/// Bottom Up SLP Vectorizer. 1343 /// Tracks the state we can represent the loads in the given sequence. 1362 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1363 AC(AC), DB(DB),
DL(
DL), ORE(ORE),
1366// Use the vector register size specified by the target unless overridden 1367// by a command-line option. 1368// TODO: It would be better to limit the vectorization factor based on 1369// data type rather than just register size. For example, x86 AVX has 1370// 256-bit registers, but it does not support integer operations 1371// at that width (that requires AVX2). 1385 /// Vectorize the tree that starts with the elements in \p VL. 1386 /// Returns the vectorized root. 1389 /// Vectorize the tree but with the list of externally used values \p 1390 /// ExternallyUsedValues. Values in this MapVector can be replaced but the 1391 /// generated extractvalue instructions. 1396 /// \returns the cost incurred by unwanted spills and fills, caused by 1397 /// holding live values over call sites. 1400 /// \returns the vectorization cost of the subtree that starts at \p VL. 1401 /// A negative number means that this is profitable. 1404 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for 1405 /// the purpose of scheduling and extraction in the \p UserIgnoreLst. 1409 /// Construct a vectorizable tree that starts at \p Roots. 1412 /// Returns whether the root node has in-tree uses. 1414return !VectorizableTree.
empty() &&
1415 !VectorizableTree.
front()->UserTreeIndices.empty();
1418 /// Return the scalars of the root node. 1420assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1421return VectorizableTree.
front()->Scalars;
1424 /// Returns the type/is-signed info for the root node in the graph without 1427const TreeEntry &Root = *VectorizableTree.
front().get();
1428if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1429 !Root.Scalars.front()->getType()->isIntegerTy())
1431auto It = MinBWs.
find(&Root);
1432if (It != MinBWs.
end())
1436if (Root.getOpcode() == Instruction::ZExt ||
1437 Root.getOpcode() == Instruction::SExt)
1438return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1439 Root.getOpcode() == Instruction::SExt);
1443 /// Checks if the root graph node can be emitted with narrower bitwidth at 1444 /// codegen and returns it signedness, if so. 1446return MinBWs.
at(VectorizableTree.
front().get()).second;
1449 /// Returns reduction type after minbitdth analysis. 1451if (ReductionBitWidth == 0 ||
1452 !VectorizableTree.
front()->Scalars.front()->getType()->isIntegerTy() ||
1453 ReductionBitWidth >=
1454DL->getTypeSizeInBits(
1455 VectorizableTree.
front()->Scalars.front()->getType()))
1457 VectorizableTree.
front()->Scalars.front()->getType(),
1458 VectorizableTree.
front()->getVectorFactor());
1461 VectorizableTree.
front()->Scalars.front()->getContext(),
1463 VectorizableTree.
front()->getVectorFactor());
1466 /// Builds external uses of the vectorized scalars, i.e. the list of 1467 /// vectorized scalars to be extracted, their lanes and their scalar users. \p 1468 /// ExternallyUsedValues contains additional list of external uses to handle 1469 /// vectorization of reductions. 1473 /// Transforms graph nodes to target specific representations, if profitable. 1476 /// Clear the internal data structures that are created by 'buildTree'. 1478 VectorizableTree.
clear();
1479 ScalarToTreeEntry.clear();
1480 MultiNodeScalars.clear();
1482 NonScheduledFirst.
clear();
1483 EntryToLastInstruction.clear();
1484 LoadEntriesToVectorize.
clear();
1485 IsGraphTransformMode =
false;
1486 GatheredLoadsEntriesFirst.reset();
1487 ExternalUses.
clear();
1488 ExternalUsesAsOriginalScalar.clear();
1489for (
auto &Iter : BlocksSchedules) {
1490 BlockScheduling *BS = Iter.second.get();
1494 ReductionBitWidth = 0;
1496 CastMaxMinBWSizes.reset();
1497 ExtraBitWidthNodes.
clear();
1498 InstrElementSize.clear();
1499 UserIgnoreList =
nullptr;
1500 PostponedGathers.
clear();
1501 ValueToGatherNodes.
clear();
1506 /// Returns the base graph size, before any transformations. 1509 /// Perform LICM and CSE on the newly generated gather sequences. 1512 /// Does this non-empty order represent an identity order? Identity 1513 /// should be represented as an empty order, so this is used to 1514 /// decide if we can canonicalize a computed order. Undef elements 1515 /// (represented as size) are ignored. 1517assert(!Order.
empty() &&
"expected non-empty order");
1518constunsigned Sz = Order.
size();
1520returnP.value() ==
P.index() ||
P.value() == Sz;
1524 /// Checks if the specified gather tree entry \p TE can be represented as a 1525 /// shuffled vector entry + (possibly) permutation with other gathers. It 1526 /// implements the checks only for possibly ordered scalars (Loads, 1527 /// ExtractElement, ExtractValue), which can be part of the graph. 1530 /// Sort loads into increasing pointers offsets to allow greater clustering. 1533 /// Gets reordering data for the given tree entry. If the entry is vectorized 1534 /// - just return ReorderIndices, otherwise check if the scalars can be 1535 /// reordered and return the most optimal order. 1536 /// \return std::nullopt if ordering is not important, empty order, if 1537 /// identity order is important, or the actual order. 1538 /// \param TopToBottom If true, include the order of vectorized stores and 1539 /// insertelement nodes, otherwise skip them. 1543 /// Reorders the current graph to the most profitable order starting from the 1544 /// root node to the leaf nodes. The best order is chosen only from the nodes 1545 /// of the same size (vectorization factor). Smaller nodes are considered 1546 /// parts of subgraph with smaller VF and they are reordered independently. We 1547 /// can make it because we still need to extend smaller nodes to the wider VF 1548 /// and we can merge reordering shuffles with the widening shuffles. 1551 /// Reorders the current graph to the most profitable order starting from 1552 /// leaves to the root. It allows to rotate small subgraphs and reduce the 1553 /// number of reshuffles if the leaf nodes use the same order. In this case we 1554 /// can merge the orders and just shuffle user node instead of shuffling its 1555 /// operands. Plus, even the leaf nodes have different orders, it allows to 1556 /// sink reordering in the graph closer to the root node and merge it later 1557 /// during analysis. 1560 /// \return The vector element size in bits to use when vectorizing the 1561 /// expression tree ending at \p V. If V is a store, the size is the width of 1562 /// the stored value. Otherwise, the size is the width of the largest loaded 1563 /// value reaching V. This method is used by the vectorizer to calculate 1564 /// vectorization factors. 1567 /// Compute the minimum type sizes required to represent the entries in a 1568 /// vectorizable tree. 1571// \returns maximum vector register size as set by TTI or overridden by cl::opt. 1573return MaxVecRegSize;
1576// \returns minimum vector register size as set by cl::opt. 1578return MinVecRegSize;
1588return MaxVF ? MaxVF : UINT_MAX;
1591 /// Check if homogeneous aggregate is isomorphic to some VectorType. 1592 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like 1593 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, 1594 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. 1596 /// \returns number of elements in vector if isomorphism exists, 0 otherwise. 1599 /// \returns True if the VectorizableTree is both tiny and not fully 1600 /// vectorizable. We do not vectorize such trees. 1603 /// Checks if the graph and all its subgraphs cannot be better vectorized. 1604 /// It may happen, if all gather nodes are loads and they cannot be 1605 /// "clusterized". In this case even subgraphs cannot be vectorized more 1606 /// effectively than the base graph. 1609 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values 1610 /// can be load combined in the backend. Load combining may not be allowed in 1611 /// the IR optimizer, so we do not want to alter the pattern. For example, 1612 /// partially transforming a scalar bswap() pattern into vector code is 1613 /// effectively impossible for the backend to undo. 1614 /// TODO: If load combining is allowed in the IR optimizer, this analysis 1615 /// may not be necessary. 1618 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values 1619 /// can be load combined in the backend. Load combining may not be allowed in 1620 /// the IR optimizer, so we do not want to alter the pattern. For example, 1621 /// partially transforming a scalar bswap() pattern into vector code is 1622 /// effectively impossible for the backend to undo. 1623 /// TODO: If load combining is allowed in the IR optimizer, this analysis 1624 /// may not be necessary. 1627 /// Checks if the given array of loads can be represented as a vectorized, 1628 /// scatter or just simple gather. 1629 /// \param VL list of loads. 1630 /// \param VL0 main load value. 1631 /// \param Order returned order of load instructions. 1632 /// \param PointerOps returned list of pointer operands. 1633 /// \param BestVF return best vector factor, if recursive check found better 1634 /// vectorization sequences rather than masked gather. 1635 /// \param TryRecursiveCheck used to check if long masked gather can be 1636 /// represented as a serie of loads/insert subvector, if profitable. 1640unsigned *BestVF =
nullptr,
1641bool TryRecursiveCheck =
true)
const;
1643 /// Registers non-vectorizable sequence of loads 1648 /// Checks if the given loads sequence is known as not vectorizable 1649template <
typename T>
1656 /// This structure holds any data we need about the edges being traversed 1657 /// during buildTree_rec(). We keep track of: 1658 /// (i) the user TreeEntry index, and 1659 /// (ii) the index of the edge. 1664 /// The user TreeEntry. 1666 /// The operand index of the use. 1677 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1686 /// A helper class used for scoring candidates for two consecutive lanes. 1692int NumLanes;
// Total number of lanes (aka vectorization factor). 1693int MaxLevel;
// The maximum recursion depth for accumulating score. 1699 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1700 MaxLevel(MaxLevel) {}
1702// The hard-coded scores listed here are not very important, though it shall 1703// be higher for better matches to improve the resulting cost. When 1704// computing the scores of matching one sub-tree with another, we are 1705// basically counting the number of values that are matching. So even if all 1706// scores are set to 1, we would still get a decent matching result. 1707// However, sometimes we have to break ties. For example we may have to 1708// choose between matching loads vs matching opcodes. This is what these 1709// scores are helping us with: they provide the order of preference. Also, 1710// this is important if the scalar is externally used or used in another 1711// tree entry node in the different lane. 1713 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). 1715 /// The same load multiple times. This should have a better score than 1716 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it 1717 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for 1718 /// a vector load and 1.0 for a broadcast. 1720 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). 1722 /// A load candidate for masked gather. 1724 /// ExtractElementInst from same vector and consecutive indexes. 1726 /// ExtractElementInst from same vector and reversed indices. 1730 /// Instructions with the same opcode. 1732 /// Instructions with alt opcodes (e.g, add + sub). 1734 /// Identical instructions (a.k.a. splat or broadcast). 1736 /// Matching with an undef is preferable to failing. 1738 /// Score for failing to find a decent match. 1740 /// Score if all users are vectorized. 1743 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. 1744 /// \p U1 and \p U2 are the users of \p V1 and \p V2. 1745 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p 1754if (isa<LoadInst>(V1)) {
1755// Retruns true if the users of V1 and V2 won't need to be extracted. 1756auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1757// Bail out if we have too many uses to save compilation time. 1761auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1763 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1766return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1768// A broadcast of a load can be cheaper on some targets. 1769if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1771 ((
int)V1->getNumUses() == NumLanes ||
1772 AllUsersAreInternal(V1, V2)))
1778auto CheckSameEntryOrFail = [&]() {
1779if (
const TreeEntry *TE1 = R.getTreeEntry(V1);
1780 TE1 && TE1 == R.getTreeEntry(V2))
1785auto *LI1 = dyn_cast<LoadInst>(V1);
1786auto *LI2 = dyn_cast<LoadInst>(V2);
1788if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1790return CheckSameEntryOrFail();
1793 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1794 LI2->getPointerOperand(),
DL, SE,
/*StrictCheck=*/true);
1795if (!Dist || *Dist == 0) {
1798 R.TTI->isLegalMaskedGather(
1801return CheckSameEntryOrFail();
1803// The distance is too large - still may be profitable to use masked 1805if (std::abs(*Dist) > NumLanes / 2)
1807// This still will detect consecutive loads, but we might have "holes" 1808// in some cases. It is ok for non-power-2 vectorization and may produce 1809// better results. It should not affect current vectorization. 1814auto *C1 = dyn_cast<Constant>(V1);
1815auto *C2 = dyn_cast<Constant>(V2);
1819// Extracts from consecutive indexes of the same vector better score as 1820// the extracts could be optimized away. 1824// Undefs are always profitable for extractelements. 1825// Compiler can easily combine poison and extractelement <non-poison> or 1826// undef and extractelement <poison>. But combining undef + 1827// extractelement <non-poison-but-may-produce-poison> requires some 1829if (isa<UndefValue>(V2))
1838// Undefs are always profitable for extractelements. 1846int Dist = Idx2 - Idx1;
1847// The distance is too large - still may be profitable to use 1849if (std::abs(Dist) == 0)
1851if (std::abs(Dist) > NumLanes / 2)
1858return CheckSameEntryOrFail();
1861auto *I1 = dyn_cast<Instruction>(V1);
1862auto *I2 = dyn_cast<Instruction>(V2);
1864if (I1->getParent() != I2->getParent())
1865return CheckSameEntryOrFail();
1870// Note: Only consider instructions with <= 2 operands to avoid 1871// complexity explosion. 1873 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.
empty() ||
1874 !S.isAltShuffle()) &&
1876return isa<PoisonValue>(V) ||
1877 cast<Instruction>(V)->getNumOperands() ==
1878 S.getMainOp()->getNumOperands();
1884if (I1 && isa<PoisonValue>(V2))
1887if (isa<UndefValue>(V2))
1890return CheckSameEntryOrFail();
1893 /// Go through the operands of \p LHS and \p RHS recursively until 1894 /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are 1895 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands 1896 /// of \p U1 and \p U2), except at the beginning of the recursion where 1897 /// these are set to nullptr. 1901 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] 1906 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at 1907 /// each level recursively, accumulating the score. It starts from matching 1908 /// the additions at level 0, then moves on to the loads (level 1). The 1909 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and 1910 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while 1911 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail. 1912 /// Please note that the order of the operands does not matter, as we 1913 /// evaluate the score of all profitable combinations of operands. In 1914 /// other words the score of G1 and G4 is the same as G1 and G2. This 1915 /// heuristic is based on ideas described in: 1916 /// Look-ahead SLP: Auto-vectorization in the presence of commutative 1917 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, 1923// Get the shallow score of V1 and V2. 1924int ShallowScoreAtThisLevel =
1927// If reached MaxLevel, 1928// or if V1 and V2 are not instructions, 1929// or if they are SPLAT, 1930// or if they are not consecutive, 1931// or if profitable to vectorize loads or extractelements, early return 1933auto *I1 = dyn_cast<Instruction>(
LHS);
1934auto *I2 = dyn_cast<Instruction>(
RHS);
1935if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1937 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1938 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1939 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1940 ShallowScoreAtThisLevel))
1941return ShallowScoreAtThisLevel;
1942assert(I1 && I2 &&
"Should have early exited.");
1944// Contains the I2 operand indexes that got matched with I1 operands. 1947// Recursion towards the operands of I1 and I2. We are trying all possible 1948// operand pairs, and keeping track of the best score. 1949for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1950 OpIdx1 != NumOperands1; ++OpIdx1) {
1951// Try to pair op1I with the best operand of I2. 1953unsigned MaxOpIdx2 = 0;
1954bool FoundBest =
false;
1955// If I2 is commutative try all combinations. 1958 ? I2->getNumOperands()
1959 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1960assert(FromIdx <= ToIdx &&
"Bad index");
1961for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1962// Skip operands already paired with OpIdx1. 1963if (Op2Used.
count(OpIdx2))
1965// Recursively calculate the cost at each level 1968 I1, I2, CurrLevel + 1, {});
1969// Look for the best score. 1971 TmpScore > MaxTmpScore) {
1972 MaxTmpScore = TmpScore;
1978// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. 1979 Op2Used.
insert(MaxOpIdx2);
1980 ShallowScoreAtThisLevel += MaxTmpScore;
1983return ShallowScoreAtThisLevel;
1986 /// A helper data structure to hold the operands of a vector of instructions. 1987 /// This supports a fixed vector length for all operand vectors. 1989 /// For each operand we need (i) the value, and (ii) the opcode that it 1990 /// would be attached to if the expression was in a left-linearized form. 1991 /// This is required to avoid illegal operand reordering. 1996 /// Op1 Op2 Linearized + Op2 1997 /// \ / ----------> |/ 2000 /// Op1 - Op2 (0 + Op1) - Op2 2003 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. 2005 /// Another way to think of this is to track all the operations across the 2006 /// path from the operand all the way to the root of the tree and to 2007 /// calculate the operation that corresponds to this path. For example, the 2008 /// path from Op2 to the root crosses the RHS of the '-', therefore the 2009 /// corresponding operation is a '-' (which matches the one in the 2010 /// linearized tree, as shown above). 2012 /// For lack of a better term, we refer to this operation as Accumulated 2013 /// Path Operation (APO). 2015 OperandData() =
default;
2016 OperandData(
Value *V,
bool APO,
bool IsUsed)
2017 : V(V), APO(APO), IsUsed(IsUsed) {}
2018 /// The operand value. 2020 /// TreeEntries only allow a single opcode, or an alternate sequence of 2021 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the 2022 /// APO. It is set to 'true' if 'V' is attached to an inverse operation 2023 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise 2026 /// Helper data for the reordering function. 2030 /// During operand reordering, we are trying to select the operand at lane 2031 /// that matches best with the operand at the neighboring lane. Our 2032 /// selection is based on the type of value we are looking for. For example, 2033 /// if the neighboring lane has a load, we need to look for a load that is 2034 /// accessing a consecutive address. These strategies are summarized in the 2035 /// 'ReorderingMode' enumerator. 2036enum class ReorderingMode {
2037 Load,
///< Matching loads to consecutive memory addresses 2038 Opcode,
///< Matching instructions based on opcode (same or alternate) 2040Splat,
///< Matching the same instruction multiple times (broadcast) 2041Failed,
///< We failed to create a vectorizable group 2046 /// A vector of operand vectors. 2048 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0] 2049 /// is not IntrinsicInst, ArgSize is User::getNumOperands. 2050unsigned ArgSize = 0;
2056constLoop *L =
nullptr;
2058 /// \returns the operand data at \p OpIdx and \p Lane. 2059 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
2060return OpsVec[OpIdx][Lane];
2063 /// \returns the operand data at \p OpIdx and \p Lane. Const version. 2064const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const{
2065return OpsVec[OpIdx][Lane];
2068 /// Clears the used flag for all entries. 2070for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
2071 OpIdx != NumOperands; ++OpIdx)
2072for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2074 OpsVec[OpIdx][Lane].IsUsed =
false;
2077 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. 2078void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
2079std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2082 /// \param Lane lane of the operands under analysis. 2083 /// \param OpIdx operand index in \p Lane lane we're looking the best 2085 /// \param Idx operand index of the current candidate value. 2086 /// \returns The additional score due to possible broadcasting of the 2087 /// elements in the lane. It is more profitable to have power-of-2 unique 2088 /// elements in the lane, it will be vectorized with higher probability 2089 /// after removing duplicates. Currently the SLP vectorizer supports only 2090 /// vectorization of the power-of-2 number of unique scalars. 2091int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsignedIdx,
2093Value *IdxLaneV = getData(
Idx, Lane).V;
2094if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2095 isa<ExtractElementInst>(IdxLaneV))
2098for (
unsigned Ln : seq<unsigned>(getNumLanes())) {
2101Value *OpIdxLnV = getData(OpIdx, Ln).V;
2102if (!isa<Instruction>(OpIdxLnV))
2106unsigned UniquesCount = Uniques.
size();
2107auto IdxIt = Uniques.
find(IdxLaneV);
2108unsigned UniquesCntWithIdxLaneV =
2109 IdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2110Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2111auto OpIdxIt = Uniques.
find(OpIdxLaneV);
2112unsigned UniquesCntWithOpIdxLaneV =
2113 OpIdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2114if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2116return std::min(
bit_ceil(UniquesCntWithOpIdxLaneV) -
2117 UniquesCntWithOpIdxLaneV,
2118 UniquesCntWithOpIdxLaneV -
2120 ((IdxIt != Uniques.
end() && UsedLanes.
test(IdxIt->second))
2121 ? UniquesCntWithIdxLaneV -
bit_floor(UniquesCntWithIdxLaneV)
2122 :
bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2125 /// \param Lane lane of the operands under analysis. 2126 /// \param OpIdx operand index in \p Lane lane we're looking the best 2128 /// \param Idx operand index of the current candidate value. 2129 /// \returns The additional score for the scalar which users are all 2131int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsignedIdx)
const{
2132Value *IdxLaneV = getData(
Idx, Lane).V;
2133Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2134// Do not care about number of uses for vector-like instructions 2135// (extractelement/extractvalue with constant indices), they are extracts 2136// themselves and already externally used. Vectorization of such 2137// instructions does not add extra extractelement instruction, just may 2142auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2143if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2145return R.areAllUsersVectorized(IdxLaneI)
2150 /// Score scaling factor for fully compatible instructions but with 2151 /// different number of external uses. Allows better selection of the 2152 /// instructions with less external uses. 2153staticconstint ScoreScaleFactor = 10;
2155 /// \Returns the look-ahead score, which tells us how much the sub-trees 2156 /// rooted at \p LHS and \p RHS match, the more they match the higher the 2157 /// score. This helps break ties in an informed way when we cannot decide on 2158 /// the order of the operands by just considering the immediate 2161int Lane,
unsigned OpIdx,
unsignedIdx,
2165// Keep track of the instruction stack as we recurse into the operands 2166// during the look-ahead score exploration. 2169/*CurrLevel=*/1, MainAltOps);
2171int SplatScore = getSplatScore(Lane, OpIdx,
Idx, UsedLanes);
2172if (Score <= -SplatScore) {
2176 Score += SplatScore;
2177// Scale score to see the difference between different operands 2178// and similar operands but all vectorized/not all vectorized 2179// uses. It does not affect actual selection of the best 2180// compatible operand in general, just allows to select the 2181// operand with all vectorized uses. 2182 Score *= ScoreScaleFactor;
2183 Score += getExternalUseScore(Lane, OpIdx,
Idx);
  /// Best defined scores per lanes between the passes. Used to choose the
  /// best operand (with the highest score) between the passes.
  /// The key - {Operand Index, Lane}.
  /// The value - the best score between the passes for the lane and the
  /// operand.
  SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8> BestScoresPerLanes;

  // Search all operands in Ops[*][Lane] for the one that matches best
  // Ops[OpIdx][LastLane] and return its operand index.
  // If no good match can be found, return std::nullopt.
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                 ArrayRef<ReorderingMode> ReorderingModes,
                 ArrayRef<Value *> MainAltOps,
                 const SmallBitVector &UsedLanes) {
    unsigned NumOperands = getNumOperands();

    // The operand of the previous lane at OpIdx.
    Value *OpLastLane = getData(OpIdx, LastLane).V;

    // Our strategy mode for OpIdx.
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;

    // The linearized opcode of the operand at OpIdx, Lane.
    bool OpIdxAPO = getData(OpIdx, Lane).APO;

    // The best operand index and its score.
    // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
    // are using the score to differentiate between the two.
    struct BestOpData {
      std::optional<unsigned> Idx;
      unsigned Score = 0;
    } BestOp;
    BestOp.Score =
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
            .first->second;

    // Track if the operand must be marked as used. If the operand is set to
    // Score 1 explicitly (because of non power-of-2 unique scalars, we may
    // want to reestimate the operands again on the following iterations).
    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;
    // Iterate through all unused operands and look for the best.
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      // Get the operand at Idx and Lane.
      OperandData &OpData = getData(Idx, Lane);
      Value *Op = OpData.V;
      bool OpAPO = OpData.APO;

      // Skip already selected operands.
      if (OpData.IsUsed)
        continue;

      // Skip if we are trying to move the operand to a position with a
      // different opcode in the linearized tree form. This would break the
      // semantics.
      if (OpAPO != OpIdxAPO)
        continue;

      // Look for an operand that matches the current mode.
      switch (RMode) {
      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed, UsedLanes);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
             Idx == OpIdx)) {
          BestOp.Idx = Idx;
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
        }
        break;
      }
      case ReorderingMode::Constant:
        if (isa<Constant>(Op) ||
            (!BestOp.Score && L && L->isLoopInvariant(Op))) {
          BestOp.Idx = Idx;
          if (isa<Constant>(Op)) {
            BestOp.Score = LookAheadHeuristics::ScoreConstants;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                LookAheadHeuristics::ScoreConstants;
          }
          if (isa<UndefValue>(Op) || !isa<Constant>(Op))
            IsUsed = false;
        }
        break;
      case ReorderingMode::Splat:
        if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
          IsUsed = Op == OpLastLane;
          if (Op == OpLastLane) {
            BestOp.Score = LookAheadHeuristics::ScoreSplat;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                LookAheadHeuristics::ScoreSplat;
          }
          BestOp.Idx = Idx;
        }
        break;
      case ReorderingMode::Failed:
        llvm_unreachable("Not expected Failed reordering mode.");
      }
    }

    if (BestOp.Idx) {
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
      return BestOp.Idx;
    }
    // If we could not find a good match return std::nullopt.
    return std::nullopt;
  }

  /// Helper for reorderOperandVecs.
  /// \returns the lane that we should start reordering from. This is the one
  /// which has the least number of operands that can freely move about or
  /// less profitable because it already has the most optimal set of operands.
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    // std::pair<unsigned, unsigned> is used to implement a simple voting
    // algorithm and choose the lane with the least number of operands that
    // can freely move about or less profitable because it already has the
    // most optimal set of operands. The first unsigned is a counter for
    // voting, the second unsigned is the counter of lanes with instructions
    // with same/alternate opcodes and same parent basic block.
    SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
    // Try to be closer to the original results, if we have multiple lanes
    // with same cost. If 2 lanes have the same cost, use the one with the
    // highest index.
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      // Compare the number of operands that can move and choose the one with
      // the least number.
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap.clear();
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        // Select the most optimal lane in terms of number of operands that
        // should be moved around.
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto [It, Inserted] =
            HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
        if (!Inserted)
          ++It->second.first;
      }
    }
    // Select the lane with the minimum counter.
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
    for (const auto &Data : reverse(HashMap)) {
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
      }
    }
    return BestLane;
  }
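  // Illustrative example (hypothetical lanes): if lane 1 holds the only
  // non-commutative operation (say a subtraction), it reports the fewest
  // freely movable operands, wins the vote above, and reordering starts
  // there, so the flexible lanes are rearranged around the rigid one.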
  /// Data structure that helps to reorder operands.
  struct OperandsOrderData {
    /// The best number of operands with the same APOs, which can be
    /// reordered.
    unsigned NumOfAPOs = UINT_MAX;
    /// Number of operands with the same/alternate instruction opcode and
    /// parent.
    unsigned NumOpsWithSameOpcodeParent = 0;
    /// Hash for the actual operands ordering.
    /// Used to count operands, actually their position id and opcode
    /// value. It is used in the voting mechanism to find the lane with the
    /// least number of operands that can freely move about or less profitable
    /// because it already has the most optimal set of operands. Can be
    /// replaced with SmallVector<unsigned> instead but hash code is faster
    /// and requires less memory.
    unsigned Hash = 0;
  };

  /// \returns the maximum number of operands that are allowed to be reordered
  /// for \p Lane and the number of compatible instructions (with the same
  /// parent/opcode). This is used as a heuristic for selecting the first lane
  /// to start operand reordering.
  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    // Operands with the same APO can be reordered. We therefore need to count
    // how many of them we have for each APO, like this: Cnt[APO] = x.
    // Since we only have two APOs, namely true and false, we can avoid using
    // a map. Instead we can simply count the number of operands that
    // correspond to one of them (in this case the 'true' APO), and calculate
    // the other by subtracting it from the total number of operands.
    // Operands with the same instruction opcode and parent are more
    // profitable since we don't need to move them in many cases, with a high
    // probability such lane already can be vectorized effectively.
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
    Instruction *OpcodeI = nullptr;
    BasicBlock *Parent = nullptr;
    unsigned Hash = 0;
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      const OperandData &OpData = getData(OpIdx, Lane);
      if (OpData.APO)
        ++CntTrue;
      // Use Boyer-Moore majority voting for finding the majority opcode and
      // the number of times it occurs.
      if (auto *I = dyn_cast<Instruction>(OpData.V)) {
        if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
            OpcodeI = I;
            Parent = I->getParent();
          } else {
            --NumOpsWithSameOpcodeParent;
          }
        } else {
          ++NumOpsWithSameOpcodeParent;
        }
      }
      Hash = hash_combine(
          Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
      AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
    }
    if (AllUndefs)
      return {};
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
    Data.Hash = Hash;
    return Data;
  }
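  // Boyer-Moore voting in this context, illustrated on a hypothetical lane
  // holding operands {add, add, mul, add}: the counter goes 1, 2, 1, 2, so
  // 'add' survives as the majority candidate with a final count of 2 without
  // ever keeping a per-opcode histogram.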
  /// Go through the instructions in VL and append their operands.
  void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
    assert((empty() || VL.size() == getNumLanes()) &&
           "Expected same number of lanes");
    assert(S.valid() && "InstructionsState is invalid.");
    // IntrinsicInst::isCommutative returns true if swapping the first "two"
    // arguments to the intrinsic produces the same result.
    constexpr unsigned IntrinsicNumOperands = 2;
    Instruction *MainOp = S.getMainOp();
    unsigned NumOperands = MainOp->getNumOperands();
    ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
    OpsVec.resize(NumOperands);
    unsigned NumLanes = VL.size();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      OpsVec[OpIdx].resize(NumLanes);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
               "Expected instruction or poison value");
        // Our tree has just 3 nodes: the root and two operands.
        // It is therefore trivial to get the APO. We only need to check the
        // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
        // RHS operand. The LHS operand of both add and sub is never attached
        // to an inverse operation in the linearized form, therefore its APO
        // is false. The RHS is true only if VL[Lane] is an inverse operation.

        // Since operand reordering is performed on groups of commutative
        // operations or alternating sequences (e.g., +, -), we can safely
        // tell the inverse operations by checking commutativity.
        if (isa<PoisonValue>(VL[Lane])) {
          if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
            if (OpIdx == 0) {
              OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
              continue;
            }
          } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
            if (OpIdx == 0) {
              OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
              continue;
            }
          }
          OpsVec[OpIdx][Lane] = {
              PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
              false};
          continue;
        }
        bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                               APO, false};
      }
    }
  }
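  // APO example (hypothetical lanes): for VL = {a0 + b0, a1 - b1} the LHS
  // operands a0/a1 always get APO == false, while the RHS operands get
  // APO == false for the add and APO == true for the sub, so reordering never
  // moves b1 of the subtraction into an addition slot.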
  /// \returns the number of operands.
  unsigned getNumOperands() const { return ArgSize; }

  /// \returns the number of lanes.
  unsigned getNumLanes() const { return OpsVec[0].size(); }

  /// \returns the operand value at \p OpIdx and \p Lane.
  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;
  }

  /// \returns true if the data structure is empty.
  bool empty() const { return OpsVec.empty(); }

  /// Clears the data.
  void clear() { OpsVec.clear(); }
  /// \Returns true if there are enough operands identical to \p Op to fill
  /// the whole vector (it is mixed with constants or loop invariant values).
  /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
    assert(Op == getValue(OpIdx, Lane) &&
           "Op is expected to be getValue(OpIdx, Lane).");
    // Small number of loads - try load matching.
    if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
      return false;
    bool OpAPO = getData(OpIdx, Lane).APO;
    bool IsInvariant = L && L->isLoopInvariant(Op);
    unsigned Cnt = 0;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      if (Ln == Lane)
        continue;
      // This is set to true if we found a candidate for broadcast at Lane.
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
          continue;
        Value *OpILane = getValue(OpI, Lane);
        bool IsConstantOp = isa<Constant>(OpILane);
        // Consider the broadcast candidate if:
        // 1. Same value is found in one of the operands.
        if (Data.V == Op ||
            // 2. The operand in the given lane is not constant but there is a
            //    constant operand in another lane (which can be moved to the
            //    given lane). In this case we can represent it as a simple
            //    permutation of constant and broadcast.
            (!IsConstantOp &&
             ((Lns > 2 && isa<Constant>(Data.V)) ||
              // 2.1. If we have only 2 lanes, need to check that value in the
              //      next lane does not build same opcode sequence.
              (Lns == 2 && !getSameOpcode({Op, Data.V}, TLI) &&
               isa<Constant>(Data.V)))) ||
            // 3. The operand in the current lane is loop invariant (can be
            //    hoisted out) and another operand is also a loop invariant
            //    (though not a constant). In this case the whole vector can be
            //    hoisted out.
            // FIXME: need to teach the cost model about this case for better
            //        estimation.
            (IsInvariant && !isa<Constant>(Data.V) &&
             !getSameOpcode({Op, Data.V}, TLI) &&
             L->isLoopInvariant(Data.V))) {
          FoundCandidate = true;
          Data.IsUsed = Data.V == Op;
          if (Data.V == Op)
            ++Cnt;
          break;
        }
      }
      if (!FoundCandidate)
        return false;
    }
    return getNumLanes() == 2 || Cnt > 1;
  }
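  // For illustration (hypothetical operands): with lanes {x + 1, x + y,
  // x + 2}, the value x is found in every lane while the remaining slots are
  // constants or movable values, so broadcasting x plus a small permutation
  // of the leftover operands covers the whole vector.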
  /// Checks if there is at least single compatible operand in lanes other
  /// than \p Lane, compatible with the operand \p Op.
  bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
    assert(Op == getValue(OpIdx, Lane) &&
           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      if (Ln == Lane)
        continue;
      if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
            const OperandData &Data = getData(OpI, Ln);
            if (Data.APO != OpAPO || Data.IsUsed)
              return false;
            Value *OpILn = getValue(OpI, Ln);
            return (L && L->isLoopInvariant(OpILn)) ||
                   getSameOpcode({Op, OpILn}, TLI).valid();
          }))
        return true;
    }
    return false;
  }
  /// Initialize with all the operands of the instruction vector \p RootVL.
  VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S,
             const BoUpSLP &R)
      : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
        L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
    // Append all the operands of RootVL.
    appendOperandsOfVL(RootVL, S);
  }

  /// \Returns a value vector with the operands across all lanes for the
  /// operand at \p OpIdx.
  ValueList getVL(unsigned OpIdx) const {
    ValueList OpVL(OpsVec[OpIdx].size());
    assert(OpsVec[OpIdx].size() == getNumLanes() &&
           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;
    return OpVL;
  }
  // Performs operand reordering for 2 or more operands.
  // The original operands are in OrigOps[OpIdx][Lane].
  // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
  void reorder() {
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();
    // Each operand has its own mode. We are using this mode to help us select
    // the instructions for each lane, so that they match best with the ones
    // we have selected so far.
    SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

    // This is a greedy single-pass algorithm. We are going over each lane
    // once and deciding on the best order right away with no back-tracking.
    // However, in order to increase its effectiveness, we start with the lane
    // that has operands that can move the least. For example, given the
    // following lanes:
    //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
    //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
    //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
    //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
    // we will start at Lane 1, since the operands of the subtraction cannot
    // be reordered. Then we will visit the rest of the lanes in a circular
    // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.

    // Find the first lane that we will start our search from.
    unsigned FirstLane = getBestLaneToStartReordering();

    // Initialize the modes.
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      Value *OpLane0 = getValue(OpIdx, FirstLane);
      // Keep track if we have instructions with all the same opcode on one
      // side.
      if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
        // Check if OpLane0 should be broadcast.
        if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
            !canBeVectorized(OpILane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        else if (isa<LoadInst>(OpILane0))
          ReorderingModes[OpIdx] = ReorderingMode::Load;
        else
          ReorderingModes[OpIdx] = ReorderingMode::Opcode;
      } else if (isa<Constant>(OpLane0)) {
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
      } else if (isa<Argument>(OpLane0)) {
        // Our best hope is a Splat. It may save some cost in some cases.
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
      } else {
        llvm_unreachable("Unexpected value kind.");
      }
    }

    // Check that we don't have same operands. No need to reorder if operands
    // are just perfect diamond or shuffled diamond match. Do not do it only
    // for possible broadcasts or non-power of 2 number of scalars (just for
    // now).
    auto &&SkipReordering = [this]() {
      SmallPtrSet<Value *, 4> UniqueValues;
      ArrayRef<OperandData> Op0 = OpsVec.front();
      for (const OperandData &Data : Op0)
        UniqueValues.insert(Data.V);
      for (ArrayRef<OperandData> Op :
           ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {
              return !UniqueValues.contains(Data.V);
            }))
          return false;
      }
      // TODO: Check if we can remove a check for non-power-2 number of
      // scalars after full support of non-power-2 vectorization.
      return UniqueValues.size() != 2 &&
             hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
                                      UniqueValues.size());
    };

    // If the initial strategy fails for any of the operand indexes, then we
    // perform reordering again in a second pass. This helps avoid assigning
    // high priority to the failed strategy, and should improve reordering for
    // the non-failed operand indexes.
    for (int Pass = 0; Pass != 2; ++Pass) {
      // Check if no need to reorder operands since they are perfect or
      // shuffled diamond match.
      // Need to do it to avoid extra external use cost counting for
      // shuffled matches, which may cause regressions.
      if (SkipReordering())
        break;
      // Skip the second pass if the first pass did not fail.
      bool StrategyFailed = false;
      // Mark all operand data as free to use.
      clearUsed();
      // We keep the original operand order for the FirstLane, so reorder the
      // rest of the lanes. We are visiting the nodes in a circular fashion,
      // using FirstLane as the center point and increasing the radius
      // distance.
      SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      SmallBitVector UsedLanes(NumLanes);
      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // Visit the lane on the right and then the lane on the left.
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
            continue;
          UsedLanes.set(Lane);
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                 "Out of bounds");
          // Look for a good match for each operand.
          for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
            // Search for the operand that matches SortedOps[OpIdx][Lane-1].
            std::optional<unsigned> BestIdx =
                getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                               MainAltOps[OpIdx], UsedLanes);
            // By not selecting a value, we allow the operands that follow to
            // select a better matching value. We will get a non-null value in
            // the next run of getBestOperand().
            if (BestIdx) {
              // Swap the current operand with the one returned by
              // getBestOperand().
              swap(OpIdx, *BestIdx, Lane);
            } else {
              // Enable the second pass.
              StrategyFailed = true;
            }
            // Try to get the alternate opcode and follow it during analysis.
            if (MainAltOps[OpIdx].size() != 2) {
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
                  getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
              if (OpS && OpS.isAltShuffle())
                MainAltOps[OpIdx].push_back(AltOp.V);
            }
          }
        }
      }
      // Skip second pass if the strategy did not fail.
      if (!StrategyFailed)
        break;
    }
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
    switch (RMode) {
    case ReorderingMode::Load:
      return "Load";
    case ReorderingMode::Opcode:
      return "Opcode";
    case ReorderingMode::Constant:
      return "Constant";
    case ReorderingMode::Splat:
      return "Splat";
    case ReorderingMode::Failed:
      return "Failed";
    }
    llvm_unreachable("Unimplemented Reordering Type");
  }

  LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
    const unsigned Indent = 2;
    unsigned Cnt = 0;
    for (const auto &OpDataVec : OpsVec) {
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        OS.indent(Indent) << "{";
        if (Value *V = OpData.V)
          OS << *V;
        else
          OS << "null";
        OS << ", APO:" << OpData.APO << "}\n";
      }
      OS << "\n";
    }
    return OS;
  }

  /// Debug print.
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
#endif
};
  /// Evaluate each pair in \p Candidates and return index into \p Candidates
  /// for a pair which has the highest score deemed to have the best chance to
  /// form the root of a profitable tree to vectorize. Return std::nullopt if
  /// no candidate scored above the LookAheadHeuristics::ScoreFail. \param
  /// Limit Lower limit of the cost, considered to be good enough score.
  std::optional<int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) const {
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
                                  RootLookAheadMaxDepth);
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
                                               Candidates[I].second,
                                               /*U1=*/nullptr, /*U2=*/nullptr,
                                               /*CurrLevel=*/1, {});
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
  }

  /// Checks if the instruction is marked for deletion.
  bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }

  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) {
    DeletedInstructions.insert(I);
  }
2839 /// Remove instructions from the parent function and clear the operands of \p 2840 /// DeadVals instructions, marking for deletion trivially dead operands. 2841template <
typename T>
2844for (
T *V : DeadVals) {
2845auto *
I = cast<Instruction>(V);
2846 DeletedInstructions.insert(
I);
2849for (
T *V : DeadVals) {
2850if (!V || !Processed.
insert(V).second)
2852auto *
I = cast<Instruction>(V);
2855if (
const TreeEntry *Entry = getTreeEntry(
I)) {
2856 Entries.push_back(Entry);
2857auto It = MultiNodeScalars.find(
I);
2858if (It != MultiNodeScalars.end())
2859 Entries.append(It->second.begin(), It->second.end());
2861for (
Use &U :
I->operands()) {
2862if (
auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2863 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2865 (Entries.empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
2866return Entry->VectorizedValue == OpI;
2870I->dropAllReferences();
2872for (
T *V : DeadVals) {
2873auto *
I = cast<Instruction>(V);
2879 cast<Instruction>(U.getUser()));
2881"trying to erase instruction with users.");
2882I->removeFromParent();
2885// Process the dead instruction list until empty. 2886while (!DeadInsts.
empty()) {
2889if (!VI || !VI->getParent())
2892"Live instruction found in dead worklist!");
2893assert(VI->use_empty() &&
"Instructions with uses are not dead.");
2895// Don't lose the debug info while deleting the instructions. 2898// Null out all of the instruction's operands to see if any operand 2899// becomes dead as we go. 2900for (
Use &OpU : VI->operands()) {
2901Value *OpV = OpU.get();
2909// If the operand is an instruction that became dead as we nulled out 2910// the operand, and if it is 'trivially' dead, delete it in a future 2912if (
auto *OpI = dyn_cast<Instruction>(OpV))
2913if (!DeletedInstructions.contains(OpI) &&
2918 VI->removeFromParent();
2919 DeletedInstructions.insert(VI);
2924 /// Checks if the instruction was already analyzed for being possible 2927return AnalyzedReductionsRoots.count(
I);
2929 /// Register given instruction as already analyzed for being possible 2932 AnalyzedReductionsRoots.insert(
I);
2934 /// Checks if the provided list of reduced values was checked already for 2939 /// Adds the list of reduced values to list of already checked values for the 2944 /// Clear the list of the analyzed reduction root instructions. 2946 AnalyzedReductionsRoots.clear();
2947 AnalyzedReductionVals.
clear();
2948 AnalyzedMinBWVals.
clear();
2950 /// Checks if the given value is gathered in one of the nodes. 2954 /// Checks if the given value is gathered in one of the nodes. 2958 /// Checks if the specified value was not schedule. 2960return NonScheduledFirst.
contains(V);
2963 /// Check if the value is vectorized in the tree. 2969 /// Determine if a node \p E in can be demoted to a smaller type with a 2970 /// truncation. We collect the entries that will be demoted in ToDemote. 2971 /// \param E Node for analysis 2972 /// \param ToDemote indices of the nodes to be demoted. 2973bool collectValuesToDemote(
2974const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
2977bool &IsProfitableToDemote,
bool IsTruncRoot)
const;
2979 /// Check if the operands on the edges \p Edges of the \p UserTE allows 2980 /// reordering (i.e. the operands can be reordered because they have only one 2981 /// user and reordarable). 2982 /// \param ReorderableGathers List of all gather nodes that require reordering 2983 /// (e.g., gather of extractlements or partially vectorizable loads). 2984 /// \param GatherOps List of gather operand nodes for \p UserTE that require 2985 /// reordering, subset of \p NonVectorized. 2987 canReorderOperands(TreeEntry *UserTE,
2992 /// Checks if the given \p TE is a gather node with clustered reused scalars 2993 /// and reorders it per given \p Mask. 2994void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2996 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, 2997 /// if any. If it is not vectorized (gather node), returns nullptr. 2998 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
3000 TreeEntry *TE =
nullptr;
3002 TE = getTreeEntry(V);
3003if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
3005auto It = MultiNodeScalars.find(V);
3006if (It != MultiNodeScalars.end()) {
3007for (TreeEntry *E : It->second) {
3008if (
is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
3016if (It != VL.
end()) {
3017assert(
TE->isSame(VL) &&
"Expected same scalars.");
3023 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, 3024 /// if any. If it is not vectorized (gather node), returns nullptr. 3025const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
3026unsigned OpIdx)
const{
3027returnconst_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
3028const_cast<TreeEntry *
>(UserTE), OpIdx);
3031 /// Checks if all users of \p I are the part of the vectorization tree. 3032bool areAllUsersVectorized(
3036 /// Return information about the vector formed for the specified index 3037 /// of a vector of (the same) instruction. 3040 /// \ returns the graph entry for the \p Idx operand of the \p E entry. 3041const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsignedIdx)
const;
3043 /// Gets the root instruction for the given node. If the node is a strided 3044 /// load/store node with the reverse order, the root instruction is the last 3046Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
3048 /// \returns Cast context for the given graph node. 3050 getCastContextHint(
const TreeEntry &TE)
const;
3052 /// \returns the cost of the vectorizable entry. 3057 /// This is the recursive part of buildTree. 3059const EdgeInfo &EI,
unsigned InterleaveFactor = 0);
3061 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can 3062 /// be vectorized to use the original vector (or aggregate "bitcast" to a 3063 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise 3064 /// returns false, setting \p CurrentOrder to either an empty vector or a 3065 /// non-identity permutation that allows to reuse extract instructions. 3066 /// \param ResizeAllowed indicates whether it is allowed to handle subvector 3070bool ResizeAllowed =
false)
const;
3072 /// Vectorize a single entry in the tree. 3073 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to 3074 /// avoid issues with def-use order. 3077 /// Returns vectorized operand node, that matches the order of the scalars 3078 /// operand number \p NodeIdx in entry \p E. 3079 TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
unsigned NodeIdx);
3080const TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
3081unsigned NodeIdx)
const{
3082returnconst_cast<BoUpSLP *
>(
this)->getMatchedVectorizedOperand(E, NodeIdx);
3085 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry 3087 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to 3088 /// avoid issues with def-use order. 3089Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
3091 /// Create a new vector from a list of scalar values. Produces a sequence 3092 /// which exploits values reused across lanes, and arranges the inserts 3093 /// for ease of later optimization. 3094template <
typename BVTy,
typename ResTy,
typename...
Args>
3095 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
3097 /// Create a new vector from a list of scalar values. Produces a sequence 3098 /// which exploits values reused across lanes, and arranges the inserts 3099 /// for ease of later optimization. 3100Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy,
3103 /// Returns the instruction in the bundle, which can be used as a base point 3104 /// for scheduling. Usually it is the last instruction in the bundle, except 3105 /// for the case when all operands are external (in this case, it is the first 3106 /// instruction in the list). 3107Instruction &getLastInstructionInBundle(
const TreeEntry *E);
3109 /// Tries to find extractelement instructions with constant indices from fixed 3110 /// vector type and gather such instructions into a bunch, which highly likely 3111 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt 3112 /// was successful, the matched scalars are replaced by poison values in \p VL 3113 /// for future analysis. 3114 std::optional<TargetTransformInfo::ShuffleKind>
3118 /// Tries to find extractelement instructions with constant indices from fixed 3119 /// vector type and gather such instructions into a bunch, which highly likely 3120 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt 3121 /// was successful, the matched scalars are replaced by poison values in \p VL 3122 /// for future analysis. 3126unsigned NumParts)
const;
3128 /// Checks if the gathered \p VL can be represented as a single register 3129 /// shuffle(s) of previous tree entries. 3130 /// \param TE Tree entry checked for permutation. 3131 /// \param VL List of scalars (a subset of the TE scalar), checked for 3132 /// permutations. Must form single-register vector. 3133 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also 3134 /// commands to build the mask using the original vector value, without 3135 /// relying on the potential reordering. 3136 /// \returns ShuffleKind, if gathered values can be represented as shuffles of 3137 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. 3138 std::optional<TargetTransformInfo::ShuffleKind>
3139 isGatherShuffledSingleRegisterEntry(
3144 /// Checks if the gathered \p VL can be represented as multi-register 3145 /// shuffle(s) of previous tree entries. 3146 /// \param TE Tree entry checked for permutation. 3147 /// \param VL List of scalars (a subset of the TE scalar), checked for 3149 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also 3150 /// commands to build the mask using the original vector value, without 3151 /// relying on the potential reordering. 3152 /// \returns per-register series of ShuffleKind, if gathered values can be 3153 /// represented as shuffles of previous tree entries. \p Mask is filled with 3154 /// the shuffle mask (also on per-register base). 3156 isGatherShuffledEntry(
3159unsigned NumParts,
bool ForOrder =
false);
3161 /// \returns the cost of gathering (inserting) the values in \p VL into a 3163 /// \param ForPoisonSrc true if initial vector is poison, false otherwise. 3165Type *ScalarTy)
const;
3167 /// Set the Builder insert point to one after the last instruction in 3169void setInsertPointAfterBundle(
const TreeEntry *E);
3171 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not 3172 /// specified, the starting vector value is poison. 3177 /// \returns whether the VectorizableTree is fully vectorizable and will 3178 /// be beneficial even the tree height is tiny. 3179bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
3181 /// Run through the list of all gathered loads in the graph and try to find 3182 /// vector loads/masked gathers instead of regular gathers. Later these loads 3183 /// are reshufled to build final gathered nodes. 3184void tryToVectorizeGatheredLoads(
3189 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the 3190 /// users of \p TE and collects the stores. It returns the map from the store 3191 /// pointers to the collected stores. 3193 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
3195 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the 3196 /// stores in \p StoresVec can form a vector instruction. If so it returns 3197 /// true and populates \p ReorderIndices with the shuffle indices of the 3198 /// stores when compared to the sorted vector. 3202 /// Iterates through the users of \p TE, looking for scalar stores that can be 3203 /// potentially vectorized in a future SLP-tree. If found, it keeps track of 3204 /// their order and builds an order index vector for each store bundle. It 3205 /// returns all these order vectors found. 3206 /// We run this after the tree has formed, otherwise we may come across user 3207 /// instructions that are not yet in the tree. 3209 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
3211 /// Tries to reorder the gathering node for better vectorization 3213void reorderGatherNode(TreeEntry &TE);
3217 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3219 /// \returns Common mask for reorder indices and reused scalars. 3227 /// \returns true if the scalars in VL are equal to this entry. 3234 [Scalars](
Value *V,
int Idx) {
3235 return (isa<UndefValue>(V) &&
3236 Idx == PoisonMaskElem) ||
3237 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3240if (!ReorderIndices.empty()) {
3241// TODO: implement matching if the nodes are just reordered, still can 3242// treat the vector as the same if the list of scalars matches VL 3243// directly, without reordering. 3247return IsSame(Scalars, Mask);
3248if (VL.
size() == ReuseShuffleIndices.size()) {
3250return IsSame(Scalars, Mask);
3254return IsSame(Scalars, ReuseShuffleIndices);
3257bool isOperandGatherNode(
const EdgeInfo &UserEI)
const{
3258returnisGather() && !UserTreeIndices.empty() &&
3259 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3260 UserTreeIndices.front().UserTE == UserEI.UserTE;
3263 /// \returns true if current entry has same operands as \p TE. 3264bool hasEqualOperands(
const TreeEntry &TE)
const{
3265if (
TE.getNumOperands() != getNumOperands())
3268for (
unsignedI = 0, E = getNumOperands();
I < E; ++
I) {
3269unsigned PrevCount =
Used.count();
3270for (
unsigned K = 0;
K < E; ++
K) {
3273if (getOperand(K) ==
TE.getOperand(
I)) {
3278// Check if we actually found the matching operand. 3279if (PrevCount ==
Used.count())
3285 /// \return Final vectorization factor for the node. Defined by the total 3286 /// number of vectorized scalars, including those, used several times in the 3287 /// entry and counted in the \a ReuseShuffleIndices, if any. 3288unsigned getVectorFactor()
 const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }
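    // For illustration: a node built from scalars {a, b} that is reused as
    // {a, b, a, b} carries ReuseShuffleIndices = {0, 1, 0, 1}, so
    // getVectorFactor() reports 4 even though Scalars.size() is 2.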
3297 /// A vector of scalars. 3300 /// The Scalars are vectorized into this value. It is initialized to Null. 3303 /// New vector phi instructions emitted for the vectorized phi nodes. 3306 /// Do we need to gather this sequence or vectorize it 3307 /// (either with vector instruction or with scatter/gather 3308 /// intrinsics for store/load)? 3310 Vectorize,
///< The node is regularly vectorized. 3311 ScatterVectorize,
///< Masked scatter/gather node. 3312 StridedVectorize,
///< Strided loads (and stores) 3313 NeedToGather,
///< Gather/buildvector node. 3314 CombinedVectorize,
///< Vectorized node, combined with its user into more 3315 ///< complex node like select/cmp to minmax, mul/add to 3316 ///< fma, etc. Must be used for the following nodes in 3317 ///< the pattern, not the very first one. 3321 /// List of combined opcodes supported by the vectorizer. 3322enum CombinedOpcode {
3324MinMax = Instruction::OtherOpsEnd + 1,
3326 CombinedOpcode CombinedOp = NotCombinedOp;
3328 /// Does this sequence require some shuffling? 3331 /// Does this entry require reordering? 3334 /// Points back to the VectorizableTree. 3336 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has 3337 /// to be a pointer and needs to be able to initialize the child iterator. 3338 /// Thus we need a reference back to the container to translate the indices 3340 VecTreeTy &Container;
3342 /// The TreeEntry index containing the user of this entry. We can actually 3343 /// have multiple users so the data structure is not truly a tree. 3346 /// The index of this treeEntry in VectorizableTree. 3349 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from 3350 /// other nodes as a series of insertvector instructions. 3354 /// The operands of each instruction in each lane Operands[op_index][lane]. 3355 /// Note: This helps avoid the replication of the code that performs the 3356 /// reordering of operands during buildTree_rec() and vectorizeTree(). 3359 /// MainOp and AltOp are recorded inside. S should be obtained from 3361 InstructionsState S = InstructionsState::invalid();
3363 /// Interleaving factor for interleaved loads Vectorize nodes. 3364unsigned InterleaveFactor = 0;
3367 /// Returns interleave factor for interleave nodes. 3368unsigned getInterleaveFactor()
const{
return InterleaveFactor; }
3369 /// Sets interleaving factor for the interleaving nodes. 3370void setInterleave(
unsigned Factor) { InterleaveFactor = Factor; }
3372 /// Set this bundle's \p OpIdx'th operand to \p OpVL. 3376assert(Operands[OpIdx].empty() &&
"Already resized?");
3378"Number of operands is greater than the number of scalars.");
3383 /// Set this bundle's operand from Scalars. 3384void setOperand(
constBoUpSLP &R,
bool RequireReorder =
false) {
3385 VLOperands Ops(Scalars, S, R);
3388for (
unsignedI : seq<unsigned>(S.getMainOp()->getNumOperands()))
3389 setOperand(
I, Ops.getVL(
I));
3392 /// Reorders operands of the node to the given mask \p Mask. 3398 /// \returns the \p OpIdx operand of this TreeEntry. 3404 /// \returns the \p OpIdx operand of this TreeEntry. 3410 /// \returns the number of operands. 3411unsigned getNumOperands()
const{
returnOperands.size(); }
3413 /// \return the single \p OpIdx operand. 3414Value *getSingleOperand(
unsigned OpIdx)
const{
3416assert(!Operands[OpIdx].empty() &&
"No operand available");
3420 /// Some of the instructions in the list have alternate opcodes. 3421bool isAltShuffle()
const{
return S.isAltShuffle(); }
3423bool isOpcodeOrAlt(
Instruction *
I)
const{
return S.isOpcodeOrAlt(
I); }
3425 /// Chooses the correct key for scheduling data. If \p Op has the same (or 3426 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is 3429auto *
I = dyn_cast<Instruction>(
Op);
3430if (
I && isOpcodeOrAlt(
I))
3432return S.getMainOp();
3435void setOperations(
const InstructionsState &S) {
3436assert(S &&
"InstructionsState is invalid.");
3440Instruction *getMainOp()
const{
return S.getMainOp(); }
3442Instruction *getAltOp()
const{
return S.getAltOp(); }
3444 /// The main/alternate opcodes for the list of instructions. 3445unsigned getOpcode()
const{
return S.
getOpcode(); }
3447unsigned getAltOpcode()
const{
return S.getAltOpcode(); }
3449bool hasState()
const{
return S.valid(); }
3451 /// When ReuseReorderShuffleIndices is empty it just returns position of \p 3452 /// V within vector of Scalars. Otherwise, try to remap on its reuse index. 3453int findLaneForValue(
Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }
const{
3487return IsNonPowerOf2;
3490 /// Return true if this is a node, which tries to vectorize number of 3491 /// elements, forming whole vectors. 3496assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
3497"Reshuffling not supported with non-power-of-2 vectors yet.");
3498return IsNonPowerOf2;
3501Value *getOrdered(
unsigned Idx)
const{
3502assert(
isGather() &&
"Must be used only for buildvectors/gathers.");
3503if (ReorderIndices.
empty())
3514for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
3515dbgs() <<
"Operand " << OpI <<
":\n";
3516for (
constValue *V : Operands[OpI])
3519dbgs() <<
"Scalars: \n";
3520for (
Value *V : Scalars)
3525if (InterleaveFactor > 0) {
3526dbgs() <<
"Vectorize with interleave factor " << InterleaveFactor
3529dbgs() <<
"Vectorize\n";
3532case ScatterVectorize:
3533dbgs() <<
"ScatterVectorize\n";
3535case StridedVectorize:
3536dbgs() <<
"StridedVectorize\n";
3539dbgs() <<
"NeedToGather\n";
3541case CombinedVectorize:
3542dbgs() <<
"CombinedVectorize\n";
3546dbgs() <<
"MainOp: " << *S.getMainOp() <<
"\n";
3547dbgs() <<
"AltOp: " << *S.getAltOp() <<
"\n";
3549dbgs() <<
"MainOp: NULL\n";
3550dbgs() <<
"AltOp: NULL\n";
3552dbgs() <<
"VectorizedValue: ";
3554dbgs() << *VectorizedValue <<
"\n";
3557dbgs() <<
"ReuseShuffleIndices: ";
3558if (ReuseShuffleIndices.
empty())
3561for (
int ReuseIdx : ReuseShuffleIndices)
3562dbgs() << ReuseIdx <<
", ";
3564dbgs() <<
"ReorderIndices: ";
3565for (
unsigned ReorderIdx : ReorderIndices)
3566dbgs() << ReorderIdx <<
", ";
3568dbgs() <<
"UserTreeIndices: ";
3569for (
constauto &EInfo : UserTreeIndices)
3570dbgs() << EInfo <<
", ";
3572if (!CombinedEntriesWithIndices.
empty()) {
3573dbgs() <<
"Combined entries: ";
3575dbgs() <<
"Entry index " <<
P.first <<
" with offset " <<
P.second;
3584void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
3587dbgs() <<
"SLP: " << Banner <<
":\n";
3589dbgs() <<
"SLP: Costs:\n";
3590dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
3591dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
3592dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
3593dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = " 3594 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
3598 /// Create a new VectorizableTree entry. 3600 std::optional<ScheduleData *> Bundle,
3601const InstructionsState &S,
3602const EdgeInfo &UserTreeIdx,
3605unsigned InterleaveFactor = 0) {
3606 TreeEntry::EntryState EntryState =
3607 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3608 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3609 ReuseShuffleIndices, ReorderIndices);
3610if (E && InterleaveFactor > 0)
3611 E->setInterleave(InterleaveFactor);
3616 TreeEntry::EntryState EntryState,
3617 std::optional<ScheduleData *> Bundle,
3618const InstructionsState &S,
3619const EdgeInfo &UserTreeIdx,
3622assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3623 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3624"Need to vectorize gather entry?");
3625// Gathered loads still gathered? Do not create entry, use the original one. 3626if (GatheredLoadsEntriesFirst.has_value() &&
3627 EntryState == TreeEntry::NeedToGather && S &&
3628 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3629 !UserTreeIdx.UserTE)
3631 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
3632 TreeEntry *
Last = VectorizableTree.
back().get();
3633Last->Idx = VectorizableTree.
size() - 1;
3634Last->State = EntryState;
3635// FIXME: Remove once support for ReuseShuffleIndices has been implemented 3636// for non-power-of-two vectors. 3639 ReuseShuffleIndices.empty()) &&
3640"Reshuffling scalars not yet supported for nodes with padding");
3641Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3642 ReuseShuffleIndices.end());
3643if (ReorderIndices.
empty()) {
3646Last->setOperations(S);
3648// Reorder scalars and build final mask. 3649Last->Scalars.assign(VL.
size(),
nullptr);
3652 if (Idx >= VL.size())
3653 return UndefValue::get(VL.front()->getType());
3658Last->setOperations(S);
3659Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
3661if (!
Last->isGather()) {
3662for (
Value *V : VL) {
3663if (isa<PoisonValue>(V))
3665const TreeEntry *
TE = getTreeEntry(V);
3667"Scalar already in tree!");
3670 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
3673 ScalarToTreeEntry[
V] =
Last;
3675// Update the scheduler bundle to point to this TreeEntry. 3676 ScheduleData *BundleMember = *Bundle;
3677assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3680"Bundle and VL out of sync");
3682for (
Value *V : VL) {
3687 BundleMember->TE =
Last;
3688 BundleMember = BundleMember->NextInBundle;
3691assert(!BundleMember &&
"Bundle and VL out of sync");
3693// Build a map for gathered scalars to the nodes where they are used. 3694bool AllConstsOrCasts =
true;
3697auto *
I = dyn_cast<CastInst>(V);
3698 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
3699if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3700 !UserTreeIdx.UserTE->isGather())
3703if (AllConstsOrCasts)
3705 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3706 MustGather.
insert(VL.begin(), VL.end());
3709if (UserTreeIdx.UserTE)
3710Last->UserTreeIndices.push_back(UserTreeIdx);
3714 /// -- Vectorization State -- 3715 /// Holds all of the tree entries. 3716 TreeEntry::VecTreeTy VectorizableTree;
3721for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3722 VectorizableTree[
Id]->dump();
  TreeEntry *getTreeEntry(Value *V) {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntry.lookup(V);
  }

  const TreeEntry *getTreeEntry(Value *V) const {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntry.lookup(V);
  }
3738 /// Check that the operand node of alternate node does not generate 3739 /// buildvector sequence. If it is, then probably not worth it to build 3740 /// alternate shuffle, if number of buildvector operands + alternate 3741 /// instruction > than the number of buildvector instructions. 3742 /// \param S the instructions state of the analyzed values. 3743 /// \param VL list of the instructions with alternate opcodes. 3744bool areAltOperandsProfitable(
const InstructionsState &S,
3747 /// Checks if the specified list of the instructions/values can be vectorized 3748 /// and fills required data before actual scheduling of the instructions. 3749 TreeEntry::EntryState
3751bool IsScatterVectorizeUserTE,
3755 /// Maps a specific scalar to its tree entry. 3758 /// List of scalars, used in several vectorize nodes, and the list of the 3762 /// Maps a value to the proposed vectorizable size. 3765 /// A list of scalars that we found that we need to keep as scalars. 3768 /// A set of first non-schedulable values. 3771 /// A map between the vectorized entries and the last instructions in the 3772 /// bundles. The bundles are built in use order, not in the def order of the 3773 /// instructions. So, we cannot rely directly on the last instruction in the 3774 /// bundle being the last instruction in the program order during 3775 /// vectorization process since the basic blocks are affected, need to 3776 /// pre-gather them before. 3779 /// List of gather nodes, depending on other gather/vector nodes, which should 3780 /// be emitted after the vector instruction emission process to correctly 3781 /// handle order of the vector instructions and shuffles. 3784usingValueToGatherNodesMap =
3786 ValueToGatherNodesMap ValueToGatherNodes;
3788 /// A list of the load entries (node indices), which can be vectorized using 3789 /// strided or masked gather approach, but attempted to be represented as 3790 /// contiguous loads. 3793 /// true if graph nodes transforming mode is on. 3794bool IsGraphTransformMode =
false;
3796 /// The index of the first gathered load entry in the VectorizeTree. 3797 std::optional<unsigned> GatheredLoadsEntriesFirst;
3799 /// This POD struct describes one external user in the vectorized tree. 3804// Which scalar in our function. 3807// Which user that uses the scalar. 3810// Which lane does the scalar belong to. 3815 /// Checks if two instructions may access the same memory. 3817 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it 3818 /// is invariant in the calling loop. 3823// First check if the result is already in the cache. 3824 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3825auto It = AliasCache.
find(Key);
3826if (It != AliasCache.
end())
3829// Store the result in the cache. 3831 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3835usingAliasCacheKey = std::pair<Instruction *, Instruction *>;
3837 /// Cache for alias results. 3838 /// TODO: consider moving this to the AliasAnalysis itself. 3841// Cache for pointerMayBeCaptured calls inside AA. This is preserved 3842// globally through SLP because we don't perform any action which 3843// invalidates capture results. 3846 /// Temporary store for deleted instructions. Instructions will be deleted 3847 /// eventually when the BoUpSLP is destructed. The deferral is required to 3848 /// ensure that there are no incorrect collisions in the AliasCache, which 3849 /// can happen if a new instruction is allocated at the same address as a 3850 /// previously deleted instruction. 3853 /// Set of the instruction, being analyzed already for reductions. 3856 /// Set of hashes for the list of reduction values already being analyzed. 3859 /// Values, already been analyzed for mininmal bitwidth and found to be 3863 /// A list of values that need to extracted out of the tree. 3864 /// This list holds pairs of (Internal Scalar : External User). External User 3865 /// can be nullptr, it means that this Internal Scalar will be used later, 3866 /// after vectorization. 3867 UserList ExternalUses;
3869 /// A list of GEPs which can be reaplced by scalar GEPs instead of 3870 /// extractelement instructions. 3873 /// Values used only by @llvm.assume calls. 3876 /// Holds all of the instructions that we gathered, shuffle instructions and 3877 /// extractelements. 3880 /// A list of blocks that we are going to CSE. 3883 /// List of hashes of vector of loads, which are known to be non vectorizable. 3886 /// Contains all scheduling relevant data for an instruction. 3887 /// A ScheduleData either represents a single instruction or a member of an 3888 /// instruction bundle (= a group of instructions which is combined into a 3889 /// vector instruction). 3891// The initial value for the dependency counters. It means that the 3892// dependencies are not calculated yet. 3893enum { InvalidDeps = -1 };
3895 ScheduleData() =
default;
3898 FirstInBundle =
this;
3899 NextInBundle =
nullptr;
3900 NextLoadStore =
nullptr;
3902 SchedulingRegionID = BlockSchedulingRegionID;
3903 clearDependencies();
3908 /// Verify basic self consistency properties 3910if (hasValidDependencies()) {
3911assert(UnscheduledDeps <= Dependencies &&
"invariant");
3913assert(UnscheduledDeps == Dependencies &&
"invariant");
3917assert(isSchedulingEntity() &&
3918"unexpected scheduled state");
3919for (
const ScheduleData *BundleMember =
this; BundleMember;
3920 BundleMember = BundleMember->NextInBundle) {
3921assert(BundleMember->hasValidDependencies() &&
3922 BundleMember->UnscheduledDeps == 0 &&
3923"unexpected scheduled state");
3924assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
3925"only bundle is marked scheduled");
3929assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3930"all bundle members must be in same basic block");
3933 /// Returns true if the dependency information has been calculated. 3934 /// Note that depenendency validity can vary between instructions within 3935 /// a single bundle. 3936bool hasValidDependencies()
const{
return Dependencies != InvalidDeps; }
3938 /// Returns true for single instructions and for bundle representatives 3939 /// (= the head of a bundle). 3940bool isSchedulingEntity()
const{
return FirstInBundle ==
this; }
3942 /// Returns true if it represents an instruction bundle and not only a 3943 /// single instruction. 3944bool isPartOfBundle()
const{
3945return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
3948 /// Returns true if it is ready for scheduling, i.e. it has no more 3949 /// unscheduled depending instructions/bundles. 3950bool isReady()
const{
3951assert(isSchedulingEntity() &&
3952"can't consider non-scheduling entity for ready list");
3953return unscheduledDepsInBundle() == 0 && !IsScheduled;
3956 /// Modifies the number of unscheduled dependencies for this instruction, 3957 /// and returns the number of remaining dependencies for the containing 3959int incrementUnscheduledDeps(
int Incr) {
3960assert(hasValidDependencies() &&
3961"increment of unscheduled deps would be meaningless");
3962 UnscheduledDeps += Incr;
3963return FirstInBundle->unscheduledDepsInBundle();
3966 /// Sets the number of unscheduled dependencies to the number of 3968void resetUnscheduledDeps() {
3969 UnscheduledDeps = Dependencies;
3972 /// Clears all dependency information. 3973void clearDependencies() {
3974 Dependencies = InvalidDeps;
3975 resetUnscheduledDeps();
3976 MemoryDependencies.clear();
3977 ControlDependencies.clear();
3980int unscheduledDepsInBundle()
const{
3981assert(isSchedulingEntity() &&
"only meaningful on the bundle");
3983for (
const ScheduleData *BundleMember =
this; BundleMember;
3984 BundleMember = BundleMember->NextInBundle) {
3985if (BundleMember->UnscheduledDeps == InvalidDeps)
3987 Sum += BundleMember->UnscheduledDeps;
3993if (!isSchedulingEntity()) {
3995 }
elseif (NextInBundle) {
3997 ScheduleData *SD = NextInBundle;
3999 os <<
';' << *SD->Inst;
4000 SD = SD->NextInBundle;
4012 /// The TreeEntry that this instruction corresponds to. 4013 TreeEntry *
TE =
nullptr;
4015 /// Points to the head in an instruction bundle (and always to this for 4016 /// single instructions). 4017 ScheduleData *FirstInBundle =
nullptr;
4019 /// Single linked list of all instructions in a bundle. Null if it is a 4020 /// single instruction. 4021 ScheduleData *NextInBundle =
nullptr;
4023 /// Single linked list of all memory instructions (e.g. load, store, call) 4024 /// in the block - until the end of the scheduling region. 4025 ScheduleData *NextLoadStore =
nullptr;
4027 /// The dependent memory instructions. 4028 /// This list is derived on demand in calculateDependencies(). 4031 /// List of instructions which this instruction could be control dependent 4032 /// on. Allowing such nodes to be scheduled below this one could introduce 4033 /// a runtime fault which didn't exist in the original program. 4034 /// ex: this is a load or udiv following a readonly call which inf loops 4037 /// This ScheduleData is in the current scheduling region if this matches 4038 /// the current SchedulingRegionID of BlockScheduling. 4039int SchedulingRegionID = 0;
4041 /// Used for getting a "good" final ordering of instructions. 4042int SchedulingPriority = 0;
4044 /// The number of dependencies. Constitutes of the number of users of the 4045 /// instruction plus the number of dependent memory instructions (if any). 4046 /// This value is calculated on demand. 4047 /// If InvalidDeps, the number of dependencies is not calculated yet. 4048int Dependencies = InvalidDeps;
4050 /// The number of dependencies minus the number of dependencies of scheduled 4051 /// instructions. As soon as this is zero, the instruction/bundle gets ready 4053 /// Note that this is negative as long as Dependencies is not calculated. 4054int UnscheduledDeps = InvalidDeps;
4056 /// True if this instruction is scheduled (or considered as scheduled in the 4058bool IsScheduled =
false;
4063const BoUpSLP::ScheduleData &SD) {
4072 /// Contains all scheduling data for a basic block. 4073 /// It does not schedules instructions, which are not memory read/write 4074 /// instructions and their operands are either constants, or arguments, or 4075 /// phis, or instructions from others blocks, or their users are phis or from 4076 /// the other blocks. The resulting vector instructions can be placed at the 4077 /// beginning of the basic block without scheduling (if operands does not need 4078 /// to be scheduled) or at the end of the block (if users are outside of the 4079 /// block). It allows to save some compile time and memory used by the 4081 /// ScheduleData is assigned for each instruction in between the boundaries of 4082 /// the tree entry, even for those, which are not part of the graph. It is 4083 /// required to correctly follow the dependencies between the instructions and 4084 /// their correct scheduling. The ScheduleData is not allocated for the 4085 /// instructions, which do not require scheduling, like phis, nodes with 4086 /// extractelements/insertelements only or nodes with instructions, with 4087 /// uses/operands outside of the block. 4088structBlockScheduling {
4090 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
4094 ScheduleStart =
nullptr;
4095 ScheduleEnd =
nullptr;
4096 FirstLoadStoreInRegion =
nullptr;
4097 LastLoadStoreInRegion =
nullptr;
4098 RegionHasStackSave =
false;
4100// Reduce the maximum schedule region size by the size of the 4101// previous scheduling run. 4102 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4105 ScheduleRegionSize = 0;
4107// Make a new scheduling region, i.e. all existing ScheduleData is not 4108// in the new region yet. 4109 ++SchedulingRegionID;
    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if can't possibly be in map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after
            // decrementing, so we can put the dependent instruction
            // into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be reordered.
          auto *In = BundleMember->Inst;
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, In));
          assert(Lane >= 0 && "Lane not set");

          // Since the vectorization tree is being built recursively, this
          // assertion ensures that the tree entry has all operands set before
          // reaching this code. Couple of exceptions known at the moment are
          // extracts where their second (immediate) operand is not added.
          // Since immediates do not affect scheduler behavior this is
          // considered okay.
          assert(In &&
                 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");

          for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }

        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }

        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }
    /// Verify basic self consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
        (void)SD;
      }
    }
    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            SD->isReady()) {
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP: initially in ready list: " << *SD << "\n");
        }
      }
    }
    /// Build a bundle from the ScheduleData nodes corresponding to the
    /// scalar instruction for each lane.
    ScheduleData *buildBundle(ArrayRef<Value *> VL);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleData *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

    /// Allocates schedule data chunk.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();
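    // Rough sketch of how these hooks cooperate during the dry run (see the
    // doc comments above; the exact call sites live in tryScheduleBundle()
    // and its callers): the region is grown with extendSchedulingRegion()
    // until it covers all bundle members, ScheduleData is prepared via
    // initScheduleData(), dependencies are computed with
    // calculateDependencies(), and on failure the bundle is taken apart
    // again with cancelScheduling().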
    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleData *> ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the
    /// common case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    /// Make sure that the initial SchedulingRegionID is greater than the
    /// initial SchedulingRegionID in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };
  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;

  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
  /// sorted SmallVectors of unsigned.
  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }
  };

  // Analysis and block reference.
  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).

  /// Instruction builder to construct the vectorized tree.
  IRBuilder<TargetFolder> Builder;

  /// A map of scalar integer values to the smallest bit width with which they
  /// can legally be represented. The values map to (width, signed) pairs,
  /// where "width" indicates the minimum bit width and "signed" is True if the
  /// value must be signed-extended, rather than zero-extended, back to its
  /// original width.
  DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;

  /// Final size of the reduced vector, if the current graph represents the
  /// input for the reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// Canonical graph size before the transformations.
  unsigned BaseGraphSize = 1;

  /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
  /// type sizes, used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  /// Indices of the vectorized nodes, which are supposed to be the roots of
  /// the new bitwidth analysis attempt, like trunc, IToFP or ICmp.
  DenseSet<unsigned> ExtraBitWidthNodes;
};
} // end namespace slpvectorizer

template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;

  /// NodeRef has to be a pointer per the GraphWriter.
  using NodeRef = TreeEntry *;

  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Add the VectorizableTree to the index iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return I->UserTE; }
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    using ItTy = ContainerTy::iterator;
    ItTy It;

  public:
    nodes_iterator(const ItTy &It2) : It(It2) {}
    NodeRef operator*() { return It->get(); }
    nodes_iterator operator++() {
      ++It;
      return *this;
    }
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }
};
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
    return "";
  }
};

} // end namespace llvm
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instruction back to erase them from parent and
      // memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}

/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node. The
/// procedure transforms this mask in accordance with the given \p Mask.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
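// Illustrative example (values chosen for exposition only): with
// Reuses = {a, b, c, d} and Mask = {2, 0, 3, 1}, the element previously stored
// at position I is republished at position Mask[I], giving
// Reuses = {b, d, a, c}.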
/// Reorders the given \p Order according to the given \p Mask. \p Order is
/// the original order of the scalars. The procedure transforms the provided
/// order in accordance with the given \p Mask. If the resulting \p Order is
/// just an identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
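// Illustrative example (values for exposition only): with BottomOrder == true,
// Order = {1, 0, 3, 2} and Mask = {2, 3, 0, 1}, each new element is taken from
// PrevOrder[Mask[I]], so the result is Order = {3, 2, 1, 0}. Had the
// combination produced the identity {0, 1, 2, 3}, the order would have been
// cleared, since an identity order carries no extra information.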
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  // Try to find subvector extract/insert patterns and reorder only such
  // patterns.
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  int NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= NumScalars)
    NumParts = 1;
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph, will reuse the previously vectorized
    // node. Cost is 0.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as much elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}

/// Calculates minimal alignment as a common alignment.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}

/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  assert(!Order.empty() &&
         "Order is empty. Please check it before using isReverseOrder.");
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}

/// Checks if the provided list of pointers \p Pointers represents the strided
/// pointers for type ElemTy. If they are not, std::nullopt is returned.
/// Otherwise, if \p Inst is not specified, a just-initialized optional value
/// is returned to show that the pointers represent strided pointers. If \p
/// Inst is specified, the runtime stride is materialized before the given \p
/// Inst.
/// \returns std::nullopt if the pointers are not pointers with the runtime
/// stride, nullptr or the actual stride value, otherwise.
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (i.e. with lowest and
  // highest addresses).
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest;
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / element_size * (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check if all distances are
  // unique multiples of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  if (!Inst)
    return nullptr;
  SCEVExpander Expander(SE, DL, "strided-load-vec");
  return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
}
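// Illustrative example (exposition only): for pointers p, p + s, p + 2*s and
// p + 3*s with a stride s that is only known at runtime, SCEV exposes s as the
// common multiplier of all pointer differences, the element offsets 0..3 are
// unique multiples of it, and the group can later be emitted as a single
// strided load with the stride value materialized right before \p Inst.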
4946static std::pair<InstructionCost, InstructionCost>
4951/// Returns the cost of the shuffle instructions with the given \p Kind, vector 4952/// type \p Tp and optional \p Mask. Adds SLP-specifc cost estimation for insert 4953/// subvector pattern. 4962int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4965 Mask, NumSrcElts, NumSubElts,
Index)) {
4966if (
Index + NumSubElts > NumSrcElts &&
4967Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
4976/// Correctly creates insert_subvector, checking that the index is multiple of 4977/// the subvectors length. Otherwise, generates shuffle using \p Generator or 4978/// using default shuffle. 4983if (
Index % SubVecVF == 0) {
4987// Create shuffle, insertvector requires that index is multiple of 4988// the subvector length. 4991 std::iota(
Mask.begin(),
Mask.end(), 0);
4992for (
unsignedI : seq<unsigned>(SubVecVF))
4995 Vec = Generator(Vec, V, Mask);
4997// 1. Resize V to the size of Vec. 4999 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
5007/// Correctly creates extract_subvector, checking that the index is multiple of 5008/// the subvectors length. Otherwise, generates shuffle using \p Generator or 5009/// using default shuffle. 5011unsigned SubVecVF,
unsignedIndex) {
5012if (
Index % SubVecVF == 0) {
5017// Create shuffle, extract_subvector requires that index is multiple of 5018// the subvector length. 5020 std::iota(Mask.begin(), Mask.end(),
Index);
5028unsigned *BestVF,
bool TryRecursiveCheck)
const{
5029// Check that a vectorized load would load the same memory as a scalar 5030// load. For example, we don't want to vectorize loads that are smaller 5031// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 5032// treats loading/storing it as an i8 struct. If we vectorize loads/stores 5033// from such a struct, we read/write packed bits disagreeing with the 5034// unvectorized version. 5041if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
5044// Make sure all loads in the bundle are simple - we can't vectorize 5045// atomic or volatile loads. 5047constunsigned Sz = VL.
size();
5049auto *POIter = PointerOps.
begin();
5050for (
Value *V : VL) {
5051auto *L = dyn_cast<LoadInst>(V);
5052if (!L || !L->isSimple())
5054 *POIter = L->getPointerOperand();
5059// Check the order of pointer operands or that all pointers are the same. 5063Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5084 Ptr0 = PointerOps.
front();
5085 PtrN = PointerOps.
back();
5087 Ptr0 = PointerOps[Order.
front()];
5088 PtrN = PointerOps[Order.
back()];
5090 std::optional<int> Diff =
5092// Check that the sorted loads are consecutive. 5093if (
static_cast<unsigned>(*Diff) == Sz - 1)
5098// Simple check if not a strided access - clear order. 5099bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5100// Try to generate strided load node if: 5101// 1. Target with strided load support is detected. 5102// 2. The number of loads is greater than MinProfitableStridedLoads, 5103// or the potential stride <= MaxProfitableLoadStride and the 5104// potential stride is power-of-2 (to avoid perf regressions for the very 5105// small number of loads) and max distance > number of loads, or potential 5107// 3. The loads are ordered, or number of unordered loads <= 5108// MaxProfitableUnorderedLoads, or loads are in reversed order. 5109// (this check is to avoid extra costs for very expensive shuffles). 5110// 4. Any pointer operand is an instruction with the users outside of the 5111// current graph (for masked gathers extra extractelement instructions 5112// might be required). 5113auto IsAnyPointerUsedOutGraph =
5114 IsPossibleStrided &&
any_of(PointerOps, [&](
Value *V) {
5115return isa<Instruction>(V) &&
any_of(V->users(), [&](
User *U) {
5116 return !getTreeEntry(U) && !MustGather.contains(U);
5119constunsigned AbsoluteDiff = std::abs(*Diff);
5120if (IsPossibleStrided &&
5121 (IsAnyPointerUsedOutGraph ||
5122 (AbsoluteDiff > Sz &&
5126 *Diff == -(
static_cast<int>(Sz) - 1))) {
5127int Stride = *Diff /
static_cast<int>(Sz - 1);
5128if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
5133// Iterate through all pointers and check if all distances are 5134// unique multiple of Dist. 5142// If the strides are not the same or repeated, we can't 5144if (((Dist / Stride) * Stride) != Dist ||
5145 !Dists.
insert(Dist).second)
5148if (Dists.
size() == Sz)
5154// Correctly identify compare the cost of loads + shuffles rather than 5155// strided/masked gather loads. Returns true if vectorized + shuffles 5156// representation is better than just gather. 5157auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment,
5159bool ProfitableGatherPointers) {
5162// Compare masked gather cost and loads + insert subvector costs. 5164auto [ScalarGEPCost, VectorGEPCost] =
5166 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
5167// Estimate the cost of masked gather GEP. If not a splat, roughly 5168// estimate as a buildvector, otherwise estimate as splat. 5172 VecTy->getNumElements());
5174 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.
size() - 1 ||
5180 PtrVecTy, DemandedElts,
/*Insert=*/true,
/*Extract=*/false,
CostKind);
5185/*Insert=*/true,
/*Extract=*/false,
CostKind) +
5187// The cost of scalar loads. 5195// The cost of masked gather. 5199/*VariableMask=*/false, CommonAlignment,
CostKind) +
5200 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5205// The list of loads is small or perform partial check already - directly 5206// compare masked gather cost and gather cost. 5207constexprunsigned ListLimit = 4;
5208if (!TryRecursiveCheck || VL.
size() < ListLimit)
5211// FIXME: The following code has not been updated for non-power-of-2 5212// vectors (and not whole registers). The splitting logic here does not 5213// cover the original vector if the vector factor is not a power of two. 5217unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
5220// Iterate through possible vectorization factors and check if vectorized + 5221// shuffles is better than just gather. 5227for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End; Cnt += VF) {
5233/*TryRecursiveCheck=*/false);
5234// Check that the sorted loads are consecutive. 5240 DemandedElts.
setBits(Cnt, Cnt + VF);
5243// If need the reorder - consider as high-cost masked gather for now. 5251// All loads gathered - try smaller VF. 5253// Can be vectorized later as a serie of loads/insertelements. 5255if (!DemandedElts.
isZero()) {
5260for (
unsignedIdx : seq<unsigned>(VL.
size()))
5261if (DemandedElts[
Idx])
5268auto *LI0 = cast<LoadInst>(VL[
I * VF]);
5273 LI0->getPointerOperand(),
5274 Instruction::GetElementPtr,
CostKind, ScalarTy,
5278if (
static_cast<unsigned>(
5279count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5280 PointerOps.
size() - 1 ||
5287/*Insert=*/true,
/*Extract=*/false,
CostKind);
5292/*Insert=*/true,
/*Extract=*/false,
CostKind) +
5300 LI0->getPointerAddressSpace(),
CostKind,
5306 LI0->getPointerOperand(),
5307/*VariableMask=*/false,
5313 LI0->getPointerOperand(),
5314/*VariableMask=*/false,
5319// Gathers are already calculated - ignore. 5323for (
intIdx : seq<int>(0, VL.
size()))
5330// If masked gather cost is higher - better to vectorize, so 5331// consider it as a gather node. It will be better estimated 5333if (MaskedGatherCost >= VecLdCost &&
5342// TODO: need to improve analysis of the pointers, if not all of them are 5343// GEPs or have > 2 operands, we end up with a gather node, which just 5344// increases the cost. 5346bool ProfitableGatherPointers =
5347 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
5348return L->isLoopInvariant(V);
5350if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
5351auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
5353 (
GEP &&
GEP->getNumOperands() == 2 &&
5354 isa<Constant, Instruction>(
GEP->getOperand(1)));
5356// Check if potential masked gather can be represented as series 5357// of loads + insertsubvectors. 5358// If masked gather cost is higher - better to vectorize, so 5359// consider it as a gather node. It will be better estimated 5361if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5362 ProfitableGatherPointers))
5375"Expected list of pointer operands.");
5376// Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each 5377// Ptr into, sort and return the sorted indices with values next to one 5385 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5387 SortedIndices.
clear();
5389auto Key = std::make_pair(BBs[Cnt + 1],
5393 std::optional<int> Diff = getPointersDiff(
5394 ElemTy, std::get<0>(Base.front()), ElemTy,
5396/*StrictCheck=*/true);
5400 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5405// If we haven't found enough to usefully cluster, return early. 5406if (Bases.
size() > VL.
size() / 2 - 1)
5409// Not found already - add a new Base 5410 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5417if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5418 Bases.
front().second.size() == VL.
size()))
5421// For each of the bases sort the pointers by Offset and check if any of the 5422// base become consecutively allocated. 5423auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5432 FirstPointers.
insert(P1);
5433 SecondPointers.
insert(P2);
5439"Unable to find matching root.");
5442for (
auto &
Base : Bases) {
5443for (
auto &Vec :
Base.second) {
5444if (Vec.size() > 1) {
5445stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5446const std::tuple<Value *, int, unsigned> &
Y) {
5447return std::get<1>(
X) < std::get<1>(
Y);
5449int InitialOffset = std::get<1>(Vec[0]);
5450bool AnyConsecutive =
5452return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5454// Fill SortedIndices array only if it looks worth-while to sort the 5461 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5465for (
auto &
T : Bases)
5466for (
constauto &Vec :
T.second)
5467for (
constauto &
P : Vec)
5471"Expected SortedIndices to be the size of VL");
5475std::optional<BoUpSLP::OrdersType>
5477assert(TE.isGather() &&
"Expected gather node only.");
5478Type *ScalarTy = TE.Scalars[0]->getType();
5481 Ptrs.
reserve(TE.Scalars.size());
5483 BBs.
reserve(TE.Scalars.size());
5484for (
Value *V : TE.Scalars) {
5485auto *L = dyn_cast<LoadInst>(V);
5486if (!L || !L->isSimple())
5493if (!LoadEntriesToVectorize.
contains(TE.Idx) &&
5495return std::move(Order);
5499/// Check if two insertelement instructions are from the same buildvector. 5503// Instructions must be from the same basic blocks. 5506// Checks if 2 insertelements are from the same buildvector. 5507if (VU->
getType() != V->getType())
5509// Multiple used inserts are separate nodes. 5516if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5518// Go through the vector operand of insertelement instructions trying to find 5519// either VU as the original vector for IE2 or V as the original vector for 5522 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
5523bool IsReusedIdx =
false;
5525if (IE2 == VU && !IE1)
5527if (IE1 == V && !IE2)
5528return V->hasOneUse();
5529if (IE1 && IE1 != V) {
5531 IsReusedIdx |= ReusedIdx.
test(Idx1);
5532 ReusedIdx.
set(Idx1);
5533if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
5536 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5538if (IE2 && IE2 != VU) {
5540 IsReusedIdx |= ReusedIdx.
test(Idx2);
5541 ReusedIdx.
set(Idx2);
5542if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5545 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5547 }
while (!IsReusedIdx && (IE1 || IE2));
5551std::optional<BoUpSLP::OrdersType>
5553// No need to reorder if need to shuffle reuses, still need to shuffle the 5555if (!TE.ReuseShuffleIndices.empty()) {
5556// FIXME: Support ReuseShuffleIndices for non-power-of-two vectors. 5557assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI) &&
5558"Reshuffling scalars not yet supported for nodes with padding");
5562// Check if reuse shuffle indices can be improved by reordering. 5563// For this, check that reuse mask is "clustered", i.e. each scalar values 5564// is used once in each submask of size <number_of_scalars>. 5565// Example: 4 scalar values. 5566// ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered. 5567// 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because 5568// element 3 is used twice in the second submask. 5569unsigned Sz = TE.Scalars.size();
5571if (std::optional<OrdersType> CurrentOrder =
5577OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5578unsigned Sz = TE.Scalars.size();
5579for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
5582 Res[
Idx + K * Sz] =
I + K * Sz;
5584return std::move(Res);
5587if (Sz == 2 && TE.getVectorFactor() == 4 &&
5589 2 * TE.getVectorFactor())) == 1)
5594if (TE.ReorderIndices.empty())
5595 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5598::addMask(ReorderMask, TE.ReuseShuffleIndices);
5599unsigned VF = ReorderMask.
size();
5603for (
unsignedI = 0;
I < VF;
I += Sz) {
5605unsigned UndefCnt = 0;
5606unsigned Limit = std::min(Sz, VF -
I);
5615 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
5619for (
unsigned K = 0; K < NumParts; ++K) {
5620unsignedIdx = Val + Sz * K;
5622 ResOrder[
Idx] =
I + K;
5625return std::move(ResOrder);
5627unsigned VF = TE.getVectorFactor();
5628// Try build correct order for extractelement instructions. 5630 TE.ReuseShuffleIndices.end());
5631if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
5633 if (isa<PoisonValue>(V))
5635 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5636 return Idx && *Idx < Sz;
5638assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported " 5639"by BinaryOperator and CastInst.");
5641if (TE.ReorderIndices.empty())
5642 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5645for (
unsignedI = 0;
I < VF; ++
I) {
5646int &
Idx = ReusedMask[
I];
5649Value *V = TE.Scalars[ReorderMask[
Idx]];
5651Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
5654// Build the order of the VF size, need to reorder reuses shuffles, they are 5655// always of VF size. 5657 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
5658auto *It = ResOrder.
begin();
5659for (
unsigned K = 0; K < VF; K += Sz) {
5663 std::iota(SubMask.begin(), SubMask.end(), 0);
5665transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
5666 std::advance(It, Sz);
5671return std::nullopt;
// No need to reorder. 5672return std::move(ResOrder);
5674if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5675any_of(TE.UserTreeIndices,
5677 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5681if ((TE.State == TreeEntry::Vectorize ||
5682 TE.State == TreeEntry::StridedVectorize) &&
5683 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5684 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5685assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported by " 5686"BinaryOperator and CastInst.");
5687return TE.ReorderIndices;
5689if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5690if (!TE.ReorderIndices.empty())
5691return TE.ReorderIndices;
5694for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
5695if (!V->hasNUsesOrMore(1))
5697auto *
II = dyn_cast<InsertElementInst>(*V->user_begin());
5702while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
5704II = dyn_cast<InsertElementInst>(
II->getOperand(0));
5710assert(BB1 != BB2 &&
"Expected different basic blocks.");
5711auto *NodeA = DT->
getNode(BB1);
5712auto *NodeB = DT->
getNode(BB2);
5713assert(NodeA &&
"Should only process reachable instructions");
5714assert(NodeB &&
"Should only process reachable instructions");
5715assert((NodeA == NodeB) ==
5716 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5717"Different nodes should have different DFS numbers");
5718return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5720auto PHICompare = [&](
unsigned I1,
unsigned I2) {
5721Value *V1 = TE.Scalars[I1];
5722Value *V2 = TE.Scalars[I2];
5723if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
5725if (isa<PoisonValue>(V1))
5727if (isa<PoisonValue>(V2))
5733auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
5734auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5735if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5736return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5737 FirstUserOfPhi2->getParent());
5738auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5739auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5740auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5741auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5747if (UserBVHead[I1] && !UserBVHead[I2])
5751if (UserBVHead[I1] == UserBVHead[I2])
5754return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
5756return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5763auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5764auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5765auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5766auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5769if (EE1->getOperand(0) == EE2->getOperand(0))
5773if (Inst1 && Inst2) {
5781"Expected either instructions or arguments vector operands.");
5782return P1->getArgNo() < P2->getArgNo();
5787 std::iota(Phis.
begin(), Phis.
end(), 0);
5790return std::nullopt;
// No need to reorder. 5791return std::move(Phis);
5793if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
5795// TODO: add analysis of other gather nodes with extractelement 5796// instructions and other values/instructions, not only undefs. 5797if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
5798 (
all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5799any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5801 auto *EE = dyn_cast<ExtractElementInst>(V);
5802 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5804// Check that gather of extractelements can be represented as 5805// just a shuffle of a single vector. 5808 canReuseExtract(TE.Scalars, CurrentOrder,
/*ResizeAllowed=*/true);
5809if (Reuse || !CurrentOrder.
empty())
5810return std::move(CurrentOrder);
5812// If the gather node is <undef, v, .., poison> and 5813// insertelement poison, v, 0 [+ permute] 5815// insertelement poison, v, n - try to reorder. 5816// If rotating the whole graph, exclude the permute cost, the whole graph 5817// might be transformed. 5818int Sz = TE.Scalars.size();
5820count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5822find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
5823if (It == TE.Scalars.begin())
5826if (It != TE.Scalars.end()) {
5828unsignedIdx = std::distance(TE.Scalars.begin(), It);
5843if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5846return std::move(Order);
5852if (TE.Scalars.size() >= 3)
5855// Check if can include the order of vectorized loads. For masked gathers do 5856// extra analysis later, so include such nodes into a special list. 5857if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
5861 CurrentOrder, PointerOps);
5863return std::move(CurrentOrder);
5865// FIXME: Remove the non-power-of-two check once findReusedOrderedScalars 5866// has been auditted for correctness with non-power-of-two vectors. 5874/// Checks if the given mask is a "clustered" mask with the same clusters of 5875/// size \p Sz, which are not identity submasks. 5881for (
unsignedI = Sz,
E = Mask.size();
I <
E;
I += Sz) {
5883if (Cluster != FirstCluster)
5889void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const{
5890// Reorder reuses mask. 5892constunsigned Sz =
TE.Scalars.size();
5893// For vectorized and non-clustered reused no need to do anything else. 5894if (!
TE.isGather() ||
5902// Clear reorder since it is going to be applied to the new mask. 5903TE.ReorderIndices.clear();
5904// Try to improve gathered nodes with clustered reuses, if possible. 5909// Fill the reuses mask with the identity submasks. 5910for (
auto *It =
TE.ReuseShuffleIndices.begin(),
5911 *
End =
TE.ReuseShuffleIndices.end();
5912 It !=
End; std::advance(It, Sz))
5913 std::iota(It, std::next(It, Sz), 0);
5919"Expected same size of orders");
5920unsigned Sz = Order.
size();
5922for (
unsignedIdx : seq<unsigned>(0, Sz)) {
5923if (Order[
Idx] != Sz)
5924 UsedIndices.
set(Order[
Idx]);
5926if (SecondaryOrder.
empty()) {
5927for (
unsignedIdx : seq<unsigned>(0, Sz))
5928if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5931for (
unsignedIdx : seq<unsigned>(0, Sz))
5932if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5933 !UsedIndices.
test(SecondaryOrder[
Idx]))
5934 Order[
Idx] = SecondaryOrder[
Idx];
5939// Maps VF to the graph nodes. 5941// ExtractElement gather nodes which can be vectorized and need to handle 5945// Phi nodes can have preferred ordering based on their result users 5948// AltShuffles can also have a preferred ordering that leads to fewer 5949// instructions, e.g., the addsub instruction in x86. 5952// Maps a TreeEntry to the reorder indices of external users. 5954 ExternalUserReorderMap;
5955// Find all reorderable nodes with the given VF. 5956// Currently the are vectorized stores,loads,extracts + some gathering of 5959const std::unique_ptr<TreeEntry> &TE) {
5960// Look for external users that will probably be vectorized. 5962 findExternalStoreUsersReorderIndices(TE.get());
5963if (!ExternalUserReorderIndices.
empty()) {
5964 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5966 std::move(ExternalUserReorderIndices));
5969// Patterns like [fadd,fsub] can be combined into a single instruction in 5970// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need 5971// to take into account their order when looking for the most used order. 5972if (TE->hasState() && TE->isAltShuffle()) {
5975unsigned Opcode0 = TE->getOpcode();
5976unsigned Opcode1 = TE->getAltOpcode();
5978// If this pattern is supported by the target then we consider the order. 5979if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5980 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5983// TODO: Check the reverse order too. 5986if (std::optional<OrdersType> CurrentOrder =
5988// Do not include ordering for nodes used in the alt opcode vectorization, 5989// better to reorder them during bottom-to-top stage. If follow the order 5990// here, it causes reordering of the whole graph though actually it is 5991// profitable just to reorder the subgraph that starts from the alternate 5992// opcode vectorization node. Such nodes already end-up with the shuffle 5993// instruction and it is just enough to change this shuffle rather than 5994// rotate the scalars for the whole graph. 5996const TreeEntry *UserTE = TE.get();
5998if (UserTE->UserTreeIndices.size() != 1)
6001 return EI.UserTE->State == TreeEntry::Vectorize &&
6002 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
6005 UserTE = UserTE->UserTreeIndices.back().UserTE;
6008 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
6009if (!(TE->State == TreeEntry::Vectorize ||
6010 TE->State == TreeEntry::StridedVectorize) ||
6011 !TE->ReuseShuffleIndices.empty())
6012 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
6013if (TE->State == TreeEntry::Vectorize &&
6014 TE->getOpcode() == Instruction::PHI)
6015 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
6019// Reorder the graph nodes according to their vectorization factor. 6020for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
6021 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
6022auto It = VFToOrderedEntries.
find(VF);
6023if (It == VFToOrderedEntries.
end())
6025// Try to find the most profitable order. We just are looking for the most 6026// used order and reorder scalar elements in the nodes according to this 6027// mostly used order. 6029// Delete VF entry upon exit. 6032// All operands are reordered and used only in this node - propagate the 6033// most used order to the user node. 6038for (
const TreeEntry *OpTE : OrderedEntries) {
6039// No need to reorder this nodes, still need to extend and to use shuffle, 6040// just need to merge reordering shuffle and the reuse shuffle. 6041if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6043// Count number of orders uses. 6044constauto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6046if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6047auto It = GathersToOrders.find(OpTE);
6048if (It != GathersToOrders.end())
6051if (OpTE->hasState() && OpTE->isAltShuffle()) {
6052auto It = AltShufflesToOrders.find(OpTE);
6053if (It != AltShufflesToOrders.end())
6056if (OpTE->State == TreeEntry::Vectorize &&
6057 OpTE->getOpcode() == Instruction::PHI) {
6058auto It = PhisToOrders.
find(OpTE);
6059if (It != PhisToOrders.
end())
6062return OpTE->ReorderIndices;
6064// First consider the order of the external scalar users. 6065auto It = ExternalUserReorderMap.
find(OpTE);
6066if (It != ExternalUserReorderMap.
end()) {
6067constauto &ExternalUserReorderIndices = It->second;
6068// If the OpTE vector factor != number of scalars - use natural order, 6069// it is an attempt to reorder node with reused scalars but with 6071if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6072 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
6073 ExternalUserReorderIndices.size();
6075for (
constOrdersType &ExtOrder : ExternalUserReorderIndices)
6076 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6078// No other useful reorder data in this entry. 6082// Stores actually store the mask, not the order, need to invert. 6083if (OpTE->State == TreeEntry::Vectorize &&
6084 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6085assert(!OpTE->isAltShuffle() &&
6086"Alternate instructions are only supported by BinaryOperator " 6090unsignedE = Order.size();
6093 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6096 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6098 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6101if (OrdersUses.empty())
6103// Choose the most used order. 6104unsigned IdentityCnt = 0;
6105unsigned FilledIdentityCnt = 0;
6107for (
auto &Pair : OrdersUses) {
6109if (!Pair.first.empty())
6110 FilledIdentityCnt += Pair.second;
6111 IdentityCnt += Pair.second;
6116unsigned Cnt = IdentityCnt;
6117for (
auto &Pair : OrdersUses) {
6118// Prefer identity order. But, if filled identity found (non-empty order) 6119// with same number of uses, as the new candidate order, we can choose 6120// this candidate order. 6121if (Cnt < Pair.second ||
6122 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6123 Cnt == Pair.second && !BestOrder.
empty() &&
6126 BestOrder = Pair.first;
6132// Set order of the user node. 6139unsignedE = BestOrder.
size();
6141 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6143// Do an actual reordering, if profitable. 6144for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6145// Just do the reordering for the nodes with the given VF. 6146if (TE->Scalars.size() != VF) {
6147if (TE->ReuseShuffleIndices.size() == VF) {
6148// Need to reorder the reuses masks of the operands with smaller VF to 6149// be able to find the match between the graph nodes and scalar 6150// operands of the given node during vectorization/cost estimation. 6153 return EI.UserTE->Scalars.size() == VF ||
6154 EI.UserTE->Scalars.size() ==
6157"All users must be of VF size.");
6160// ShuffleVectorInst does not do reorderOperands (and it should not 6161// because ShuffleVectorInst supports only a limited set of 6162// patterns). Only do reorderNodeWithReuses if all of the users are 6163// not ShuffleVectorInst. 6165 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6170 return isa<ShuffleVectorInst>(
6171 EI.UserTE->getMainOp());
6173"Does not know how to reorder.");
6175// Update ordering of the operands with the smaller VF than the given 6177 reorderNodeWithReuses(*TE, Mask);
6181if ((TE->State == TreeEntry::Vectorize ||
6182 TE->State == TreeEntry::StridedVectorize) &&
6185 (
SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6186assert(!TE->isAltShuffle() &&
6187"Alternate instructions are only supported by BinaryOperator " 6189// Build correct orders for extract{element,value}, loads and 6192if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6193 TE->reorderOperands(Mask);
6195// Reorder the node and its operands. 6196 TE->reorderOperands(Mask);
6197assert(TE->ReorderIndices.empty() &&
6198"Expected empty reorder sequence.");
6201if (!TE->ReuseShuffleIndices.empty()) {
6202// Apply reversed order to keep the original ordering of the reused 6203// elements to avoid extra reorder indices shuffling. 6208addMask(NewReuses, TE->ReuseShuffleIndices);
6209 TE->ReuseShuffleIndices.swap(NewReuses);
6215bool BoUpSLP::canReorderOperands(
6216 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6219for (
unsignedI = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
6220if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
6221return OpData.first ==
I &&
6222 (OpData.second->State == TreeEntry::Vectorize ||
6223 OpData.second->State == TreeEntry::StridedVectorize);
6226if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
6227// Do not reorder if operand node is used by many user nodes. 6228if (
any_of(TE->UserTreeIndices,
6229 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6231// Add the node to the list of the ordered nodes with the identity 6233 Edges.emplace_back(
I, TE);
6234// Add ScatterVectorize nodes to the list of operands, where just 6235// reordering of the scalars is required. Similar to the gathers, so 6236// simply add to the list of gathered ops. 6237// If there are reused scalars, process this node as a regular vectorize 6238// node, just reorder reuses mask. 6239if (TE->State != TreeEntry::Vectorize &&
6240 TE->State != TreeEntry::StridedVectorize &&
6241 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6245 TreeEntry *
Gather =
nullptr;
6247 [&
Gather, UserTE,
I](TreeEntry *TE) {
6248assert(TE->State != TreeEntry::Vectorize &&
6249 TE->State != TreeEntry::StridedVectorize &&
6250"Only non-vectorized nodes are expected.");
6251if (
any_of(TE->UserTreeIndices,
6252 [UserTE,
I](
const EdgeInfo &EI) {
6253 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6255assert(TE->isSame(UserTE->getOperand(
I)) &&
6256"Operand entry does not match operands.");
6273// Find all reorderable leaf nodes with the given VF. 6274// Currently the are vectorized loads,extracts without alternate operands + 6275// some gathering of extracts. 6277for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6278if (TE->State != TreeEntry::Vectorize &&
6279 TE->State != TreeEntry::StridedVectorize)
6281if (std::optional<OrdersType> CurrentOrder =
6283 OrderedEntries.
insert(TE.get());
6284if (!(TE->State == TreeEntry::Vectorize ||
6285 TE->State == TreeEntry::StridedVectorize) ||
6286 !TE->ReuseShuffleIndices.empty())
6287 GathersToOrders.
insert(TE.get());
6291// 1. Propagate order to the graph nodes, which use only reordered nodes. 6292// I.e., if the node has operands, that are reordered, try to make at least 6293// one operand order in the natural order and reorder others + reorder the 6296while (!OrderedEntries.
empty()) {
6297// 1. Filter out only reordered nodes. 6298// 2. If the entry has multiple uses - skip it and jump to the next node. 6301for (TreeEntry *TE : OrderedEntries) {
6302if (!(TE->State == TreeEntry::Vectorize ||
6303 TE->State == TreeEntry::StridedVectorize ||
6304 (TE->isGather() && GathersToOrders.
contains(TE))) ||
6305 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6308 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6310 !Visited.
insert(TE).second) {
6314// Build a map between user nodes and their operands order to speedup 6315// search. The graph currently does not provide this dependency directly. 6316for (
EdgeInfo &EI : TE->UserTreeIndices)
6319// Erase filtered entries. 6320for (TreeEntry *TE : Filtered)
6321 OrderedEntries.remove(TE);
6323 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6325sort(UsersVec, [](
constauto &Data1,
constauto &Data2) {
6326return Data1.first->Idx > Data2.first->Idx;
6328for (
auto &
Data : UsersVec) {
6329// Check that operands are used only in the User node. 6331if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
6333for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6334 OrderedEntries.remove(
Op.second);
6337// All operands are reordered and used only in this node - propagate the 6338// most used order to the user node. 6342// Do the analysis for each tree entry only once, otherwise the order of 6343// the same node my be considered several times, though might be not 6347for (
constauto &
Op :
Data.second) {
6348 TreeEntry *OpTE =
Op.second;
6349if (!VisitedOps.
insert(OpTE).second)
6351if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6354if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6357return OpTE->ReorderIndices;
6359// The order is partially ordered, skip it in favor of fully non-ordered 6361if (Order.size() == 1)
6364Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
6365 return P.second == OpTE;
6367// Stores actually store the mask, not the order, need to invert. 6368if (OpTE->State == TreeEntry::Vectorize &&
6369 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6370assert(!OpTE->isAltShuffle() &&
6371"Alternate instructions are only supported by BinaryOperator " 6375unsignedE = Order.size();
6378 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6381 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6384 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6386auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
6387constauto AllowsReordering = [&](
const TreeEntry *TE) {
6388if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6389 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6390 (IgnoreReorder && TE->Idx == 0))
6392if (TE->isGather()) {
6401for (
constEdgeInfo &EI : OpTE->UserTreeIndices) {
6402 TreeEntry *UserTE = EI.
UserTE;
6403if (!VisitedUsers.
insert(UserTE).second)
6405// May reorder user node if it requires reordering, has reused 6406// scalars, is an alternate op vectorize node or its op nodes require 6408if (AllowsReordering(UserTE))
6410// Check if users allow reordering. 6411// Currently look up just 1 level of operands to avoid increase of 6413// Profitable to reorder if definitely more operands allow 6414// reordering rather than those with natural order. 6417 Ops, [UserTE, &AllowsReordering](
6418const std::pair<unsigned, TreeEntry *> &
Op) {
6419return AllowsReordering(
Op.second) &&
6422 return EI.UserTE == UserTE;
6424 })) <= Ops.
size() / 2)
6425 ++Res.first->second;
6428if (OrdersUses.empty()) {
6429for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6430 OrderedEntries.remove(
Op.second);
6433// Choose the most used order. 6434unsigned IdentityCnt = 0;
6435unsigned VF =
Data.second.front().second->getVectorFactor();
6437for (
auto &Pair : OrdersUses) {
6439 IdentityCnt += Pair.second;
6444unsigned Cnt = IdentityCnt;
6445for (
auto &Pair : OrdersUses) {
6446// Prefer identity order. But, if filled identity found (non-empty 6447// order) with same number of uses, as the new candidate order, we can 6448// choose this candidate order. 6449if (Cnt < Pair.second) {
6451 BestOrder = Pair.first;
6457// Set order of the user node. 6459for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6460 OrderedEntries.remove(
Op.second);
6464// Erase operands from OrderedEntries list and adjust their orders. 6469unsignedE = BestOrder.
size();
6471 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6473for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
6474 TreeEntry *TE =
Op.second;
6475 OrderedEntries.remove(TE);
6476if (!VisitedOps.
insert(TE).second)
6478if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
6479 reorderNodeWithReuses(*TE, Mask);
6482// Gathers are processed separately. 6483if (TE->State != TreeEntry::Vectorize &&
6484 TE->State != TreeEntry::StridedVectorize &&
6485 (TE->State != TreeEntry::ScatterVectorize ||
6486 TE->ReorderIndices.empty()))
6488assert((BestOrder.
size() == TE->ReorderIndices.size() ||
6489 TE->ReorderIndices.empty()) &&
6490"Non-matching sizes of user/operand entries.");
6492if (IgnoreReorder && TE == VectorizableTree.front().get())
6493 IgnoreReorder =
false;
6495// For gathers just need to reorder its scalars. 6496for (TreeEntry *
Gather : GatherOps) {
6498"Unexpected reordering of gathers.");
6499if (!
Gather->ReuseShuffleIndices.empty()) {
6500// Just reorder reuses indices. 6505 OrderedEntries.remove(
Gather);
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
6510 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6511Data.first->getMainOp()) ||
6512Data.first->isAltShuffle())
6513Data.first->reorderOperands(Mask);
6514if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
6515Data.first->isAltShuffle() ||
6516Data.first->State == TreeEntry::StridedVectorize) {
6519/*BottomOrder=*/true);
6520if (
Data.first->ReuseShuffleIndices.empty() &&
6521 !
Data.first->ReorderIndices.empty() &&
6522 !
Data.first->isAltShuffle()) {
        // Insert user node to the list to try to sink reordering deeper in
        // the graph.
        OrderedEntries.insert(Data.first);
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6534 VectorizableTree.front()->ReuseShuffleIndices.empty())
6535 VectorizableTree.front()->ReorderIndices.clear();
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
6552 TreeEntry *Entry = TEPtr.get();
    // No need to handle users of gathered values.
    if (Entry->isGather())
6559for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6560Value *Scalar = Entry->Scalars[Lane];
6561if (!isa<Instruction>(Scalar))
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
6565if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
6570if (ExtI != ExternallyUsedValues.
end()) {
6571int FoundLane = Entry->findLaneForValue(Scalar);
6573 << FoundLane <<
" from " << *Scalar <<
".\n");
6574 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
6575 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
6578for (
User *U : Scalar->users()) {
        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
6596 Scalar, getRootEntryInstruction(*UseEntry), TLI,
TTI)) {
6599assert(!UseEntry->isGather() &&
"Bad state");
6603if (It != ScalarToExtUses.
end()) {
6604 ExternalUses[It->second].User =
nullptr;
6609if (U && Scalar->hasNUsesOrMore(
UsesLimit))
6611int FoundLane = Entry->findLaneForValue(Scalar);
6613 <<
" from lane " << FoundLane <<
" from " << *Scalar
6615 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
6616 ExternalUses.emplace_back(Scalar, U, FoundLane);
6625BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const{
6629for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6630Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    // Collect stores per pointer object.
    for (User *U : V->users()) {
6640auto *SI = dyn_cast<StoreInst>(U);
      // Test whether we can handle the store. V might be a global, which could
      // be used in a different function.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
6646// Skip entry if already 6652auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6653 SI->getValueOperand()->getType(),
Ptr}];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane.
      if (StoresVec.size() > Lane)
6658if (!StoresVec.empty()) {
6660 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6661 SI->getValueOperand()->getType(),
6662 StoresVec.front()->getPointerOperand(), *
DL, *SE,
6663/*StrictCheck=*/true);
6664// We failed to compare the pointers so just abandon this store. 6668 StoresVec.push_back(SI);
6673for (
auto &
P : PtrToStoresMap) {
6674 Res[
I].swap(
P.second);
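  // Illustrative note (a sketch, not from the original source): at this point
  // PtrToStoresMap groups the candidate user stores by {parent block, stored
  // type, underlying pointer object}, with at most one StoreInst collected per
  // lane. E.g., for scalars {x0, x1} stored via "A[0] = x0; A[1] = x1;" (A, x0
  // and x1 are hypothetical names) the map holds a single entry keyed by the
  // object A with both stores, which is the shape canFormVector() expects.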
6681 OrdersType &ReorderIndices)
const{
  // We check whether the stores in StoresVec can form a vector by sorting them
  // and checking whether they are consecutive.
  // To avoid calling getPointersDiff() while sorting we create a vector of
  // pairs {store, offset from first} and sort this instead.
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6694 std::optional<int> Diff =
6696SI->getPointerOperand(), *
DL, *SE,
6697/*StrictCheck=*/true);
  // Check if the stores are consecutive by checking if their difference is 1.
  if (StoreOffsetVec.size() != StoresVec.size())
       [](const std::pair<int, unsigned> &L,
          const std::pair<int, unsigned> &R) { return L.first < R.first; });
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
  // Calculate the shuffle indices according to their offset against the sorted
  // StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(), so we are following the
  // same convention here.
  if (IsIdentity)
    ReorderIndices.clear();
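  // Worked example (illustrative, not from the original source): for stores
  // whose offsets from the first store are {0, +2, +1}, the sorted offsets
  // {0, +1, +2} are consecutive and ReorderIndices becomes {0, 2, 1}: lane 1
  // (offset +2) is written to vector position 2 and lane 2 (offset +1) to
  // position 1. Already-consecutive stores produce the identity order, which
  // is encoded as the empty OrdersType per the convention above.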
  for (unsigned Idx : Order)
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and push
  // it into `ExternalReorderIndices`.
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
    // If the stores are not consecutive then abandon this StoresVec.
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    // We now know that the scalars in StoresVec can form a vector instruction,
    // so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  return ExternalReorderIndices;
6774 UserIgnoreList = &UserIgnoreLst;
6777 buildTree_rec(Roots, 0,
EdgeInfo());
6784 buildTree_rec(Roots, 0,
EdgeInfo());
6787/// Tries to find subvector of loads and builds new vector of only loads if can 6801for (
Value *V : VL) {
6802auto *LI = dyn_cast<LoadInst>(V);
6805if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6808for (
auto [Map,
Data] :
zip(ClusteredDistToLoad, ClusteredLoads)) {
6809assert(LI->getParent() ==
Data.front().first->getParent() &&
6810 LI->getType() ==
Data.front().first->getType() &&
6814"Expected loads with the same type, same parent and same " 6815"underlying pointer.");
6817 LI->getType(), LI->getPointerOperand(),
Data.front().first->getType(),
6818Data.front().first->getPointerOperand(),
DL, SE,
6819/*StrictCheck=*/true);
6822auto It = Map.find(*Dist);
6823if (It != Map.end() && It->second != LI)
6825if (It == Map.end()) {
6826Data.emplace_back(LI, *Dist);
6827 Map.try_emplace(*Dist, LI);
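      // Illustrative note (not from the original source): loads are clustered
      // by their element distance from the first load of a cluster. E.g.,
      // loads from p, p+1 and p+3 form one cluster with recorded distances
      // {0, 1, 3}; a different load that also sits at distance 1 is skipped by
      // the `It->second != LI` check above instead of being re-added.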
6837auto FindMatchingLoads =
6842int &
Offset,
unsigned &Start) {
6844return GatheredLoads.
end();
6854 std::optional<int> Dist =
6856Data.front().first->getType(),
6857Data.front().first->getPointerOperand(),
DL, SE,
6858/*StrictCheck=*/true);
6863for (std::pair<LoadInst *, int>
P :
Data) {
6867// Found matching gathered loads - check if all loads are unique or 6868// can be effectively vectorized. 6869unsigned NumUniques = 0;
6870for (
auto [Cnt, Pair] :
enumerate(Loads)) {
6871bool Used = DataLoads.
contains(Pair.first);
6872if (!Used && !DataDists.
contains(*Dist + Pair.second)) {
6876 Repeated.insert(Cnt);
6879if (NumUniques > 0 &&
6880 (Loads.
size() == NumUniques ||
6881 (Loads.
size() - NumUniques >= 2 &&
6882 Loads.
size() - NumUniques >= Loads.
size() / 2 &&
6888return std::next(GatheredLoads.
begin(),
Idx);
6892return GatheredLoads.
end();
6894for (
ArrayRef<std::pair<LoadInst *, int>>
Data : ClusteredLoads) {
6898auto *It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
6900while (It != GatheredLoads.
end()) {
6901assert(!LocalToAdd.
empty() &&
"Expected some elements to add.");
6902for (
unsignedIdx : LocalToAdd)
6904 ToAdd.
insert(LocalToAdd.begin(), LocalToAdd.end());
6905 It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
Offset,
6909 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6913for (
unsignedIdx : seq<unsigned>(
Data.size())) {
6922 GatheredLoads, [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6923return PD.front().first->getParent() == LI->
getParent() &&
6924 PD.front().first->getType() == LI->
getType();
6926while (It != GatheredLoads.
end()) {
6929 std::next(It), GatheredLoads.
end(),
6930 [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6931 return PD.front().first->getParent() == LI->getParent() &&
6932 PD.front().first->getType() == LI->getType();
6936 GatheredLoads.emplace_back().append(
Data.begin(),
Data.end());
6937 AddNewLoads(GatheredLoads.emplace_back());
6942void BoUpSLP::tryToVectorizeGatheredLoads(
6945 8> &GatheredLoads) {
6946 GatheredLoadsEntriesFirst = VectorizableTree.size();
6949 LoadEntriesToVectorize.
size());
6950for (
auto [
Idx, Set] :
zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6951Set.insert(VectorizableTree[
Idx]->Scalars.begin(),
6952 VectorizableTree[
Idx]->Scalars.end());
6954// Sort loads by distance. 6955auto LoadSorter = [](
const std::pair<LoadInst *, int> &L1,
6956const std::pair<LoadInst *, int> &L2) {
6957return L1.second > L2.second;
6963Align Alignment = computeCommonAlignment<LoadInst>(Values);
6972bool Final,
unsigned MaxVF) {
6974unsigned StartIdx = 0;
6979 *
TTI, Loads.
front()->getType(), MaxVF);
6981 *
TTI, Loads.
front()->getType(), NumElts - 1)) {
6987if (Final && CandidateVFs.
empty())
6990unsigned BestVF = Final ? CandidateVFs.
back() : 0;
6991for (
unsigned NumElts : CandidateVFs) {
6992if (Final && NumElts > BestVF)
6995for (
unsigned Cnt = StartIdx,
E = Loads.
size(); Cnt <
E;
6999if (VectorizedLoads.count(Slice.
front()) ||
7000 VectorizedLoads.count(Slice.
back()) ||
              // Check if it is profitable to try vectorizing gathered loads.
              // It is profitable if we have more than 3 consecutive loads or
              // if we have fewer but all users are vectorized or deleted.
              bool AllowToVectorize = false;
              // Check if it is profitable to vectorize 2-element loads.
              // If single use/user - allow to vectorize.
              // 1. Check if number of uses equals number of users.
              // 2. All users are deleted.
              // 3. The load broadcasts are not allowed or the load is not
              //    broadcasted.
              if (
static_cast<unsignedint>(std::distance(
7021 LI->user_begin(), LI->user_end())) != LI->getNumUses())
7023if (!IsLegalBroadcastLoad)
7027for (
User *U : LI->users()) {
7028if (
auto *UI = dyn_cast<Instruction>(U); UI &&
isDeleted(UI))
7030if (
const TreeEntry *UTE = getTreeEntry(U)) {
7031for (
intI : seq<int>(UTE->getNumOperands())) {
7033 [LI](
Value *V) { return V == LI; }))
7034// Found legal broadcast - do not vectorize. 7042 AllowToVectorize = CheckIfAllowed(Slice);
7046any_of(ValueToGatherNodes.at(Slice.front()),
7047 [=](
const TreeEntry *TE) {
7048 return TE->Scalars.size() == 2 &&
7049 ((TE->Scalars.front() == Slice.front() &&
7050 TE->Scalars.back() == Slice.back()) ||
7051 (TE->Scalars.front() == Slice.back() &&
7052 TE->Scalars.back() == Slice.front()));
7057if (AllowToVectorize) {
7060// Try to build vector load. 7062reinterpret_cast<Value *
const*
>(Slice.begin()), Slice.size());
7064 PointerOps, &BestVF);
7066 (BestVF > 1 &&
static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7068if (MaskedGatherVectorized.
empty() ||
7069 Cnt >= MaskedGatherVectorized.
back() + NumElts)
7074Results.emplace_back(Values, LS);
7075 VectorizedLoads.insert(Slice.begin(), Slice.end());
                    // If we vectorized initial block, no need to try to
                    // vectorize it again.
                    StartIdx += NumElts;
                    // Check if the whole array was vectorized already - exit.
                    if (StartIdx >= Loads.size())
                  // Erase last masked gather candidate, if another candidate
                  // within the range is found to be better.
                  if (!MaskedGatherVectorized.empty() &&
                      Cnt < MaskedGatherVectorized.back() + NumElts)
7093if (!AllowToVectorize || BestVF == 0)
        // Mark masked gathers candidates as vectorized, if any.
        for (unsigned Cnt : MaskedGatherVectorized) {
7099 Cnt, std::min<unsigned>(NumElts, Loads.
size() - Cnt));
7103 VectorizedLoads.insert(Slice.
begin(), Slice.
end());
          // If we vectorized initial block, no need to try to vectorize it
          // again.
          StartIdx += NumElts;
7110if (!VectorizedLoads.contains(LI))
7111 NonVectorized.push_back(LI);
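    // Illustrative note (not from the original source): per the profitability
    // rules above, a run of 4 or more consecutive candidate loads is always
    // worth a vectorization attempt, while a 2-element slice is only tried
    // when its loads have no remaining scalar users (all users vectorized or
    // deleted) and the target cannot simply re-broadcast the loaded value.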
7115auto ProcessGatheredLoads =
7120for (
ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7121if (LoadsDists.size() <= 1) {
7122 NonVectorized.
push_back(LoadsDists.back().first);
7127transform(LoadsDists, OriginalLoads.begin(),
7128 [](
const std::pair<LoadInst *, int> &L) ->
LoadInst * {
7133unsigned MaxConsecutiveDistance = 0;
7134unsigned CurrentConsecutiveDist = 1;
7135int LastDist = LocalLoadsDists.
front().second;
7136bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7137for (
const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7138if (getTreeEntry(
L.first))
7141"Expected first distance always not less than second");
7142if (
static_cast<unsigned>(LastDist -
L.second) ==
7143 CurrentConsecutiveDist) {
7144 ++CurrentConsecutiveDist;
7145 MaxConsecutiveDistance =
7146 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7150if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7153 CurrentConsecutiveDist = 1;
7157if (Loads.
size() <= 1)
7159if (AllowMaskedGather)
7160 MaxConsecutiveDistance = Loads.
size();
7161elseif (MaxConsecutiveDistance < 2)
7166 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7167 Final, MaxConsecutiveDistance);
7169 OriginalLoads.size() == Loads.
size() &&
7170 MaxConsecutiveDistance == Loads.
size() &&
7175 VectorizedLoads.
clear();
7179 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7180 UnsortedNonVectorized, Final,
7181 OriginalLoads.size());
7182if (SortedNonVectorized.
size() >= UnsortedNonVectorized.
size()) {
7183 SortedNonVectorized.
swap(UnsortedNonVectorized);
7189 << Slice.
size() <<
")\n");
7190if (
any_of(Slice, [&](
Value *V) {
return getTreeEntry(V); })) {
7191for (
Value *L : Slice)
7192if (!getTreeEntry(L))
7193 SortedNonVectorized.
push_back(cast<LoadInst>(L));
          // Select maximum VF as a maximum of user gathered nodes and
          // distance between scalar loads in these nodes.
          unsigned MaxVF = Slice.size();
7200unsigned UserMaxVF = 0;
7201unsigned InterleaveFactor = 0;
            // Found distance between segments of the interleaved loads.
            std::optional<unsigned> InterleavedLoadsDistance = 0;
            std::optional<unsigned> CommonVF = 0;
7212for (
const TreeEntry *
E : ValueToGatherNodes.at(V)) {
7213 UserMaxVF = std::max<unsigned>(UserMaxVF,
E->Scalars.size());
7216 UserMaxVF = std::max<unsigned>(UserMaxVF,
Idx - Pos + 1);
7218if (*CommonVF == 0) {
7219 CommonVF =
E->Scalars.size();
7222if (*CommonVF !=
E->Scalars.size())
              // Check if the load is part of the interleaved load.
              if (Pos != Idx && InterleavedLoadsDistance) {
7229 if (isa<Constant>(V))
7231 if (getTreeEntry(V))
7233 const auto &Nodes = ValueToGatherNodes.at(V);
7234 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7235 !is_contained(Slice, V);
7237 InterleavedLoadsDistance.reset();
7241if (*InterleavedLoadsDistance == 0) {
7242 InterleavedLoadsDistance =
Idx - Pos;
7245if ((
Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7246 (
Idx - Pos) / *InterleavedLoadsDistance < Order)
7247 InterleavedLoadsDistance.reset();
7248 Order = (
Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7252 DeinterleavedNodes.
clear();
            // Check if the large load represents an interleaved load
            // operation.
            if (InterleavedLoadsDistance.value_or(0) > 1 &&
                CommonVF.value_or(0) != 0) {
7256 InterleaveFactor =
bit_ceil(*InterleavedLoadsDistance);
7257unsigned VF = *CommonVF;
7260// Segmented load detected - vectorize at maximum vector factor. 7261if (InterleaveFactor <= Slice.size() &&
7265 cast<LoadInst>(Slice.front())->getAlign(),
7266 cast<LoadInst>(Slice.front())
7270 UserMaxVF = InterleaveFactor * VF;
7272 InterleaveFactor = 0;
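            // Worked example (illustrative, not from the original source): if
            // the gathered user nodes all have VF 4 (CommonVF == 4) and the
            // load segments repeat every 2 elements
            // (InterleavedLoadsDistance == 2), the slice is treated as a 2-way
            // interleaved load: InterleaveFactor = bit_ceil(2) = 2 and
            // UserMaxVF = InterleaveFactor * VF = 8, provided the target
            // reports the corresponding segmented access as legal.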
7275// Cannot represent the loads as consecutive vectorizable nodes - 7277unsigned ConsecutiveNodesSize = 0;
7278if (!LoadEntriesToVectorize.
empty() && InterleaveFactor == 0 &&
7279any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7280 [&, Slice = Slice](
constauto &
P) {
7282return std::get<1>(
P).contains(V);
7284if (It == Slice.end())
7287 VectorizableTree[std::get<0>(
P)]->Scalars;
7288 ConsecutiveNodesSize += VL.
size();
7289unsigned Start = std::distance(Slice.begin(), It);
7290unsigned Sz = Slice.size() - Start;
7291return Sz < VL.
size() ||
7292 Slice.slice(std::distance(Slice.begin(), It),
          // Try to build long masked gather loads.
          if (InterleaveFactor == 0 &&
7299any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7300 [&, Slice = Slice](
unsignedIdx) {
7302 SmallVector<Value *> PointerOps;
7303 return canVectorizeLoads(
7304 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7305 Slice[Idx * UserMaxVF], Order,
7307 LoadsState::ScatterVectorize;
7310if (Slice.size() != ConsecutiveNodesSize)
7311 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7313for (
unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7314bool IsVectorized =
true;
7315for (
unsignedI = 0,
E = Slice.size();
I <
E;
I += VF) {
7318if (getTreeEntry(SubSlice.
front()))
7320// Check if the subslice is to be-vectorized entry, which is not 7322if (
any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7325 VectorizableTree[std::get<0>(
P)]
7330unsigned Sz = VectorizableTree.size();
7331 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7332if (Sz == VectorizableTree.size()) {
7333 IsVectorized =
false;
7334// Try non-interleaved vectorization with smaller vector 7336if (InterleaveFactor > 0) {
7337 VF = 2 * (MaxVF / InterleaveFactor);
7338 InterleaveFactor = 0;
7347 NonVectorized.
append(SortedNonVectorized);
7349return NonVectorized;
7351for (
constauto &GLs : GatheredLoads) {
7352constauto &
Ref = GLs.second;
7354if (!
Ref.empty() && !NonVectorized.
empty() &&
7356Ref.begin(),
Ref.end(), 0u,
7358ArrayRef<std::pair<LoadInst *, int>> LoadsDists) ->
unsigned {
7359 return S + LoadsDists.size();
7360 }) != NonVectorized.
size() &&
7361 IsMaskedGatherSupported(NonVectorized)) {
7363for (
LoadInst *LI : NonVectorized) {
7364// Reinsert non-vectorized loads to other list of loads with the same 7370// Final attempt to vectorize non-vectorized loads. 7371 (void)ProcessGatheredLoads(FinalGatheredLoads,
/*Final=*/true);
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
7376const TreeEntry &
E = *VectorizableTree[
Idx];
7378// Avoid reordering, if possible. 7379if (!
E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder scalars per this
      // mask.
      buildTree_rec(GatheredScalars, 0, EdgeInfo());
  // If no new entries created, consider it as no gathered loads entries must
  // be handled.
  if (
static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7391 VectorizableTree.size())
7392 GatheredLoadsEntriesFirst.reset();
/// \return true if the specified list of values has only one instruction that
/// requires scheduling, false otherwise.
  Value *NeedsScheduling = nullptr;
7400for (
Value *V : VL) {
7403if (!NeedsScheduling) {
7404 NeedsScheduling = V;
7409return NeedsScheduling;
/// Generates key/subkey pair for the given value to provide effective sorting
/// of the values and better detection of the vectorizable values sequences.
/// The keys/subkeys can be used for better sorting of the values themselves
/// (keys) and in values subgroups (subkeys).
                                       bool AllowAlternate) {
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
7427 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
  // Sort extracts by the vector operands.
  if (isa<ExtractElementInst, UndefValue>(V))
7434if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
7436 !isa<UndefValue>(EI->getIndexOperand()))
7439 }
elseif (
auto *
I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcodes except for CMPInst.
    // For CMP also sort by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(I)) &&
7452 : cast<CastInst>(
I)->getOperand(0)->getType()));
7453// For casts, look through the only operand to improve compile time. 7454if (isa<CastInst>(
I)) {
7455 std::pair<size_t, size_t> OpVals =
7457/*AllowAlternate=*/true);
7461 }
elseif (
auto *CI = dyn_cast<CmpInst>(
I)) {
7463if (CI->isCommutative())
7469 }
elseif (
auto *Call = dyn_cast<CallInst>(
I)) {
7483 }
elseif (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
7484if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7485 SubKey =
hash_value(Gep->getPointerOperand());
7489 !isa<ConstantInt>(
I->getOperand(1))) {
      // Do not try to vectorize instructions with potentially high cost.
  return std::make_pair(Key, SubKey);
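// Illustrative note (not from the original source): the (Key, SubKey) pair is
// a coarse-to-fine grouping. For instance, all loads may share one Key while
// their SubKey encodes the load-specific subkey above, and casts share a Key
// while the SubKey folds in the source operand type, so that values likely to
// form one vectorizable bundle sort next to each other.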
7500/// Checks if the specified instruction \p I is an alternate operation for 7501/// the given \p MainOp and \p AltOp instructions. 7507bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
7509unsigned Opcode0 = S.getOpcode();
7510unsigned Opcode1 = S.getAltOpcode();
7512// If this pattern is supported by the target then consider it profitable. 7514 Opcode0, Opcode1, OpcodeMask))
7517for (
unsignedI : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7519// Prepare the operand vector. 7520for (
Value *V : VL) {
7521if (isa<PoisonValue>(V)) {
7526Operands.back().push_back(cast<Instruction>(V)->getOperand(
I));
7530// Try find best operands candidates. 7531for (
unsignedI : seq<unsigned>(0, VL.size() - 1)) {
7537switch (Res.value_or(0)) {
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate number of instructions, required for the vectorized node and for
  // the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles, required for vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that operands do not contain same values and create either perfect
  // diamond match or shuffled match.
  // Do not count same operands twice.
        return is_contained(Operands.back(), V);
7570 ++ExtraShuffleInsts;
7574// Vectorize node, if: 7575// 1. at least single operand is constant or splat. 7576// 2. Operands have many loop invariants (the instructions are not loop 7578// 3. At least single unique operands is supposed to vectorized. 7587if (isa<Constant, ExtractElementInst>(V) ||
7588 getTreeEntry(V) || (L &&
L->isLoopInvariant(V))) {
7589 if (isa<UndefValue>(V))
7594// Found first duplicate - need to add shuffle. 7595if (!Res.second && Res.first->second == 1)
7596 ++ExtraShuffleInsts;
7597 ++Res.first->getSecond();
7598if (
auto *
I = dyn_cast<Instruction>(V))
7599 UniqueOpcodes.
insert(
I->getOpcode());
7603returnnone_of(Uniques, [&](
constauto &
P) {
7604returnP.first->hasNUsesOrMore(
P.second + 1) &&
7606 return getTreeEntry(U) || Uniques.contains(U);
         // Do not vectorize node, if estimated number of vector instructions
         // is more than estimated number of buildvector instructions. Number
         // of vector operands is number of vector instructions + number of
         // vector instructions for operands (buildvectors). Number of
         // buildvector instructions is just number_of_operands *
         // number_of_scalars.
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
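// Worked example (illustrative, not from the original source): for an
// alternate add/sub node with 4 scalars and 2 operands each, the buildvector
// side is estimated as 2 * 4 = 8 instructions, while the vector side is
// roughly UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts + 3
// (main + alt + shuffle). With 2 unique operand opcodes and no extra shuffles
// that is 2 + 0 + 0 + 3 = 5 < 8, so the alternate node is kept.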
7620BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7622bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7625"Expected instructions with same/alternate opcodes only.");
7627unsigned ShuffleOrOp =
7628 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
7630switch (ShuffleOrOp) {
7631case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
7637auto *
PHI = dyn_cast<PHINode>(V);
7642if (Term &&
Term->isTerminator()) {
7644 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
7645return TreeEntry::NeedToGather;
7650return TreeEntry::Vectorize;
7652case Instruction::ExtractValue:
7653case Instruction::ExtractElement: {
7654bool Reuse = canReuseExtract(VL, CurrentOrder);
    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
    // non-full registers).
      return TreeEntry::NeedToGather;
7659if (Reuse || !CurrentOrder.empty())
7660return TreeEntry::Vectorize;
7662return TreeEntry::NeedToGather;
7664case Instruction::InsertElement: {
7665// Check that we have a buildvector and not a shuffle of 2 or more 7666// different vectors. 7668for (
Value *V : VL) {
7669 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
7671"Non-constant or undef index?");
7675return !SourceVectors.contains(V);
7677// Found 2nd source vector - cancel. 7679"different source vectors.\n");
7680return TreeEntry::NeedToGather;
7684// The last InsertElement can have multiple uses. 7685return SourceVectors.contains(V) && !
V->hasOneUse();
7690return TreeEntry::NeedToGather;
7693return TreeEntry::Vectorize;
7695case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
      return TreeEntry::Vectorize;
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
7709return TreeEntry::NeedToGather;
7711return TreeEntry::ScatterVectorize;
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
7716return TreeEntry::NeedToGather;
7718return TreeEntry::StridedVectorize;
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy))
      LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
7726auto *LI = dyn_cast<LoadInst>(V);
7727return !LI || !LI->isSimple();
7734return TreeEntry::NeedToGather;
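    // Illustrative note (not from the original source): the non-packed-type
    // check above rejects element types whose allocation is padded. E.g., i2
    // has a type size of 2 bits but an alloc size of 8 bits, so a vectorized
    // <N x i2> access would touch different memory than N scalar i2 accesses;
    // i32 (32 == 32 bits) passes the check.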
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
7751for (
Value *V : VL) {
7752if (isa<PoisonValue>(V))
7754Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7757dbgs() <<
"SLP: Gathering casts with different src types.\n");
7758return TreeEntry::NeedToGather;
7761return TreeEntry::Vectorize;
7763case Instruction::ICmp:
7764case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    for (Value *V : VL) {
7770if (isa<PoisonValue>(V))
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
7779return TreeEntry::Vectorize;
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
7802if (S.getMainOp()->getType()->isFloatingPointTy() &&
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
7807return TreeEntry::NeedToGather;
7808return TreeEntry::Vectorize;
7809case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
7821// We can't combine several GEPs into one vector if they operate on 7823Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7824for (
Value *V : VL) {
7825auto *
GEP = dyn_cast<GEPOperator>(V);
7828Type *CurTy =
GEP->getSourceElementType();
7830LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
7831return TreeEntry::NeedToGather;
7835// We don't combine GEPs with non-constant indexes. 7837for (
Value *V : VL) {
7838auto *
I = dyn_cast<GetElementPtrInst>(V);
7841auto *
Op =
I->getOperand(1);
7842if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7843 (
Op->getType() != Ty1 &&
7844 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7845Op->getType()->getScalarSizeInBits() >
7846DL->getIndexSizeInBits(
7847V->getType()->getPointerAddressSpace())))) {
7849dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
7850return TreeEntry::NeedToGather;
7854return TreeEntry::Vectorize;
7856case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        return TreeEntry::NeedToGather;
    // Check the order of pointer operands.
    if (CurrentOrder.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[CurrentOrder.front()];
      PtrN = PointerOps[CurrentOrder.back()];
    }
    std::optional<int> Dist =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
    // Check that the sorted pointer operands are consecutive.
    if (static_cast<unsigned>(*Dist) == VL.size() - 1)
      return TreeEntry::Vectorize;

    return TreeEntry::NeedToGather;
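    // Worked example (illustrative, not from the original source): for four
    // stores to p, p+1, p+2, p+3 (in units of the stored type) the sorted
    // first/last pointers differ by 3 == VL.size() - 1, so the bundle is
    // vectorized as one consecutive store; a gap (p, p+1, p+3, p+4) gives a
    // distance of 4 != 3 and the bundle is gathered instead.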
7898case Instruction::Call: {
7899if (S.getMainOp()->getType()->isFloatingPointTy() &&
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
7904return TreeEntry::NeedToGather;
7905// Check if the calls are all to the same vectorizable intrinsic or 7913false/*HasGlobalPred*/);
7918return TreeEntry::NeedToGather;
7923for (
unsigned J = 0; J != NumArgs; ++J)
7926for (
Value *V : VL) {
7927CallInst *CI2 = dyn_cast<CallInst>(V);
7935return TreeEntry::NeedToGather;
7937// Some intrinsics have scalar arguments and should be same in order for 7938// them to be vectorized. 7939for (
unsigned J = 0; J != NumArgs; ++J) {
7942if (ScalarArgs[J] != A1J) {
7944 <<
"SLP: mismatched arguments in call:" << *CI
7945 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
7946return TreeEntry::NeedToGather;
7950// Verify that the bundle operands are identical between the two calls. 7955LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
7956 <<
"!=" << *V <<
'\n');
7957return TreeEntry::NeedToGather;
7961return TreeEntry::Vectorize;
7963case Instruction::ShuffleVector: {
7964if (!S.isAltShuffle()) {
7965// REVEC can support non alternate shuffle. 7967return TreeEntry::Vectorize;
7968// If this is not an alternate sequence of opcode like add-sub 7969// then do not vectorize this instruction. 7971return TreeEntry::NeedToGather;
7976 <<
"SLP: ShuffleVector not vectorized, operands are buildvector and " 7977"the whole alt sequence is not profitable.\n");
7978return TreeEntry::NeedToGather;
7981return TreeEntry::Vectorize;
7985return TreeEntry::NeedToGather;
7990/// Allows to correctly handle operands of the phi nodes based on the \p Main 7991/// PHINode order of incoming basic blocks/values. 7999 PHIHandler() =
delete;
8001 : DT(DT), Main(Main), Phis(Phis),
8002Operands(Main->getNumIncomingValues(),
8004void buildOperands() {
8005constexprunsigned FastLimit = 4;
8013// Prepare the operand vector. 8015auto *
P = dyn_cast<PHINode>(V);
8017assert(isa<PoisonValue>(V) &&
8018"Expected isa instruction or poison value.");
8022if (
P->getIncomingBlock(
I) == InBB)
8037Blocks.try_emplace(InBB).first->second.push_back(
I);
8040if (isa<PoisonValue>(V)) {
8045auto *
P = cast<PHINode>(V);
8046for (
unsignedI : seq<unsigned>(0,
P->getNumIncomingValues())) {
8054auto It =
Blocks.find(InBB);
8061if (
P.getSecond().size() <= 1)
8063unsigned BasicI =
P.getSecond().front();
8066 [&](
constauto &Data) {
8067return !Data.value() ||
8068 Data.value() ==
Operands[BasicI][Data.index()];
8070"Expected empty operands list.");
8080const EdgeInfo &UserTreeIdx,
8081unsigned InterleaveFactor) {
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    // Check that every instruction appears once in this bundle.
    for (Value *V : VL) {
8098auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
8103size_t NumUniqueScalarValues = UniqueValues.
size();
8106if (NumUniqueScalarValues == VL.size() &&
8108 ReuseShuffleIndices.
clear();
8110// FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. 8111if ((UserTreeIdx.UserTE &&
8112 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI)) ||
8115"for nodes with padding.\n");
8116 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8120if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8121 (UniquePositions.size() == 1 &&
all_of(UniqueValues, [](
Value *V) {
8124if (DoNotFail && UniquePositions.size() > 1 &&
8125 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8126all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8127// Find the number of elements, which forms full vectors. 8129 *
TTI, UniqueValues.
front()->getType(), UniqueValues.
size());
8130if (PWSz == VL.size()) {
8131 ReuseShuffleIndices.
clear();
8133 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
8135 PWSz - UniqueValues.
size(),
8137// Check that extended with poisons operations are still valid for 8138// vectorization (div/rem are not allowed). 8141 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8144 VL = NonUniqueValueVL;
8149 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch. There is no
  // place to insert a shuffle if we need to, so just avoid that issue.
  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8164 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
  // Check if this is a duplicate of another entry.
  if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8173if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8174auto It = MultiNodeScalars.
find(S.getMainOp());
8175if (It != MultiNodeScalars.
end()) {
8176auto *TEIt =
find_if(It->getSecond(),
8177 [&](TreeEntry *ME) { return ME->isSame(VL); });
8178if (TEIt != It->getSecond().end())
8189if (TryToFindDuplicates(S))
8190 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8191 ReuseShuffleIndices);
8195 Nodes.
insert(getTreeEntry(S.getMainOp()));
8196for (
const TreeEntry *E : MultiNodeScalars.
lookup(S.getMainOp()))
8199if (
any_of(Nodes, [&](
const TreeEntry *E) {
8201 [&](
Value *V) { return Values.contains(V); }))
8206all_of(VL, [&](
Value *V) {
return EValues.contains(V); }));
8209if (TryToFindDuplicates(S))
8210 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8211 ReuseShuffleIndices);
8215// Record the reuse of the tree node. FIXME, currently this is only 8216// used to properly draw the graph rather than for the actual 8218 E->UserTreeIndices.push_back(UserTreeIdx);
8219LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.getMainOp()
8226// Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of 8227// a load), in which case peek through to include it in the tree, without 8228// ballooning over-budget. 8230 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8235 cast<Instruction>(
I)->getOpcode() == S.getOpcode();
8237LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
8238if (TryToFindDuplicates(S))
8239 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8240 ReuseShuffleIndices);
  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8248LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
8249if (TryToFindDuplicates(S))
8250 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8251 ReuseShuffleIndices);
8255// Don't handle vectors. 8258 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8262// If all of the operands are identical or constant we have a simple solution. 8263// If we deal with insert/extract instructions, they all must have constant 8264// indices, otherwise we should gather them, not try to vectorize. 8265// If alternate op node with 2 elements with gathered operands - do not 8267auto &&NotProfitableForVectorization = [&S,
this,
8269if (!S || !S.isAltShuffle() || VL.size() > 2)
8275// Check if all operands are extracts, part of vector node or can build a 8276// regular vectorize node. 8278for (
Value *V : VL) {
8279auto *
I = cast<Instruction>(V);
8281 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8286if ((IsCommutative &&
8287 std::accumulate(InstsCount.
begin(), InstsCount.
end(), 0) < 2) ||
8289all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
8291assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
8293auto *
I1 = cast<Instruction>(VL.front());
8294auto *I2 = cast<Instruction>(VL.back());
8295for (
intOp : seq<int>(S.getMainOp()->getNumOperands()))
8297 I2->getOperand(
Op));
8299 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
8301 })) >= S.getMainOp()->getNumOperands() / 2)
8303if (S.getMainOp()->getNumOperands() > 2)
8306// Check permuted operands. 8310 I2->getOperand((
Op + 1) % E));
8312 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
8321bool IsScatterVectorizeUserTE =
8322 UserTreeIdx.UserTE &&
8323 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8325bool AreScatterAllGEPSameBlock =
8326 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8330auto *
I = dyn_cast<GetElementPtrInst>(V);
8335return BB ==
I->getParent() &&
I->getNumOperands() == 2;
8338sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8340bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8343 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8346 NotProfitableForVectorization(VL)) {
8347LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
8348if (TryToFindDuplicates(S))
8349 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8350 ReuseShuffleIndices);
8354// Don't vectorize ephemeral values. 8355if (S && !EphValues.
empty()) {
8356for (
Value *V : VL) {
8357if (EphValues.
count(V)) {
8359 <<
") is ephemeral.\n");
8360 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8366// We now know that this is a vector of instructions of the same type from 8369// Check that none of the instructions in the bundle are already in the tree. 8370for (
Value *V : VL) {
8371if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8374if (getTreeEntry(V)) {
8376 <<
") is already in tree.\n");
8377if (TryToFindDuplicates(S))
8378 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8379 ReuseShuffleIndices);
8384// The reduction nodes (stored in UserIgnoreList) also should stay scalar. 8385if (UserIgnoreList && !UserIgnoreList->empty()) {
8386for (
Value *V : VL) {
8387if (UserIgnoreList->contains(V)) {
8389if (TryToFindDuplicates(S))
8390 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8391 ReuseShuffleIndices);
  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8400assert(VL.front()->getType()->isPointerTy() &&
8401count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8402"Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
8405assert(It != VL.end() &&
"Expected at least one GEP.");
  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  // Don't go into unreachable blocks. They may contain instructions with
  // dependency cycles which confuse the final scheduling.
  // Do not vectorize EH and non-returning blocks, not profitable in most
  // cases.
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
  // Check that every instruction appears once in this bundle.
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
  // Perform specific checks for each particular instruction kind.
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
8436 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8437 ReuseShuffleIndices);
8441auto &BSRef = BlocksSchedules[BB];
8443 BSRef = std::make_unique<BlockScheduling>(BB);
8445 BlockScheduling &BS = *BSRef;
8447 std::optional<ScheduleData *> Bundle =
8448 BS.tryScheduleBundle(UniqueValues,
this, S);
8449#ifdef EXPENSIVE_CHECKS 8450// Make sure we didn't break any internal invariants 8454LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
8455assert((!BS.getScheduleData(VL0) ||
8456 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8457"tryScheduleBundle should cancelScheduling on failure");
8458 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8459 ReuseShuffleIndices);
8460 NonScheduledFirst.insert(VL.front());
8461if (S.getOpcode() == Instruction::Load &&
8462 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8466LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
8468unsigned ShuffleOrOp =
8469 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
8470auto CreateOperandNodes = [&](TreeEntry *
TE,
constauto &
Operands) {
8471// Postpone PHI nodes creation 8473for (
unsignedI : seq<unsigned>(
Operands.size())) {
8478if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8483for (
unsignedI : PHIOps)
8486switch (ShuffleOrOp) {
8487case Instruction::PHI: {
8488auto *PH = cast<PHINode>(VL0);
8491 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8495// Keeps the reordered operands to avoid code duplication. 8496 PHIHandler Handler(*DT, PH, VL);
8497 Handler.buildOperands();
8498for (
unsignedI : seq<unsigned>(PH->getNumOperands()))
8499TE->setOperand(
I, Handler.getOperands(
I));
8501for (
unsignedI : seq<unsigned>(PH->getNumOperands()))
8506case Instruction::ExtractValue:
8507case Instruction::ExtractElement: {
8508if (CurrentOrder.empty()) {
8509LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
8512dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence " 8514for (
unsignedIdx : CurrentOrder)
8520// Insert new order with initial value 0, if it does not exist, 8521// otherwise return the iterator to the existing one. 8522 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8523 ReuseShuffleIndices, CurrentOrder);
8525"(ExtractValueInst/ExtractElementInst).\n";
8527// This is a special case, as it does not gather, but at the same time 8528// we are not extending buildTree_rec() towards the operands. 8529TE->setOperand(*
this);
8532case Instruction::InsertElement: {
      assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
      auto OrdCompare = [](const std::pair<int, int> &P1,
                           const std::pair<int, int> &P2) {
        return P1.first > P2.first;
      };
8540decltype(OrdCompare)>
8541 Indices(OrdCompare);
8542for (
intI = 0, E = VL.size();
I < E; ++
I) {
8544 Indices.emplace(
Idx,
I);
      OrdersType CurrentOrder(VL.size(), VL.size());
      bool IsIdentity = true;
      for (int I = 0, E = VL.size(); I < E; ++I) {
        CurrentOrder[Indices.top().second] = I;
        IsIdentity &= Indices.top().second == I;
        Indices.pop();
      }
      if (IsIdentity)
        CurrentOrder.clear();
8555 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8557LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (InsertElementInst).\n";
8560TE->setOperand(*
this);
8561 buildTree_rec(
TE->getOperand(1),
Depth + 1, {TE, 1});
8564case Instruction::Load: {
      // Check that a vectorized load would load the same memory as a scalar
      // load. For example, we don't want to vectorize loads that are smaller
      // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>}
      // LLVM treats loading/storing it as an i8 struct. If we vectorize
      // loads/stores from such a struct, we read/write packed bits disagreeing
      // with the unvectorized version.
      TreeEntry *TE = nullptr;
8574case TreeEntry::Vectorize:
8575TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8576 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8577if (CurrentOrder.empty())
8582 <<
"SLP: added a new TreeEntry (jumbled LoadInst).\n";
      case TreeEntry::StridedVectorize:
        // Vectorizing non-consecutive loads with `llvm.masked.gather`.
        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8589LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (strided LoadInst).\n";
      case TreeEntry::ScatterVectorize:
        // Vectorizing non-consecutive loads with `llvm.masked.gather`.
        TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices);
8598 <<
"SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8601case TreeEntry::CombinedVectorize:
8602case TreeEntry::NeedToGather:
8605TE->setOperand(*
this);
8606if (State == TreeEntry::ScatterVectorize)
8607 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
8622auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8623 std::make_pair(std::numeric_limits<unsigned>::min(),
8624 std::numeric_limits<unsigned>::max()));
8625if (ShuffleOrOp == Instruction::ZExt ||
8626 ShuffleOrOp == Instruction::SExt) {
8627 CastMaxMinBWSizes = std::make_pair(
8633 }
elseif (ShuffleOrOp == Instruction::Trunc) {
8634 CastMaxMinBWSizes = std::make_pair(
8641 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8642 ReuseShuffleIndices);
8646TE->setOperand(*
this);
8648 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8649if (ShuffleOrOp == Instruction::Trunc) {
8650 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8651 }
elseif (ShuffleOrOp == Instruction::SIToFP ||
8652 ShuffleOrOp == Instruction::UIToFP) {
8653unsigned NumSignBits =
8655if (
auto *OpI = dyn_cast<Instruction>(VL0->
getOperand(0))) {
8657 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
8659if (NumSignBits * 2 >=
8661 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8665case Instruction::ICmp:
8666case Instruction::FCmp: {
8667// Check that all of the compares have the same predicate. 8669 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8670 ReuseShuffleIndices);
8675 VLOperands Ops(VL, S, *
this);
8677// Commutative predicate - collect + sort operands of the instructions 8678// so that each side is more likely to have the same opcode. 8680"Commutative Predicate mismatch");
8683Right = Ops.getVL(1);
8685// Collect operands - commute if it uses the swapped predicate. 8686for (
Value *V : VL) {
8687if (isa<PoisonValue>(V)) {
8692auto *
Cmp = cast<CmpInst>(V);
8695if (
Cmp->getPredicate() != P0)
8698Right.push_back(RHS);
8705if (ShuffleOrOp == Instruction::ICmp) {
8706unsigned NumSignBits0 =
8708if (NumSignBits0 * 2 >=
8710 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8711unsigned NumSignBits1 =
8713if (NumSignBits1 * 2 >=
8715 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->
Idx);
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze: {
8740 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8741 ReuseShuffleIndices);
8743dbgs() <<
"SLP: added a new TreeEntry " 8744"(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8749 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8752case Instruction::GetElementPtr: {
8753 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8754 ReuseShuffleIndices);
8755LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (GetElementPtrInst).\n";
8758// Prepare the operand vector for pointer operands. 8759for (
Value *V : VL) {
8760auto *
GEP = dyn_cast<GetElementPtrInst>(V);
8765Operands.front().push_back(
GEP->getPointerOperand());
8768// Need to cast all indices to the same type before vectorization to 8770// Required to be able to find correct matches between different gather 8771// nodes and reuse the vectorized values rather than trying to gather them 8776 [VL0Ty, IndexIdx](
Value *V) {
8777auto *
GEP = dyn_cast<GetElementPtrInst>(V);
8780return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
8784 ->getPointerOperandType()
8786// Prepare the operand vector. 8787for (
Value *V : VL) {
8788auto *
I = dyn_cast<GetElementPtrInst>(V);
8791 ConstantInt::get(Ty, 0,
/*isSigned=*/false));
8794auto *
Op =
I->getOperand(IndexIdx);
8795auto *CI = dyn_cast<ConstantInt>(
Op);
8800 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8804for (
unsignedI = 0, Ops =
Operands.size();
I < Ops; ++
I)
8808case Instruction::Store: {
8809bool Consecutive = CurrentOrder.empty();
8812 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8813 ReuseShuffleIndices, CurrentOrder);
8819dbgs() <<
"SLP: added a new TreeEntry (jumbled StoreInst).\n";
8821TE->setOperand(*
this);
8822 buildTree_rec(
TE->getOperand(0),
Depth + 1, {TE, 0});
8825case Instruction::Call: {
8826// Check if the calls are all to the same vectorizable intrinsic or 8831 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8832 ReuseShuffleIndices);
8836for (
unsignedI : seq<unsigned>(CI->
arg_size())) {
8837// For scalar operands no need to create an entry since no need to 8841 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8845case Instruction::ShuffleVector: {
8846 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8847 ReuseShuffleIndices);
8848if (S.isAltShuffle()) {
8849LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (isAltShuffle).\n";
8854dbgs() <<
"SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8858// Reorder operands if reordering would enable vectorization. 8859auto *CI = dyn_cast<CmpInst>(VL0);
8861return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8863auto *MainCI = cast<CmpInst>(S.getMainOp());
8864auto *AltCI = cast<CmpInst>(S.getAltOp());
8868"Expected different main/alternate predicates.");
8870// Collect operands - commute if it uses the swapped predicate or 8871// alternate operation. 8872for (
Value *V : VL) {
8873if (isa<PoisonValue>(V)) {
8878auto *
Cmp = cast<CmpInst>(V);
8890Right.push_back(RHS);
8899TE->setOperand(*
this, isa<BinaryOperator>(VL0) || CI);
8901 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }
8937if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8945bool ResizeAllowed)
const{
8946constauto *It =
find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8947assert(It != VL.
end() &&
"Expected at least one extract instruction.");
8948auto *E0 = cast<Instruction>(*It);
8950all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8952// Check if all of the extracts come from the same vector and from the 8954Value *Vec = E0->getOperand(0);
8956 CurrentOrder.
clear();
8958// We have to extract from a vector/aggregate with the same number of elements. 8960if (E0->getOpcode() == Instruction::ExtractValue) {
8964// Check if load can be rewritten as load of vector. 8965LoadInst *LI = dyn_cast<LoadInst>(Vec);
8969 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
8972unsigned E = VL.
size();
8973if (!ResizeAllowed && NElts != E)
8976unsigned MinIdx = NElts, MaxIdx = 0;
8978auto *Inst = dyn_cast<Instruction>(V);
8981if (Inst->getOperand(0) != Vec)
8983if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
8984if (isa<UndefValue>(EE->getIndexOperand()))
8989constunsigned ExtIdx = *
Idx;
8992 Indices[
I] = ExtIdx;
8998if (MaxIdx - MinIdx + 1 > E)
  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E + 1 so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E + 1.
  CurrentOrder.assign(E, E);
9011for (
unsignedI = 0;
I < E; ++
I) {
9014constunsigned ExtIdx = Indices[
I] - MinIdx;
9015if (CurrentOrder[ExtIdx] != E) {
9016 CurrentOrder.
clear();
9019 ShouldKeepOrder &= ExtIdx ==
I;
9020 CurrentOrder[ExtIdx] =
I;
9023 CurrentOrder.
clear();
9025return ShouldKeepOrder;
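// Worked example (illustrative, not from the original source): extracts of
// indices {1, 0, 3, 2} from a single 4-element vector are accepted with
// CurrentOrder == {1, 0, 3, 2} and ShouldKeepOrder == false (a shuffle is
// still required), while indices {0, 1, 2, 3} yield ShouldKeepOrder == true,
// i.e. the extracts already appear in vector order.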
bool BoUpSLP::areAllUsersVectorized(
  return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return ScalarToTreeEntry.contains(U) ||
                  isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
static std::pair<InstructionCost, InstructionCost>
  // Calculate the cost of the scalar and vector calls.
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
                          false /*HasGlobalPred*/);
  auto LibCost = IntrinsicCost;
  // Calculate the cost of the vector library call.
  // If the corresponding vector call is cheaper, return its cost.
  return {IntrinsicCost, LibCost};
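  // For illustration (hypothetical example, exact costs are target dependent):
  // for a candidate group of four scalar calls to llvm.sqrt.f32, the cost of
  // the vector intrinsic llvm.sqrt.v4f32 is compared against a vector-library
  // implementation of the same call (when the target library info provides
  // one), and the cheaper alternative is reported to the caller.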
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
  unsigned Sz = Scalars.size();
  if (!ReorderIndices.empty())
  for (unsigned I = 0; I < Sz; ++I) {
    if (!ReorderIndices.empty())
    if (isa<PoisonValue>(Scalars[Idx]))
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
  if (!ReuseShuffleIndices.empty()) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
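  // Illustrative example: for an alternate add/sub node over scalars
  // [add, sub, add, sub], the resulting mask is roughly <0, Sz+1, 2, Sz+3>
  // (i.e. <0, 5, 2, 7> for Sz == 4), picking the lanes of the "main" opcode
  // vector and the "alternate" opcode vector for the final shufflevector.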
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
    return MainP != P && MainP != SwappedP;
  const auto *Op0 = Ops.front();
  // TODO: We should allow undef elements here
  // TODO: We should allow undef elements here
  // TODO: We should allow undef elements here
  if (auto *CI = dyn_cast<ConstantInt>(V))
    return CI->getValue().isPowerOf2();
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
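  // For illustration (hypothetical values): an operand list such as
  // <8, 8, 8, 8> is constant, uniform and a power-of-two splat, while
  // <-4, -4, -4, -4> satisfies the negated-power-of-two check; such operand
  // kinds may allow TTI to report cheaper vector instruction costs.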
  if (IsConstant && IsUniform)
/// The base class for shuffle instruction emission and shuffle cost estimation.
class BaseShuffleAnalysis {
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements.
  /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
  /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
  /// of 8.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  /// Checks if the mask is an identity mask.
  /// \param IsStrict if is true the function returns false if mask size does
  /// not match vector size.
    int Limit = Mask.size();
    // Consider extract subvector starting from index 0.
    // All VF-size submasks are identity (e.g.
    // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
    if (Limit % VF == 0 &&
        all_of(seq<int>(0, Limit / VF), [=](int Idx) {

  /// Tries to combine 2 different masks into single one.
  /// \param LocalVF Vector length of the permuted input vector. \p Mask may
  /// change the size of the vector, \p LocalVF is the original size of the
  /// shuffled vector.
    unsigned VF = Mask.size();
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      int MaskedIdx = Mask[ExtMask[I] % VF];
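      // For illustration, with LocalVF == 4: combining an inner mask
      // Mask == <1, 0, 3, 2> with an outer ExtMask == <3, 2, 1, 0> yields
      // <Mask[3], Mask[2], Mask[1], Mask[0]> == <2, 3, 0, 1>, so a single
      // shuffle with the combined mask suffices (illustrative values).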
  /// Looks through shuffles trying to reduce final number of shuffles in the
  /// code. The function looks through the previously emitted shuffle
  /// instructions and properly marks indices in the mask as undef.
  /// For example, given the code
  ///
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
  ///
  /// and if we need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
  /// will look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// If 2 operands are of different size, the smallest one will be resized and
  /// the mask recalculated properly.
  /// For example, given the code
  ///
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
  ///
  /// and if we need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
  /// will look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// So, it tries to transform permutations to simple vector merge, if
  /// possible.
  /// \param V The input vector which must be shuffled using the given \p Mask.
  /// If the better candidate is found, \p V is set to this best candidate
  /// vector.
  /// \param Mask The input mask for the shuffle. If the best candidate is found
  /// during looking-through-shuffles attempt, it is updated accordingly.
  /// \param SinglePermute true if the shuffle operation is originally a
  /// single-value-permutation. In this case the look-through-shuffles procedure
  /// may look for resizing shuffles as the best candidates.
  /// \return true if the shuffle results in the non-resizing identity shuffle
  /// (and thus can be ignored), false - otherwise.
                                  bool SinglePermute) {
9278while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
9279// Exit if not a fixed vector type or changing size shuffle. 9280auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9283// Remember the identity or broadcast mask, if it is not a resizing 9284// shuffle. If no better candidates are found, this Op and Mask will be 9285// used in the final shuffle. 9286if (isIdentityMask(Mask, SVTy,
/*IsStrict=*/false)) {
9287if (!IdentityOp || !SinglePermute ||
9288 (isIdentityMask(Mask, SVTy,
/*IsStrict=*/true) &&
9290 IdentityMask.
size()))) {
9292// Store current mask in the IdentityMask so later we did not lost 9293// this info if IdentityOp is selected as the best candidate for the 9295 IdentityMask.
assign(Mask);
      // Remember the broadcast mask. If no better candidates are found, this Op
      // and Mask will be used in the final shuffle.
      // Zero splat can be used as identity too, since it might be used with
      // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
      // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
      // is expensive, and the analysis finds out that the source vector is just
      // a broadcast, this original mask can be transformed to identity mask
      // <0, 1, 2, 3>.
      // E.g.
      // %0 = shuffle %v, poison, zeroinitializer
      // %res = shuffle %0, poison, <3, 1, 2, 0>
      // may be transformed to
      // %0 = shuffle %v, poison, zeroinitializer
      // %res = shuffle %0, poison, <0, 1, 2, 3>
      if (SV->isZeroEltSplat()) {
9317 IdentityMask.
assign(Mask);
9319int LocalVF =
Mask.size();
9321 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9322 LocalVF = SVOpTy->getNumElements();
9326static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
9328 ExtMask[
Idx] = SV->getMaskValue(
I);
9338if (!IsOp1Undef && !IsOp2Undef) {
9339// Update mask and mark undef elems. 9340for (
int &
I : Mask) {
9343if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
9350 combineMasks(LocalVF, ShuffleMask, Mask);
9351Mask.swap(ShuffleMask);
9353Op = SV->getOperand(0);
9355Op = SV->getOperand(1);
9357if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
9358 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9363"Expected masks of same sizes.");
9364// Clear known poison elements. 9368Mask.swap(IdentityMask);
9369auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9370return SinglePermute &&
9371 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
9372/*IsStrict=*/true) ||
9373 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
9374 Shuffle->isZeroEltSplat() &&
9384 /// Smart shuffle instruction emission, walks through shuffles trees and 9385 /// tries to find the best matching vector for the actual shuffle 9387template <
typename T,
typename ShuffleBuilderTy>
9389 ShuffleBuilderTy &Builder,
Type *ScalarTy) {
9390assert(V1 &&
"Expected at least one vector value.");
9393if (ScalarTyNumElements != 1) {
9399 Builder.resizeToMatch(V1, V2);
9401if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
9402 VF = FTy->getNumElements();
9403if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9406// Peek through shuffles. 9410 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
9413for (
intI = 0,
E =
Mask.size();
I <
E; ++
I) {
9415 CombinedMask1[
I] =
Mask[
I];
9417 CombinedMask2[
I] =
Mask[
I] - VF;
9424 (void)peekThroughShuffles(Op1, CombinedMask1,
/*SinglePermute=*/false);
9425 (void)peekThroughShuffles(Op2, CombinedMask2,
/*SinglePermute=*/false);
9426// Check if we have 2 resizing shuffles - need to peek through operands 9428if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9429if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9434 ExtMask1[
Idx] = SV1->getMaskValue(
I);
9437 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9439 ExtMask1, UseMask::SecondArg);
9444 ExtMask2[
Idx] = SV2->getMaskValue(
I);
9447 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9449 ExtMask2, UseMask::SecondArg);
9450if (SV1->getOperand(0)->getType() ==
9451 SV2->getOperand(0)->getType() &&
9452 SV1->getOperand(0)->getType() != SV1->getType() &&
9455 Op1 = SV1->getOperand(0);
9456 Op2 = SV2->getOperand(0);
9458int LocalVF = ShuffleMask1.size();
9459if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
9460 LocalVF = FTy->getNumElements();
9461 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9462 CombinedMask1.swap(ShuffleMask1);
9464 LocalVF = ShuffleMask2.size();
9465if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
9466 LocalVF = FTy->getNumElements();
9467 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9468 CombinedMask2.swap(ShuffleMask2);
9471 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
9472 Builder.resizeToMatch(Op1, Op2);
9473 VF = std::max(cast<VectorType>(Op1->
getType())
9475 .getKnownMinValue(),
9476 cast<VectorType>(Op2->
getType())
9478 .getKnownMinValue());
9479for (
intI = 0,
E =
Mask.size();
I <
E; ++
I) {
9482"Expected undefined mask element");
9483 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
9489 isa<ShuffleVectorInst>(Op1) &&
9490 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9492return Builder.createIdentity(Op1);
9493return Builder.createShuffleVector(
9497if (isa<PoisonValue>(V1))
9498return Builder.createPoison(
9499 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
9500bool IsIdentity = peekThroughShuffles(V1, NewMask,
/*SinglePermute=*/true);
9501assert(V1 &&
"Expected non-null value after looking through shuffles.");
9504return Builder.createShuffleVector(V1, NewMask);
9505return Builder.createIdentity(V1);
9508 /// Transforms mask \p CommonMask per given \p Mask to make proper set after 9509 /// shuffle emission. 9512for (
unsignedI : seq<unsigned>(CommonMask.
size()))
/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
static std::pair<InstructionCost, InstructionCost>
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as plain wide unit-stride load/store since all the
  // loads/stores are known to be from/to adjacent locations.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: estimate costs for pointer related costs when vectorizing to
    // a wide load/store.
    // Scalar cost is estimated as a set of pointers with known relationship
    // For vector code we will use BasePtr as argument for the wide load/store
    // but we also need to account all the instructions which are going to
    // stay in vectorized code due to uses outside of these scalar
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
    for (Value *V : Ptrs) {
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity assume Ptr to stay in vectorized code if it's not a
      // GEP instruction. We don't care since it's cost considered free.
      // TODO: We should check for any uses outside of vectorizable tree
      // rather than just single use.
      if (!Ptr || !Ptr->hasOneUse())
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have
      // any savings on that.
          TTI::PointersChainInfo::getKnownStride(),
    // Case 1: Ptrs are the arguments of loads that we are going to transform
    // into masked gather load intrinsic.
    // All the scalar GEPs will be removed as a result of vectorization.
    // For any external uses of some lanes extract element instructions will
    // be generated (which cost is estimated separately).
    auto *Ptr = dyn_cast<GetElementPtrInst>(V);
    return Ptr && !Ptr->hasAllConstantIndices();
        ? TTI::PointersChainInfo::getUnknownStride()
        : TTI::PointersChainInfo::getKnownStride();
  auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
  auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
  if (It != Ptrs.end())
    BaseGEP = cast<GEPOperator>(*It);
      BaseGEP->getPointerOperand(), Indices, VecTy,
  return std::make_pair(ScalarCost, VecCost);
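// For illustration (hypothetical IR): four GEPs feeding a consecutive
// unit-stride load are costed as a unit-stride pointer chain against the
// single BasePtr kept for the wide load (case 2 above), while GEPs feeding a
// masked gather are costed as a known/unknown-stride pointer chain on both the
// scalar and the vector sides (case 1 above).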
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  // Do not reorder nodes if it is small (just 2 elements), all-constant or all
  // instructions have same opcode already.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
9613if (
any_of(seq<unsigned>(
TE.Idx), [&](
unsignedIdx) {
9614 return VectorizableTree[Idx]->isSame(TE.Scalars);
9618auto GenerateLoadsSubkey = [&](
size_tKey,
LoadInst *LI) {
9623auto LIt = LoadsMap.
find(std::make_pair(Key,
Ptr));
9624if (LIt != LoadsMap.
end()) {
9628/*StrictCheck=*/true))
9638if (LIt->second.size() > 2) {
9640hash_value(LIt->second.back()->getPointerOperand());
9646 LoadsMap.
try_emplace(std::make_pair(Key,
Ptr)).first->second.push_back(LI);
9651bool IsOrdered =
true;
9652unsigned NumInstructions = 0;
9653// Try to "cluster" scalar instructions, to be able to build extra vectorized 9657if (
auto *Inst = dyn_cast<Instruction>(V);
9658 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9661/*AllowAlternate=*/false);
9664auto &Container = SortedValues[
Key];
9665if (IsOrdered && !KeyToIndex.
contains(V) &&
9666 !(isa<Constant, ExtractElementInst>(V) ||
9668 ((Container.contains(
Idx) &&
9669 KeyToIndex.
at(Container[
Idx].back()).back() !=
I - 1) ||
9670 (!Container.empty() && !Container.contains(
Idx) &&
9671 KeyToIndex.
at(Container.back().second.back()).back() !=
I - 1)))
9673auto &KTI = KeyToIndex[
V];
9675 Container[
Idx].push_back(V);
9680if (!IsOrdered && NumInstructions > 1) {
9682TE.ReorderIndices.resize(
TE.Scalars.size(),
TE.Scalars.size());
9683for (
constauto &
D : SortedValues) {
9684for (
constauto &
P :
D.second) {
9686for (
Value *V :
P.second) {
9689TE.ReorderIndices[Cnt +
K] =
Idx;
9690TE.Scalars[Cnt +
K] =
V;
9692 Sz += Indices.
size();
9693 Cnt += Indices.
size();
9695if (Sz > 1 && isa<Instruction>(
P.second.front())) {
9697 *
TTI,
TE.Scalars.front()->getType(), Sz);
9699for (
unsignedI : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9701 }
elseif (!
P.second.empty() &&
isConstant(
P.second.front())) {
9702for (
unsignedI : seq<unsigned>(Cnt - Sz, Cnt))
9708// Reuses always require shuffles, so consider it as profitable. 9709if (!
TE.ReuseShuffleIndices.empty() ||
TE.ReorderIndices.empty())
9711// Do simple cost estimation. 9714auto *ScalarTy =
TE.Scalars.front()->getType();
9716for (
auto [
Idx, Sz] : SubVectors) {
9720if (
auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9722// If ScalarTy is FixedVectorType, we should use CreateInsertVector instead 9723// of CreateInsertElement. 9725for (
unsignedI : seq<unsigned>(
TE.Scalars.size()))
9734int Sz =
TE.Scalars.size();
9736TE.ReorderIndices.end());
9737for (
unsignedI : seq<unsigned>(Sz)) {
9739if (isa<PoisonValue>(V)) {
9742 ReorderMask[
I] =
I +
TE.ReorderIndices.size();
9746any_of(ReorderMask, [&](
intI) {
returnI >= Sz; })
9749 VecTy, ReorderMask);
9752for (
unsignedI : seq<unsigned>(Sz)) {
9756if (!isa<PoisonValue>(V))
9759 ReorderMask[
I] =
I + Sz;
9763 VecTy, DemandedElts,
/*Insert=*/true,
/*Extract=*/false,
CostKind);
9766if (
Cost >= BVCost) {
9769TE.ReorderIndices.clear();
9775 BaseGraphSize = VectorizableTree.size();
9776// Turn graph transforming mode on and off, when done. 9777classGraphTransformModeRAAI {
9778bool &SavedIsGraphTransformMode;
9781 GraphTransformModeRAAI(
bool &IsGraphTransformMode)
9782 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9783 IsGraphTransformMode =
true;
9785 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode =
false; }
9786 } TransformContext(IsGraphTransformMode);
  // Operands are profitable if they are:
  // 1. At least one constant
  // ...
  // 3. Results in good vectorization opportunity, i.e. may generate vector
  // nodes and reduce cost of the graph.
                                          const InstructionsState &S) {
9797for (
unsignedOp : seq<unsigned>(S.getMainOp()->getNumOperands()))
9799 I2->getOperand(
Op));
9801 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
9803 [](
const std::pair<Value *, Value *> &
P) {
9804return isa<Constant>(
P.first) ||
9805 isa<Constant>(
P.second) ||
P.first ==
P.second;
9811// Try to reorder gather nodes for better vectorization opportunities. 9812for (
unsignedIdx : seq<unsigned>(BaseGraphSize)) {
9813 TreeEntry &E = *VectorizableTree[
Idx];
9815 reorderGatherNode(E);
9818// The tree may grow here, so iterate over nodes, built before. 9819for (
unsignedIdx : seq<unsigned>(BaseGraphSize)) {
9820 TreeEntry &E = *VectorizableTree[
Idx];
9825// Do not try partial vectorization for small nodes (<= 2), nodes with the 9826// same opcode and same parent block or all constants. 9827if (VL.
size() <= 2 || LoadEntriesToVectorize.contains(
Idx) ||
9828 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
9832// Try to find vectorizable sequences and transform them into a series of 9833// insertvector instructions. 9834unsigned StartIdx = 0;
9839 *
TTI, VL.
front()->getType(), VF - 1)) {
9840if (StartIdx + VF >
End)
9843for (
unsigned Cnt = StartIdx; Cnt + VF <=
End; Cnt += VF) {
9845// If any instruction is vectorized already - do not try again. 9846// Reuse the existing node, if it fully matches the slice. 9847if (
const TreeEntry *SE = getTreeEntry(Slice.
front());
9848 SE || getTreeEntry(Slice.
back())) {
9851if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9854// Constant already handled effectively - skip. 9857// Do not try to vectorize small splats (less than vector register and 9858// only with the single non-undef element). 9860bool IsTwoRegisterSplat =
true;
9861if (IsSplat && VF == 2) {
9864 IsTwoRegisterSplat = NumRegs2VF == 2;
9866if (Slices.
empty() || !IsSplat || !IsTwoRegisterSplat ||
9868static_cast<long>(isa<UndefValue>(Slice.
front()) ? VF - 1
9874 (S.getOpcode() == Instruction::Load &&
9876 (S.getOpcode() != Instruction::Load &&
9880// Try to vectorize reduced values or if all users are vectorized. 9881// For expensive instructions extra extracts might be profitable. 9882if ((!UserIgnoreList || E.Idx != 0) &&
9886if (isa<PoisonValue>(V))
9888return areAllUsersVectorized(cast<Instruction>(V),
9892if (S.getOpcode() == Instruction::Load) {
9897// Do not vectorize gathers. 9902// If reductions and the scalars from the root node are 9903// analyzed - mark as non-vectorizable reduction. 9904if (UserIgnoreList && E.Idx == 0)
9909 }
elseif (S.getOpcode() == Instruction::ExtractElement ||
9912 !CheckOperandsProfitability(
9915 IsaPred<Instruction>)),
9917// Do not vectorize extractelements (handled effectively 9918// alread). Do not vectorize non-profitable instructions (with 9919// low cost and non-vectorizable operands.) 9926auto AddCombinedNode = [&](
unsignedIdx,
unsigned Cnt,
unsigned Sz) {
9927 E.CombinedEntriesWithIndices.emplace_back(
Idx, Cnt);
9929 StartIdx = Cnt + Sz;
9933for (
auto [Cnt, Sz] : Slices) {
9935// If any instruction is vectorized already - do not try again. 9936if (TreeEntry *SE = getTreeEntry(Slice.
front());
9937 SE || getTreeEntry(Slice.
back())) {
9940if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9942 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9943 AddCombinedNode(SE->Idx, Cnt, Sz);
9946unsigned PrevSize = VectorizableTree.size();
9947 [[maybe_unused]]
unsigned PrevEntriesSize =
9948 LoadEntriesToVectorize.size();
9949 buildTree_rec(Slice, 0,
EdgeInfo(&E, UINT_MAX));
9950if (PrevSize + 1 == VectorizableTree.size() &&
9951 VectorizableTree[PrevSize]->isGather() &&
9952 VectorizableTree[PrevSize]->hasState() &&
9953 VectorizableTree[PrevSize]->getOpcode() !=
9954 Instruction::ExtractElement &&
9956if (UserIgnoreList && E.Idx == 0 && VF == 2)
9958 VectorizableTree.pop_back();
9959assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9960"LoadEntriesToVectorize expected to remain the same");
9963 AddCombinedNode(PrevSize, Cnt, Sz);
9966// Restore ordering, if no extra vectorization happened. 9967if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9970 E.ReorderIndices.clear();
9975switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
      // operands.
      if (E.State != TreeEntry::Vectorize)
      Type *ScalarTy = E.getMainOp()->getType();
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as strided
      // load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
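        // For illustration (hypothetical IR): loads of a[3], a[2], a[1], a[0]
        // would otherwise become a consecutive <4 x ...> load plus a reverse
        // shuffle; when the target reports a cheaper strided load, the node is
        // turned into a stride -1 strided load instead (using the pointer of
        // E.Scalars.back() as the base, as in the code above).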
10006case Instruction::Store: {
10008 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
10010Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
10011// Check if profitable to represent consecutive load + reverse as strided 10012// load with stride -1. 10013if (!E.ReorderIndices.empty() &&
isReverseOrder(E.ReorderIndices) &&
10017auto *BaseSI = cast<StoreInst>(E.Scalars.back());
10024 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
10025/*VariableMask=*/false, CommonAlignment,
CostKind, BaseSI);
10026if (StridedCost < OriginalVecCost)
10027// Strided store is more profitable than reverse + consecutive store - 10028// transform the node to strided store. 10029 E.State = TreeEntry::StridedVectorize;
10030 }
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto *BaseSI = cast<StoreInst>(E.Scalars.front());
        assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
        if (Mask.size() < 4)
        for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
              VecTy, Factor, BaseSI->getAlign(),
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
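        // For illustration: a reorder mask such as <0, 4, 1, 5, 2, 6, 3, 7>
        // interleaves two groups of four scalars, so IsInterleaveMask would
        // report Factor == 2 and the store node is marked as interleaved
        // (assuming the target supports interleaved stores of that factor).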
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
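        // For illustration (hypothetical IR): a pattern like
        //   %c = icmp slt i32 %a, %b
        //   %s = select i1 %c, i32 %a, i32 %b
        // is an smin; the select node becomes the combined MinMax node and,
        // when the compares are only used by these selects, the condition node
        // is folded into it as CombinedVectorize.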
10076if (LoadEntriesToVectorize.empty()) {
10077// Single load node - exit. 10078if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
10079 VectorizableTree.front()->getOpcode() == Instruction::Load)
10081// Small graph with small VF - exit. 10082constexprunsigned SmallTree = 3;
10083constexprunsigned SmallVF = 2;
10084if ((VectorizableTree.size() <= SmallTree &&
10085 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10086 (VectorizableTree.size() <= 2 && UserIgnoreList))
10089if (VectorizableTree.front()->isNonPowOf2Vec() &&
10093 [](
const std::unique_ptr<TreeEntry> &TE) {
10094return TE->isGather() && TE->hasState() &&
10095 TE->getOpcode() == Instruction::Load &&
  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                return isa<LoadInst>(V) && !isVectorized(V) &&
                       !isDeleted(cast<Instruction>(V));
    for (Value *V : E.Scalars) {
      auto *LI = dyn_cast<LoadInst>(V);
          *this, V, *DL, *SE, *TTI,
          GatheredLoads[std::make_tuple(
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
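  // For illustration: loads that ended up in gather nodes (or remained scalar)
  // are bucketed here and re-examined as a group, e.g. four such loads from
  // adjacent addresses that came from different gather nodes may still be
  // combined into one vector load by tryToVectorizeGatheredLoads (assuming the
  // usual legality and cost checks pass).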
/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
/// when the actual shuffle instruction is generated only if this is actually
/// required. Otherwise, the shuffle instruction emission is delayed till the
/// end of the process, to reduce the number of emitted instructions and further
/// analysis/transformations.
  bool IsFinalized = false;
  /// While set, still trying to estimate the cost for the same nodes and we
  /// can delay actual cost estimation (virtual shuffle instruction emission).
  /// May help better estimate the cost if same nodes must be permuted + allows
  /// to move most of the long shuffles cost estimation to TTI.
  bool SameNodesEstimated = true;
10167if (
auto *VTy = dyn_cast<VectorType>(Ty))
10181// Found the broadcasting of the single scalar, calculate the cost as 10183constauto *It =
find_if_not(VL, IsaPred<UndefValue>);
10184assert(It != VL.
end() &&
"Expected at least one non-undef value.");
10185// Add broadcast for non-identity shuffle only. 10187count(VL, *It) > 1 &&
10190if (isa<FixedVectorType>(ScalarTy)) {
10195 cast<FixedVectorType>(ScalarTy));
10198 CostKind, std::distance(VL.
begin(), It),
10204 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10211 VecTy, ShuffleMask, CostKind,
10212/*Index=*/0,
/*SubTp=*/nullptr,
10216 (
all_of(Gathers, IsaPred<UndefValue>)
10218 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
10222 /// Compute the cost of creating a vector containing the extracted values from 10226ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10227unsigned NumParts) {
10228assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
10230 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
10231 auto *EE = dyn_cast<ExtractElementInst>(V);
10234 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10237 return std::max(Sz, VecTy->getNumElements());
10239// FIXME: this must be moved to TTI for better estimation. 10243 -> std::optional<TTI::ShuffleKind> {
10244if (NumElts <= EltsPerVector)
10245return std::nullopt;
10247alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10249 if (I == PoisonMaskElem)
10251 return std::min(S, I);
10254int OffsetReg1 = OffsetReg0;
10256// Check that if trying to permute same single/2 input vectors. 10258int FirstRegId = -1;
10259 Indices.assign(1, OffsetReg0);
10263intIdx =
I - OffsetReg0;
10265 (
Idx / NumElts) * NumParts + (
Idx % NumElts) / EltsPerVector;
10267 FirstRegId = RegId;
10268 RegIndices.
insert(RegId);
10269if (RegIndices.
size() > 2)
10270return std::nullopt;
10271if (RegIndices.
size() == 2) {
10273if (Indices.
size() == 1) {
10276 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10278 if (I == PoisonMaskElem)
10280 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10281 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10282 if (RegId == FirstRegId)
10284 return std::min(S, I);
10287 Indices.push_back(OffsetReg1 % NumElts);
10289Idx =
I - OffsetReg1;
10291I = (
Idx % NumElts) % EltsPerVector +
10292 (RegId == FirstRegId ? 0 : EltsPerVector);
10298// Process extracts in blocks of EltsPerVector to check if the source vector 10299// operand can be re-used directly. If not, add the cost of creating a 10300// shuffle to extract the values into a vector register. 10301for (
unsigned Part : seq<unsigned>(NumParts)) {
10302if (!ShuffleKinds[Part])
10305 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
10309 std::optional<TTI::ShuffleKind> RegShuffleKind =
10310 CheckPerRegistersShuffle(SubMask, Indices);
10311if (!RegShuffleKind) {
10314 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
10327 *R.TTI, VL.
front()->getType(),
alignTo(NumElts, EltsPerVector));
10328for (
unsignedIdx : Indices) {
10329assert((
Idx + EltsPerVector) <= BaseVF &&
10330"SK_ExtractSubvector index out of range");
10335// Second attempt to check, if just a permute is better estimated than 10336// subvector extract. 10341if (OriginalCost <
Cost)
10342Cost = OriginalCost;
10346 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given 10347 /// mask \p Mask, register number \p Part, that includes \p SliceSize 10349void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
10351unsigned SliceSize) {
10352if (SameNodesEstimated) {
10353// Delay the cost estimation if the same nodes are reshuffling. 10354// If we already requested the cost of reshuffling of E1 and E2 before, no 10355// need to estimate another cost with the sub-Mask, instead include this 10356// sub-Mask into the CommonMask to estimate it later and avoid double cost 10358if ((InVectors.
size() == 2 &&
10359 cast<const TreeEntry *>(InVectors.
front()) == &E1 &&
10360 cast<const TreeEntry *>(InVectors.
back()) == E2) ||
10361 (!E2 && cast<const TreeEntry *>(InVectors.
front()) == &E1)) {
10362unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
10365"Expected all poisoned elements.");
10367copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
10370// Found non-matching nodes - need to estimate the cost for the matched 10371// and transform mask. 10372Cost += createShuffle(InVectors.
front(),
10373 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
10375 transformMaskAfterShuffle(CommonMask, CommonMask);
10376 }
elseif (InVectors.
size() == 2) {
10377Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10378 transformMaskAfterShuffle(CommonMask, CommonMask);
10380 SameNodesEstimated =
false;
10381if (!E2 && InVectors.
size() == 1) {
10382unsigned VF = E1.getVectorFactor();
10385 cast<FixedVectorType>(V1->
getType())->getNumElements());
10387constauto *E = cast<const TreeEntry *>(InVectors.
front());
10388 VF = std::max(VF, E->getVectorFactor());
10390for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10392 CommonMask[
Idx] = Mask[
Idx] + VF;
10393Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
10394 transformMaskAfterShuffle(CommonMask, CommonMask);
10396autoP = InVectors.
front();
10397Cost += createShuffle(&E1, E2, Mask);
10398unsigned VF = Mask.size();
10403constauto *E = cast<const TreeEntry *>(
P);
10404 VF = std::max(VF, E->getVectorFactor());
10406for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10408 CommonMask[
Idx] =
Idx + (InVectors.
empty() ? 0 : VF);
10409Cost += createShuffle(
P, InVectors.
front(), CommonMask);
10410 transformMaskAfterShuffle(CommonMask, CommonMask);
10414classShuffleCostBuilder {
10417staticbool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
10419return Mask.empty() ||
10420 (VF == Mask.size() &&
10428 ~ShuffleCostBuilder() =
default;
10431// Empty mask or identity mask are free. 10433 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10434if (isEmptyOrIdentity(Mask, VF))
10437 cast<VectorType>(V1->
getType()), Mask);
10440// Empty mask or identity mask are free. 10442 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10443if (isEmptyOrIdentity(Mask, VF))
10446 cast<VectorType>(V1->
getType()), Mask);
10452void resizeToMatch(
Value *&,
Value *&)
const{}
10455 /// Smart shuffle instruction emission, walks through shuffles trees and 10456 /// tries to find the best matching vector for the actual shuffle 10462 ShuffleCostBuilder Builder(
TTI);
10465unsigned CommonVF = Mask.size();
10467auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
10471Type *EScalarTy = E.Scalars.front()->getType();
10472bool IsSigned =
true;
10473if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10475 IsSigned = It->second.second;
10477if (EScalarTy != ScalarTy) {
10478unsigned CastOpcode = Instruction::Trunc;
10479unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10480unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10482 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10490if (isa<Constant>(V))
10492auto *VecTy = cast<VectorType>(V->getType());
10494if (EScalarTy != ScalarTy) {
10496unsigned CastOpcode = Instruction::Trunc;
10497unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10498unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10500 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10507if (!V1 && !V2 && !P2.
isNull()) {
10508// Shuffle 2 entry nodes. 10509const TreeEntry *E = cast<const TreeEntry *>(P1);
10510unsigned VF = E->getVectorFactor();
10511const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10512 CommonVF = std::max(VF, E2->getVectorFactor());
10515return Idx < 2 * static_cast<int>(CommonVF);
10517"All elements in mask must be less than 2 * CommonVF.");
10518if (E->Scalars.size() == E2->Scalars.size()) {
10522for (
int &
Idx : CommonMask) {
10525if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
10527elseif (
Idx >=
static_cast<int>(CommonVF))
10528Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
10532 CommonVF = E->Scalars.size();
10533 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10534 GetNodeMinBWAffectedCost(*E2, CommonVF);
10536 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10537 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10541 }
elseif (!V1 && P2.
isNull()) {
10542// Shuffle single entry node. 10543const TreeEntry *E = cast<const TreeEntry *>(P1);
10544unsigned VF = E->getVectorFactor();
10548 [=](
intIdx) {
return Idx < static_cast<int>(CommonVF); }) &&
10549"All elements in mask must be less than CommonVF.");
10550if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10552assert(!EMask.
empty() &&
"Expected non-empty common mask.");
10553for (
int &
Idx : CommonMask) {
10557 CommonVF = E->Scalars.size();
10558 }
elseif (
unsigned Factor = E->getInterleaveFactor();
10559 Factor > 0 && E->Scalars.size() != Mask.size() &&
10562// Deinterleaved nodes are free. 10563 std::iota(CommonMask.
begin(), CommonMask.
end(), 0);
10565 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10567// Not identity/broadcast? Try to see if the original vector is better. 10568if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10569 CommonVF == CommonMask.
size() &&
10571 [](
constauto &&
P) {
10573static_cast<unsigned>(
P.value()) !=
P.index();
10581 }
elseif (V1 && P2.
isNull()) {
10582// Shuffle single vector. 10583 ExtraCost += GetValueMinBWAffectedCost(V1);
10584 CommonVF = getVF(V1);
10587 [=](
intIdx) {
return Idx < static_cast<int>(CommonVF); }) &&
10588"All elements in mask must be less than CommonVF.");
10589 }
elseif (V1 && !V2) {
10590// Shuffle vector and tree node. 10591unsigned VF = getVF(V1);
10592const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10593 CommonVF = std::max(VF, E2->getVectorFactor());
10596return Idx < 2 * static_cast<int>(CommonVF);
10598"All elements in mask must be less than 2 * CommonVF.");
10599if (E2->Scalars.size() == VF && VF != CommonVF) {
10601assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
10602for (
int &
Idx : CommonMask) {
10605if (
Idx >=
static_cast<int>(CommonVF))
10606Idx = E2Mask[
Idx - CommonVF] + VF;
10610 ExtraCost += GetValueMinBWAffectedCost(V1);
10612 ExtraCost += GetNodeMinBWAffectedCost(
10613 *E2, std::min(CommonVF, E2->getVectorFactor()));
10615 }
elseif (!V1 && V2) {
10616// Shuffle vector and tree node. 10617unsigned VF = getVF(V2);
10618const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10619 CommonVF = std::max(VF, E1->getVectorFactor());
10622return Idx < 2 * static_cast<int>(CommonVF);
10624"All elements in mask must be less than 2 * CommonVF.");
10625if (E1->Scalars.size() == VF && VF != CommonVF) {
10627assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
10628for (
int &
Idx : CommonMask) {
10631if (
Idx >=
static_cast<int>(CommonVF))
10632Idx = E1Mask[
Idx - CommonVF] + VF;
10638 ExtraCost += GetNodeMinBWAffectedCost(
10639 *E1, std::min(CommonVF, E1->getVectorFactor()));
10641 ExtraCost += GetValueMinBWAffectedCost(V2);
10644assert(V1 && V2 &&
"Expected both vectors.");
10645unsigned VF = getVF(V1);
10646 CommonVF = std::max(VF, getVF(V2));
10649return Idx < 2 * static_cast<int>(CommonVF);
10651"All elements in mask must be less than 2 * CommonVF.");
10653 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10654if (V1->
getType() != V2->getType()) {
10658if (cast<VectorType>(V1->
getType())->getElementType() != ScalarTy)
10660if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10664 InVectors.
front() =
10666if (InVectors.
size() == 2)
10668return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10669 V1, V2, CommonMask, Builder, ScalarTy);
10676 : BaseShuffleAnalysis(ScalarTy),
TTI(
TTI),
10677 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10678 CheckedExtracts(CheckedExtracts) {}
10680ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10681unsigned NumParts,
bool &UseVecBaseAsInput) {
10682 UseVecBaseAsInput =
false;
10685Value *VecBase =
nullptr;
10687if (!E->ReorderIndices.empty()) {
10689 E->ReorderIndices.end());
10692// Check if it can be considered reused if same extractelements were 10693// vectorized already. 10694bool PrevNodeFound =
any_of(
10696 [&](
const std::unique_ptr<TreeEntry> &TE) {
10697 return ((TE->hasState() && !TE->isAltShuffle() &&
10698 TE->getOpcode() == Instruction::ExtractElement) ||
10700 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10701 return VL.size() > Data.index() &&
10702 (Mask[Data.index()] == PoisonMaskElem ||
10703 isa<UndefValue>(VL[Data.index()]) ||
10704 Data.value() == VL[Data.index()]);
10709for (
unsigned Part : seq<unsigned>(NumParts)) {
10711ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10714// Ignore non-extractelement scalars. 10715if (isa<UndefValue>(V) ||
10718// If all users of instruction are going to be vectorized and this 10719// instruction itself is not going to be vectorized, consider this 10720// instruction as dead and remove its cost from the final cost of the 10722// Also, avoid adjusting the cost for extractelements with multiple uses 10723// in different graph entries. 10724auto *EE = cast<ExtractElementInst>(V);
10725 VecBase = EE->getVectorOperand();
10726 UniqueBases.
insert(VecBase);
10727const TreeEntry *VE = R.getTreeEntry(V);
10728if (!CheckedExtracts.
insert(V).second ||
10729 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10732 return isa<GetElementPtrInst>(U) &&
10733 !R.areAllUsersVectorized(cast<Instruction>(U),
10741unsignedIdx = *EEIdx;
10742// Take credit for instruction that will become dead. 10743if (EE->hasOneUse() || !PrevNodeFound) {
10745if (isa<SExtInst, ZExtInst>(Ext) &&
10746all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10747// Use getExtractWithExtendCost() to calculate the cost of 10748// extractelement/ext pair. 10751 EE->getVectorOperandType(),
Idx);
10752// Add back the cost of s|zext which is subtracted separately. 10754 Ext->getOpcode(), Ext->getType(), EE->getType(),
10763// Check that gather of extractelements can be represented as just a 10764// shuffle of a single/two vectors the scalars are extracted from. 10765// Found the bunch of extractelement instructions that must be gathered 10766// into a vector and can be represented as a permutation elements in a 10767// single input vector or of 2 input vectors. 10768// Done for reused if same extractelements were vectorized already. 10770Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10773 transformMaskAfterShuffle(CommonMask, CommonMask);
10774 SameNodesEstimated =
false;
10775if (NumParts != 1 && UniqueBases.
size() != 1) {
10776 UseVecBaseAsInput =
true;
10782 /// Checks if the specified entry \p E needs to be delayed because of its 10783 /// dependency nodes. 10784 std::optional<InstructionCost>
10787// No need to delay the cost estimation during analysis. 10788return std::nullopt;
10794return Idx < static_cast<int>(E1.getVectorFactor());
10796"Expected single vector shuffle mask.");
10800if (InVectors.
empty()) {
10801 CommonMask.
assign(Mask.begin(), Mask.end());
10802 InVectors.
assign({&E1, &E2});
10805assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10811unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10812 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10815if (InVectors.
empty()) {
10816 CommonMask.
assign(Mask.begin(), Mask.end());
10817 InVectors.
assign(1, &E1);
10820assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10826unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10827 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
10828if (!SameNodesEstimated && InVectors.
size() == 1)
10831 /// Adds 2 input vectors and the mask for their shuffling. 10833// May come only for shuffling of 2 vectors with extractelements, already 10834// handled in adjustExtracts. 10840auto *EI = cast<ExtractElementInst>(
10841 cast<const TreeEntry *>(InVectors.
front())
10842 ->getOrdered(
P.index()));
10843return EI->getVectorOperand() == V1 ||
10844 EI->getVectorOperand() == V2;
10846"Expected extractelement vectors.");
10848 /// Adds another one input vector and the mask for the shuffling. 10850if (InVectors.
empty()) {
10852"Expected empty input mask/vectors.");
10853 CommonMask.
assign(Mask.begin(), Mask.end());
10854 InVectors.
assign(1, V1);
10858// No need to add vectors here, already handled them in adjustExtracts. 10859assert(InVectors.
size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10860 !CommonMask.
empty() &&
10863Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10864 ->getOrdered(
P.index());
10866returnP.value() == Mask[
P.index()] ||
10867 isa<UndefValue>(Scalar);
10868if (isa<Constant>(V1))
10870auto *EI = cast<ExtractElementInst>(Scalar);
10871return EI->getVectorOperand() == V1;
10873"Expected only tree entry for extractelement vectors.");
10877"Expected only tree entries from extracts/reused buildvectors.");
10878unsigned VF = getVF(V1);
10879if (InVectors.
size() == 2) {
10880Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10881 transformMaskAfterShuffle(CommonMask, CommonMask);
10882 VF = std::max<unsigned>(VF, CommonMask.
size());
10883 }
elseif (
constauto *InTE =
10884 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
10885 VF = std::max(VF, InTE->getVectorFactor());
10888 VF, cast<FixedVectorType>(cast<Value *>(InVectors.
front())->getType())
10889 ->getNumElements());
10892for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10894 CommonMask[
Idx] = Mask[
Idx] + VF;
10897Value *Root =
nullptr) {
10898Cost += getBuildVectorCost(VL, Root);
10900// FIXME: Need to find a way to avoid use of getNullValue here. 10902unsigned VF = VL.
size();
10904 VF = std::min(VF, MaskVF);
10906if (isa<UndefValue>(V)) {
10912if (
auto *VecTy = dyn_cast<FixedVectorType>(Vals.
front()->getType())) {
10914// When REVEC is enabled, we need to expand vector types into scalar 10919Type *ScalarTy = V->getType()->getScalarType();
10921if (isa<PoisonValue>(V))
10923elseif (isa<UndefValue>(V))
10927 std::fill_n(NewVals.
begin() +
I * VecTyNumElements, VecTyNumElements,
10930 Vals.
swap(NewVals);
10936 cast<FixedVectorType>(Root->
getType())->getNumElements()),
10940 /// Finalize emission of the shuffles. 10943ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10949if (InVectors.
size() == 2)
10950Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10952Cost += createShuffle(Vec,
nullptr, CommonMask);
10953 transformMaskAfterShuffle(CommonMask, CommonMask);
10955"Expected vector length for the final value before action.");
10956Value *V = cast<Value *>(Vec);
10957 Action(V, CommonMask);
10958 InVectors.
front() = V;
10960if (!SubVectors.empty()) {
10962if (InVectors.
size() == 2)
10963Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10965Cost += createShuffle(Vec,
nullptr, CommonMask);
10966 transformMaskAfterShuffle(CommonMask, CommonMask);
10967// Add subvectors permutation cost. 10968if (!SubVectorsMask.
empty()) {
10970"Expected same size of masks for subvectors and common mask.");
10972copy(SubVectorsMask, SVMask.begin());
10973for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
10976 I1 = I2 + CommonMask.
size();
10983for (
auto [E,
Idx] : SubVectors) {
10984Type *EScalarTy = E->Scalars.front()->getType();
10985bool IsSigned =
true;
10986if (
auto It =
R.MinBWs.find(E); It !=
R.MinBWs.end()) {
10989 IsSigned = It->second.second;
10991if (ScalarTy != EScalarTy) {
10992unsigned CastOpcode = Instruction::Trunc;
10993unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
10994unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
10996 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
11006if (!CommonMask.
empty()) {
11007 std::iota(std::next(CommonMask.
begin(),
Idx),
11008 std::next(CommonMask.
begin(),
Idx + E->getVectorFactor()),
11014if (!ExtMask.
empty()) {
11015if (CommonMask.
empty()) {
11019for (
intI = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
11022 NewMask[
I] = CommonMask[ExtMask[
I]];
11024 CommonMask.
swap(NewMask);
11027if (CommonMask.
empty()) {
11028assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
11032 createShuffle(InVectors.
front(),
11033 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
11039"Shuffle construction must be finalized.");
11043const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
11045if (
const TreeEntry *VE = getMatchedVectorizedOperand(E,
Idx))
11048find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
11049return TE->isGather() &&
11050find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
11051 return EI.EdgeIdx == Idx && EI.UserTE == E;
11052 }) != TE->UserTreeIndices.end();
11054assert(It != VectorizableTree.end() &&
"Expected vectorizable entry.");
11059if (TE.State == TreeEntry::ScatterVectorize ||
11060 TE.State == TreeEntry::StridedVectorize)
11062if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
11063 !TE.isAltShuffle()) {
11064if (TE.ReorderIndices.empty())
11074/// Builds the arguments types vector for the given call instruction with the 11075/// given \p ID for the specified vector factor. 11078constunsigned VF,
unsigned MinBW,
11108// If we have computed a smaller type for the expression, update VecTy so 11109// that the costs will be accurate. 11110auto It = MinBWs.
find(E);
11111Type *OrigScalarTy = ScalarTy;
11112if (It != MinBWs.
end()) {
11113auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11119unsigned EntryVF = E->getVectorFactor();
11122if (E->isGather()) {
11125if (isa<InsertElementInst>(VL[0]))
11127if (isa<CmpInst>(VL.
front()))
11128 ScalarTy = VL.
front()->getType();
11129return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11130 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
11134if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11137if (E->getOpcode() == Instruction::Store) {
11138// For stores the order is actually a mask. 11139 NewMask.
resize(E->ReorderIndices.size());
11140copy(E->ReorderIndices, NewMask.
begin());
11146if (!E->ReuseShuffleIndices.empty())
11147::addMask(Mask, E->ReuseShuffleIndices);
11151assert((E->State == TreeEntry::Vectorize ||
11152 E->State == TreeEntry::ScatterVectorize ||
11153 E->State == TreeEntry::StridedVectorize) &&
11157 (E->getOpcode() == Instruction::GetElementPtr &&
11158 E->getMainOp()->getType()->isPointerTy())) &&
11161unsigned ShuffleOrOp =
11162 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
11163if (E->CombinedOp != TreeEntry::NotCombinedOp)
11164 ShuffleOrOp = E->CombinedOp;
11166constunsigned Sz = UniqueValues.
size();
11168for (
unsignedI = 0;
I < Sz; ++
I) {
11169if (isa<Instruction>(UniqueValues[
I]) && getTreeEntry(UniqueValues[
I]) == E)
11171 UsedScalars.set(
I);
11173auto GetCastContextHint = [&](
Value *
V) {
11174if (
const TreeEntry *OpTE = getTreeEntry(V))
11175return getCastContextHint(*OpTE);
11176 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
11177if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11178 !SrcState.isAltShuffle())
11185// Calculate the cost of this instruction. 11187if (isa<CastInst, CallInst>(VL0)) {
11188// For some of the instructions no need to calculate cost for each 11189// particular instruction, we can use the cost of the single 11190// instruction x total number of scalar instructions. 11191 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11193for (
unsignedI = 0;
I < Sz; ++
I) {
11194if (UsedScalars.test(
I))
11196 ScalarCost += ScalarEltCost(
I);
11201// Check if the current node must be resized, if the parent node is not 11205 (E->getOpcode() != Instruction::Load ||
11206 !E->UserTreeIndices.empty())) {
11207const EdgeInfo &EI =
11208 *
find_if(E->UserTreeIndices, [](
const EdgeInfo &EI) {
11209 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11211if (EI.UserTE->getOpcode() != Instruction::Select ||
11213auto UserBWIt = MinBWs.
find(EI.UserTE);
11214Type *UserScalarTy =
11215 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11216if (UserBWIt != MinBWs.
end())
11218 UserBWIt->second.first);
11219if (ScalarTy != UserScalarTy) {
11220unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
11221unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
11223auto *UserVecTy =
getWidenedType(UserScalarTy, E->Scalars.size());
11225 VecOpcode = Instruction::Trunc;
11228 It->second.second ? Instruction::SExt : Instruction::ZExt;
11235LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11236 ScalarCost,
"Calculated costs for Tree"));
11237return VecCost - ScalarCost;
11239// Calculate cost difference from vectorizing set of GEPs. 11240// Negative value means vectorizing is profitable. 11242assert((E->State == TreeEntry::Vectorize ||
11243 E->State == TreeEntry::StridedVectorize) &&
11244"Entry state expected to be Vectorize or StridedVectorize here.");
11248 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
11249LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11250"Calculated GEPs cost for Tree"));
11252return VecCost - ScalarCost;
11259Type *CanonicalType = Ty;
11266 {CanonicalType, CanonicalType});
11269// If the selects are the only uses of the compares, they will be 11270// dead and we can adjust the cost by removing their cost. 11271if (VI && SelectOnly) {
11273"Expected only for scalar type.");
11274auto *CI = cast<CmpInst>(
VI->getOperand(0));
11276 CI->
getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11277CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11278 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11280return IntrinsicCost;
11282switch (ShuffleOrOp) {
11283case Instruction::PHI: {
11284// Count reused scalars. 11287for (
Value *V : UniqueValues) {
11288auto *
PHI = dyn_cast<PHINode>(V);
11293for (
unsignedI = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
11297if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
11299if (!OpTE->ReuseShuffleIndices.empty())
11300 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11301 OpTE->Scalars.size());
11304return CommonCost - ScalarCost;
11306case Instruction::ExtractValue:
11307case Instruction::ExtractElement: {
11308auto GetScalarCost = [&](
unsignedIdx) {
11309if (isa<PoisonValue>(UniqueValues[
Idx]))
11312auto *
I = cast<Instruction>(UniqueValues[
Idx]);
11314if (ShuffleOrOp == Instruction::ExtractElement) {
11315auto *EE = cast<ExtractElementInst>(
I);
11316 SrcVecTy = EE->getVectorOperandType();
11318auto *EV = cast<ExtractValueInst>(
I);
11319Type *AggregateTy = EV->getAggregateOperand()->getType();
11321if (
auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11322 NumElts = ATy->getNumElements();
11327if (
I->hasOneUse()) {
11329if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11330all_of(
Ext->users(), IsaPred<GetElementPtrInst>)) {
11331// Use getExtractWithExtendCost() to calculate the cost of 11332// extractelement/ext pair. 11335// Subtract the cost of s|zext which is subtracted separately. 11337Ext->getOpcode(),
Ext->getType(),
I->getType(),
11345auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
11346return GetCostDiff(GetScalarCost, GetVectorCost);
11348case Instruction::InsertElement: {
11349assert(E->ReuseShuffleIndices.empty() &&
11350"Unique insertelements only are expected.");
11351auto *SrcVecTy = cast<FixedVectorType>(VL0->
getType());
11352unsignedconst NumElts = SrcVecTy->getNumElements();
11353unsignedconst NumScalars = VL.
size();
11359unsigned OffsetEnd = OffsetBeg;
11360 InsertMask[OffsetBeg] = 0;
11365elseif (OffsetEnd <
Idx)
11367 InsertMask[
Idx] =
I + 1;
11370if (NumOfParts > 0 && NumOfParts < NumElts)
11371 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11372unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11374unsignedOffset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11375unsigned InsertVecSz = std::min<unsigned>(
11377 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11378bool IsWholeSubvector =
11379 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11380// Check if we can safely insert a subvector. If it is not possible, just 11381// generate a whole-sized vector and shuffle the source vector and the new 11383if (OffsetBeg + InsertVecSz > VecSz) {
11384// Align OffsetBeg to generate correct mask. 11386 InsertVecSz = VecSz;
11390// TODO: Add support for Instruction::InsertValue. 11392if (!E->ReorderIndices.empty()) {
11397 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
11399bool IsIdentity =
true;
11401Mask.swap(PrevMask);
11402for (
unsignedI = 0;
I < NumScalars; ++
I) {
11404 DemandedElts.
setBit(InsertIdx);
11405 IsIdentity &= InsertIdx - OffsetBeg ==
I;
11406Mask[InsertIdx - OffsetBeg] =
I;
11408assert(
Offset < NumElts &&
"Failed to find vector index offset");
11412/*Insert*/true,
/*Extract*/false,
11415// First cost - resize to actual vector size if not identity shuffle or 11416// need to shift the vector. 11417// Do not calculate the cost if the actual size is the register size and 11418// we can merge this shuffle with the following SK_Select. 11422 InsertVecTy, Mask);
11423auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
11424 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11426// Second cost - permutation with subvector, if some elements are from the 11427// initial vector or inserting a subvector. 11428// TODO: Implement the analysis of the FirstInsert->getOperand(0) 11429// subvector of ActualVecTy. 11432buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11433if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
11434if (InsertVecSz != VecSz) {
11445for (
unsignedI = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
11466auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
11469unsigned Opcode = ShuffleOrOp;
11470unsigned VecOpcode = Opcode;
11472 (SrcIt != MinBWs.
end() || It != MinBWs.
end())) {
11473// Check if the values are candidates to demote. 11474unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy->
getScalarType());
11475if (SrcIt != MinBWs.
end()) {
11476 SrcBWSz = SrcIt->second.first;
11483if (BWSz == SrcBWSz) {
11484 VecOpcode = Instruction::BitCast;
11485 }
elseif (BWSz < SrcBWSz) {
11486 VecOpcode = Instruction::Trunc;
11487 }
elseif (It != MinBWs.
end()) {
11488assert(BWSz > SrcBWSz &&
"Invalid cast!");
11489 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11490 }
elseif (SrcIt != MinBWs.
end()) {
11491assert(BWSz > SrcBWSz &&
"Invalid cast!");
11493 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11495 }
elseif (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
11496 !SrcIt->second.second) {
11497 VecOpcode = Instruction::UIToFP;
11500assert(
Idx == 0 &&
"Expected 0 index only");
11506// Do not count cost here if minimum bitwidth is in effect and it is just 11507// a bitcast (here it is just a noop). 11508if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11510auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
11513bool IsArithmeticExtendedReduction =
11514 E->Idx == 0 && UserIgnoreList &&
11516auto *
I = cast<Instruction>(V);
11517returnis_contained({Instruction::Add, Instruction::FAdd,
11518 Instruction::Mul, Instruction::FMul,
11519 Instruction::And, Instruction::Or,
11523if (IsArithmeticExtendedReduction &&
11524 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11528 VecOpcode == Opcode ? VI :
nullptr);
11530return GetCostDiff(GetScalarCost, GetVectorCost);
    case Instruction::FCmp:
    case Instruction::ICmp:
    case Instruction::Select: {
      match(VL0, MatchCmp))
      auto GetScalarCost = [&](unsigned Idx) {
        if (isa<PoisonValue>(UniqueValues[Idx]))
        auto *VI = cast<Instruction>(UniqueValues[Idx]);
        !match(VI, MatchCmp)) ||
        E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
        getOperandInfo(VI->getOperand(1)), VI);
        ScalarCost = IntrinsicCost;
      CostKind, getOperandInfo(E->getOperand(0)),
      getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        unsigned CondNumElements = CondType->getNumElements();
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // When the return type is i1 but the source is fixed vector type, we
          // need to duplicate the condition value.
      return VecCost + CommonCost;
      return GetCostDiff(GetScalarCost, GetVectorCost);
    case TreeEntry::MinMax: {
      auto GetScalarCost = [&](unsigned Idx) {
        return GetMinMaxCost(OrigScalarTy);
      return VecCost + CommonCost;
      return GetCostDiff(GetScalarCost, GetVectorCost);
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      auto GetScalarCost = [&](unsigned Idx) {
        if (isa<PoisonValue>(UniqueValues[Idx]))
        auto *VI = cast<Instruction>(UniqueValues[Idx]);
        unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          auto *CI = dyn_cast<ConstantInt>(Op);
          return CI && CI->getValue().countr_one() >= It->second.first;
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      Op2Info, {}, nullptr, TLI) +
      return GetCostDiff(GetScalarCost, GetVectorCost);
    case Instruction::GetElementPtr: {
      return CommonCost + GetGEPCostDiff(VL, VL0);
    case Instruction::Load: {
      auto GetScalarCost = [&](unsigned Idx) {
        auto *VI = cast<LoadInst>(UniqueValues[Idx]);
        VI->getAlign(), VI->getPointerAddressSpace(),
      auto *LI0 = cast<LoadInst>(VL0);
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
          LI0->getPointerAddressSpace(), CostKind);
          Instruction::Load, VecTy, LI0->getAlign(),
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        Instruction::Load, VecTy, LI0->getPointerOperand(),
        /*VariableMask=*/false, CommonAlignment, CostKind);
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        Instruction::Load, VecTy, LI0->getPointerOperand(),
        /*VariableMask=*/false, CommonAlignment, CostKind);
      case TreeEntry::CombinedVectorize:
      case TreeEntry::NeedToGather:
      return VecLdCost + CommonCost;
      // If this node generates masked gather load then it is not a terminal node.
      // Hence address operand cost is estimated separately.
      if (E->State == TreeEntry::ScatterVectorize)
      // Estimate cost of GEPs since this tree node is a terminator.
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
      return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
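      // Note on the Load case above (illustrative summary): depending on the
      // TreeEntry state the vector cost is modeled as an interleaved load
      // (Vectorize with an interleave factor), a plain wide load (Vectorize),
      // a strided load (StridedVectorize) or a masked gather
      // (ScatterVectorize). For the masked gather the address operands are
      // not terminal here, so their cost is estimated separately, while for
      // the other forms the feeding GEPs are costed via GetGEPCostDiff.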
    case Instruction::Store: {
      bool IsReorder = !E->ReorderIndices.empty();
      auto GetScalarCost = [=](unsigned Idx) {
        auto *VI = cast<StoreInst>(VL[Idx]);
        VI->getAlign(), VI->getPointerAddressSpace(),
      cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
      // We know that we can merge the stores. Calculate the cost.
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        Instruction::Store, VecTy, BaseSI->getPointerOperand(),
        /*VariableMask=*/false, CommonAlignment, CostKind);
      assert(E->State == TreeEntry::Vectorize &&
             "Expected either strided or consecutive stores.");
      if (unsigned Factor = E->getInterleaveFactor()) {
        assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
               "No reused shuffles expected");
        Instruction::Store, VecTy, Factor, std::nullopt,
        BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
        Instruction::Store, VecTy, BaseSI->getAlign(),
        BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
      return VecStCost + CommonCost;
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
      return GetCostDiff(GetScalarCost, GetVectorCost) +
             GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
    case Instruction::Call: {
      auto GetScalarCost = [&](unsigned Idx) {
        auto *CI = cast<CallInst>(UniqueValues[Idx]);
      auto *CI = cast<CallInst>(VL0);
      It != MinBWs.end() ? It->second.first : 0, TTI);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
      return GetCostDiff(GetScalarCost, GetVectorCost);
    case Instruction::ShuffleVector: {
      if (!SLPReVec || E->isAltShuffle())
        assert(E->isAltShuffle() &&
               (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
               "Invalid Shuffle Vector Operand");
      // Try to find the previous shuffle node with the same operands and same
      // main/alternate ops.
      auto TryFindNodeWithEqualOperands = [=]() {
        for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
          if (TE->hasState() && TE->isAltShuffle() &&
              ((TE->getOpcode() == E->getOpcode() &&
                TE->getAltOpcode() == E->getAltOpcode()) ||
               (TE->getOpcode() == E->getAltOpcode() &&
                TE->getAltOpcode() == E->getOpcode())) &&
              TE->hasEqualOperands(*E))
      auto GetScalarCost = [&](unsigned Idx) {
        if (isa<PoisonValue>(UniqueValues[Idx]))
        auto *VI = cast<Instruction>(UniqueValues[Idx]);
        assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      // Need to clear CommonCost since the final shuffle cost is included into
      // VecCost is equal to sum of the cost of creating 2 vectors
      // and the cost of creating shuffle.
      if (TryFindNodeWithEqualOperands()) {
        dbgs() << "SLP: diamond match for alternate node found.\n";
        // No need to add new vector costs here since we're going to reuse
        // same main/alternate vector ops, just do different shuffling.
        TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
        if (BWSz <= SrcBWSz) {
          TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
          << "SLP: alternate extension, which should be truncated.\n";
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
        TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
      E->buildAltOpShuffleMask(
      assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
      // need to take into account their order when looking for the most used
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      // If this pattern is supported by the target then we consider the
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      // TODO: Check the reverse order too.
      if (SLPReVec && !E->isAltShuffle())
        // If a group uses mask in order, the shufflevector can be
        // eliminated by instcombine. Then the cost is 0.
        "Not supported shufflevector usage.");
        auto *SV = cast<ShuffleVectorInst>(VL.front());
        unsigned SVNumElements =
            cast<FixedVectorType>(SV->getOperand(0)->getType())
                ->getNumElements();
        unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
        assert(isa<ShuffleVectorInst>(V) && "Not supported shufflevector usage.");
        auto *SV = cast<ShuffleVectorInst>(V);
        [[maybe_unused]] bool IsExtractSubvectorMask =
            SV->isExtractSubvectorMask(Index);
        assert(IsExtractSubvectorMask && "Not supported shufflevector usage.");
        if (NextIndex != Index)
        NextIndex += SV->getShuffleMask().size();
        return ::getShuffleCost(
      return GetCostDiff(GetScalarCost, GetVectorCost);
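      // Note on the alternate-opcode case above (illustrative example, not
      // taken from this file): a bundle such as
      //   a0 = fadd x0, y0 ; a1 = fsub x1, y1 ; a2 = fadd x2, y2 ; a3 = fsub x3, y3
      // is costed as two whole-vector instructions (one fadd, one fsub) plus
      // the blend shuffle described by buildAltOpShuffleMask, unless the
      // target reports the (Opcode0, Opcode1) pair as a legal "alt"
      // instruction via isLegalAltInstr, in which case the cheaper of the two
      // models is used.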
    case Instruction::Freeze:
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
                    << VectorizableTree.size() << " is fully vectorizable.\n");
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    return TE->isGather() &&
           [this](Value *V) { return EphValues.contains(V); }) &&
           TE->Scalars.size() < Limit ||
           (((TE->hasState() &&
              TE->getOpcode() == Instruction::ExtractElement) ||
             all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       AreVectorizableGathers(VectorizableTree[0].get(),
                              VectorizableTree[0]->Scalars.size()) &&
       VectorizableTree[0]->getVectorFactor() > 2)))

  if (VectorizableTree.size() != 2)

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather nodes if they have fewer scalar operands than the
  // initial tree element (it may be profitable to shuffle the second gather)
  // or they are extractelements, which form a shuffle.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the
  // path through operand 0 of any 'or'. Also, peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
  // Check if the input is an extended load of the required or/shift expression.
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();
  // Everything matched - assume that we can fold the whole sequence using
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  /* MatchOr */ false);

  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
  // Graph is empty - do nothing.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");

  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization, we can skip it, if the cost threshold is
  // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
  // gathers/buildvectors.
  constexpr int Limit = 4;
  !VectorizableTree.empty() &&
  all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
    return (TE->isGather() &&
            (!TE->hasState() ||
             TE->getOpcode() != Instruction::ExtractElement) &&
            count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
           (TE->hasState() && TE->getOpcode() == Instruction::PHI);

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))

  // Check if any of the gather nodes forms an insertelement buildvector.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, UndefValue>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                     VectorizableTree.back()->getVectorFactor()),
      /*Insert=*/true, /*Extract=*/false,

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      [](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
  TreeEntry &E = *VectorizableTree[Idx];
  if (E.hasState() && E.getOpcode() != Instruction::Load)
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree,
  // query TTI to see if there is a cost to keeping values live over it
  // (for example, if spills and fills are required).
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();

  // The entries in VectorizableTree are not necessarily ordered by their
  // position in basic blocks. Collect them and order them by dominance so later
  // instructions are guaranteed to be visited first. For instructions in
  // different basic blocks, we only scan to the beginning of the block, so
  // their order does not matter, as long as all instructions in a basic block
  // are grouped together. Using dominance ensures a deterministic order.
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->State != TreeEntry::Vectorize)
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
  auto *NodeA = DT->getNode(A->getParent());
  auto *NodeB = DT->getNode(B->getParent());
  assert(NodeA && "Should only process reachable instructions");
  assert(NodeB && "Should only process reachable instructions");
  assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
         "Different nodes should have different DFS numbers");
  return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
  return B->comesBefore(A);

  // Update LiveValues.
  LiveValues.erase(PrevInst);
  for (auto &J : PrevInst->operands()) {
    if (isa<Instruction>(&*J) && getTreeEntry(&*J))
      LiveValues.insert(cast<Instruction>(&*J));
  dbgs() << "SLP: #LV: " << LiveValues.size();
  for (auto *X : LiveValues)
    dbgs() << " " << X->getName();
  dbgs() << ", Looking at ";

  // Now find the sequence of instructions between PrevInst and Inst.
  unsigned NumCalls = 0;
  while (InstIt != PrevInstIt) {
    if (PrevInstIt == PrevInst->getParent()->rend()) {
      PrevInstIt = Inst->getParent()->rbegin();
    if (auto *II = dyn_cast<IntrinsicInst>(I)) {
      if (II->isAssumeLikeIntrinsic())
    if (IntrCost < CallCost)
    // Debug information does not impact spill cost.
    if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
        &*PrevInstIt != PrevInst)

  for (auto *II : LiveValues) {
    auto *ScalarTy = II->getType();
    if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
      ScalarTy = VectorTy->getElementType();
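  // Note on the walk above (illustrative summary): the spill cost models the
  // case where a vectorized bundle keeps values live across a call that is
  // not part of the tree. Assume-like and cheap intrinsics are ignored, and
  // for real calls the target is asked whether keeping the currently live
  // values across the call implies extra save/restore work.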
/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
/// the buildvector sequence.
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
    I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
  if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
    I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));

/// Returns incoming Value *, if the requested type is Value * too, or a
/// default value, otherwise.
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {

/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps:
/// 1. If the Base vector is not an undef vector, resize the very first mask to
/// have a common VF and perform the action for 2 input vectors (including the
/// non-undef Base). Other shuffle masks are combined with the result of the
/// first stage and processed as a shuffle of 2 elements.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combining the masks properly between the steps.
template <typename T>
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  auto VMIt = std::next(ShuffleMask.begin());
  buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only for
    // single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    // Identity mask is found.
    Prev = Action(Mask, {ShuffleMask.begin()->first});
    // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
    // shuffles step by step, combining shuffle between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size, we
      // can shuffle them directly.
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        Mask[I] = SecMask[I] + Vec1VF;
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      Prev = Action(Mask, {Res1.first, Res2.first});
    VMIt = std::next(VMIt);
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      "Multiple uses of scalars.");
      Mask[I] = (Res.second ? I : SecMask[I]) + VF;
    Prev = Action(Mask, {Prev, Res.first});
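/// Usage sketch for the helper above (illustrative only; the concrete
/// ResizeAction/Action callables live at the call sites): the caller passes a
/// list of (vector, mask) pairs describing an insertelement sequence, a GetVF
/// callback returning the vector factor of a value, a ResizeAction that
/// brings one input to the common VF, and an Action that either emits a
/// shuffle or, as in the tree-cost computation below, merely estimates its
/// cost.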
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  /// The parent vectors and shuffle mask for the given list of inserts.

                    << VectorizableTree.size() << ".\n");
  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries, they are combined and
    // just skip their cost.
    if (TE.State == TreeEntry::CombinedVectorize) {
      dbgs() << "SLP: Skipping cost for combined node that starts with "
             << *TE.Scalars[0] << ".\n";
      TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
    if (TE.isGather() && TE.hasState()) {
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering, need to handle it.
        << "SLP: Current total cost = " << Cost << "\n");
    // Exclude cost of gather loads nodes which are not used. These nodes were
    // built as part of the final attempt to vectorize gathered loads.
    assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
           "Expected gather nodes with users only.");
    << "SLP: Current total cost = " << Cost << "\n");

  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  // Keep track of {Scalar, Index, User} tuples.
  // On AArch64, this helps in fusing a mov instruction, associated with
  // extractelement, with fmul in the backend so that extractelement is free.
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  for (ExternalUser &EU : ExternalUses) {
    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so the extraction will be
    // removed as well).
    if (EphValues.count(EU.User))
    // Used in unreachable blocks or in EH pads (rarely executed) or is
    // terminated with unreachable instruction.
    EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
    isa_and_present<UnreachableInst>(UserParent->getTerminator())))
    // We only add extract cost once for the same scalar.
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
    // No extract cost for vector "scalar".
    if (isa<FixedVectorType>(EU.Scalar->getType()))
    // If found user is an insertelement, do not calculate extract cost but try
    // to detect it as a final shuffled/identity match.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
        const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
        [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
          // Checks if 2 insertelements are from the same buildvector.
          Value *Op0 = II->getOperand(0);
          if (getTreeEntry(II) && !getTreeEntry(Op0))
        if (It == ShuffledInserts.end()) {
          Data.InsertElements.emplace_back(VU);
          VecId = ShuffledInserts.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
              .insert(std::make_pair(ScalarTE, FTy->getElementType()))
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            VecOpcode = Instruction::Trunc;
            It->second.second ? Instruction::SExt : Instruction::ZExt;
            FTy->getNumElements()),
            << " for extending externally used vector with "
               "non-equal minimum bitwidth.\n");
          It->InsertElements.front() = VU;
          VecId = std::distance(ShuffledInserts.begin(), It);
        int InIdx = *InsertIdx;
        ShuffledInserts[VecId].ValueMasks[ScalarTE];
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
    // If we plan to rewrite the tree in a smaller type, we will need to sign
    // extend the extracted value back to the original type. Here, we account
    // for the extract and the added cost of the sign extend if needed.
    const TreeEntry *Entry = getTreeEntry(EU.Scalar);
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      ? Instruction::ZExt
      : Instruction::SExt;
    EU.Lane, EU.Scalar, ScalarUserAndIdx);
    // Leave the scalar instructions as is if they are cheaper than extracts.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in the loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        // Ignore phis in loops.
        if (IsPhiInLoop(P.value()))
        ValueToExtUses->try_emplace(P.value().Scalar, P.index());
      // Can use the original instruction, if no operands are vectorized or they
      // are marked as externally used already.
      auto *Inst = cast<Instruction>(EU.Scalar);
      auto OperandIsScalar = [&](Value *V) {
        if (!getTreeEntry(V)) {
          // Some extractelements might be not vectorized, but
          // transformed into shuffle and removed from the function,
          // consider it here.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
        return ValueToExtUses->contains(V);
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Try to keep the original scalar if the user is a phi node from the
        // same block as the root phis, currently vectorized. It allows to keep
        // better ordering info of PHIs, being vectorized currently.
        bool IsProfitablePHIUser =
            VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            auto *PHIUser = dyn_cast<PHINode>(U);
            return (!PHIUser ||
                    PHIUser->getParent() !=
                    VectorizableTree.front()->getMainOp())
            return ValueToExtUses->contains(V);
        if (IsProfitablePHIUser) {
          (!GatheredLoadsEntriesFirst.has_value() ||
           Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2. It may help to
          // do some extra vectorization for now.
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          auto It = ValueToExtUses->find(V);
          if (It != ValueToExtUses->end()) {
            // Replace all uses to avoid compiler crash.
            ExternalUses[It->second].User = nullptr;
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // Update the users of the operands of the cast operand to avoid
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              auto It = ValueToExtUses->find(V);
              if (It != ValueToExtUses->end()) {
                // Replace all uses to avoid compiler crash.
                ExternalUses[It->second].User = nullptr;
    ExtractCost += ExtraCost;
  // Insert externals for extract of operands of casts to be emitted as scalars
  // instead of extractelement.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (const TreeEntry *E = getTreeEntry(V)) {
      ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));

  // Add reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {

  Cost += SpillCost + ExtractCost;
  unsigned VF = Mask.size();
  unsigned VecVF = TE->getVectorFactor();
  (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
  std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
  dbgs() << "SLP: Adding cost " << C
         << " for final shuffle of insertelement external users.\n";
  TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
  return std::make_pair(TE, true);
  return std::make_pair(TE, false);

  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    assert((TEs.size() == 1 || TEs.size() == 2) &&
           "Expected exactly 1 or 2 tree entries.");
    if (TEs.size() == 1) {
      VF = TEs.front()->getVectorFactor();
      auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
      (Data.index() < VF &&
       static_cast<int>(Data.index()) == Data.value());
      << " for final shuffle of insertelement "
         "external users.\n";
      TEs.front()->dump();
      dbgs() << "SLP: Current total cost = " << Cost << "\n");
      TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
        VF = TEs.front()->getVectorFactor();
      auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
      << " for final shuffle of vector node and external "
         "insertelement users.\n";
      if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
      dbgs() << "SLP: Current total cost = " << Cost << "\n");
    (void)performExtractsShuffleAction<const TreeEntry>(
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    cast<FixedVectorType>(
        ShuffledInserts[I].InsertElements.front()->getType()),
  // Add the cost for reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
        if (IsArithmeticExtendedReduction)
          Instruction::BitCast; // Handle it by getExtendedReductionCost.
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      if (Opcode != Instruction::BitCast) {
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
        << " for final resize for reduction from " << SrcVecTy
        << " to " << DstVecTy << "\n";
        dbgs() << "SLP: Current total cost = " << Cost << "\n");

  OS << "SLP: Spill Cost = " << SpillCost << ".\n"
     << "SLP: Extract Cost = " << ExtractCost << ".\n"
     << "SLP: Total Cost = " << Cost << ".\n";

  ViewGraph(this, "SLP" + F->getName(), false, Str);
/// Tries to find extractelement instructions with constant indices from fixed
/// vector type and gather such instructions into a bunch, which highly likely
/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
  // Scan list of gathered scalars for extractelements that can be represented
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (isa<UndefValue>(VL[I]))
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
    ExtractMask.reset(*Idx);
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  // Sort the vector operands by the maximum number of uses in extractelements.
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if better to perform a shuffle of 2 vectors or just of a single
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
  // Check that gather of extractelements can be represented as just a
  // shuffle of a single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
  // TODO: try to check other subsets if possible.
  // Restore the original VL if attempt was not successful.
    return std::nullopt;
  // Restore unused scalars from mask, if some of the extractelements were not
  // selected for shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    isa<UndefValue>(GatheredExtracts[I])) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||

/// Tries to find extractelement instructions with constant indices from fixed
/// vector type and gather such instructions into a bunch, which highly likely
/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.
    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan list of gathered scalars for extractelements that can be represented
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
    ShufflesRes.clear();
13089BoUpSLP::isGatherShuffledSingleRegisterEntry(
13093// TODO: currently checking only for Scalars in the tree entry, need to count 13094// reused elements too for better cost estimation. 13095const EdgeInfo &TEUseEI =
TE == VectorizableTree.front().get()
13096 ? EdgeInfo(
const_cast<TreeEntry *
>(TE), 0)
13097 :
TE->UserTreeIndices.front();
13098constInstruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13100// Main node of PHI entries keeps the correct order of operands/incoming 13102if (
auto *
PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13103 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13106 TEInsertBlock = TEInsertPt->
getParent();
13109return std::nullopt;
13110auto *NodeUI = DT->
getNode(TEInsertBlock);
13111assert(NodeUI &&
"Should only process reachable instructions");
13113auto CheckOrdering = [&](
constInstruction *InsertPt) {
13114// Argument InsertPt is an instruction where vector code for some other 13115// tree entry (one that shares one or more scalars with TE) is going to be 13116// generated. This lambda returns true if insertion point of vector code 13117// for the TE dominates that point (otherwise dependency is the other way 13118// around). The other node is not limited to be of a gather kind. Gather 13119// nodes are not scheduled and their vector code is inserted before their 13120// first user. If user is PHI, that is supposed to be at the end of a 13121// predecessor block. Otherwise it is the last instruction among scalars of 13122// the user node. So, instead of checking dependency between instructions 13123// themselves, we check dependency between their insertion points for vector 13124// code (since each scalar instruction ends up as a lane of a vector 13127auto *NodeEUI = DT->
getNode(InsertBlock);
13130assert((NodeUI == NodeEUI) ==
13131 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13132"Different nodes should have different DFS numbers");
13133// Check the order of the gather nodes users. 13134if (TEInsertPt->
getParent() != InsertBlock &&
13137if (TEInsertPt->
getParent() == InsertBlock &&
13142// Find all tree entries used by the gathered values. If no common entries 13143// found - not a shuffle. 13144// Here we build a set of tree nodes for each gathered value and trying to 13145// find the intersection between these sets. If we have at least one common 13146// tree node for each gathered value - we have just a permutation of the 13147// single vector. If we have 2 different sets, we're in situation where we 13148// have a permutation of 2 input vectors. 13151for (
Value *V : VL) {
13154// Build a list of tree entries where V is used. 13156for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13157if (TEPtr == TE || TEPtr->Idx == 0)
13160 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
13161"Must contain at least single gathered value.");
13162assert(TEPtr->UserTreeIndices.size() == 1 &&
13163"Expected only single user of a gather node.");
13164const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13166PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13169 : &getLastInstructionInBundle(UseEI.UserTE);
13170if (TEInsertPt == InsertPt) {
13171// If 2 gathers are operands of the same entry (regardless of whether 13172// user is PHI or else), compare operands indices, use the earlier one 13174if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13176// If the user instruction is used for some reason in different 13177// vectorized nodes - make it depend on index. 13178if (TEUseEI.UserTE != UseEI.UserTE &&
13179 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13183// Check if the user node of the TE comes after user node of TEPtr, 13184// otherwise TEPtr depends on TE. 13185if ((TEInsertBlock != InsertPt->
getParent() ||
13186 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13187 !CheckOrdering(InsertPt))
13191if (
const TreeEntry *VTE = getTreeEntry(V)) {
13192if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13193if (VTE->State != TreeEntry::Vectorize) {
13194auto It = MultiNodeScalars.
find(V);
13195if (It == MultiNodeScalars.
end())
13197 VTE = *It->getSecond().begin();
13198// Iterate through all vectorized nodes. 13199auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
13200 return MTE->State == TreeEntry::Vectorize;
13202if (MIt == It->getSecond().end())
13207if (
none_of(
TE->CombinedEntriesWithIndices,
13208 [&](
constauto &
P) { return P.first == VTE->Idx; })) {
13209Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13210if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
    if (UsedTEs.empty()) {
      // The first iteration, just insert the list of nodes to vector.
      // Need to check if there are any previously used tree nodes which use V.
      // If there are no such nodes, consider that we have another one input
      // Do we have a non-empty intersection of previously listed tree entries
      // and tree entries using current V?
      if (!VToTEs.empty()) {
        // Yes, write the new subset and continue analysis for the next
        VToTEs = SavedVToTEs;
      // No non-empty intersection found - need to add a second set of possible
      // If the number of input vectors is greater than 2 - not a permutation,
      // fall back to the regular gather.
      // TODO: support multiple reshuffled nodes.
      if (UsedTEs.size() == 2)
      UsedTEs.push_back(SavedVToTEs);

  if (UsedTEs.empty()) {
    return std::nullopt;

  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    // Try to find the perfect match in another gather node at first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      // Clear undef scalars.
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
    // No perfect match, just shuffle, so choose the first tree node from the
    Entries.push_back(FirstEntries.front());
    VF = FirstEntries.front()->getVectorFactor();
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // Keep the order of tree nodes to avoid non-determinism.
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
    // Same, keep the order to avoid non-determinism.
    UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        Entries.push_back(It->second);
        Entries.push_back(TE);
    // No 2 source vectors with the same vector factor - just choose 2 with max
    if (Entries.empty()) {
      UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
        return TE1->Idx < TE2->Idx;
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
      VF = Entries.front()->getVectorFactor();

  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from same parent (if they
    // are instructions).
    // The incoming values are compatible if they all are constants, or
    // instructions with the same/alternate opcodes from the same basic block.
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In1 = PHI1->getIncomingValue(I);
      if (cast<Instruction>(In)->getParent() !=
  // Check if the value can be ignored during analysis for shuffled gathers.
  // We suppose it is better to ignore instructions which do not form splats,
  // are not vectorized/not extractelements (these instructions will be handled
  // by extractelements processing) or may form a vector node in the future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V. It is possible, if they have same/alternate opcode
  // and same parent basic block.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  // Build a shuffle mask for better cost estimation and vector emission.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
    // Do not try to shuffle scalars, if they are constants, or instructions
    // that can be vectorized as a result of the following vector build
    ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
     (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
    unsigned Idx = It->second;
  // Iterate through all shuffled scalars and select entries, which can be used
  // for final shuffle.
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
    // Fix the entry number for the given scalar. If it is the first entry, set
    // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
    // These indices are used when calculating final shuffle mask as the vector
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      Pair.first = TempEntries.size();
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      .slice(Part * VL.size(),
             std::min<int>(VL.size(), TE->Scalars.size())))) {
    // We may have here 1 or 2 entries only. If the number of scalars is equal
    // to the number of entries, no need to do the analysis, it is not very
    // profitable. Since VL is not the same as TE->Scalars, it means we already
    // have some shuffles before. Cut off the not profitable case.
    return std::nullopt;
  // Build the final mask, check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset to the vector, while Pair.second is the index of
  // scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    (ForOrder ? std::distance(
                    Entries[Pair.first]->Scalars.begin(),
                    find(Entries[Pair.first]->Scalars, VL[Pair.second]))
              : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
      if (EntryLanes.size() > 2 || VL.size() <= 2)
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation if the shuffle is more beneficial than a
    // buildvector.
    std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        (MaxElement % VF) - (MinElement % VF) + 1));
    Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
          (Idx >= static_cast<int>(VF) ? NewVF : 0);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
    if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
        Mask, Entries.front()->getInterleaveFactor()))
      return ::getShuffleCost(TTI,
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
      // Transform mask to include only the first entry.
      bool IsIdentity = true;
      if (Idx >= static_cast<int>(NewVF)) {
      IsIdentity &= static_cast<int>(I) == Idx;
      FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      MaskVecTy, DemandedElts, /*Insert=*/true,
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
      // Transform mask to include only the second entry.
      bool IsIdentity = true;
      if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
      IsIdentity &= static_cast<int>(I) == Idx;
      SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      MaskVecTy, DemandedElts, /*Insert=*/true,
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    if (Idx >= static_cast<int>(VF))
                      Idx = PoisonMaskElem;
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    if (Idx < static_cast<int>(VF))
                      Idx = PoisonMaskElem;
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    if (BuildVectorCost >= ShuffleCost) {
      Entries.push_back(BestEntry);
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
  return std::nullopt;
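// Note on the function above (illustrative summary): for a single register of
// gathered scalars it tries to find one or two existing tree entries covering
// those scalars, builds the corresponding shuffle mask, and falls back to
// std::nullopt when a plain buildvector is estimated to be no more expensive
// than the shuffle.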
BoUpSLP::isGatherShuffledEntry(
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       [](const std::unique_ptr<TreeEntry> &TE) {
         return !TE->isGather();
  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
  assert((TE->UserTreeIndices.size() == 1 ||
          TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  "Number of scalars must be divisible by NumParts.");
  if (!TE->UserTreeIndices.empty() &&
      TE->UserTreeIndices.front().UserTE->isGather() &&
      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
    (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
    "Expected splat or extractelements only node.");
  for (unsigned Part : seq<unsigned>(NumParts)) {
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
      SubEntries.clear();
      SubEntries.front()->getVectorFactor() == VL.size() &&
      (SubEntries.front()->isSame(TE->Scalars) ||
       SubEntries.front()->isSame(VL))) {
      LocalSubEntries.swap(SubEntries);
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
      Entries.emplace_back(1, LocalSubEntries.front());
  [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  APInt ShuffledElements = APInt::getZero(VL.size());
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
      V = nullptr;
    }
    if (!ForPoisonSrc)
      Cost +=
          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                  I, Constant::getNullValue(VecTy), V);
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBit(I);
      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
      continue;
    }
    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }
    DuplicateNonConst = true;
    ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
  if (ForPoisonSrc) {
    if (isa<FixedVectorType>(ScalarTy)) {
      // We don't need to insert elements one by one. Instead, we can insert
      // the entire vector into the destination.
      Cost = 0;
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      for (unsigned I : seq<unsigned>(VL.size()))
        if (!ShuffledElements[I])
          Cost += ::getShuffleCost(
              *TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
              I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
    } else {
      Cost = TTI->getScalarizationOverhead(VecTy,
                                           /*DemandedElts*/ ~ShuffledElements,
                                           /*Insert*/ true,
                                           /*Extract*/ false, CostKind, VL);
    }
  }
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);
  return Cost;
}
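// Illustrative example (assumed values): gathering VL = {%a, %b, %a, %c} only
// pays insertelement costs for the unique scalars %a, %b and %c; the repeated
// %a is recorded in ShuffleMask as {0, 1, 0, 3}, and a single
// SK_PermuteSingleSrc shuffle cost is added for the duplication instead of a
// fourth insert.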
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
  if (Res)
    return *Res;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices or gathered loads).
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           E->getOpcode() == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (E->getOpcode() == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
13772for (
Value *V : E->Scalars) {
13773auto *
I = dyn_cast<Instruction>(V);
13776if (LastInst->
getParent() ==
I->getParent()) {
13781assert(((E->getOpcode() == Instruction::GetElementPtr &&
13782 !isa<GetElementPtrInst>(
I)) ||
13785 (GatheredLoadsEntriesFirst.has_value() &&
13786 E->getOpcode() == Instruction::Load && E->isGather() &&
13787 E->Idx < *GatheredLoadsEntriesFirst)) &&
13788"Expected vector-like or non-GEP in GEP node insts only.");
13796auto *NodeB = DT->
getNode(
I->getParent());
13797assert(NodeA &&
"Should only process reachable instructions");
13798assert(NodeB &&
"Should only process reachable instructions");
13799assert((NodeA == NodeB) ==
13800 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13801"Different nodes should have different DFS numbers");
13802if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13809auto FindFirstInst = [&]() {
13811for (
Value *V : E->Scalars) {
13812auto *
I = dyn_cast<Instruction>(V);
13815if (FirstInst->
getParent() ==
I->getParent()) {
13816if (
I->comesBefore(FirstInst))
13820assert(((E->getOpcode() == Instruction::GetElementPtr &&
13821 !isa<GetElementPtrInst>(
I)) ||
13824"Expected vector-like or non-GEP in GEP node insts only.");
13832auto *NodeB = DT->
getNode(
I->getParent());
13833assert(NodeA &&
"Should only process reachable instructions");
13834assert(NodeB &&
"Should only process reachable instructions");
13835assert((NodeA == NodeB) ==
13836 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13837"Different nodes should have different DFS numbers");
13838if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13844// Set insertpoint for gathered loads to the very first load. 13845if (GatheredLoadsEntriesFirst.has_value() &&
13846 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13847 E->getOpcode() == Instruction::Load) {
13848 Res = FindFirstInst();
13852// Set the insert point to the beginning of the basic block if the entry 13853// should not be scheduled. 13856if ((E->getOpcode() == Instruction::GetElementPtr &&
13859 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13863 return isa<PoisonValue>(V) ||
13864 (!isVectorLikeInstWithConstOps(V) &&
13865 isUsedOutsideBlock(V));
13867 (E->isGather() && E->Idx == 0 &&
all_of(E->Scalars, [](
Value *V) {
13868 return isa<ExtractElementInst, UndefValue>(V) ||
13869 areAllOperandsNonInsts(V);
13871 Res = FindLastInst();
13873 Res = FindFirstInst();
13877// Find the last instruction. The common case should be that BB has been 13878// scheduled, and the last instruction is VL.back(). So we start with 13879// VL.back() and iterate over schedule data until we reach the end of the 13880// bundle. The end of the bundle is marked by null ScheduleData. 13881if (BlocksSchedules.count(BB) && !E->isGather()) {
13882Value *
V = E->isOneOf(E->Scalars.back());
13885auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13886if (Bundle && Bundle->isPartOfBundle())
13887for (; Bundle; Bundle = Bundle->NextInBundle)
13888 Res = Bundle->Inst;
13891// LastInst can still be null at this point if there's either not an entry 13892// for BB in BlocksSchedules or there's no ScheduleData available for 13893// VL.back(). This can be the case if buildTree_rec aborts for various 13894// reasons (e.g., the maximum recursion depth is reached, the maximum region 13895// size is reached, etc.). ScheduleData is initialized in the scheduling 13898// If this happens, we can still find the last instruction by brute force. We 13899// iterate forwards from Front (inclusive) until we either see all 13900// instructions in the bundle or reach the end of the block. If Front is the 13901// last instruction in program order, LastInst will be set to Front, and we 13902// will visit all the remaining instructions in the block. 13904// One of the reasons we exit early from buildTree_rec is to place an upper 13905// bound on compile-time. Thus, taking an additional compile-time hit here is 13906// not ideal. However, this should be exceedingly rare since it requires that 13907// we both exit early from buildTree_rec and that the bundle be out-of-order 13908// (causing us to iterate all the way to the end of the block). 13910 Res = FindLastInst();
13911assert(Res &&
"Failed to find last instruction in bundle");
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(LastInst->getParent(),
                           std::next(LastInst->getIterator()));
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
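// Illustrative example (hypothetical bundle, for exposition only): for a
// bundle {%a = add i32 ..., %b = add i32 ...} whose last member in program
// order is %b, the IR builder ends up positioned immediately after %b; if the
// bundle consists of PHIs, the insertion point is instead moved past the whole
// PHI block so the generated vector code stays structurally valid.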
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from current block and/or the blocks which are
  // part of the current loop. These instructions will be inserted at the end to
  // make it possible to optimize loops and hoist invariant instructions out of
  // the loops body with better chances for success.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
13952for (
intI = 0, E = VL.
size();
I < E; ++
I) {
13953if (
auto *Inst = dyn_cast<Instruction>(VL[
I]))
13954if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13955 getTreeEntry(Inst) ||
13956 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
13957 PostponedIndices.
insert(
I).second)
13961auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
13964if (
Scalar->getType() != Ty) {
13968if (
auto *CI = dyn_cast<CastInst>(Scalar);
13969 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13971if (
auto *IOp = dyn_cast<Instruction>(
Op);
13972 !IOp || !(
isDeleted(IOp) || getTreeEntry(IOp)))
13975Scalar = Builder.CreateIntCast(
13980if (
auto *VecTy = dyn_cast<FixedVectorType>(
Scalar->getType())) {
13984auto *
II = dyn_cast<IntrinsicInst>(Vec);
13985if (!
II ||
II->getIntrinsicID() != Intrinsic::vector_insert)
13989 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13990 InsElt = dyn_cast<InsertElementInst>(Vec);
13994 GatherShuffleExtractSeq.
insert(InsElt);
13996// Add to our 'need-to-extract' list. 13997if (isa<Instruction>(V)) {
13998if (TreeEntry *Entry = getTreeEntry(V)) {
13999// Find which lane we need to extract. 14000User *UserOp =
nullptr;
14002if (
auto *SI = dyn_cast<Instruction>(Scalar))
14008unsigned FoundLane =
Entry->findLaneForValue(V);
14009 ExternalUses.emplace_back(V, UserOp, FoundLane);
14019 std::iota(
Mask.begin(),
Mask.end(), 0);
14020Value *OriginalRoot = Root;
14021if (
auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
14022 SV && isa<PoisonValue>(SV->getOperand(1)) &&
14023 SV->getOperand(0)->getType() == VecTy) {
14024 Root = SV->getOperand(0);
14025Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
14027// Insert constant values at first. 14028for (
intI = 0, E = VL.
size();
I < E; ++
I) {
14035if (isa<PoisonValue>(VL[
I]))
14037 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
14041if (isa<PoisonValue>(Vec)) {
14042 Vec = OriginalRoot;
14044 Vec = CreateShuffle(Root, Vec, Mask);
14045if (
auto *OI = dyn_cast<Instruction>(OriginalRoot);
14046 OI && OI->hasNUses(0) &&
14047none_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
14048returnTE->VectorizedValue == OI;
14053// Insert non-constant values. 14054for (
intI : NonConsts)
14055 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
14056// Append instructions, which are/may be part of the loop, in the end to make 14057// it possible to hoist non-loop-based instructions. 14058for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
14059 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
/// when the actual shuffle instruction is generated only if this is actually
/// required. Otherwise, the shuffle instruction emission is delayed till the
/// end of the process, to reduce the number of emitted instructions and further
/// analysis/transformations.
/// The class also will look through the previously emitted shuffle instructions
/// and properly mark indices in mask as undef.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
/// \endcode
/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
/// look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
/// If 2 operands are of different size, the smallest one will be resized and
/// the mask recalculated properly.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
/// \endcode
/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
/// look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks. It is built during
  /// analysis and actual emission of shuffle vector instructions.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
  /// operands; if the 3rd is going to be added, the first 2 are combined into
  /// a shuffle with \p CommonMask mask, the first operand is set to be the
  /// resulting shuffle and the second operand is set to be the newly added
  /// operand. The \p CommonMask is transformed in the proper way after that.
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
14112 /// Holds all of the instructions that we gathered. 14114 /// A list of blocks that we are going to CSE. 14123 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14124 CSEBlocks(CSEBlocks),
DL(
DL) {}
14125 ~ShuffleIRBuilder() =
default;
14126 /// Creates shufflevector for the 2 operands with the given mask. 14128if (V1->
getType() != V2->getType()) {
14131"Expected integer vector types only.");
14132if (V1->
getType() != V2->getType()) {
14133if (cast<VectorType>(V2->getType())
14135 ->getIntegerBitWidth() < cast<VectorType>(V1->
getType())
14137 ->getIntegerBitWidth())
14146if (
auto *
I = dyn_cast<Instruction>(Vec)) {
14147 GatherShuffleExtractSeq.
insert(
I);
14148 CSEBlocks.
insert(
I->getParent());
14152 /// Creates permutation of the single vector operand with the given mask, if 14153 /// it is not identity mask. 14157unsigned VF = Mask.size();
14158unsigned LocalVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
14162if (
auto *
I = dyn_cast<Instruction>(Vec)) {
14163 GatherShuffleExtractSeq.
insert(
I);
14164 CSEBlocks.
insert(
I->getParent());
14169Value *createPoison(
Type *Ty,
unsigned VF) {
14172 /// Resizes 2 input vector to match the sizes, if the they are not equal 14173 /// yet. The smallest vector is resized to the size of the larger vector. 14175if (V1->
getType() == V2->getType())
14177int V1VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
14178int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14179int VF = std::max(V1VF, V2VF);
14180int MinVF = std::min(V1VF, V2VF);
14182 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
14184Value *&
Op = MinVF == V1VF ? V1 : V2;
14186if (
auto *
I = dyn_cast<Instruction>(
Op)) {
14187 GatherShuffleExtractSeq.
insert(
I);
14188 CSEBlocks.
insert(
I->getParent());
14197 /// Smart shuffle instruction emission, walks through shuffles trees and 14198 /// tries to find the best matching vector for the actual shuffle 14201assert(V1 &&
"Expected at least one vector value.");
14202 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14203 R.CSEBlocks, *R.DL);
14204return BaseShuffleAnalysis::createShuffle<Value *>(
14205 V1, V2, Mask, ShuffleBuilder, ScalarTy);
14208 /// Cast value \p V to the vector type with the same number of elements, but 14209 /// the base type \p ScalarTy. 14211 std::optional<bool> IsSigned = std::nullopt) {
14212auto *VecTy = cast<VectorType>(V->getType());
14223 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14225 /// Adjusts extractelements after reusing them. 14227ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14228unsigned NumParts,
bool &UseVecBaseAsInput) {
14229 UseVecBaseAsInput =
false;
14231Value *VecBase =
nullptr;
14233if (!E->ReorderIndices.empty()) {
14235 E->ReorderIndices.end());
14238for (
intI = 0, Sz = Mask.size();
I < Sz; ++
I) {
14242auto *EI = cast<ExtractElementInst>(VL[
I]);
14243 VecBase = EI->getVectorOperand();
14244if (
const TreeEntry *TE = R.getTreeEntry(VecBase))
14245 VecBase = TE->VectorizedValue;
14246assert(VecBase &&
"Expected vectorized value.");
14247 UniqueBases.
insert(VecBase);
14248// If the only one use is vectorized - can delete the extractelement 14250if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14251 (NumParts != 1 &&
count(VL, EI) > 1) ||
14253 const TreeEntry *UTE = R.getTreeEntry(U);
14254 return !UTE || R.MultiNodeScalars.contains(U) ||
14255 (isa<GetElementPtrInst>(U) &&
14256 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14257 count_if(R.VectorizableTree,
14258 [&](const std::unique_ptr<TreeEntry> &TE) {
14259 return any_of(TE->UserTreeIndices,
14260 [&](const EdgeInfo &Edge) {
14261 return Edge.UserTE == UTE;
14263 is_contained(VL, EI);
14267 R.eraseInstruction(EI);
14269if (NumParts == 1 || UniqueBases.
size() == 1) {
14270assert(VecBase &&
"Expected vectorized value.");
14271return castToScalarTyElem(VecBase);
14273 UseVecBaseAsInput =
true;
14279// Perform multi-register vector shuffle, joining them into a single virtual 14281// Need to shuffle each part independently and then insert all this parts 14282// into a long virtual vector register, forming the original vector. 14283Value *Vec =
nullptr;
14286for (
unsigned Part : seq<unsigned>(NumParts)) {
14290constexprint MaxBases = 2;
14292auto VLMask =
zip(SubVL, SubMask);
14293constunsigned VF = std::accumulate(
14294 VLMask.begin(), VLMask.end(), 0U, [&](
unsigned S,
constauto &
D) {
14295 if (std::get<1>(D) == PoisonMaskElem)
14298 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14299 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14300 VecOp = TE->VectorizedValue;
14301 assert(VecOp &&
"Expected vectorized value.");
14302 const unsigned Size =
14303 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14304 return std::max(S, Size);
14306for (
constauto [V,
I] : VLMask) {
14309Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14310if (
const TreeEntry *TE = R.getTreeEntry(VecOp))
14311 VecOp = TE->VectorizedValue;
14312assert(VecOp &&
"Expected vectorized value.");
14313 VecOp = castToScalarTyElem(VecOp);
14314 Bases[
I / VF] = VecOp;
14320 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14321 TransformToIdentity(SubMask);
14323 SubVec = Bases.front();
14330Mask.slice(
P * SliceSize,
14337"Expected first part or all previous parts masked.");
14338copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14341 cast<FixedVectorType>(Vec->
getType())->getNumElements();
14344 cast<FixedVectorType>(SubVec->
getType())->getNumElements();
14345 NewVF = std::max(NewVF, SubVecVF);
14348for (
int &
Idx : SubMask)
14351copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14352 Vec = createShuffle(Vec, SubVec, VecMask);
14353 TransformToIdentity(VecMask);
14359 /// Checks if the specified entry \p E needs to be delayed because of its 14360 /// dependency nodes. 14361 std::optional<Value *>
14364// No need to delay emission if all deps are ready. 14367 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
14369return std::nullopt;
14370// Postpone gather emission, will be emitted after the end of the 14371// process to keep correct order. 14378 /// Adds 2 input vectors (in form of tree entries) and the mask for their 14381Value *V1 = E1.VectorizedValue;
14383 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
14384 if (isa<PoisonValue>(V))
14386 return !isKnownNonNegative(
14387 V, SimplifyQuery(*R.DL));
14389Value *V2 = E2.VectorizedValue;
14390if (V2->getType()->isIntOrIntVectorTy())
14391 V2 = castToScalarTyElem(V2,
any_of(E2.Scalars, [&](
Value *V) {
14392 if (isa<PoisonValue>(V))
14394 return !isKnownNonNegative(
14395 V, SimplifyQuery(*R.DL));
14399 /// Adds single input vector (in form of tree entry) and the mask for its 14402Value *V1 = E1.VectorizedValue;
14404 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
14405 if (isa<PoisonValue>(V))
14407 return !isKnownNonNegative(
14408 V, SimplifyQuery(*R.DL));
14412 /// Adds 2 input vectors and the mask for their shuffling. 14414assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
14416 isa<FixedVectorType>(V2->getType()) &&
14417"castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14418 V1 = castToScalarTyElem(V1);
14419 V2 = castToScalarTyElem(V2);
14420if (InVectors.
empty()) {
14423 CommonMask.
assign(Mask.begin(), Mask.end());
14427if (InVectors.
size() == 2) {
14428 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14429 transformMaskAfterShuffle(CommonMask, CommonMask);
14430 }
elseif (cast<FixedVectorType>(Vec->
getType())->getNumElements() !=
14432 Vec = createShuffle(Vec,
nullptr, CommonMask);
14433 transformMaskAfterShuffle(CommonMask, CommonMask);
14435 V1 = createShuffle(V1, V2, Mask);
14436unsigned VF = std::max(getVF(V1), getVF(Vec));
14437for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14439 CommonMask[
Idx] =
Idx + VF;
14440 InVectors.
front() = Vec;
14441if (InVectors.
size() == 2)
14442 InVectors.
back() = V1;
14446 /// Adds another one input vector and the mask for the shuffling. 14449"castToScalarTyElem expects V1 to be FixedVectorType");
14450 V1 = castToScalarTyElem(V1);
14451if (InVectors.
empty()) {
14453 CommonMask.
assign(Mask.begin(), Mask.end());
14456constauto *It =
find(InVectors, V1);
14457if (It == InVectors.
end()) {
14458if (InVectors.
size() == 2 ||
14461if (InVectors.
size() == 2) {
14462 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
14463 transformMaskAfterShuffle(CommonMask, CommonMask);
14464 }
elseif (cast<FixedVectorType>(V->getType())->getNumElements() !=
14465 CommonMask.
size()) {
14466 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
14467 transformMaskAfterShuffle(CommonMask, CommonMask);
14469unsigned VF = std::max(CommonMask.
size(), Mask.size());
14470for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14473 V->getType() != V1->
getType()
14475 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
14476 ->getNumElements();
14477if (V->getType() != V1->
getType())
14478 V1 = createShuffle(V1,
nullptr, Mask);
14479 InVectors.
front() = V;
14480if (InVectors.
size() == 2)
14481 InVectors.
back() = V1;
14486// Check if second vector is required if the used elements are already 14487// used from the first one. 14488for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14495for (
Value *V : InVectors)
14496 VF = std::max(VF, getVF(V));
14497for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14499 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.begin() ? 0 : VF);
14501 /// Adds another one input vector and the mask for the shuffling. 14508Value *Root =
nullptr) {
14509return R.gather(VL, Root, ScalarTy,
14511return createShuffle(V1, V2, Mask);
14515 /// Finalize emission of the shuffles. 14516 /// \param Action the action (if any) to be performed before final applying of 14517 /// the \p ExtMask mask. 14520ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14526if (InVectors.
size() == 2) {
14527 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14530 Vec = createShuffle(Vec,
nullptr, CommonMask);
14532 transformMaskAfterShuffle(CommonMask, CommonMask);
14534"Expected vector length for the final value before action.");
14535unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
14538 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14539 Vec = createShuffle(Vec,
nullptr, ResizeMask);
14541 Action(Vec, CommonMask);
14542 InVectors.
front() = Vec;
14544if (!SubVectors.empty()) {
14546if (InVectors.
size() == 2) {
14547 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14550 Vec = createShuffle(Vec,
nullptr, CommonMask);
14552 transformMaskAfterShuffle(CommonMask, CommonMask);
14553auto CreateSubVectors = [&](
Value *Vec,
14555for (
auto [E,
Idx] : SubVectors) {
14556Value *
V = E->VectorizedValue;
14557if (
V->getType()->isIntOrIntVectorTy())
14558 V = castToScalarTyElem(V,
any_of(E->Scalars, [&](
Value *V) {
14559 if (isa<PoisonValue>(V))
14561 return !isKnownNonNegative(
14562 V, SimplifyQuery(*R.DL));
14566 Builder, Vec, V, InsertionIndex,
14567 std::bind(&ShuffleInstructionBuilder::createShuffle,
this, _1, _2,
14569if (!CommonMask.
empty()) {
14570 std::iota(std::next(CommonMask.
begin(),
Idx),
14571 std::next(CommonMask.
begin(),
Idx + E->getVectorFactor()),
14577if (SubVectorsMask.
empty()) {
14578 Vec = CreateSubVectors(Vec, CommonMask);
14581copy(SubVectorsMask, SVMask.begin());
14582for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
14585I1 = I2 + CommonMask.
size();
14590 Vec = createShuffle(InsertVec, Vec, SVMask);
14591 transformMaskAfterShuffle(CommonMask, SVMask);
14593 InVectors.
front() = Vec;
14596if (!ExtMask.
empty()) {
14597if (CommonMask.
empty()) {
14601for (
intI = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
14604 NewMask[
I] = CommonMask[ExtMask[
I]];
14606 CommonMask.
swap(NewMask);
14609if (CommonMask.
empty()) {
14610assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
14611return InVectors.
front();
14613if (InVectors.
size() == 2)
14614return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                         unsigned NodeIdx) {
  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (!S)
    return nullptr;
  auto CheckSameVE = [&](const TreeEntry *VE) {
    return VE->isSame(VL) &&
           (any_of(VE->UserTreeIndices,
                   [E, NodeIdx](const EdgeInfo &EI) {
                     return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                   }) ||
            any_of(VectorizableTree,
                   [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                     return TE->isOperandGatherNode(
                                {const_cast<TreeEntry *>(E), NodeIdx}) &&
                            VE->isSame(TE->Scalars);
                   }));
  };
  TreeEntry *VE = getTreeEntry(S.getMainOp());
  if (VE && CheckSameVE(VE))
    return VE;
  auto It = MultiNodeScalars.find(S.getMainOp());
  if (It != MultiNodeScalars.end()) {
    auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
      return TE != VE && CheckSameVE(TE);
    });
    if (I != It->getSecond().end())
      return *I;
  }
  return nullptr;
}
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14669// V may be affected by MinBWs. 14670// We want ShuffleInstructionBuilder to correctly support REVEC. The key 14671// factor is the number of elements, not their type. 14672Type *ScalarTy = cast<VectorType>(
V->getType())->getElementType();
14674 ShuffleInstructionBuilder ShuffleBuilder(
14678 ShuffleBuilder.add(V, Mask);
14680 E->CombinedEntriesWithIndices.size());
14681transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14682 [&](
constauto &
P) {
14683 return std::make_pair(VectorizableTree[P.first].get(),
14686assert((E->CombinedEntriesWithIndices.empty() ||
14687 E->ReorderIndices.empty()) &&
14688"Expected either combined subnodes or reordering");
14689return ShuffleBuilder.finalize({}, SubVectors, {});
14693 cast<FixedVectorType>(
V->getType())->getNumElements()) {
14694if (!VE->ReuseShuffleIndices.empty()) {
14695// Reshuffle to get only unique values. 14696// If some of the scalars are duplicated in the vectorization 14697// tree entry, we do not vectorize them but instead generate a 14698// mask for the reuses. But if there are several users of the 14699// same entry, they may have different vectorization factors. 14700// This is especially important for PHI nodes. In this case, we 14701// need to adapt the resulting instruction for the user 14702// vectorization factor and have to reshuffle it again to take 14703// only unique elements of the vector. Without this code the 14704// function incorrectly returns reduced vector instruction with 14705// the same elements, not with the unique ones. 14708// %phi = phi <2 x > { .., %entry} {%shuffle, %block} 14709// %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> 14711// %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} 14715if (isa<PoisonValue>(V))
14717Mask[
I] = VE->findLaneForValue(V);
14719V = FinalShuffle(V, Mask);
14721assert(VF < cast<FixedVectorType>(
V->getType())->getNumElements() &&
14722"Expected vectorization factor less " 14723"than original vector size.");
14725 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14726V = FinalShuffle(V, UniformMask);
14729// Need to update the operand gather node, if actually the operand is not a 14730// vectorized node, but the buildvector/gather node, which matches one of 14731// the vectorized nodes. 14732if (
find_if(VE->UserTreeIndices, [&](
const EdgeInfo &EI) {
14733 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14734 }) == VE->UserTreeIndices.end()) {
14736find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
14737returnTE->isGather() &&
TE->UserTreeIndices.front().UserTE == E &&
14738TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14740assert(It != VectorizableTree.end() &&
"Expected gather node operand.");
14741 (*It)->VectorizedValue =
V;
14746// Find the corresponding gather entry and vectorize it. 14747// Allows to be more accurate with tree/graph transformations, checks for the 14748// correctness of the transformations in many cases. 14750 [E, NodeIdx](
const std::unique_ptr<TreeEntry> &TE) {
14751returnTE->isOperandGatherNode({E, NodeIdx});
14753assert(
I != VectorizableTree.end() &&
"Gather node is not in the graph.");
14754assert(
I->get()->UserTreeIndices.size() == 1 &&
14755"Expected only single user for the gather node.");
14756assert(
I->get()->isSame(VL) &&
"Expected same list of scalars.");
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
14770// Clear values, to be replaced by insertvector instructions. 14771for (
auto [EIdx,
Idx] : E->CombinedEntriesWithIndices)
14773 .slice(
Idx, VectorizableTree[EIdx]->getVectorFactor()),
14776 E->CombinedEntriesWithIndices.size());
14777transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14778 [&](
constauto &
P) {
14779 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14781// Build a mask out of the reorder indices and reorder scalars per this 14784 E->ReorderIndices.end());
14785if (!ReorderMask.empty())
14789// Transform non-clustered elements in the mask to poison (-1). 14790// "Clustered" operations will be reordered using this mask later. 14791if (!SubVectors.empty() && !SubVectorsMask.
empty()) {
14792for (
unsignedI : seq<unsigned>(GatheredScalars.size()))
14793if (E->Scalars[
I] == GatheredScalars[ReorderMask[
I]])
14796 SubVectorsMask.
clear();
14800unsignedI,
unsigned SliceSize,
14801bool IsNotPoisonous) {
14803 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14806 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14807unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14808if (UserTE->getNumOperands() != 2)
14810if (!IsNotPoisonous) {
14812find_if(VectorizableTree, [=](
const std::unique_ptr<TreeEntry> &TE) {
14813returnfind_if(
TE->UserTreeIndices, [=](
const EdgeInfo &EI) {
14814 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14815 }) !=
TE->UserTreeIndices.end();
14817if (It == VectorizableTree.end())
14820if (!(*It)->ReorderIndices.empty()) {
14824if (!
all_of(
zip(GatheredScalars, GS), [&](
constauto &
P) {
14825Value *V0 = std::get<0>(
P);
14826Value *V1 = std::get<1>(
P);
14827return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14828 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14834if ((
Mask.size() < InputVF &&
14837 (
Mask.size() == InputVF &&
14840 std::next(
Mask.begin(),
I * SliceSize),
14841 std::next(
Mask.begin(),
14848 std::next(
Mask.begin(),
I * SliceSize),
14849 std::next(
Mask.begin(),
14855 BVTy ShuffleBuilder(ScalarTy, Params...);
14856 ResTy Res = ResTy();
14860Value *ExtractVecBase =
nullptr;
14861bool UseVecBaseAsInput =
false;
14864Type *OrigScalarTy = GatheredScalars.front()->getType();
14867if (!
all_of(GatheredScalars, IsaPred<UndefValue>)) {
14868// Check for gathered extracts. 14869bool Resized =
false;
14871 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14872if (!ExtractShuffles.
empty()) {
14877if (
constauto *TE = getTreeEntry(
14878 cast<ExtractElementInst>(StoredGS[
Idx])->getVectorOperand()))
14881if (std::optional<ResTy> Delayed =
14882 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14883// Delay emission of gathers which are not ready yet. 14884 PostponedGathers.
insert(E);
14885// Postpone gather emission, will be emitted after the end of the 14886// process to keep correct order. 14889if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
14890 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14891 ExtractVecBase = VecBase;
14892if (
auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14893if (VF == VecBaseTy->getNumElements() &&
14894 GatheredScalars.size() != VF) {
14896 GatheredScalars.append(VF - GatheredScalars.size(),
14903// Gather extracts after we check for full matched gathers only. 14904if (!ExtractShuffles.
empty() || !E->hasState() ||
14905 E->getOpcode() != Instruction::Load ||
14906 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
14907any_of(E->Scalars, IsaPred<LoadInst>)) &&
14910 return isa<LoadInst>(V) && getTreeEntry(V);
14912 (E->hasState() && E->isAltShuffle()) ||
14913all_of(E->Scalars, [
this](
Value *V) { return getTreeEntry(V); }) ||
14915 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14917 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14919if (!GatherShuffles.
empty()) {
14920if (std::optional<ResTy> Delayed =
14921 ShuffleBuilder.needToDelay(E, Entries)) {
14922// Delay emission of gathers which are not ready yet. 14923 PostponedGathers.
insert(E);
14924// Postpone gather emission, will be emitted after the end of the 14925// process to keep correct order. 14928if (GatherShuffles.
size() == 1 &&
14930 Entries.front().front()->isSame(E->Scalars)) {
14931// Perfect match in the graph, will reuse the previously vectorized 14933LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle " 14935// Restore the mask for previous partially matched values. 14936Mask.resize(E->Scalars.size());
14937const TreeEntry *FrontTE = Entries.front().front();
14938if (FrontTE->ReorderIndices.empty() &&
14939 ((FrontTE->ReuseShuffleIndices.empty() &&
14940 E->Scalars.size() == FrontTE->Scalars.size()) ||
14941 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14942 std::iota(
Mask.begin(),
Mask.end(), 0);
14945if (isa<PoisonValue>(V)) {
14949Mask[
I] = FrontTE->findLaneForValue(V);
14952 ShuffleBuilder.add(*FrontTE, Mask);
14953// Full matched entry found, no need to insert subvectors. 14954 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
14958if (GatheredScalars.size() != VF &&
14960returnany_of(TEs, [&](
const TreeEntry *TE) {
14961returnTE->getVectorFactor() == VF;
14964 GatheredScalars.append(VF - GatheredScalars.size(),
14967// Remove shuffled elements from list of gathers. 14968for (
intI = 0, Sz =
Mask.size();
I < Sz; ++
I) {
14976bool IsRootPoison) {
14977// For splats with can emit broadcasts instead of gathers, so try to find 14979bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
14984// Gather unique non-const values and all constant values. 14985// For repeated values, just shuffle them. 14986int NumNonConsts = 0;
14989if (isa<UndefValue>(V)) {
14990if (!isa<PoisonValue>(V)) {
15005 Scalars.
front() = OrigV;
15009 Scalars[Res.first->second] = OrigV;
15010 ReuseMask[
I] = Res.first->second;
15013if (NumNonConsts == 1) {
15014// Restore single insert element. 15018if (!UndefPos.
empty() && UndefPos.
front() == 0)
15021 ReuseMask[SinglePos] = SinglePos;
15022 }
elseif (!UndefPos.
empty() && IsSplat) {
15023// For undef values, try to replace them with the simple broadcast. 15024// We can do it if the broadcasted value is guaranteed to be 15025// non-poisonous, or by freezing the incoming scalar value first. 15027return !isa<UndefValue>(V) &&
15029 (E->UserTreeIndices.size() == 1 &&
15031// Check if the value already used in the same operation in 15032// one of the nodes already. 15033 return E->UserTreeIndices.front().EdgeIdx !=
15034 U.getOperandNo() &&
15036 E->UserTreeIndices.front().UserTE->Scalars,
15040if (It != Scalars.
end()) {
15041// Replace undefs by the non-poisoned scalars and emit broadcast. 15042int Pos = std::distance(Scalars.
begin(), It);
15043for (
intI : UndefPos) {
15044// Set the undef position to the non-poisoned scalar. 15045 ReuseMask[
I] = Pos;
15046// Replace the undef by the poison, in the mask it is replaced by 15047// non-poisoned scalar already. 15052// Replace undefs by the poisons, emit broadcast and then emit 15054for (
intI : UndefPos) {
15056if (isa<UndefValue>(Scalars[
I]))
15063if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
15064bool IsNonPoisoned =
true;
15065bool IsUsedInExpr =
true;
15066Value *Vec1 =
nullptr;
15067if (!ExtractShuffles.
empty()) {
15068// Gather of extractelements can be represented as just a shuffle of 15069// a single/two vectors the scalars are extracted from. 15070// Find input vectors. 15071Value *Vec2 =
nullptr;
15072for (
unsignedI = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
15076if (UseVecBaseAsInput) {
15077 Vec1 = ExtractVecBase;
15079for (
unsignedI = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
15082if (isa<UndefValue>(E->Scalars[
I]))
15084auto *EI = cast<ExtractElementInst>(StoredGS[
I]);
15085Value *VecOp = EI->getVectorOperand();
15086if (
constauto *TE = getTreeEntry(VecOp))
15087if (
TE->VectorizedValue)
15088 VecOp =
TE->VectorizedValue;
15091 }
elseif (Vec1 != VecOp) {
15092assert((!Vec2 || Vec2 == VecOp) &&
15093"Expected only 1 or 2 vectors shuffle.");
15099 IsUsedInExpr =
false;
15102 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15105 IsUsedInExpr &= FindReusedSplat(
15107 cast<FixedVectorType>(Vec1->
getType())->getNumElements(), 0,
15108 ExtractMask.size(), IsNotPoisonedVec);
15109 ShuffleBuilder.add(Vec1, ExtractMask,
/*ForExtracts=*/true);
15110 IsNonPoisoned &= IsNotPoisonedVec;
15112 IsUsedInExpr =
false;
15114/*ForExtracts=*/true);
15117if (!GatherShuffles.
empty()) {
15120for (
constauto [
I, TEs] :
enumerate(Entries)) {
15123"No shuffles with empty entries list expected.");
15127"Expected shuffle of 1 or 2 entries.");
15131copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
15132if (TEs.
size() == 1) {
15133bool IsNotPoisonedVec =
15134 TEs.
front()->VectorizedValue
15138 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I,
15139 SliceSize, IsNotPoisonedVec);
15140 ShuffleBuilder.add(*TEs.
front(), VecMask);
15141 IsNonPoisoned &= IsNotPoisonedVec;
15143 IsUsedInExpr =
false;
15144 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
15145if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
15152// Try to figure out best way to combine values: build a shuffle and insert 15153// elements or just build several shuffles. 15154// Insert non-constant scalars. 15156int EMSz = ExtractMask.size();
15157int MSz =
Mask.size();
15158// Try to build constant vector and shuffle with it only if currently we 15159// have a single permutation and more than 1 scalar constants. 15160bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
15161bool IsIdentityShuffle =
15162 ((UseVecBaseAsInput ||
15164 [](
const std::optional<TTI::ShuffleKind> &SK) {
15168none_of(ExtractMask, [&](
intI) {
returnI >= EMSz; }) &&
15170 (!GatherShuffles.
empty() &&
15172 [](
const std::optional<TTI::ShuffleKind> &SK) {
15176none_of(Mask, [&](
intI) {
returnI >= MSz; }) &&
15178bool EnoughConstsForShuffle =
15182return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15186return isa<Constant>(V) && !isa<UndefValue>(V);
15188 (!IsIdentityShuffle ||
15189 (GatheredScalars.size() == 2 &&
15191 [](
Value *V) {
return !isa<UndefValue>(V); })) ||
15193return isa<Constant>(V) && !isa<PoisonValue>(V);
15195// NonConstants array contains just non-constant values, GatheredScalars 15196// contains only constant to build final vector and then shuffle. 15197for (
intI = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
15198if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[
I]))
15203// Generate constants for final shuffle and build a mask for them. 15204if (!
all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15206 TryPackScalars(GatheredScalars, BVMask,
/*IsRootPoison=*/true);
15207Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15208 ShuffleBuilder.add(BV, BVMask);
15211return isa<PoisonValue>(V) ||
15212 (IsSingleShuffle && ((IsIdentityShuffle &&
15213 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15215 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15218 Res = ShuffleBuilder.finalize(
15219 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15221 TryPackScalars(NonConstants, Mask,
/*IsRootPoison=*/false);
15222 Vec = ShuffleBuilder.gather(NonConstants,
Mask.size(), Vec);
15225// Gather unique scalars and all constants. 15227 TryPackScalars(GatheredScalars, ReuseMask,
/*IsRootPoison=*/true);
15228Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
15229 ShuffleBuilder.add(BV, ReuseMask);
15230 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15233// Gather all constants. 15235for (
auto [
I, V] :
enumerate(GatheredScalars)) {
15236if (!isa<PoisonValue>(V))
15239Value *BV = ShuffleBuilder.gather(GatheredScalars);
15240 ShuffleBuilder.add(BV, Mask);
15241 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15246 Res = ShuffleBuilder.createFreeze(Res);
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  bool PostponedPHIs) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}
/// \returns \p I after propagating metadata from \p VL only for instructions in
/// \p VL.
static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
  SmallVector<Value *> Insts;
  for (Value *V : VL)
    if (isa<Instruction>(V))
      Insts.push_back(V);
  return llvm::propagateMetadata(Inst, Insts);
}

Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }
15278Value *
V = E->Scalars.front();
15279Type *ScalarTy =
V->getType();
15280if (!isa<CmpInst>(V))
15282auto It = MinBWs.
find(E);
15283if (It != MinBWs.
end()) {
15284auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15290if (E->isGather()) {
15291// Set insert point for non-reduction initial nodes. 15292if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
15293 setInsertPointAfterBundle(E);
15294Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15295 E->VectorizedValue = Vec;
15299bool IsReverseOrder =
15301auto FinalShuffle = [&](
Value *
V,
const TreeEntry *E) {
15302 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *
this);
15303if (E->getOpcode() == Instruction::Store &&
15304 E->State == TreeEntry::Vectorize) {
15306ArrayRef(
reinterpret_cast<constint *
>(E->ReorderIndices.begin()),
15307 E->ReorderIndices.size());
15308 ShuffleBuilder.add(V, Mask);
15309 }
elseif (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15310 ShuffleBuilder.addOrdered(V, {});
15312 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15315 E->CombinedEntriesWithIndices.size());
15317 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
constauto &
P) {
15318 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15321 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15322"Expected either combined subnodes or reordering");
15323return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15326assert(!E->isGather() &&
"Unhandled state");
15327unsigned ShuffleOrOp =
15328 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
15330auto GetOperandSignedness = [&](
unsignedIdx) {
15331const TreeEntry *OpE = getOperandEntry(E,
Idx);
15332bool IsSigned =
false;
15333auto It = MinBWs.
find(OpE);
15334if (It != MinBWs.
end())
15335 IsSigned = It->second.second;
15338 if (isa<PoisonValue>(V))
15340 return !isKnownNonNegative(R, SimplifyQuery(*DL));
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
15356if (PostponedPHIs || !E->VectorizedValue) {
15361// Adjust insertion point once all PHI's have been generated. 15363 PH->getParent()->getFirstInsertionPt());
15366V = FinalShuffle(V, E);
15368 E->VectorizedValue =
V;
15372PHINode *NewPhi = cast<PHINode>(E->PHI);
15373// If phi node is fully emitted - exit. 15377// PHINodes may have multiple entries from the same block. We want to 15378// visit every block once. 15381for (
unsignedI : seq<unsigned>(0, PH->getNumIncomingValues())) {
15385// Stop emission if all incoming values are generated. 15391if (!VisitedBBs.
insert(IBB).second) {
15398Value *Vec = vectorizeOperand(E,
I,
/*PostponedPHIs=*/true);
15399if (VecTy != Vec->
getType()) {
15401 MinBWs.
contains(getOperandEntry(E,
I))) &&
15402"Expected item in MinBWs.");
15403 Vec = Builder.
CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
15409"Invalid number of incoming values");
15410assert(E->VectorizedValue &&
"Expected vectorized value.");
15411return E->VectorizedValue;
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (const TreeEntry *TE = getTreeEntry(V))
      V = TE->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
15424auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15429 NewV = FinalShuffle(NewV, E);
15430 E->VectorizedValue = NewV;
15433case Instruction::InsertElement: {
15434assert(E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
15436Value *
V = vectorizeOperand(E, 1, PostponedPHIs);
15438Type *ScalarTy =
Op.front()->getType();
15439if (cast<VectorType>(
V->getType())->getElementType() != ScalarTy) {
15441 std::pair<unsigned, bool> Res = MinBWs.
lookup(getOperandEntry(E, 1));
15442assert(Res.first > 0 &&
"Expected item in MinBWs.");
15447 cast<FixedVectorType>(
V->getType())->getNumElements()),
15451// Create InsertVector shuffle if necessary 15452auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
15453 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15455constunsigned NumElts =
15456 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15457constunsigned NumScalars = E->Scalars.size();
15460assert(
Offset < NumElts &&
"Failed to find vector index offset");
15462// Create shuffle to resize vector 15464if (!E->ReorderIndices.empty()) {
15469 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
15471// Create InsertVector shuffle if necessary 15472bool IsIdentity =
true;
15474Mask.swap(PrevMask);
15475for (
unsignedI = 0;
I < NumScalars; ++
I) {
15478 IsIdentity &= InsertIdx -
Offset ==
I;
15481if (!IsIdentity || NumElts != NumScalars) {
15483bool IsVNonPoisonous =
15486if (NumElts != NumScalars &&
Offset == 0) {
15487// Follow all insert element instructions from the current buildvector 15495 InsertMask[*InsertIdx] = *InsertIdx;
15496if (!
Ins->hasOneUse())
15498Ins = dyn_cast_or_null<InsertElementInst>(
15499Ins->getUniqueUndroppableUser());
15502buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15504 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15507if (!IsFirstPoison.
all()) {
15509for (
unsignedI = 0;
I < NumElts;
I++) {
15511 IsFirstUndef.
test(
I)) {
15512if (IsVNonPoisonous) {
15513 InsertMask[
I] =
I < NumScalars ?
I : 0;
15518if (
Idx >= NumScalars)
15519Idx = NumScalars - 1;
15520 InsertMask[
I] = NumScalars +
Idx;
15534if (
auto *
I = dyn_cast<Instruction>(V)) {
15535 GatherShuffleExtractSeq.
insert(
I);
15536 CSEBlocks.
insert(
I->getParent());
15541for (
unsignedI = 0;
I < NumElts;
I++) {
15546buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15549if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
15550 NumElts != NumScalars) {
15551if (IsFirstUndef.
all()) {
15554 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15555if (!IsFirstPoison.
all()) {
15556for (
unsignedI = 0;
I < NumElts;
I++) {
15558 InsertMask[
I] =
I + NumElts;
15565 InsertMask, cast<Instruction>(E->Scalars.back())->
getName());
15566if (
auto *
I = dyn_cast<Instruction>(V)) {
15567 GatherShuffleExtractSeq.
insert(
I);
15568 CSEBlocks.
insert(
I->getParent());
15573 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15574for (
unsignedI = 0;
I < NumElts;
I++) {
15578 InsertMask[
I] += NumElts;
15581 FirstInsert->getOperand(0), V, InsertMask,
15582 cast<Instruction>(E->Scalars.back())->getName());
15583if (
auto *
I = dyn_cast<Instruction>(V)) {
15584 GatherShuffleExtractSeq.
insert(
I);
15585 CSEBlocks.
insert(
I->getParent());
15590 ++NumVectorInstructions;
15591 E->VectorizedValue =
V;
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);
15608Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15609if (E->VectorizedValue) {
15611return E->VectorizedValue;
15614auto *CI = cast<CastInst>(VL0);
15616Type *SrcScalarTy = cast<VectorType>(InVec->
getType())->getElementType();
15617auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
15619 (SrcIt != MinBWs.
end() || It != MinBWs.
end() ||
15621// Check if the values are candidates to demote. 15622unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
15623if (SrcIt != MinBWs.
end())
15624 SrcBWSz = SrcIt->second.first;
15626if (BWSz == SrcBWSz) {
15627 VecOpcode = Instruction::BitCast;
15628 }
elseif (BWSz < SrcBWSz) {
15629 VecOpcode = Instruction::Trunc;
15630 }
elseif (It != MinBWs.
end()) {
15631assert(BWSz > SrcBWSz &&
"Invalid cast!");
15632 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15633 }
elseif (SrcIt != MinBWs.
end()) {
15634assert(BWSz > SrcBWSz &&
"Invalid cast!");
15636 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15638 }
elseif (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
15639 !SrcIt->second.second) {
15640 VecOpcode = Instruction::UIToFP;
15642Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15644 : Builder.
CreateCast(VecOpcode, InVec, VecTy);
15645V = FinalShuffle(V, E);
15647 E->VectorizedValue =
V;
15648 ++NumVectorInstructions;
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);
15655Value *
L = vectorizeOperand(E, 0, PostponedPHIs);
15656if (E->VectorizedValue) {
15658return E->VectorizedValue;
15660Value *
R = vectorizeOperand(E, 1, PostponedPHIs);
15661if (E->VectorizedValue) {
15663return E->VectorizedValue;
15665if (
L->getType() !=
R->getType()) {
15667 getOperandEntry(E, 1)->
isGather() ||
15668 MinBWs.
contains(getOperandEntry(E, 0)) ||
15669 MinBWs.
contains(getOperandEntry(E, 1))) &&
15670"Expected item in MinBWs.");
15671if (cast<VectorType>(
L->getType())
15673 ->getIntegerBitWidth() < cast<VectorType>(
R->getType())
15675 ->getIntegerBitWidth()) {
15676Type *CastTy =
R->getType();
15679Type *CastTy =
L->getType();
15687if (
auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.
end())
15688 ICmp->setSameSign(
/*B=*/false);
15689// Do not cast for cmps. 15690 VecTy = cast<FixedVectorType>(
V->getType());
15691V = FinalShuffle(V, E);
15693 E->VectorizedValue =
V;
15694 ++NumVectorInstructions;
15697case Instruction::Select: {
15698 setInsertPointAfterBundle(E);
15700Value *
Cond = vectorizeOperand(E, 0, PostponedPHIs);
15701if (E->VectorizedValue) {
15703return E->VectorizedValue;
15705Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15706if (E->VectorizedValue) {
15708return E->VectorizedValue;
15710Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15711if (E->VectorizedValue) {
15713return E->VectorizedValue;
15717 getOperandEntry(E, 2)->
isGather() ||
15718 MinBWs.
contains(getOperandEntry(E, 1)) ||
15719 MinBWs.
contains(getOperandEntry(E, 2))) &&
15720"Expected item in MinBWs.");
15722 True = Builder.
CreateIntCast(True, VecTy, GetOperandSignedness(1));
15723if (False->
getType() != VecTy)
15724 False = Builder.
CreateIntCast(False, VecTy, GetOperandSignedness(2));
15729assert(TrueNumElements >= CondNumElements &&
15730 TrueNumElements % CondNumElements == 0 &&
15731"Cannot vectorize Instruction::Select");
15733"Cannot vectorize Instruction::Select");
15734if (CondNumElements != TrueNumElements) {
15735// When the return type is i1 but the source is fixed vector type, we 15736// need to duplicate the condition value. 15742"Cannot vectorize Instruction::Select");
15744V = FinalShuffle(V, E);
15746 E->VectorizedValue =
V;
15747 ++NumVectorInstructions;
15750case Instruction::FNeg: {
15751 setInsertPointAfterBundle(E);
15753Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
15755if (E->VectorizedValue) {
15757return E->VectorizedValue;
15763if (
auto *
I = dyn_cast<Instruction>(V))
15766V = FinalShuffle(V, E);
15768 E->VectorizedValue =
V;
15769 ++NumVectorInstructions;
15773case Instruction::Freeze: {
15774 setInsertPointAfterBundle(E);
15776Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
15778if (E->VectorizedValue) {
15780return E->VectorizedValue;
15783if (
Op->getType() != VecTy) {
15785 MinBWs.
contains(getOperandEntry(E, 0))) &&
15786"Expected item in MinBWs.");
15790V = FinalShuffle(V, E);
15792 E->VectorizedValue =
V;
15793 ++NumVectorInstructions;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);
15817Value *
LHS = vectorizeOperand(E, 0, PostponedPHIs);
15818if (E->VectorizedValue) {
15820return E->VectorizedValue;
15822Value *
RHS = vectorizeOperand(E, 1, PostponedPHIs);
15823if (E->VectorizedValue) {
15825return E->VectorizedValue;
15827if (ShuffleOrOp == Instruction::And && It != MinBWs.
end()) {
15828for (
unsignedI : seq<unsigned>(0, E->getNumOperands())) {
15831auto *CI = dyn_cast<ConstantInt>(
Op);
15832return CI && CI->getValue().countr_one() >= It->second.first;
15834V = FinalShuffle(
I == 0 ? RHS : LHS, E);
15835 E->VectorizedValue =
V;
15836 ++NumVectorInstructions;
15843 getOperandEntry(E, 1)->
isGather() ||
15844 MinBWs.
contains(getOperandEntry(E, 0)) ||
15845 MinBWs.
contains(getOperandEntry(E, 1))) &&
15846"Expected item in MinBWs.");
15857if (
auto *
I = dyn_cast<Instruction>(V)) {
15859// Drop nuw flags for abs(sub(commutative), true). 15860if (!MinBWs.
contains(E) && ShuffleOrOp == Instruction::Sub &&
15862 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15864I->setHasNoUnsignedWrap(
/*b=*/false);
15867V = FinalShuffle(V, E);
15869 E->VectorizedValue =
V;
15870 ++NumVectorInstructions;
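    // Illustrative sketch for the bitwidth-aware AND shortcut above
    // (hypothetical values): if the bundle was demoted to i8 and one operand
    // consists of constants such as
    //   %a0 = and i32 %x0, 255
    //   %a1 = and i32 %x1, 255
    // then masking with 255 is a no-op at 8 bits, so the other operand is
    // simply forwarded (after the final shuffle) instead of emitting a vector
    // 'and'.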
    case Instruction::Load: {
      // Loads are inserted at the head of the tree because we don't want to
      // sink them all the way down past store instructions.
      setInsertPointAfterBundle(E);

      LoadInst *LI = cast<LoadInst>(VL0);
      Instruction *NewLI;
      Value *PO = LI->getPointerOperand();
      if (E->State == TreeEntry::Vectorize) {
        NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
      } else if (E->State == TreeEntry::StridedVectorize) {
        Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
        Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
        PO = IsReverseOrder ? PtrN : Ptr0;
        std::optional<int> Diff = getPointersDiff(
            VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
        Type *StrideTy = DL->getIndexType(PO->getType());
        Value *StrideVal;
        if (Diff) {
          int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
          StrideVal =
              ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                             DL->getTypeAllocSize(ScalarTy));
        } else {
          SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
          transform(E->Scalars, PointerOps.begin(), [](Value *V) {
            return cast<LoadInst>(V)->getPointerOperand();
          });
          OrdersType Order;
          std::optional<Value *> Stride =
              calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
                                &*Builder.GetInsertPoint());
          Value *NewStride =
              Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
          StrideVal = Builder.CreateMul(
              NewStride,
              ConstantInt::get(StrideTy,
                               (IsReverseOrder ? -1 : 1) *
                                   static_cast<int>(
                                       DL->getTypeAllocSize(ScalarTy))));
        }
        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
        auto *Inst = Builder.CreateIntrinsic(
            Intrinsic::experimental_vp_strided_load,
            {VecTy, PO->getType(), StrideTy},
            {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
             Builder.getInt32(E->Scalars.size())});
        Inst->addParamAttr(
            /*ArgNo=*/0,
            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
        NewLI = Inst;
      } else {
        assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
        Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          // CreateMaskedGather expects VecTy and VecPtr have same size. We
          // need to expand VecPtr if ScalarTy is a vector type.
          unsigned ScalarTyNumElements =
              cast<FixedVectorType>(ScalarTy)->getNumElements();
          unsigned VecTyNumElements =
              cast<FixedVectorType>(VecTy)->getNumElements();
          assert(VecTyNumElements % ScalarTyNumElements == 0 &&
                 "Cannot expand getelementptr.");
          unsigned VF = VecTyNumElements / ScalarTyNumElements;
          SmallVector<Constant *> Indices(VecTyNumElements);
          transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
            return Builder.getInt64(I % ScalarTyNumElements);
          });
          VecPtr = Builder.CreateGEP(
              VecTy->getElementType(),
              Builder.CreateShuffleVector(
                  VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
              ConstantVector::get(Indices));
        }
        // Use the minimum alignment of the gathered loads.
        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
        NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
      }
      Value *V = ::propagateMetadata(NewLI, E->Scalars);

      V = FinalShuffle(V, E);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
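    // Illustrative sketch for the strided-load path above (hypothetical
    // addresses): four i32 loads whose pointers differ by a constant 8 bytes
    // become a single VP strided load, roughly
    //   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
    //            ptr %base, i64 8,
    //            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
    // A reverse-ordered bundle starts from the last pointer and uses a
    // negative stride instead.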
    case Instruction::Store: {
      auto *SI = cast<StoreInst>(VL0);

      setInsertPointAfterBundle(E);

      Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
      if (VecValue->getType() != VecTy)
        VecValue =
            Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
      VecValue = FinalShuffle(VecValue, E);

      Value *Ptr = SI->getPointerOperand();
      Instruction *ST;
      if (E->State == TreeEntry::Vectorize) {
        ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
      } else {
        assert(E->State == TreeEntry::StridedVectorize &&
               "Expected either strided or consecutive stores.");
        if (!E->ReorderIndices.empty()) {
          SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
          Ptr = SI->getPointerOperand();
        }
        Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
        Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
        auto *Inst = Builder.CreateIntrinsic(
            Intrinsic::experimental_vp_strided_store,
            {VecTy, Ptr->getType(), StrideTy},
            {VecValue, Ptr,
             ConstantInt::get(
                 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
             Builder.getAllOnesMask(VecTy->getElementCount()),
             Builder.getInt32(E->Scalars.size())});
        Inst->addParamAttr(
            /*ArgNo=*/1,
            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
        ST = Inst;
      }

      Value *V = ::propagateMetadata(ST, E->Scalars);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
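    // Illustrative sketch for the strided-store path above (hypothetical
    // addresses): the stored vector is written through the VP intrinsic with
    // a negative element stride, roughly
    //   call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
    //            <4 x i32> %vec, ptr %ptr, i64 -4,
    //            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)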
    case Instruction::GetElementPtr: {
      auto *GEP0 = cast<GetElementPtrInst>(VL0);
      setInsertPointAfterBundle(E);

      Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      SmallVector<Value *> OpVecs;
      for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
        Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        OpVecs.push_back(OpVec);
      }

      Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
      if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
        SmallVector<Value *> GEPs;
        for (Value *V : E->Scalars) {
          if (isa<GetElementPtrInst>(V))
            GEPs.push_back(V);
        }
        V = ::propagateMetadata(I, GEPs);
      }

      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
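    // Illustrative sketch for the GEP case above (hypothetical operands): a
    // bundle of getelementptrs over the same base with different indices is
    // emitted as a single GEP with a vector index, yielding a vector of
    // pointers:
    //   %gep = getelementptr i32, ptr %base,
    //                        <4 x i64> <i64 0, i64 2, i64 4, i64 6>
    // which can then feed a masked gather or scatter.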
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      setInsertPointAfterBundle(E);

      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                          VecCallCosts.first <= VecCallCosts.second;

      Value *ScalarArg = nullptr;
      SmallVector<Value *> OpVecs;
      SmallVector<Type *, 2> TysForDecl;
      // Add return type if intrinsic is overloaded on it.
      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
        TysForDecl.push_back(VecTy);
      auto *CEI = cast<CallInst>(VL0);
      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
        // Some intrinsics have scalar arguments. This argument should not be
        // vectorized.
        if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
          ScalarArg = CEI->getArgOperand(I);
          // If we decided to reduce the bitwidth of the abs intrinsic, its
          // second argument must be set to false (do not return poison if the
          // value is the signed minimum).
          if (ID == Intrinsic::abs && It != MinBWs.end() &&
              It->second.first < DL->getTypeSizeInBits(CEI->getType()))
            ScalarArg = Builder.getFalse();
          OpVecs.push_back(ScalarArg);
          if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
            TysForDecl.push_back(ScalarArg->getType());
          continue;
        }

        Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        ScalarArg = CEI->getArgOperand(I);
        if (cast<VectorType>(OpVec->getType())->getElementType() !=
                ScalarArg->getType()->getScalarType() &&
            It == MinBWs.end()) {
          auto *CastTy =
              getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
          OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
        } else if (It != MinBWs.end()) {
          OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
        }
        OpVecs.push_back(OpVec);
        if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
          TysForDecl.push_back(OpVec->getType());
      }

      Function *CF;
      if (!UseIntrinsic) {
        VFShape Shape = VFShape::get(
            CI->getFunctionType(),
            ElementCount::getFixed(
                static_cast<unsigned>(VecTy->getNumElements())),
            false /*HasGlobalPred*/);
        CF = VFDatabase(*CI).getVectorizedFunction(Shape);
      } else {
        CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
      }

      SmallVector<OperandBundleDef, 1> OpBundles;
      CI->getOperandBundlesAsDefs(OpBundles);
      Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

      propagateIRFlags(V, E->Scalars, VL0);
      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
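    // Illustrative sketch for the call case above (hypothetical costs): for a
    // bundle of four llvm.fabs.f32 calls, VecCallCosts compares the vector
    // intrinsic against a vector-library routine; when the intrinsic is not
    // more expensive the bundle becomes
    //   %v = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
    // otherwise a matching entry from the VFDatabase is called instead.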
    case Instruction::ShuffleVector: {
      Value *V;
      if (SLPReVec && !E->isAltShuffle()) {
        setInsertPointAfterBundle(E);
        Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
        if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
          assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
                 "Not supported shufflevector usage.");
          SmallVector<int> NewMask(ThisMask.size());
          transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
            return SVSrc->getShuffleMask()[Mask];
          });
          V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
        } else {
          V = Builder.CreateShuffleVector(Src, ThisMask);
        }
        propagateIRFlags(V, E->Scalars, VL0);
        if (auto *I = dyn_cast<Instruction>(V))
          V = ::propagateMetadata(I, E->Scalars);
        V = FinalShuffle(V, E);
      } else {
        assert(E->isAltShuffle() &&
               ((Instruction::isBinaryOp(E->getOpcode()) &&
                 Instruction::isBinaryOp(E->getAltOpcode())) ||
                (Instruction::isCast(E->getOpcode()) &&
                 Instruction::isCast(E->getAltOpcode())) ||
                (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
               "Invalid Shuffle Vector Operand");

        Value *LHS = nullptr, *RHS = nullptr;
        if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0, PostponedPHIs);
          if (E->VectorizedValue) {
            LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
            return E->VectorizedValue;
          }
          RHS = vectorizeOperand(E, 1, PostponedPHIs);
        } else {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0, PostponedPHIs);
        }
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        if (LHS && RHS &&
            ((Instruction::isBinaryOp(E->getOpcode()) &&
              (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
             (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
          assert((It != MinBWs.end() ||
                  getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                  getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                  MinBWs.contains(getOperandEntry(E, 0)) ||
                  MinBWs.contains(getOperandEntry(E, 1))) &&
                 "Expected item in MinBWs.");
          Type *CastTy = VecTy;
          if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
            if (cast<VectorType>(LHS->getType())
                    ->getElementType()
                    ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                                 ->getElementType()
                                                 ->getIntegerBitWidth())
              CastTy = RHS->getType();
            else
              CastTy = LHS->getType();
          }
          if (LHS->getType() != CastTy)
            LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
          if (RHS->getType() != CastTy)
            RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
        }

        Value *V0, *V1;
        if (Instruction::isBinaryOp(E->getOpcode())) {
          V0 = Builder.CreateBinOp(
              static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
          V1 = Builder.CreateBinOp(
              static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
        } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
          V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
          auto *AltCI = cast<CmpInst>(E->getAltOp());
          CmpInst::Predicate AltPred = AltCI->getPredicate();
          V1 = Builder.CreateCmp(AltPred, LHS, RHS);
        } else {
          if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
            unsigned SrcBWSz = DL->getTypeSizeInBits(
                cast<VectorType>(LHS->getType())->getElementType());
            unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
            if (BWSz <= SrcBWSz) {
              assert(LHS->getType() == VecTy &&
                     "Expected same type as operand.");
              if (auto *I = dyn_cast<Instruction>(LHS))
                LHS = ::propagateMetadata(I, E->Scalars);
              LHS = FinalShuffle(LHS, E);
              E->VectorizedValue = LHS;
              ++NumVectorInstructions;
              return LHS;
            }
          }
          V0 = Builder.CreateCast(
              static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
          V1 = Builder.CreateCast(
              static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
        }
        // Add V0 and V1 to later analysis to try to find and remove matching
        // instruction, if any.
        for (Value *V : {V0, V1}) {
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }

        // Create shuffle to take alternate operations from the vector.
        // Also, gather up main and alt scalar ops to propagate IR flags to
        // each vector operation.
        ValueList OpScalars, AltScalars;
        SmallVector<int> Mask;
        E->buildAltOpShuffleMask(
            [E, this](Instruction *I) {
              assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
              return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                            *TLI);
            },
            Mask, &OpScalars, &AltScalars);

        propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
        propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
        auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
          // Drop nuw flags for abs(sub(commutative), true).
          if (auto *I = dyn_cast<Instruction>(Vec);
              I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
              any_of(E->Scalars, [](Value *V) {
                if (isa<PoisonValue>(V))
                  return false;
                auto *IV = cast<Instruction>(V);
                return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
              }))
            I->setHasNoUnsignedWrap(/*b=*/false);
        };
        DropNuwFlag(V0, E->getOpcode());
        DropNuwFlag(V1, E->getAltOpcode());

        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
        }
        V = Builder.CreateShuffleVector(V0, V1, Mask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          V = ::propagateMetadata(I, E->Scalars);
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
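    // Illustrative sketch for the alternate-opcode path above (hypothetical
    // bundle): for {a0+b0, a1-b1, a2+b2, a3-b3} both wide operations are
    // emitted and blended with the mask built by buildAltOpShuffleMask:
    //   %vadd = add <4 x i32> %a, %b
    //   %vsub = sub <4 x i32> %a, %b
    //   %v    = shufflevector <4 x i32> %vadd, <4 x i32> %vsub,
    //                         <4 x i32> <i32 0, i32 5, i32 2, i32 7>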
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
  // need to rebuild it.
  EntryToLastInstruction.clear();

  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst &&
        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
      assert((!TE->UserTreeIndices.empty() ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
    }
  }
  // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
  (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the
  // temporary emitted allocas with actual vector instructions.
  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found gather node which is absolutely the same as one of the
        // vectorized nodes. It may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were
    // some unresolved dependencies at the moment when the stub instruction
    // was emitted. If any of these dependencies turns out to be an operand of
    // another PHI coming from this same block, the position of the stub
    // instruction becomes invalid. This is because the source vector that is
    // supposed to feed this gather node was inserted at the end of the block
    // [after the stub instruction]. So we need to adjust the insertion point
    // again, to the end of the block.
    if (isa<PHINode>(UserI)) {
      // Insert before all users.
      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        if (U == UserI)
          continue;
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
16384if (
auto *VecI = dyn_cast<Instruction>(Vec);
16389if (Vec->
getType() != PrevVec->getType()) {
16391 PrevVec->getType()->isIntOrIntVectorTy() &&
16392"Expected integer vector types only.");
16393 std::optional<bool> IsSigned;
16394for (
Value *V : TE->Scalars) {
16395if (
const TreeEntry *BaseTE = getTreeEntry(V)) {
16396auto It = MinBWs.
find(BaseTE);
16397if (It != MinBWs.
end()) {
16398 IsSigned = IsSigned.value_or(
false) || It->second.second;
16402for (
const TreeEntry *MNTE : MultiNodeScalars.
lookup(V)) {
16403auto It = MinBWs.
find(MNTE);
16404if (It != MinBWs.
end()) {
16405 IsSigned = IsSigned.value_or(
false) || It->second.second;
16410if (IsSigned.value_or(
false))
16412// Scan through gather nodes. 16413for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16414auto It = MinBWs.
find(BVE);
16415if (It != MinBWs.
end()) {
16416 IsSigned = IsSigned.value_or(
false) || It->second.second;
16421if (IsSigned.value_or(
false))
16423if (
auto *EE = dyn_cast<ExtractElementInst>(V)) {
16425 IsSigned.value_or(
false) ||
16429if (IsSigned.value_or(
false))
16433if (IsSigned.value_or(
false)) {
16434// Final attempt - check user node. 16435auto It = MinBWs.
find(TE->UserTreeIndices.front().UserTE);
16436if (It != MinBWs.
end())
16437 IsSigned = It->second.second;
16440"Expected user node or perfect diamond match in MinBWs.");
16444 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
16445// Replace the stub vector node, if it was used before for one of the 16446// buildvector nodes already. 16447auto It = PostponedValues.
find(PrevVec);
16448if (It != PostponedValues.
end()) {
16449for (TreeEntry *VTE : It->getSecond())
16450 VTE->VectorizedValue = Vec;
16459// Maps vector instruction to original insertelement instruction 16461// Maps extract Scalar to the corresponding extractelement instruction in the 16462// basic block. Only one extractelement per block should be emitted. 16469// Extract all of the elements with the external uses. 16470for (
constauto &ExternalUse : ExternalUses) {
16471Value *Scalar = ExternalUse.Scalar;
16474// Skip users that we already RAUW. This happens when one instruction 16475// has multiple uses of the same value. 16478 TreeEntry *E = getTreeEntry(Scalar);
16479assert(E &&
"Invalid scalar");
16480assert(!E->isGather() &&
"Extracting from a gather list");
16481// Non-instruction pointers are not deleted, just skip them. 16482if (E->getOpcode() == Instruction::GetElementPtr &&
16483 !isa<GetElementPtrInst>(Scalar))
16486Value *Vec = E->VectorizedValue;
16487assert(Vec &&
"Can't find vectorizable value");
16490auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
16491if (Scalar->getType() != Vec->
getType()) {
16493Value *ExV =
nullptr;
16494auto *Inst = dyn_cast<Instruction>(Scalar);
16495bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.
contains(Inst);
16496auto It = ScalarToEEs.
find(Scalar);
16497if (It != ScalarToEEs.
end()) {
16498// No need to emit many extracts, just move the only one in the 16500auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16502if (EEIt != It->second.end()) {
16503Value *PrevV = EEIt->second.first;
16504if (
auto *
I = dyn_cast<Instruction>(PrevV);
16505I && !ReplaceInst &&
16510if (
auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16514 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16518// "Reuse" the existing extract to improve final codegen. 16520// Leave the instruction as is, if it cheaper extracts and all 16521// operands are scalar. 16522if (
auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16523 IgnoredExtracts.
insert(EE);
16526auto *CloneInst = Inst->clone();
16527 CloneInst->insertBefore(Inst->getIterator());
16528if (Inst->hasName())
16532 }
elseif (
auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16533 ES && isa<Instruction>(Vec)) {
16534Value *V = ES->getVectorOperand();
16535auto *IVec = cast<Instruction>(Vec);
16536if (
const TreeEntry *ETE = getTreeEntry(V))
16537 V = ETE->VectorizedValue;
16538if (
auto *
IV = dyn_cast<Instruction>(V);
16539 !
IV ||
IV == Vec ||
IV->getParent() != IVec->getParent() ||
16540IV->comesBefore(IVec))
16544 }
elseif (
auto *VecTy =
16545 dyn_cast<FixedVectorType>(Scalar->getType())) {
16548// When REVEC is enabled, we need to extract a vector. 16549// Note: The element size of Scalar may be different from the 16550// element size of Vec. 16552 ExternalUse.Lane * VecTyNumElements);
16556// If necessary, sign-extend or zero-extend ScalarRoot 16557// to the larger type. 16559if (Scalar->getType() != Ex->
getType())
16561 Ex, Scalar->getType(),
16563auto *
I = dyn_cast<Instruction>(Ex);
16565 : &
F->getEntryBlock(),
16566 std::make_pair(Ex, ExV));
16568// The then branch of the previous if may produce constants, since 0 16569// operand might be a constant. 16570if (
auto *ExI = dyn_cast<Instruction>(Ex);
16572 GatherShuffleExtractSeq.
insert(ExI);
16573 CSEBlocks.
insert(ExI->getParent());
16577assert(isa<FixedVectorType>(Scalar->getType()) &&
16578 isa<InsertElementInst>(Scalar) &&
16579"In-tree scalar of vector type is not insertelement?");
16580auto *IE = cast<InsertElementInst>(Scalar);
16584// If User == nullptr, the Scalar remains as scalar in vectorized 16585// instructions or is used as extra arg. Generate ExtractElement instruction 16586// and update the record for this scalar in ExternallyUsedValues. 16588if (!ScalarsWithNullptrUser.
insert(Scalar).second)
16592 ExternalUsesAsOriginalScalar.
contains(Scalar) ||
16595 if (ExternalUsesAsOriginalScalar.contains(U))
16597 TreeEntry *UseEntry = getTreeEntry(U);
16599 (UseEntry->State == TreeEntry::Vectorize ||
16601 TreeEntry::StridedVectorize) &&
16602 (E->State == TreeEntry::Vectorize ||
16603 E->State == TreeEntry::StridedVectorize) &&
16604 doesInTreeUserNeedToExtract(
16605 Scalar, getRootEntryInstruction(*UseEntry),
16608"Scalar with nullptr User must be registered in " 16609"ExternallyUsedValues map or remain as scalar in vectorized " 16611if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
16612if (
auto *
PHI = dyn_cast<PHINode>(VecI)) {
16613if (
PHI->getParent()->isLandingPad())
16617PHI->getParent()->getLandingPadInst()->getIterator()));
16620PHI->getParent()->getFirstNonPHIIt());
16623 std::next(VecI->getIterator()));
16628Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16629// Required to update internally referenced instructions. 16630if (Scalar != NewInst) {
16631assert((!isa<ExtractElementInst>(Scalar) ||
16632 !IgnoredExtracts.
contains(cast<ExtractElementInst>(Scalar))) &&
16633"Extractelements should not be replaced.");
16634 Scalar->replaceAllUsesWith(NewInst);
16639if (
auto *VU = dyn_cast<InsertElementInst>(
User);
16641// Skip if the scalar is another vector op or Vec is not an instruction. 16642if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16643if (
auto *FTy = dyn_cast<FixedVectorType>(
User->
getType())) {
16644if (!UsedInserts.
insert(VU).second)
16646// Need to use original vector, if the root is truncated. 16647auto BWIt = MinBWs.
find(E);
16649auto *ScalarTy = FTy->getElementType();
16650auto Key = std::make_pair(Vec, ScalarTy);
16651auto VecIt = VectorCasts.
find(Key);
16652if (VecIt == VectorCasts.
end()) {
16654if (
auto *IVec = dyn_cast<PHINode>(Vec)) {
16655if (IVec->getParent()->isLandingPad())
16657 std::next(IVec->getParent()
16658 ->getLandingPadInst()
16662 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16663 }
elseif (
auto *IVec = dyn_cast<Instruction>(Vec)) {
16670 cast<FixedVectorType>(Vec->
getType())->getNumElements()),
16671 BWIt->second.second);
16674 Vec = VecIt->second;
16681 ShuffledInserts, [VU](
const ShuffledInsertData<Value *> &
Data) {
16682// Checks if 2 insertelements are from the same buildvector. 16688unsignedIdx = *InsertIdx;
16689if (It == ShuffledInserts.
end()) {
16691 It = std::next(ShuffledInserts.
begin(),
16692 ShuffledInserts.
size() - 1);
16697 Mask[
Idx] = ExternalUse.Lane;
16698 It->InsertElements.push_back(cast<InsertElementInst>(
User));
16705// Generate extracts for out-of-tree users. 16706// Find the insertion point for the extractelement lane. 16707if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
16709for (
unsignedI : seq<unsigned>(0, PH->getNumIncomingValues())) {
16710if (PH->getIncomingValue(
I) == Scalar) {
16712 PH->getIncomingBlock(
I)->getTerminator();
16713if (isa<CatchSwitchInst>(IncomingTerminator)) {
16715 std::next(VecI->getIterator()));
16719Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16720 PH->setOperand(
I, NewInst);
16725Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16730Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16740int VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
16741for (
intI = 0, E = Mask.size();
I < E; ++
I) {
16743 CombinedMask1[
I] = Mask[
I];
16745 CombinedMask2[
I] = Mask[
I] - VF;
16748 cast<VectorType>(V1->
getType())->getElementType(), Builder, *
this);
16749 ShuffleBuilder.
add(V1, CombinedMask1);
16751 ShuffleBuilder.
add(V2, CombinedMask2);
16752return ShuffleBuilder.
finalize({}, {}, {});
16756bool ForSingleMask) {
16757unsigned VF = Mask.size();
16758unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
16760if (
any_of(Mask, [VF](
intIdx) {
returnIdx >=
static_cast<int>(VF); })) {
16761 Vec = CreateShuffle(Vec,
nullptr, Mask);
16762return std::make_pair(Vec,
true);
16764if (!ForSingleMask) {
16766for (
unsignedI = 0;
I < VF; ++
I) {
16768 ResizeMask[Mask[
I]] = Mask[
I];
16770 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
16774return std::make_pair(Vec,
false);
16776// Perform shuffling of the vectorize tree entries for better handling of 16777// external extracts. 16778for (
intI = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
16779// Find the first and the last instruction in the list of insertelements. 16784autoVector = ShuffledInserts[
I].ValueMasks.takeVector();
16785Value *NewInst = performExtractsShuffleAction<Value>(
16789 return cast<VectorType>(Vec->getType())
16790 ->getElementCount()
16791 .getKnownMinValue();
16796 assert((Vals.size() == 1 || Vals.size() == 2) &&
16797"Expected exactly 1 or 2 input values.");
16798 if (Vals.size() == 1) {
16799// Do not create shuffle if the mask is a simple identity 16800// non-resizing mask. 16801 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16802 ->getNumElements() ||
16803 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16804 return CreateShuffle(Vals.front(), nullptr, Mask);
16805 return Vals.front();
16807return CreateShuffle(Vals.
front() ? Vals.
front()
16809 Vals.
back(), Mask);
16811auto It = ShuffledInserts[
I].InsertElements.
rbegin();
16812// Rebuild buildvector chain. 16814if (It != ShuffledInserts[
I].InsertElements.
rend())
16817while (It != ShuffledInserts[
I].InsertElements.
rend()) {
16818assert(
II &&
"Must be an insertelement instruction.");
16823II = dyn_cast<InsertElementInst>(
II->getOperand(0));
16826II->replaceUsesOfWith(
II->getOperand(0), NewInst);
16827if (
auto *NewI = dyn_cast<Instruction>(NewInst))
16828if (
II->getParent() == NewI->getParent() &&
II->comesBefore(NewI))
16829II->moveAfter(NewI);
16832 LastInsert->replaceAllUsesWith(NewInst);
16834 IE->replaceUsesOfWith(IE->getOperand(0),
16836 IE->replaceUsesOfWith(IE->getOperand(1),
16840 CSEBlocks.
insert(LastInsert->getParent());
16844// For each vectorized value: 16845for (
auto &TEPtr : VectorizableTree) {
16846 TreeEntry *Entry = TEPtr.get();
16848// No need to handle users of gathered values. 16849if (Entry->isGather())
16852assert(Entry->VectorizedValue &&
"Can't find vectorizable value");
16855for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16856Value *Scalar = Entry->Scalars[Lane];
16858if (Entry->getOpcode() == Instruction::GetElementPtr &&
16859 !isa<GetElementPtrInst>(Scalar))
16861if (
auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16862 EE && IgnoredExtracts.contains(EE))
16864if (isa<PoisonValue>(Scalar))
16867Type *Ty = Scalar->getType();
16869for (
User *U : Scalar->users()) {
16872// It is legal to delete users in the ignorelist. 16873assert((getTreeEntry(U) ||
16874 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16875 (isa_and_nonnull<Instruction>(U) &&
16876 isDeleted(cast<Instruction>(U)))) &&
16877"Deleting out-of-tree value");
16881LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
16882auto *
I = cast<Instruction>(Scalar);
16887// Merge the DIAssignIDs from the about-to-be-deleted instructions into the 16888// new vector instruction. 16889if (
auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16890V->mergeDIAssignID(RemovedInsts);
16892// Clear up reduction references, if any. 16893if (UserIgnoreList) {
16895const TreeEntry *
IE = getTreeEntry(
I);
16897 !(VectorizableTree.front()->isGather() &&
16898 !
IE->UserTreeIndices.empty() &&
16899 (ValueToGatherNodes.lookup(
I).contains(
16900 VectorizableTree.front().get()) ||
16902 [&](
const EdgeInfo &EI) {
16903 return EI.UserTE == VectorizableTree.front().get() &&
16904 EI.EdgeIdx == UINT_MAX;
16906 !(GatheredLoadsEntriesFirst.has_value() &&
16907IE->Idx >= *GatheredLoadsEntriesFirst &&
16908 VectorizableTree.front()->isGather() &&
16913// Do not replace condition of the logical op in form select <cond>. 16914 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16915 (match(U.getUser(), m_LogicalAnd()) ||
16916 match(U.getUser(), m_LogicalOr())) &&
16917 U.getOperandNo() == 0;
16918 if (IsPoisoningLogicalOp) {
16919 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16922return UserIgnoreList->contains(
U.getUser());
16924// Replace conditions of the poisoning logical ops with the non-poison 16930// Retain to-be-deleted instructions for some debug-info bookkeeping and alias 16931// cache correctness. 16932// NOTE: removeInstructionAndOperands only marks the instruction for deletion 16933// - instructions are not deleted until later. 16934 removeInstructionsAndOperands(
ArrayRef(RemovedInsts));
16937 InstrElementSize.
clear();
16939const TreeEntry &RootTE = *VectorizableTree.front();
16940Value *Vec = RootTE.VectorizedValue;
16941if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16942 It != MinBWs.end() &&
16943 ReductionBitWidth != It->second.first) {
16946 ReductionRoot->getIterator());
16950 cast<VectorType>(Vec->
getType())->getElementCount()),
16951 It->second.second);
16958 <<
" gather sequences instructions.\n");
16959// LICM InsertElementInst sequences. 16964// Check if this block is inside a loop. 16965Loop *L = LI->getLoopFor(
I->getParent());
16969// Check if it has a preheader. 16970BasicBlock *PreHeader = L->getLoopPreheader();
16974// If the vector or the element that we insert into it are 16975// instructions that are defined in this basic block then we can't 16976// hoist this instruction. 16978 auto *OpI = dyn_cast<Instruction>(V);
16979 return OpI && L->contains(OpI);
16983// We can hoist this instruction. Move it to the pre-header. 16985 CSEBlocks.
insert(PreHeader);
16988// Make a list of all reachable blocks in our CSE queue. 16997// Sort blocks by domination. This ensures we visit a block after all blocks 16998// dominating it are visited. 17000assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
17001"Different nodes should have different DFS numbers");
17002returnA->getDFSNumIn() <
B->getDFSNumIn();
  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector
  // operands and its mask indices are the same as in the first one or undefs.
  // E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined than
  // shuffle %0, poison, <0, 0, 0, 0>.
  auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
                                           SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
17030for (
intI = 0, E = NewMask.
size();
I < E; ++
I) {
17036 NewMask[
I] != SM1[
I])
17039 NewMask[
I] = SM1[
I];
17041// Check if the last undefs actually change the final number of used vector 17043return SM1.
size() - LastUndefsCnt > 1 &&
17047 SM1.
size() - LastUndefsCnt));
17049// Perform O(N^2) search over the gather/shuffle sequences and merge identical 17050// instructions. TODO: We can further optimize this scan if we split the 17051// instructions into different buckets based on the insert lane. 17053for (
autoI = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
17056"Worklist not sorted properly!");
17058// For all instructions in blocks containing gather sequences: 17062if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17063 !GatherShuffleExtractSeq.contains(&In))
17066// Check if we can replace this instruction with any of the 17067// visited instructions. 17068bool Replaced =
false;
17071if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17072 DT->
dominates(V->getParent(), In.getParent())) {
17073 In.replaceAllUsesWith(V);
17075if (
auto *SI = dyn_cast<ShuffleVectorInst>(V))
17076if (!NewMask.
empty())
17077 SI->setShuffleMask(NewMask);
17081if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17082 GatherShuffleExtractSeq.contains(V) &&
17083 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17084 DT->
dominates(In.getParent(), V->getParent())) {
17086 V->replaceAllUsesWith(&In);
17088if (
auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17089if (!NewMask.
empty())
17090 SI->setShuffleMask(NewMask);
17098 Visited.push_back(&In);
17103 GatherShuffleExtractSeq.clear();
17106BoUpSLP::ScheduleData *
17108 ScheduleData *Bundle =
nullptr;
17109 ScheduleData *PrevInBundle =
nullptr;
17110for (
Value *V : VL) {
17113 ScheduleData *BundleMember = getScheduleData(V);
17115"no ScheduleData for bundle member " 17116"(maybe not in same basic block)");
17117assert(BundleMember->isSchedulingEntity() &&
17118"bundle member already part of other bundle");
17120 PrevInBundle->NextInBundle = BundleMember;
17122 Bundle = BundleMember;
17125// Group the instructions to a bundle. 17126 BundleMember->FirstInBundle = Bundle;
17127 PrevInBundle = BundleMember;
17129assert(Bundle &&
"Failed to find schedule bundle");
// Groups the instructions to a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                           ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    // It is seldom that this needs to be done a second time after adding the
    // initial bundle to the region.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        if (ScheduleData *SD = getScheduleData(I))
          SD->clearDependencies();
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that it's important
    // that we don't "schedule" the bundle yet (see cancelScheduling).
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };
  // Make sure that the scheduling region contains all
  // instructions of the bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or it
      // is a new region for the first bundle), it is necessary to recalculate
      // all dependencies.
      // Otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order at the actual
      // scheduling.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.getMainOp());
    return std::nullopt;
  }
  return Bundle;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                Value *OpValue) {
  if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
      doesNotNeedToSchedule(VL))
    return;

  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0)
      ReadyInsts.insert(BundleMember);
    BundleMember = Next;
  }
}
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
         !doesNotNeedToBeScheduled(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that's not
  // counted against the budget. Otherwise debug info could affect codegen.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
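// Illustrative sketch for initScheduleData above (hypothetical block): for
//   %a = load i32, ptr %p
//   %b = add i32 %a, 1
//   store i32 %b, ptr %q
// only the load and the store are linked into the FirstLoadStoreInRegion /
// NextLoadStore chain; the add gets plain ScheduleData with no memory link,
// so the alias-dependency walk in calculateDependencies below only has to
// visit the two memory instructions.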
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (isSafeToSpeculativelyExecute(I))
            continue;

          // Add the dependency
          MakeControlDependent(I);

          if (!isGuaranteedToTransferExecutionToSuccessor(I))
            // Everything past here must be control dependent on I.
            break;
        }
      }

      if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from reordering above a preceding stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst,
                  m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              // Any allocas past here must be control dependent on I, and I
              // must be memory dependent on BundleMember->Inst.
              break;

            if (!isa<AllocaInst>(I))
              continue;

            // Add the dependency
            MakeControlDependent(I);
          }
        }

        // In addition to the cases handled just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Avoiding moving allocas below stackrestore is
        // currently thought to be conservatism. Moving loads/stores below a
        // stackrestore can lead to incorrect code.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;

            // Add the dependency
            MakeControlDependent(I);
            break;
          }
        }
      }
17502"NextLoadStore list for non memory effecting bundle?");
17504bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17505unsigned NumAliased = 0;
17506unsigned DistToSrc = 1;
17508for (; DepDest; DepDest = DepDest->NextLoadStore) {
17509assert(isInSchedulingRegion(DepDest));
17511// We have two limits to reduce the complexity: 17512// 1) AliasedCheckLimit: It's a small limit to reduce calls to 17513// SLP->isAliased (which is the expensive part in this loop). 17514// 2) MaxMemDepDistance: It's for very large blocks and it aborts 17515// the whole loop (even if the loop is fast, it's quadratic). 17516// It's important for the loop break condition (see below) to 17517// check this limit even between two read-only instructions. 17519 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17521 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17523// We increment the counter only if the locations are aliased 17524// (instead of counting all alias checks). This gives a better 17525// balance between reduced runtime and accurate dependencies. 17528 DepDest->MemoryDependencies.push_back(BundleMember);
17529 BundleMember->Dependencies++;
17530 ScheduleData *DestBundle = DepDest->FirstInBundle;
17531if (!DestBundle->IsScheduled) {
17532 BundleMember->incrementUnscheduledDeps(1);
17534if (!DestBundle->hasValidDependencies()) {
17539// Example, explaining the loop break condition: Let's assume our 17540// starting instruction is i0 and MaxMemDepDistance = 3. 17543// i0,i1,i2,i3,i4,i5,i6,i7,i8 17546// MaxMemDepDistance let us stop alias-checking at i3 and we add 17547// dependencies from i0 to i3,i4,.. (even if they are not aliased). 17548// Previously we already added dependencies from i3 to i6,i7,i8 17549// (because of MaxMemDepDistance). As we added a dependency from 17550// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 17551// and we can abort this loop at i6. 17557if (InsertInReadyList && SD->isReady()) {
17558 ReadyInsts.insert(SD);
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
    if (ScheduleData *SD = getScheduleData(I)) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A key point - if we got here, pre-scheduling was able to find a valid
  // scheduling of the sub-graph of the scheduling window which consists
  // of all vector bundles and their transitive users. As such, we do not
  // need to reschedule anything *outside of* that subgraph.

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  // WARNING: If changing this order causes a correctness issue, that means
  // there is some missing dependence edge in the schedule data graph.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst);
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto Width = 0u;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      break;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load, update MaxWidth to reflect the
    // width of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and from the same basic block as the
    // user or the use is a PHI node, we add it to the worklist.
    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                 BinaryOperator, UnaryOperator>(I)) {
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent());
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V. Otherwise, return
  // the maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
17744bool BoUpSLP::collectValuesToDemote(
17745const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
17748bool &IsProfitableToDemote,
bool IsTruncRoot)
const{
17749// We can always demote constants. 17750if (
all_of(E.Scalars, IsaPred<Constant>))
17753unsigned OrigBitWidth =
17754DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17760// Check if the node was analyzed already and must keep its original bitwidth. 17761if (NodesToKeepBWs.
contains(E.Idx))
17764// If the value is not a vectorized instruction in the expression and not used 17765// by the insertelement instruction and not used in multiple vector nodes, it 17766// cannot be demoted. 17767bool IsSignedNode =
any_of(E.Scalars, [&](
Value *R) {
17768 if (isa<PoisonValue>(R))
17770 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17772auto IsPotentiallyTruncated = [&](
Value *V,
unsigned &
BitWidth) ->
bool {
17773if (isa<PoisonValue>(V))
    // For the last shuffle of sext/zext with many uses, we need to check the
    // extra bit for unsigned values; otherwise we may get incorrect casting
    // for reused scalars.
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
17787unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17790if (
auto *
I = dyn_cast<Instruction>(V)) {
17792unsigned BitWidth2 =
17793 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17794while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17800 BitWidth1 = std::min(BitWidth1, BitWidth2);
17805auto FinalAnalysis = [&,
TTI =
TTI]() {
17806if (!IsProfitableToDemote)
17809 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(
BitWidth)));
17811if (Res && E.isGather()) {
17812// Check possible extractelement instructions bases and final vector 17815for (
Value *V : E.Scalars) {
17816auto *EE = dyn_cast<ExtractElementInst>(V);
17819 UniqueBases.
insert(EE->getVectorOperand());
17821constunsigned VF = E.Scalars.size();
17822Type *OrigScalarTy = E.Scalars.front()->getType();
17823if (UniqueBases.
size() <= 2 ||
17834if (E.isGather() || !Visited.
insert(&E).second ||
17836 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
17837 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17840return FinalAnalysis();
17843 return !all_of(V->users(), [=](User *U) {
17844 return getTreeEntry(U) ||
17845 (E.Idx == 0 && UserIgnoreList &&
17846 UserIgnoreList->contains(U)) ||
17847 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17848 !U->getType()->isScalableTy() &&
17849 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17850 }) && !IsPotentiallyTruncated(V,
BitWidth);
17857unsigned InitLevel = MaxDepthLevel;
17859unsigned Level = InitLevel;
17860if (!collectValuesToDemote(*
Op, IsProfitableToDemoteRoot,
BitWidth,
17861 ToDemote, Visited, NodesToKeepBWs, Level,
17862 IsProfitableToDemote, IsTruncRoot)) {
17863if (!IsProfitableToDemote)
17866if (!FinalAnalysis())
17870 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17874auto AttemptCheckBitwidth =
17876// Try all bitwidth < OrigBitWidth. 17878unsigned BestFailBitwidth = 0;
17880if (Checker(
BitWidth, OrigBitWidth))
17882if (BestFailBitwidth == 0 && FinalAnalysis())
17886if (BestFailBitwidth == 0) {
17897auto TryProcessInstruction =
17903 (void)
for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17906// Several vectorized uses? Check if we can truncate it, otherwise - 17908if (E.UserTreeIndices.size() > 1 &&
17909 !
all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17912bool NeedToExit =
false;
17913if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17917if (!ProcessOperands(
Operands, NeedToExit))
17924// Record the entry that we can demote. 17926return IsProfitableToDemote;
17928switch (E.getOpcode()) {
17930// We can always demote truncations and extensions. Since truncations can 17931// seed additional demotion, we save the truncated value. 17932case Instruction::Trunc:
17933if (IsProfitableToDemoteRoot)
17934 IsProfitableToDemote =
true;
17935return TryProcessInstruction(
BitWidth);
17936case Instruction::ZExt:
17937case Instruction::SExt:
17938 IsProfitableToDemote =
true;
17939return TryProcessInstruction(
BitWidth);
17941// We can demote certain binary operations if we can demote both of their 17943case Instruction::Add:
17944case Instruction::Sub:
17945case Instruction::Mul:
17946case Instruction::And:
17947case Instruction::Or:
17948case Instruction::Xor: {
17949return TryProcessInstruction(
17950BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17952case Instruction::Freeze:
17953return TryProcessInstruction(
BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // inrange amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
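  // Illustrative sketch for the Shl rule above (hypothetical scalars): a
  // bundle of
  //   %s0 = shl i32 %x0, 3
  //   %s1 = shl i32 %x1, 5
  // can be demoted to i16 because every shift amount is known to be < 16, so
  // the narrower shift produces the same low 16 bits as the original.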
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
17989case Instruction::AShr: {
17990// If this is a truncate of an arithmetic shr, we can truncate it to a 17991// smaller ashr iff we know that all the bits from the sign bit of the 17992// original type and the sign bit of the truncate type are similar. 17993auto AShrChecker = [&](
unsignedBitWidth,
unsigned OrigBitWidth) {
17995 if (isa<PoisonValue>(V))
17997 auto *I = cast<Instruction>(V);
17998 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17999 unsigned ShiftedBits = OrigBitWidth - BitWidth;
18000 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
18001 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18005return TryProcessInstruction(
18006BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }
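  // Illustrative sketch for the UDiv/URem rule above (hypothetical scalars):
  //   %d0 = udiv i32 %a0, %b0
  //   %d1 = udiv i32 %a1, %b1
  // can be demoted to i16 only when the top 16 bits of every operand are
  // known zero; then the i16 udiv yields the same low 16 bits as the i32 one.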
18025// We can demote selects if we can demote their true and false values. 18026case Instruction::Select: {
18027return TryProcessInstruction(
18028BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18031// We can demote phis if we can demote all their incoming operands. Note that 18032// we don't need to worry about cycles since we ensure single use above. 18033case Instruction::PHI: {
18034constunsigned NumOps = E.getNumOperands();
18037 std::bind(&BoUpSLP::getOperandEntry,
this, &E, _1));
18039return TryProcessInstruction(
BitWidth, Ops);
18042case Instruction::Call: {
18043auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
18047if (
ID != Intrinsic::abs &&
ID != Intrinsic::smin &&
18048ID != Intrinsic::smax &&
ID != Intrinsic::umin &&
ID != Intrinsic::umax)
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
18102 std::numeric_limits<InstructionCost::CostType>::max();
18104unsigned VF = E.Scalars.size();
18105// Choose the best bitwidth based on cost estimations. 18114if (
Cost < BestCost) {
18120 [[maybe_unused]]
bool NeedToExit;
18121 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18126// Otherwise, conservatively give up. 18131return FinalAnalysis();
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;
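  // Minimal end-to-end illustration (not from the source): in
  //   %a = load i32, ptr %p
  //   %b = load i32, ptr %q
  //   %s = add i32 %a, %b
  //   %t = trunc i32 %s to i16
  //   store i16 %t, ptr %r
  // only the low 16 bits of the add are demanded, so the analysis below can
  // record in MinBWs that the add (and its operands) may be vectorized at i16
  // rather than i32.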
18148unsigned NodeIdx = 0;
18149if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18152// Ensure the roots of the vectorizable tree don't form a cycle. 18153if (VectorizableTree[NodeIdx]->
isGather() ||
18154 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18155 (NodeIdx != 0 &&
any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18157return EI.
UserTE->Idx > NodeIdx;
18161// The first value node for store/insertelement is sext/zext/trunc? Skip it, 18162// resize to the final type. 18163bool IsTruncRoot =
false;
18164bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18168 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18169 VectorizableTree[NodeIdx]->
getOpcode() == Instruction::Trunc) {
18170assert(IsStoreOrInsertElt &&
"Expected store/insertelement seeded graph.");
18173 IsProfitableToDemoteRoot =
true;
18177// Analyzed the reduction already and not profitable - exit. 18178if (AnalyzedMinBWVals.
contains(VectorizableTree[NodeIdx]->Scalars.front()))
18182auto ComputeMaxBitWidth =
18183 [&](
const TreeEntry &E,
bool IsTopRoot,
bool IsProfitableToDemoteRoot,
18184unsigned Limit,
bool IsTruncRoot,
bool IsSignedCmp) ->
unsigned {
18186// Check if the root is trunc and the next node is gather/buildvector, then 18187// keep trunc in scalars, which is free in most cases. 18188if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18189 !NodesToKeepBWs.
contains(E.Idx) &&
18190 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18192return V->hasOneUse() || isa<Constant>(V) ||
18195 const TreeEntry *TE = getTreeEntry(U);
18196 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18197 if (TE == UserTE || !TE)
18199 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18201 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18202 SelectInst>(UserTE->getMainOp()))
18204 unsigned UserTESz = DL->getTypeSizeInBits(
18205 UserTE->Scalars.front()->getType());
18206 auto It = MinBWs.find(TE);
18207 if (It != MinBWs.end() && It->second.first > UserTESz)
18209 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18213const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18214auto It = MinBWs.
find(UserTE);
18215if (It != MinBWs.
end())
18216return It->second.first;
18217unsigned MaxBitWidth =
18218DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18219 MaxBitWidth =
bit_ceil(MaxBitWidth);
18220if (MaxBitWidth < 8 && MaxBitWidth > 1)
18228unsigned VF = E.getVectorFactor();
18229Type *ScalarTy = E.Scalars.front()->getType();
18231auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->
getScalarType());
18236 [&](
Value *V) { return AnalyzedMinBWVals.contains(V); }))
18242// The maximum bit width required to represent all the values that can be 18243// demoted without loss of precision. It would be safe to truncate the roots 18244// of the expression to this width. 18245unsigned MaxBitWidth = 1u;
18247// True if the roots can be zero-extended back to their original type, 18248// rather than sign-extended. We know that if the leading bits are not 18249// demanded, we can safely zero-extend. So we initialize IsKnownPositive to 18251// Determine if the sign bit of all the roots is known to be zero. If not, 18252// IsKnownPositive is set to False. 18253bool IsKnownPositive = !IsSignedCmp &&
all_of(E.Scalars, [&](
Value *R) {
18254 if (isa<PoisonValue>(R))
18256 KnownBits Known = computeKnownBits(R, *DL);
18257 return Known.isNonNegative();
18260// We first check if all the bits of the roots are demanded. If they're not, 18261// we can truncate the roots to this narrower type. 18262for (
Value *Root : E.Scalars) {
18263if (isa<PoisonValue>(Root))
18268unsigned BitWidth1 = NumTypeBits - NumSignBits;
18269// If we can't prove that the sign bit is zero, we must add one to the 18270// maximum bit width to account for the unknown sign bit. This preserves 18271// the existing sign bit so we can safely sign-extend the root back to the 18272// original type. Otherwise, if we know the sign bit is zero, we will 18273// zero-extend the root instead. 18275// FIXME: This is somewhat suboptimal, as there will be cases where adding 18276// one to the maximum bit width will yield a larger-than-necessary 18277// type. In general, we need to add an extra bit only if we can't 18278// prove that the upper bit of the original type is equal to the 18279// upper bit of the proposed smaller type. If these two bits are 18280// the same (either zero or one) we know that sign-extending from 18281// the smaller type will result in the same value. Here, since we 18282// can't yet prove this, we are just making the proposed smaller 18283// type larger to ensure correctness. 18284if (!IsKnownPositive)
18288unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18290 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
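      // Worked example (illustrative only, not from the source): if a root is
      // `sext i8 %v to i32`, ComputeNumSignBits returns 25, so
      // BitWidth1 = 32 - 25 = 7, plus one bit because the sign is not known to
      // be zero, giving 8. If the users of the root only demand the low 16
      // bits, the demanded-bits mask has 16 leading zeros and BitWidth2 = 16,
      // so this root contributes min(8, 16) = 8 bits to MaxBitWidth.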
18293if (MaxBitWidth < 8 && MaxBitWidth > 1)
18296// If the original type is large, but reduced type does not improve the reg 18306unsigned Opcode = E.getOpcode();
18307bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18308 Opcode == Instruction::SExt ||
18309 Opcode == Instruction::ZExt || NumParts > 1;
18310// Conservatively determine if we can actually truncate the roots of the 18311// expression. Collect the values that can be demoted in ToDemote and 18312// additional roots that require investigating in Roots. 18314unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18315bool NeedToDemote = IsProfitableToDemote;
18317if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18318 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18319 NeedToDemote, IsTruncRoot) ||
18320 (MaxDepthLevel <= Limit &&
18321 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18322 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18323DL->getTypeSizeInBits(TreeRootIT) /
18324DL->getTypeSizeInBits(
18325 E.getMainOp()->getOperand(0)->getType()) >
18328// Round MaxBitWidth up to the next power-of-two. 18329 MaxBitWidth =
bit_ceil(MaxBitWidth);
18334// If we can truncate the root, we must collect additional values that might 18335// be demoted as a result. That is, those seeded by truncations we will 18337// Add reduction ops sizes, if any. 18338if (UserIgnoreList &&
18339 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18340// Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n 18342if (
all_of(*UserIgnoreList,
18344return isa<PoisonValue>(V) ||
18345 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18347 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18348 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18349 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18351 ReductionBitWidth = 1;
18353for (
Value *V : *UserIgnoreList) {
18354if (isa<PoisonValue>(V))
18357TypeSize NumTypeBits =
DL->getTypeSizeInBits(V->getType());
18358unsigned BitWidth1 = NumTypeBits - NumSignBits;
18361unsigned BitWidth2 = BitWidth1;
18364 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18366 ReductionBitWidth =
18367 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18369if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18370 ReductionBitWidth = 8;
18372 ReductionBitWidth =
bit_ceil(ReductionBitWidth);
18375bool IsTopRoot = NodeIdx == 0;
18376while (NodeIdx < VectorizableTree.size() &&
18377 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18378 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18383bool IsSignedCmp =
false;
18384while (NodeIdx < VectorizableTree.size()) {
18388 ReductionBitWidth ==
18389DL->getTypeSizeInBits(
18390 VectorizableTree.front()->Scalars.front()->getType()))
18392unsigned MaxBitWidth = ComputeMaxBitWidth(
18393 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
18394 IsTruncRoot, IsSignedCmp);
18395if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.
empty())) {
18396if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18397 ReductionBitWidth =
bit_ceil(MaxBitWidth);
18398elseif (MaxBitWidth == 0)
18399 ReductionBitWidth = 0;
18402for (
unsignedIdx : RootDemotes) {
18405DL->getTypeSizeInBits(V->getType()->getScalarType());
18406if (OrigBitWidth > MaxBitWidth) {
18414 RootDemotes.clear();
18416 IsProfitableToDemoteRoot =
true;
18418if (ExtraBitWidthNodes.
empty()) {
18419 NodeIdx = VectorizableTree.size();
18421unsigned NewIdx = 0;
18423 NewIdx = *ExtraBitWidthNodes.
begin();
18424 ExtraBitWidthNodes.
erase(ExtraBitWidthNodes.
begin());
18425 }
while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.
empty());
18428 NodeIdx < VectorizableTree.size() &&
18429any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18432 EI.
UserTE->getOpcode() == Instruction::Trunc &&
18433 !EI.
UserTE->isAltShuffle();
18436 NodeIdx < VectorizableTree.size() &&
18438 VectorizableTree[NodeIdx]->UserTreeIndices,
18440return (EI.
UserTE->hasState() &&
18441 EI.
UserTE->getOpcode() == Instruction::ICmp) &&
18443 auto *IC = dyn_cast<ICmpInst>(V);
18446 !isKnownNonNegative(IC->getOperand(0),
18447 SimplifyQuery(*DL)) ||
18448 !isKnownNonNegative(IC->getOperand(1),
18449 SimplifyQuery(*DL)));
18454// If the maximum bit width we compute is less than the width of the roots' 18455// type, we can proceed with the narrowing. Otherwise, do nothing. 18456if (MaxBitWidth == 0 ||
18458 cast<IntegerType>(TreeRoot.
front()->getType()->getScalarType())
18466// Finally, map the values we can demote to the maximum bit with we 18468for (
unsignedIdx : ToDemote) {
18469 TreeEntry *TE = VectorizableTree[
Idx].get();
18472bool IsSigned =
any_of(TE->Scalars, [&](
Value *R) {
18473 if (isa<PoisonValue>(R))
18475 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18493bool Changed =
runImpl(
F, SE,
TTI, TLI, AA, LI, DT, AC, DB, ORE);
18518DL = &
F.getDataLayout();
18522bool Changed =
false;
18524// If the target claims to have no vector registers don't attempt 18528dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
18532// Don't vectorize when the attribute NoImplicitFloat is used. 18533if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
18536LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
18538// Use the bottom up slp vectorizer to construct chains that start with 18539// store instructions. 18540BoUpSLP R(&
F, SE,
TTI, TLI, AA, LI, DT, AC, DB,
DL, ORE_);
18542// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to 18543// delete instructions. 18545// Update DFS numbers now so that we can use them for ordering. 18548// Scan the blocks in the function in post order. 18553// Start new block - clear the list of reduction roots. 18554 R.clearReductionData();
18555 collectSeedInstructions(BB);
18557// Vectorize trees that end at stores. 18558if (!Stores.
empty()) {
18560 <<
" underlying objects.\n");
18561 Changed |= vectorizeStoreChains(R);
18564// Vectorize trees that end at reductions. 18565 Changed |= vectorizeChainsInBlock(BB, R);
18567// Vectorize the index computations of getelementptr instructions. This 18568// is primarily intended to catch gather-like idioms ending at 18569// non-consecutive loads. 18570if (!GEPs.
empty()) {
18572 <<
" underlying objects.\n");
18573 Changed |= vectorizeGEPIndices(BB, R);
18578 R.optimizeGatherSequence();
18586unsignedIdx,
unsigned MinVF,
18591constunsigned Sz = R.getVectorElementSize(Chain[0]);
18592unsigned VF = Chain.
size();
18596 *
TTI, cast<StoreInst>(Chain.
front())->getValueOperand()->getType(),
18598 VF < 2 || VF < MinVF) {
18599// Check if vectorizing with a non-power-of-2 VF should be considered. At 18600// the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost 18601// all vector lanes are used. 18610for (
Value *V : Chain)
18611 ValOps.
insert(cast<StoreInst>(V)->getValueOperand());
18612// Operands are not same/alt opcodes or non-power-of-2 uniques - exit. 18614if (
all_of(ValOps, IsaPred<Instruction>) && ValOps.
size() > 1) {
18616bool IsAllowedSize =
18620if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
18621 (!S.getMainOp()->isSafeToRemove() ||
18624 return !isa<ExtractElementInst>(V) &&
18625 (V->getNumUses() > Chain.size() ||
18626 any_of(V->users(), [&](User *U) {
18627 return !Stores.contains(U);
18630 (ValOps.
size() > Chain.size() / 2 && !S)) {
18631Size = (!IsAllowedSize && S) ? 1 : 2;
18635if (
R.isLoadCombineCandidate(Chain))
18638// Check if tree tiny and store itself or its value is not vectorized. 18639if (
R.isTreeTinyAndNotFullyVectorizable()) {
18640if (
R.isGathered(Chain.front()) ||
18641R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18642return std::nullopt;
18643Size =
R.getCanonicalGraphSize();
18646R.reorderTopToBottom();
18647R.reorderBottomToTop();
18649R.buildExternalUses();
18651R.computeMinimumValueSizes();
18653Size =
R.getCanonicalGraphSize();
18654if (S && S.getOpcode() == Instruction::Load)
18655Size = 2;
// cut off masked gather small trees 18665 cast<StoreInst>(Chain[0]))
18666 <<
"Stores SLP vectorized with cost " <<
NV(
"Cost",
Cost)
18667 <<
" and with tree size " 18668 <<
NV(
"TreeSize",
R.getTreeSize()));
/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  const uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        return V + Size;
      });
  const uint64_t Mean = Sum / Sizes.size();
  if (Mean == 0)
    return true;
  const uint64_t Dev =
      std::accumulate(Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                      [&](uint64_t V,
                          const std::pair<unsigned, unsigned> &Val) {
                        unsigned P = First ? Val.first : Val.second;
                        return V + (P - Mean) * (P - Mean);
                      }) /
      Sizes.size();
  return Dev * 81 / (Mean * Mean) == 0;
}
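// Illustrative arithmetic (not from the source): for tree sizes {4, 4, 4, 4}
// the mean is 4 and the deviation sum is 0, so the check passes; for {2, 16}
// the mean is 9 and Dev = ((2 - 9)^2 + (16 - 9)^2) / 2 = 49, and since
// 49 * 81 / (9 * 9) = 49 != 0 the sizes are treated as too dissimilar.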
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store twice.
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, Distance of the store
  // address relative to base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
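  // Illustrative contents (assumption: i32 stores, so one unit is 4 bytes):
  // for Stores = {store to %p+4, store to %p, store to %p+8} with the first
  // store used as the base, the set holds {1, -1}, {0, 0}, {2, 1}, ordered by
  // the distance component.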
18726auto TryToVectorize = [&](
const StoreIndexToDistSet &
Set) {
18729// Collect the chain into a list. 18733 PrevDist =
Data.second;
18734if (
Idx !=
Set.size() - 1)
18739Operands.push_back(Stores[DataVar.first]);
18740 PrevDist = DataVar.second;
18745 .
insert({Operands.front(),
18746 cast<StoreInst>(Operands.front())->getValueOperand(),
18748 cast<StoreInst>(Operands.back())->getValueOperand(),
18753unsigned MaxVecRegSize =
R.getMaxVecRegSize();
18754unsigned EltSize =
R.getVectorElementSize(
Operands[0]);
18758 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18760Type *StoreTy =
Store->getValueOperand()->getType();
18761Type *ValueTy = StoreTy;
18762if (
auto *Trunc = dyn_cast<TruncInst>(
Store->getValueOperand()))
18763 ValueTy = Trunc->getSrcTy();
18764unsigned MinVF = std::max<unsigned>(
18766R.getMinVF(
DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18769if (MaxVF < MinVF) {
      LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                        << ") < "
                        << "MinVF (" << MinVF << ")\n");
18776unsigned NonPowerOf2VF = 0;
18778// First try vectorizing with a non-power-of-2 VF. At the moment, only 18779// consider cases where VF + 1 is a power-of-2, i.e. almost all vector 18781unsigned CandVF = std::clamp<unsigned>(
Operands.size(), MinVF, MaxVF);
18783 NonPowerOf2VF = CandVF;
18784assert(NonPowerOf2VF != MaxVF &&
18785"Non-power-of-2 VF should not be equal to MaxVF");
18789unsigned MaxRegVF = MaxVF;
18791if (MaxVF < MinVF) {
      LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                        << ") < "
                        << "MinVF (" << MinVF << ")\n");
18800unsignedSize = MinVF;
18802 VF =
Size > MaxVF ? NonPowerOf2VF :
Size;
18806unsigned Repeat = 0;
18807constexprunsigned MaxAttempts = 4;
18809for_each(RangeSizes, [](std::pair<unsigned, unsigned> &
P) {
18810P.first =
P.second = 1;
18813auto IsNotVectorized = [](
boolFirst,
18814const std::pair<unsigned, unsigned> &
P) {
18815returnFirst ?
P.first > 0 :
P.second > 0;
18817auto IsVectorized = [](
boolFirst,
18818const std::pair<unsigned, unsigned> &
P) {
18819returnFirst ?
P.first == 0 :
P.second == 0;
18821auto VFIsProfitable = [](
boolFirst,
unsignedSize,
18822const std::pair<unsigned, unsigned> &
P) {
18825auto FirstSizeSame = [](
unsignedSize,
18826const std::pair<unsigned, unsigned> &
P) {
18827returnSize ==
P.first;
18831bool RepeatChanged =
false;
18832bool AnyProfitableGraph =
false;
18833for (
unsignedSize : CandidateVFs) {
18834 AnyProfitableGraph =
false;
18835unsigned StartIdx = std::distance(
18836 RangeSizes.begin(),
18837find_if(RangeSizes, std::bind(IsNotVectorized,
Size >= MaxRegVF,
18838 std::placeholders::_1)));
18839while (StartIdx <
End) {
18841 std::distance(RangeSizes.begin(),
18842find_if(RangeSizes.drop_front(StartIdx),
18843 std::bind(IsVectorized,
Size >= MaxRegVF,
18844 std::placeholders::_1)));
18845unsigned Sz = EndIdx >=
End ?
End : EndIdx;
18846for (
unsigned Cnt = StartIdx; Cnt +
Size <= Sz;) {
18848Size >= MaxRegVF)) {
18855return cast<StoreInst>(V)
18856 ->getValueOperand()
18858 cast<StoreInst>(Slice.
front())
18859 ->getValueOperand()
18862"Expected all operands of same type.");
18863if (!NonSchedulable.empty()) {
18864auto [NonSchedSizeMax, NonSchedSizeMin] =
18865 NonSchedulable.lookup(Slice.
front());
18866if (NonSchedSizeMax > 0 && NonSchedSizeMin <=
Size) {
18867 Cnt += NonSchedSizeMax;
18872 std::optional<bool> Res =
18873 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18877 .first->getSecond()
18880// Mark the vectorized stores so that we don't vectorize them 18883// Mark the vectorized stores so that we don't vectorize them 18885 AnyProfitableGraph = RepeatChanged = Changed =
true;
18886// If we vectorized initial block, no need to try to vectorize 18889 [](std::pair<unsigned, unsigned> &
P) {
18890 P.first = P.second = 0;
18892if (Cnt < StartIdx + MinVF) {
18893for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18894 [](std::pair<unsigned, unsigned> &
P) {
18895 P.first = P.second = 0;
18897 StartIdx = Cnt +
Size;
18899if (Cnt > Sz -
Size - MinVF) {
18901 [](std::pair<unsigned, unsigned> &
P) {
18902 P.first = P.second = 0;
18911if (
Size > 2 && Res &&
18913 std::bind(VFIsProfitable,
Size >= MaxRegVF, TreeSize,
18914 std::placeholders::_1))) {
18918// Check for the very big VFs that we're not rebuilding same 18919// trees, just with larger number of elements. 18920if (
Size > MaxRegVF && TreeSize > 1 &&
18922 std::bind(FirstSizeSame, TreeSize,
18923 std::placeholders::_1))) {
18925while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18931 [&](std::pair<unsigned, unsigned> &
P) {
18932 if (Size >= MaxRegVF)
18933 P.second = std::max(P.second, TreeSize);
18935 P.first = std::max(P.first, TreeSize);
18938 AnyProfitableGraph =
true;
18942if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18943 AnyProfitableGraph =
true;
18944 StartIdx = std::distance(
18945 RangeSizes.begin(),
18946find_if(RangeSizes.drop_front(Sz),
18947 std::bind(IsNotVectorized,
Size >= MaxRegVF,
18948 std::placeholders::_1)));
18953// All values vectorized - exit. 18954if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
18955returnP.first == 0 &&
P.second == 0;
18958// Check if tried all attempts or no need for the last attempts at all. 18959if (Repeat >= MaxAttempts ||
18960 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18962constexprunsigned StoresLimit = 64;
18963constunsigned MaxTotalNum = std::min<unsigned>(
18965static_cast<unsigned>(
18968 RangeSizes.begin(),
18969find_if(RangeSizes, std::bind(IsNotVectorized,
true,
18970 std::placeholders::_1))) +
18972unsigned VF =
bit_ceil(CandidateVFs.front()) * 2;
18975 CandidateVFs.clear();
18977 CandidateVFs.push_back(Limit);
18978if (VF > MaxTotalNum || VF >= StoresLimit)
18980for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &
P) {
18982P.first = std::max(
P.second,
P.first);
18984// Last attempt to vectorize max number of elements, if all previous 18985// attempts were unsuccessful because of the cost issues. 18986 CandidateVFs.push_back(VF);
18991// Stores pair (first: index of the store into Stores array ref, address of 18992// which taken as base, second: sorted set of pairs {index, dist}, which are 18993// indices of stores in the set and their store location distances relative to 18994// the base address). 18996// Need to store the index of the very first store separately, since the set 18997// may be reordered after the insertion and the first store may be moved. This 18998// container allows to reduce number of calls of getPointersDiff() function. 19000// Inserts the specified store SI with the given index Idx to the set of the 19001// stores. If the store with the same distance is found already - stop 19002// insertion, try to vectorize already found stores. If some stores from this 19003// sequence were not vectorized - try to vectorize them with the new store 19004// later. But this logic is applied only to the stores, that come before the 19005// previous store with the same distance. 19012// - Scan this from the last to first store. The very first bunch of stores is 19013// {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores 19015// - The next store in the list - #1 - has the same distance from store #5 as 19017// - Try to vectorize sequence of stores 4,2,3,5. 19018// - If all these stores are vectorized - just drop them. 19019// - If some of them are not vectorized (say, #3 and #5), do extra analysis. 19020// - Start new stores sequence. 19021// The new bunch of stores is {1, {1, 0}}. 19022// - Add the stores from previous sequence, that were not vectorized. 19023// Here we consider the stores in the reversed order, rather they are used in 19024// the IR (Stores are reversed already, see vectorizeStoreChains() function). 19025// Store #3 can be added -> comes after store #4 with the same distance as 19027// Store #5 cannot be added - comes before store #4. 19028// This logic allows to improve the compile time, we assume that the stores 19029// after previous store with the same distance most likely have memory 19030// dependencies and no need to waste compile time to try to vectorize them. 19031// - Try to vectorize the sequence {1, {1, 0}, {3, 2}}. 19033for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19035 Stores[
Set.first]->getValueOperand()->getType(),
19036 Stores[
Set.first]->getPointerOperand(),
19037SI->getValueOperand()->getType(),
SI->getPointerOperand(), *
DL, *SE,
19038/*StrictCheck=*/true);
19041auto It =
Set.second.find(std::make_pair(
Idx, *Diff));
19042if (It ==
Set.second.end()) {
19043Set.second.emplace(
Idx, *Diff);
19046// Try to vectorize the first found set to avoid duplicate analysis. 19047 TryToVectorize(
Set.second);
19048unsigned ItIdx = It->first;
19049int ItDist = It->second;
19050 StoreIndexToDistSet PrevSet;
19051copy_if(
Set.second, std::inserter(PrevSet, PrevSet.end()),
19052 [&](
const std::pair<unsigned, int> &Pair) {
19053 return Pair.first > ItIdx;
19057Set.second.emplace(
Idx, 0);
19058// Insert stores that followed previous match to try to vectorize them 19060unsigned StartIdx = ItIdx + 1;
19062// Distances to previously found dup store (or this store, since they 19063// store to the same addresses). 19065for (
const std::pair<unsigned, int> &Pair :
reverse(PrevSet)) {
19066// Do not try to vectorize sequences, we already tried. 19067if (VectorizedStores.
contains(Stores[Pair.first]))
19069unsigned BI = Pair.first - StartIdx;
19070 UsedStores.set(BI);
19071 Dists[BI] = Pair.second - ItDist;
19073for (
unsignedI = StartIdx;
I <
Idx; ++
I) {
19074unsigned BI =
I - StartIdx;
19075if (UsedStores.test(BI))
19076Set.second.emplace(
I, Dists[BI]);
19080auto &Res = SortedStores.emplace_back();
19082 Res.second.emplace(
Idx, 0);
19084Type *PrevValTy =
nullptr;
19086if (
R.isDeleted(SI))
19089 PrevValTy =
SI->getValueOperand()->getType();
19090// Check that we do not try to vectorize stores of different types. 19091if (PrevValTy !=
SI->getValueOperand()->getType()) {
19092for (
auto &Set : SortedStores)
19093 TryToVectorize(
Set.second);
19094 SortedStores.clear();
19095 PrevValTy =
SI->getValueOperand()->getType();
19097 FillStoresSet(
I, SI);
19100// Final vectorization attempt. 19101for (
auto &Set : SortedStores)
19102 TryToVectorize(
Set.second);
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();
  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }
    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
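// Illustrative seeds (not from the source): in
//   store i32 %a, ptr %p
//   %g = getelementptr inbounds i32, ptr %q, i64 %i
// the store is bucketed in Stores under the underlying object of %p, and %g
// qualifies as a GEP seed because it has a single, non-constant index, while
// `getelementptr i32, ptr %q, i64 4` would be skipped because its index is a
// constant.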
19149LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = " 19150 << VL.
size() <<
".\n");
19152// Check that all of the parts are instructions of the same type, 19153// we permit an alternate opcode via InstructionsState. 19159// Make sure invalid types (including vector type) are rejected before 19160// determining vectorization factor for scalar instructions. 19161for (
Value *V : VL) {
19162Type *Ty =
V->getType();
19164// NOTE: the following will give user internal llvm type name, which may 19166R.getORE()->emit([&]() {
19167 std::string TypeStr;
19171 <<
"Cannot SLP vectorize list: type " 19172 << TypeStr +
" is unsupported by vectorizer";
19179unsigned Sz =
R.getVectorElementSize(I0);
19180unsigned MinVF =
R.getMinVF(Sz);
19181unsigned MaxVF = std::max<unsigned>(
19183 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19185R.getORE()->emit([&]() {
19187 <<
"Cannot SLP vectorize list: vectorization factor " 19188 <<
"less than 2 is not supported";
19193bool Changed =
false;
19194bool CandidateFound =
false;
19197unsigned NextInst = 0, MaxInst = VL.size();
19198for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19200// No actual vectorization should happen, if number of parts is the same as 19201// provided vectorization factor (i.e. the scalar type is used for vector 19202// code during codegen). 19206for (
unsignedI = NextInst;
I < MaxInst; ++
I) {
19207unsigned ActualVF = std::min(MaxInst -
I, VF);
19212if (MaxVFOnly && ActualVF < MaxVF)
19214if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
19219for (
Value *V : VL.drop_front(
I)) {
19220// Check that a previous iteration of this loop did not delete the 19222if (
auto *Inst = dyn_cast<Instruction>(V);
19223 !Inst || !
R.isDeleted(Inst)) {
19226if (
Idx == ActualVF)
19230// Not enough vectorizable instructions - exit. 19231if (
Idx != ActualVF)
19234LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations " 19238if (
R.isTreeTinyAndNotFullyVectorizable())
19240R.reorderTopToBottom();
19241R.reorderBottomToTop(
19242/*IgnoreReorder=*/!isa<InsertElementInst>(Ops.
front()) &&
19243 !
R.doesRootHaveInTreeUses());
19245R.buildExternalUses();
19247R.computeMinimumValueSizes();
19249 CandidateFound =
true;
19250 MinCost = std::min(MinCost,
Cost);
19253 <<
" for VF=" << ActualVF <<
"\n");
19257 cast<Instruction>(Ops[0]))
19258 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
19259 <<
" and with tree size " 19260 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
19263// Move to the next bundle. 19271if (!Changed && CandidateFound) {
19272R.getORE()->emit([&]() {
19274 <<
"List vectorization was possible but not beneficial with cost " 19275 <<
ore::NV(
"Cost", MinCost) <<
" >= " 19278 }
elseif (!Changed) {
19279R.getORE()->emit([&]() {
19281 <<
"Cannot SLP vectorize list: vectorization was impossible" 19282 <<
" with available vectorization factors";
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
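// Illustrative example (not from the source): for
//   %s = add i32 %a, %b
//   %t = add i32 %c, %d
//   %r = add i32 %s, %t
// the candidate root pairs are {%s, %t} and, because %t has one use, the
// "skip one side" pairs {%s, %c} and {%s, %d} (when %c/%d are binary ops in
// the same block); findBestRootPair picks the best-scoring pair before calling
// tryToVectorizeList.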
19342/// Model horizontal reductions. 19344/// A horizontal reduction is a tree of reduction instructions that has values 19345/// that can be put into a vector as its leaves. For example: 19352/// This tree has "mul" as its leaf values and "+" as its reduction 19353/// instructions. A reduction can feed into a store or a binary operation 19371 ReductionOpsListType ReductionOps;
19372 /// List of possibly reduced values. 19374 /// Maps reduced value to the corresponding reduction operation. 19377 /// The type of reduction operation. 19379 /// Checks if the optimization of original scalar identity operations on 19380 /// matched horizontal reductions is enabled and allowed. 19381bool IsSupportedHorRdxIdentityOp =
false;
19388// And/or are potentially poison-safe logical patterns like: 19389// select x, y, false 19390// select x, true, y 19392return isa<SelectInst>(
I) &&
19396 /// Checks if instruction is associative and can be vectorized. 19398if (Kind == RecurKind::None)
19401// Integer ops that map to select instructions or intrinsics are fine. 19406if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19407// FP min/max are associative except for NaN and -0.0. We do not 19408// have to rule out -0.0 here because the intrinsic semantics do not 19409// specify a fixed result for it. 19410returnI->getFastMathFlags().noNaNs();
19413if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19416returnI->isAssociative();
19420// Poison-safe 'or' takes the form: select X, true, Y 19421// To make that work with the normal operand processing, we skip the 19422// true value operand. 19423// TODO: Change the code and data structures to handle this without a hack. 19425returnI->getOperand(2);
19426returnI->getOperand(
Index);
19429 /// Creates reduction operation with the current opcode. 19433case RecurKind::Or: {
19441case RecurKind::And: {
19449case RecurKind::Add:
19450case RecurKind::Mul:
19451case RecurKind::Xor:
19452case RecurKind::FAdd:
19453case RecurKind::FMul: {
19458case RecurKind::SMax:
19459case RecurKind::SMin:
19460case RecurKind::UMax:
19461case RecurKind::UMin:
19468case RecurKind::FMax:
19469case RecurKind::FMin:
19470case RecurKind::FMaximum:
19471case RecurKind::FMinimum: {
19480 /// Creates reduction operation with the current opcode with the IR flags 19481 /// from \p ReductionOps, dropping nuw/nsw flags. 19484const ReductionOpsListType &ReductionOps) {
19485bool UseSelect = ReductionOps.size() == 2 ||
19487 (ReductionOps.size() == 1 &&
19488any_of(ReductionOps.front(), IsaPred<SelectInst>));
19489assert((!UseSelect || ReductionOps.size() != 2 ||
19490 isa<SelectInst>(ReductionOps[1][0])) &&
19491"Expected cmp + select pairs for reduction");
19494if (
auto *Sel = dyn_cast<SelectInst>(
Op)) {
19496/*IncludeWrapFlags=*/false);
19498/*IncludeWrapFlags=*/false);
19508auto *
I = dyn_cast<Instruction>(V);
19510return RecurKind::None;
19512return RecurKind::Add;
19514return RecurKind::Mul;
19517return RecurKind::And;
19520return RecurKind::Or;
19522return RecurKind::Xor;
19524return RecurKind::FAdd;
19526return RecurKind::FMul;
19529return RecurKind::FMax;
19531return RecurKind::FMin;
19534return RecurKind::FMaximum;
19536return RecurKind::FMinimum;
19537// This matches either cmp+select or intrinsics. SLP is expected to handle 19539// TODO: If we are canonicalizing to intrinsics, we can remove several 19540// special-case paths that deal with selects. 19542return RecurKind::SMax;
19544return RecurKind::SMin;
19546return RecurKind::UMax;
19548return RecurKind::UMin;
19550if (
auto *
Select = dyn_cast<SelectInst>(
I)) {
19551// Try harder: look for min/max pattern based on instructions producing 19552// same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). 19553// During the intermediate stages of SLP, it's very common to have 19554// pattern like this (since optimizeGatherSequence is run only once 19556// %1 = extractelement <2 x i32> %a, i32 0 19557// %2 = extractelement <2 x i32> %a, i32 1 19558// %cond = icmp sgt i32 %1, %2 19559// %3 = extractelement <2 x i32> %a, i32 0 19560// %4 = extractelement <2 x i32> %a, i32 1 19561// %select = select i1 %cond, i32 %3, i32 %4 19570// TODO: Support inverse predicates. 19572if (!isa<ExtractElementInst>(
RHS) ||
19574return RecurKind::None;
19576if (!isa<ExtractElementInst>(
LHS) ||
19578return RecurKind::None;
19580if (!isa<ExtractElementInst>(
LHS) || !isa<ExtractElementInst>(
RHS))
19581return RecurKind::None;
19585return RecurKind::None;
19590return RecurKind::None;
19593return RecurKind::SMax;
19596return RecurKind::SMin;
19599return RecurKind::UMax;
19602return RecurKind::UMin;
19605return RecurKind::None;
  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }
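  // Illustrative use counts (not from the source): in one step of a cmp+select
  // smin reduction
  //   %c = icmp slt i32 %x, %acc
  //   %m = select i1 %c, i32 %x, i32 %acc
  // %m feeds both the compare and the select of the next step (two uses),
  // while %c is used only by its select, which is exactly what
  // hasRequiredNumberOfUses checks when IsCmpSelMinMax is true.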
  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }
19663int Sz = Data.size();
19664auto *
I = dyn_cast<Instruction>(Data.front());
19672 /// Try to find a reduction tree. 19676 RdxKind = HorizontalReduction::getRdxKind(Root);
19677if (!isVectorizable(RdxKind, Root))
19680// Analyze "regular" integer/FP types for reductions - no target-specific 19681// types or pointers. 19686// Though the ultimate reduction may have multiple uses, its condition must 19687// have only single use. 19688if (
auto *Sel = dyn_cast<SelectInst>(Root))
19689if (!Sel->getCondition()->hasOneUse())
19692 ReductionRoot = Root;
19694// Iterate through all the operands of the possible reduction tree and 19695// gather all the reduced values, sorting them by their value id. 19697bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19699 1, std::make_pair(Root, 0));
19700// Checks if the operands of the \p TreeN instruction are also reduction 19701// operations or should be treated as reduced values or an extra argument, 19702// which is not part of the reduction. 19707for (
intI :
reverse(seq<int>(getFirstOperandIndex(TreeN),
19708 getNumberOfOperands(TreeN)))) {
19709Value *EdgeVal = getRdxOperand(TreeN,
I);
19710 ReducedValsToOps[EdgeVal].push_back(TreeN);
19711auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19712// If the edge is not an instruction, or it is different from the main 19713// reduction opcode or has too many uses - possible reduced value. 19714// Also, do not try to reduce const values, if the operation is not 19718 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19719 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19720 !isVectorizable(RdxKind, EdgeInst) ||
19721 (
R.isAnalyzedReductionRoot(EdgeInst) &&
19722all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19723 PossibleReducedVals.push_back(EdgeVal);
19726 ReductionOps.push_back(EdgeInst);
19729// Try to regroup reduced values so that it gets more profitable to try to 19730// reduce them. Values are grouped by their value ids, instructions - by 19731// instruction op id and/or alternate op id, plus do extra analysis for 19732// loads (grouping them by the distabce between pointers) and cmp 19733// instructions (grouping them by the predicate). 19737 PossibleReducedVals;
19738 initReductionOps(Root);
19742auto GenerateLoadsSubkey = [&](
size_tKey,
LoadInst *LI) {
19746if (!LoadKeyUsed.
insert(Key).second) {
19747auto LIt = LoadsMap.
find(std::make_pair(Key,
Ptr));
19748if (LIt != LoadsMap.
end()) {
19749for (
LoadInst *RLI : LIt->second) {
19752/*StrictCheck=*/true))
19755for (
LoadInst *RLI : LIt->second) {
19762if (LIt->second.size() > 2) {
19764hash_value(LIt->second.back()->getPointerOperand());
19770 .first->second.push_back(LI);
19774while (!Worklist.empty()) {
19775auto [TreeN, Level] = Worklist.pop_back_val();
19778 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19779 addReductionOps(TreeN);
19780// Add reduction values. The values are sorted for better vectorization 19782for (
Value *V : PossibleRedVals) {
19785/*AllowAlternate=*/false);
19786 ++PossibleReducedVals[
Key][
Idx]
19787 .
insert(std::make_pair(V, 0))
19791 Worklist.emplace_back(
I,
I->getParent() == BB ? 0 : Level + 1);
19793auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
19794// Sort values by the total number of values kinds to start the reduction 19795// from the longest possible reduced values sequences. 19796for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
19797auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
19799for (
auto It = PossibleRedVals.begin(),
E = PossibleRedVals.end();
19802auto RedValsVect = It->second.takeVector();
19804for (
const std::pair<Value *, unsigned> &Data : RedValsVect)
19805 PossibleRedValsVect.
back().append(Data.second, Data.first);
19807stable_sort(PossibleRedValsVect, [](
constauto &P1,
constauto &P2) {
19808returnP1.size() > P2.size();
19813 (!isGoodForReduction(Data) &&
19814 (!isa<LoadInst>(Data.front()) ||
19815 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19817 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19819 cast<LoadInst>(ReducedVals[NewIdx].front())
19821 NewIdx = ReducedVals.
size();
19824 ReducedVals[NewIdx].
append(Data.rbegin(), Data.rend());
19827// Sort the reduced values by number of same/alternate opcode and/or pointer 19830returnP1.size() > P2.
size();
19835 /// Attempt to vectorize the tree found by matchAssociativeReduction. 19839constexprunsigned RegMaxNumber = 4;
19840constexprunsigned RedValsMaxNumber = 128;
19841// If there are a sufficient number of reduction values, reduce 19842// to a nearby power-of-2. We can safely generate oversized 19843// vectors and rely on the backend to split them to legal sizes. 19844if (
unsigned NumReducedVals = std::accumulate(
19845 ReducedVals.
begin(), ReducedVals.
end(), 0,
19847 if (!isGoodForReduction(Vals))
19849 return Num + Vals.size();
19851 NumReducedVals < ReductionLimit &&
19855for (ReductionOpsType &RdxOps : ReductionOps)
19856for (
Value *RdxOp : RdxOps)
19857V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19865// Track the reduced values in case if they are replaced by extractelement 19866// because of the vectorization. 19868 ReducedVals.
front().size());
19870// The compare instruction of a min/max is the insertion point for new 19871// instructions and may be replaced with a new compare instruction. 19872auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
19873assert(isa<SelectInst>(RdxRootInst) &&
19874"Expected min/max reduction to have select root instruction");
19875Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19876assert(isa<Instruction>(ScalarCond) &&
19877"Expected min/max reduction to have compare condition");
19878return cast<Instruction>(ScalarCond);
19881bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
19882 return isBoolLogicOp(cast<Instruction>(V));
19884// Return new VectorizedTree, based on previous value. 19885auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
19886if (VectorizedTree) {
19887// Update the final value in the reduction. 19889 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19890if (AnyBoolLogicOp) {
19891auto It = ReducedValsToOps.
find(VectorizedTree);
19892auto It1 = ReducedValsToOps.
find(Res);
19893if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
19895 (It != ReducedValsToOps.
end() &&
19897 return isBoolLogicOp(I) &&
19898 getRdxOperand(I, 0) == VectorizedTree;
19902 (It1 != ReducedValsToOps.
end() &&
19904 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19908 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
19912return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
19915// Initialize the final value in the reduction. 19919 ReductionOps.front().size());
19920for (ReductionOpsType &RdxOps : ReductionOps)
19921for (
Value *RdxOp : RdxOps) {
19924 IgnoreList.insert(RdxOp);
19926// Intersect the fast-math-flags from all reduction operations. 19929for (
Value *U : IgnoreList)
19930if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
19931 RdxFMF &= FPMO->getFastMathFlags();
19932bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19934// Need to track reduced vals, they may be changed during vectorization of 19937for (
Value *V : Candidates)
19938 TrackedVals.try_emplace(V, V);
19942auto *It = MV.
find(V);
19943assert(It != MV.
end() &&
"Unable to find given key.");
19948// List of the values that were reduced in other trees as part of gather 19949// nodes and thus requiring extract if fully vectorized in other trees. 19952bool CheckForReusedReductionOps =
false;
19953// Try to vectorize elements based on their type. 19957for (
unsignedI = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
19959 InstructionsState S = States[
I];
19963for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
19964Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19965// Check if the reduction value was not overriden by the extractelement 19966// instruction because of the vectorization and exclude it, if it is not 19967// compatible with other values. 19968// Also check if the instruction was folded to constant/other value. 19969auto *Inst = dyn_cast<Instruction>(RdxVal);
19971 (!S || !S.isOpcodeOrAlt(Inst))) ||
19975 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19977bool ShuffledExtracts =
false;
19978// Try to handle shuffled extractelements. 19979if (S && S.getOpcode() == Instruction::ExtractElement &&
19980 !S.isAltShuffle() &&
I + 1 <
E) {
19982for (
Value *RV : ReducedVals[
I + 1]) {
19983Value *RdxVal = TrackedVals.at(RV);
19984// Check if the reduction value was not overriden by the 19985// extractelement instruction because of the vectorization and 19986// exclude it, if it is not compatible with other values. 19987auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19990 CommonCandidates.push_back(RdxVal);
19991 TrackedToOrig.try_emplace(RdxVal, RV);
19996 Candidates.
swap(CommonCandidates);
19997 ShuffledExtracts =
true;
20001// Emit code for constant values. 20004Value *OrigV = TrackedToOrig.at(Candidates.
front());
20005 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20007 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
20008Value *OrigV = TrackedToOrig.at(VC);
20009 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20010if (
auto *ResI = dyn_cast<Instruction>(Res))
20011V.analyzedReductionRoot(ResI);
20013 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
20017unsigned NumReducedVals = Candidates.
size();
20018if (NumReducedVals < ReductionLimit &&
20019 (NumReducedVals < 2 || !
isSplat(Candidates)))
20022// Check if we support repeated scalar values processing (optimization of 20023// original scalar identity operations on matched horizontal reductions). 20024 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20025 RdxKind != RecurKind::FMul &&
20026 RdxKind != RecurKind::FMulAdd;
20027// Gather same values. 20029if (IsSupportedHorRdxIdentityOp)
20030for (
Value *V : Candidates) {
20031Value *OrigV = TrackedToOrig.at(V);
20032 ++SameValuesCounter.
try_emplace(OrigV).first->second;
20034// Used to check if the reduced values used same number of times. In this 20035// case the compiler may produce better code. E.g. if reduced values are 20036// aabbccdd (8 x values), then the first node of the tree will have a node 20037// for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>. 20038// Plus, the final reduction will be performed on <8 x aabbccdd>. 20039// Instead compiler may build <4 x abcd> tree immediately, + reduction (4 20041// Currently it only handles add/fadd/xor. and/or/min/max do not require 20042// this analysis, other operations may require an extra estimation of 20043// the profitability. 20044bool SameScaleFactor =
false;
20045bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20046 SameValuesCounter.
size() != Candidates.size();
20048if (OptReusedScalars) {
20050 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20051 RdxKind == RecurKind::Xor) &&
20053 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
20054returnP.second == SameValuesCounter.
front().second;
20056 Candidates.resize(SameValuesCounter.
size());
20057transform(SameValuesCounter, Candidates.begin(),
20058 [&](
constauto &
P) { return TrackedVals.at(P.first); });
20059 NumReducedVals = Candidates.size();
20060// Have a reduction of the same element. 20061if (NumReducedVals == 1) {
20062Value *OrigV = TrackedToOrig.at(Candidates.front());
20063unsigned Cnt = At(SameValuesCounter, OrigV);
20065 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20066 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20067 VectorizedVals.try_emplace(OrigV, Cnt);
20068 ExternallyUsedValues.
insert(OrigV);
20073unsigned MaxVecRegSize =
V.getMaxVecRegSize();
20074unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
20075constunsigned MaxElts = std::clamp<unsigned>(
20077 RegMaxNumber * RedValsMaxNumber);
20079unsigned ReduxWidth = NumReducedVals;
20080auto GetVectorFactor = [&, &
TTI = *
TTI](
unsigned ReduxWidth) {
20081unsigned NumParts, NumRegs;
20082Type *ScalarTy = Candidates.front()->getType();
20089while (NumParts > NumRegs) {
20090assert(ReduxWidth > 0 &&
"ReduxWidth is unexpectedly 0.");
20097if (NumParts > NumRegs / 2)
20102 ReduxWidth = GetVectorFactor(ReduxWidth);
20103 ReduxWidth = std::min(ReduxWidth, MaxElts);
20106unsigned Pos = Start;
20107// Restarts vectorization attempt with lower vector factor. 20108unsigned PrevReduxWidth = ReduxWidth;
20109bool CheckForReusedReductionOpsLocal =
false;
20110auto AdjustReducedVals = [&](
bool IgnoreVL =
false) {
20111bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
20112if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20113// Check if any of the reduction ops are gathered. If so, worth 20114// trying again with less number of reduction ops. 20115 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20118if (Pos < NumReducedVals - ReduxWidth + 1)
20119return IsAnyRedOpGathered;
20123 ReduxWidth = GetVectorFactor(ReduxWidth);
20124return IsAnyRedOpGathered;
20126bool AnyVectorized =
false;
20128while (Pos < NumReducedVals - ReduxWidth + 1 &&
20129 ReduxWidth >= ReductionLimit) {
20130// Dependency in tree of the reduction ops - drop this attempt, try 20132if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20134 CheckForReusedReductionOps =
true;
20137 PrevReduxWidth = ReduxWidth;
20139// Been analyzed already - skip. 20140if (IgnoredCandidates.
contains(std::make_pair(Pos, ReduxWidth)) ||
20143 std::make_pair(Pos,
bit_floor(ReduxWidth))) ||
20145 std::make_pair(Pos + (ReduxWidth -
bit_floor(ReduxWidth)),
20147V.areAnalyzedReductionVals(VL)) {
20148 (void)AdjustReducedVals(
/*IgnoreVL=*/true);
20151// Early exit if any of the reduction values were deleted during 20152// previous vectorization attempts. 20154auto *RedValI = dyn_cast<Instruction>(RedVal);
20157returnV.isDeleted(RedValI);
20160V.buildTree(VL, IgnoreList);
20161if (
V.isTreeTinyAndNotFullyVectorizable(
/*ForReduction=*/true)) {
20162if (!AdjustReducedVals())
20163V.analyzedReductionVals(VL);
20166if (
V.isLoadCombineReductionCandidate(RdxKind)) {
20167if (!AdjustReducedVals())
20168V.analyzedReductionVals(VL);
20171V.reorderTopToBottom();
20172// No need to reorder the root node at all. 20173V.reorderBottomToTop(
/*IgnoreReorder=*/true);
20174// Keep extracted other reduction values, if they are used in the 20175// vectorization trees. 20177 ExternallyUsedValues);
20178// The reduction root is used as the insertion point for new 20179// instructions, so set it as externally used to prevent it from being 20181 LocalExternallyUsedValues.insert(ReductionRoot);
20182for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
20183if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
20185for (
Value *V : ReducedVals[Cnt])
20186if (isa<Instruction>(V))
20187 LocalExternallyUsedValues.insert(TrackedVals[V]);
20189if (!IsSupportedHorRdxIdentityOp) {
20190// Number of uses of the candidates in the vector of values. 20192"Reused values counter map is not empty");
20193for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20194if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20196Value *
V = Candidates[Cnt];
20197Value *OrigV = TrackedToOrig.at(V);
20198 ++SameValuesCounter.
try_emplace(OrigV).first->second;
20203// Gather externally used values. 20205for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20206if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20208Value *RdxVal = Candidates[Cnt];
20209if (
auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20210 RdxVal = It->second;
20211if (!Visited.
insert(RdxVal).second)
20213// Check if the scalar was vectorized as part of the vectorization 20214// tree but not the top node. 20215if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
20216 LocalExternallyUsedValues.insert(RdxVal);
20219Value *OrigV = TrackedToOrig.at(RdxVal);
20221 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20222if (NumOps != ReducedValsToOps.
at(OrigV).size())
20223 LocalExternallyUsedValues.insert(RdxVal);
20225// Do not need the list of reused scalars in regular mode anymore. 20226if (!IsSupportedHorRdxIdentityOp)
20227 SameValuesCounter.
clear();
20228for (
Value *RdxVal : VL)
20229if (RequiredExtract.
contains(RdxVal))
20230 LocalExternallyUsedValues.insert(RdxVal);
20231V.buildExternalUses(LocalExternallyUsedValues);
20233V.computeMinimumValueSizes();
20238 getReductionCost(
TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20241 <<
" for reduction\n");
20245V.getORE()->emit([&]() {
20247 ReducedValsToOps.
at(VL[0]).front())
20248 <<
"Vectorizing horizontal reduction is possible " 20249 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
20250 <<
" and threshold " 20253if (!AdjustReducedVals()) {
20254V.analyzedReductionVals(VL);
20255unsignedOffset = Pos == Start ? Pos : Pos - 1;
20256if (ReduxWidth > ReductionLimit &&
V.isTreeNotExtendable()) {
20257// Add subvectors of VL to the list of the analyzed values. 20259 *
TTI, VL.front()->getType(), ReduxWidth - 1);
20260 VF >= ReductionLimit;
20262 *
TTI, VL.front()->getType(), VF - 1)) {
20264V.getCanonicalGraphSize() !=
V.getTreeSize())
20266for (
unsignedIdx : seq<unsigned>(ReduxWidth - VF))
20274LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:" 20275 <<
Cost <<
". (HorRdx)\n");
20276V.getORE()->emit([&]() {
20278 ReducedValsToOps.
at(VL[0]).front())
20279 <<
"Vectorized horizontal reduction with cost " 20281 <<
ore::NV(
"TreeSize",
V.getTreeSize());
20286// Emit a reduction. If the root is a select (min/max idiom), the insert 20287// point is the compare condition of that select. 20288Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20291 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20293// Vectorize a tree. 20294Value *VectorizedRoot =
20295V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20296// Update TrackedToOrig mapping, since the tracked values might be 20298for (
Value *RdxVal : Candidates) {
20299Value *OrigVal = TrackedToOrig.at(RdxVal);
20300Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20301if (TransformedRdxVal != RdxVal)
20302 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20307// To prevent poison from leaking across what used to be sequential, 20308// safe, scalar boolean logic operations, the reduction operand must be 20311 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
20313// Emit code to correctly handle reused reduced values, if required. 20314if (OptReusedScalars && !SameScaleFactor) {
20315 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20316 SameValuesCounter, TrackedToOrig);
20319Value *ReducedSubTree;
20320Type *ScalarTy = VL.front()->getType();
20321if (isa<FixedVectorType>(ScalarTy)) {
20326for (
unsignedI : seq<unsigned>(ScalarTyNumElements)) {
20327// Do reduction for each lane. 20328// e.g., do reduce add for 20329// VL[0] = <4 x Ty> <a, b, c, d> 20330// VL[1] = <4 x Ty> <e, f, g, h> 20331// Lane[0] = <2 x Ty> <a, e> 20332// Lane[1] = <2 x Ty> <b, f> 20333// Lane[2] = <2 x Ty> <c, g> 20334// Lane[3] = <2 x Ty> <d, h> 20335// result[0] = reduce add Lane[0] 20336// result[1] = reduce add Lane[1] 20337// result[2] = reduce add Lane[2] 20338// result[3] = reduce add Lane[3] 20344 emitReduction(Lane, Builder,
TTI, RdxRootInst->
getType()),
I);
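// Illustrative sketch only (hypothetical, not part of the pass): the per-lane
// reduction described in the comment above, written out with plain arrays.
// Each lane gathers element I of every reduced value and is reduced on its
// own, so the result is itself a small vector. Assumes <array>.
[[maybe_unused]] const auto PerLaneReduceSketch = [] {
  const std::array<std::array<int, 4>, 2> VLVals = {{{1, 2, 3, 4},   // <a, b, c, d>
                                                     {5, 6, 7, 8}}}; // <e, f, g, h>
  std::array<int, 4> Result{};
  for (unsigned Lane = 0; Lane < 4; ++Lane)
    for (const auto &Val : VLVals)
      Result[Lane] += Val[Lane]; // reduce add over lane Lane of every value
  return Result;                 // {a+e, b+f, c+g, d+h}
};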
20347 ReducedSubTree = emitReduction(VectorizedRoot, Builder,
TTI,
20350if (ReducedSubTree->
getType() != VL.front()->getType()) {
20351assert(ReducedSubTree->
getType() != VL.front()->getType() &&
20352"Expected different reduction type.");
20354 Builder.
CreateIntCast(ReducedSubTree, VL.front()->getType(),
20355V.isSignedMinBitwidthRootNode());
20358// Improved analysis for add/fadd/xor reductions with same scale factor 20359// for all operands of reductions. We can emit scalar ops for them 20361if (OptReusedScalars && SameScaleFactor)
20362 ReducedSubTree = emitScaleForReusedOps(
20363 ReducedSubTree, Builder, SameValuesCounter.
front().second);
20365 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20366// Count vectorized reduced values to exclude them from final reduction. 20367for (
Value *RdxVal : VL) {
20368Value *OrigV = TrackedToOrig.at(RdxVal);
20369if (IsSupportedHorRdxIdentityOp) {
20370 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20373 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20374if (!
V.isVectorized(RdxVal))
20375 RequiredExtract.
insert(RdxVal);
20379 ReduxWidth = NumReducedVals - Pos;
20381 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20382 AnyVectorized =
true;
20384if (OptReusedScalars && !AnyVectorized) {
20385for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
20386Value *RdxVal = TrackedVals.at(
P.first);
20387Value *RedVal = emitScaleForReusedOps(RdxVal, Builder,
P.second);
20388 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20389 VectorizedVals.try_emplace(
P.first,
P.second);
20394if (VectorizedTree) {
20395// Reorder operands of bool logical op in the natural order to avoid 20396// possible problem with poison propagation. If not possible to reorder 20397// (both operands are originally RHS), emit an extra freeze instruction 20398// for the LHS operand. 20399// I.e., if we have original code like this: 20400// RedOp1 = select i1 ?, i1 LHS, i1 false 20401// RedOp2 = select i1 RHS, i1 ?, i1 false 20403// Then, we swap LHS/RHS to create a new op that matches the poison 20404// semantics of the original code. 20406// If we have original code like this and both values could be poison: 20407// RedOp1 = select i1 ?, i1 LHS, i1 false 20408// RedOp2 = select i1 ?, i1 RHS, i1 false 20410// Then, we must freeze LHS in the new op. 20415if (!AnyBoolLogicOp)
20417if (isBoolLogicOp(RedOp1) && ((!InitStep &&
LHS == VectorizedTree) ||
20418 getRdxOperand(RedOp1, 0) ==
LHS ||
20421if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
20422 getRdxOperand(RedOp2, 0) ==
RHS ||
20427if (
LHS != VectorizedTree)
20430// Finish the reduction. 20431// Need to add extra arguments and not vectorized possible reduction 20433// Try to avoid dependencies between the scalar remainders after 20438unsigned Sz = InstVals.
size();
20441for (
unsignedI = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
20444Value *RdxVal1 = InstVals[
I].second;
20445Value *StableRdxVal1 = RdxVal1;
20446auto It1 = TrackedVals.find(RdxVal1);
20447if (It1 != TrackedVals.end())
20448 StableRdxVal1 = It1->second;
20449Value *RdxVal2 = InstVals[
I + 1].second;
20450Value *StableRdxVal2 = RdxVal2;
20451auto It2 = TrackedVals.find(RdxVal2);
20452if (It2 != TrackedVals.end())
20453 StableRdxVal2 = It2->second;
20454// To prevent poison from leaking across what used to be 20455// sequential, safe, scalar boolean logic operations, the 20456// reduction operand must be frozen. 20457 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
20459Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20460 StableRdxVal2,
"op.rdx", ReductionOps);
20461 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
20464 ExtraReds[Sz / 2] = InstVals.
back();
20468 ExtraReductions.
emplace_back(cast<Instruction>(ReductionRoot),
20472for (
Value *RdxVal : Candidates) {
20473if (!Visited.
insert(RdxVal).second)
20475unsigned NumOps = VectorizedVals.lookup(RdxVal);
20481// Iterate through all not-vectorized reduction values/extra arguments. 20482bool InitStep =
true;
20483while (ExtraReductions.
size() > 1) {
20485 FinalGen(ExtraReductions, InitStep);
20486 ExtraReductions.
swap(NewReds);
20489 VectorizedTree = ExtraReductions.
front().second;
20491 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20493// The original scalar reduction is expected to have no remaining 20494// uses outside the reduction tree itself. Assert that we got this 20495// correct, replace internal uses with undef, and mark for eventual 20500 IgnoreSet.
insert(RdxOps.begin(), RdxOps.end());
20507for (
auto *U :
Ignore->users()) {
"All users must be in the reduction ops list.");
20512if (!
Ignore->use_empty()) {
20514Ignore->replaceAllUsesWith(
P);
20517V.removeInstructionsAndOperands(RdxOps);
20519 }
elseif (!CheckForReusedReductionOps) {
20520for (ReductionOpsType &RdxOps : ReductionOps)
20521for (
Value *RdxOp : RdxOps)
20522V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20524return VectorizedTree;
20528 /// Calculate the cost of a reduction. 20534Type *ScalarTy = ReducedVals.
front()->getType();
20535unsigned ReduxWidth = ReducedVals.
size();
20538// If all of the reduced values are constant, the vector cost is 0, since 20539// the reduction value can be calculated at the compile time. 20543// Scalar cost is repeated for N-1 elements. 20544int Cnt = ReducedVals.
size();
20545for (
Value *RdxVal : ReducedVals) {
20550Cost += GenCostFn();
20555auto *RdxOp = cast<Instruction>(U);
20556if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20566Cost += GenCostFn();
20571case RecurKind::Add:
20572case RecurKind::Mul:
20574case RecurKind::And:
20575case RecurKind::Xor:
20576case RecurKind::FAdd:
20577case RecurKind::FMul: {
20580if (
auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20583for (
unsignedI : seq<unsigned>(ReducedVals.size())) {
20595auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
20596 std::make_pair(RedTy,
true));
20597if (RType == RedTy) {
20607 ScalarCost = EvaluateScalarCost([&]() {
20612case RecurKind::FMax:
20613case RecurKind::FMin:
20614case RecurKind::FMaximum:
20615case RecurKind::FMinimum:
20616case RecurKind::SMax:
20617case RecurKind::SMin:
20618case RecurKind::UMax:
20619case RecurKind::UMin: {
20623 ScalarCost = EvaluateScalarCost([&]() {
20633LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " << VectorCost - ScalarCost
20635 <<
" (It is a splitting reduction)\n");
20636return VectorCost - ScalarCost;
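// Illustrative sketch only (hypothetical numbers, not the pass's cost model):
// reducing N scalars with one binary op per step takes N - 1 scalar ops, so
// the difference returned above is negative exactly when the vectorized tree
// is cheaper than those scalar operations; the pass additionally compares it
// against the slp-threshold option before committing.
[[maybe_unused]] const auto ReductionCostSketch =
    [](unsigned NumReducedVals, int ScalarOpCost, int VectorTreeCost) {
  const int ScalarCost = int(NumReducedVals - 1) * ScalarOpCost;
  return VectorTreeCost - ScalarCost; // negative means profitable
};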
20639 /// Emit a horizontal reduction of the vectorized value. 20642assert(VectorizedValue &&
"Need to have a vectorized tree node");
20643assert(RdxKind != RecurKind::FMulAdd &&
20644"A call to the llvm.fmuladd intrinsic is not handled yet");
20646auto *FTy = cast<FixedVectorType>(VectorizedValue->
getType());
20647if (FTy->getScalarType() == Builder.
getInt1Ty() &&
20648 RdxKind == RecurKind::Add &&
20650// Convert vector_reduce_add(ZExt(<n x i1>)) to 20651// ZExtOrTrunc(ctpop(bitcast <n x i1> to in)). 20653 VectorizedValue, Builder.
getIntNTy(FTy->getNumElements()));
20654 ++NumVectorInstructions;
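// Illustrative sketch only (hypothetical, not part of the pass): why the
// rewrite above is valid. Add-reducing an all-0/1 vector is the same as
// counting the set bits of its bitmask, i.e. ctpop(bitcast <n x i1> to in).
// Assumes <array>, <bitset> and assert are available.
[[maybe_unused]] const auto BoolSumIsPopcountSketch = [] {
  const std::array<bool, 8> Lanes = {true, false, true, true,
                                     false, false, true, false};
  unsigned Sum = 0, Mask = 0;
  for (unsigned I = 0; I < Lanes.size(); ++I) {
    Sum += Lanes[I];                 // vector_reduce_add(zext <8 x i1>)
    Mask |= unsigned(Lanes[I]) << I; // bitcast <8 x i1> to i8
  }
  assert(Sum == std::bitset<8>(Mask).count() && "reduce_add == ctpop");
  return Sum;
};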
20657 ++NumVectorInstructions;
20661 /// Emits optimized code for unique scalar value reused \p Cnt times. 20664assert(IsSupportedHorRdxIdentityOp &&
20665"The optimization of matched scalar identity horizontal reductions " 20666"must be supported.");
20668return VectorizedValue;
20670case RecurKind::Add: {
20672Value *Scale = ConstantInt::get(VectorizedValue->
getType(), Cnt);
20674 << VectorizedValue <<
". (HorRdx)\n");
20675return Builder.
CreateMul(VectorizedValue, Scale);
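// Illustrative sketch only (hypothetical, not part of the pass): the identity
// behind the scaling above. Add-reducing Cnt copies of the same value is just
// a multiply by Cnt, so one mul replaces Cnt - 1 scalar adds.
[[maybe_unused]] const auto RepeatedAddIsMulSketch = [](int Val, unsigned Cnt) {
  int Sum = 0;
  for (unsigned I = 0; I < Cnt; ++I)
    Sum += Val; // add-reduce Cnt identical operands
  assert(Sum == Val * int(Cnt) && "sum of Cnt copies == Val * Cnt");
  return Sum;
};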
20677case RecurKind::Xor: {
20678// res = n % 2 ? 0 : vv 20679LLVM_DEBUG(
dbgs() <<
"SLP: Xor " << Cnt <<
"of " << VectorizedValue
20683return VectorizedValue;
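// Illustrative sketch only (hypothetical, not part of the pass): the parity
// rule used above for reused Xor operands. Xor-ing the same value an even
// number of times cancels to 0, while an odd number of times leaves the
// value itself.
[[maybe_unused]] const auto RepeatedXorSketch = [](unsigned Val, unsigned Cnt) {
  unsigned Acc = 0;
  for (unsigned I = 0; I < Cnt; ++I)
    Acc ^= Val; // xor-reduce Cnt identical operands
  assert(Acc == ((Cnt % 2) ? Val : 0u) && "xor parity identity");
  return Acc;
};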
20685case RecurKind::FAdd: {
20687Value *Scale = ConstantFP::get(VectorizedValue->
getType(), Cnt);
20689 << VectorizedValue <<
". (HorRdx)\n");
20690return Builder.
CreateFMul(VectorizedValue, Scale);
20692case RecurKind::And:
20694case RecurKind::SMax:
20695case RecurKind::SMin:
20696case RecurKind::UMax:
20697case RecurKind::UMin:
20698case RecurKind::FMax:
20699case RecurKind::FMin:
20700case RecurKind::FMaximum:
20701case RecurKind::FMinimum:
20703return VectorizedValue;
20704case RecurKind::Mul:
20705case RecurKind::FMul:
20706case RecurKind::FMulAdd:
20707case RecurKind::IAnyOf:
20708case RecurKind::FAnyOf:
20709case RecurKind::IFindLastIV:
20710case RecurKind::FFindLastIV:
20711case RecurKind::None:
20717 /// Emits actual operation for the scalar identity values, found during 20718 /// horizontal reduction analysis. 20723assert(IsSupportedHorRdxIdentityOp &&
20724"The optimization of matched scalar identity horizontal reductions " 20725"must be supported.");
20727auto *VTy = cast<FixedVectorType>(VectorizedValue->
getType());
20728if (VTy->getElementType() != VL.
front()->getType()) {
20732R.isSignedMinBitwidthRootNode());
20735case RecurKind::Add: {
20736// root = mul prev_root, <1, 1, n, 1> 20738for (
Value *V : VL) {
20739unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
20740 Vals.
push_back(ConstantInt::get(
V->getType(), Cnt,
/*IsSigned=*/false));
20744 << VectorizedValue <<
". (HorRdx)\n");
20745return Builder.
CreateMul(VectorizedValue, Scale);
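// Illustrative sketch only (hypothetical, not part of the pass): the per-lane
// scaling described by "root = mul prev_root, <1, 1, n, 1>". Each lane of the
// vectorized root is multiplied by the number of times its original scalar
// was repeated; lanes with count 1 stay unchanged. Assumes <array>.
[[maybe_unused]] const auto PerLaneScaleSketch =
    [](std::array<int, 4> Root, const std::array<unsigned, 4> &RepeatCounts) {
  for (unsigned I = 0; I < Root.size(); ++I)
    Root[I] *= int(RepeatCounts[I]); // scale lane I by its repeat count
  return Root;                       // fed into the final add-reduction
};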
20747case RecurKind::And:
20749// No need for multiple or/and(s). 20752return VectorizedValue;
20753case RecurKind::SMax:
20754case RecurKind::SMin:
20755case RecurKind::UMax:
20756case RecurKind::UMin:
20757case RecurKind::FMax:
20758case RecurKind::FMin:
20759case RecurKind::FMaximum:
20760case RecurKind::FMinimum:
20761// No need for multiple min/max(s) of the same value. 20764return VectorizedValue;
20765case RecurKind::Xor: {
// Replace values with an even number of repeats with 0, since
// root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
// 7>, if the 4th and 6th elements have an even number of repeats.
cast<FixedVectorType>(VectorizedValue->
getType())->getNumElements(),
20773 std::iota(
Mask.begin(),
Mask.end(), 0);
20774bool NeedShuffle =
false;
20775for (
unsignedI = 0, VF = VL.size();
I < VF; ++
I) {
20777unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
20786dbgs() <<
"> of " << VectorizedValue <<
". (HorRdx)\n");
20790 ConstantVector::getNullValue(VectorizedValue->
getType()),
Mask);
20791return VectorizedValue;
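// Illustrative sketch only (hypothetical, not part of the pass): the effect
// of the shuffle above. Every lane whose scalar is repeated an even number of
// times xors away to 0, so the mask picks that lane from the zero vector and
// keeps the original lane otherwise. Assumes <array>.
[[maybe_unused]] const auto XorLaneMaskSketch =
    [](std::array<int, 4> Root, const std::array<unsigned, 4> &RepeatCounts) {
  for (unsigned I = 0; I < Root.size(); ++I)
    if (RepeatCounts[I] % 2 == 0)
      Root[I] = 0; // lane taken from zeroinitializer by the shuffle mask
  return Root;
};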
20793case RecurKind::FAdd: {
20794// root = fmul prev_root, <1.0, 1.0, n.0, 1.0> 20796for (
Value *V : VL) {
20797unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
20798 Vals.
push_back(ConstantFP::get(
V->getType(), Cnt));
20801return Builder.
CreateFMul(VectorizedValue, Scale);
20803case RecurKind::Mul:
20804case RecurKind::FMul:
20805case RecurKind::FMulAdd:
20806case RecurKind::IAnyOf:
20807case RecurKind::FAnyOf:
20808case RecurKind::IFindLastIV:
20809case RecurKind::FFindLastIV:
20810case RecurKind::None:
20816}
// end anonymous namespace 20818/// Gets recurrence kind from the specified value. 20820return HorizontalReduction::getRdxKind(V);
20823if (
auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20824return cast<FixedVectorType>(IE->getType())->getNumElements();
20826unsigned AggregateSize = 1;
20827auto *
IV = cast<InsertValueInst>(InsertInst);
20828Type *CurrentType =
IV->getType();
20830if (
auto *ST = dyn_cast<StructType>(CurrentType)) {
20831for (
auto *Elt : ST->elements())
20832if (Elt != ST->getElementType(0))
// check homogeneity 20833return std::nullopt;
20834 AggregateSize *= ST->getNumElements();
20835 CurrentType = ST->getElementType(0);
20836 }
elseif (
auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20837 AggregateSize *= AT->getNumElements();
20838 CurrentType = AT->getElementType();
20839 }
elseif (
auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20840 AggregateSize *= VT->getNumElements();
20841return AggregateSize;
20843return AggregateSize;
20845return std::nullopt;
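// Illustrative sketch only (hypothetical helper, not part of the pass): the
// aggregate size computed above is just the product of the element counts at
// each homogeneous nesting level, e.g. [2 x {<2 x float>, <2 x float>}]
// flattens to 2 * 2 * 2 = 8 buildvector slots.
[[maybe_unused]] static const auto FlattenedAggregateSizeSketch =
    [](ArrayRef<unsigned> LevelCounts) {
  unsigned Size = 1;
  for (unsigned Count : LevelCounts)
    Size *= Count;
  return Size;
};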
20854unsigned OperandOffset,
constBoUpSLP &R) {
20857 std::optional<unsigned> OperandIndex =
20859if (!OperandIndex || R.isDeleted(LastInsertInst))
20861if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20863 BuildVectorOpds, InsertElts, *OperandIndex, R);
20866 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20867 InsertElts[*OperandIndex] = LastInsertInst;
20869 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->
getOperand(0));
20870 }
while (LastInsertInst !=
nullptr &&
20871 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20875/// Recognize construction of vectors like 20876/// %ra = insertelement <4 x float> poison, float %s0, i32 0 20877/// %rb = insertelement <4 x float> %ra, float %s1, i32 1 20878/// %rc = insertelement <4 x float> %rb, float %s2, i32 2 20879/// %rd = insertelement <4 x float> %rc, float %s3, i32 3 20880/// starting from the last insertelement or insertvalue instruction. 20882/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>}, 20883/// {{float, float}, {float, float}}, [2 x {float, float}] and so on. 20884/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. 20886/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. 20888/// \return true if it matches. 20895assert((isa<InsertElementInst>(LastInsertInst) ||
20896 isa<InsertValueInst>(LastInsertInst)) &&
20897"Expected insertelement or insertvalue instruction!");
20900"Expected empty result vectors!");
20905 BuildVectorOpds.
resize(*AggregateSize);
20906 InsertElts.
resize(*AggregateSize);
20912if (BuildVectorOpds.
size() >= 2)
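// Illustrative sketch only (hypothetical helper, not the pass's
// findBuildAggregate): walking a buildvector chain like the one shown in the
// comment above, from the last insertelement upwards, and recording each
// scalar at its constant lane index. Assumes the usual LLVM IR API.
[[maybe_unused]] static const auto CollectBuildVectorScalarsSketch =
    [](InsertElementInst *Last, SmallVectorImpl<Value *> &Scalars) {
  unsigned NumElts = cast<FixedVectorType>(Last->getType())->getNumElements();
  Scalars.assign(NumElts, nullptr);
  Value *Cur = Last;
  while (auto *IE = dyn_cast<InsertElementInst>(Cur)) {
    // Operand 1 is the inserted scalar, operand 2 the lane index.
    if (auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2)))
      if (Idx->getZExtValue() < NumElts && !Scalars[Idx->getZExtValue()])
        Scalars[Idx->getZExtValue()] = IE->getOperand(1);
    Cur = IE->getOperand(0); // continue with the partially built vector
  }
};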
20918/// Try and get a reduction instruction from a phi node. 20920/// Given a phi node \p P in a block \p ParentBB, consider possible reductions 20921/// if they come from either \p ParentBB or a containing loop latch. 20923/// \returns A candidate reduction value if possible, or \code nullptr \endcode 20924/// if not possible. 20927// There are situations where the reduction value is not dominated by the 20928// reduction phi. Vectorizing such cases has been reported to cause 20929// miscompiles. See PR25787. 20930auto DominatedReduxValue = [&](
Value *R) {
20931return isa<Instruction>(R) &&
20932 DT->
dominates(
P->getParent(), cast<Instruction>(R)->getParent());
20937// Return the incoming value if it comes from the same BB as the phi node. 20938if (
P->getIncomingBlock(0) == ParentBB) {
20939 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(0));
20940 }
elseif (
P->getIncomingBlock(1) == ParentBB) {
20941 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(1));
20944if (Rdx && DominatedReduxValue(Rdx))
20947// Otherwise, check whether we have a loop latch to look at. 20955// There is a loop latch, return the incoming value if it comes from 20956// that. This reduction pattern occasionally turns up. 20957if (
P->getIncomingBlock(0) == BBLatch) {
20958 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(0));
20959 }
elseif (
P->getIncomingBlock(1) == BBLatch) {
20960 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(1));
20963if (Rdx && DominatedReduxValue(Rdx))
20991/// We could have an initial reduction that is not an add. 20992/// r *= v1 + v2 + v3 + v4 20993/// In such a case start looking for a tree rooted in the first '+'. 20994/// \Returns the new root if found, which may be nullptr if not an instruction. 20997assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
20998 isa<IntrinsicInst>(Root)) &&
20999"Expected binop, select, or intrinsic for reduction matching");
21001 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
21003 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
21005return dyn_cast<Instruction>(
RHS);
21007return dyn_cast<Instruction>(
LHS);
21011/// \p Returns the first operand of \p I that does not match \p Phi. If 21012/// operand is not an instruction it returns nullptr. 21014Value *Op0 =
nullptr;
21015Value *Op1 =
nullptr;
21018return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
21021/// \Returns true if \p I is a candidate instruction for reduction vectorization. 21024Value *B0 =
nullptr, *B1 =
nullptr;
21029bool SLPVectorizerPass::vectorizeHorReduction(
21034bool TryOperandsAsNewSeeds =
P && isa<BinaryOperator>(Root);
21036if (Root->
getParent() != BB || isa<PHINode>(Root))
21039// If we can find a secondary reduction root, use that instead. 21040auto SelectRoot = [&]() {
// Start analysis from the Root instruction. If a horizontal reduction is
// found, try to vectorize it. If it is not a horizontal reduction, or
// vectorization is not possible or not effective, and the currently analyzed
// instruction is a binary operation, try to vectorize the operands using
// pre-order DFS traversal order. If the operands were not vectorized, repeat
// the same procedure, considering each operand as a possible root of a
// horizontal reduction.
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
// If a horizontal reduction was not matched or vectorized, we collect
// instructions for possible later vectorization attempts.
std::queue<std::pair<Instruction *, unsigned>>
Stack;
21060Stack.emplace(SelectRoot(), 0);
21064if (
R.isAnalyzedReductionRoot(Inst))
21069if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *
DL, *TLI))
21071return HorRdx.tryToReduce(R, *
DL,
TTI, *TLI, AC);
21073auto TryAppendToPostponedInsts = [&](
Instruction *FutureSeed) {
21074if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21079// Do not collect CmpInst or InsertElementInst/InsertValueInst as their 21080// analysis is done separately. 21081if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21086while (!
Stack.empty()) {
21089 std::tie(Inst, Level) =
Stack.front();
21091// Do not try to analyze instruction that has already been vectorized. 21092// This may happen when we vectorize instruction operands on a previous 21093// iteration while stack was populated before that happened. 21094if (
R.isDeleted(Inst))
21096if (
Value *VectorizedV = TryToReduce(Inst)) {
21098if (
auto *
I = dyn_cast<Instruction>(VectorizedV)) {
21099// Try to find another reduction. 21103if (
R.isDeleted(Inst))
21106// We could not vectorize `Inst` so try to use it as a future seed. 21107if (!TryAppendToPostponedInsts(Inst)) {
21113// Try to vectorize operands. 21114// Continue analysis for the instruction from the same basic block only to 21115// save compile time. 21118if (VisitedInstrs.
insert(
Op).second)
21119if (
auto *
I = dyn_cast<Instruction>(
Op))
21120// Do not try to vectorize CmpInst operands, this is done 21122if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
I) &&
21123 !
R.isDeleted(
I) &&
I->getParent() == BB)
21132bool Res = vectorizeHorReduction(
P, Root, BB, R, PostponedInsts);
21133 Res |= tryToVectorize(PostponedInsts, R);
21140for (
Value *V : Insts)
21141if (
auto *Inst = dyn_cast<Instruction>(V); Inst && !
R.isDeleted(Inst))
21142 Res |= tryToVectorize(Inst, R);
21146bool SLPVectorizerPass::vectorizeInsertValueInst(
InsertValueInst *IVI,
21149if (!
R.canMapToVector(IVI->
getType()))
21157if (MaxVFOnly && BuildVectorOpds.
size() == 2) {
21158R.getORE()->emit([&]() {
21160 <<
"Cannot SLP vectorize list: only 2 elements of buildvalue, " 21161"trying reduction first.";
21165LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IVI <<
"\n");
21166// Aggregate value is unlikely to be processed in vector register. 21167return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21177 (
all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21181if (MaxVFOnly && BuildVectorInsts.
size() == 2) {
21182R.getORE()->emit([&]() {
21184 <<
"Cannot SLP vectorize list: only 2 elements of buildvector, " 21185"trying reduction first.";
21189LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IEI <<
"\n");
21190return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21193template <
typename T>
21199bool Changed =
false;
// Sort by type, parent, operands.
// Try to vectorize elements based on their type.
// Look for the next elements with the same type, parent and operand kinds.
auto *
I = dyn_cast<Instruction>(*IncIt);
21211if (!
I || R.isDeleted(
I)) {
21215auto *SameTypeIt = IncIt;
21216while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21217 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21218 AreCompatible(*SameTypeIt, *IncIt))) {
21219auto *
I = dyn_cast<Instruction>(*SameTypeIt);
21221if (
I && !R.isDeleted(
I))
21225// Try to vectorize them. 21226unsigned NumElts = VL.
size();
21227LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize starting at nodes (" 21228 << NumElts <<
")\n");
21229// The vectorization is a 3-state attempt: 21230// 1. Try to vectorize instructions with the same/alternate opcodes with the 21231// size of maximal register at first. 21232// 2. Try to vectorize remaining instructions with the same type, if 21233// possible. This may result in the better vectorization results rather than 21234// if we try just to vectorize instructions with the same/alternate opcodes. 21235// 3. Final attempt to try to vectorize all instructions with the 21236// same/alternate ops only, this may result in some extra final 21238if (NumElts > 1 && TryToVectorizeHelper(
ArrayRef(VL), MaxVFOnly)) {
// Success: start over because instructions might have been changed.
VL.
swap(Candidates);
21242 Candidates.
clear();
21244if (
auto *
I = dyn_cast<Instruction>(V);
I && !R.isDeleted(
I))
21248 /// \Returns the minimum number of elements that we will attempt to 21250auto GetMinNumElements = [&R](
Value *V) {
21251unsigned EltSize = R.getVectorElementSize(V);
21252return std::max(2U, R.getMaxVecRegSize() / EltSize);
21254if (NumElts < GetMinNumElements(*IncIt) &&
21255 (Candidates.
empty() ||
21256 Candidates.
front()->getType() == (*IncIt)->getType())) {
21258if (
auto *
I = dyn_cast<Instruction>(V);
I && !R.isDeleted(
I))
21263// Final attempt to vectorize instructions with the same types. 21264if (Candidates.
size() > 1 &&
21265 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21266if (TryToVectorizeHelper(Candidates,
/*MaxVFOnly=*/false)) {
// Success: start over because instructions might have been changed.
} else if (MaxVFOnly) {
21270// Try to vectorize using small vectors. 21272for (
auto *It = Candidates.
begin(), *
End = Candidates.
end(); It !=
End;
21274auto *
I = dyn_cast<Instruction>(*It);
21275if (!
I || R.isDeleted(
I)) {
21279auto *SameTypeIt = It;
21280while (SameTypeIt !=
End &&
21281 (!isa<Instruction>(*SameTypeIt) ||
21282 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21283 AreCompatible(*SameTypeIt, *It))) {
21284auto *
I = dyn_cast<Instruction>(*SameTypeIt);
21286if (
I && !R.isDeleted(
I))
21289unsigned NumElts = VL.
size();
21290if (NumElts > 1 && TryToVectorizeHelper(
ArrayRef(VL),
21291/*MaxVFOnly=*/false))
21296 Candidates.
clear();
21299// Start over at the next instruction of a different type (or the end). 21300 IncIt = SameTypeIt;
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the 2 cmps have the same/swapped predicates and most
/// compatible corresponding operands. If IsCompatibility is false, the
/// function implements a strict weak ordering relation between the two cmp
/// instructions, returning true if the first instruction is "less" than the
/// second, i.e. its predicate is less than the predicate of the second or
/// the operand IDs are less than the operand IDs of the second cmp
/// instruction.
template <
bool IsCompatibility>
21317"Expected valid element types only.");
21319return IsCompatibility;
21320auto *CI1 = cast<CmpInst>(V);
21321auto *CI2 = cast<CmpInst>(V2);
21322if (CI1->getOperand(0)->getType()->getTypeID() <
21324return !IsCompatibility;
21325if (CI1->getOperand(0)->getType()->getTypeID() >
21328if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21330return !IsCompatibility;
21331if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21340if (BasePred1 < BasePred2)
21341return !IsCompatibility;
21342if (BasePred1 > BasePred2)
21344// Compare operands. 21345bool CI1Preds = Pred1 == BasePred1;
21346bool CI2Preds = Pred2 == BasePred1;
21347for (
intI = 0, E = CI1->getNumOperands();
I < E; ++
I) {
21348auto *Op1 = CI1->getOperand(CI1Preds ?
I : E -
I - 1);
21353return !IsCompatibility;
21356if (
auto *I1 = dyn_cast<Instruction>(Op1))
21357if (
auto *I2 = dyn_cast<Instruction>(Op2)) {
21358if (IsCompatibility) {
21359if (I1->getParent() != I2->getParent())
21362// Try to compare nodes with same parent. 21366return NodeI2 !=
nullptr;
21369assert((NodeI1 == NodeI2) ==
21371"Different nodes should have different DFS numbers");
21372if (NodeI1 != NodeI2)
21376if (S && (IsCompatibility || !S.isAltShuffle()))
21378if (IsCompatibility)
21380if (I1->getOpcode() != I2->getOpcode())
21381return I1->getOpcode() < I2->getOpcode();
21384return IsCompatibility;
21387template <
typename ItT>
21390bool Changed =
false;
21391// Try to find reductions first. 21396if (
auto *RootOp = dyn_cast<Instruction>(
Op)) {
21397 Changed |= vectorizeRootInstruction(
nullptr, RootOp, BB, R);
21402// Try to vectorize operands as vector bundles. 21406 Changed |= tryToVectorize(
I, R);
21408// Try to vectorize list of compares. 21409// Sort by type, compare predicate, etc. 21413return compareCmp<false>(V, V2, *TLI, *DT);
21416auto AreCompatibleCompares = [&](
Value *V1,
Value *
V2) {
21419return compareCmp<true>(V1, V2, *TLI, *DT);
21426if (Vals.
size() <= 1)
21428 Changed |= tryToVectorizeSequence<Value>(
21429 Vals, CompareSorter, AreCompatibleCompares,
21431// Exclude possible reductions from other blocks. 21432bool ArePossiblyReducedInOtherBlock =
any_of(Candidates, [](
Value *V) {
21434 auto *Select = dyn_cast<SelectInst>(U);
21436 Select->getParent() != cast<Instruction>(V)->getParent();
21439if (ArePossiblyReducedInOtherBlock)
21441return tryToVectorizeList(Candidates, R, MaxVFOnly);
21443/*MaxVFOnly=*/true,
R);
21447bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21449assert(
all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21450"This function only accepts Insert instructions");
21451bool OpsChanged =
false;
21453for (
auto *
I :
reverse(Instructions)) {
21454// pass1 - try to match and vectorize a buildvector sequence for MaxVF only. 21455if (
R.isDeleted(
I) || isa<CmpInst>(
I))
21457if (
auto *LastInsertValue = dyn_cast<InsertValueInst>(
I)) {
21459 vectorizeInsertValueInst(LastInsertValue, BB, R,
/*MaxVFOnly=*/true);
21460 }
elseif (
auto *LastInsertElem = dyn_cast<InsertElementInst>(
I)) {
21462 vectorizeInsertElementInst(LastInsertElem, BB, R,
/*MaxVFOnly=*/true);
21464// pass2 - try to vectorize reductions only 21467 OpsChanged |= vectorizeHorReduction(
nullptr,
I, BB, R, PostponedInsts);
21468if (
R.isDeleted(
I) || isa<CmpInst>(
I))
21470// pass3 - try to match and vectorize a buildvector sequence. 21471if (
auto *LastInsertValue = dyn_cast<InsertValueInst>(
I)) {
21473 vectorizeInsertValueInst(LastInsertValue, BB, R,
/*MaxVFOnly=*/false);
21474 }
elseif (
auto *LastInsertElem = dyn_cast<InsertElementInst>(
I)) {
21475 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21476/*MaxVFOnly=*/false);
21479// Now try to vectorize postponed instructions. 21480 OpsChanged |= tryToVectorize(PostponedInsts, R);
21487bool Changed =
false;
// Maps phi nodes to the non-phi nodes found in the use tree for each phi
// node. Allows us to better identify the chains that can be vectorized in the
auto PHICompare = [
this, &PHIToOpcodes](
Value *V1,
Value *
V2) {
21497"Expected vectorizable types only.");
// It is fine to compare type IDs here, since we expect only vectorizable
// types, like ints, floats and pointers; we don't care about other types.
V2->getType()->getScalarSizeInBits())
21508V2->getType()->getScalarSizeInBits())
21512if (Opcodes1.
size() < Opcodes2.
size())
21514if (Opcodes1.
size() > Opcodes2.
size())
21516for (
intI = 0, E = Opcodes1.
size();
I < E; ++
I) {
21518// Instructions come first. 21519auto *
I1 = dyn_cast<Instruction>(Opcodes1[
I]);
21520auto *I2 = dyn_cast<Instruction>(Opcodes2[
I]);
21525return NodeI2 !=
nullptr;
21528assert((NodeI1 == NodeI2) ==
21530"Different nodes should have different DFS numbers");
21531if (NodeI1 != NodeI2)
21534if (S && !S.isAltShuffle())
21536returnI1->getOpcode() < I2->getOpcode();
21544// Non-undef constants come next. 21545bool C1 = isa<Constant>(Opcodes1[
I]) && !isa<UndefValue>(Opcodes1[
I]);
21546bool C2 = isa<Constant>(Opcodes2[
I]) && !isa<UndefValue>(Opcodes2[
I]);
21554bool U1 = isa<UndefValue>(Opcodes1[
I]);
21555bool U2 = isa<UndefValue>(Opcodes2[
I]);
21557// Non-constant non-instructions come next. 21559auto ValID1 = Opcodes1[
I]->getValueID();
21560auto ValID2 = Opcodes2[
I]->getValueID();
21561if (ValID1 == ValID2)
21563if (ValID1 < ValID2)
21565if (ValID1 > ValID2)
21573// Undefs come last. 21574assert(U1 && U2 &&
"The only thing left should be undef & undef.");
21578auto AreCompatiblePHIs = [&PHIToOpcodes,
this, &
R](
Value *V1,
Value *
V2) {
21581if (V1->getType() !=
V2->getType())
21585if (Opcodes1.
size() != Opcodes2.
size())
21587for (
intI = 0, E = Opcodes1.
size();
I < E; ++
I) {
21588// Undefs are compatible with any other value. 21589if (isa<UndefValue>(Opcodes1[
I]) || isa<UndefValue>(Opcodes2[
I]))
21591if (
auto *I1 = dyn_cast<Instruction>(Opcodes1[
I]))
21592if (
auto *I2 = dyn_cast<Instruction>(Opcodes2[
I])) {
21593if (
R.isDeleted(I1) ||
R.isDeleted(I2))
21595if (
I1->getParent() != I2->getParent())
21601if (isa<Constant>(Opcodes1[
I]) && isa<Constant>(Opcodes2[
I]))
21603if (Opcodes1[
I]->getValueID() != Opcodes2[
I]->getValueID())
21609bool HaveVectorizedPhiNodes =
false;
21611// Collect the incoming values from the PHIs. 21614auto *
P = dyn_cast<PHINode>(&
I);
21618// No need to analyze deleted, vectorized and non-vectorizable 21620if (!VisitedInstrs.
count(
P) && !
R.isDeleted(
P) &&
21628// Find the corresponding non-phi nodes for better matching when trying to 21633if (!Opcodes.
empty())
21637while (!Nodes.
empty()) {
21638auto *
PHI = cast<PHINode>(Nodes.pop_back_val());
21641for (
Value *V :
PHI->incoming_values()) {
21642if (
auto *PHI1 = dyn_cast<PHINode>((V))) {
21643 Nodes.push_back(PHI1);
21651 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21652Incoming, PHICompare, AreCompatiblePHIs,
21654return tryToVectorizeList(Candidates, R, MaxVFOnly);
21656/*MaxVFOnly=*/true,
R);
21657 Changed |= HaveVectorizedPhiNodes;
21658if (HaveVectorizedPhiNodes &&
any_of(PHIToOpcodes, [&](
constauto &
P) {
21659auto *
PHI = dyn_cast<PHINode>(
P.first);
21660return !
PHI ||
R.isDeleted(
PHI);
21662 PHIToOpcodes.
clear();
21664 }
while (HaveVectorizedPhiNodes);
21666 VisitedInstrs.
clear();
21668 InstSetVector PostProcessInserts;
// Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
// also vectorizes `PostProcessCmps`.
auto VectorizeInsertsAndCmps = [&](
bool VectorizeCmps) {
21673bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21674if (VectorizeCmps) {
21675 Changed |= vectorizeCmpInsts(
reverse(PostProcessCmps), BB, R);
21676 PostProcessCmps.
clear();
21678 PostProcessInserts.clear();
21681// Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`. 21683if (
auto *Cmp = dyn_cast<CmpInst>(
I))
21684return PostProcessCmps.
contains(Cmp);
21685return isa<InsertElementInst, InsertValueInst>(
I) &&
21686 PostProcessInserts.contains(
I);
// Returns true if `I` is an instruction without users, like a terminator, a
// store, or a function call with an ignored return value. Unused instructions
// are ignored based on instruction type, except for CallInst and InvokeInst.
return I->use_empty() &&
21693 (
I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(
I));
21696// Skip instructions with scalable type. The num of elements is unknown at 21697// compile-time for scalable type. 21698if (isa<ScalableVectorType>(It->getType()))
21701// Skip instructions marked for the deletion. 21702if (
R.isDeleted(&*It))
21704// We may go through BB multiple times so skip the one we have checked. 21705if (!VisitedInstrs.
insert(&*It).second) {
21706if (HasNoUsers(&*It) &&
21707 VectorizeInsertsAndCmps(
/*VectorizeCmps=*/It->isTerminator())) {
21708// We would like to start over since some instructions are deleted 21709// and the iterator may become invalid value. 21717if (isa<DbgInfoIntrinsic>(It))
21720// Try to vectorize reductions that use PHINodes. 21721if (
PHINode *
P = dyn_cast<PHINode>(It)) {
21722// Check that the PHI is a reduction PHI. 21723if (
P->getNumIncomingValues() == 2) {
21724// Try to match and vectorize a horizontal reduction. 21726if (Root && vectorizeRootInstruction(
P, Root, BB, R)) {
21733// Try to vectorize the incoming values of the PHI, to catch reductions 21734// that feed into PHIs. 21735for (
unsignedI : seq<unsigned>(
P->getNumIncomingValues())) {
21736// Skip if the incoming block is the current BB for now. Also, bypass 21737// unreachable IR for efficiency and to avoid crashing. 21738// TODO: Collect the skipped incoming values and try to vectorize them 21739// after processing BB. 21740if (BB ==
P->getIncomingBlock(
I) ||
21744// Postponed instructions should not be vectorized here, delay their 21746if (
auto *PI = dyn_cast<Instruction>(
P->getIncomingValue(
I));
21747 PI && !IsInPostProcessInstrs(PI)) {
21749 vectorizeRootInstruction(
nullptr, PI,
P->getIncomingBlock(
I), R);
21751if (Res &&
R.isDeleted(
P)) {
21761if (HasNoUsers(&*It)) {
21762bool OpsChanged =
false;
21763auto *
SI = dyn_cast<StoreInst>(It);
// Try to vectorize the chain in the store, if this is the only store to the
// address in the block.
// TODO: This is just a temporary solution to save compile time. Need
// to investigate if we can safely turn on slp-vectorize-hor-store
// instead, to allow lookup for reduction chains in all non-vectorized
// stores (need to check side effects and compile time).
TryToVectorizeRoot |= (
I == Stores.
end() ||
I->second.size() == 1) &&
21774SI->getValueOperand()->hasOneUse();
21776if (TryToVectorizeRoot) {
21777for (
auto *V : It->operand_values()) {
21778// Postponed instructions should not be vectorized here, delay their 21780if (
auto *VI = dyn_cast<Instruction>(V);
21781VI && !IsInPostProcessInstrs(VI))
21782// Try to match and vectorize a horizontal reduction. 21783 OpsChanged |= vectorizeRootInstruction(
nullptr, VI, BB, R);
21786// Start vectorization of post-process list of instructions from the 21787// top-tree instructions to try to vectorize as many instructions as 21790 VectorizeInsertsAndCmps(
/*VectorizeCmps=*/It->isTerminator());
21792// We would like to start over since some instructions are deleted 21793// and the iterator may become invalid value. 21801if (isa<InsertElementInst, InsertValueInst>(It))
21802 PostProcessInserts.insert(&*It);
21803elseif (isa<CmpInst>(It))
21804 PostProcessCmps.
insert(cast<CmpInst>(&*It));
21811auto Changed =
false;
21812for (
auto &Entry : GEPs) {
21813// If the getelementptr list has fewer than two elements, there's nothing 21815if (
Entry.second.size() < 2)
21818LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing a getelementptr list of length " 21819 <<
Entry.second.size() <<
".\n");
21821// Process the GEP list in chunks suitable for the target's supported 21822// vector size. If a vector register can't hold 1 element, we are done. We 21823// are trying to vectorize the index computations, so the maximum number of 21824// elements is based on the size of the index expression, rather than the 21825// size of the GEP itself (the target's pointer size). 21827 return !R.isDeleted(GEP);
21829if (It ==
Entry.second.end())
21831unsigned MaxVecRegSize =
R.getMaxVecRegSize();
21832unsigned EltSize =
R.getVectorElementSize(*(*It)->idx_begin());
21833if (MaxVecRegSize < EltSize)
21836unsigned MaxElts = MaxVecRegSize / EltSize;
21837for (
unsigned BI = 0, BE =
Entry.second.size(); BI < BE; BI += MaxElts) {
21838autoLen = std::min<unsigned>(BE - BI, MaxElts);
// Initialize a set of candidate getelementptrs. Note that we use a
// SetVector here to preserve program order. If the index computations
// are vectorizable and begin with loads, we want to minimize the chance
// of having to reorder them later.
// Some of the candidates may have already been vectorized after we
// initially collected them, or their index was optimized to a constant value.
// If so, they are marked as deleted, so remove them from the set of
// candidates.
Candidates.remove_if([&R](
Value *
I) {
21852returnR.isDeleted(cast<Instruction>(
I)) ||
21853 isa<Constant>(cast<GetElementPtrInst>(
I)->idx_begin()->
get());
21856// Remove from the set of candidates all pairs of getelementptrs with 21857// constant differences. Such getelementptrs are likely not good 21858// candidates for vectorization in a bottom-up phase since one can be 21859// computed from the other. We also ensure all candidate getelementptr 21860// indices are unique. 21861for (
intI = 0, E = GEPList.size();
I < E && Candidates.
size() > 1; ++
I) {
21862auto *GEPI = GEPList[
I];
21863if (!Candidates.count(GEPI))
21866for (
int J =
I + 1; J < E && Candidates.
size() > 1; ++J) {
21867auto *GEPJ = GEPList[J];
21869if (isa<SCEVConstant>(SE->
getMinusSCEV(SCEVI, SCEVJ))) {
21870 Candidates.remove(GEPI);
21871 Candidates.remove(GEPJ);
21872 }
elseif (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21873 Candidates.remove(GEPJ);
21878// We break out of the above computation as soon as we know there are 21879// fewer than two candidates remaining. 21880if (Candidates.
size() < 2)
21883// Add the single, non-constant index of each candidate to the bundle. We 21884// ensured the indices met these constraints when we originally collected 21885// the getelementptrs. 21887auto BundleIndex = 0
u;
21888for (
auto *V : Candidates) {
21889auto *
GEP = cast<GetElementPtrInst>(V);
21890auto *GEPIdx =
GEP->idx_begin()->get();
21891assert(
GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21892 Bundle[BundleIndex++] = GEPIdx;
21895// Try and vectorize the indices. We are currently only interested in 21896// gather-like cases of the form: 21898// ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ... 21900// where the loads of "a", the loads of "b", and the subtractions can be 21901// performed in parallel. It's likely that detecting this pattern in a 21902// bottom-up phase will be simpler and less costly than building a 21903// full-blown top-down phase beginning at the consecutive loads. 21904 Changed |= tryToVectorizeList(Bundle, R);
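// Illustrative sketch only (hypothetical source pattern, not part of the
// pass): the gather-like shape described above. The a[] and b[] loads and
// the subtractions forming the indices are what the pass tries to vectorize,
// while the g[] accesses themselves remain scalar gathers.
[[maybe_unused]] const auto GatherLikePatternSketch =
    [](const int *g, const int *a, const int *b, unsigned N) {
  int Sum = 0;
  for (unsigned I = 0; I < N; ++I)
    Sum += g[a[I] - b[I]]; // index computation a[I] - b[I] is vectorizable
  return Sum;
};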
21910bool SLPVectorizerPass::vectorizeStoreChains(
BoUpSLP &R) {
21911bool Changed =
false;
21912// Sort by type, base pointers and values operand. Value operands must be 21913// compatible (have the same opcode, same parent), otherwise it is 21914// definitely not profitable to try to vectorize them. 21916if (
V->getValueOperand()->getType()->getTypeID() <
21917V2->getValueOperand()->getType()->getTypeID())
21919if (
V->getValueOperand()->getType()->getTypeID() >
21920V2->getValueOperand()->getType()->getTypeID())
21922if (
V->getPointerOperandType()->getTypeID() <
21923V2->getPointerOperandType()->getTypeID())
21925if (
V->getPointerOperandType()->getTypeID() >
21926V2->getPointerOperandType()->getTypeID())
21928if (
V->getValueOperand()->getType()->getScalarSizeInBits() <
21929V2->getValueOperand()->getType()->getScalarSizeInBits())
21931if (
V->getValueOperand()->getType()->getScalarSizeInBits() >
21932V2->getValueOperand()->getType()->getScalarSizeInBits())
21934// UndefValues are compatible with all other values. 21935if (
auto *I1 = dyn_cast<Instruction>(
V->getValueOperand()))
21936if (
auto *I2 = dyn_cast<Instruction>(
V2->getValueOperand())) {
21940 DT->
getNode(I2->getParent());
21941assert(NodeI1 &&
"Should only process reachable instructions");
21942assert(NodeI2 &&
"Should only process reachable instructions");
21943assert((NodeI1 == NodeI2) ==
21945"Different nodes should have different DFS numbers");
21946if (NodeI1 != NodeI2)
21948returnI1->getOpcode() < I2->getOpcode();
21950returnV->getValueOperand()->getValueID() <
21951V2->getValueOperand()->getValueID();
21961// Undefs are compatible with any other value. 21963 isa<UndefValue>(
V2->getValueOperand()))
21966if (
auto *I2 = dyn_cast<Instruction>(
V2->getValueOperand())) {
21967if (
I1->getParent() != I2->getParent())
21972 isa<Constant>(
V2->getValueOperand()))
21975V2->getValueOperand()->getValueID();
21978// Attempt to sort and vectorize each of the store-groups. 21980for (
auto &Pair : Stores) {
21981if (Pair.second.size() < 2)
21985 << Pair.second.size() <<
".\n");
// Reverse the stores to do bottom-to-top analysis. This is important if the
// values are stored to the same addresses several times; in this case we need
// to follow the store order (reversed to respect the memory dependencies).
Pair.second.rend());
21995 Changed |= tryToVectorizeSequence<StoreInst>(
21996 ReversedStores, StoreSorter, AreCompatibleStores,
21998return vectorizeStores(Candidates, R, Attempted);
22000/*MaxVFOnly=*/false,
R);
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is multiple of the subvectors length.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
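The APInt entries above are the lane-mask primitives behind the DemandedElts arguments used by the cost queries in this file. A minimal, illustrative sketch of how they compose; standalone example code, not taken from this pass:

#include "llvm/ADT/APInt.h"
#include <cassert>

using namespace llvm;

// Track which of 8 vector lanes are demanded, in the spirit of the
// DemandedElts masks passed to scalarization-overhead queries.
static void demandedLanesDemo() {
  APInt Demanded = APInt::getZero(8);             // no lane demanded yet
  Demanded.setBits(2, 6);                         // demand lanes 2..5
  assert(Demanded[3] && !Demanded[6]);
  Demanded.setBit(6);                             // lane 6 now demanded too
  Demanded.clearBit(2);                           // lane 2 dropped again
  assert(!Demanded.isAllOnes() && !Demanded.isZero());
  APInt UpperHalf = APInt::getBitsSetFrom(8, 4);  // lanes 4..7
  APInt LaneZero = APInt::getOneBitSet(8, 0);     // only lane 0
  assert(APInt::getAllOnes(8).isAllOnes());
  (void)UpperHalf;
  (void)LaneZero;
}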
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
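ArrayRef slicing is how bundles of scalars (the VL lists above) get split and re-split without copying. An illustrative standalone sketch, not code from this pass:

#include "llvm/ADT/ArrayRef.h"
#include <cassert>

using namespace llvm;

// Split an 8-element bundle the way value lists are sliced during tree
// building: no copies, just views into the same storage.
static void bundleSlicingDemo() {
  int Scalars[] = {1, 2, 3, 4, 5, 6, 7, 8};
  ArrayRef<int> VL(Scalars);
  assert(VL.size() == 8 && !VL.empty());
  ArrayRef<int> FirstHalf = VL.take_front(4);   // {1, 2, 3, 4}
  ArrayRef<int> Tail = VL.drop_front(6);        // {7, 8}
  ArrayRef<int> Middle = VL.slice(2, 3);        // {3, 4, 5}
  assert(FirstHalf.front() == 1 && Tail.back() == 8 && Middle.size() == 3);
  assert(VL.drop_back(4).equals(FirstHalf));
}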
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
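When compare bundles are vectorized, operands are often swapped so that all lanes share one predicate; the predicate helpers above are what make that legal. A hedged sketch of the idea (not the pass's actual canonicalization code):

#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Map "greater-than" style integer predicates to their "less-than"
// counterparts, which is what swapping the two compare operands achieves.
static CmpInst::Predicate canonicalizeToLessThan(CmpInst::Predicate P) {
  switch (P) {
  case CmpInst::ICMP_SGT:
  case CmpInst::ICMP_SGE:
  case CmpInst::ICMP_UGT:
  case CmpInst::ICMP_UGE:
    return CmpInst::getSwappedPredicate(P); // SGT->SLT, UGE->ULE, ...
  default:
    return P;
  }
}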
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
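These DenseMap operations are the bread-and-butter caching idiom throughout the pass (scalar-to-tree-entry maps, alias caches and so on). A small standalone illustration:

#include "llvm/ADT/DenseMap.h"
#include <cassert>

using namespace llvm;

// Cache a per-key index with the usual insert/lookup/erase idioms.
static void denseMapDemo() {
  DenseMap<int, unsigned> KeyToIndex;
  KeyToIndex.try_emplace(42, 0u);              // inserted only if missing
  KeyToIndex.insert({7, 1u});
  assert(KeyToIndex.count(42) == 1);
  assert(KeyToIndex.lookup(7) == 1u);
  assert(KeyToIndex.lookup(100) == 0u);        // default value when absent
  if (auto It = KeyToIndex.find(7); It != KeyToIndex.end())
    It->second = 2u;                           // in-place update
  KeyToIndex.erase(42);
  assert(!KeyToIndex.contains(42));
}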
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
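The IRBuilder entry points above are what the shuffle builders in this file ultimately emit through. A hedged, standalone sketch of the typical gather-then-shuffle sequence (illustrative only; the real ShuffleInstructionBuilder handles masks, poison lanes and value reuse far more carefully):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Build a <4 x i32> from two scalars and broadcast lane 0 across all lanes.
static Value *buildAndSplat(IRBuilderBase &Builder, Value *A, Value *B) {
  auto *VecTy = FixedVectorType::get(Builder.getInt32Ty(), 4);
  Value *Vec = PoisonValue::get(VecTy);
  Vec = Builder.CreateInsertElement(Vec, A, Builder.getInt32(0), "vec.a");
  Vec = Builder.CreateInsertElement(Vec, B, Builder.getInt32(1), "vec.b");
  int SplatMask[] = {0, 0, 0, 0};
  return Builder.CreateShuffleVector(Vec, PoisonValue::get(VecTy), SplatMask,
                                     "splat");
}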
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
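These static mask predicates are how the pass classifies the shuffles it is about to emit or cost. A small standalone illustration, assuming the signatures listed above:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include <cassert>

using namespace llvm;

// Classify a few hand-written constant masks over a 4-wide source vector.
static void classifyMasksDemo() {
  SmallVector<int> Identity = {0, 1, 2, 3};
  SmallVector<int> Reverse = {3, 2, 1, 0};
  SmallVector<int> Extract = {2, 3};

  assert(ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4));
  assert(ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4));

  int Index = -1;
  // Picking lanes 2..3 out of a 4-wide source is an extract-subvector mask.
  bool IsExtract =
      ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/4,
                                                Index);
  assert(IsExtract && Index == 2);
}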
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
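UseMask-style bit vectors (see isUndefVector above) are built and walked with exactly these operations. A standalone illustration:

#include "llvm/ADT/SmallBitVector.h"
#include <cassert>

using namespace llvm;

// Mark two "used" lanes out of eight and walk the set bits.
static void useMaskDemo() {
  SmallBitVector UseMask(8, false);
  UseMask.set(1);
  UseMask.set(4);
  assert(UseMask.any() && !UseMask.all() && UseMask.count() == 2);
  for (int Bit = UseMask.find_first(); Bit != -1;
       Bit = UseMask.find_next(Bit))
    assert(UseMask.test(Bit));                 // visits bit 1, then bit 4
}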
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
@ OK_UniformConstantValue
@ OK_NonUniformConstantValue
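All of the TargetTransformInfo hooks above feed one question: is the vector form cheaper than the sum of its scalars? A deliberately simplified, hedged sketch of that comparison, using only the entries listed here (the real getTreeCost also accounts for gathers, extracts, operand info and minimum bit widths):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Compare VF scalar adds against one vector add plus a single-source shuffle.
static bool vectorAddLooksProfitable(const TargetTransformInfo &TTI,
                                     Type *ScalarTy, unsigned VF) {
  auto *VecTy = FixedVectorType::get(ScalarTy, VF);
  TargetTransformInfo::TargetCostKind Kind =
      TargetTransformInfo::TCK_RecipThroughput;

  InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(Instruction::Add, ScalarTy, Kind) * VF;
  InstructionCost VectorCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, Kind) +
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
                         /*Mask=*/{}, Kind);

  return VectorCost < ScalarCost;
}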
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
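The Value use-list queries above (hasOneUse, users, hasNUses) drive most of the "is this scalar still needed after vectorization?" decisions. A small hedged sketch of that kind of check; an illustrative helper, not part of the pass:

#include "llvm/IR/Instruction.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// True if the instruction has exactly one use and that use lives in a
// different basic block, i.e. the value escapes its defining block once.
static bool hasSingleExternalUse(const Instruction *I) {
  if (!I->hasOneUse())
    return false;
  const User *U = *I->user_begin();
  if (const auto *UI = dyn_cast<Instruction>(U))
    return UI->getParent() != I->getParent();
  return false;
}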
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a strided/scatter load, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
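Taken together, the BoUpSLP members above are driven in a fairly fixed order by the rest of this file. A schematic sketch of that sequence as it could appear inside this file; the exact ordering and the Threshold parameter are illustrative, not verbatim pass code (a negative tree cost means the vector form is expected to be cheaper):

// Schematic driver for the BoUpSLP interface; "Threshold" is illustrative.
static bool tryVectorizeBundleSketch(BoUpSLP &R, ArrayRef<Value *> VL,
                                     InstructionCost Threshold) {
  SmallDenseSet<Value *> UserIgnoreList;
  R.buildTree(VL, UserIgnoreList);          // build the vectorizable tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();                   // pick profitable lane orders
  R.reorderBottomToTop();
  R.transformNodes();                       // target-specific node rewrites
  R.buildExternalUses();                    // record out-of-tree users
  R.computeMinimumValueSizes();             // min-bitwidth analysis
  InstructionCost Cost = R.getTreeCost();
  if (Cost >= Threshold)                    // not profitable enough
    return false;
  R.vectorizeTree();                        // emit vector code and extracts
  return true;
}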
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
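The PatternMatch combinators above are used throughout this file to recognize reduction and store roots. A tiny standalone example of the style; the specific pattern here is only for illustration:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Recognize "add (load %p), %x" in either operand order, capturing the
// load's pointer operand and the other addend.
static bool matchAddOfLoad(Value *V, Value *&Ptr, Value *&Other) {
  return match(V, m_Add(m_OneUse(m_Load(m_Value(Ptr))), m_Value(Other))) ||
         match(V, m_Add(m_Value(Other), m_OneUse(m_Load(m_Value(Ptr)))));
}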
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.