1//===----------- VectorUtils.cpp - Vectorizer utility functions -----------===// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7//===----------------------------------------------------------------------===// 9// This file defines vectorizer utilities. 11//===----------------------------------------------------------------------===// 31#define DEBUG_TYPE "vectorutils" 36/// Maximum factor for an interleaved memory access. 39cl::desc(
"Maximum factor for an interleaved access group (default = 8)"),
42/// Return true if all of the intrinsic's arguments and return type are scalars 43/// for the scalar form of the intrinsic, and vectors for the vector form of the 44/// intrinsic (except operands that are marked as always being scalar by 45/// isVectorIntrinsicWithScalarOpAtArg). 48case Intrinsic::abs:
// Begin integer bit-manipulation. 50case Intrinsic::bitreverse:
60case Intrinsic::sadd_sat:
61case Intrinsic::ssub_sat:
62case Intrinsic::uadd_sat:
63case Intrinsic::usub_sat:
64case Intrinsic::smul_fix:
65case Intrinsic::smul_fix_sat:
66case Intrinsic::umul_fix:
67case Intrinsic::umul_fix_sat:
68case Intrinsic::sqrt:
// Begin floating-point. 86case Intrinsic::minnum:
87case Intrinsic::maxnum:
88case Intrinsic::minimum:
89case Intrinsic::maximum:
90case Intrinsic::copysign:
95case Intrinsic::nearbyint:
97case Intrinsic::roundeven:
100case Intrinsic::fmuladd:
101case Intrinsic::is_fpclass:
103case Intrinsic::canonicalize:
104case Intrinsic::fptosi_sat:
105case Intrinsic::fptoui_sat:
106case Intrinsic::lrint:
107case Intrinsic::llrint:
124// TODO: Move frexp to isTriviallyVectorizable. 125// https://github.com/llvm/llvm-project/issues/112408 127case Intrinsic::frexp:
133/// Identifies if the vector form of the intrinsic has a scalar operand. 135unsigned ScalarOpdIdx,
143case Intrinsic::vp_abs:
145case Intrinsic::vp_ctlz:
147case Intrinsic::vp_cttz:
148case Intrinsic::is_fpclass:
149case Intrinsic::vp_is_fpclass:
151return (ScalarOpdIdx == 1);
152case Intrinsic::smul_fix:
153case Intrinsic::smul_fix_sat:
154case Intrinsic::umul_fix:
155case Intrinsic::umul_fix_sat:
156return (ScalarOpdIdx == 2);
170return OpdIdx == -1 || OpdIdx == 0;
173case Intrinsic::fptosi_sat:
174case Intrinsic::fptoui_sat:
175case Intrinsic::lrint:
176case Intrinsic::llrint:
177case Intrinsic::vp_lrint:
178case Intrinsic::vp_llrint:
181return OpdIdx == -1 || OpdIdx == 0;
182case Intrinsic::is_fpclass:
183case Intrinsic::vp_is_fpclass:
186return OpdIdx == -1 || OpdIdx == 1;
199case Intrinsic::frexp:
200return RetIdx == 0 || RetIdx == 1;
206/// Returns intrinsic ID for call. 207/// For the input call instruction it finds mapping intrinsic and returns 208/// its ID, in case it does not found it return not_intrinsic. 216ID == Intrinsic::lifetime_end ||
ID == Intrinsic::assume ||
217ID == Intrinsic::experimental_noalias_scope_decl ||
218ID == Intrinsic::sideeffect ||
ID == Intrinsic::pseudoprobe)
223/// Given a vector and an element number, see if the scalar value is 224/// already around as a register, for example if it were inserted then extracted 227assert(V->getType()->isVectorTy() &&
"Not looking at a vector?");
228VectorType *VTy = cast<VectorType>(V->getType());
229// For fixed-length vector, return poison for out of range access. 230if (
auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
231unsigned Width = FVTy->getNumElements();
237returnC->getAggregateElement(EltNo);
240// If this is an insert to a variable element, we don't know what it is. 241if (!isa<ConstantInt>(III->getOperand(2)))
243unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();
245// If this is an insert to the element we are looking for, return the 248return III->getOperand(1);
250// Guard against infinite loop on malformed, unreachable IR. 251if (III == III->getOperand(0))
254// Otherwise, the insertelement doesn't modify the value, recurse on its 260// Restrict the following transformation to fixed-length vector. 261if (SVI && isa<FixedVectorType>(SVI->
getType())) {
267if (InEl < (
int)LHSWidth)
272// Extract a value from a vector add operation with a constant zero. 273// TODO: Use getBinOpIdentity() to generalize this. 276if (
Constant *Elt =
C->getAggregateElement(EltNo))
277if (Elt->isNullValue())
280// If the vector is a splat then we can trivially find the scalar element. 281if (isa<ScalableVectorType>(VTy))
283if (EltNo < VTy->getElementCount().getKnownMinValue())
286// Otherwise, we don't know. 293// Ignore invalid (undefined) mask elements. 297// There can be only 1 non-negative mask element value if this is a splat. 298if (SplatIndex != -1 && SplatIndex != M)
301// Initialize the splat index to the 1st non-negative mask element. 304assert((SplatIndex == -1 || SplatIndex >= 0) &&
"Negative index?");
308/// Get splat value if the input is a splat vector or return nullptr. 309/// This function is not fully general. It checks only 2 cases: 310/// the input value is (1) a splat constant vector or (2) a sequence 311/// of instructions that broadcasts a scalar at element 0. 313if (isa<VectorType>(V->getType()))
314if (
auto *
C = dyn_cast<Constant>(V))
315returnC->getSplatValue();
317// shuf (inselt ?, Splat, 0), ?, <0, undef, 0, ...> 330if (isa<VectorType>(V->getType())) {
331if (isa<UndefValue>(V))
333// FIXME: We can allow undefs, but if Index was specified, we may want to 334// check that the constant is defined at that index. 335if (
auto *
C = dyn_cast<Constant>(V))
336returnC->getSplatValue() !=
nullptr;
339if (
auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) {
340// FIXME: We can safely allow undefs here. If Index was specified, we will 341// check that the mask elt is defined at the required index. 349// Match a specific element. The mask should be defined at and match the 351return Shuf->getMaskValue(Index) == Index;
354// The remaining tests are all recursive, so bail out if we hit the limit. 358// If both operands of a binop are splats, the result is a splat. 363// If all operands of a select are splats, the result is a splat. 368// TODO: Add support for unary ops (fneg), casts, intrinsics (overflow ops). 375APInt &DemandedRHS,
bool AllowUndefElts) {
378// Early out if we don't demand any elements. 382// Simple case of a shuffle with zeroinitializer. 383if (
all_of(Mask, [](
int Elt) {
return Elt == 0; })) {
388for (
unsignedI = 0, E = Mask.size();
I != E; ++
I) {
390assert((-1 <= M) && (M < (SrcWidth * 2)) &&
391"Invalid shuffle mask constant");
393if (!DemandedElts[
I] || (AllowUndefElts && (M < 0)))
396// For undef elements, we don't know anything about the common state of 397// the shuffle result. 404 DemandedRHS.
setBit(M - SrcWidth);
412assert(Scale > 0 &&
"Unexpected scaling factor");
414// Fast-path: if no scaling, then it is just a copy. 416 ScaledMask.
assign(Mask.begin(), Mask.end());
421for (
int MaskElt : Mask) {
424"Overflowed 32-bits");
426for (
int SliceElt = 0; SliceElt != Scale; ++SliceElt)
427 ScaledMask.
push_back(MaskElt < 0 ? MaskElt : Scale * MaskElt + SliceElt);
433assert(Scale > 0 &&
"Unexpected scaling factor");
435// Fast-path: if no scaling, then it is just a copy. 437 ScaledMask.
assign(Mask.begin(), Mask.end());
441// We must map the original elements down evenly to a type with less elements. 442int NumElts = Mask.size();
443if (NumElts % Scale != 0)
447 ScaledMask.
reserve(NumElts / Scale);
449// Step through the input mask by splitting into Scale-sized slices. 452assert((
int)MaskSlice.
size() == Scale &&
"Expected Scale-sized slice.");
454// The first element of the slice determines how we evaluate this slice. 455int SliceFront = MaskSlice.
front();
457// Negative values (undef or other "sentinel" values) must be equal across 463// A positive mask element must be cleanly divisible. 464if (SliceFront % Scale != 0)
466// Elements of the slice must be consecutive. 467for (
int i = 1; i < Scale; ++i)
468if (MaskSlice[i] != SliceFront + i)
470 ScaledMask.
push_back(SliceFront / Scale);
472 Mask = Mask.drop_front(Scale);
473 }
while (!Mask.empty());
475assert((
int)ScaledMask.
size() * Scale == NumElts &&
"Unexpected scaled mask");
477// All elements of the original mask can be scaled down to map to the elements 478// of a mask with wider elements. 484unsigned NumElts = M.size();
489for (
unsigned i = 0; i < NumElts; i += 2) {
493// If both elements are undef, new mask is undef too. 494if (
M0 == -1 &&
M1 == -1) {
499if (
M0 == -1 &&
M1 != -1 && (
M1 % 2) == 1) {
504if (
M0 != -1 && (
M0 % 2) == 0 && ((
M0 + 1) ==
M1 ||
M1 == -1)) {
513assert(NewMask.
size() == NumElts / 2 &&
"Incorrect size for mask!");
519unsigned NumSrcElts = Mask.size();
520assert(NumSrcElts > 0 && NumDstElts > 0 &&
"Unexpected scaling factor");
522// Fast-path: if no scaling, then it is just a copy. 523if (NumSrcElts == NumDstElts) {
524 ScaledMask.
assign(Mask.begin(), Mask.end());
528// Ensure we can find a whole scale factor. 529assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
530"Unexpected scaling factor");
532if (NumSrcElts > NumDstElts) {
533int Scale = NumSrcElts / NumDstElts;
537int Scale = NumDstElts / NumSrcElts;
544 std::array<SmallVector<int, 16>, 2> TmpMasks;
547for (
unsigned Scale = 2; Scale <= InputMask.
size(); ++Scale) {
557ArrayRef<int> Mask,
unsigned NumOfSrcRegs,
unsigned NumOfDestRegs,
558unsigned NumOfUsedRegs,
function_ref<
void()> NoInputAction,
563// Try to perform better estimation of the permutation. 564// 1. Split the source/destination vectors into real registers. 565// 2. Do the mask analysis to identify which real registers are 568unsigned SzDest = Sz / NumOfDestRegs;
569unsigned SzSrc = Sz / NumOfSrcRegs;
570for (
unsignedI = 0;
I < NumOfDestRegs; ++
I) {
571auto &RegMasks = Res[
I];
572 RegMasks.
assign(2 * NumOfSrcRegs, {});
573// Check that the values in dest registers are in the one src 575for (
unsigned K = 0; K < SzDest; ++K) {
576intIdx =
I * SzDest + K;
581int MaskIdx = Mask[
Idx] % Sz;
582int SrcRegIdx = MaskIdx / SzSrc + (Mask[
Idx] >= Sz ? NumOfSrcRegs : 0);
583// Add a cost of PermuteTwoSrc for each new source register permute, 584// if we have more than one source registers. 585if (RegMasks[SrcRegIdx].empty())
587 RegMasks[SrcRegIdx][K] = MaskIdx % SzSrc;
590// Process split mask. 591for (
unsignedI : seq<unsigned>(NumOfUsedRegs)) {
597// No input vectors were used! 601// Find the only mask with at least single undef mask elem. 604unsigned SrcReg = std::distance(Dest.begin(), It);
605 SingleInputAction(*It, SrcReg,
I);
609// The first mask is a permutation of a single register. Since we have >2 610// input registers to shuffle, we merge the masks for 2 first registers 611// and generate a shuffle of 2 registers rather than the reordering of the 612// first register and then shuffle with the second register. Next, 613// generate the shuffles of the resulting register + the remaining 614// registers from the list. 620"Expected undefined mask element.");
621 FirstMask[
Idx] = SecondMask[
Idx] + VF;
637for (
unsignedI : seq<unsigned>(2 * NumOfSrcRegs)) {
642if (FirstIdx == SecondIdx) {
648 SecondMask = RegMask;
649 CombineMasks(FirstMask, SecondMask);
650 ManyInputsAction(FirstMask, FirstIdx, SecondIdx, NewReg);
652 NormalizeMask(FirstMask);
654 SecondMask = FirstMask;
655 SecondIdx = FirstIdx;
657if (FirstIdx != SecondIdx && SecondIdx >= 0) {
658 CombineMasks(SecondMask, FirstMask);
659 ManyInputsAction(SecondMask, SecondIdx, FirstIdx, NewReg);
661 Dest[FirstIdx].clear();
662 NormalizeMask(SecondMask);
664 }
while (SecondIdx >= 0);
672constAPInt &DemandedElts,
675assert(VectorBitWidth >= 128 &&
"Vectors smaller than 128 bit not supported");
676int NumLanes = VectorBitWidth / 128;
678int NumEltsPerLane = NumElts / NumLanes;
679int HalfEltsPerLane = NumEltsPerLane / 2;
684// Map DemandedElts to the horizontal operands. 686if (!DemandedElts[
Idx])
688int LaneIdx = (
Idx / NumEltsPerLane) * NumEltsPerLane;
689int LocalIdx =
Idx % NumEltsPerLane;
690if (LocalIdx < HalfEltsPerLane) {
691 DemandedLHS.
setBit(LaneIdx + 2 * LocalIdx);
693 LocalIdx -= HalfEltsPerLane;
694 DemandedRHS.
setBit(LaneIdx + 2 * LocalIdx);
703// DemandedBits will give us every value's live-out bits. But we want 704// to ensure no extra casts would need to be inserted, so every DAG 705// of connected values must have the same minimum bitwidth. 714// Determine the roots. We work bottom-up, from truncs or icmps. 715bool SeenExtFromIllegalType =
false;
720if (
TTI && (isa<ZExtInst>(&
I) || isa<SExtInst>(&
I)) &&
722 SeenExtFromIllegalType =
true;
724// Only deal with non-vector integers up to 64-bits wide. 725if ((isa<TruncInst>(&
I) || isa<ICmpInst>(&
I)) &&
726 !
I.getType()->isVectorTy() &&
727I.getOperand(0)->getType()->getScalarSizeInBits() <= 64) {
728// Don't make work for ourselves. If we know the loaded type is legal, 729// don't add it to the worklist. 738if (Worklist.
empty() || (
TTI && !SeenExtFromIllegalType))
741// Now proceed breadth-first, unioning values together. 742while (!Worklist.
empty()) {
746if (!Visited.
insert(Val).second)
749// Non-instructions terminate a chain successfully. 750if (!isa<Instruction>(Val))
754// If we encounter a type that is larger than 64 bits, we can't represent 756if (DB.getDemandedBits(
I).getBitWidth() > 64)
759uint64_t V = DB.getDemandedBits(
I).getZExtValue();
763// Casts, loads and instructions outside of our range terminate a chain 765if (isa<SExtInst>(
I) || isa<ZExtInst>(
I) || isa<LoadInst>(
I) ||
769// Unsafe casts terminate a chain unsuccessfully. We can't do anything 770// useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to 771// transform anything that relies on them. 772if (isa<BitCastInst>(
I) || isa<PtrToIntInst>(
I) || isa<IntToPtrInst>(
I) ||
773 !
I->getType()->isIntegerTy()) {
774 DBits[Leader] |= ~0ULL;
778// We don't modify the types of PHIs. Reductions will already have been 779// truncated if possible, and inductions' sizes will have been chosen by 784if (DBits[Leader] == ~0ULL)
785// All bits demanded, no point continuing. 788for (
Value *O : cast<User>(
I)->operands()) {
794// Now we've discovered all values, walk them to see if there are 795// any users we didn't see. If there are, we can't optimize that 798for (
auto *U :
I.first->users())
799if (U->getType()->isIntegerTy() && DBits.
count(U) == 0)
802for (
autoI = ECs.
begin(), E = ECs.
end();
I != E; ++
I) {
805 LeaderDemandedBits |= DBits[M];
808// Round up to a power of 2 811// We don't modify the types of PHIs. Reductions will already have been 812// truncated if possible, and inductions' sizes will have been chosen by 814// If we are required to shrink a PHI, abandon this entire equivalence class. 825auto *
MI = dyn_cast<Instruction>(M);
828Type *Ty = M->getType();
830 Ty =
MI->getOperand(0)->getType();
835// If any of M's operands demand more bits than MinBW then M cannot be 836// performed safely in MinBW. 838 auto *CI = dyn_cast<ConstantInt>(U);
839// For constants shift amounts, check if the shift would result in 842 isa<ShlOperator, LShrOperator, AShrOperator>(U.getUser()) &&
843 U.getOperandNo() == 1)
844 return CI->uge(MinBW);
845 uint64_t BW = bit_width(DB.getDemandedBits(&U).getZExtValue());
846 return bit_ceil(BW) > MinBW;
857/// Add all access groups in @p AccGroups to @p List. 858template <
typename ListT>
860// Interpret an access group as a list containing itself. 863List.insert(AccGroups);
867for (
constauto &AccGroupListOp : AccGroups->
operands()) {
868auto *Item = cast<MDNode>(AccGroupListOp.get());
879if (AccGroups1 == AccGroups2)
886if (Union.size() == 0)
888if (Union.size() == 1)
889return cast<MDNode>(Union.front());
900if (!MayAccessMem1 && !MayAccessMem2)
903return Inst2->
getMetadata(LLVMContext::MD_access_group);
905return Inst1->
getMetadata(LLVMContext::MD_access_group);
914// Use set for scalable 'contains' check. 921if (AccGroupSet2.
count(MD1))
925auto *Item = cast<MDNode>(Node.get());
927if (AccGroupSet2.
count(Item))
932if (Intersection.
size() == 0)
934if (Intersection.
size() == 1)
935return cast<MDNode>(Intersection.
front());
941/// \returns \p I after propagating metadata from \p VL. 949for (
auto Kind : {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
950 LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
951 LLVMContext::MD_nontemporal, LLVMContext::MD_invariant_load,
952 LLVMContext::MD_access_group, LLVMContext::MD_mmra}) {
954for (
int J = 1, E = VL.
size(); MD && J != E; ++J) {
959case LLVMContext::MD_mmra: {
963case LLVMContext::MD_tbaa:
966case LLVMContext::MD_alias_scope:
969case LLVMContext::MD_fpmath:
972case LLVMContext::MD_noalias:
973case LLVMContext::MD_nontemporal:
974case LLVMContext::MD_invariant_load:
977case LLVMContext::MD_access_group:
994// All 1's means mask is not needed. 998// TODO: support reversed access. 1002for (
unsigned i = 0; i < VF; i++)
1003for (
unsigned j = 0; j < Group.
getFactor(); ++j) {
1004unsigned HasMember = Group.
getMember(j) ? 1 : 0;
1005 Mask.push_back(Builder.
getInt1(HasMember));
1014for (
unsigned i = 0; i < VF; i++)
1015for (
unsigned j = 0; j < ReplicationFactor; j++)
1024for (
unsigned i = 0; i < VF; i++)
1025for (
unsigned j = 0; j < NumVecs; j++)
1026 Mask.push_back(j * VF + i);
1034for (
unsigned i = 0; i < VF; i++)
1035 Mask.push_back(Start + i * Stride);
1042unsigned NumUndefs) {
1044for (
unsigned i = 0; i < NumInts; i++)
1045 Mask.push_back(Start + i);
1047for (
unsigned i = 0; i < NumUndefs; i++)
1055// Avoid casts in the loop and make sure we have a reasonable number. 1056int NumEltsSigned = NumElts;
1057assert(NumEltsSigned > 0 &&
"Expected smaller or non-zero element count");
1059// If the mask chooses an element from operand 1, reduce it to choose from the 1060// corresponding element of operand 0. Undef mask elements are unchanged. 1062for (
int MaskElt : Mask) {
1063assert((MaskElt < NumEltsSigned * 2) &&
"Expected valid shuffle mask");
1064int UnaryElt = MaskElt >= NumEltsSigned ? MaskElt - NumEltsSigned : MaskElt;
1070/// A helper function for concatenating vectors. This function concatenates two 1071/// vectors having the same element type. If the second vector has fewer 1072/// elements than the first, it is padded with undefs. 1076VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
1077assert(VecTy1 && VecTy2 &&
1078 VecTy1->getScalarType() == VecTy2->getScalarType() &&
1079"Expect two vectors with the same element type");
1081unsigned NumElts1 = cast<FixedVectorType>(VecTy1)->getNumElements();
1082unsigned NumElts2 = cast<FixedVectorType>(VecTy2)->getNumElements();
1083assert(NumElts1 >= NumElts2 &&
"Unexpect the first vector has less elements");
1085if (NumElts1 > NumElts2) {
1086// Extend with UNDEFs. 1097unsigned NumVecs = Vecs.
size();
1098assert(NumVecs > 1 &&
"Should be at least two vectors");
1104for (
unsigned i = 0; i < NumVecs - 1; i += 2) {
1105Value *V0 = ResList[i], *V1 = ResList[i + 1];
1106assert((V0->
getType() == V1->getType() || i == NumVecs - 2) &&
1107"Only the last vector may have a different type");
1112// Push the last vector if the total number of vectors is odd. 1113if (NumVecs % 2 != 0)
1114 TmpList.
push_back(ResList[NumVecs - 1]);
1117 NumVecs = ResList.
size();
1118 }
while (NumVecs > 1);
1124assert(isa<VectorType>(Mask->getType()) &&
1125 isa<IntegerType>(Mask->getType()->getScalarType()) &&
1126 cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
1128"Mask must be a vector of i1");
1130auto *ConstMask = dyn_cast<Constant>(Mask);
1133if (ConstMask->isNullValue() || isa<UndefValue>(ConstMask))
1135if (isa<ScalableVectorType>(ConstMask->getType()))
1139 E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
1141if (
auto *MaskElt = ConstMask->getAggregateElement(
I))
1142if (MaskElt->isNullValue() || isa<UndefValue>(MaskElt))
1150assert(isa<VectorType>(Mask->getType()) &&
1151 isa<IntegerType>(Mask->getType()->getScalarType()) &&
1152 cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
1154"Mask must be a vector of i1");
1156auto *ConstMask = dyn_cast<Constant>(Mask);
1159if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
1161if (isa<ScalableVectorType>(ConstMask->getType()))
1165 E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
1167if (
auto *MaskElt = ConstMask->getAggregateElement(
I))
1168if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
1176assert(isa<VectorType>(Mask->getType()) &&
1177 isa<IntegerType>(Mask->getType()->getScalarType()) &&
1178 cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
1180"Mask must be a vector of i1");
1182auto *ConstMask = dyn_cast<Constant>(Mask);
1185if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
1187if (isa<ScalableVectorType>(ConstMask->getType()))
1191 E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
1193if (
auto *MaskElt = ConstMask->getAggregateElement(
I))
1194if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
1200/// TODO: This is a lot like known bits, but for 1201/// vectors. Is there something we can common this with? 1203assert(isa<FixedVectorType>(Mask->getType()) &&
1204 isa<IntegerType>(Mask->getType()->getScalarType()) &&
1205 cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
1207"Mask must be a fixed width vector of i1");
1209constunsigned VWidth =
1210 cast<FixedVectorType>(Mask->getType())->getNumElements();
1212if (
auto *CV = dyn_cast<ConstantVector>(Mask))
1213for (
unsigned i = 0; i < VWidth; i++)
1214if (CV->getAggregateElement(i)->isNullValue())
1219bool InterleavedAccessInfo::isStrided(
int Stride) {
1220unsigned Factor = std::abs(Stride);
1224void InterleavedAccessInfo::collectConstStrideAccesses(
1229// Since it's desired that the load/store instructions be maintained in 1230// "program order" for the interleaved access analysis, we have to visit the 1231// blocks in the loop in reverse postorder (i.e., in a topological order). 1232// Such an ordering will ensure that any load/store that may be executed 1233// before a second load/store will precede the second load/store in 1238for (
auto &
I : *BB) {
1244// Currently, codegen doesn't support cases where the type size doesn't 1245// match the alloc size. Skip them for now. 1247if (
Size * 8 !=
DL.getTypeSizeInBits(ElementTy))
1250// We don't check wrapping here because we don't know yet if Ptr will be 1251// part of a full group or a group with gaps. Checking wrapping for all 1252// pointers (even those that end up in groups with no gaps) will be overly 1253// conservative. For full groups, wrapping should be ok since if we would 1254// wrap around the address space we would do a memory access at nullptr 1255// even without the transformation. The wrapping checks are therefore 1256// deferred until after we've formed the interleaved groups. 1259/*Assume=*/true,
/*ShouldCheckWrap=*/false).value_or(0);
1262 AccessStrideInfo[&
I] = StrideDescriptor(Stride, Scev,
Size,
1267// Analyze interleaved accesses and collect them into interleaved load and 1270// When generating code for an interleaved load group, we effectively hoist all 1271// loads in the group to the location of the first load in program order. When 1272// generating code for an interleaved store group, we sink all stores to the 1273// location of the last store. This code motion can change the order of load 1274// and store instructions and may break dependences. 1276// The code generation strategy mentioned above ensures that we won't violate 1277// any write-after-read (WAR) dependences. 1279// E.g., for the WAR dependence: a = A[i]; // (1) 1282// The store group of (2) is always inserted at or below (2), and the load 1283// group of (1) is always inserted at or above (1). Thus, the instructions will 1284// never be reordered. All other dependences are checked to ensure the 1285// correctness of the instruction reordering. 1287// The algorithm visits all memory accesses in the loop in bottom-up program 1288// order. Program order is established by traversing the blocks in the loop in 1289// reverse postorder when collecting the accesses. 1291// We visit the memory accesses in bottom-up order because it can simplify the 1292// construction of store groups in the presence of write-after-write (WAW) 1295// E.g., for the WAW dependence: A[i] = a; // (1) 1297// A[i + 1] = c; // (3) 1299// We will first create a store group with (3) and (2). (1) can't be added to 1300// this group because it and (2) are dependent. However, (1) can be grouped 1301// with other accesses that may precede it in program order. Note that a 1302// bottom-up order does not imply that WAW dependences should not be checked. 1304bool EnablePredicatedInterleavedMemAccesses) {
1308// Holds all accesses with a constant stride. 1310 collectConstStrideAccesses(AccessStrideInfo, Strides);
1312if (AccessStrideInfo.
empty())
1315// Collect the dependences in the loop. 1316 collectDependences();
1318// Holds all interleaved store groups temporarily. 1320// Holds all interleaved load groups temporarily. 1322// Groups added to this set cannot have new members added. 1325// Search in bottom-up program order for pairs of accesses (A and B) that can 1326// form interleaved load or store groups. In the algorithm below, access A 1327// precedes access B in program order. We initialize a group for B in the 1328// outer loop of the algorithm, and then in the inner loop, we attempt to 1329// insert each A into B's group if: 1331// 1. A and B have the same stride, 1332// 2. A and B have the same memory object size, and 1333// 3. A belongs in B's group according to its distance from B. 1335// Special care is taken to ensure group formation will not break any 1337for (
auto BI = AccessStrideInfo.
rbegin(), E = AccessStrideInfo.
rend();
1340 StrideDescriptor DesB = BI->second;
1342// Initialize a group for B if it has an allowable stride. Even if we don't 1343// create a group for B, we continue with the bottom-up algorithm to ensure 1344// we don't break any of B's dependences. 1346if (isStrided(DesB.Stride) &&
1347 (!isPredicated(
B->getParent()) || EnablePredicatedInterleavedMemAccesses)) {
1352 GroupB = createInterleaveGroup(
B, DesB.Stride, DesB.Alignment);
1353if (
B->mayWriteToMemory())
1354 StoreGroups.
insert(GroupB);
1356 LoadGroups.
insert(GroupB);
1360for (
auto AI = std::next(BI); AI != E; ++AI) {
1362 StrideDescriptor DesA = AI->second;
1364// Our code motion strategy implies that we can't have dependences 1365// between accesses in an interleaved group and other accesses located 1366// between the first and last member of the group. Note that this also 1367// means that a group can't have more than one member at a given offset. 1368// The accesses in a group can have dependences with other accesses, but 1369// we must ensure we don't extend the boundaries of the group such that 1370// we encompass those dependent accesses. 1372// For example, assume we have the sequence of accesses shown below in a 1375// (1, 2) is a group | A[i] = a; // (1) 1376// | A[i-1] = b; // (2) | 1377// A[i-3] = c; // (3) 1378// A[i] = d; // (4) | (2, 4) is not a group 1380// Because accesses (2) and (3) are dependent, we can group (2) with (1) 1381// but not with (4). If we did, the dependent access (3) would be within 1382// the boundaries of the (2, 4) group. 1387if (MemberOfGroupB && !canReorderMemAccessesForInterleavedGroups(
1388A, &*AccessStrideInfo.
find(MemberOfGroupB)))
1389return MemberOfGroupB;
1395// If A is a load, dependencies are tolerable, there's nothing to do here. 1396// If both A and B belong to the same (store) group, they are independent, 1397// even if dependencies have not been recorded. 1398// If both GroupA and GroupB are null, there's nothing to do here. 1399if (
A->mayWriteToMemory() && GroupA != GroupB) {
1401// If GroupB is a load group, we have to compare AI against all 1402// members of GroupB because if any load within GroupB has a dependency 1403// on AI, we need to mark GroupB as complete and also release the 1404// store GroupA (if A belongs to one). The former prevents incorrect 1405// hoisting of load B above store A while the latter prevents incorrect 1406// sinking of store A below load B. 1407if (GroupB && LoadGroups.
contains(GroupB))
1408 DependentInst = DependentMember(GroupB, &*AI);
1409elseif (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI))
1413// A has a store dependence on B (or on some load within GroupB) and 1414// is part of a store group. Release A's group to prevent illegal 1415// sinking of A below B. A will then be free to form another group 1416// with instructions that precede it. 1417if (GroupA && StoreGroups.
contains(GroupA)) {
1419"dependence between " 1420 << *
A <<
" and " << *DependentInst <<
'\n');
1421 StoreGroups.
remove(GroupA);
1422 releaseGroup(GroupA);
1424// If B is a load and part of an interleave group, no earlier loads 1425// can be added to B's interleave group, because this would mean the 1426// DependentInst would move across store A. Mark the interleave group 1428if (GroupB && LoadGroups.
contains(GroupB)) {
1430 <<
" as complete.\n");
1431 CompletedLoadGroups.
insert(GroupB);
1435if (CompletedLoadGroups.
contains(GroupB)) {
1436// Skip trying to add A to B, continue to look for other conflicting A's 1437// in groups to be released. 1441// At this point, we've checked for illegal code motion. If either A or B 1442// isn't strided, there's nothing left to do. 1443if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
1446// Ignore A if it's already in a group or isn't the same kind of memory 1448// Note that mayReadFromMemory() isn't mutually exclusive to 1449// mayWriteToMemory in the case of atomic loads. We shouldn't see those 1450// here, canVectorizeMemory() should have returned false - except for the 1451// case we asked for optimization remarks. 1453 (
A->mayReadFromMemory() !=
B->mayReadFromMemory()) ||
1454 (
A->mayWriteToMemory() !=
B->mayWriteToMemory()))
1457// Check rules 1 and 2. Ignore A if its stride or size is different from 1459if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
1462// Ignore A if the memory object of A and B don't belong to the same 1467// Calculate the distance from A to B. 1474// Check rule 3. Ignore A if its distance to B is not a multiple of the 1476if (DistanceToB %
static_cast<int64_t
>(DesB.Size))
1479// All members of a predicated interleave-group must have the same predicate, 1480// and currently must reside in the same BB. 1483if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
1484 (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
1487// The index of A is the index of B plus A's distance to B in multiples 1490 GroupB->
getIndex(
B) + DistanceToB /
static_cast<int64_t
>(DesB.Size);
1492// Try to insert A into B's group. 1495 <<
" into the interleave group with" << *
B 1497 InterleaveGroupMap[
A] = GroupB;
1499// Set the first load in program order as the insert position. 1500if (
A->mayReadFromMemory())
1503 }
// Iteration over A accesses. 1504 }
// Iteration over B accesses. 1508constchar *FirstOrLast) ->
bool {
1510assert(Member &&
"Group member does not exist");
1513if (
getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides,
1514/*Assume=*/false,
/*ShouldCheckWrap=*/true).value_or(0))
1516LLVM_DEBUG(
dbgs() <<
"LV: Invalidate candidate interleaved group due to " 1518 <<
" group member potentially pointer-wrapping.\n");
1519 releaseGroup(Group);
1523// Remove interleaved groups with gaps whose memory 1524// accesses may wrap around. We have to revisit the getPtrStride analysis, 1525// this time with ShouldCheckWrap=true, since collectConstStrideAccesses does 1526// not check wrapping (see documentation there). 1527// FORNOW we use Assume=false; 1528// TODO: Change to Assume=true but making sure we don't exceed the threshold 1529// of runtime SCEV assumptions checks (thereby potentially failing to 1530// vectorize altogether). 1531// Additional optional optimizations: 1532// TODO: If we are peeling the loop and we know that the first pointer doesn't 1533// wrap then we can deduce that all pointers in the group don't wrap. 1534// This means that we can forcefully peel the loop in order to only have to 1535// check the first pointer for no-wrap. When we'll change to use Assume=true 1536// we'll only need at most one runtime check per interleaved group. 1537for (
auto *Group : LoadGroups) {
1538// Case 1: A full group. Can Skip the checks; For full groups, if the wide 1539// load would wrap around the address space we would do a memory access at 1540// nullptr even without the transformation. 1544// Case 2: If first and last members of the group don't wrap this implies 1545// that all the pointers in the group don't wrap. 1546// So we check only group member 0 (which is always guaranteed to exist), 1547// and group member Factor - 1; If the latter doesn't exist we rely on 1548// peeling (if it is a non-reversed access -- see Case 3). 1549if (InvalidateGroupIfMemberMayWrap(Group, 0,
"first"))
1552 InvalidateGroupIfMemberMayWrap(Group, Group->
getFactor() - 1,
"last");
1554// Case 3: A non-reversed interleaved load group with gaps: We need 1555// to execute at least one scalar epilogue iteration. This will ensure 1556// we don't speculatively access memory out-of-bounds. We only need 1557// to look for a member at index factor - 1, since every group must have 1558// a member at index zero. 1561dbgs() <<
"LV: Invalidate candidate interleaved group due to " 1562"a reverse access with gaps.\n");
1563 releaseGroup(Group);
1567dbgs() <<
"LV: Interleaved group requires epilogue iteration.\n");
1568 RequiresScalarEpilogue =
true;
1572for (
auto *Group : StoreGroups) {
1573// Case 1: A full group. Can Skip the checks; For full groups, if the wide 1574// store would wrap around the address space we would do a memory access at 1575// nullptr even without the transformation. 1579// Interleave-store-group with gaps is implemented using masked wide store. 1580// Remove interleaved store groups with gaps if 1581// masked-interleaved-accesses are not enabled by the target. 1582if (!EnablePredicatedInterleavedMemAccesses) {
1584dbgs() <<
"LV: Invalidate candidate interleaved store group due " 1586 releaseGroup(Group);
1590// Case 2: If first and last members of the group don't wrap this implies 1591// that all the pointers in the group don't wrap. 1592// So we check only group member 0 (which is always guaranteed to exist), 1593// and the last group member. Case 3 (scalar epilog) is not relevant for 1594// stores with gaps, which are implemented with masked-store (rather than 1595// speculative access, as in loads). 1596if (InvalidateGroupIfMemberMayWrap(Group, 0,
"first"))
1598for (
int Index = Group->
getFactor() - 1; Index > 0; Index--)
1600 InvalidateGroupIfMemberMayWrap(Group, Index,
"last");
1607// If no group had triggered the requirement to create an epilogue loop, 1608// there is nothing to do. 1612// Release groups requiring scalar epilogues. Note that this also removes them 1613// from InterleaveGroups. 1614bool ReleasedGroup = InterleaveGroups.remove_if([&](
auto *Group) {
1615if (!Group->requiresScalarEpilogue())
1619 <<
"LV: Invalidate candidate interleaved group due to gaps that " 1620"require a scalar epilogue (not allowed under optsize) and cannot " 1621"be masked (not enabled). \n");
1622 releaseGroupWithoutRemovingFromSet(Group);
1625assert(ReleasedGroup &&
"At least one group must be invalidated, as a " 1626"scalar epilogue was required");
1627 (void)ReleasedGroup;
1628 RequiresScalarEpilogue =
false;
1631template <
typename InstT>
1640 std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
1641 [](std::pair<int, Instruction *> p) { return p.second; });
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
DenseMap< Block *, BlockRelaxAux > Blocks
Generic implementation of equivalence classes through the use Tarjan's efficient union-find algorithm...
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This file provides utility for Memory Model Relaxation Annotations (MMRAs).
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
This pass exposes codegen information to IR-level passes.
static Value * concatenateTwoVectors(IRBuilderBase &Builder, Value *V1, Value *V2)
A helper function for concatenating vectors.
static cl::opt< unsigned > MaxInterleaveGroupFactor("max-interleave-group-factor", cl::Hidden, cl::desc("Maximum factor for an interleaved access group (default = 8)"), cl::init(8))
Maximum factor for an interleaved memory access.
static void addToAccessGroupList(ListT &List, MDNode *AccGroups)
Add all access groups in AccGroups to List.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
unsigned getBitWidth() const
Return the number of bits in the APInt.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
int64_t getSExtValue() const
Get sign extended value.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
LLVM Basic Block Representation.
const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
This class represents a function call, abstracting a target machine's calling convention.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
EquivalenceClasses - This represents a collection of equivalence classes and supports three efficient...
const ElemTy & getOrInsertLeaderValue(const ElemTy &V)
getOrInsertLeaderValue - Return the leader for the specified value that is in the set.
member_iterator member_end() const
member_iterator member_begin(iterator I) const
member_iterator unionSets(const ElemTy &V1, const ElemTy &V2)
union - Merge the two equivalence sets for the specified values, inserting them if they do not alread...
Common base class shared among various IRBuilders.
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
This instruction inserts a single (scalar) element into a VectorType value.
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void getAllMetadataOtherThanDebugLoc(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
This does the same thing as getAllMetadata, except that it filters out the debug location.
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getIndex(const InstTy *Instr) const
Get the index for the given member.
void setInsertPos(InstTy *Inst)
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
bool insertMember(InstTy *Instr, int32_t Index, Align NewAlign)
Try to insert a new member Instr with index Index and alignment NewAlign.
uint32_t getNumMembers() const
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
This is an important class for using LLVM in a threaded context.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
BlockT * getHeader() const
Store the result of a depth first search within basic blocks contained by a single loop.
static MDNode * getMostGenericAliasScope(MDNode *A, MDNode *B)
static MDNode * getMostGenericTBAA(MDNode *A, MDNode *B)
ArrayRef< MDOperand > operands() const
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
static MDNode * getMostGenericFPMath(MDNode *A, MDNode *B)
unsigned getNumOperands() const
Return number of MDNode operands.
static MDNode * intersect(MDNode *A, MDNode *B)
LLVMContext & getContext() const
Tracking metadata reference owned by Metadata.
static MDNode * combine(LLVMContext &Ctx, const MMRAMetadata &A, const MMRAMetadata &B)
Combines A and B according to MMRA semantics.
This class implements a map that also provides access to all stored values in a deterministic order.
iterator find(const KeyT &Key)
reverse_iterator rbegin()
Root of the metadata hierarchy.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents a constant integer value.
const APInt & getAPInt() const
This class represents an analyzed expression in the program.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
bool remove(const value_type &X)
Remove an item from the set vector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
bool isTargetIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx) const
Identifies if the vector form of the intrinsic that returns a struct is overloaded at the struct elem...
bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const
bool isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx) const
Identifies if the vector form of the intrinsic has a scalar operand.
bool isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
static bool isVPCast(Intrinsic::ID ID)
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
LLVMContext & getContext() const
All values hold a context through their type.
Base class of all SIMD vector types.
Type * getElementType() const
An efficient, type-erasing, non-owning reference to a callable.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
bool isTargetIntrinsic(ID IID)
isTargetIntrinsic - Returns true if IID is an intrinsic specific to a certain target.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
bool match(Val *V, const Pattern &P)
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
bool isTriviallyScalarizable(Intrinsic::ID ID, const TargetTransformInfo *TTI)
Identify if the intrinsic is trivially scalarizable.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
APInt possiblyDemandedEltsInMask(Value *Mask)
Given a mask vector of the form <Y x i1>, return an APInt (of bitwidth Y) for each lane which may be ...
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
llvm::SmallVector< int, 16 > createUnaryMask(ArrayRef< int > Mask, unsigned NumElts)
Given a shuffle mask for a binary shuffle, create the equivalent shuffle mask assuming both operands ...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
MDNode * intersectAccessGroups(const Instruction *Inst1, const Instruction *Inst2)
Compute the access-group list of access groups that Inst1 and Inst2 are both in.
unsigned M1(unsigned Val)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
constexpr unsigned MaxAnalysisRecursionDepth
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
bool maskIsAllOneOrUndef(Value *Mask)
Given a mask vector of i1, Return true if all of the elements of this predicate mask are known to be ...
constexpr int PoisonMaskElem
bool isValidAsAccessGroup(MDNode *AccGroup)
Return whether an MDNode might represent an access group.
Intrinsic::ID getIntrinsicForCallSite(const CallBase &CB, const TargetLibraryInfo *TLI)
Map a call instruction to an intrinsic ID.
bool isVectorIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic that returns a struct is overloaded at the struct elem...
void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &PtrToStride, Value *Ptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one,...
Value * findScalarElement(Value *V, unsigned EltNo)
Given a vector and an element number, see if the scalar value is already around as a register,...
MDNode * uniteAccessGroups(MDNode *AccGroups1, MDNode *AccGroups2)
Compute the union of two access-group lists.
unsigned M0(unsigned Val)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
bool maskIsAllZeroOrUndef(Value *Mask)
Given a mask vector of i1, Return true if all of the elements of this predicate mask are known to be ...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
void getShuffleMaskWithWidestElts(ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Repetitively apply widenShuffleMaskElts() for as long as it succeeds, to get the shuffle mask with wi...
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
bool maskContainsAllOneOrUndef(Value *Mask)
Given a mask vector of i1, Return true if any of the elements of this predicate mask are known to be ...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.