//===- ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass replaces masked memory intrinsics - when unsupported by the target
// - with a chain of basic blocks that deals with the elements one by one if
// the appropriate mask bit is set.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "scalarize-masked-mem-intrin"

namespace {

class ScalarizeMaskedMemIntrinLegacyPass : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  StringRef getPassName() const override {
    return "Scalarize Masked Memory Intrinsics";
  }
};

} // end anonymous namespace

char ScalarizeMaskedMemIntrinLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(ScalarizeMaskedMemIntrinLegacyPass, DEBUG_TYPE,
                      "Scalarize unsupported masked memory intrinsics", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(ScalarizeMaskedMemIntrinLegacyPass, DEBUG_TYPE,
                    "Scalarize unsupported masked memory intrinsics", false,
                    false)

FunctionPass *llvm::createScalarizeMaskedMemIntrinLegacyPass() {
  return new ScalarizeMaskedMemIntrinLegacyPass();
}
static bool isConstantIntVector(Value *Mask) {
  Constant *C = dyn_cast<Constant>(Mask);
  if (!C)
    return false;
  unsigned NumElts = cast<FixedVectorType>(Mask->getType())->getNumElements();
  for (unsigned i = 0; i != NumElts; ++i) {
    Constant *CElt = C->getAggregateElement(i);
    if (!CElt || !isa<ConstantInt>(CElt))
      return false;
  }
  return true;
}

static unsigned adjustForEndian(const DataLayout &DL, unsigned VectorWidth,
                                unsigned Idx) {
  return DL.isBigEndian() ? VectorWidth - 1 - Idx : Idx;
}
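// For example (illustrative): with a 16-element mask on a big-endian target,
// element index 1 is tested via bit 16 - 1 - 1 == 14 of the bitcast mask; on a
// little-endian target the index is used unchanged.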
// Translate a masked load intrinsic like
// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
//                                <16 x i1> %mask, <16 x i32> %passthru)
// to a chain of basic blocks, loading the elements one by one if the
// appropriate mask bit is set
//
//  %1 = bitcast i8* %addr to i32*
//  %2 = extractelement <16 x i1> %mask, i32 0
//  br i1 %2, label %cond.load, label %else
//
// cond.load:                                        ; preds = %0
//  %3 = getelementptr i32* %1, i32 0
//  %4 = load i32* %3
//  %5 = insertelement <16 x i32> %passthru, i32 %4, i32 0
//  br label %else
//
// else:                                             ; preds = %0, %cond.load
//  %res.phi.else = phi <16 x i32> [ %5, %cond.load ], [ poison, %0 ]
//  %6 = extractelement <16 x i1> %mask, i32 1
//  br i1 %6, label %cond.load1, label %else2
//
// cond.load1:                                       ; preds = %else
//  %7 = getelementptr i32* %1, i32 1
//  %8 = load i32* %7
//  %9 = insertelement <16 x i32> %res.phi.else, i32 %8, i32 1
//  br label %else2
//
// else2:                                          ; preds = %else, %cond.load1
//  %res.phi.else3 = phi <16 x i32> [ %9, %cond.load1 ], [ %res.phi.else, %else ]
//  %10 = extractelement <16 x i1> %mask, i32 2
//  br i1 %10, label %cond.load4, label %else5
//
static void scalarizeMaskedLoad(const DataLayout &DL, bool HasBranchDivergence,
                                CallInst *CI, DomTreeUpdater *DTU,
                                bool &ModifiedDT) {
  const Align AlignVal = cast<ConstantInt>(Alignment)->getAlignValue();
  Type *EltTy = VecType->getElementType();

  // Short-cut if the mask is all-true.
  if (isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue()) {

  // Adjust alignment for the scalar instruction.
  const Align AdjustedAlignVal =
      commonAlignment(AlignVal, EltTy->getPrimitiveSizeInBits() / 8);
  unsigned VectorWidth = cast<FixedVectorType>(VecType)->getNumElements();
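  // For example (illustrative): a <16 x i32> masked load with align 16 gets
  // commonAlignment(16, 4) == 4 as the alignment for each scalar element load.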
  Value *VResult = Src0;

  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
      continue;
  // Optimize the case where the "masked load" is a predicated load - that is,
  // where the mask is the splat of a non-constant scalar boolean. In that
  // case, use that splatted value as the guard on a conditional vector load.
  if (isSplatValue(Mask, /*Index=*/0)) {
    Value *Predicate = Builder.CreateExtractElement(Mask, uint64_t(0),
                                                    Mask->getName() + ".first");
    Instruction *ThenTerm =
        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
                                  /*BranchWeights=*/nullptr, DTU);

    BasicBlock *CondBlock = ThenTerm->getParent();
    CondBlock->setName("cond.load");
    Load->copyMetadata(*CI);

    Phi->addIncoming(Load, CondBlock);
    Phi->addIncoming(Src0, IfBlock);
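    // The guarded load ends up looking like this (illustrative sketch):
    //   %mask.first = extractelement <16 x i1> %mask, i64 0
    //   br i1 %mask.first, label %cond.load, label %else
    // cond.load:
    //   %load = load <16 x i32>, ptr %addr
    //   br label %else
    // else:
    //   %res = phi <16 x i32> [ %load, %cond.load ], [ %passthru, %0 ]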
  // If the mask is not v1i1, use scalar bit test operations. This generates
  // better results on X86 at least. However, don't do this on GPUs and other
  // machines with divergence, as there each i1 needs a vector register.
  Value *SclrMask = nullptr;
  if (VectorWidth != 1 && !HasBranchDivergence) {
    Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
    SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
  }
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    // Fill the "else" block, created in the previous iteration
    //
    //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
    //  %mask_1 = and i16 %scalar_mask, i32 1 << Idx
    //  %cond = icmp ne i16 %mask_1, 0
    //  br i1 %mask_1, label %cond.load, label %else
    //
    //  %cond = extractelement %mask, Idx
    //
    if (SclrMask != nullptr) {
      Predicate = Builder.CreateICmpNE(
          Builder.CreateAnd(SclrMask,
                            Builder.getInt(APInt::getOneBitSet(
                                VectorWidth,
                                adjustForEndian(DL, VectorWidth, Idx)))),
          Builder.getIntN(VectorWidth, 0));
    }
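    // With a scalarized mask, the per-lane predicate looks like (illustrative,
    // for a <16 x i1> mask and Idx == 2; %scalar_mask is the i16 bitcast
    // created above):
    //   %bit  = and i16 %scalar_mask, 4          ; 1 << 2
    //   %cond = icmp ne i16 %bit, 0
    //   br i1 %cond, label %cond.load, label %else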
    // Create "cond" block
    //
    //  %EltAddr = getelementptr i32* %1, i32 0
    //  %Elt = load i32* %EltAddr
    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
    //
    Instruction *ThenTerm =
        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
                                  /*BranchWeights=*/nullptr, DTU);

    BasicBlock *CondBlock = ThenTerm->getParent();
    CondBlock->setName("cond.load");

    // Create "else" block, fill it in the next iteration
    IfBlock = NewIfBlock;

    // Create the phi to join the new and previous value.
    Phi->addIncoming(NewVResult, CondBlock);
    Phi->addIncoming(VResult, PrevIfBlock);
// Translate a masked store intrinsic, like
// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
//                         <16 x i1> %mask)
// to a chain of basic blocks, storing the elements one by one if
// the appropriate mask bit is set
//
//   %1 = bitcast i8* %addr to i32*
//   %2 = extractelement <16 x i1> %mask, i32 0
//   br i1 %2, label %cond.store, label %else
//
// cond.store:                                       ; preds = %0
//   %3 = extractelement <16 x i32> %val, i32 0
//   %4 = getelementptr i32* %1, i32 0
//   store i32 %3, i32* %4
//   br label %else
//
// else:                                             ; preds = %0, %cond.store
//   %5 = extractelement <16 x i1> %mask, i32 1
//   br i1 %5, label %cond.store1, label %else2
//
// cond.store1:                                      ; preds = %else
//   %6 = extractelement <16 x i32> %val, i32 1
//   %7 = getelementptr i32* %1, i32 1
//   store i32 %6, i32* %7
//   br label %else2
//
static void scalarizeMaskedStore(const DataLayout &DL,
                                 bool HasBranchDivergence, CallInst *CI,
                                 DomTreeUpdater *DTU, bool &ModifiedDT) {
  const Align AlignVal = cast<ConstantInt>(Alignment)->getAlignValue();
  auto *VecType = cast<VectorType>(Src->getType());

  Type *EltTy = VecType->getElementType();

  // Short-cut if the mask is all-true.
  if (isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue()) {
    Store->copyMetadata(*CI);
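    // With an all-true mask the intrinsic lowers to a single vector store
    // (illustrative):
    //   store <16 x i32> %val, ptr %addr, align 4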
  // Adjust alignment for the scalar instruction.
  const Align AdjustedAlignVal =
      commonAlignment(AlignVal, EltTy->getPrimitiveSizeInBits() / 8);
  unsigned VectorWidth = cast<FixedVectorType>(VecType)->getNumElements();
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
      continue;
  // Optimize the case where the "masked store" is a predicated store - that
  // is, when the mask is the splat of a non-constant scalar boolean. In that
  // case, optimize to a conditional store.
  if (isSplatValue(Mask, /*Index=*/0)) {
    Value *Predicate = Builder.CreateExtractElement(Mask, uint64_t(0),
                                                    Mask->getName() + ".first");
    Instruction *ThenTerm =
        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
                                  /*BranchWeights=*/nullptr, DTU);

    BasicBlock *CondBlock = ThenTerm->getParent();
    CondBlock->setName("cond.store");
    Store->copyMetadata(*CI);
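    // The guarded store ends up looking like this (illustrative sketch):
    //   %mask.first = extractelement <16 x i1> %mask, i64 0
    //   br i1 %mask.first, label %cond.store, label %else
    // cond.store:
    //   store <16 x i32> %val, ptr %addr
    //   br label %else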
  // If the mask is not v1i1, use scalar bit test operations. This generates
  // better results on X86 at least. However, don't do this on GPUs or other
  // machines with branch divergence, as there each i1 takes up a register.
  Value *SclrMask = nullptr;
  if (VectorWidth != 1 && !HasBranchDivergence) {
    Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
    SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
  }
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    // Fill the "else" block, created in the previous iteration
    //
    //  %mask_1 = and i16 %scalar_mask, i32 1 << Idx
    //  %cond = icmp ne i16 %mask_1, 0
    //  br i1 %mask_1, label %cond.store, label %else
    //
    //  %cond = extractelement %mask, Idx
    //
    if (SclrMask != nullptr) {
      Predicate = Builder.CreateICmpNE(
          Builder.CreateAnd(SclrMask,
                            Builder.getInt(APInt::getOneBitSet(
                                VectorWidth,
                                adjustForEndian(DL, VectorWidth, Idx)))),
          Builder.getIntN(VectorWidth, 0));
    }

    // Create "cond" block
    //
    //  %OneElt = extractelement <16 x i32> %Src, i32 Idx
    //  %EltAddr = getelementptr i32* %1, i32 0
    //  store i32 %OneElt, i32* %EltAddr
    //
    Instruction *ThenTerm =
        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
                                  /*BranchWeights=*/nullptr, DTU);

    BasicBlock *CondBlock = ThenTerm->getParent();
    CondBlock->setName("cond.store");
    // Create "else" block, fill it in the next iteration

// Translate a masked gather intrinsic like
// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
//                                         <16 x i1> %Mask, <16 x i32> %Src)
// to a chain of basic blocks, loading the elements one by one if the
// appropriate mask bit is set
//
// %Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
// %Mask0 = extractelement <16 x i1> %Mask, i32 0
// br i1 %Mask0, label %cond.load, label %else
//
// cond.load:
// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
// %Load0 = load i32, i32* %Ptr0, align 4
// %Res0 = insertelement <16 x i32> poison, i32 %Load0, i32 0
// br label %else
//
// else:
// %res.phi.else = phi <16 x i32>[%Res0, %cond.load], [poison, %0]
// %Mask1 = extractelement <16 x i1> %Mask, i32 1
// br i1 %Mask1, label %cond.load1, label %else2
//
// cond.load1:
// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
// %Load1 = load i32, i32* %Ptr1, align 4
// %Res1 = insertelement <16 x i32> %res.phi.else, i32 %Load1, i32 1
// br label %else2
//
// %Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
// ret <16 x i32> %Result
static void scalarizeMaskedGather(const DataLayout &DL,
                                  bool HasBranchDivergence, CallInst *CI,
                                  DomTreeUpdater *DTU, bool &ModifiedDT) {
  auto *VecType = cast<FixedVectorType>(CI->getType());
  Type *EltTy = VecType->getElementType();

  MaybeAlign AlignVal = cast<ConstantInt>(Alignment)->getMaybeAlignValue();
  Value *VResult = Src0;
  unsigned VectorWidth = VecType->getNumElements();
  // Shorten the way if the mask is a vector of constants.
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
      continue;
  // If the mask is not v1i1, use scalar bit test operations. This generates
  // better results on X86 at least. However, don't do this on GPUs or other
  // machines with branch divergence, as there, each i1 takes up a register.
  Value *SclrMask = nullptr;
  if (VectorWidth != 1 && !HasBranchDivergence) {
    Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
    SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
  }
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    // Fill the "else" block, created in the previous iteration
    //
    //  %Mask1 = and i16 %scalar_mask, i32 1 << Idx
    //  %cond = icmp ne i16 %mask_1, 0
    //  br i1 %Mask1, label %cond.load, label %else
    //
    //  %cond = extractelement %mask, Idx
    //
    if (SclrMask != nullptr) {
      Predicate = Builder.CreateICmpNE(
          Builder.CreateAnd(SclrMask,
                            Builder.getInt(APInt::getOneBitSet(
                                VectorWidth,
                                adjustForEndian(DL, VectorWidth, Idx)))),
          Builder.getIntN(VectorWidth, 0));
    }

    // Create "cond" block
    //
    //  %EltAddr = getelementptr i32* %1, i32 0
    //  %Elt = load i32* %EltAddr
    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
    //
    Instruction *ThenTerm =
        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
                                  /*BranchWeights=*/nullptr, DTU);

    BasicBlock *CondBlock = ThenTerm->getParent();
    CondBlock->setName("cond.load");

    // Create "else" block, fill it in the next iteration
    IfBlock = NewIfBlock;

    // Create the phi to join the new and previous value.
    Phi->addIncoming(NewVResult, CondBlock);
    Phi->addIncoming(VResult, PrevIfBlock);
// Translate a masked scatter intrinsic, like
// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*> %Ptrs, i32 4,
//                                  <16 x i1> %Mask)
// to a chain of basic blocks, storing the elements one by one if
// the appropriate mask bit is set.
//
// %Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
// %Mask0 = extractelement <16 x i1> %Mask, i32 0
// br i1 %Mask0, label %cond.store, label %else
//
// cond.store:
// %Elt0 = extractelement <16 x i32> %Src, i32 0
// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
// store i32 %Elt0, i32* %Ptr0, align 4
// br label %else
//
// else:
// %Mask1 = extractelement <16 x i1> %Mask, i32 1
// br i1 %Mask1, label %cond.store1, label %else2
//
// cond.store1:
// %Elt1 = extractelement <16 x i32> %Src, i32 1
// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
// store i32 %Elt1, i32* %Ptr1, align 4
//
static void scalarizeMaskedScatter(const DataLayout &DL,
                                   bool HasBranchDivergence, CallInst *CI,
                                   DomTreeUpdater *DTU, bool &ModifiedDT) {
  auto *SrcFVTy = cast<FixedVectorType>(Src->getType());

  assert(
      isa<VectorType>(Ptrs->getType()) &&
      isa<PointerType>(cast<VectorType>(Ptrs->getType())->getElementType()) &&
      "Vector of pointers is expected in masked scatter intrinsic");

  MaybeAlign AlignVal = cast<ConstantInt>(Alignment)->getMaybeAlignValue();
  unsigned VectorWidth = SrcFVTy->getNumElements();
  // Shorten the way if the mask is a vector of constants.
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
      continue;
  // If the mask is not v1i1, use scalar bit test operations. This generates
  // better results on X86 at least.
  Value *SclrMask = nullptr;
  if (VectorWidth != 1 && !HasBranchDivergence) {
    Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
    SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
  }
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    // Fill the "else" block, created in the previous iteration
    //
    //  %Mask1 = and i16 %scalar_mask, i32 1 << Idx
    //  %cond = icmp ne i16 %mask_1, 0
    //  br i1 %Mask1, label %cond.store, label %else
    //
    //  %cond = extractelement %mask, Idx
    //
    if (SclrMask != nullptr) {
      Predicate = Builder.CreateICmpNE(
          Builder.CreateAnd(SclrMask,
                            Builder.getInt(APInt::getOneBitSet(
                                VectorWidth,
                                adjustForEndian(DL, VectorWidth, Idx)))),
          Builder.getIntN(VectorWidth, 0));
    }

    // Create "cond" block
    //
    //  %Elt1 = extractelement <16 x i32> %Src, i32 1
    //  %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
    //  store i32 %Elt1, i32* %Ptr1
    //
    Instruction *ThenTerm =
        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
                                  /*BranchWeights=*/nullptr, DTU);

    BasicBlock *CondBlock = ThenTerm->getParent();
    CondBlock->setName("cond.store");
    // Create "else" block, fill it in the next iteration

static void scalarizeMaskedExpandLoad(const DataLayout &DL,
                                      bool HasBranchDivergence, CallInst *CI,
                                      DomTreeUpdater *DTU, bool &ModifiedDT) {
  auto *VecType = cast<FixedVectorType>(CI->getType());
  Type *EltTy = VecType->getElementType();

  unsigned VectorWidth = VecType->getNumElements();

  Value *VResult = PassThru;

  // Adjust alignment for the scalar instruction.
  const Align AdjustedAlignment =

  // Shorten the way if the mask is a vector of constants.
  // Create a build_vector pattern, with loads/poisons as necessary and then
  // shuffle blend with the pass through value.
  unsigned MemIndex = 0;
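  // Illustrative result for a constant mask <i1 1, i1 0, i1 1, i1 0>: the two
  // active lanes load consecutive elements from the pointer (MemIndex 0 and
  // 1), while the inactive lanes keep the pass-through value via a final
  // shufflevector with mask <0, 5, 2, 7>.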
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue()) {
      ShuffleMask[Idx] = Idx + VectorWidth;
  // If the mask is not v1i1, use scalar bit test operations. This generates
  // better results on X86 at least. However, don't do this on GPUs or other
  // machines with branch divergence, as there, each i1 takes up a register.
  Value *SclrMask = nullptr;
  if (VectorWidth != 1 && !HasBranchDivergence) {
    Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
    SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
  }
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    // Fill the "else" block, created in the previous iteration
    //
    //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
    //  br i1 %mask_1, label %cond.load, label %else
    //
    //  %cond = extractelement %mask, Idx
    //
    if (SclrMask != nullptr) {
      Predicate = Builder.CreateICmpNE(
          Builder.CreateAnd(SclrMask,
                            Builder.getInt(APInt::getOneBitSet(
                                VectorWidth,
                                adjustForEndian(DL, VectorWidth, Idx)))),
          Builder.getIntN(VectorWidth, 0));
    }

    // Create "cond" block
    //
    //  %EltAddr = getelementptr i32* %1, i32 0
    //  %Elt = load i32* %EltAddr
    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
    //
    Instruction *ThenTerm =
        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
                                  /*BranchWeights=*/nullptr, DTU);

    BasicBlock *CondBlock = ThenTerm->getParent();
    CondBlock->setName("cond.load");

    // Move the pointer if there are more blocks to come.
    if ((Idx + 1) != VectorWidth)
      NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, 1);

    // Create "else" block, fill it in the next iteration
    IfBlock = NewIfBlock;

    // Create the phi to join the new and previous value.

    // Add a PHI for the pointer if this isn't the last iteration.
    if ((Idx + 1) != VectorWidth) {
static void scalarizeMaskedCompressStore(const DataLayout &DL,
                                         bool HasBranchDivergence,
                                         CallInst *CI, DomTreeUpdater *DTU,
                                         bool &ModifiedDT) {
  auto *VecType = cast<FixedVectorType>(Src->getType());

  Type *EltTy = VecType->getElementType();

  // Adjust alignment for the scalar instruction.
  const Align AdjustedAlignment =

  unsigned VectorWidth = VecType->getNumElements();
  // Shorten the way if the mask is a vector of constants.
  unsigned MemIndex = 0;
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
      continue;
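    // Illustrative result for a constant mask <i1 1, i1 0, i1 1, i1 0>: the
    // elements of lanes 0 and 2 are stored to consecutive locations
    // (MemIndex 0 and 1); masked-off lanes store nothing.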
  // If the mask is not v1i1, use scalar bit test operations. This generates
  // better results on X86 at least. However, don't do this on GPUs or other
  // machines with branch divergence, as there, each i1 takes up a register.
  Value *SclrMask = nullptr;
  if (VectorWidth != 1 && !HasBranchDivergence) {
    Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
    SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
  }
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    // Fill the "else" block, created in the previous iteration
    //
    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
    //  br i1 %mask_1, label %cond.store, label %else
    //
    //  %cond = extractelement %mask, Idx
    //
    if (SclrMask != nullptr) {
      Predicate = Builder.CreateICmpNE(
          Builder.CreateAnd(SclrMask,
                            Builder.getInt(APInt::getOneBitSet(
                                VectorWidth,
                                adjustForEndian(DL, VectorWidth, Idx)))),
          Builder.getIntN(VectorWidth, 0));
    }

    // Create "cond" block
    //
    //  %OneElt = extractelement <16 x i32> %Src, i32 Idx
    //  %EltAddr = getelementptr i32* %1, i32 0
    //  store i32 %OneElt, i32* %EltAddr
    //
    Instruction *ThenTerm =
        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
                                  /*BranchWeights=*/nullptr, DTU);

    BasicBlock *CondBlock = ThenTerm->getParent();
    CondBlock->setName("cond.store");

    // Move the pointer if there are more blocks to come.
    if ((Idx + 1) != VectorWidth)
      NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, 1);

    // Create "else" block, fill it in the next iteration
    IfBlock = NewIfBlock;

    // Add a PHI for the pointer if this isn't the last iteration.
    if ((Idx + 1) != VectorWidth) {
static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI,
                                           DomTreeUpdater *DTU,
                                           bool &ModifiedDT) {
  // If we extend histogram to return a result someday (like the updated
  // vector) then we'll need to support it here.
  auto *AddrType = cast<FixedVectorType>(Ptrs->getType());

  // FIXME: Do we need to add an alignment parameter to the intrinsic?
  unsigned VectorWidth = AddrType->getNumElements();

  // Shorten the way if the mask is a vector of constants.
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
      continue;

  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
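    // Each active lane performs a read-modify-write of its bucket
    // (illustrative; %Inc stands for the histogram increment operand):
    //   %Ptr = extractelement <8 x ptr> %Ptrs, i32 Idx
    //   %Old = load i32, ptr %Ptr
    //   %New = add i32 %Old, %Inc
    //   store i32 %New, ptr %Ptr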
    Instruction *ThenTerm =
        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
                                  /*BranchWeights=*/nullptr, DTU);

    BasicBlock *CondBlock = ThenTerm->getParent();
    CondBlock->setName("cond.histogram.update");
    // Create "else" block, fill it in the next iteration

static bool runImpl(Function &F, const TargetTransformInfo &TTI,
                    DominatorTree *DT) {
  std::optional<DomTreeUpdater> DTU;
  if (DT)
    DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy);

  bool EverMadeChange = false;
  bool MadeChange = true;
  auto &DL = F.getDataLayout();
  bool HasBranchDivergence = TTI.hasBranchDivergence(&F);
  while (MadeChange) {
    MadeChange = false;
    for (BasicBlock &BB : llvm::make_early_inc_range(F)) {
      bool ModifiedDTOnIteration = false;
      MadeChange |= optimizeBlock(BB, ModifiedDTOnIteration, TTI, DL,
                                  HasBranchDivergence, DTU ? &*DTU : nullptr);

      // Restart BB iteration if the dominator tree of the Function was changed
      if (ModifiedDTOnIteration)
        break;
    }

    EverMadeChange |= MadeChange;
  }
  return EverMadeChange;
}
bool ScalarizeMaskedMemIntrinLegacyPass::runOnFunction(Function &F) {
  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  DominatorTree *DT = nullptr;
  if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
    DT = &DTWP->getDomTree();
  return runImpl(F, TTI, DT);
}
static bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT,
                          const TargetTransformInfo &TTI, const DataLayout &DL,
                          bool HasBranchDivergence, DomTreeUpdater *DTU) {
  bool MadeChange = false;

  BasicBlock::iterator CurInstIterator = BB.begin();
  while (CurInstIterator != BB.end()) {
    if (CallInst *CI = dyn_cast<CallInst>(&*CurInstIterator++))
      MadeChange |= optimizeCallInst(CI, ModifiedDT, TTI, DL,
                                     HasBranchDivergence, DTU);
static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
                             const TargetTransformInfo &TTI,
                             const DataLayout &DL, bool HasBranchDivergence,
                             DomTreeUpdater *DTU) {
  // The scalarization code below does not work for scalable vectors.
  if (isa<ScalableVectorType>(II->getType()) ||
      any_of(II->args(),
             [](Value *V) { return isa<ScalableVectorType>(V->getType()); }))
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::experimental_vector_histogram_add:
  case Intrinsic::masked_load:
    // Scalarize unsupported vector masked load
  case Intrinsic::masked_store:
  case Intrinsic::masked_gather: {
    MaybeAlign MA =
        cast<ConstantInt>(CI->getArgOperand(1))->getMaybeAlignValue();
    Align Alignment =
        DL.getValueOrABITypeAlignment(MA, CI->getType()->getScalarType());
  case Intrinsic::masked_scatter: {
    MaybeAlign MA =
        cast<ConstantInt>(CI->getArgOperand(2))->getMaybeAlignValue();
    Align Alignment = DL.getValueOrABITypeAlignment(
        MA, CI->getArgOperand(0)->getType()->getScalarType());
  case Intrinsic::masked_expandload:
  case Intrinsic::masked_compressstore:
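// Each masked_* case above follows the same shape (illustrative sketch; the
// verbatim case bodies are elided in this listing): query the matching TTI
// legality hook, keep the intrinsic when the target handles it natively, and
// otherwise call the corresponding scalarizeMasked* helper, e.g.:
//
//   case Intrinsic::masked_store:
//     if (TTI.isLegalMaskedStore(CI->getArgOperand(0)->getType(), Alignment))
//       return false;
//     scalarizeMaskedStore(DL, HasBranchDivergence, CI, DTU, ModifiedDT);
//     return true;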