- Notifications
You must be signed in to change notification settings - Fork14.5k
[SLP]Initial FMA support#149102
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.
Already on GitHub?Sign in to your account
base:main
Are you sure you want to change the base?
[SLP]Initial FMA support#149102
Uh oh!
There was an error while loading.Please reload this page.
Conversation
Created using spr 1.3.5
llvmbot commentedJul 16, 2025 • edited
Loading Uh oh!
There was an error while loading.Please reload this page.
edited
Uh oh!
There was an error while loading.Please reload this page.
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) ChangesAdded initial check for potential fma conversion in reductions and Patch is 89.41 KiB, truncated to 20.00 KiB below, full version:https://github.com/llvm/llvm-project/pull/149102.diff 11 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cppindex 1b8a80d2b3c94..c94f49f941005 100644--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp@@ -3759,6 +3759,7 @@ class BoUpSLP { enum CombinedOpcode { NotCombinedOp = -1, MinMax = Instruction::OtherOpsEnd + 1,+ FMulAdd, }; CombinedOpcode CombinedOp = NotCombinedOp;@@ -3896,6 +3897,9 @@ class BoUpSLP { bool hasState() const { return S.valid(); }+ /// Returns the state of the operations.+ const InstructionsState &getOperations() const { return S; }+ /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. unsigned findLaneForValue(Value *V) const {@@ -11563,6 +11567,84 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) { } }+static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,+ const InstructionsState &S,+ DominatorTree &DT, const DataLayout &DL,+ TargetTransformInfo &TTI,+ const TargetLibraryInfo &TLI) {+ assert(all_of(VL,+ [](Value *V) {+ return V->getType()->getScalarType()->isFloatingPointTy();+ }) &&+ "Can only convert to FMA for floating point types");+ assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");++ auto CheckForContractable = [&](ArrayRef<Value *> VL) {+ FastMathFlags FMF;+ FMF.set();+ for (Value *V : VL) {+ auto *I = dyn_cast<Instruction>(V);+ if (!I)+ continue;+ // TODO: support for copyable elements.+ Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);+ if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)+ continue;+ if (auto *FPCI = dyn_cast<FPMathOperator>(I))+ FMF &= FPCI->getFastMathFlags();+ }+ return FMF.allowContract();+ };+ if (!CheckForContractable(VL))+ return InstructionCost::getInvalid();+ // fmul also should be contractable+ InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);+ SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);++ InstructionsState OpS = getSameOpcode(Operands.front(), TLI);+ if (!OpS.valid())+ return InstructionCost::getInvalid();+ if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)+ return InstructionCost::getInvalid();+ if (!CheckForContractable(Operands.front()))+ return InstructionCost::getInvalid();+ // Compare the costs.+ InstructionCost FMulPlusFaddCost = 0;+ InstructionCost FMACost = 0;+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;+ FastMathFlags FMF;+ FMF.set();+ for (Value *V : VL) {+ auto *I = dyn_cast<Instruction>(V);+ if (!I)+ continue;+ if (auto *FPCI = dyn_cast<FPMathOperator>(I))+ FMF &= FPCI->getFastMathFlags();+ FMulPlusFaddCost += TTI.getInstructionCost(I, CostKind);+ }+ for (auto [V, Op] : zip(VL, Operands.front())) {+ auto *I = dyn_cast<Instruction>(Op);+ if (!I || !I->hasOneUse()) {+ FMACost += TTI.getInstructionCost(cast<Instruction>(V), CostKind);+ if (I)+ FMACost += TTI.getInstructionCost(I, CostKind);+ continue;+ }+ if (auto *FPCI = dyn_cast<FPMathOperator>(I))+ FMF &= FPCI->getFastMathFlags();+ FMulPlusFaddCost += TTI.getInstructionCost(I, CostKind);+ }+ const unsigned NumOps =+ count_if(zip(VL, Operands.front()), [](const auto &P) {+ return isa<Instruction>(std::get<0>(P)) &&+ isa<Instruction>(std::get<1>(P));+ });+ Type *Ty = VL.front()->getType();+ IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);+ FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);+ return FMACost < FMulPlusFaddCost ? FMACost : InstructionCost::getInvalid();+}+ void BoUpSLP::transformNodes() { constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; BaseGraphSize = VectorizableTree.size();@@ -11922,6 +12004,25 @@ void BoUpSLP::transformNodes() { } break; }+ case Instruction::FSub:+ case Instruction::FAdd: {+ // Check if possible to convert (a*b)+c to fma.+ if (E.State != TreeEntry::Vectorize ||+ !E.getOperations().isAddSubLikeOp())+ break;+ if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)+ .isValid())+ break;+ // This node is a fmuladd node.+ E.CombinedOp = TreeEntry::FMulAdd;+ TreeEntry *FMulEntry = getOperandEntry(&E, 0);+ if (FMulEntry->UserTreeIndex &&+ FMulEntry->State == TreeEntry::Vectorize) {+ // The FMul node is part of the combined fmuladd node.+ FMulEntry->State = TreeEntry::CombinedVectorize;+ }+ break;+ } default: break; }@@ -13152,6 +13253,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } return IntrinsicCost; };+ auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,+ Instruction *VI = nullptr) {+ InstructionCost Cost = canConvertToFMA(VI ? ArrayRef<Value *>(VI) : VL, S,+ *DT, *DL, TTI, *TLI);+ return Cost;+ }; switch (ShuffleOrOp) { case Instruction::PHI: { // Count reused scalars.@@ -13492,6 +13599,30 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, }; return GetCostDiff(GetScalarCost, GetVectorCost); }+ case TreeEntry::FMulAdd: {+ auto GetScalarCost = [&](unsigned Idx) {+ if (isa<PoisonValue>(UniqueValues[Idx]))+ return InstructionCost(TTI::TCC_Free);+ return GetFMulAddCost(E->getOperations(),+ cast<Instruction>(UniqueValues[Idx]));+ };+ auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {+ FastMathFlags FMF;+ FMF.set();+ for (Value *V : E->Scalars) {+ if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {+ FMF &= FPCI->getFastMathFlags();+ if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))+ FMF &= FPCIOp->getFastMathFlags();+ }+ }+ IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,+ {VecTy, VecTy, VecTy}, FMF);+ InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);+ return VecCost + CommonCost;+ };+ return GetCostDiff(GetScalarCost, GetVectorCost);+ } case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd:@@ -13529,8 +13660,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, } TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1); TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);- return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,+ InstructionCost ScalarCost = TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);+ if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);+ I && (ShuffleOrOp == Instruction::FAdd ||+ ShuffleOrOp == Instruction::FSub)) {+ InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);+ if (IntrinsicCost.isValid())+ ScalarCost = IntrinsicCost;+ }+ return ScalarCost; }; auto GetVectorCost = [=](InstructionCost CommonCost) { if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {@@ -22052,11 +22191,21 @@ class HorizontalReduction { /// Try to find a reduction tree. bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root, ScalarEvolution &SE, const DataLayout &DL,- const TargetLibraryInfo &TLI) {+ const TargetLibraryInfo &TLI,+ DominatorTree &DT, TargetTransformInfo &TTI) { RdxKind = HorizontalReduction::getRdxKind(Root); if (!isVectorizable(RdxKind, Root)) return false;+ // FMA reduction root - skip.+ auto CheckForFMA = [&](Instruction *I) {+ return RdxKind == RecurKind::FAdd &&+ canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI)+ .isValid();+ };+ if (CheckForFMA(Root))+ return false;+ // Analyze "regular" integer/FP types for reductions - no target-specific // types or pointers. Type *Ty = Root->getType();@@ -22095,6 +22244,7 @@ class HorizontalReduction { // foldable. if (!EdgeInst || Level > RecursionMaxDepth || getRdxKind(EdgeInst) != RdxKind ||+ CheckForFMA(EdgeInst) || IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) || !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || !isVectorizable(RdxKind, EdgeInst) ||@@ -23658,13 +23808,13 @@ bool SLPVectorizerPass::vectorizeHorReduction( Stack.emplace(SelectRoot(), 0); SmallPtrSet<Value *, 8> VisitedInstrs; bool Res = false;- auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {+ auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * { if (R.isAnalyzedReductionRoot(Inst)) return nullptr; if (!isReductionCandidate(Inst)) return nullptr; HorizontalReduction HorRdx;- if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))+ if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI)) return nullptr; return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); };@@ -23730,6 +23880,12 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType())) return false;+ // Skip potential FMA candidates.+ if ((I->getOpcode() == Instruction::FAdd ||+ I->getOpcode() == Instruction::FSub) &&+ canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)+ .isValid())+ return false; Value *P = I->getParent();diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.llindex 442769937ac12..9e086dcad686c 100644--- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll@@ -8,15 +8,18 @@ target triple = "aarch64--linux-gnu" define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry:-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] ; CHECK: for.body3.lr.ph:-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4-; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]-; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]-; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])+; CHECK-NEXT: [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4+; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]]+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4+; CHECK-NEXT: [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]]+; CHECK-NEXT: [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]]+; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]]+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL11]], [[MUL12]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] ; CHECK: for.end27:@@ -47,15 +50,18 @@ for.end27: define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry:-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] ; CHECK: for.body3.lr.ph:-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4-; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]-; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]-; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])+; CHECK-NEXT: [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4+; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]]+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4+; CHECK-NEXT: [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]]+; CHECK-NEXT: [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]]+; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]]+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[MUL11]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] ; CHECK: for.end27:diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.llindex 295a71899c338..2e684320ba10e 100644--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll@@ -12,7 +12,8 @@ define void @test() { ; CHECK: [[BB63]]: ; CHECK-NEXT: br label %[[BB64]] ; CHECK: [[BB64]]:-; CHECK-NEXT: [[TMP25:%.*]] = phi <16 x float> [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]+; CHECK-NEXT: [[I65:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]+; CHECK-NEXT: [[I77:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] ; CHECK-NEXT: [[I66:%.*]] = load float, ptr poison, align 16 ; CHECK-NEXT: [[I67:%.*]] = load float, ptr poison, align 4 ; CHECK-NEXT: [[I68:%.*]] = load float, ptr poison, align 8@@ -24,57 +25,125 @@ define void @test() { ; CHECK-NEXT: [[I74:%.*]] = load float, ptr poison, align 4 ; CHECK-NEXT: [[I75:%.*]] = load float, ptr poison, align 16 ; CHECK-NEXT: [[I76:%.*]] = load float, ptr poison, align 4-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x float> poison, float [[I76]], i32 0-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x float> [[TMP1]], float [[I75]], i32 1-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x float> [[TMP2]], float [[I74]], i32 2-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x float> [[TMP3]], float [[I73]], i32 3-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x float> [[TMP4]], float [[I71]], i32 4-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x float> [[TMP5]], float [[I70]], i32 5-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x float> [[TMP6]], float [[I68]], i32 6-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x float> [[TMP7]], float [[I66]], i32 7-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[I72]], i32 13-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[I67]], i32 14-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[I69]], i32 15 ; CHECK-NEXT: br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]] ; CHECK: [[BB77]]:-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15, i32 poison, i32 poison>-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP17]], <8 x i32> <i32 8, i32 poison, i32 poison, i32 poison, i32 4, i32 5, i32 poison, i32 poison>-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP14]], float [[I68]], i32 2-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP19]], float [[I66]], i32 3-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP16]], float [[I67]], i32 6-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I69]], i32 7-; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP25]], <16 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 poison, i32 poison> ; CHECK-NEXT: br label %[[BB78:.*]] ; CHECK: [[BB78]]:-; CHECK-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP23]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]-; CHECK-NEXT: [[TMP22:%.*]] = phi <8 x float> [ [[TMP21]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]-; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6>-; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>-; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP13]]-; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP38]], [[TMP25]]-; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]]-; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], poison-; CHECK-NEXT: [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison-; CHECK-NEXT: [[TMP36]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 5, i32 11, i32 12, i32 10, i32 14, i32 15, i32 poison, i32 poison>-; CHECK-NEXT: [[TMP31]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 12, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 14, i32 15>+; CHECK-NEXT: [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[I103:%.*]], %[[BB78]] ]+; CHECK-NEXT: [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[I104:%.*]], %[[BB78]] ]+; CHECK-NEXT: [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[I105:%.*]], %[[BB78]] ]+; CHECK-NEXT: [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I106:%.*]], %[[BB78]] ]+; CHECK-NEXT: [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I123:%.*]], %[[BB78]] ]+; CHECK-NEXT: [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[I124:%.*]], %[[BB78]] ]+; CHECK-NEXT: [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I125:%.*]], %[[BB78]] ]+; CHECK-NEXT: [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[I126:%.*]], %[[BB78]] ]+; CHECK-NEXT: [[I87:%.*]] = fmul fast float [[I85]], poison+; CHECK-NEXT: [[I88:%.*]] = fmul fast float [[I80]], poison+; CHECK-NEXT: [[I89:%...[truncated] |
github-actionsbot commentedJul 16, 2025 • edited
Loading Uh oh!
There was an error while loading.Please reload this page.
edited
Uh oh!
There was an error while loading.Please reload this page.
✅ With the latest revision this PR passed the C/C++ code formatter. |
You can test this locally with the following command:git diff -U0 --pickaxe-regex -S'([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)''HEAD~1' HEAD llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields In tests, avoid using For example, this is considered a bad practice: definevoid@fn() { ...bri1undef, ...} Please use the following instead: definevoid@fn(i1%cond) { ...bri1%cond, ...} Please refer to theUndefined Behavior Manual for more information. |
Created using spr 1.3.5
Added initial check for potential fma conversion in reductions and
operands vectorization.