//===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a profile inference algorithm. Given incomplete and
// possibly imprecise block counts, the algorithm reconstructs realistic block
// and edge counts that satisfy flow conservation rules, while minimally
// modifying the given observed counts.
//
//===----------------------------------------------------------------------===//

#include <unordered_set>

#define DEBUG_TYPE "sample-profile-inference"
"Try to evenly distribute flow when there are multiple equally " 37cl::desc(
"Evenly re-distribute flow among unknown subgraphs."));
41cl::desc(
"Join isolated components having positive flow."));
45cl::desc(
"The cost of increasing a block's count by one."));
49cl::desc(
"The cost of decreasing a block's count by one."));
53cl::desc(
"The cost of increasing the entry block's count by one."));
57cl::desc(
"The cost of decreasing the entry block's count by one."));
61cl::desc(
"The cost of increasing a count of zero-weight block by one."));
65cl::desc(
"The cost of increasing an unknown block's count by one."));
/// A value indicating an infinite flow/capacity/weight of a block/edge.
/// Not using numeric_limits<int64_t>::max(), as the values can be summed up
/// during the execution.
static constexpr int64_t INF = ((int64_t)1) << 50;
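// For example, INF is roughly 1.1 * 10^15, so even a sum of several thousand
// INF-valued terms stays well below the int64_t maximum of about 9.2 * 10^18;
// that is why no saturating arithmetic is needed here.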
/// The minimum-cost maximum flow algorithm.
///
/// The algorithm finds the maximum flow of minimum cost on a given (directed)
/// network using a modified version of the classical Moore-Bellman-Ford
/// approach. The algorithm applies a number of augmentation iterations in
/// which flow is sent along paths of positive capacity from the source to the
/// sink. The worst-case time complexity of the implementation is O(v(f)*m*n),
/// where m is the number of edges, n is the number of vertices, and v(f) is
/// the value of the maximum flow. However, the observed running time on
/// typical instances is sub-quadratic, that is, o(n^2).
///
/// The input is a set of edges with specified costs and capacities, and a pair
/// of nodes (source and sink). The output is the flow along each edge of the
/// minimum total cost respecting the given edge capacities.
class MinCostMaxFlow {
public:
  MinCostMaxFlow(const ProfiParams &Params) : Params(Params) {}
  // Initialize algorithm's data structures for a network of a given size.
  void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) {
    Source = SourceNode;
    Target = SinkNode;

    Nodes = std::vector<Node>(NodeCount);
    Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>());
    AugmentingEdges =
        std::vector<std::vector<Edge *>>(NodeCount, std::vector<Edge *>());
  }
  // Run the algorithm.
  int64_t run() {
    LLVM_DEBUG(dbgs() << "Starting profi for " << Nodes.size() << " nodes\n");

    // Iteratively find an augmentation path/dag in the network and send the
    // flow along its edges
    size_t AugmentationIters = applyFlowAugmentation();
    // Compute the total flow and its cost
    int64_t TotalCost = 0;
    int64_t TotalFlow = 0;
    for (uint64_t Src = 0; Src < Nodes.size(); Src++) {
      for (auto &Edge : Edges[Src]) {
        if (Edge.Flow > 0) {
          TotalCost += Edge.Cost * Edge.Flow;
          TotalFlow += Edge.Flow;
        }
      }
    }
    LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters
                      << " iterations with " << TotalFlow << " total flow"
                      << " of " << TotalCost << " cost\n");
    (void)AugmentationIters;
    return TotalCost;
  }
  /// Adding an edge to the network with a specified capacity and a cost.
  /// Multiple edges between a pair of nodes are allowed but self-edges
  /// are not supported.
  void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) {
    assert(Capacity > 0 && "adding an edge of zero capacity");
    assert(Src != Dst && "loop edges are not supported");

    Edge SrcEdge;
    SrcEdge.Capacity = Capacity;
    SrcEdge.RevEdgeIndex = Edges[Dst].size();

    Edge DstEdge;
    DstEdge.Cost = -Cost;
    DstEdge.Capacity = 0;
    DstEdge.RevEdgeIndex = Edges[Src].size();

    Edges[Src].push_back(SrcEdge);
    Edges[Dst].push_back(DstEdge);
  }
  /// Adding an edge to the network of infinite capacity and a given cost.
  void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) {
    addEdge(Src, Dst, INF, Cost);
  }

  /// Get the total flow from a given source node.
  /// Returns a list of pairs (target node, amount of flow to the target).
  std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const {
    std::vector<std::pair<uint64_t, int64_t>> Flow;
    for (const auto &Edge : Edges[Src]) {
      Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow));
    }
    return Flow;
  }
  /// Get the total flow between a pair of nodes.
  int64_t getFlow(uint64_t Src, uint64_t Dst) const {
    int64_t Flow = 0;
    for (const auto &Edge : Edges[Src]) {
      if (Edge.Dst == Dst)
        Flow += Edge.Flow;
    }
    return Flow;
  }
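  // A minimal usage sketch of this solver (illustrative only; the node
  // indices, capacities, and costs below are hypothetical):
  //
  //   MinCostMaxFlow Network(Params);
  //   Network.initialize(/*NodeCount=*/4, /*SourceNode=*/0, /*SinkNode=*/3);
  //   Network.addEdge(0, 1, /*Capacity=*/5, /*Cost=*/1);
  //   Network.addEdge(0, 2, /*Capacity=*/3, /*Cost=*/2);
  //   Network.addEdge(1, 3, /*Capacity=*/4, /*Cost=*/1);
  //   Network.addEdge(2, 3, /*Capacity=*/4, /*Cost=*/1);
  //   Network.run();
  //   int64_t FlowOn01 = Network.getFlow(0, 1); // flow sent along edge (0, 1)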
private:
  /// Iteratively find an augmentation path/dag in the network and send the
  /// flow along its edges. The method returns the number of applied
  /// iterations.
  size_t applyFlowAugmentation() {
    size_t AugmentationIters = 0;
    while (findAugmentingPath()) {
      uint64_t PathCapacity = computeAugmentingPathCapacity();
      while (PathCapacity > 0) {
        bool Progress = false;
        // Identify node/edge candidates for augmentation
        identifyShortestEdges(PathCapacity);

        // Find an augmenting DAG
        auto AugmentingOrder = findAugmentingDAG();

        // Apply the DAG augmentation
        Progress = augmentFlowAlongDAG(AugmentingOrder);
        PathCapacity = computeAugmentingPathCapacity();

        if (!Progress) {
          augmentFlowAlongPath(PathCapacity);
          PathCapacity = 0;
        }

        AugmentationIters++;
      }
    }
    return AugmentationIters;
  }
  /// Compute the capacity of the canonical augmenting path. If the path is
  /// saturated (that is, no flow can be sent along the path), then return 0.
  uint64_t computeAugmentingPathCapacity() {
    uint64_t PathCapacity = INF;
    uint64_t Now = Target;
    while (Now != Source) {
      uint64_t Pred = Nodes[Now].ParentNode;
      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];

      assert(Edge.Capacity >= Edge.Flow && "incorrect edge flow");
      uint64_t EdgeCapacity = uint64_t(Edge.Capacity - Edge.Flow);
      PathCapacity = std::min(PathCapacity, EdgeCapacity);

      Now = Pred;
    }
    return PathCapacity;
  }
  /// Check for existence of an augmenting path with a positive capacity.
  bool findAugmentingPath() {
    // Initialize data structures
    for (auto &Node : Nodes) {
      Node.Distance = INF;
      Node.Taken = false;
    }

    std::queue<uint64_t> Queue;
    Queue.push(Source);
    Nodes[Source].Distance = 0;
    Nodes[Source].Taken = true;
    while (!Queue.empty()) {
      uint64_t Src = Queue.front();
      Queue.pop();
      Nodes[Src].Taken = false;
      // Although the residual network contains edges with negative costs
      // (in particular, backward edges), it can be shown that there are no
      // negative-weight cycles and the following two invariants are
      // maintained: (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0
      // for all nodes V, where Dist is the length of the shortest path
      // between two nodes. This allows pruning the search space of the
      // path-finding algorithm using the following early-stop criteria:
      // -- If we find a path with zero distance from Source to Target, stop
      //    the search, as the path is the shortest since
      //    Dist[Source, Target] >= 0;
      // -- If we have Dist[Source, V] > Dist[Source, Target], then do not
      //    process node V, as it is guaranteed _not_ to be on a shortest path
      //    from Source to Target; it follows from the inequalities
      //    Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target]
      if (Nodes[Src].Distance > Nodes[Target].Distance)
        continue;

      // Process adjacent edges
      for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) {
        auto &Edge = Edges[Src][EdgeIdx];
        if (Edge.Flow < Edge.Capacity) {
          uint64_t Dst = Edge.Dst;
          int64_t NewDistance = Nodes[Src].Distance + Edge.Cost;
          if (Nodes[Dst].Distance > NewDistance) {
            // Update the distance and the parent node/edge
            Nodes[Dst].Distance = NewDistance;
            Nodes[Dst].ParentNode = Src;
            Nodes[Dst].ParentEdgeIndex = EdgeIdx;
            // Add the node to the queue, if it is not there yet
            if (!Nodes[Dst].Taken) {
              Queue.push(Dst);
              Nodes[Dst].Taken = true;
            }
          }
        }
      }
    }

    return Nodes[Target].Distance != INF;
  }
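  // Illustrative example of the pruning criteria above (hypothetical
  // distances): if Dist[Source, Target] == 5 and a popped node V has
  // Dist[Source, V] == 7, then V cannot lie on a shortest Source-Target path
  // and is skipped without scanning its outgoing edges.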
  /// Update the current flow along the augmenting path.
  void augmentFlowAlongPath(uint64_t PathCapacity) {
    assert(PathCapacity > 0 && "found an incorrect augmenting path");
    uint64_t Now = Target;
    while (Now != Source) {
      uint64_t Pred = Nodes[Now].ParentNode;
      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
      auto &RevEdge = Edges[Now][Edge.RevEdgeIndex];

      Edge.Flow += PathCapacity;
      RevEdge.Flow -= PathCapacity;

      Now = Pred;
    }
  }
  /// Find an Augmenting DAG order using a modified version of DFS in which we
  /// can visit a node multiple times. In the DFS search, when scanning each
  /// edge out of a node, continue search at Edge.Dst endpoint if it has not
  /// been discovered yet and its NumCalls < MaxDfsCalls. The algorithm
  /// runs in O(MaxDfsCalls * |Edges| + |Nodes|) time.
  /// It returns an Augmenting Order (Taken nodes in decreasing Finish time)
  /// that starts with Source and ends with Target.
  std::vector<uint64_t> findAugmentingDAG() {
    // We use a stack-based implementation of DFS to avoid recursion.
    // Defining DFS data structures:
    // A pair (NodeIdx, EdgeIdx) at the top of the Stack denotes that
    //  - we are currently visiting Nodes[NodeIdx] and
    //  - the next edge to scan is Edges[NodeIdx][EdgeIdx]
    typedef std::pair<uint64_t, uint64_t> StackItemType;
    std::stack<StackItemType> Stack;
    std::vector<uint64_t> AugmentingOrder;
    // Phase 0: Initialize Node attributes and Time for the DFS run
    for (auto &Node : Nodes) {
      Node.Discovery = 0;
      Node.Finish = 0;
      Node.NumCalls = 0;
      Node.Taken = false;
    }
    uint64_t Time = 0;
    // Mark Target as Taken
    // Taken attribute will be propagated backwards from Target towards Source
    Nodes[Target].Taken = true;

    // Phase 1: Start DFS traversal from Source
    Stack.emplace(Source, 0);
    Nodes[Source].Discovery = ++Time;
    while (!Stack.empty()) {
      auto NodeIdx = Stack.top().first;
      auto EdgeIdx = Stack.top().second;

      // If we haven't scanned all edges out of NodeIdx, continue scanning
      if (EdgeIdx < Edges[NodeIdx].size()) {
        auto &Edge = Edges[NodeIdx][EdgeIdx];
        auto &Dst = Nodes[Edge.Dst];
        Stack.top().second++;

        if (Edge.OnShortestPath) {
          // If we haven't seen Edge.Dst so far, continue DFS search there
          if (Dst.Discovery == 0 && Dst.NumCalls < MaxDfsCalls) {
            Dst.Discovery = ++Time;
            Stack.emplace(Edge.Dst, 0);
            Dst.NumCalls++;
          } else if (Dst.Taken && Dst.Finish != 0) {
            // Else, if Edge.Dst already has a path to Target, then NodeIdx
            // has one as well
            Nodes[NodeIdx].Taken = true;
          }
        }
      } else {
        // If we are done scanning all edges out of NodeIdx
        Stack.pop();
        // If we haven't found a path from NodeIdx to Target, forget about it
        if (!Nodes[NodeIdx].Taken) {
          Nodes[NodeIdx].Discovery = 0;
          continue;
        }
        // If we have found a path from NodeIdx to Target, then finish NodeIdx
        // and propagate Taken flag to DFS parent unless at the Source
        Nodes[NodeIdx].Finish = ++Time;
        // NodeIdx == Source if and only if the stack is empty
        if (NodeIdx != Source) {
          assert(!Stack.empty() && "empty stack while running dfs");
          Nodes[Stack.top().first].Taken = true;
        }
        AugmentingOrder.push_back(NodeIdx);
      }
    }
    // Nodes are collected in decreasing Finish time, so the order is reversed
    std::reverse(AugmentingOrder.begin(), AugmentingOrder.end());
    // Phase 2: Extract all forward (DAG) edges and fill in AugmentingEdges
    for (size_t Src : AugmentingOrder) {
      AugmentingEdges[Src].clear();
      for (auto &Edge : Edges[Src]) {
        uint64_t Dst = Edge.Dst;
        if (Edge.OnShortestPath && Nodes[Src].Taken && Nodes[Dst].Taken &&
            Nodes[Dst].Finish < Nodes[Src].Finish) {
          AugmentingEdges[Src].push_back(&Edge);
        }
      }
      assert((Src == Target || !AugmentingEdges[Src].empty()) &&
             "incorrectly constructed augmenting edges");
    }

    return AugmentingOrder;
  }
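  // Illustrative example (hypothetical graph): if the shortest-path edges form
  // a diamond Source -> A, Source -> B, A -> Target, B -> Target, the DFS
  // above produces the augmenting order [Source, A, B, Target] (up to the
  // relative order of A and B), and AugmentingEdges keeps exactly those four
  // forward edges.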
  /// Update the current flow along the given (acyclic) subgraph specified by
  /// the vertex order, AugmentingOrder. The objective is to send as much flow
  /// as possible while evenly distributing flow among successors of each node.
  /// After the update at least one edge is saturated.
  bool augmentFlowAlongDAG(const std::vector<uint64_t> &AugmentingOrder) {
    // Phase 0: Initialization
    for (uint64_t Src : AugmentingOrder) {
      Nodes[Src].FracFlow = 0;
      Nodes[Src].IntFlow = 0;
      for (auto &Edge : AugmentingEdges[Src]) {
        Edge->AugmentedFlow = 0;
      }
    }
    // Phase 1: Send a unit of fractional flow along the DAG
    uint64_t MaxFlowAmount = INF;
    Nodes[Source].FracFlow = 1.0;
    for (uint64_t Src : AugmentingOrder) {
      assert((Src == Target || Nodes[Src].FracFlow > 0) &&
             "incorrectly computed fractional flow");
      // Distribute flow evenly among successors of Src
      uint64_t Degree = AugmentingEdges[Src].size();
      for (auto &Edge : AugmentingEdges[Src]) {
        double EdgeFlow = Nodes[Src].FracFlow / Degree;
        Nodes[Edge->Dst].FracFlow += EdgeFlow;
        if (Edge->Capacity == INF)
          continue;
        uint64_t MaxIntFlow = double(Edge->Capacity - Edge->Flow) / EdgeFlow;
        MaxFlowAmount = std::min(MaxFlowAmount, MaxIntFlow);
      }
    }
    // Stop early if we cannot send any (integral) flow from Source to Target
    if (MaxFlowAmount == 0)
      return false;
    // Phase 2: Send an integral flow of MaxFlowAmount
    Nodes[Source].IntFlow = MaxFlowAmount;
    for (uint64_t Src : AugmentingOrder) {
      if (Nodes[Src].IntFlow == 0 || AugmentingEdges[Src].empty())
        continue;
      // Distribute flow evenly among successors of Src, rounding up to make
      // sure all flow is sent
      uint64_t Degree = AugmentingEdges[Src].size();
      // We are guaranteed that Node[Src].IntFlow <= SuccFlow * Degree
      uint64_t SuccFlow = (Nodes[Src].IntFlow + Degree - 1) / Degree;
      for (auto &Edge : AugmentingEdges[Src]) {
        uint64_t Dst = Edge->Dst;
        uint64_t EdgeFlow = std::min(Nodes[Src].IntFlow, SuccFlow);
        EdgeFlow = std::min(EdgeFlow, uint64_t(Edge->Capacity - Edge->Flow));
        Nodes[Dst].IntFlow += EdgeFlow;
        Nodes[Src].IntFlow -= EdgeFlow;
        Edge->AugmentedFlow += EdgeFlow;
      }
    }
    Nodes[Target].IntFlow = 0;
    // Phase 3: Send excess flow back traversing the nodes backwards.
    // Because of rounding, not all flow can be sent along the edges of Src.
    // Hence, send the remaining flow back to maintain flow conservation
    for (size_t Idx = AugmentingOrder.size() - 1; Idx > 0; Idx--) {
      uint64_t Src = AugmentingOrder[Idx - 1];
      // Try to send excess flow back along each edge.
      // Make sure we only send back flow we just augmented (AugmentedFlow).
      for (auto &Edge : AugmentingEdges[Src]) {
        uint64_t Dst = Edge->Dst;
        if (Nodes[Dst].IntFlow == 0)
          continue;
        uint64_t EdgeFlow = std::min(Nodes[Dst].IntFlow, Edge->AugmentedFlow);
        Nodes[Dst].IntFlow -= EdgeFlow;
        Nodes[Src].IntFlow += EdgeFlow;
        Edge->AugmentedFlow -= EdgeFlow;
      }
    }
    // Phase 4: Update flow values along all edges
    bool HasSaturatedEdges = false;
    for (uint64_t Src : AugmentingOrder) {
      // Verify that we have sent all the excess flow from the node
      assert(Src == Source || Nodes[Src].IntFlow == 0);
      for (auto &Edge : AugmentingEdges[Src]) {
        assert(uint64_t(Edge->Capacity - Edge->Flow) >= Edge->AugmentedFlow);
        // Update flow values along the edge and its reverse copy
        auto &RevEdge = Edges[Edge->Dst][Edge->RevEdgeIndex];
        Edge->Flow += Edge->AugmentedFlow;
        RevEdge.Flow -= Edge->AugmentedFlow;
        if (Edge->Capacity == Edge->Flow && Edge->AugmentedFlow > 0)
          HasSaturatedEdges = true;
      }
    }

    // The augmentation is successful iff at least one edge becomes saturated
    return HasSaturatedEdges;
  }
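  // A small worked example of the even distribution above (hypothetical
  // numbers): with MaxFlowAmount = 5 and two augmenting edges out of Source,
  // SuccFlow = (5 + 1) / 2 = 3, so the first edge receives 3 units and the
  // second receives the remaining 2; Phase 3 then returns any excess that the
  // downstream edges could not absorb.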
  /// Identify candidate (shortest) edges for augmentation.
  void identifyShortestEdges(uint64_t PathCapacity) {
    assert(PathCapacity > 0 && "found an incorrect augmenting DAG");
    // To make sure the augmentation DAG contains only edges with large
    // residual capacity, we prune all edges whose capacity is below a
    // fraction of the capacity of the augmented path.
    // (All edges of the path itself are always in the DAG)
    uint64_t MinCapacity = std::max(PathCapacity / 2, uint64_t(1));

    // Decide which edges are on a shortest path from Source to Target
    for (size_t Src = 0; Src < Nodes.size(); Src++) {
      // An edge cannot be augmenting if the endpoint has large distance
      if (Nodes[Src].Distance > Nodes[Target].Distance)
        continue;

      for (auto &Edge : Edges[Src]) {
        uint64_t Dst = Edge.Dst;
        Edge.OnShortestPath =
            Src != Target && Dst != Source &&
            Nodes[Dst].Distance <= Nodes[Target].Distance &&
            Nodes[Dst].Distance == Nodes[Src].Distance + Edge.Cost &&
            Edge.Capacity > Edge.Flow &&
            uint64_t(Edge.Capacity - Edge.Flow) >= MinCapacity;
      }
    }
  }
  /// Maximum number of DFS iterations for DAG finding.
  static constexpr uint64_t MaxDfsCalls = 10;

  /// A node in a flow network.
  struct Node {
    /// The cost of the cheapest path from the source to the current node.
    int64_t Distance;
    /// The node preceding the current one in the path.
    uint64_t ParentNode;
    /// The index of the edge between ParentNode and the current node.
    uint64_t ParentEdgeIndex;
    /// An indicator of whether the current node is in a queue.
    bool Taken;

    /// Data fields utilized in DAG-augmentation:
    double FracFlow;
    uint64_t IntFlow;
    uint64_t Discovery;
    uint64_t Finish;
    uint64_t NumCalls;
  };

  /// An edge in a flow network.
  struct Edge {
    /// The cost of the edge.
    int64_t Cost;
    /// The capacity of the edge.
    int64_t Capacity;
    /// The current flow on the edge.
    int64_t Flow;
    /// The destination node of the edge.
    uint64_t Dst;
    /// The index of the reverse edge between Dst and the current node.
    uint64_t RevEdgeIndex;

    /// Data fields utilized in DAG-augmentation:
    /// Whether the edge is currently on a shortest path from Source to Target.
    bool OnShortestPath;
    /// Extra flow along the edge.
    uint64_t AugmentedFlow;
  };

  /// The set of network nodes.
  std::vector<Node> Nodes;
  /// The set of network edges.
  std::vector<std::vector<Edge>> Edges;
  /// Source node of the flow.
  uint64_t Source;
  /// Target (sink) node of the flow.
  uint64_t Target;
  /// Augmenting edges.
  std::vector<std::vector<Edge *>> AugmentingEdges;
  /// Params for flow computation.
  const ProfiParams &Params;
  /// The function to adjust.
  FlowFunction &Func;
};

/// A post-processing adjustment of the control flow. It applies two steps by
/// rerouting some flow and making it more realistic:
///
/// - First, it removes all isolated components ("islands") with a positive
///   flow that are unreachable from the entry block. For every such component,
///   we find the shortest path from the entry to an exit passing through the
///   component, and increase the flow by one unit along the path.
///
/// - Second, it identifies all "unknown subgraphs" consisting of basic blocks
///   with no sampled counts. Then it rebalances the flow that goes through
///   such a subgraph so that each branch is taken with probability 50%.
///   An unknown subgraph is such that for every two nodes u and v:
///     - u dominates v and u is not unknown;
///     - v post-dominates u; and
///     - all inner-nodes of all (u,v)-paths are unknown.
class FlowAdjuster {
public:
  FlowAdjuster(const ProfiParams &Params, FlowFunction &Func)
      : Params(Params), Func(Func) {}
  /// Apply the post-processing.
  void run() {
    if (Params.JoinIslands) {
      // Adjust the flow to get rid of isolated components
      joinIsolatedComponents();
    }

    if (Params.RebalanceUnknown) {
      // Rebalance the flow inside unknown subgraphs
      rebalanceUnknownSubgraphs();
    }
  }

private:
  void joinIsolatedComponents() {
    // Find blocks that are reachable from the source
    auto Visited = BitVector(NumBlocks(), false);
    findReachable(Func.Entry, Visited);

    // Iterate over all non-reachable blocks and adjust their weights
    for (uint64_t I = 0; I < NumBlocks(); I++) {
      auto &Block = Func.Blocks[I];
      if (Block.Flow > 0 && !Visited[I]) {
        // Find a path from the entry to an exit passing through the block I
        auto Path = findShortestPath(I);
        // Increase the flow along the path
        assert(Path.size() > 0 && Path[0]->Source == Func.Entry &&
               "incorrectly computed path adjusting control flow");
        Func.Blocks[Func.Entry].Flow += 1;
        for (auto &Jump : Path) {
          Jump->Flow += 1;
          Func.Blocks[Jump->Target].Flow += 1;
          // Update reachability
          findReachable(Jump->Target, Visited);
        }
      }
    }
  }
  /// Run BFS from a given block along the jumps with a positive flow and mark
  /// all reachable blocks.
  void findReachable(uint64_t Src, BitVector &Visited) {
    std::queue<uint64_t> Queue;
    Queue.push(Src);
    Visited[Src] = true;
    while (!Queue.empty()) {
      Src = Queue.front();
      Queue.pop();
      for (auto *Jump : Func.Blocks[Src].SuccJumps) {
        uint64_t Dst = Jump->Target;
        if (Jump->Flow > 0 && !Visited[Dst]) {
          Visited[Dst] = true;
          Queue.push(Dst);
        }
      }
    }
  }
  /// Find the shortest path from the entry block to an exit block passing
  /// through a given block.
  std::vector<FlowJump *> findShortestPath(uint64_t BlockIdx) {
    // A path from the entry block to BlockIdx
    auto ForwardPath = findShortestPath(Func.Entry, BlockIdx);
    // A path from BlockIdx to an exit block
    auto BackwardPath = findShortestPath(BlockIdx, AnyExitBlock);

    // Concatenate the two paths
    std::vector<FlowJump *> Result;
    Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end());
    Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end());
    return Result;
  }
  /// Apply the Dijkstra algorithm to find the shortest path from a given
  /// Source to a given Target block.
  /// If Target == -1, then the path ends at an exit block.
  std::vector<FlowJump *> findShortestPath(uint64_t Source, uint64_t Target) {
    // Quit early, if possible
    if (Source == Target)
      return std::vector<FlowJump *>();
    if (Func.Blocks[Source].isExit() && Target == AnyExitBlock)
      return std::vector<FlowJump *>();

    // Initialize data structures
    auto Distance = std::vector<int64_t>(NumBlocks(), INF);
    auto Parent = std::vector<FlowJump *>(NumBlocks(), nullptr);
    Distance[Source] = 0;
    std::set<std::pair<uint64_t, uint64_t>> Queue;
    Queue.insert(std::make_pair(Distance[Source], Source));
    // Run the Dijkstra algorithm
    while (!Queue.empty()) {
      uint64_t Src = Queue.begin()->second;
      Queue.erase(Queue.begin());
      // If we found a solution, quit early
      if (Src == Target ||
          (Func.Blocks[Src].isExit() && Target == AnyExitBlock))
        break;

      for (auto *Jump : Func.Blocks[Src].SuccJumps) {
        uint64_t Dst = Jump->Target;
        int64_t JumpDist = jumpDistance(Jump);
        if (Distance[Dst] > Distance[Src] + JumpDist) {
          Queue.erase(std::make_pair(Distance[Dst], Dst));

          Distance[Dst] = Distance[Src] + JumpDist;
          Parent[Dst] = Jump;

          Queue.insert(std::make_pair(Distance[Dst], Dst));
        }
      }
    }
    // If Target is not provided, find the closest exit block
    if (Target == AnyExitBlock) {
      for (uint64_t I = 0; I < NumBlocks(); I++) {
        if (Func.Blocks[I].isExit() && Parent[I] != nullptr) {
          if (Target == AnyExitBlock || Distance[Target] > Distance[I]) {
            Target = I;
          }
        }
      }
    }
    assert(Parent[Target] != nullptr && "a path does not exist");
    // Extract the constructed path
    std::vector<FlowJump *> Result;
    uint64_t Now = Target;
    while (Now != Source) {
      assert(Now == Parent[Now]->Target && "incorrect parent jump");
      Result.push_back(Parent[Now]);
      Now = Parent[Now]->Source;
    }
    // Reverse the path, since it is extracted from Target to Source
    std::reverse(Result.begin(), Result.end());
    return Result;
  }
  /// A distance of a path for a given jump.
  /// In order to encourage the path to use blocks/jumps with large positive
  /// flow, and avoid changing the branch probability of outgoing edges
  /// drastically, set the jump distance so as:
  /// - to minimize the number of unlikely jumps used and, subject to that,
  /// - to minimize the number of Flow == 0 jumps used and, subject to that,
  /// - to minimize the total multiplicative Flow increase for the remaining
  ///   edges.
  /// To capture this objective with integer distances, we round off fractional
  /// parts to a multiple of 1 / BaseDistance.
  int64_t jumpDistance(FlowJump *Jump) const {
    if (Jump->IsUnlikely)
      return Params.CostUnlikely;
    uint64_t BaseDistance =
        std::max(FlowAdjuster::MinBaseDistance,
                 std::min(Func.Blocks[Func.Entry].Flow,
                          Params.CostUnlikely / (2 * (NumBlocks() + 1))));
    if (Jump->Flow > 0)
      return BaseDistance + BaseDistance / Jump->Flow;
    return 2 * BaseDistance * (NumBlocks() + 1);
  }

  uint64_t NumBlocks() const { return Func.Blocks.size(); }
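  // A worked example of the distance function above (hypothetical values):
  // with BaseDistance = 10000, a jump carrying Flow = 4 gets distance
  // 10000 + 10000 / 4 = 12500, a jump with Flow = 100 gets 10100, and a
  // zero-flow jump gets 2 * 10000 * (NumBlocks() + 1), which exceeds the
  // distance of any simple path made only of positive-flow jumps.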
  /// Rebalance unknown subgraphs so that the flow is split evenly across the
  /// outgoing branches of every block of the subgraph. The method iterates
  /// over blocks with known weight and identifies unknown subgraphs rooted at
  /// the blocks. Then it verifies if flow rebalancing is feasible and applies
  /// it.
  void rebalanceUnknownSubgraphs() {
    // Try to find unknown subgraphs from each block
    for (const FlowBlock &SrcBlock : Func.Blocks) {
      // Verify if rebalancing rooted at SrcBlock is feasible
      if (!canRebalanceAtRoot(&SrcBlock))
        continue;

      // Find an unknown subgraph starting at SrcBlock. Along the way,
      // fill in known destinations and intermediate unknown blocks.
      std::vector<FlowBlock *> UnknownBlocks;
      std::vector<FlowBlock *> KnownDstBlocks;
      findUnknownSubgraph(&SrcBlock, KnownDstBlocks, UnknownBlocks);

      // Verify if rebalancing of the subgraph is feasible. If the search is
      // successful, find the unique destination block (which can be null)
      FlowBlock *DstBlock = nullptr;
      if (!canRebalanceSubgraph(&SrcBlock, KnownDstBlocks, UnknownBlocks,
                                DstBlock))
        continue;

      // We cannot rebalance subgraphs containing cycles among unknown blocks
      if (!isAcyclicSubgraph(&SrcBlock, DstBlock, UnknownBlocks))
        continue;

      rebalanceUnknownSubgraph(&SrcBlock, DstBlock, UnknownBlocks);
    }
  }
  /// Verify if rebalancing rooted at a given block is possible.
  bool canRebalanceAtRoot(const FlowBlock *SrcBlock) {
    // Do not attempt to find unknown subgraphs from an unknown or a
    // zero-weight block
    if (SrcBlock->HasUnknownWeight || SrcBlock->Flow == 0)
      return false;

    // Do not attempt to process subgraphs from a block w/o unknown successors
    bool HasUnknownSuccs = false;
    for (auto *Jump : SrcBlock->SuccJumps) {
      if (Func.Blocks[Jump->Target].HasUnknownWeight) {
        HasUnknownSuccs = true;
        break;
      }
    }
    return HasUnknownSuccs;
  }
  /// Find an unknown subgraph starting at block SrcBlock. The method sets
  /// identified destinations, KnownDstBlocks, and intermediate UnknownBlocks.
  void findUnknownSubgraph(const FlowBlock *SrcBlock,
                           std::vector<FlowBlock *> &KnownDstBlocks,
                           std::vector<FlowBlock *> &UnknownBlocks) {
    // Run BFS from SrcBlock and make sure all paths are going through unknown
    // blocks and end at a known DstBlock
    auto Visited = BitVector(NumBlocks(), false);
    std::queue<uint64_t> Queue;

    Queue.push(SrcBlock->Index);
    Visited[SrcBlock->Index] = true;
    while (!Queue.empty()) {
      auto &Block = Func.Blocks[Queue.front()];
      Queue.pop();
      // Process blocks reachable from Block
      for (auto *Jump : Block.SuccJumps) {
        // If Jump can be ignored, skip it
        if (ignoreJump(SrcBlock, nullptr, Jump))
          continue;

        uint64_t Dst = Jump->Target;
        // If Dst has been visited, skip Jump
        if (Visited[Dst])
          continue;
        Visited[Dst] = true;
        if (!Func.Blocks[Dst].HasUnknownWeight) {
          KnownDstBlocks.push_back(&Func.Blocks[Dst]);
        } else {
          Queue.push(Dst);
          UnknownBlocks.push_back(&Func.Blocks[Dst]);
        }
      }
    }
  }
  /// Verify if rebalancing of the subgraph is feasible. If the checks are
  /// successful, set the unique destination block, DstBlock (can be null).
  bool canRebalanceSubgraph(const FlowBlock *SrcBlock,
                            const std::vector<FlowBlock *> &KnownDstBlocks,
                            const std::vector<FlowBlock *> &UnknownBlocks,
                            FlowBlock *&DstBlock) {
    // If the list of unknown blocks is empty, we don't need rebalancing
    if (UnknownBlocks.empty())
      return false;

    // If there are multiple known sinks, we can't rebalance
    if (KnownDstBlocks.size() > 1)
      return false;
    DstBlock = KnownDstBlocks.empty() ? nullptr : KnownDstBlocks.front();

    // Verify sinks of the subgraph
    for (auto *Block : UnknownBlocks) {
      if (Block->SuccJumps.empty()) {
        // If there are multiple (known and unknown) sinks, we can't rebalance
        if (DstBlock != nullptr)
          return false;
        DstBlock = Block;
        continue;
      }
      size_t NumIgnoredJumps = 0;
      for (auto *Jump : Block->SuccJumps) {
        if (ignoreJump(SrcBlock, DstBlock, Jump))
          NumIgnoredJumps++;
      }
      // If there is a non-sink block in UnknownBlocks with all jumps ignored,
      // then we can't rebalance
      if (NumIgnoredJumps == Block->SuccJumps.size())
        return false;
    }

    return true;
  }
  /// Decide whether the Jump is ignored while processing an unknown subgraph
  /// rooted at basic block SrcBlock with the destination block, DstBlock.
  bool ignoreJump(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
                  const FlowJump *Jump) {
    // Ignore unlikely jumps with zero flow
    if (Jump->IsUnlikely && Jump->Flow == 0)
      return true;

    auto JumpSource = &Func.Blocks[Jump->Source];
    auto JumpTarget = &Func.Blocks[Jump->Target];

    // Do not ignore jumps coming into DstBlock
    if (DstBlock != nullptr && JumpTarget == DstBlock)
      return false;

    // Ignore jumps out of SrcBlock to known blocks
    if (!JumpTarget->HasUnknownWeight && JumpSource == SrcBlock)
      return true;

    // Ignore jumps to known blocks with zero flow
    if (!JumpTarget->HasUnknownWeight && JumpTarget->Flow == 0)
      return true;

    return false;
  }
  /// Verify if the given unknown subgraph is acyclic, and if yes, reorder
  /// UnknownBlocks in the topological order (so that all jumps are "forward").
  bool isAcyclicSubgraph(const FlowBlock *SrcBlock, FlowBlock *DstBlock,
                         std::vector<FlowBlock *> &UnknownBlocks) {
    // Extract local in-degrees in the considered subgraph
    auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
    auto fillInDegree = [&](const FlowBlock *Block) {
      for (auto *Jump : Block->SuccJumps) {
        if (ignoreJump(SrcBlock, DstBlock, Jump))
          continue;
        LocalInDegree[Jump->Target]++;
      }
    };
    fillInDegree(SrcBlock);
    for (auto *Block : UnknownBlocks) {
      fillInDegree(Block);
    }
    // A loop containing SrcBlock
    if (LocalInDegree[SrcBlock->Index] > 0)
      return false;
    std::vector<FlowBlock *> AcyclicOrder;
    std::queue<uint64_t> Queue;
    Queue.push(SrcBlock->Index);
    while (!Queue.empty()) {
      FlowBlock *Block = &Func.Blocks[Queue.front()];
      Queue.pop();
      // Stop propagation once we reach DstBlock, if any
      if (DstBlock != nullptr && Block == DstBlock)
        break;

      // Keep an acyclic order of unknown blocks
      if (Block->HasUnknownWeight && Block != SrcBlock)
        AcyclicOrder.push_back(Block);

      // Add to the queue all successors with zero local in-degree
      for (auto *Jump : Block->SuccJumps) {
        if (ignoreJump(SrcBlock, DstBlock, Jump))
          continue;
        uint64_t Dst = Jump->Target;
        LocalInDegree[Dst]--;
        if (LocalInDegree[Dst] == 0) {
          Queue.push(Dst);
        }
      }
    }

    // If there is a cycle in the subgraph, AcyclicOrder contains only a subset
    // of all blocks
    if (UnknownBlocks.size() != AcyclicOrder.size())
      return false;
    UnknownBlocks = AcyclicOrder;
    return true;
  }
constFlowBlock *SrcBlock,
980const std::vector<FlowBlock *> &UnknownBlocks) {
981assert(SrcBlock->
Flow > 0 &&
"zero-flow block in unknown subgraph");
983// Ditribute flow from the source block 985// SrcBlock's flow is the sum of outgoing flows along non-ignored jumps 987if (ignoreJump(SrcBlock, DstBlock, Jump))
989 BlockFlow += Jump->
Flow;
991 rebalanceBlock(SrcBlock, DstBlock, SrcBlock, BlockFlow);
993// Ditribute flow from the remaining blocks 994for (
auto *
Block : UnknownBlocks) {
995assert(
Block->HasUnknownWeight &&
"incorrect unknown subgraph");
997// Block's flow is the sum of incoming flows 998for (
auto *Jump :
Block->PredJumps) {
999 BlockFlow += Jump->
Flow;
1001Block->Flow = BlockFlow;
1002 rebalanceBlock(SrcBlock, DstBlock,
Block, BlockFlow);
1006 /// Redistribute flow for a block in a subgraph rooted at SrcBlock, 1007 /// and ending at DstBlock. 1010// Process all successor jumps and update corresponding flow values 1011size_t BlockDegree = 0;
1012for (
auto *Jump :
Block->SuccJumps) {
1013if (ignoreJump(SrcBlock, DstBlock, Jump))
1017// If all successor jumps of the block are ignored, skip it 1018if (DstBlock ==
nullptr && BlockDegree == 0)
1020assert(BlockDegree > 0 &&
"all outgoing jumps are ignored");
1022// Each of the Block's successors gets the following amount of flow. 1023// Rounding the value up so that all flow is propagated 1024uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree;
1025for (
auto *Jump :
Block->SuccJumps) {
1026if (ignoreJump(SrcBlock, DstBlock, Jump))
1032assert(BlockFlow == 0 &&
"not all flow is propagated");
1035 /// A constant indicating an arbitrary exit block of a function. 1037 /// Minimum BaseDistance for the jump distance values in island joining. 1038staticconstexpruint64_t MinBaseDistance = 10000;
1040 /// Params for flow computation. 1046std::pair<int64_t, int64_t> assignBlockCosts(
constProfiParams &Params,
1048std::pair<int64_t, int64_t> assignJumpCosts(
constProfiParams &Params,
1051/// Initializing flow network for a given function. 1053/// Every block is split into two nodes that are responsible for (i) an 1054/// incoming flow, (ii) an outgoing flow; they penalize an increase or a 1055/// reduction of the block weight. 1056void initializeNetwork(
constProfiParams &Params, MinCostMaxFlow &Network,
1058uint64_t NumBlocks = Func.Blocks.size();
1059assert(NumBlocks > 1 &&
"Too few blocks in a function");
1060uint64_t NumJumps = Func.Jumps.size();
1061assert(NumJumps > 0 &&
"Too few jumps in a function");
1063// Introducing dummy source/sink pairs to allow flow circulation. 1064// The nodes corresponding to blocks of the function have indices in 1065// the range [0 .. 2 * NumBlocks); the dummy sources/sinks are indexed by the 1072 Network.initialize(2 * NumBlocks + 4,
S1,
T1);
1074// Initialize nodes of the flow network 1076auto &
Block = Func.Blocks[
B];
1078// Split every block into two auxiliary nodes to allow 1079// increase/reduction of the block count. 1083// Edges from S and to T 1084if (
Block.isEntry()) {
1085 Network.addEdge(S,
Bin, 0);
1086 }
elseif (
Block.isExit()) {
1087 Network.addEdge(Bout,
T, 0);
1090// Assign costs for increasing/decreasing the block counts 1091auto [AuxCostInc, AuxCostDec] = assignBlockCosts(Params,
Block);
1093// Add the corresponding edges to the network 1094 Network.addEdge(
Bin, Bout, AuxCostInc);
1095if (
Block.Weight > 0) {
1096 Network.addEdge(Bout,
Bin,
Block.Weight, AuxCostDec);
1097 Network.addEdge(
S1, Bout,
Block.Weight, 0);
1102// Initialize edges of the flow network 1103for (
uint64_t J = 0; J < NumJumps; J++) {
1104auto &Jump = Func.Jumps[J];
1106// Get the endpoints corresponding to the jump 1110// Assign costs for increasing/decreasing the jump counts 1111auto [AuxCostInc, AuxCostDec] = assignJumpCosts(Params, Jump);
1113// Add the corresponding edges to the network 1114 Network.addEdge(Jin, Jout, AuxCostInc);
1116 Network.addEdge(Jout, Jin, Jump.
Weight, AuxCostDec);
1117 Network.addEdge(
S1, Jout, Jump.
Weight, 0);
1118 Network.addEdge(Jin,
T1, Jump.
Weight, 0);
1122// Make sure we have a valid flow circulation 1123 Network.addEdge(
T, S, 0);
1126/// Assign costs for increasing/decreasing the block counts. 1127std::pair<int64_t, int64_t> assignBlockCosts(
constProfiParams &Params,
1129// Modifying the weight of an unlikely block is expensive 1130if (
Block.IsUnlikely)
1133// Assign default values for the costs 1136// Update the costs depending on the block metadata 1137if (
Block.HasUnknownWeight) {
1141// Increasing the count for "cold" blocks with zero initial count is more 1142// expensive than for "hot" ones 1143if (
Block.Weight == 0)
1145// Modifying the count of the entry block is expensive 1146if (
Block.isEntry()) {
1151return std::make_pair(CostInc, CostDec);
1154/// Assign costs for increasing/decreasing the jump counts. 1155std::pair<int64_t, int64_t> assignJumpCosts(
constProfiParams &Params,
1157// Modifying the weight of an unlikely jump is expensive 1161// Assign default values for the costs 1164// Update the costs depending on the block metadata 1166// Adjusting the fall-through branch 1171// The cost is different for fall-through and non-fall-through branches 1178assert(Jump.
Weight > 0 &&
"found zero-weight jump with a positive weight");
1180return std::make_pair(CostInc, CostDec);
1183/// Extract resulting block and edge counts from the flow network. 1184void extractWeights(
constProfiParams &Params, MinCostMaxFlow &Network,
1186uint64_t NumBlocks = Func.Blocks.size();
1187uint64_t NumJumps = Func.Jumps.size();
1189// Extract resulting jump counts 1190for (
uint64_t J = 0; J < NumJumps; J++) {
1191auto &Jump = Func.Jumps[J];
1196 int64_t AuxFlow = Network.getFlow(SrcOut, DstIn);
1200Flow = int64_t(Jump.
Weight) + (AuxFlow > 0 ? AuxFlow : 0);
1206// Extract resulting block counts 1207auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
1208auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
1209for (
auto &Jump : Func.Jumps) {
1214auto &
Block = Func.Blocks[
B];
1215Block.Flow = std::max(OutFlow[
B], InFlow[
B]);
1220/// Verify that the provided block/jump weights are as expected. 1222// Verify entry and exit blocks 1223assert(Func.Entry == 0 && Func.Blocks[0].isEntry());
1224size_t NumExitBlocks = 0;
1225for (
size_tI = 1;
I < Func.Blocks.size();
I++) {
1226assert(!Func.Blocks[
I].isEntry() &&
"multiple entry blocks");
1227if (Func.Blocks[
I].isExit())
1230assert(NumExitBlocks > 0 &&
"cannot find exit blocks");
1232// Verify that there are no parallel edges 1233for (
auto &
Block : Func.Blocks) {
1234 std::unordered_set<uint64_t> UniqueSuccs;
1235for (
auto &Jump :
Block.SuccJumps) {
1236auto It = UniqueSuccs.insert(Jump->
Target);
1237assert(It.second &&
"input CFG contains parallel edges");
1241for (
auto &
Block : Func.Blocks) {
1243"a block cannot be an entry and an exit");
1245// Verify input block weights 1246for (
auto &
Block : Func.Blocks) {
1248"non-zero weight of a block w/o weight except for an entry");
1250// Verify input jump weights 1251for (
auto &Jump : Func.Jumps) {
1253"non-zero weight of a jump w/o weight");
1257/// Verify that the computed flow values satisfy flow conservation rules. 1259constuint64_t NumBlocks = Func.Blocks.size();
1260auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
1261auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
1262for (
constauto &Jump : Func.Jumps) {
1270auto &
Block = Func.Blocks[
I];
1271if (
Block.isEntry()) {
1272 TotalInFlow +=
Block.Flow;
1273assert(
Block.Flow == OutFlow[
I] &&
"incorrectly computed control flow");
1274 }
elseif (
Block.isExit()) {
1275 TotalOutFlow +=
Block.Flow;
1276assert(
Block.Flow == InFlow[
I] &&
"incorrectly computed control flow");
1278assert(
Block.Flow == OutFlow[
I] &&
"incorrectly computed control flow");
1279assert(
Block.Flow == InFlow[
I] &&
"incorrectly computed control flow");
1282assert(TotalInFlow == TotalOutFlow &&
"incorrectly computed control flow");
1284// Verify that there are no isolated flow components 1285// One could modify FlowFunction to hold edges indexed by the sources, which 1286// will avoid a creation of the object 1287auto PositiveFlowEdges = std::vector<std::vector<uint64_t>>(NumBlocks);
1288for (
constauto &Jump : Func.Jumps) {
1294// Run BFS from the source along edges with positive flow 1295 std::queue<uint64_t> Queue;
1296auto Visited =
BitVector(NumBlocks,
false);
1297 Queue.push(Func.Entry);
1298 Visited[Func.Entry] =
true;
1299while (!Queue.empty()) {
1302for (
uint64_t Dst : PositiveFlowEdges[Src]) {
1310// Verify that every block that has a positive flow is reached from the source 1311// along edges with a positive flow 1313auto &
Block = Func.Blocks[
I];
1314assert((Visited[
I] ||
Block.Flow == 0) &&
"an isolated flow component");
1319}
// end of anonymous namespace 1321/// Apply the profile inference algorithm for a given function and provided 1324// Check if the function has samples and assign initial flow values 1325bool HasSamples =
false;
1327if (
Block.Weight > 0)
1337// Quit early for functions with a single block or ones w/o samples 1338if (Func.Blocks.size() <= 1 || !HasSamples)
  // Verify the input data
  verifyInput(Func);

  // Create and apply an inference network model
  auto InferenceNetwork = MinCostMaxFlow(Params);
  initializeNetwork(Params, InferenceNetwork, Func);
  InferenceNetwork.run();

  // Extract flow values for every block and every edge
  extractWeights(Params, InferenceNetwork, Func);

  // Post-processing adjustments to the flow
  auto Adjuster = FlowAdjuster(Params, Func);
  Adjuster.run();
}

/// Apply the profile inference algorithm for a given flow function.
void llvm::applyFlowInference(FlowFunction &Func) {
  // Set the params from the command-line flags.