Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Add comments to cardinality estimator code#19973

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.

Already on GitHub?Sign in to your account

Open
Tmonster wants to merge1 commit intoduckdb:main
base:main
Choose a base branch
Loading
fromTmonster:joo_optimizer_comments
Open
Show file tree
Hide file tree
Changes fromall commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
updating comments on cardinality estimator relations statistics helpe…
…rs etc.
  • Loading branch information
@Tmonster
Tmonster committedNov 28, 2025
commit77259b9d1d10cb6963e1be9e39b1fe5cd31a976e
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -26,36 +26,38 @@ struct DenomInfo {
double denominator;
};

structRelationsToTDom {
structRelationsSetToStats {
//! column binding sets that are equivalent in a join plan.
//! if you have A.x = B.y and B.y = C.z, then one set is {A.x, B.y, C.z}.
column_binding_set_t equivalent_relations;
//!the estimated total domains of the equivalent relations determined using HLL
idx_ttdom_hll;
idx_tdistinct_count_hll;
//! the estimated total domains of each relation without using HLL
idx_ttdom_no_hll;
boolhas_tdom_hll;
idx_tdistinct_count_no_hll;
boolhas_distinct_count_hll;
vector<optional_ptr<FilterInfo>> filters;
vector<string> column_names;

explicitRelationsToTDom(const column_binding_set_t &column_binding_set)
: equivalent_relations(column_binding_set),tdom_hll(0), tdom_no_hll(NumericLimits<idx_t>::Maximum()),
has_tdom_hll(false) {};
explicitRelationsSetToStats(const column_binding_set_t &column_binding_set)
: equivalent_relations(column_binding_set),distinct_count_hll(0),
distinct_count_no_hll(NumericLimits<idx_t>::Maximum()), has_distinct_count_hll(false) {};
};

// class to wrap a join Filter along with some statistical information about the joined columns
class FilterInfoWithTotalDomains {
public:
FilterInfoWithTotalDomains(optional_ptr<FilterInfo> filter_info, RelationsToTDom &relation2tdom)
: filter_info(filter_info), tdom_hll(relation2tdom.tdom_hll), tdom_no_hll(relation2tdom.tdom_no_hll),
has_tdom_hll(relation2tdom.has_tdom_hll) {
FilterInfoWithTotalDomains(optional_ptr<FilterInfo> filter_info, RelationsSetToStats &relation_set_to_stats)
: filter_info(filter_info), distinct_count_hll(relation_set_to_stats.distinct_count_hll),
distinct_count_no_hll(relation_set_to_stats.distinct_count_no_hll),
has_distinct_count_hll(relation_set_to_stats.has_distinct_count_hll) {
}

optional_ptr<FilterInfo> filter_info;
//!the estimatedtotal domains oftheequivalent relations determined using HLL
idx_ttdom_hll;
//!the estimateddistinct countthejoined columns determined using HLL
idx_tdistinct_count_hll;
//! the estimated total domains of each relation without using HLL
idx_ttdom_no_hll;
boolhas_tdom_hll;
idx_tdistinct_count_no_hll;
boolhas_distinct_count_hll;
};

struct Subgraph2Denominator {
Expand DownExpand Up@@ -91,7 +93,7 @@ class CardinalityEstimator {
explicit CardinalityEstimator() {};

private:
vector<RelationsToTDom> relations_to_tdoms;
vector<RelationsSetToStats> relation_set_stats;
unordered_map<string, CardinalityHelper> relation_set_2_cardinality;
JoinRelationSetManager set_manager;
vector<RelationStats> relation_stats;
Expand All@@ -109,8 +111,8 @@ class CardinalityEstimator {
T EstimateCardinalityWithSet(JoinRelationSet &new_set);

//! used for debugging.
voidAddRelationNamesToTdoms(vector<RelationStats> &stats);
voidPrintRelationToTdomInfo();
voidAddRelationNamesToRelationStats(vector<RelationStats> &stats);
voidPrintRelationStats();

private:
double GetNumerator(JoinRelationSet &set);
Expand All@@ -128,7 +130,7 @@ class CardinalityEstimator {
JoinRelationSet &UpdateNumeratorRelations(Subgraph2Denominator left, Subgraph2Denominator right,
FilterInfoWithTotalDomains &filter);

voidAddRelationTdom(FilterInfo &filter_info);
voidAddRelationStats(FilterInfo &filter_info);
bool EmptyFilter(FilterInfo &filter_info);
};

Expand Down
122 changes: 71 additions & 51 deletionssrc/optimizer/join_order/cardinality_estimator.cpp
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -22,9 +22,9 @@ bool CardinalityEstimator::EmptyFilter(FilterInfo &filter_info) {
return false;
}

void CardinalityEstimator::AddRelationTdom(FilterInfo &filter_info) {
void CardinalityEstimator::AddRelationStats(FilterInfo &filter_info) {
D_ASSERT(filter_info.set.get().count >= 1);
for (constRelationsToTDom &r2tdom :relations_to_tdoms) {
for (constRelationsSetToStats &r2tdom :relation_set_stats) {
auto &i_set = r2tdom.equivalent_relations;
if (i_set.find(filter_info.left_binding) != i_set.end()) {
// found an equivalent filter
Expand All@@ -33,9 +33,9 @@ void CardinalityEstimator::AddRelationTdom(FilterInfo &filter_info) {
}

auto key = ColumnBinding(filter_info.left_binding.table_index, filter_info.left_binding.column_index);
RelationsToTDom new_r2tdom(column_binding_set_t({key}));
RelationsSetToStats new_r2tdom(column_binding_set_t({key}));

relations_to_tdoms.emplace_back(new_r2tdom);
relation_set_stats.emplace_back(new_r2tdom);
}

bool CardinalityEstimator::SingleColumnFilter(duckdb::FilterInfo &filter_info) {
Expand All@@ -56,7 +56,7 @@ vector<idx_t> CardinalityEstimator::DetermineMatchingEquivalentSets(optional_ptr
vector<idx_t> matching_equivalent_sets;
idx_t equivalent_relation_index = 0;

for (constRelationsToTDom &r2tdom :relations_to_tdoms) {
for (constRelationsSetToStats &r2tdom :relation_set_stats) {
auto &i_set = r2tdom.equivalent_relations;
if (i_set.find(filter_info->left_binding) != i_set.end()) {
matching_equivalent_sets.push_back(equivalent_relation_index);
Expand All@@ -77,27 +77,27 @@ void CardinalityEstimator::AddToEquivalenceSets(optional_ptr<FilterInfo> filter_
// an equivalence relation is connecting two sets of equivalence relations
// so push all relations from the second set into the first. Later we will delete
// the second set.
for (ColumnBinding i :relations_to_tdoms.at(matching_equivalent_sets[1]).equivalent_relations) {
relations_to_tdoms.at(matching_equivalent_sets[0]).equivalent_relations.insert(i);
for (ColumnBinding i :relation_set_stats.at(matching_equivalent_sets[1]).equivalent_relations) {
relation_set_stats.at(matching_equivalent_sets[0]).equivalent_relations.insert(i);
}
for (auto &column_name :relations_to_tdoms.at(matching_equivalent_sets[1]).column_names) {
relations_to_tdoms.at(matching_equivalent_sets[0]).column_names.push_back(column_name);
for (auto &column_name :relation_set_stats.at(matching_equivalent_sets[1]).column_names) {
relation_set_stats.at(matching_equivalent_sets[0]).column_names.push_back(column_name);
}
relations_to_tdoms.at(matching_equivalent_sets[1]).equivalent_relations.clear();
relations_to_tdoms.at(matching_equivalent_sets[1]).column_names.clear();
relations_to_tdoms.at(matching_equivalent_sets[0]).filters.push_back(filter_info);
relation_set_stats.at(matching_equivalent_sets[1]).equivalent_relations.clear();
relation_set_stats.at(matching_equivalent_sets[1]).column_names.clear();
relation_set_stats.at(matching_equivalent_sets[0]).filters.push_back(filter_info);
// add all values of one set to the other, delete the empty one
} else if (matching_equivalent_sets.size() == 1) {
auto &tdom_i =relations_to_tdoms.at(matching_equivalent_sets.at(0));
auto &tdom_i =relation_set_stats.at(matching_equivalent_sets.at(0));
tdom_i.equivalent_relations.insert(filter_info->left_binding);
tdom_i.equivalent_relations.insert(filter_info->right_binding);
tdom_i.filters.push_back(filter_info);
} else if (matching_equivalent_sets.empty()) {
column_binding_set_t tmp;
tmp.insert(filter_info->left_binding);
tmp.insert(filter_info->right_binding);
relations_to_tdoms.emplace_back(tmp);
relations_to_tdoms.back().filters.push_back(filter_info);
relation_set_stats.emplace_back(tmp);
relation_set_stats.back().filters.push_back(filter_info);
}
}

Expand All@@ -108,7 +108,7 @@ void CardinalityEstimator::InitEquivalentRelations(const vector<unique_ptr<Filte
if (SingleColumnFilter(*filter)) {
// Filter on one relation, (i.e. string or range filter on a column).
// Grab the first relation and add it to the equivalence_relations
AddRelationTdom(*filter);
AddRelationStats(*filter);
continue;
} else if (EmptyFilter(*filter)) {
continue;
Expand All@@ -123,9 +123,10 @@ void CardinalityEstimator::InitEquivalentRelations(const vector<unique_ptr<Filte
}

void CardinalityEstimator::RemoveEmptyTotalDomains() {
auto remove_start = std::remove_if(relations_to_tdoms.begin(), relations_to_tdoms.end(),
[](RelationsToTDom &r_2_tdom) { return r_2_tdom.equivalent_relations.empty(); });
relations_to_tdoms.erase(remove_start, relations_to_tdoms.end());
auto remove_start =
std::remove_if(relation_set_stats.begin(), relation_set_stats.end(),
[](RelationsSetToStats &r_2_tdom) { return r_2_tdom.equivalent_relations.empty(); });
relation_set_stats.erase(remove_start, relation_set_stats.end());
}

double CardinalityEstimator::GetNumerator(JoinRelationSet &set) {
Expand DownExpand Up@@ -153,7 +154,7 @@ bool EdgeConnects(FilterInfoWithTotalDomains &edge, Subgraph2Denominator &subgra
return false;
}

vector<FilterInfoWithTotalDomains> GetEdges(vector<RelationsToTDom> &relations_to_tdom,
vector<FilterInfoWithTotalDomains> GetEdges(vector<RelationsSetToStats> &relations_to_tdom,
JoinRelationSet &requested_set) {
vector<FilterInfoWithTotalDomains> res;
for (auto &relation_2_tdom : relations_to_tdom) {
Expand DownExpand Up@@ -213,6 +214,8 @@ JoinRelationSet &CardinalityEstimator::UpdateNumeratorRelations(Subgraph2Denomin
}
}

// Given two relations, here is where we considers the filter(s) that join them.
// This could use some work when it comes to join conditions that are not equality join conditions
double CardinalityEstimator::CalculateUpdatedDenom(Subgraph2Denominator left, Subgraph2Denominator right,
FilterInfoWithTotalDomains &filter) {
double new_denom = left.denom * right.denom;
Expand All@@ -226,8 +229,8 @@ double CardinalityEstimator::CalculateUpdatedDenom(Subgraph2Denominator left, Su
}
});
if (comparison_type == ExpressionType::INVALID) {
new_denom *=
filter.has_tdom_hll ? static_cast<double>(filter.tdom_hll): static_cast<double>(filter.tdom_no_hll);
new_denom *= filter.has_distinct_count_hll ? static_cast<double>(filter.distinct_count_hll)
: static_cast<double>(filter.distinct_count_no_hll);
// no comparison is taking place, so the denominator is just the product of the left and right
return new_denom;
}
Expand All@@ -238,8 +241,8 @@ double CardinalityEstimator::CalculateUpdatedDenom(Subgraph2Denominator left, Su
case ExpressionType::COMPARE_EQUAL:
case ExpressionType::COMPARE_NOT_DISTINCT_FROM:
// extra ratio stays 1
extra_ratio =
filter.has_tdom_hll ? static_cast<double>(filter.tdom_hll): static_cast<double>(filter.tdom_no_hll);
extra_ratio = filter.has_distinct_count_hll ? static_cast<double>(filter.distinct_count_hll)
: static_cast<double>(filter.distinct_count_no_hll);
break;
case ExpressionType::COMPARE_LESSTHANOREQUALTO:
case ExpressionType::COMPARE_LESSTHAN:
Expand All@@ -248,8 +251,8 @@ double CardinalityEstimator::CalculateUpdatedDenom(Subgraph2Denominator left, Su
case ExpressionType::COMPARE_NOTEQUAL:
case ExpressionType::COMPARE_DISTINCT_FROM:
// Assume this blows up, but use the tdom to bound it a bit
extra_ratio =
filter.has_tdom_hll ? static_cast<double>(filter.tdom_hll): static_cast<double>(filter.tdom_no_hll);
extra_ratio = filter.has_distinct_count_hll ? static_cast<double>(filter.distinct_count_hll)
: static_cast<double>(filter.distinct_count_no_hll);
extra_ratio = pow(extra_ratio, 2.0 / 3.0);
break;
default:
Expand DownExpand Up@@ -291,12 +294,12 @@ DenomInfo CardinalityEstimator::GetDenominator(JoinRelationSet &set) {

// edges are guaranteed to be in order of largest tdom to smallest tdom.
unordered_set<idx_t> unused_edge_tdoms;
auto edges = GetEdges(relations_to_tdoms, set);
auto edges = GetEdges(relation_set_stats, set);
for (auto &edge : edges) {
if (subgraphs.size() == 1 && subgraphs.at(0).relations->ToString() == set.ToString()) {
// the first subgraph has connected all the desired relations, just skip the rest of the edges
if (edge.has_tdom_hll) {
unused_edge_tdoms.insert(edge.tdom_hll);
if (edge.has_distinct_count_hll) {
unused_edge_tdoms.insert(edge.distinct_count_hll);
}
continue;
}
Expand DownExpand Up@@ -392,6 +395,18 @@ DenomInfo CardinalityEstimator::GetDenominator(JoinRelationSet &set) {
return DenomInfo(*subgraphs.at(0).numerator_relations, 1, subgraphs.at(0).denom * denom_multiplier);
}

// Cardinality is calculatd using logic found in
// https://blobs.duckdb.org/papers/tom-ebergen-msc-thesis-join-order-optimization-with-almost-no-statistics.pdf TL;DR
// Cardinality is estimated based on cardinality of base tables and the distinct counts of joined columns. If you have
// two tables A and B joined using A.x = B.y we assume that each tuple in A will match ~ B/(distinct(y)) tuples in B.
// The cardinality estimation then becomes (|A|x|B|) / max(distinct(x), distinct(y)).
// If there are extra joins, you can add the cardinality of the table to the numerator, and the
// distinct count of the join condition to the denominator.
// One benefit of this cardinality estimation formula is that it is associative and commutative, which means regardless
// of the order of the joins/join tree, the cardinality estimate will always be the same. The drawback of this current
// implementation, however, is that it only considers equality join conditions. Some modification have been made for
// comparison types like <, <=, >, >=, !=, but only a "penalty" was introduced, and the calculated cardinality is not
// based on stats (see CalculateUpdatedDenom()).
template <>
double CardinalityEstimator::EstimateCardinalityWithSet(JoinRelationSet &new_set) {
if (relation_set_2_cardinality.find(new_set.ToString()) != relation_set_2_cardinality.end()) {
Expand All@@ -400,6 +415,8 @@ double CardinalityEstimator::EstimateCardinalityWithSet(JoinRelationSet &new_set

// can happen if a table has cardinality 0, or a tdom is set to 0
auto denom = GetDenominator(new_set);
// we pass numerator relations, because for semi and anti joins, we don't want to
// include cardinalities of relations on the RHS of a semi/anti join.
auto numerator = GetNumerator(denom.numerator_relations);

double result = numerator / denom.denominator;
Expand All@@ -418,17 +435,17 @@ idx_t CardinalityEstimator::EstimateCardinalityWithSet(JoinRelationSet &new_set)
return (idx_t)cardinality_as_double;
}

bool SortTdoms(constRelationsToTDom &a, constRelationsToTDom &b) {
if (a.has_tdom_hll && b.has_tdom_hll) {
return a.tdom_hll > b.tdom_hll;
bool SortTdoms(constRelationsSetToStats &a, constRelationsSetToStats &b) {
if (a.has_distinct_count_hll && b.has_distinct_count_hll) {
return a.distinct_count_hll > b.distinct_count_hll;
}
if (a.has_tdom_hll) {
return a.tdom_hll > b.tdom_no_hll;
if (a.has_distinct_count_hll) {
return a.distinct_count_hll > b.distinct_count_no_hll;
}
if (b.has_tdom_hll) {
return a.tdom_no_hll > b.tdom_hll;
if (b.has_distinct_count_hll) {
return a.distinct_count_no_hll > b.distinct_count_hll;
}
return a.tdom_no_hll > b.tdom_no_hll;
return a.distinct_count_no_hll > b.distinct_count_no_hll;
}

void CardinalityEstimator::InitCardinalityEstimatorProps(optional_ptr<JoinRelationSet> set, RelationStats &stats) {
Expand All@@ -442,7 +459,7 @@ void CardinalityEstimator::InitCardinalityEstimatorProps(optional_ptr<JoinRelati
UpdateTotalDomains(set, stats);

// sort relations from greatest tdom to lowest tdom.
std::sort(relations_to_tdoms.begin(),relations_to_tdoms.end(), SortTdoms);
std::sort(relation_set_stats.begin(),relation_set_stats.end(), SortTdoms);
}

void CardinalityEstimator::UpdateTotalDomains(optional_ptr<JoinRelationSet> set, RelationStats &stats) {
Expand All@@ -456,19 +473,21 @@ void CardinalityEstimator::UpdateTotalDomains(optional_ptr<JoinRelationSet> set,
//! the cardinality
// Update the relation_to_tdom set with the estimated distinct count (or tdom) calculated above
auto key = ColumnBinding(relation_id, i);
for (auto &relation_to_tdom :relations_to_tdoms) {
for (auto &relation_to_tdom :relation_set_stats) {
column_binding_set_t i_set = relation_to_tdom.equivalent_relations;
if (i_set.find(key) == i_set.end()) {
continue;
}
auto distinct_count = stats.column_distinct_count.at(i);
if (distinct_count.from_hll && relation_to_tdom.has_tdom_hll) {
relation_to_tdom.tdom_hll = MaxValue(relation_to_tdom.tdom_hll, distinct_count.distinct_count);
} else if (distinct_count.from_hll && !relation_to_tdom.has_tdom_hll) {
relation_to_tdom.has_tdom_hll = true;
relation_to_tdom.tdom_hll = distinct_count.distinct_count;
if (distinct_count.from_hll && relation_to_tdom.has_distinct_count_hll) {
relation_to_tdom.distinct_count_hll =
MaxValue(relation_to_tdom.distinct_count_hll, distinct_count.distinct_count);
} else if (distinct_count.from_hll && !relation_to_tdom.has_distinct_count_hll) {
relation_to_tdom.has_distinct_count_hll = true;
relation_to_tdom.distinct_count_hll = distinct_count.distinct_count;
} else {
relation_to_tdom.tdom_no_hll = MinValue(distinct_count.distinct_count, relation_to_tdom.tdom_no_hll);
relation_to_tdom.distinct_count_no_hll =
MinValue(distinct_count.distinct_count, relation_to_tdom.distinct_count_no_hll);
}
break;
}
Expand All@@ -477,9 +496,9 @@ void CardinalityEstimator::UpdateTotalDomains(optional_ptr<JoinRelationSet> set,

// LCOV_EXCL_START

void CardinalityEstimator::AddRelationNamesToTdoms(vector<RelationStats> &stats) {
void CardinalityEstimator::AddRelationNamesToRelationStats(vector<RelationStats> &stats) {
#ifdef DEBUG
for (auto &total_domain :relations_to_tdoms) {
for (auto &total_domain :relation_set_stats) {
for (auto &binding : total_domain.equivalent_relations) {
D_ASSERT(binding.table_index < stats.size());
string column_name;
Expand All@@ -494,14 +513,15 @@ void CardinalityEstimator::AddRelationNamesToTdoms(vector<RelationStats> &stats)
#endif
}

void CardinalityEstimator::PrintRelationToTdomInfo() {
for (auto &total_domain :relations_to_tdoms) {
void CardinalityEstimator::PrintRelationStats() {
for (auto &total_domain :relation_set_stats) {
string domain = "Following columns have the same distinct count: ";
for (auto &column_name : total_domain.column_names) {
domain += column_name + ", ";
}
bool have_hll = total_domain.has_tdom_hll;
domain += "\n TOTAL DOMAIN = " + to_string(have_hll ? total_domain.tdom_hll : total_domain.tdom_no_hll);
bool have_hll = total_domain.has_distinct_count_hll;
domain += "\n TOTAL DOMAIN = " +
to_string(have_hll ? total_domain.distinct_count_hll : total_domain.distinct_count_no_hll);
Printer::Print(domain);
}
}
Expand Down
2 changes: 2 additions & 0 deletionssrc/optimizer/join_order/cost_model.cpp
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -8,6 +8,8 @@ CostModel::CostModel(QueryGraphManager &query_graph_manager)
: query_graph_manager(query_graph_manager), cardinality_estimator() {
}

// Currently cost of a join only factors in the cardinalities.
// If join types and join algorithms are to be considered, they should be added here.
double CostModel::ComputeCost(DPJoinNode &left, DPJoinNode &right) {
auto &combination = query_graph_manager.set_manager.Union(left.set, right.set);
auto join_card = cardinality_estimator.EstimateCardinalityWithSet<double>(combination);
Expand Down
2 changes: 1 addition & 1 deletionsrc/optimizer/join_order/plan_enumerator.cpp
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -451,7 +451,7 @@ void PlanEnumerator::InitLeafPlans() {
auto relation_stats = query_graph_manager.relation_manager.GetRelationStats();

cost_model.cardinality_estimator.InitEquivalentRelations(query_graph_manager.GetFilterBindings());
cost_model.cardinality_estimator.AddRelationNamesToTdoms(relation_stats);
cost_model.cardinality_estimator.AddRelationNamesToRelationStats(relation_stats);

// then update the total domains based on the cardinalities of each relation.
for (idx_t i = 0; i < relation_stats.size(); i++) {
Expand Down
Loading

[8]ページ先頭

©2009-2025 Movatter.jp