|
15 | 15 | *
|
16 | 16 | *
|
17 | 17 | * IDENTIFICATION
|
18 |
| - * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.134 2003/03/23 05:14:36 tgl Exp $ |
| 18 | + * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.135 2003/04/15 05:18:12 tgl Exp $ |
19 | 19 | *
|
20 | 20 | *-------------------------------------------------------------------------
|
21 | 21 | */
|
@@ -1591,27 +1591,33 @@ eqjoinsel(PG_FUNCTION_ARGS)
|
1591 | 1591 | {
|
1592 | 1592 | /*
|
1593 | 1593 | * We do not have MCV lists for both sides. Estimate the join
|
1594 |
| - * selectivity as MIN(1/nd1, 1/nd2). This is plausible if we |
1595 |
| - * assume that the values are about equally distributed: a |
1596 |
| - * given tuple of rel1 will join to either 0 or N2/nd2 rows of |
1597 |
| - * rel2, so total join rows are at most N1*N2/nd2 giving a |
1598 |
| - * join selectivity of not more than 1/nd2. By the same logic |
1599 |
| - * it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper |
1600 |
| - * bound. Using the MIN() means we estimate from the point of |
1601 |
| - * view of the relation with smaller nd (since the larger nd |
1602 |
| - * is determining the MIN). It is reasonable to assume that |
1603 |
| - * most tuples in this rel will have join partners, so the |
1604 |
| - * bound is probably reasonably tight and should be taken |
1605 |
| - * as-is. |
| 1594 | + * selectivity as MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2). |
| 1595 | + * This is plausible if we assume that the join operator is |
| 1596 | + * strict and the non-null values are about equally distributed: |
| 1597 | + * a given non-null tuple of rel1 will join to either zero or |
| 1598 | + * N2*(1-nullfrac2)/nd2 rows of rel2, so total join rows are at |
| 1599 | + * most N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join |
| 1600 | + * selectivity of not more than (1-nullfrac1)*(1-nullfrac2)/nd2. |
| 1601 | + * By the same logic it is not more than |
| 1602 | + * (1-nullfrac1)*(1-nullfrac2)/nd1, so the expression with MIN() |
| 1603 | + * is an upper bound. Using the MIN() means we estimate from the |
| 1604 | + * point of view of the relation with smaller nd (since the larger |
| 1605 | + * nd is determining the MIN). It is reasonable to assume that |
| 1606 | + * most tuples in this rel will have join partners, so the bound |
| 1607 | + * is probably reasonably tight and should be taken as-is. |
1606 | 1608 | *
|
1607 | 1609 | * XXX Can we be smarter if we have an MCV list for just one
|
1608 | 1610 | * side? It seems that if we assume equal distribution for the
|
1609 | 1611 | * other side, we end up with the same answer anyway.
|
1610 | 1612 | */
|
| 1613 | +doublenullfrac1=stats1->stanullfrac; |
| 1614 | +doublenullfrac2=stats2->stanullfrac; |
| 1615 | + |
| 1616 | +selec= (1.0-nullfrac1)* (1.0-nullfrac2); |
1611 | 1617 | if (nd1>nd2)
|
1612 |
| -selec=1.0 /nd1; |
| 1618 | +selec/=nd1; |
1613 | 1619 | else
|
1614 |
| -selec=1.0 /nd2; |
| 1620 | +selec/=nd2; |
1615 | 1621 | }
|
1616 | 1622 |
|
1617 | 1623 | if (have_mcvs1)
|
|