Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit5c1b7ed

Browse files
committed
Fix get_actual_variable_range() to cope with broken HOT chains.
Commit 3ca930f modified get_actual_variable_range() to use a new "SnapshotNonVacuumable" snapshot type for selecting tuples that it would consider valid. However, because that snapshot type can accept recently-dead tuples, this caused a bug when using a recently-created index: we might accept a recently-dead tuple that is an early member of a broken HOT chain and does not actually match the index entry. Then, the data extracted from the heap tuple would not necessarily be an endpoint value of the column; it could even be NULL, leading to get_actual_variable_range() itself reporting "found unexpected null value in index". Even without an error, this could lead to poor plan choices due to an erroneous notion of the endpoint value. We can improve matters by changing the code to use the index-only scan technique (which didn't exist when get_actual_variable_range was originally written). If any of the tuples in a HOT chain are live enough to satisfy SnapshotNonVacuumable, we take the data from the index entry, ignoring what is in the heap. This fixes the problem without changing the live-vs-dead-tuple behavior from what was intended by commit 3ca930f. A side benefit is that for static tables we might not have to touch the heap at all (when the extremal value is in an all-visible page). In addition, we can save some overhead by not having to create a complete ExecutorState, and we don't need to run FormIndexDatum, avoiding more cycles as well as the possibility of failure for indexes on expressions. (I'm not sure that this code would ever be used to determine the extreme value of an expression, in the current state of the planner; but it's definitely possible that lower-order columns of the selected index could be expressions. So one could construct perhaps-artificial examples in which the old code unexpectedly failed due to trying to compute an expression's value for a now-dead row.) Per report from Manuel Rigger.
Back-patch to v11 where commit 3ca930f came in. Discussion: https://postgr.es/m/CA+u7OA7W4NWEhCvftdV6_8bbm2vgypi5nuxfnSEJQqVKFSUoMg@mail.gmail.com
1 parent 757f1ba, commit 5c1b7ed

File tree

1 file changed

+170
-112
lines changed

1 file changed

+170
-112
lines changed

‎src/backend/utils/adt/selfuncs.c

Lines changed: 170 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -104,16 +104,16 @@
104104
#include"access/brin.h"
105105
#include"access/gin.h"
106106
#include"access/htup_details.h"
107+
#include"access/relscan.h"
107108
#include"access/sysattr.h"
108-
#include"catalog/index.h"
109+
#include"access/visibilitymap.h"
109110
#include"catalog/pg_am.h"
110111
#include"catalog/pg_collation.h"
111112
#include"catalog/pg_operator.h"
112113
#include"catalog/pg_opfamily.h"
113114
#include"catalog/pg_statistic.h"
114115
#include"catalog/pg_statistic_ext.h"
115116
#include"catalog/pg_type.h"
116-
#include"executor/executor.h"
117117
#include"mb/pg_wchar.h"
118118
#include"miscadmin.h"
119119
#include"nodes/makefuncs.h"
@@ -130,6 +130,7 @@
130130
#include"parser/parse_coerce.h"
131131
#include"parser/parsetree.h"
132132
#include"statistics/statistics.h"
133+
#include"storage/bufmgr.h"
133134
#include"utils/acl.h"
134135
#include"utils/builtins.h"
135136
#include"utils/bytea.h"
@@ -138,6 +139,7 @@
138139
#include"utils/fmgroids.h"
139140
#include"utils/index_selfuncs.h"
140141
#include"utils/lsyscache.h"
142+
#include"utils/memutils.h"
141143
#include"utils/nabstime.h"
142144
#include"utils/pg_locale.h"
143145
#include"utils/rel.h"
@@ -204,6 +206,14 @@ static bool get_actual_variable_range(PlannerInfo *root,
204206
VariableStatData*vardata,
205207
Oidsortop,
206208
Datum*min,Datum*max);
209+
staticboolget_actual_variable_endpoint(RelationheapRel,
210+
RelationindexRel,
211+
ScanDirectionindexscandir,
212+
ScanKeyscankeys,
213+
int16typLen,
214+
booltypByVal,
215+
MemoryContextoutercontext,
216+
Datum*endpointDatum);
207217
staticRelOptInfo*find_join_input_rel(PlannerInfo*root,Relidsrelids);
208218
staticSelectivityprefix_selectivity(PlannerInfo*root,
209219
VariableStatData*vardata,
@@ -5539,31 +5549,22 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
55395549
}
55405550

55415551
/*
5542-
* Found a suitable index to extract data from.We'll need an EState
5543-
*and a bunch of other infrastructure.
5552+
* Found a suitable index to extract data from.Set up some data that
5553+
*can be used by both invocations of get_actual_variable_endpoint.
55445554
*/
55455555
{
5546-
EState*estate;
5547-
ExprContext*econtext;
55485556
MemoryContexttmpcontext;
55495557
MemoryContextoldcontext;
55505558
RelationheapRel;
55515559
RelationindexRel;
5552-
IndexInfo*indexInfo;
5553-
TupleTableSlot*slot;
55545560
int16typLen;
55555561
booltypByVal;
55565562
ScanKeyDatascankeys[1];
5557-
IndexScanDescindex_scan;
5558-
HeapTupletup;
5559-
Datumvalues[INDEX_MAX_KEYS];
5560-
boolisnull[INDEX_MAX_KEYS];
5561-
SnapshotDataSnapshotNonVacuumable;
5562-
5563-
estate=CreateExecutorState();
5564-
econtext=GetPerTupleExprContext(estate);
5565-
/* Make sure any cruft is generated in the econtext's memory */
5566-
tmpcontext=econtext->ecxt_per_tuple_memory;
5563+
5564+
/* Make sure any cruft gets recycled when we're done */
5565+
tmpcontext=AllocSetContextCreate(CurrentMemoryContext,
5566+
"get_actual_variable_range workspace",
5567+
ALLOCSET_DEFAULT_SIZES);
55675568
oldcontext=MemoryContextSwitchTo(tmpcontext);
55685569

55695570
/*
@@ -5574,14 +5575,8 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
55745575
heapRel=heap_open(rte->relid,NoLock);
55755576
indexRel=index_open(index->indexoid,AccessShareLock);
55765577

5577-
/* extract index key information from the index's pg_index info */
5578-
indexInfo=BuildIndexInfo(indexRel);
5579-
5580-
/* some other stuff */
5581-
slot=MakeSingleTupleTableSlot(RelationGetDescr(heapRel));
5582-
econtext->ecxt_scantuple=slot;
5578+
/* build some stuff needed for indexscan execution */
55835579
get_typlenbyval(vardata->atttype,&typLen,&typByVal);
5584-
InitNonVacuumableSnapshot(SnapshotNonVacuumable,RecentGlobalXmin);
55855580

55865581
/* set up an IS NOT NULL scan key so that we ignore nulls */
55875582
ScanKeyEntryInitialize(&scankeys[0],
@@ -5593,108 +5588,44 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
55935588
InvalidOid,/* no reg proc for this */
55945589
(Datum)0);/* constant */
55955590

5596-
have_data= true;
5597-
55985591
/* If min is requested ... */
55995592
if (min)
56005593
{
5601-
/*
5602-
* In principle, we should scan the index with our current
5603-
* active snapshot, which is the best approximation we've got
5604-
* to what the query will see when executed. But that won't
5605-
* be exact if a new snap is taken before running the query,
5606-
* and it can be very expensive if a lot of recently-dead or
5607-
* uncommitted rows exist at the beginning or end of the index
5608-
* (because we'll laboriously fetch each one and reject it).
5609-
* Instead, we use SnapshotNonVacuumable. That will accept
5610-
* recently-dead and uncommitted rows as well as normal
5611-
* visible rows. On the other hand, it will reject known-dead
5612-
* rows, and thus not give a bogus answer when the extreme
5613-
* value has been deleted (unless the deletion was quite
5614-
* recent); that case motivates not using SnapshotAny here.
5615-
*
5616-
* A crucial point here is that SnapshotNonVacuumable, with
5617-
* RecentGlobalXmin as horizon, yields the inverse of the
5618-
* condition that the indexscan will use to decide that index
5619-
* entries are killable (see heap_hot_search_buffer()).
5620-
* Therefore, if the snapshot rejects a tuple and we have to
5621-
* continue scanning past it, we know that the indexscan will
5622-
* mark that index entry killed. That means that the next
5623-
* get_actual_variable_range() call will not have to visit
5624-
* that heap entry. In this way we avoid repetitive work when
5625-
* this function is used a lot during planning.
5626-
*/
5627-
index_scan=index_beginscan(heapRel,indexRel,
5628-
&SnapshotNonVacuumable,
5629-
1,0);
5630-
index_rescan(index_scan,scankeys,1,NULL,0);
5631-
5632-
/* Fetch first tuple in sortop's direction */
5633-
if ((tup=index_getnext(index_scan,
5634-
indexscandir))!=NULL)
5635-
{
5636-
/* Extract the index column values from the heap tuple */
5637-
ExecStoreTuple(tup,slot,InvalidBuffer, false);
5638-
FormIndexDatum(indexInfo,slot,estate,
5639-
values,isnull);
5640-
5641-
/* Shouldn't have got a null, but be careful */
5642-
if (isnull[0])
5643-
elog(ERROR,"found unexpected null value in index \"%s\"",
5644-
RelationGetRelationName(indexRel));
5645-
5646-
/* Copy the index column value out to caller's context */
5647-
MemoryContextSwitchTo(oldcontext);
5648-
*min=datumCopy(values[0],typByVal,typLen);
5649-
MemoryContextSwitchTo(tmpcontext);
5650-
}
5651-
else
5652-
have_data= false;
5653-
5654-
index_endscan(index_scan);
5594+
have_data=get_actual_variable_endpoint(heapRel,
5595+
indexRel,
5596+
indexscandir,
5597+
scankeys,
5598+
typLen,
5599+
typByVal,
5600+
oldcontext,
5601+
min);
5602+
}
5603+
else
5604+
{
5605+
/* If min not requested, assume index is nonempty */
5606+
have_data= true;
56555607
}
56565608

56575609
/* If max is requested, and we didn't find the index is empty */
56585610
if (max&&have_data)
56595611
{
5660-
index_scan=index_beginscan(heapRel,indexRel,
5661-
&SnapshotNonVacuumable,
5662-
1,0);
5663-
index_rescan(index_scan,scankeys,1,NULL,0);
5664-
5665-
/* Fetch first tuple in reverse direction */
5666-
if ((tup=index_getnext(index_scan,
5667-
-indexscandir))!=NULL)
5668-
{
5669-
/* Extract the index column values from the heap tuple */
5670-
ExecStoreTuple(tup,slot,InvalidBuffer, false);
5671-
FormIndexDatum(indexInfo,slot,estate,
5672-
values,isnull);
5673-
5674-
/* Shouldn't have got a null, but be careful */
5675-
if (isnull[0])
5676-
elog(ERROR,"found unexpected null value in index \"%s\"",
5677-
RelationGetRelationName(indexRel));
5678-
5679-
/* Copy the index column value out to caller's context */
5680-
MemoryContextSwitchTo(oldcontext);
5681-
*max=datumCopy(values[0],typByVal,typLen);
5682-
MemoryContextSwitchTo(tmpcontext);
5683-
}
5684-
else
5685-
have_data= false;
5686-
5687-
index_endscan(index_scan);
5612+
/* scan in the opposite direction; all else is the same */
5613+
have_data=get_actual_variable_endpoint(heapRel,
5614+
indexRel,
5615+
-indexscandir,
5616+
scankeys,
5617+
typLen,
5618+
typByVal,
5619+
oldcontext,
5620+
max);
56885621
}
56895622

56905623
/* Clean everything up */
5691-
ExecDropSingleTupleTableSlot(slot);
5692-
56935624
index_close(indexRel,AccessShareLock);
56945625
heap_close(heapRel,NoLock);
56955626

56965627
MemoryContextSwitchTo(oldcontext);
5697-
FreeExecutorState(estate);
5628+
MemoryContextDelete(tmpcontext);
56985629

56995630
/* And we're done */
57005631
break;
@@ -5704,6 +5635,133 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
57045635
returnhave_data;
57055636
}
57065637

5638+
/*
5639+
* Get one endpoint datum (min or max depending on indexscandir) from the
5640+
* specified index. Return true if successful, false if index is empty.
5641+
* On success, endpoint value is stored to *endpointDatum (and copied into
5642+
* outercontext).
5643+
*
5644+
* scankeys is a 1-element scankey array set up to reject nulls.
5645+
* typLen/typByVal describe the datatype of the index's first column.
5646+
* (We could compute these values locally, but that would mean computing them
5647+
* twice when get_actual_variable_range needs both the min and the max.)
5648+
*/
5649+
staticbool
5650+
get_actual_variable_endpoint(RelationheapRel,
5651+
RelationindexRel,
5652+
ScanDirectionindexscandir,
5653+
ScanKeyscankeys,
5654+
int16typLen,
5655+
booltypByVal,
5656+
MemoryContextoutercontext,
5657+
Datum*endpointDatum)
5658+
{
5659+
boolhave_data= false;
5660+
SnapshotDataSnapshotNonVacuumable;
5661+
IndexScanDescindex_scan;
5662+
Buffervmbuffer=InvalidBuffer;
5663+
ItemPointertid;
5664+
Datumvalues[INDEX_MAX_KEYS];
5665+
boolisnull[INDEX_MAX_KEYS];
5666+
MemoryContextoldcontext;
5667+
5668+
/*
5669+
* We use the index-only-scan machinery for this. With mostly-static
5670+
* tables that's a win because it avoids a heap visit. It's also a win
5671+
* for dynamic data, but the reason is less obvious; read on for details.
5672+
*
5673+
* In principle, we should scan the index with our current active
5674+
* snapshot, which is the best approximation we've got to what the query
5675+
* will see when executed. But that won't be exact if a new snap is taken
5676+
* before running the query, and it can be very expensive if a lot of
5677+
* recently-dead or uncommitted rows exist at the beginning or end of the
5678+
* index (because we'll laboriously fetch each one and reject it).
5679+
* Instead, we use SnapshotNonVacuumable. That will accept recently-dead
5680+
* and uncommitted rows as well as normal visible rows. On the other
5681+
* hand, it will reject known-dead rows, and thus not give a bogus answer
5682+
* when the extreme value has been deleted (unless the deletion was quite
5683+
* recent); that case motivates not using SnapshotAny here.
5684+
*
5685+
* A crucial point here is that SnapshotNonVacuumable, with
5686+
* RecentGlobalXmin as horizon, yields the inverse of the condition that
5687+
* the indexscan will use to decide that index entries are killable (see
5688+
* heap_hot_search_buffer()). Therefore, if the snapshot rejects a tuple
5689+
* (or more precisely, all tuples of a HOT chain) and we have to continue
5690+
* scanning past it, we know that the indexscan will mark that index entry
5691+
* killed. That means that the next get_actual_variable_endpoint() call
5692+
* will not have to re-consider that index entry. In this way we avoid
5693+
* repetitive work when this function is used a lot during planning.
5694+
*
5695+
* But using SnapshotNonVacuumable creates a hazard of its own. In a
5696+
* recently-created index, some index entries may point at "broken" HOT
5697+
* chains in which not all the tuple versions contain data matching the
5698+
* index entry. The live tuple version(s) certainly do match the index,
5699+
* but SnapshotNonVacuumable can accept recently-dead tuple versions that
5700+
* don't match. Hence, if we took data from the selected heap tuple, we
5701+
* might get a bogus answer that's not close to the index extremal value,
5702+
* or could even be NULL. We avoid this hazard because we take the data
5703+
* from the index entry not the heap.
5704+
*/
5705+
InitNonVacuumableSnapshot(SnapshotNonVacuumable,RecentGlobalXmin);
5706+
5707+
index_scan=index_beginscan(heapRel,indexRel,
5708+
&SnapshotNonVacuumable,
5709+
1,0);
5710+
/* Set it up for index-only scan */
5711+
index_scan->xs_want_itup= true;
5712+
index_rescan(index_scan,scankeys,1,NULL,0);
5713+
5714+
/* Fetch first/next tuple in specified direction */
5715+
while ((tid=index_getnext_tid(index_scan,indexscandir))!=NULL)
5716+
{
5717+
if (!VM_ALL_VISIBLE(heapRel,
5718+
ItemPointerGetBlockNumber(tid),
5719+
&vmbuffer))
5720+
{
5721+
/* Rats, we have to visit the heap to check visibility */
5722+
if (index_fetch_heap(index_scan)==NULL)
5723+
continue;/* no visible tuple, try next index entry */
5724+
5725+
/*
5726+
* We don't care whether there's more than one visible tuple in
5727+
* the HOT chain; if any are visible, that's good enough.
5728+
*/
5729+
}
5730+
5731+
/*
5732+
* We expect that btree will return data in IndexTuple not HeapTuple
5733+
* format. It's not lossy either.
5734+
*/
5735+
if (!index_scan->xs_itup)
5736+
elog(ERROR,"no data returned for index-only scan");
5737+
if (index_scan->xs_recheck)
5738+
elog(ERROR,"unexpected recheck indication from btree");
5739+
5740+
/* OK to deconstruct the index tuple */
5741+
index_deform_tuple(index_scan->xs_itup,
5742+
index_scan->xs_itupdesc,
5743+
values,isnull);
5744+
5745+
/* Shouldn't have got a null, but be careful */
5746+
if (isnull[0])
5747+
elog(ERROR,"found unexpected null value in index \"%s\"",
5748+
RelationGetRelationName(indexRel));
5749+
5750+
/* Copy the index column value out to caller's context */
5751+
oldcontext=MemoryContextSwitchTo(outercontext);
5752+
*endpointDatum=datumCopy(values[0],typByVal,typLen);
5753+
MemoryContextSwitchTo(oldcontext);
5754+
have_data= true;
5755+
break;
5756+
}
5757+
5758+
if (vmbuffer!=InvalidBuffer)
5759+
ReleaseBuffer(vmbuffer);
5760+
index_endscan(index_scan);
5761+
5762+
returnhave_data;
5763+
}
5764+
57075765
/*
57085766
* find_join_input_rel
57095767
*Look up the input relation for a join.

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp