Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit d3751ad

Browse files
committed
Fix get_actual_variable_range() to cope with broken HOT chains.
Commit 3ca930f modified get_actual_variable_range() to use a new "SnapshotNonVacuumable" snapshot type for selecting tuples that it would consider valid. However, because that snapshot type can accept recently-dead tuples, this caused a bug when using a recently-created index: we might accept a recently-dead tuple that is an early member of a broken HOT chain and does not actually match the index entry. Then, the data extracted from the heap tuple would not necessarily be an endpoint value of the column; it could even be NULL, leading to get_actual_variable_range() itself reporting "found unexpected null value in index". Even without an error, this could lead to poor plan choices due to an erroneous notion of the endpoint value. We can improve matters by changing the code to use the index-only scan technique (which didn't exist when get_actual_variable_range was originally written). If any of the tuples in a HOT chain are live enough to satisfy SnapshotNonVacuumable, we take the data from the index entry, ignoring what is in the heap. This fixes the problem without changing the live-vs-dead-tuple behavior from what was intended by commit 3ca930f. A side benefit is that for static tables we might not have to touch the heap at all (when the extremal value is in an all-visible page). In addition, we can save some overhead by not having to create a complete ExecutorState, and we don't need to run FormIndexDatum, avoiding more cycles as well as the possibility of failure for indexes on expressions. (I'm not sure that this code would ever be used to determine the extreme value of an expression, in the current state of the planner; but it's definitely possible that lower-order columns of the selected index could be expressions. So one could construct perhaps-artificial examples in which the old code unexpectedly failed due to trying to compute an expression's value for a now-dead row.) Per report from Manuel Rigger.
Back-patch to v11 where commit 3ca930f came in. Discussion: https://postgr.es/m/CA+u7OA7W4NWEhCvftdV6_8bbm2vgypi5nuxfnSEJQqVKFSUoMg@mail.gmail.com
1 parent cfde234 commit d3751ad

File tree

1 file changed

+178
-105
lines changed

1 file changed

+178
-105
lines changed

‎src/backend/utils/adt/selfuncs.c

Lines changed: 178 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -103,17 +103,14 @@
103103

104104
#include"access/brin.h"
105105
#include"access/gin.h"
106-
#include"access/htup_details.h"
107-
#include"access/sysattr.h"
108106
#include"access/table.h"
109107
#include"access/tableam.h"
110-
#include"catalog/index.h"
108+
#include"access/visibilitymap.h"
111109
#include"catalog/pg_am.h"
112110
#include"catalog/pg_collation.h"
113111
#include"catalog/pg_operator.h"
114112
#include"catalog/pg_statistic.h"
115113
#include"catalog/pg_statistic_ext.h"
116-
#include"executor/executor.h"
117114
#include"executor/nodeAgg.h"
118115
#include"miscadmin.h"
119116
#include"nodes/makefuncs.h"
@@ -127,12 +124,14 @@
127124
#include"parser/parse_clause.h"
128125
#include"parser/parsetree.h"
129126
#include"statistics/statistics.h"
127+
#include"storage/bufmgr.h"
130128
#include"utils/builtins.h"
131129
#include"utils/date.h"
132130
#include"utils/datum.h"
133131
#include"utils/fmgroids.h"
134132
#include"utils/index_selfuncs.h"
135133
#include"utils/lsyscache.h"
134+
#include"utils/memutils.h"
136135
#include"utils/pg_locale.h"
137136
#include"utils/rel.h"
138137
#include"utils/selfuncs.h"
@@ -198,6 +197,15 @@ static bool get_actual_variable_range(PlannerInfo *root,
198197
VariableStatData*vardata,
199198
Oidsortop,
200199
Datum*min,Datum*max);
200+
staticboolget_actual_variable_endpoint(RelationheapRel,
201+
RelationindexRel,
202+
ScanDirectionindexscandir,
203+
ScanKeyscankeys,
204+
int16typLen,
205+
booltypByVal,
206+
TupleTableSlot*tableslot,
207+
MemoryContextoutercontext,
208+
Datum*endpointDatum);
201209
staticRelOptInfo*find_join_input_rel(PlannerInfo*root,Relidsrelids);
202210

203211

@@ -5180,30 +5188,23 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
51805188
}
51815189

51825190
/*
5183-
* Found a suitable index to extract data from.We'll need an EState
5184-
*and a bunch of other infrastructure.
5191+
* Found a suitable index to extract data from.Set up some data that
5192+
*can be used by both invocations of get_actual_variable_endpoint.
51855193
*/
51865194
{
5187-
EState*estate;
5188-
ExprContext*econtext;
51895195
MemoryContexttmpcontext;
51905196
MemoryContextoldcontext;
51915197
RelationheapRel;
51925198
RelationindexRel;
5193-
IndexInfo*indexInfo;
51945199
TupleTableSlot*slot;
51955200
int16typLen;
51965201
booltypByVal;
51975202
ScanKeyDatascankeys[1];
5198-
IndexScanDescindex_scan;
5199-
Datumvalues[INDEX_MAX_KEYS];
5200-
boolisnull[INDEX_MAX_KEYS];
5201-
SnapshotDataSnapshotNonVacuumable;
5202-
5203-
estate=CreateExecutorState();
5204-
econtext=GetPerTupleExprContext(estate);
5205-
/* Make sure any cruft is generated in the econtext's memory */
5206-
tmpcontext=econtext->ecxt_per_tuple_memory;
5203+
5204+
/* Make sure any cruft gets recycled when we're done */
5205+
tmpcontext=AllocSetContextCreate(CurrentMemoryContext,
5206+
"get_actual_variable_range workspace",
5207+
ALLOCSET_DEFAULT_SIZES);
52075208
oldcontext=MemoryContextSwitchTo(tmpcontext);
52085209

52095210
/*
@@ -5213,14 +5214,9 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
52135214
heapRel=table_open(rte->relid,NoLock);
52145215
indexRel=index_open(index->indexoid,NoLock);
52155216

5216-
/* extract index key information from the index's pg_index info */
5217-
indexInfo=BuildIndexInfo(indexRel);
5218-
5219-
/* some other stuff */
5217+
/* build some stuff needed for indexscan execution */
52205218
slot=table_slot_create(heapRel,NULL);
5221-
econtext->ecxt_scantuple=slot;
52225219
get_typlenbyval(vardata->atttype,&typLen,&typByVal);
5223-
InitNonVacuumableSnapshot(SnapshotNonVacuumable,RecentGlobalXmin);
52245220

52255221
/* set up an IS NOT NULL scan key so that we ignore nulls */
52265222
ScanKeyEntryInitialize(&scankeys[0],
@@ -5232,94 +5228,38 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
52325228
InvalidOid,/* no reg proc for this */
52335229
(Datum)0);/* constant */
52345230

5235-
have_data= true;
5236-
52375231
/* If min is requested ... */
52385232
if (min)
52395233
{
5240-
/*
5241-
* In principle, we should scan the index with our current
5242-
* active snapshot, which is the best approximation we've got
5243-
* to what the query will see when executed. But that won't
5244-
* be exact if a new snap is taken before running the query,
5245-
* and it can be very expensive if a lot of recently-dead or
5246-
* uncommitted rows exist at the beginning or end of the index
5247-
* (because we'll laboriously fetch each one and reject it).
5248-
* Instead, we use SnapshotNonVacuumable. That will accept
5249-
* recently-dead and uncommitted rows as well as normal
5250-
* visible rows. On the other hand, it will reject known-dead
5251-
* rows, and thus not give a bogus answer when the extreme
5252-
* value has been deleted (unless the deletion was quite
5253-
* recent); that case motivates not using SnapshotAny here.
5254-
*
5255-
* A crucial point here is that SnapshotNonVacuumable, with
5256-
* RecentGlobalXmin as horizon, yields the inverse of the
5257-
* condition that the indexscan will use to decide that index
5258-
* entries are killable (see heap_hot_search_buffer()).
5259-
* Therefore, if the snapshot rejects a tuple and we have to
5260-
* continue scanning past it, we know that the indexscan will
5261-
* mark that index entry killed. That means that the next
5262-
* get_actual_variable_range() call will not have to visit
5263-
* that heap entry. In this way we avoid repetitive work when
5264-
* this function is used a lot during planning.
5265-
*/
5266-
index_scan=index_beginscan(heapRel,indexRel,
5267-
&SnapshotNonVacuumable,
5268-
1,0);
5269-
index_rescan(index_scan,scankeys,1,NULL,0);
5270-
5271-
/* Fetch first tuple in sortop's direction */
5272-
if (index_getnext_slot(index_scan,indexscandir,slot))
5273-
{
5274-
/* Extract the index column values from the slot */
5275-
FormIndexDatum(indexInfo,slot,estate,
5276-
values,isnull);
5277-
5278-
/* Shouldn't have got a null, but be careful */
5279-
if (isnull[0])
5280-
elog(ERROR,"found unexpected null value in index \"%s\"",
5281-
RelationGetRelationName(indexRel));
5282-
5283-
/* Copy the index column value out to caller's context */
5284-
MemoryContextSwitchTo(oldcontext);
5285-
*min=datumCopy(values[0],typByVal,typLen);
5286-
MemoryContextSwitchTo(tmpcontext);
5287-
}
5288-
else
5289-
have_data= false;
5290-
5291-
index_endscan(index_scan);
5234+
have_data=get_actual_variable_endpoint(heapRel,
5235+
indexRel,
5236+
indexscandir,
5237+
scankeys,
5238+
typLen,
5239+
typByVal,
5240+
slot,
5241+
oldcontext,
5242+
min);
5243+
}
5244+
else
5245+
{
5246+
/* If min not requested, assume index is nonempty */
5247+
have_data= true;
52925248
}
52935249

52945250
/* If max is requested, and we didn't find the index is empty */
52955251
if (max&&have_data)
52965252
{
5297-
index_scan=index_beginscan(heapRel,indexRel,
5298-
&SnapshotNonVacuumable,
5299-
1,0);
5300-
index_rescan(index_scan,scankeys,1,NULL,0);
5301-
5302-
/* Fetch first tuple in reverse direction */
5303-
if (index_getnext_slot(index_scan,-indexscandir,slot))
5304-
{
5305-
/* Extract the index column values from the slot */
5306-
FormIndexDatum(indexInfo,slot,estate,
5307-
values,isnull);
5308-
5309-
/* Shouldn't have got a null, but be careful */
5310-
if (isnull[0])
5311-
elog(ERROR,"found unexpected null value in index \"%s\"",
5312-
RelationGetRelationName(indexRel));
5313-
5314-
/* Copy the index column value out to caller's context */
5315-
MemoryContextSwitchTo(oldcontext);
5316-
*max=datumCopy(values[0],typByVal,typLen);
5317-
MemoryContextSwitchTo(tmpcontext);
5318-
}
5319-
else
5320-
have_data= false;
5321-
5322-
index_endscan(index_scan);
5253+
/* scan in the opposite direction; all else is the same */
5254+
have_data=get_actual_variable_endpoint(heapRel,
5255+
indexRel,
5256+
-indexscandir,
5257+
scankeys,
5258+
typLen,
5259+
typByVal,
5260+
slot,
5261+
oldcontext,
5262+
max);
53235263
}
53245264

53255265
/* Clean everything up */
@@ -5329,7 +5269,7 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
53295269
table_close(heapRel,NoLock);
53305270

53315271
MemoryContextSwitchTo(oldcontext);
5332-
FreeExecutorState(estate);
5272+
MemoryContextDelete(tmpcontext);
53335273

53345274
/* And we're done */
53355275
break;
@@ -5339,6 +5279,139 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
53395279
returnhave_data;
53405280
}
53415281

5282+
/*
5283+
* Get one endpoint datum (min or max depending on indexscandir) from the
5284+
* specified index. Return true if successful, false if index is empty.
5285+
* On success, endpoint value is stored to *endpointDatum (and copied into
5286+
* outercontext).
5287+
*
5288+
* scankeys is a 1-element scankey array set up to reject nulls.
5289+
* typLen/typByVal describe the datatype of the index's first column.
5290+
* tableslot is a slot suitable to hold table tuples, in case we need
5291+
* to probe the heap.
5292+
* (We could compute these values locally, but that would mean computing them
5293+
* twice when get_actual_variable_range needs both the min and the max.)
5294+
*/
5295+
staticbool
5296+
get_actual_variable_endpoint(RelationheapRel,
5297+
RelationindexRel,
5298+
ScanDirectionindexscandir,
5299+
ScanKeyscankeys,
5300+
int16typLen,
5301+
booltypByVal,
5302+
TupleTableSlot*tableslot,
5303+
MemoryContextoutercontext,
5304+
Datum*endpointDatum)
5305+
{
5306+
boolhave_data= false;
5307+
SnapshotDataSnapshotNonVacuumable;
5308+
IndexScanDescindex_scan;
5309+
Buffervmbuffer=InvalidBuffer;
5310+
ItemPointertid;
5311+
Datumvalues[INDEX_MAX_KEYS];
5312+
boolisnull[INDEX_MAX_KEYS];
5313+
MemoryContextoldcontext;
5314+
5315+
/*
5316+
* We use the index-only-scan machinery for this. With mostly-static
5317+
* tables that's a win because it avoids a heap visit. It's also a win
5318+
* for dynamic data, but the reason is less obvious; read on for details.
5319+
*
5320+
* In principle, we should scan the index with our current active
5321+
* snapshot, which is the best approximation we've got to what the query
5322+
* will see when executed. But that won't be exact if a new snap is taken
5323+
* before running the query, and it can be very expensive if a lot of
5324+
* recently-dead or uncommitted rows exist at the beginning or end of the
5325+
* index (because we'll laboriously fetch each one and reject it).
5326+
* Instead, we use SnapshotNonVacuumable. That will accept recently-dead
5327+
* and uncommitted rows as well as normal visible rows. On the other
5328+
* hand, it will reject known-dead rows, and thus not give a bogus answer
5329+
* when the extreme value has been deleted (unless the deletion was quite
5330+
* recent); that case motivates not using SnapshotAny here.
5331+
*
5332+
* A crucial point here is that SnapshotNonVacuumable, with
5333+
* RecentGlobalXmin as horizon, yields the inverse of the condition that
5334+
* the indexscan will use to decide that index entries are killable (see
5335+
* heap_hot_search_buffer()). Therefore, if the snapshot rejects a tuple
5336+
* (or more precisely, all tuples of a HOT chain) and we have to continue
5337+
* scanning past it, we know that the indexscan will mark that index entry
5338+
* killed. That means that the next get_actual_variable_endpoint() call
5339+
* will not have to re-consider that index entry. In this way we avoid
5340+
* repetitive work when this function is used a lot during planning.
5341+
*
5342+
* But using SnapshotNonVacuumable creates a hazard of its own. In a
5343+
* recently-created index, some index entries may point at "broken" HOT
5344+
* chains in which not all the tuple versions contain data matching the
5345+
* index entry. The live tuple version(s) certainly do match the index,
5346+
* but SnapshotNonVacuumable can accept recently-dead tuple versions that
5347+
* don't match. Hence, if we took data from the selected heap tuple, we
5348+
* might get a bogus answer that's not close to the index extremal value,
5349+
* or could even be NULL. We avoid this hazard because we take the data
5350+
* from the index entry not the heap.
5351+
*/
5352+
InitNonVacuumableSnapshot(SnapshotNonVacuumable,RecentGlobalXmin);
5353+
5354+
index_scan=index_beginscan(heapRel,indexRel,
5355+
&SnapshotNonVacuumable,
5356+
1,0);
5357+
/* Set it up for index-only scan */
5358+
index_scan->xs_want_itup= true;
5359+
index_rescan(index_scan,scankeys,1,NULL,0);
5360+
5361+
/* Fetch first/next tuple in specified direction */
5362+
while ((tid=index_getnext_tid(index_scan,indexscandir))!=NULL)
5363+
{
5364+
if (!VM_ALL_VISIBLE(heapRel,
5365+
ItemPointerGetBlockNumber(tid),
5366+
&vmbuffer))
5367+
{
5368+
/* Rats, we have to visit the heap to check visibility */
5369+
if (!index_fetch_heap(index_scan,tableslot))
5370+
continue;/* no visible tuple, try next index entry */
5371+
5372+
/* We don't actually need the heap tuple for anything */
5373+
ExecClearTuple(tableslot);
5374+
5375+
/*
5376+
* We don't care whether there's more than one visible tuple in
5377+
* the HOT chain; if any are visible, that's good enough.
5378+
*/
5379+
}
5380+
5381+
/*
5382+
* We expect that btree will return data in IndexTuple not HeapTuple
5383+
* format. It's not lossy either.
5384+
*/
5385+
if (!index_scan->xs_itup)
5386+
elog(ERROR,"no data returned for index-only scan");
5387+
if (index_scan->xs_recheck)
5388+
elog(ERROR,"unexpected recheck indication from btree");
5389+
5390+
/* OK to deconstruct the index tuple */
5391+
index_deform_tuple(index_scan->xs_itup,
5392+
index_scan->xs_itupdesc,
5393+
values,isnull);
5394+
5395+
/* Shouldn't have got a null, but be careful */
5396+
if (isnull[0])
5397+
elog(ERROR,"found unexpected null value in index \"%s\"",
5398+
RelationGetRelationName(indexRel));
5399+
5400+
/* Copy the index column value out to caller's context */
5401+
oldcontext=MemoryContextSwitchTo(outercontext);
5402+
*endpointDatum=datumCopy(values[0],typByVal,typLen);
5403+
MemoryContextSwitchTo(oldcontext);
5404+
have_data= true;
5405+
break;
5406+
}
5407+
5408+
if (vmbuffer!=InvalidBuffer)
5409+
ReleaseBuffer(vmbuffer);
5410+
index_endscan(index_scan);
5411+
5412+
returnhave_data;
5413+
}
5414+
53425415
/*
53435416
* find_join_input_rel
53445417
*Look up the input relation for a join.

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp