104104#include "access/brin.h"
105105#include "access/gin.h"
106106#include "access/htup_details.h"
107+ #include "access/relscan.h"
107108#include "access/sysattr.h"
108- #include "catalog/index .h"
109+ #include "access/visibilitymap .h"
109110#include "catalog/pg_am.h"
110111#include "catalog/pg_collation.h"
111112#include "catalog/pg_operator.h"
112113#include "catalog/pg_opfamily.h"
113114#include "catalog/pg_statistic.h"
114115#include "catalog/pg_statistic_ext.h"
115116#include "catalog/pg_type.h"
116- #include "executor/executor.h"
117117#include "mb/pg_wchar.h"
118118#include "miscadmin.h"
119119#include "nodes/makefuncs.h"
130130#include "parser/parse_coerce.h"
131131#include "parser/parsetree.h"
132132#include "statistics/statistics.h"
133+ #include "storage/bufmgr.h"
133134#include "utils/acl.h"
134135#include "utils/builtins.h"
135136#include "utils/bytea.h"
138139#include "utils/fmgroids.h"
139140#include "utils/index_selfuncs.h"
140141#include "utils/lsyscache.h"
142+ #include "utils/memutils.h"
141143#include "utils/nabstime.h"
142144#include "utils/pg_locale.h"
143145#include "utils/rel.h"
@@ -204,6 +206,14 @@ static bool get_actual_variable_range(PlannerInfo *root,
204206VariableStatData * vardata ,
205207Oid sortop ,
206208Datum * min ,Datum * max );
209+ static bool get_actual_variable_endpoint (Relation heapRel ,
210+ Relation indexRel ,
211+ ScanDirection indexscandir ,
212+ ScanKey scankeys ,
213+ int16 typLen ,
214+ bool typByVal ,
215+ MemoryContext outercontext ,
216+ Datum * endpointDatum );
207217static RelOptInfo * find_join_input_rel (PlannerInfo * root ,Relids relids );
208218static Selectivity prefix_selectivity (PlannerInfo * root ,
209219VariableStatData * vardata ,
@@ -5539,31 +5549,22 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
55395549}
55405550
55415551/*
5542- * Found a suitable index to extract data from.  We'll need an EState
5543- * and a bunch of other infrastructure.
5552+ * Found a suitable index to extract data from.  Set up some data that
5553+ * can be used by both invocations of get_actual_variable_endpoint.
55445554 */
55455555{
5546- EState * estate ;
5547- ExprContext * econtext ;
55485556MemoryContext tmpcontext ;
55495557MemoryContext oldcontext ;
55505558Relation heapRel ;
55515559Relation indexRel ;
5552- IndexInfo * indexInfo ;
5553- TupleTableSlot * slot ;
55545560int16 typLen ;
55555561bool typByVal ;
55565562ScanKeyData scankeys [1 ];
5557- IndexScanDesc index_scan ;
5558- HeapTuple tup ;
5559- Datum values [INDEX_MAX_KEYS ];
5560- bool isnull [INDEX_MAX_KEYS ];
5561- SnapshotData SnapshotNonVacuumable ;
5562-
5563- estate = CreateExecutorState ();
5564- econtext = GetPerTupleExprContext (estate );
5565- /* Make sure any cruft is generated in the econtext's memory */
5566- tmpcontext = econtext -> ecxt_per_tuple_memory ;
5563+
5564+ /* Make sure any cruft gets recycled when we're done */
5565+ tmpcontext = AllocSetContextCreate (CurrentMemoryContext ,
5566+ "get_actual_variable_range workspace" ,
5567+ ALLOCSET_DEFAULT_SIZES );
55675568oldcontext = MemoryContextSwitchTo (tmpcontext );
55685569
55695570/*
@@ -5574,14 +5575,8 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
55745575heapRel = heap_open (rte -> relid ,NoLock );
55755576indexRel = index_open (index -> indexoid ,AccessShareLock );
55765577
5577- /* extract index key information from the index's pg_index info */
5578- indexInfo = BuildIndexInfo (indexRel );
5579-
5580- /* some other stuff */
5581- slot = MakeSingleTupleTableSlot (RelationGetDescr (heapRel ));
5582- econtext -> ecxt_scantuple = slot ;
5578+ /* build some stuff needed for indexscan execution */
55835579get_typlenbyval (vardata -> atttype ,& typLen ,& typByVal );
5584- InitNonVacuumableSnapshot (SnapshotNonVacuumable ,RecentGlobalXmin );
55855580
55865581/* set up an IS NOT NULL scan key so that we ignore nulls */
55875582ScanKeyEntryInitialize (& scankeys [0 ],
@@ -5593,108 +5588,44 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
55935588InvalidOid ,/* no reg proc for this */
55945589 (Datum )0 );/* constant */
55955590
5596- have_data = true;
5597-
55985591/* If min is requested ... */
55995592if (min )
56005593{
5601- /*
5602- * In principle, we should scan the index with our current
5603- * active snapshot, which is the best approximation we've got
5604- * to what the query will see when executed. But that won't
5605- * be exact if a new snap is taken before running the query,
5606- * and it can be very expensive if a lot of recently-dead or
5607- * uncommitted rows exist at the beginning or end of the index
5608- * (because we'll laboriously fetch each one and reject it).
5609- * Instead, we use SnapshotNonVacuumable. That will accept
5610- * recently-dead and uncommitted rows as well as normal
5611- * visible rows. On the other hand, it will reject known-dead
5612- * rows, and thus not give a bogus answer when the extreme
5613- * value has been deleted (unless the deletion was quite
5614- * recent); that case motivates not using SnapshotAny here.
5615- *
5616- * A crucial point here is that SnapshotNonVacuumable, with
5617- * RecentGlobalXmin as horizon, yields the inverse of the
5618- * condition that the indexscan will use to decide that index
5619- * entries are killable (see heap_hot_search_buffer()).
5620- * Therefore, if the snapshot rejects a tuple and we have to
5621- * continue scanning past it, we know that the indexscan will
5622- * mark that index entry killed. That means that the next
5623- * get_actual_variable_range() call will not have to visit
5624- * that heap entry. In this way we avoid repetitive work when
5625- * this function is used a lot during planning.
5626- */
5627- index_scan = index_beginscan (heapRel ,indexRel ,
5628- & SnapshotNonVacuumable ,
5629- 1 ,0 );
5630- index_rescan (index_scan ,scankeys ,1 ,NULL ,0 );
5631-
5632- /* Fetch first tuple in sortop's direction */
5633- if ((tup = index_getnext (index_scan ,
5634- indexscandir ))!= NULL )
5635- {
5636- /* Extract the index column values from the heap tuple */
5637- ExecStoreTuple (tup ,slot ,InvalidBuffer , false);
5638- FormIndexDatum (indexInfo ,slot ,estate ,
5639- values ,isnull );
5640-
5641- /* Shouldn't have got a null, but be careful */
5642- if (isnull [0 ])
5643- elog (ERROR ,"found unexpected null value in index \"%s\"" ,
5644- RelationGetRelationName (indexRel ));
5645-
5646- /* Copy the index column value out to caller's context */
5647- MemoryContextSwitchTo (oldcontext );
5648- * min = datumCopy (values [0 ],typByVal ,typLen );
5649- MemoryContextSwitchTo (tmpcontext );
5650- }
5651- else
5652- have_data = false;
5653-
5654- index_endscan (index_scan );
5594+ have_data = get_actual_variable_endpoint (heapRel ,
5595+ indexRel ,
5596+ indexscandir ,
5597+ scankeys ,
5598+ typLen ,
5599+ typByVal ,
5600+ oldcontext ,
5601+ min );
5602+ }
5603+ else
5604+ {
5605+ /* If min not requested, assume index is nonempty */
5606+ have_data = true;
56555607}
56565608
56575609/* If max is requested, and we didn't find the index is empty */
56585610if (max && have_data )
56595611{
5660- index_scan = index_beginscan (heapRel ,indexRel ,
5661- & SnapshotNonVacuumable ,
5662- 1 ,0 );
5663- index_rescan (index_scan ,scankeys ,1 ,NULL ,0 );
5664-
5665- /* Fetch first tuple in reverse direction */
5666- if ((tup = index_getnext (index_scan ,
5667- - indexscandir ))!= NULL )
5668- {
5669- /* Extract the index column values from the heap tuple */
5670- ExecStoreTuple (tup ,slot ,InvalidBuffer , false);
5671- FormIndexDatum (indexInfo ,slot ,estate ,
5672- values ,isnull );
5673-
5674- /* Shouldn't have got a null, but be careful */
5675- if (isnull [0 ])
5676- elog (ERROR ,"found unexpected null value in index \"%s\"" ,
5677- RelationGetRelationName (indexRel ));
5678-
5679- /* Copy the index column value out to caller's context */
5680- MemoryContextSwitchTo (oldcontext );
5681- * max = datumCopy (values [0 ],typByVal ,typLen );
5682- MemoryContextSwitchTo (tmpcontext );
5683- }
5684- else
5685- have_data = false;
5686-
5687- index_endscan (index_scan );
5612+ /* scan in the opposite direction; all else is the same */
5613+ have_data = get_actual_variable_endpoint (heapRel ,
5614+ indexRel ,
5615+ - indexscandir ,
5616+ scankeys ,
5617+ typLen ,
5618+ typByVal ,
5619+ oldcontext ,
5620+ max );
56885621}
56895622
56905623/* Clean everything up */
5691- ExecDropSingleTupleTableSlot (slot );
5692-
56935624index_close (indexRel ,AccessShareLock );
56945625heap_close (heapRel ,NoLock );
56955626
56965627MemoryContextSwitchTo (oldcontext );
5697- FreeExecutorState ( estate );
5628+ MemoryContextDelete ( tmpcontext );
56985629
56995630/* And we're done */
57005631break ;
@@ -5704,6 +5635,133 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
57045635return have_data ;
57055636}
57065637
5638+ /*
5639+ * Get one endpoint datum (min or max depending on indexscandir) from the
5640+ * specified index. Return true if successful, false if index is empty.
5641+ * On success, endpoint value is stored to *endpointDatum (and copied into
5642+ * outercontext).
5643+ *
5644+ * scankeys is a 1-element scankey array set up to reject nulls.
5645+ * typLen/typByVal describe the datatype of the index's first column.
5646+ * (We could compute these values locally, but that would mean computing them
5647+ * twice when get_actual_variable_range needs both the min and the max.)
+ *
+ * heapRel and indexRel are only scanned here; this function neither opens
+ * nor closes them.  indexscandir is handed to index_getnext_tid() as-is, so
+ * the caller selects min versus max purely by the direction it passes.
+ *
+ * "Empty" is judged by the scan itself: false is returned when the scan runs
+ * out of entries without finding one that is either on a heap page the
+ * visibility map reports all-visible, or for which index_fetch_heap() returns
+ * a tuple.  In that case *endpointDatum is not written.
+ *
+ * On the normal return paths the scan is ended and any buffer left in
+ * vmbuffer by VM_ALL_VISIBLE is released before returning.
5648+ */
5649+ static bool
5650+ get_actual_variable_endpoint (Relation heapRel ,
5651+ Relation indexRel ,
5652+ ScanDirection indexscandir ,
5653+ ScanKey scankeys ,
5654+ int16 typLen ,
5655+ bool typByVal ,
5656+ MemoryContext outercontext ,
5657+ Datum * endpointDatum )
5658+ {
5659+ bool have_data = false;
5660+ SnapshotData SnapshotNonVacuumable ;
5661+ IndexScanDesc index_scan ;
5662+ Buffer vmbuffer = InvalidBuffer ;
5663+ ItemPointer tid ;
5664+ Datum values [INDEX_MAX_KEYS ];
5665+ bool isnull [INDEX_MAX_KEYS ];
5666+ MemoryContext oldcontext ;
5667+
5668+ /*
5669+ * We use the index-only-scan machinery for this. With mostly-static
5670+ * tables that's a win because it avoids a heap visit. It's also a win
5671+ * for dynamic data, but the reason is less obvious; read on for details.
5672+ *
5673+ * In principle, we should scan the index with our current active
5674+ * snapshot, which is the best approximation we've got to what the query
5675+ * will see when executed. But that won't be exact if a new snap is taken
5676+ * before running the query, and it can be very expensive if a lot of
5677+ * recently-dead or uncommitted rows exist at the beginning or end of the
5678+ * index (because we'll laboriously fetch each one and reject it).
5679+ * Instead, we use SnapshotNonVacuumable. That will accept recently-dead
5680+ * and uncommitted rows as well as normal visible rows. On the other
5681+ * hand, it will reject known-dead rows, and thus not give a bogus answer
5682+ * when the extreme value has been deleted (unless the deletion was quite
5683+ * recent); that case motivates not using SnapshotAny here.
5684+ *
5685+ * A crucial point here is that SnapshotNonVacuumable, with
5686+ * RecentGlobalXmin as horizon, yields the inverse of the condition that
5687+ * the indexscan will use to decide that index entries are killable (see
5688+ * heap_hot_search_buffer()). Therefore, if the snapshot rejects a tuple
5689+ * (or more precisely, all tuples of a HOT chain) and we have to continue
5690+ * scanning past it, we know that the indexscan will mark that index entry
5691+ * killed. That means that the next get_actual_variable_endpoint() call
5692+ * will not have to re-consider that index entry. In this way we avoid
5693+ * repetitive work when this function is used a lot during planning.
5694+ *
5695+ * But using SnapshotNonVacuumable creates a hazard of its own. In a
5696+ * recently-created index, some index entries may point at "broken" HOT
5697+ * chains in which not all the tuple versions contain data matching the
5698+ * index entry. The live tuple version(s) certainly do match the index,
5699+ * but SnapshotNonVacuumable can accept recently-dead tuple versions that
5700+ * don't match. Hence, if we took data from the selected heap tuple, we
5701+ * might get a bogus answer that's not close to the index extremal value,
5702+ * or could even be NULL. We avoid this hazard because we take the data
5703+ * from the index entry not the heap.
5704+ */
5705+ InitNonVacuumableSnapshot (SnapshotNonVacuumable ,RecentGlobalXmin );
5706+
5707+ index_scan = index_beginscan (heapRel ,indexRel ,
5708+ & SnapshotNonVacuumable ,
5709+ 1 ,0 );
5710+ /* Set it up for index-only scan; the value is later read from xs_itup */
5711+ index_scan -> xs_want_itup = true;
5712+ index_rescan (index_scan ,scankeys ,1 ,NULL ,0 );
5713+
5714+ /* Fetch first/next tuple in specified direction */
5715+ while ((tid = index_getnext_tid (index_scan ,indexscandir ))!= NULL )
5716+ {
5717+ if (!VM_ALL_VISIBLE (heapRel ,
5718+ ItemPointerGetBlockNumber (tid ),
5719+ & vmbuffer ))
5720+ {
5721+ /* Rats, we have to visit the heap to check visibility */
5722+ if (index_fetch_heap (index_scan )== NULL )
5723+ continue ;/* no visible tuple, try next index entry */
5724+
5725+ /*
5726+ * We don't care whether there's more than one visible tuple in
5727+ * the HOT chain; if any are visible, that's good enough.
5728+ */
5729+ }
5730+
5731+ /*
5732+ * We expect that btree will return data in IndexTuple not HeapTuple
5733+ * format. It's not lossy either.
5734+ */
5735+ if (!index_scan -> xs_itup )
5736+ elog (ERROR ,"no data returned for index-only scan" );
5737+ if (index_scan -> xs_recheck )
5738+ elog (ERROR ,"unexpected recheck indication from btree" );
5739+
5740+ /* OK to deconstruct the index tuple */
5741+ index_deform_tuple (index_scan -> xs_itup ,
5742+ index_scan -> xs_itupdesc ,
5743+ values ,isnull );
5744+
5745+ /* Shouldn't have got a null, but be careful */
5746+ if (isnull [0 ])
5747+ elog (ERROR ,"found unexpected null value in index \"%s\"" ,
5748+ RelationGetRelationName (indexRel ));
5749+
5750+ /* Copy the index column value out to caller's context */
5751+ oldcontext = MemoryContextSwitchTo (outercontext );
5752+ * endpointDatum = datumCopy (values [0 ],typByVal ,typLen );
5753+ MemoryContextSwitchTo (oldcontext );
5754+ have_data = true;
5755+ break ;
5756+ }
5757+
5758+ if (vmbuffer != InvalidBuffer )
5759+ ReleaseBuffer (vmbuffer );
5760+ index_endscan (index_scan );
5761+
5762+ return have_data ;
5763+ }
5764+
57075765/*
57085766 * find_join_input_rel
57095767 *Look up the input relation for a join.