Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitb5ee4e5

Browse files
Avoid nbtree parallel scan currPos confusion.
Commit1bd4bc8, which refactored nbtree sibling link traversal, made_bt_parallel_seize reset the scan's currPos so that things wereconsistent with the state of a serial backend moving between pages.This overlooked the fact that _bt_readnextpage relied on the existingcurrPos state to decide when to end the scan -- even though it came frombefore the scan was seized. As a result of all this, parallel nbtreescans could needlessly behave like full index scans.To fix, teach _bt_readnextpage to explicitly allow the use of an alreadyread page's so->currPos when deciding whether to end the scan -- evenduring parallel index scans (allow it consistently now). This requiresmoving _bt_readnextpage's seizure of the scan to earlier in its loop.That way _bt_readnextpage either deals with the true so->currPos state,or an initialized-by-_bt_parallel_seize currPos state set from when thescan was seized. Now _bt_steppage (the most important _bt_readnextpagecaller) takes the same uniform approach to setting up its call usingdetails taken from so->currPos -- regardless of whether the scan happensto be parallel or serial.The new loop structure in _bt_readnextpage is prone to getting confusedby P_NONE blknos set when the rightmost or leftmost page was reached.We could avoid that by adding an explicit check, but that would be ugly.Avoid this problem by teaching _bt_parallel_seize to end the parallelscan instead of returning a P_NONE next block/blkno. Doing things thisway was arguably a missed opportunity for commit1bd4bc8. It allows usto remove a similar "blkno == P_NONE" check from _bt_first.Oversight in commit1bd4bc8, which refactored sibling link traversal(as part of optimizing nbtree backward scan locking).Author: Peter Geoghegan <pg@bowt.ie>Reported-By: Masahiro Ikeda <ikedamsh@oss.nttdata.com>Diagnosed-By: Masahiro Ikeda <ikedamsh@oss.nttdata.com>Reviewed-By: Masahiro Ikeda <ikedamsh@oss.nttdata.com>Discussion:https://postgr.es/m/f8efb9c0f8d1a71b44fd7f8e42e49c25@oss.nttdata.com
1 parent14e87ff commitb5ee4e5

File tree

3 files changed

+79
-76
lines changed

3 files changed

+79
-76
lines changed

‎src/backend/access/nbtree/nbtree.c

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -596,9 +596,7 @@ btparallelrescan(IndexScanDesc scan)
596596
* scan, and *last_curr_page returns the page that *next_scan_page came from.
597597
* An invalid *next_scan_page means the scan hasn't yet started, or that
598598
* caller needs to start the next primitive index scan (if it's the latter
599-
* case we'll set so.needPrimScan). The first time a participating process
600-
* reaches the last page, it will return true and set *next_scan_page to
601-
* P_NONE; after that, further attempts to seize the scan will return false.
599+
* case we'll set so.needPrimScan).
602600
*
603601
* Callers should ignore the value of *next_scan_page and *last_curr_page if
604602
* the return value is false.
@@ -608,12 +606,13 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
608606
BlockNumber*last_curr_page,boolfirst)
609607
{
610608
BTScanOpaqueso= (BTScanOpaque)scan->opaque;
611-
boolexit_loop= false;
612-
boolstatus= true;
609+
boolexit_loop= false,
610+
status= true,
611+
endscan= false;
613612
ParallelIndexScanDescparallel_scan=scan->parallel_scan;
614613
BTParallelScanDescbtscan;
615614

616-
*next_scan_page=P_NONE;
615+
*next_scan_page=InvalidBlockNumber;
617616
*last_curr_page=InvalidBlockNumber;
618617

619618
/*
@@ -657,6 +656,13 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
657656
/* We're done with this parallel index scan */
658657
status= false;
659658
}
659+
elseif (btscan->btps_pageStatus==BTPARALLEL_IDLE&&
660+
btscan->btps_nextScanPage==P_NONE)
661+
{
662+
/* End this parallel index scan */
663+
status= false;
664+
endscan= true;
665+
}
660666
elseif (btscan->btps_pageStatus==BTPARALLEL_NEED_PRIMSCAN)
661667
{
662668
Assert(so->numArrayKeys);
@@ -673,7 +679,6 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
673679
array->cur_elem=btscan->btps_arrElems[i];
674680
skey->sk_argument=array->elem_values[array->cur_elem];
675681
}
676-
*next_scan_page=InvalidBlockNumber;
677682
exit_loop= true;
678683
}
679684
else
@@ -701,6 +706,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
701706
* of advancing it to a new page!
702707
*/
703708
btscan->btps_pageStatus=BTPARALLEL_ADVANCING;
709+
Assert(btscan->btps_nextScanPage!=P_NONE);
704710
*next_scan_page=btscan->btps_nextScanPage;
705711
*last_curr_page=btscan->btps_lastCurrPage;
706712
exit_loop= true;
@@ -712,6 +718,10 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
712718
}
713719
ConditionVariableCancelSleep();
714720

721+
/* When the scan has reached the rightmost (or leftmost) page, end it */
722+
if (endscan)
723+
_bt_parallel_done(scan);
724+
715725
returnstatus;
716726
}
717727

@@ -724,6 +734,10 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
724734
* that it can be passed to _bt_parallel_primscan_schedule, should caller
725735
* determine that another primitive index scan is required.
726736
*
737+
* If caller's next_scan_page is P_NONE, the scan has reached the index's
738+
* rightmost/leftmost page. This is treated as reaching the end of the scan
739+
* within _bt_parallel_seize.
740+
*
727741
* Note: unlike the serial case, parallel scans don't need to remember both
728742
* sibling links. next_scan_page is whichever link is next given the scan's
729743
* direction. That's all we'll ever need, since the direction of a parallel
@@ -736,6 +750,8 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
736750
ParallelIndexScanDescparallel_scan=scan->parallel_scan;
737751
BTParallelScanDescbtscan;
738752

753+
Assert(BlockNumberIsValid(next_scan_page));
754+
739755
btscan= (BTParallelScanDesc)OffsetToPointer((void*)parallel_scan,
740756
parallel_scan->ps_offset);
741757

@@ -770,7 +786,7 @@ _bt_parallel_done(IndexScanDesc scan)
770786

771787
/*
772788
* Should not mark parallel scan done when there's still a pending
773-
* primitive index scan (defensive)
789+
* primitive index scan
774790
*/
775791
if (so->needPrimScan)
776792
return;

‎src/backend/access/nbtree/nbtsearch.c

Lines changed: 53 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
4646
staticbool_bt_readfirstpage(IndexScanDescscan,OffsetNumberoffnum,
4747
ScanDirectiondir);
4848
staticbool_bt_readnextpage(IndexScanDescscan,BlockNumberblkno,
49-
BlockNumberlastcurrblkno,ScanDirectiondir);
49+
BlockNumberlastcurrblkno,ScanDirectiondir,
50+
boolseized);
5051
staticBuffer_bt_lock_and_validate_left(Relationrel,BlockNumber*blkno,
5152
BlockNumberlastcurrblkno);
5253
staticbool_bt_endpoint(IndexScanDescscan,ScanDirectiondir);
@@ -888,7 +889,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
888889
ScanKeystartKeys[INDEX_MAX_KEYS];
889890
ScanKeyDatanotnullkeys[INDEX_MAX_KEYS];
890891
intkeysz=0;
891-
inti;
892892
StrategyNumberstrat_total;
893893
BTScanPosItem*currItem;
894894

@@ -924,33 +924,31 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
924924
{
925925
BlockNumberblkno,
926926
lastcurrblkno;
927-
boolstatus;
928927

929-
status=_bt_parallel_seize(scan,&blkno,&lastcurrblkno, true);
928+
if (!_bt_parallel_seize(scan,&blkno,&lastcurrblkno, true))
929+
return false;
930930

931931
/*
932+
* Successfully seized the scan, which _bt_readfirstpage or possibly
933+
* _bt_readnextpage will release (unless the scan ends right away, in
934+
* which case we'll call _bt_parallel_done directly).
935+
*
932936
* Initialize arrays (when _bt_parallel_seize didn't already set up
933-
* the next primitive index scan)
937+
* the next primitive index scan).
934938
*/
935939
if (so->numArrayKeys&& !so->needPrimScan)
936940
_bt_start_array_keys(scan,dir);
937941

938-
if (!status)
939-
return false;
940-
elseif (blkno==P_NONE)
941-
{
942-
_bt_parallel_done(scan);
943-
return false;
944-
}
945-
elseif (blkno!=InvalidBlockNumber)
942+
Assert(blkno!=P_NONE);
943+
if (blkno!=InvalidBlockNumber)
946944
{
947945
Assert(!so->needPrimScan);
948946

949947
/*
950948
* We anticipated starting another primitive scan, but some other
951949
* worker bet us to it
952950
*/
953-
if (!_bt_readnextpage(scan,blkno,lastcurrblkno,dir))
951+
if (!_bt_readnextpage(scan,blkno,lastcurrblkno,dir, true))
954952
return false;
955953
gotoreadcomplete;
956954
}
@@ -1043,6 +1041,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
10431041
* We don't cast the decision in stone until we reach keys for the
10441042
* next attribute.
10451043
*/
1044+
cur=so->keyData;
10461045
curattr=1;
10471046
chosen=NULL;
10481047
/* Also remember any scankey that implies a NOT NULL constraint */
@@ -1053,7 +1052,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
10531052
* pass to handle after-last-key processing. Actual exit from the
10541053
* loop is at one of the "break" statements below.
10551054
*/
1056-
for (cur=so->keyData,i=0;;cur++,i++)
1055+
for (inti=0;;cur++,i++)
10571056
{
10581057
if (i >=so->numberOfKeys||cur->sk_attno!=curattr)
10591058
{
@@ -1168,7 +1167,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
11681167
* initialized after initial-positioning scan keys are finalized.)
11691168
*/
11701169
Assert(keysz <=INDEX_MAX_KEYS);
1171-
for (i=0;i<keysz;i++)
1170+
for (inti=0;i<keysz;i++)
11721171
{
11731172
ScanKeycur=startKeys[i];
11741173

@@ -2006,18 +2005,12 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
20062005
/*
20072006
*_bt_steppage() -- Step to next page containing valid data for scan
20082007
*
2008+
* Wrapper on _bt_readnextpage that performs final steps for the current page.
2009+
*
20092010
* On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
20102011
* If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
20112012
* the pin eagerly earlier on. The scan must have so->currPos.currPage set to
20122013
* a valid block, in any case.
2013-
*
2014-
* This is a wrapper on _bt_readnextpage that performs final steps for the
2015-
* current page. It sets up the _bt_readnextpage call using either local
2016-
* state saved in so->currPos by the most recent _bt_readpage call, or using
2017-
* shared parallel scan state (obtained by seizing the parallel scan here).
2018-
*
2019-
* Parallel scan callers that have already seized the scan should directly
2020-
* call _bt_readnextpage, rather than calling here.
20212014
*/
20222015
staticbool
20232016
_bt_steppage(IndexScanDescscan,ScanDirectiondir)
@@ -2081,37 +2074,22 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
20812074
BTScanPosUnpinIfPinned(so->currPos);
20822075

20832076
/* Walk to the next page with data */
2084-
if (!scan->parallel_scan)
2085-
{
2086-
/* Not parallel, so use local state set by the last _bt_readpage */
2087-
if (ScanDirectionIsForward(dir))
2088-
blkno=so->currPos.nextPage;
2089-
else
2090-
blkno=so->currPos.prevPage;
2091-
lastcurrblkno=so->currPos.currPage;
2092-
2093-
/*
2094-
* Cancel primitive index scans that were scheduled when the call to
2095-
* _bt_readpage for currPos happened to use the opposite direction to
2096-
* the one that we're stepping in now. (It's okay to leave the scan's
2097-
* array keys as-is, since the next _bt_readpage will advance them.)
2098-
*/
2099-
if (so->currPos.dir!=dir)
2100-
so->needPrimScan= false;
2101-
}
2077+
if (ScanDirectionIsForward(dir))
2078+
blkno=so->currPos.nextPage;
21022079
else
2103-
{
2104-
/*
2105-
* Seize the scan to get the nextPage and currPage from shared
2106-
* parallel state (saved from parallel scan's last _bt_readpage)
2107-
*/
2108-
if (!_bt_parallel_seize(scan,&blkno,&lastcurrblkno, false))
2109-
return false;
2080+
blkno=so->currPos.prevPage;
2081+
lastcurrblkno=so->currPos.currPage;
21102082

2111-
Assert(!so->needPrimScan);
2112-
}
2083+
/*
2084+
* Cancel primitive index scans that were scheduled when the call to
2085+
* _bt_readpage for currPos happened to use the opposite direction to the
2086+
* one that we're stepping in now. (It's okay to leave the scan's array
2087+
* keys as-is, since the next _bt_readpage will advance them.)
2088+
*/
2089+
if (so->currPos.dir!=dir)
2090+
so->needPrimScan= false;
21132091

2114-
return_bt_readnextpage(scan,blkno,lastcurrblkno,dir);
2092+
return_bt_readnextpage(scan,blkno,lastcurrblkno,dir, false);
21152093
}
21162094

21172095
/*
@@ -2203,14 +2181,19 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
22032181
*
22042182
* On entry, caller shouldn't hold any locks or pins on any page (we work
22052183
* directly off of blkno and lastcurrblkno instead). Parallel scan callers
2206-
* must have seized the scan before calling here (blkno and lastcurrblkno
2207-
* arguments should come from the seized scan).
2184+
* that seized the scan before calling here should pass seized=true; such a
2185+
* caller's blkno and lastcurrblkno arguments come from the seized scan.
2186+
* seized=false callers just pass us the blkno/lastcurrblkno taken from their
2187+
* so->currPos, which (along with so->currPos itself) can be used to end the
2188+
* scan. A seized=false caller's blkno can never be assumed to be the page
2189+
* that must be read next during a parallel scan, though. We must figure that
2190+
* part out for ourselves by seizing the scan (the correct page to read might
2191+
* already be beyond the seized=false caller's blkno during a parallel scan).
22082192
*
22092193
* On success exit, so->currPos is updated to contain data from the next
2210-
* interesting page, and we return true (parallel scan callers should not use
2211-
* so->currPos to determine which page to scan next, though). We hold a pin
2212-
* on the buffer on success exit, except when _bt_drop_lock_and_maybe_pin
2213-
* decided it was safe to eagerly drop the pin (to avoid blocking VACUUM).
2194+
* interesting page, and we return true. We hold a pin on the buffer on
2195+
* success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe
2196+
* to eagerly drop the pin (to avoid blocking VACUUM).
22142197
*
22152198
* If there are no more matching records in the given direction, we drop all
22162199
* locks and pins, invalidate so->currPos, and return false.
@@ -2220,12 +2203,12 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
22202203
*/
22212204
staticbool
22222205
_bt_readnextpage(IndexScanDescscan,BlockNumberblkno,
2223-
BlockNumberlastcurrblkno,ScanDirectiondir)
2206+
BlockNumberlastcurrblkno,ScanDirectiondir,boolseized)
22242207
{
22252208
Relationrel=scan->indexRelation;
22262209
BTScanOpaqueso= (BTScanOpaque)scan->opaque;
22272210

2228-
Assert(so->currPos.currPage==lastcurrblkno||scan->parallel_scan!=NULL);
2211+
Assert(so->currPos.currPage==lastcurrblkno||seized);
22292212
Assert(!BTScanPosIsPinned(so->currPos));
22302213

22312214
/*
@@ -2254,6 +2237,15 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
22542237

22552238
Assert(!so->needPrimScan);
22562239

2240+
/* parallel scan must never actually visit so->currPos blkno */
2241+
if (!seized&&scan->parallel_scan!=NULL&&
2242+
!_bt_parallel_seize(scan,&blkno,&lastcurrblkno, false))
2243+
{
2244+
/* whole scan is now done (or another primitive scan required) */
2245+
BTScanPosInvalidate(so->currPos);
2246+
return false;
2247+
}
2248+
22572249
if (ScanDirectionIsForward(dir))
22582250
{
22592251
/* read blkno, but check for interrupts first */
@@ -2308,14 +2300,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
23082300

23092301
/* no matching tuples on this page */
23102302
_bt_relbuf(rel,so->currPos.buf);
2311-
2312-
/* parallel scan seizes another page (won't use so->currPos blkno) */
2313-
if (scan->parallel_scan!=NULL&&
2314-
!_bt_parallel_seize(scan,&blkno,&lastcurrblkno, false))
2315-
{
2316-
BTScanPosInvalidate(so->currPos);
2317-
return false;
2318-
}
2303+
seized= false;/* released by _bt_readpage (or by us) */
23192304
}
23202305

23212306
/*

‎src/backend/access/nbtree/nbtutils.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2402,6 +2402,8 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
24022402

24032403
new_prim_scan:
24042404

2405+
Assert(pstate->finaltup);/* not on rightmost/leftmost page */
2406+
24052407
/*
24062408
* End this primitive index scan, but schedule another.
24072409
*

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp