Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 896ddf9

Browse files
committed
Avoid fragmentation of logical tapes when writing concurrently.
Disk-based HashAgg relies on writing to multiple tapes concurrently. Avoid fragmentation of the tapes' blocks by preallocating many blocks for a tape at once. No file operations are performed during preallocation; only the block numbers are reserved.
Reviewed-by: Tomas Vondra
Discussion: https://postgr.es/m/20200519151202.u2p2gpiawoaznsv2%40development
1 parent 49223e1 commit 896ddf9

File tree

1 file changed

+77
-3
lines changed

1 file changed

+77
-3
lines changed

‎src/backend/utils/sort/logtape.c

Lines changed: 77 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -110,6 +110,18 @@ typedef struct TapeBlockTrailer
110110
#define TapeBlockSetNBytes(buf, nbytes) \
111111
(TapeBlockGetTrailer(buf)->next = -(nbytes))
112112

113+
/*
114+
* When multiple tapes are being written to concurrently (as in HashAgg),
115+
* avoid excessive fragmentation by preallocating block numbers to individual
116+
* tapes. Each preallocation doubles in size starting at
117+
* TAPE_WRITE_PREALLOC_MIN blocks up to TAPE_WRITE_PREALLOC_MAX blocks.
118+
*
119+
* No filesystem operations are performed for preallocation; only the block
120+
* numbers are reserved. This may lead to sparse writes, which will cause
121+
* ltsWriteBlock() to fill in holes with zeros.
122+
*/
123+
#define TAPE_WRITE_PREALLOC_MIN 8
124+
#define TAPE_WRITE_PREALLOC_MAX 128
113125

114126
/*
115127
* This data structure represents a single "logical tape" within the set
@@ -151,6 +163,15 @@ typedef struct LogicalTape
151163
intmax_size;/* highest useful, safe buffer_size */
152164
intpos;/* next read/write position in buffer */
153165
intnbytes;/* total # of valid bytes in buffer */
166+
167+
/*
168+
* Preallocated block numbers are held in an array sorted in descending
169+
* order; blocks are consumed from the end of the array (lowest block
170+
* numbers first).
171+
*/
172+
long*prealloc;
173+
int nprealloc; /* number of elements in list */
174+
int prealloc_size; /* number of elements list can hold */
154175
}LogicalTape;
155176

156177
/*
@@ -198,6 +219,7 @@ struct LogicalTapeSet
198219
staticvoidltsWriteBlock(LogicalTapeSet*lts,longblocknum,void*buffer);
199220
staticvoidltsReadBlock(LogicalTapeSet*lts,longblocknum,void*buffer);
200221
staticlongltsGetFreeBlock(LogicalTapeSet*lts);
222+
static long ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt);
201223
staticvoidltsReleaseBlock(LogicalTapeSet*lts,longblocknum);
202224
staticvoidltsConcatWorkerTapes(LogicalTapeSet*lts,TapeShare*shared,
203225
SharedFileSet*fileset);
@@ -397,6 +419,45 @@ ltsGetFreeBlock(LogicalTapeSet *lts)
397419
return blocknum;
398420
}
399421

422+
/*
423+
* Return the lowest free block number from the tape's preallocation list.
424+
* Refill the preallocation list if necessary.
425+
*/
426+
static long
427+
ltsGetPreallocBlock(LogicalTapeSet*lts,LogicalTape*lt)
428+
{
429+
/* sorted in descending order, so return the last element */
430+
if (lt->nprealloc>0)
431+
return lt->prealloc[--lt->nprealloc];
432+
433+
if (lt->prealloc==NULL)
434+
{
435+
lt->prealloc_size=TAPE_WRITE_PREALLOC_MIN;
436+
lt->prealloc= (long*)palloc(sizeof(long)*lt->prealloc_size);
437+
}
438+
else if (lt->prealloc_size < TAPE_WRITE_PREALLOC_MAX)
439+
{
440+
/* when the preallocation list runs out, double the size */
441+
lt->prealloc_size *=2;
442+
if (lt->prealloc_size>TAPE_WRITE_PREALLOC_MAX)
443+
lt->prealloc_size=TAPE_WRITE_PREALLOC_MAX;
444+
lt->prealloc= (long*)repalloc(lt->prealloc,
445+
sizeof(long)*lt->prealloc_size);
446+
}
447+
448+
/* refill preallocation list */
449+
lt->nprealloc=lt->prealloc_size;
450+
for (int i = lt->nprealloc; i > 0; i--)
451+
{
452+
lt->prealloc[i-1]=ltsGetFreeBlock(lts);
453+
454+
/* verify descending order */
455+
Assert(i==lt->nprealloc||lt->prealloc[i-1]>lt->prealloc[i]);
456+
}
457+
458+
return lt->prealloc[--lt->nprealloc];
459+
}
460+
400461
/*
401462
* Return a block# to the freelist.
402463
*/
@@ -557,6 +618,9 @@ ltsInitTape(LogicalTape *lt)
557618
lt->max_size=MaxAllocSize;
558619
lt->pos=0;
559620
lt->nbytes=0;
621+
lt->prealloc=NULL;
622+
lt->nprealloc=0;
623+
lt->prealloc_size=0;
560624
}
561625

562626
/*
@@ -709,7 +773,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
709773
Assert(lt->firstBlockNumber==-1);
710774
Assert(lt->pos==0);
711775

712-
lt->curBlockNumber=ltsGetFreeBlock(lts);
776+
lt->curBlockNumber=ltsGetPreallocBlock(lts,lt);
713777
lt->firstBlockNumber=lt->curBlockNumber;
714778

715779
TapeBlockGetTrailer(lt->buffer)->prev=-1L;
@@ -733,7 +797,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
733797
* First allocate the next block, so that we can store it in the
734798
* 'next' pointer of this block.
735799
*/
736-
nextBlockNumber=ltsGetFreeBlock(lts);
800+
nextBlockNumber=ltsGetPreallocBlock(lts,lt);
737801

738802
/* set the next-pointer and dump the current block. */
739803
TapeBlockGetTrailer(lt->buffer)->next=nextBlockNumber;
@@ -835,13 +899,23 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
835899
Assert(lt->frozen);
836900
}
837901

838-
/* Allocate a read buffer (unless the tape is empty) */
839902
if (lt->buffer)
840903
pfree(lt->buffer);
841904

842905
/* the buffer is lazily allocated, but set the size here */
843906
lt->buffer=NULL;
844907
lt->buffer_size=buffer_size;
908+
909+
/* free the preallocation list, and return unused block numbers */
910+
if (lt->prealloc!=NULL)
911+
{
912+
for (int i = lt->nprealloc; i > 0; i--)
913+
ltsReleaseBlock(lts,lt->prealloc[i-1]);
914+
pfree(lt->prealloc);
915+
lt->prealloc=NULL;
916+
lt->nprealloc=0;
917+
lt->prealloc_size=0;
918+
}
845919
}
846920

847921
/*

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp