Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 995ba28

Browse files
committed
Rearrange mdsync() looping logic to avoid the problem that a sufficiently
fast flow of new fsync requests can prevent mdsync() from ever completing. This was an unforeseen consequence of a patch added in Mar 2006 to prevent the fsync request queue from overflowing. Problem identified by Heikki Linnakangas and independently by ITAGAKI Takahiro; fix based on ideas from Takahiro-san, Heikki, and Tom. Back-patch as far as 8.1 because a previous back-patch introduced the problem into 8.1 ...
1 parent ebb6bae · commit 995ba28

File tree

1 file changed

+178
-91
lines changed
  • src/backend/storage/smgr

1 file changed

+178
-91
lines changed

‎src/backend/storage/smgr/md.c

Lines changed: 178 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.127 2007/01/17 16:25:01 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.128 2007/04/12 17:10:55 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -122,14 +122,19 @@ typedef struct
122122
BlockNumbersegno;/* which segment */
123123
}PendingOperationTag;
124124

125+
typedefuint16CycleCtr;/* can be any convenient integer size */
126+
125127
typedefstruct
126128
{
127129
PendingOperationTagtag;/* hash table key (must be first!) */
128-
intfailures;/* number of failed attempts to fsync */
130+
boolcanceled;/* T => request canceled, not yet removed */
131+
CycleCtrcycle_ctr;/* mdsync_cycle_ctr when request was made */
129132
}PendingOperationEntry;
130133

131134
staticHTAB*pendingOpsTable=NULL;
132135

136+
staticCycleCtrmdsync_cycle_ctr=0;
137+
133138

134139
typedefenum/* behavior for mdopen & _mdfd_getseg */
135140
{
@@ -856,70 +861,125 @@ mdimmedsync(SMgrRelation reln)
856861

857862
/*
858863
*mdsync() -- Sync previous writes to stable storage.
859-
*
860-
* This is only called during checkpoints, and checkpoints should only
861-
* occur in processes that have created a pendingOpsTable.
862864
*/
863865
void
864866
mdsync(void)
865867
{
866-
boolneed_retry;
868+
staticboolmdsync_in_progress= false;
869+
870+
HASH_SEQ_STATUShstat;
871+
PendingOperationEntry*entry;
872+
intabsorb_counter;
867873

874+
/*
875+
* This is only called during checkpoints, and checkpoints should only
876+
* occur in processes that have created a pendingOpsTable.
877+
*/
868878
if (!pendingOpsTable)
869879
elog(ERROR,"cannot sync without a pendingOpsTable");
870880

871881
/*
872-
* The fsync table could contain requests to fsync relations that have
873-
* been deleted (unlinked) by the time we get to them. Rather than
874-
* just hoping an ENOENT (or EACCES on Windows) error can be ignored,
875-
* what we will do is retry the whole process after absorbing fsync
876-
* request messages again. Since mdunlink() queues a "revoke" message
877-
* before actually unlinking, the fsync request is guaranteed to be gone
878-
* the second time if it really was this case. DROP DATABASE likewise
879-
* has to tell us to forget fsync requests before it starts deletions.
882+
* If we are in the bgwriter, the sync had better include all fsync
883+
* requests that were queued by backends before the checkpoint REDO
884+
* point was determined. We go that a little better by accepting all
885+
* requests queued up to the point where we start fsync'ing.
880886
*/
881-
do {
882-
HASH_SEQ_STATUShstat;
883-
PendingOperationEntry*entry;
884-
intabsorb_counter;
887+
AbsorbFsyncRequests();
888+
889+
/*
890+
* To avoid excess fsync'ing (in the worst case, maybe a never-terminating
891+
* checkpoint), we want to ignore fsync requests that are entered into the
892+
* hashtable after this point --- they should be processed next time,
893+
* instead. We use mdsync_cycle_ctr to tell old entries apart from new
894+
* ones: new ones will have cycle_ctr equal to the incremented value of
895+
* mdsync_cycle_ctr.
896+
*
897+
* In normal circumstances, all entries present in the table at this
898+
* point will have cycle_ctr exactly equal to the current (about to be old)
899+
* value of mdsync_cycle_ctr. However, if we fail partway through the
900+
* fsync'ing loop, then older values of cycle_ctr might remain when we
901+
* come back here to try again. Repeated checkpoint failures would
902+
* eventually wrap the counter around to the point where an old entry
903+
* might appear new, causing us to skip it, possibly allowing a checkpoint
904+
* to succeed that should not have. To forestall wraparound, any time
905+
* the previous mdsync() failed to complete, run through the table and
906+
* forcibly set cycle_ctr = mdsync_cycle_ctr.
907+
*
908+
* Think not to merge this loop with the main loop, as the problem is
909+
* exactly that that loop may fail before having visited all the entries.
910+
* From a performance point of view it doesn't matter anyway, as this
911+
* path will never be taken in a system that's functioning normally.
912+
*/
913+
if (mdsync_in_progress)
914+
{
915+
/* prior try failed, so update any stale cycle_ctr values */
916+
hash_seq_init(&hstat,pendingOpsTable);
917+
while ((entry= (PendingOperationEntry*)hash_seq_search(&hstat))!=NULL)
918+
{
919+
entry->cycle_ctr=mdsync_cycle_ctr;
920+
}
921+
}
885922

886-
need_retry= false;
923+
/* Advance counter so that new hashtable entries are distinguishable */
924+
mdsync_cycle_ctr++;
887925

926+
/* Set flag to detect failure if we don't reach the end of the loop */
927+
mdsync_in_progress= true;
928+
929+
/* Now scan the hashtable for fsync requests to process */
930+
absorb_counter=FSYNCS_PER_ABSORB;
931+
hash_seq_init(&hstat,pendingOpsTable);
932+
while ((entry= (PendingOperationEntry*)hash_seq_search(&hstat))!=NULL)
933+
{
888934
/*
889-
* If we are in the bgwriter, the sync had better include all fsync
890-
* requests that were queued by backends before the checkpoint REDO
891-
* point was determined. We go that a little better by accepting all
892-
* requests queued up to the point where we start fsync'ing.
935+
* If the entry is new then don't process it this time. Note that
936+
* "continue" bypasses the hash-remove call at the bottom of the loop.
893937
*/
894-
AbsorbFsyncRequests();
938+
if (entry->cycle_ctr==mdsync_cycle_ctr)
939+
continue;
895940

896-
absorb_counter=FSYNCS_PER_ABSORB;
897-
hash_seq_init(&hstat,pendingOpsTable);
898-
while ((entry= (PendingOperationEntry*)hash_seq_search(&hstat))!=NULL)
941+
/* Else assert we haven't missed it */
942+
Assert((CycleCtr) (entry->cycle_ctr+1)==mdsync_cycle_ctr);
943+
944+
/*
945+
* If fsync is off then we don't have to bother opening the file
946+
* at all. (We delay checking until this point so that changing
947+
* fsync on the fly behaves sensibly.) Also, if the entry is
948+
* marked canceled, fall through to delete it.
949+
*/
950+
if (enableFsync&& !entry->canceled)
899951
{
952+
intfailures;
953+
900954
/*
901-
* If fsync is off then we don't have to bother opening the file
902-
* at all. (We delay checking until this point so that changing
903-
* fsync on the fly behaves sensibly.)
955+
* If in bgwriter, we want to absorb pending requests every so
956+
* often to prevent overflow of the fsync request queue. It is
957+
* unspecified whether newly-added entries will be visited by
958+
* hash_seq_search, but we don't care since we don't need to
959+
* process them anyway.
904960
*/
905-
if (enableFsync)
961+
if (--absorb_counter <=0)
962+
{
963+
AbsorbFsyncRequests();
964+
absorb_counter=FSYNCS_PER_ABSORB;
965+
}
966+
967+
/*
968+
* The fsync table could contain requests to fsync segments that
969+
* have been deleted (unlinked) by the time we get to them.
970+
* Rather than just hoping an ENOENT (or EACCES on Windows) error
971+
* can be ignored, what we do on error is absorb pending requests
972+
* and then retry. Since mdunlink() queues a "revoke" message
973+
* before actually unlinking, the fsync request is guaranteed to
974+
* be marked canceled after the absorb if it really was this case.
975+
* DROP DATABASE likewise has to tell us to forget fsync requests
976+
* before it starts deletions.
977+
*/
978+
for (failures=0; ;failures++)/* loop exits at "break" */
906979
{
907980
SMgrRelationreln;
908981
MdfdVec*seg;
909982

910-
/*
911-
* If in bgwriter, we want to absorb pending requests every so
912-
* often to prevent overflow of the fsync request queue. This
913-
* could result in deleting the current entry out from under
914-
* our hashtable scan, so the procedure is to fall out of the
915-
* scan and start over from the top of the function.
916-
*/
917-
if (--absorb_counter <=0)
918-
{
919-
need_retry= true;
920-
break;
921-
}
922-
923983
/*
924984
* Find or create an smgr hash entry for this relation. This
925985
* may seem a bit unclean -- md calling smgr? But it's really
@@ -940,50 +1000,64 @@ mdsync(void)
9401000
/*
9411001
* It is possible that the relation has been dropped or
9421002
* truncated since the fsync request was entered. Therefore,
943-
* allow ENOENT, but only if we didn't failoncealready on
1003+
* allow ENOENT, but only if we didn't fail already on
9441004
* this file. This applies both during _mdfd_getseg() and
9451005
* during FileSync, since fd.c might have closed the file
9461006
* behind our back.
9471007
*/
9481008
seg=_mdfd_getseg(reln,
9491009
entry->tag.segno* ((BlockNumber)RELSEG_SIZE),
9501010
false,EXTENSION_RETURN_NULL);
951-
if (seg==NULL||
952-
FileSync(seg->mdfd_vfd)<0)
953-
{
954-
/*
955-
* XXX is there any point in allowing more than one try?
956-
* Don't see one at the moment, but easy to change the
957-
* test here if so.
958-
*/
959-
if (!FILE_POSSIBLY_DELETED(errno)||
960-
++(entry->failures)>1)
961-
ereport(ERROR,
962-
(errcode_for_file_access(),
963-
errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
964-
entry->tag.segno,
965-
entry->tag.rnode.spcNode,
966-
entry->tag.rnode.dbNode,
967-
entry->tag.rnode.relNode)));
968-
else
969-
ereport(DEBUG1,
970-
(errcode_for_file_access(),
971-
errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
972-
entry->tag.segno,
973-
entry->tag.rnode.spcNode,
974-
entry->tag.rnode.dbNode,
975-
entry->tag.rnode.relNode)));
976-
need_retry= true;
977-
continue;/* don't delete the hashtable entry */
978-
}
979-
}
1011+
if (seg!=NULL&&
1012+
FileSync(seg->mdfd_vfd) >=0)
1013+
break;/* success; break out of retry loop */
9801014

981-
/* Okay, delete this entry */
982-
if (hash_search(pendingOpsTable,&entry->tag,
983-
HASH_REMOVE,NULL)==NULL)
984-
elog(ERROR,"pendingOpsTable corrupted");
1015+
/*
1016+
* XXX is there any point in allowing more than one retry?
1017+
* Don't see one at the moment, but easy to change the
1018+
* test here if so.
1019+
*/
1020+
if (!FILE_POSSIBLY_DELETED(errno)||
1021+
failures>0)
1022+
ereport(ERROR,
1023+
(errcode_for_file_access(),
1024+
errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1025+
entry->tag.segno,
1026+
entry->tag.rnode.spcNode,
1027+
entry->tag.rnode.dbNode,
1028+
entry->tag.rnode.relNode)));
1029+
else
1030+
ereport(DEBUG1,
1031+
(errcode_for_file_access(),
1032+
errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
1033+
entry->tag.segno,
1034+
entry->tag.rnode.spcNode,
1035+
entry->tag.rnode.dbNode,
1036+
entry->tag.rnode.relNode)));
1037+
1038+
/*
1039+
* Absorb incoming requests and check to see if canceled.
1040+
*/
1041+
AbsorbFsyncRequests();
1042+
absorb_counter=FSYNCS_PER_ABSORB;/* might as well... */
1043+
1044+
if (entry->canceled)
1045+
break;
1046+
}/* end retry loop */
9851047
}
986-
}while (need_retry);
1048+
1049+
/*
1050+
* If we get here, either we fsync'd successfully, or we don't have
1051+
* to because enableFsync is off, or the entry is (now) marked
1052+
* canceled. Okay to delete it.
1053+
*/
1054+
if (hash_search(pendingOpsTable,&entry->tag,
1055+
HASH_REMOVE,NULL)==NULL)
1056+
elog(ERROR,"pendingOpsTable corrupted");
1057+
}/* end loop over hashtable entries */
1058+
1059+
/* Flag successful completion of mdsync */
1060+
mdsync_in_progress= false;
9871061
}
9881062

9891063
/*
@@ -1027,8 +1101,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
10271101
*
10281102
* The range of possible segment numbers is way less than the range of
10291103
* BlockNumber, so we can reserve high values of segno for special purposes.
1030-
* We define two: FORGET_RELATION_FSYNC means todrop pending fsyncs for
1031-
* a relation, and FORGET_DATABASE_FSYNC means todrop pending fsyncs for
1104+
* We define two: FORGET_RELATION_FSYNC means tocancel pending fsyncs for
1105+
* a relation, and FORGET_DATABASE_FSYNC means tocancel pending fsyncs for
10321106
* a whole database. (These are a tad slow because the hash table has to be
10331107
* searched linearly, but it doesn't seem worth rethinking the table structure
10341108
* for them.)
@@ -1049,10 +1123,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
10491123
{
10501124
if (RelFileNodeEquals(entry->tag.rnode,rnode))
10511125
{
1052-
/* Okay, delete this entry */
1053-
if (hash_search(pendingOpsTable,&entry->tag,
1054-
HASH_REMOVE,NULL)==NULL)
1055-
elog(ERROR,"pendingOpsTable corrupted");
1126+
/* Okay, cancel this entry */
1127+
entry->canceled= true;
10561128
}
10571129
}
10581130
}
@@ -1067,10 +1139,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
10671139
{
10681140
if (entry->tag.rnode.dbNode==rnode.dbNode)
10691141
{
1070-
/* Okay, delete this entry */
1071-
if (hash_search(pendingOpsTable,&entry->tag,
1072-
HASH_REMOVE,NULL)==NULL)
1073-
elog(ERROR,"pendingOpsTable corrupted");
1142+
/* Okay, cancel this entry */
1143+
entry->canceled= true;
10741144
}
10751145
}
10761146
}
@@ -1090,8 +1160,25 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
10901160
&key,
10911161
HASH_ENTER,
10921162
&found);
1093-
if (!found)/* new entry, so initialize it */
1094-
entry->failures=0;
1163+
/* if new or previously canceled entry, initialize it */
1164+
if (!found||entry->canceled)
1165+
{
1166+
entry->canceled= false;
1167+
entry->cycle_ctr=mdsync_cycle_ctr;
1168+
}
1169+
/*
1170+
* NB: it's intentional that we don't change cycle_ctr if the entry
1171+
* already exists. The fsync request must be treated as old, even
1172+
* though the new request will be satisfied too by any subsequent
1173+
* fsync.
1174+
*
1175+
* However, if the entry is present but is marked canceled, we should
1176+
* act just as though it wasn't there. The only case where this could
1177+
* happen would be if a file had been deleted, we received but did not
1178+
* yet act on the cancel request, and the same relfilenode was then
1179+
* assigned to a new file. We mustn't lose the new request, but
1180+
* it should be considered new not old.
1181+
*/
10951182
}
10961183
}
10971184

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp