NotificationsYou must be signed in to change notification settings
Fork5
Star26

Commit995ba28

committed

Rearrange mdsync() looping logic to avoid the problem that a sufficiently

fast flow of new fsync requests can prevent mdsync() from ever completing.This was an unforeseen consequence of a patch added in Mar 2006 to preventthe fsync request queue from overflowing. Problem identified by HeikkiLinnakangas and independently by ITAGAKI Takahiro; fix based on ideas fromTakahiro-san, Heikki, and Tom.Back-patch as far as 8.1 because a previous back-patch introduced the probleminto 8.1 ...

1 parentebb6bae commit995ba28Copy full SHA for 995ba28

File tree

1 file changed

+178

-91

lines changed

src/backend/storage/smgr
- md.c

1 file changed

+178

-91

lines changed

`‎src/backend/storage/smgr/md.c`

Lines changed: 178 additions & 91 deletions

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`	`*`
`9`	`9`	`*`
`10`	`10`	`* IDENTIFICATION`
`11`		`- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.127 2007/01/17 16:25:01 tgl Exp $`
	`11`	`+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.128 2007/04/12 17:10:55 tgl Exp $`
`12`	`12`	`*`
`13`	`13`	`*-------------------------------------------------------------------------`
`14`	`14`	`*/`
`@@ -122,14 +122,19 @@ typedef struct`
`122`	`122`	`BlockNumbersegno;/* which segment */`
`123`	`123`	`}PendingOperationTag;`
`124`	`124`
	`125`	`+typedefuint16CycleCtr;/* can be any convenient integer size */`
	`126`	`+`
`125`	`127`	`typedefstruct`
`126`	`128`	`{`
`127`	`129`	`PendingOperationTagtag;/* hash table key (must be first!) */`
`128`		`-intfailures;/* number of failed attempts to fsync */`
	`130`	`+boolcanceled;/* T => request canceled, not yet removed */`
	`131`	`+CycleCtrcycle_ctr;/* mdsync_cycle_ctr when request was made */`
`129`	`132`	`}PendingOperationEntry;`
`130`	`133`
`131`	`134`	`staticHTAB*pendingOpsTable=NULL;`
`132`	`135`
	`136`	`+staticCycleCtrmdsync_cycle_ctr=0;`
	`137`	`+`
`133`	`138`
`134`	`139`	`typedefenum/* behavior for mdopen & _mdfd_getseg */`
`135`	`140`	`{`
`@@ -856,70 +861,125 @@ mdimmedsync(SMgrRelation reln)`
`856`	`861`
`857`	`862`	`/*`
`858`	`863`	`*mdsync() -- Sync previous writes to stable storage.`
`859`		`- *`
`860`		`- * This is only called during checkpoints, and checkpoints should only`
`861`		`- * occur in processes that have created a pendingOpsTable.`
`862`	`864`	`*/`
`863`	`865`	`void`
`864`	`866`	`mdsync(void)`
`865`	`867`	`{`
`866`		`-boolneed_retry;`
	`868`	`+staticboolmdsync_in_progress= false;`
	`869`	`+`
	`870`	`+HASH_SEQ_STATUShstat;`
	`871`	`+PendingOperationEntry*entry;`
	`872`	`+intabsorb_counter;`
`867`	`873`
	`874`	`+/*`
	`875`	`+ * This is only called during checkpoints, and checkpoints should only`
	`876`	`+ * occur in processes that have created a pendingOpsTable.`
	`877`	`+ */`
`868`	`878`	`if (!pendingOpsTable)`
`869`	`879`	`elog(ERROR,"cannot sync without a pendingOpsTable");`
`870`	`880`
`871`	`881`	`/*`
`872`		`- * The fsync table could contain requests to fsync relations that have`
`873`		`- * been deleted (unlinked) by the time we get to them. Rather than`
`874`		`- * just hoping an ENOENT (or EACCES on Windows) error can be ignored,`
`875`		`- * what we will do is retry the whole process after absorbing fsync`
`876`		`- * request messages again. Since mdunlink() queues a "revoke" message`
`877`		`- * before actually unlinking, the fsync request is guaranteed to be gone`
`878`		`- * the second time if it really was this case. DROP DATABASE likewise`
`879`		`- * has to tell us to forget fsync requests before it starts deletions.`
	`882`	`+ * If we are in the bgwriter, the sync had better include all fsync`
	`883`	`+ * requests that were queued by backends before the checkpoint REDO`
	`884`	`+ * point was determined. We go that a little better by accepting all`
	`885`	`+ * requests queued up to the point where we start fsync'ing.`
`880`	`886`	`*/`
`881`		`-do {`
`882`		`-HASH_SEQ_STATUShstat;`
`883`		`-PendingOperationEntry*entry;`
`884`		`-intabsorb_counter;`
	`887`	`+AbsorbFsyncRequests();`
	`888`	`+`
	`889`	`+/*`
	`890`	`+ * To avoid excess fsync'ing (in the worst case, maybe a never-terminating`
	`891`	`+ * checkpoint), we want to ignore fsync requests that are entered into the`
	`892`	`+ * hashtable after this point --- they should be processed next time,`
	`893`	`+ * instead. We use mdsync_cycle_ctr to tell old entries apart from new`
	`894`	`+ * ones: new ones will have cycle_ctr equal to the incremented value of`
	`895`	`+ * mdsync_cycle_ctr.`
	`896`	`+ *`
	`897`	`+ * In normal circumstances, all entries present in the table at this`
	`898`	`+ * point will have cycle_ctr exactly equal to the current (about to be old)`
	`899`	`+ * value of mdsync_cycle_ctr. However, if we fail partway through the`
	`900`	`+ * fsync'ing loop, then older values of cycle_ctr might remain when we`
	`901`	`+ * come back here to try again. Repeated checkpoint failures would`
	`902`	`+ * eventually wrap the counter around to the point where an old entry`
	`903`	`+ * might appear new, causing us to skip it, possibly allowing a checkpoint`
	`904`	`+ * to succeed that should not have. To forestall wraparound, any time`
	`905`	`+ * the previous mdsync() failed to complete, run through the table and`
	`906`	`+ * forcibly set cycle_ctr = mdsync_cycle_ctr.`
	`907`	`+ *`
	`908`	`+ * Think not to merge this loop with the main loop, as the problem is`
	`909`	`+ * exactly that that loop may fail before having visited all the entries.`
	`910`	`+ * From a performance point of view it doesn't matter anyway, as this`
	`911`	`+ * path will never be taken in a system that's functioning normally.`
	`912`	`+ */`
	`913`	`+if (mdsync_in_progress)`
	`914`	`+{`
	`915`	`+/* prior try failed, so update any stale cycle_ctr values */`
	`916`	`+hash_seq_init(&hstat,pendingOpsTable);`
	`917`	`+while ((entry= (PendingOperationEntry*)hash_seq_search(&hstat))!=NULL)`
	`918`	`+{`
	`919`	`+entry->cycle_ctr=mdsync_cycle_ctr;`
	`920`	`+}`
	`921`	`+}`
`885`	`922`
`886`		`-need_retry= false;`
	`923`	`+/* Advance counter so that new hashtable entries are distinguishable */`
	`924`	`+mdsync_cycle_ctr++;`
`887`	`925`
	`926`	`+/* Set flag to detect failure if we don't reach the end of the loop */`
	`927`	`+mdsync_in_progress= true;`
	`928`	`+`
	`929`	`+/* Now scan the hashtable for fsync requests to process */`
	`930`	`+absorb_counter=FSYNCS_PER_ABSORB;`
	`931`	`+hash_seq_init(&hstat,pendingOpsTable);`
	`932`	`+while ((entry= (PendingOperationEntry*)hash_seq_search(&hstat))!=NULL)`
	`933`	`+{`
`888`	`934`	`/*`
`889`		`- * If we are in the bgwriter, the sync had better include all fsync`
`890`		`- * requests that were queued by backends before the checkpoint REDO`
`891`		`- * point was determined. We go that a little better by accepting all`
`892`		`- * requests queued up to the point where we start fsync'ing.`
	`935`	`+ * If the entry is new then don't process it this time. Note that`
	`936`	`+ * "continue" bypasses the hash-remove call at the bottom of the loop.`
`893`	`937`	`*/`
`894`		`-AbsorbFsyncRequests();`
	`938`	`+if (entry->cycle_ctr==mdsync_cycle_ctr)`
	`939`	`+continue;`
`895`	`940`
`896`		`-absorb_counter=FSYNCS_PER_ABSORB;`
`897`		`-hash_seq_init(&hstat,pendingOpsTable);`
`898`		`-while ((entry= (PendingOperationEntry*)hash_seq_search(&hstat))!=NULL)`
	`941`	`+/* Else assert we haven't missed it */`
	`942`	`+Assert((CycleCtr) (entry->cycle_ctr+1)==mdsync_cycle_ctr);`
	`943`	`+`
	`944`	`+/*`
	`945`	`+ * If fsync is off then we don't have to bother opening the file`
	`946`	`+ * at all. (We delay checking until this point so that changing`
	`947`	`+ * fsync on the fly behaves sensibly.) Also, if the entry is`
	`948`	`+ * marked canceled, fall through to delete it.`
	`949`	`+ */`
	`950`	`+if (enableFsync&& !entry->canceled)`
`899`	`951`	`{`
	`952`	`+intfailures;`
	`953`	`+`
`900`	`954`	`/*`
`901`		`- * If fsync is off then we don't have to bother opening the file`
`902`		`- * at all. (We delay checking until this point so that changing`
`903`		`- * fsync on the fly behaves sensibly.)`
	`955`	`+ * If in bgwriter, we want to absorb pending requests every so`
	`956`	`+ * often to prevent overflow of the fsync request queue. It is`
	`957`	`+ * unspecified whether newly-added entries will be visited by`
	`958`	`+ * hash_seq_search, but we don't care since we don't need to`
	`959`	`+ * process them anyway.`
`904`	`960`	`*/`
`905`		`-if (enableFsync)`
	`961`	`+if (--absorb_counter <=0)`
	`962`	`+{`
	`963`	`+AbsorbFsyncRequests();`
	`964`	`+absorb_counter=FSYNCS_PER_ABSORB;`
	`965`	`+}`
	`966`	`+`
	`967`	`+/*`
	`968`	`+ * The fsync table could contain requests to fsync segments that`
	`969`	`+ * have been deleted (unlinked) by the time we get to them.`
	`970`	`+ * Rather than just hoping an ENOENT (or EACCES on Windows) error`
	`971`	`+ * can be ignored, what we do on error is absorb pending requests`
	`972`	`+ * and then retry. Since mdunlink() queues a "revoke" message`
	`973`	`+ * before actually unlinking, the fsync request is guaranteed to`
	`974`	`+ * be marked canceled after the absorb if it really was this case.`
	`975`	`+ * DROP DATABASE likewise has to tell us to forget fsync requests`
	`976`	`+ * before it starts deletions.`
	`977`	`+ */`
	`978`	`+for (failures=0; ;failures++)/* loop exits at "break" */`
`906`	`979`	`{`
`907`	`980`	`SMgrRelationreln;`
`908`	`981`	`MdfdVec*seg;`
`909`	`982`
`910`		`-/*`
`911`		`- * If in bgwriter, we want to absorb pending requests every so`
`912`		`- * often to prevent overflow of the fsync request queue. This`
`913`		`- * could result in deleting the current entry out from under`
`914`		`- * our hashtable scan, so the procedure is to fall out of the`
`915`		`- * scan and start over from the top of the function.`
`916`		`- */`
`917`		`-if (--absorb_counter <=0)`
`918`		`-{`
`919`		`-need_retry= true;`
`920`		`-break;`
`921`		`-}`
`922`		`-`
`923`	`983`	`/*`
`924`	`984`	`* Find or create an smgr hash entry for this relation. This`
`925`	`985`	`* may seem a bit unclean -- md calling smgr? But it's really`
`@@ -940,50 +1000,64 @@ mdsync(void)`
`940`	`1000`	`/*`
`941`	`1001`	`* It is possible that the relation has been dropped or`
`942`	`1002`	`* truncated since the fsync request was entered. Therefore,`
`943`		`- * allow ENOENT, but only if we didn't failoncealready on`
	`1003`	`+ * allow ENOENT, but only if we didn't fail already on`
`944`	`1004`	`* this file. This applies both during _mdfd_getseg() and`
`945`	`1005`	`* during FileSync, since fd.c might have closed the file`
`946`	`1006`	`* behind our back.`
`947`	`1007`	`*/`
`948`	`1008`	`seg=_mdfd_getseg(reln,`
`949`	`1009`	`entry->tag.segno* ((BlockNumber)RELSEG_SIZE),`
`950`	`1010`	`false,EXTENSION_RETURN_NULL);`
`951`		`-if (seg==NULL\|\|`
`952`		`-FileSync(seg->mdfd_vfd)<0)`
`953`		`-{`
`954`		`-/*`
`955`		`- * XXX is there any point in allowing more than one try?`
`956`		`- * Don't see one at the moment, but easy to change the`
`957`		`- * test here if so.`
`958`		`- */`
`959`		`-if (!FILE_POSSIBLY_DELETED(errno)\|\|`
`960`		`-++(entry->failures)>1)`
`961`		`-ereport(ERROR,`
`962`		`-(errcode_for_file_access(),`
`963`		`-errmsg("could not fsync segment %u of relation %u/%u/%u: %m",`
`964`		`-entry->tag.segno,`
`965`		`-entry->tag.rnode.spcNode,`
`966`		`-entry->tag.rnode.dbNode,`
`967`		`-entry->tag.rnode.relNode)));`
`968`		`-else`
`969`		`-ereport(DEBUG1,`
`970`		`-(errcode_for_file_access(),`
`971`		`-errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",`
`972`		`-entry->tag.segno,`
`973`		`-entry->tag.rnode.spcNode,`
`974`		`-entry->tag.rnode.dbNode,`
`975`		`-entry->tag.rnode.relNode)));`
`976`		`-need_retry= true;`
`977`		`-continue;/* don't delete the hashtable entry */`
`978`		`-}`
`979`		`-}`
	`1011`	`+if (seg!=NULL&&`
	`1012`	`+FileSync(seg->mdfd_vfd) >=0)`
	`1013`	`+break;/* success; break out of retry loop */`
`980`	`1014`
`981`		`-/* Okay, delete this entry */`
`982`		`-if (hash_search(pendingOpsTable,&entry->tag,`
`983`		`-HASH_REMOVE,NULL)==NULL)`
`984`		`-elog(ERROR,"pendingOpsTable corrupted");`
	`1015`	`+/*`
	`1016`	`+ * XXX is there any point in allowing more than one retry?`
	`1017`	`+ * Don't see one at the moment, but easy to change the`
	`1018`	`+ * test here if so.`
	`1019`	`+ */`
	`1020`	`+if (!FILE_POSSIBLY_DELETED(errno)\|\|`
	`1021`	`+failures>0)`
	`1022`	`+ereport(ERROR,`
	`1023`	`+(errcode_for_file_access(),`
	`1024`	`+errmsg("could not fsync segment %u of relation %u/%u/%u: %m",`
	`1025`	`+entry->tag.segno,`
	`1026`	`+entry->tag.rnode.spcNode,`
	`1027`	`+entry->tag.rnode.dbNode,`
	`1028`	`+entry->tag.rnode.relNode)));`
	`1029`	`+else`
	`1030`	`+ereport(DEBUG1,`
	`1031`	`+(errcode_for_file_access(),`
	`1032`	`+errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",`
	`1033`	`+entry->tag.segno,`
	`1034`	`+entry->tag.rnode.spcNode,`
	`1035`	`+entry->tag.rnode.dbNode,`
	`1036`	`+entry->tag.rnode.relNode)));`
	`1037`	`+`
	`1038`	`+/*`
	`1039`	`+ * Absorb incoming requests and check to see if canceled.`
	`1040`	`+ */`
	`1041`	`+AbsorbFsyncRequests();`
	`1042`	`+absorb_counter=FSYNCS_PER_ABSORB;/* might as well... */`
	`1043`	`+`
	`1044`	`+if (entry->canceled)`
	`1045`	`+break;`
	`1046`	`+}/* end retry loop */`
`985`	`1047`	`}`
`986`		`-}while (need_retry);`
	`1048`	`+`
	`1049`	`+/*`
	`1050`	`+ * If we get here, either we fsync'd successfully, or we don't have`
	`1051`	`+ * to because enableFsync is off, or the entry is (now) marked`
	`1052`	`+ * canceled. Okay to delete it.`
	`1053`	`+ */`
	`1054`	`+if (hash_search(pendingOpsTable,&entry->tag,`
	`1055`	`+HASH_REMOVE,NULL)==NULL)`
	`1056`	`+elog(ERROR,"pendingOpsTable corrupted");`
	`1057`	`+}/* end loop over hashtable entries */`
	`1058`	`+`
	`1059`	`+/* Flag successful completion of mdsync */`
	`1060`	`+mdsync_in_progress= false;`
`987`	`1061`	`}`
`988`	`1062`
`989`	`1063`	`/*`
`@@ -1027,8 +1101,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)`
`1027`	`1101`	`*`
`1028`	`1102`	`* The range of possible segment numbers is way less than the range of`
`1029`	`1103`	`* BlockNumber, so we can reserve high values of segno for special purposes.`
`1030`		`- * We define two: FORGET_RELATION_FSYNC means todrop pending fsyncs for`
`1031`		`- * a relation, and FORGET_DATABASE_FSYNC means todrop pending fsyncs for`
	`1104`	`+ * We define two: FORGET_RELATION_FSYNC means tocancel pending fsyncs for`
	`1105`	`+ * a relation, and FORGET_DATABASE_FSYNC means tocancel pending fsyncs for`
`1032`	`1106`	`* a whole database. (These are a tad slow because the hash table has to be`
`1033`	`1107`	`* searched linearly, but it doesn't seem worth rethinking the table structure`
`1034`	`1108`	`* for them.)`
`@@ -1049,10 +1123,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)`
`1049`	`1123`	`{`
`1050`	`1124`	`if (RelFileNodeEquals(entry->tag.rnode,rnode))`
`1051`	`1125`	`{`
`1052`		`-/* Okay, delete this entry */`
`1053`		`-if (hash_search(pendingOpsTable,&entry->tag,`
`1054`		`-HASH_REMOVE,NULL)==NULL)`
`1055`		`-elog(ERROR,"pendingOpsTable corrupted");`
	`1126`	`+/* Okay, cancel this entry */`
	`1127`	`+entry->canceled= true;`
`1056`	`1128`	`}`
`1057`	`1129`	`}`
`1058`	`1130`	`}`
`@@ -1067,10 +1139,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)`
`1067`	`1139`	`{`
`1068`	`1140`	`if (entry->tag.rnode.dbNode==rnode.dbNode)`
`1069`	`1141`	`{`
`1070`		`-/* Okay, delete this entry */`
`1071`		`-if (hash_search(pendingOpsTable,&entry->tag,`
`1072`		`-HASH_REMOVE,NULL)==NULL)`
`1073`		`-elog(ERROR,"pendingOpsTable corrupted");`
	`1142`	`+/* Okay, cancel this entry */`
	`1143`	`+entry->canceled= true;`
`1074`	`1144`	`}`
`1075`	`1145`	`}`
`1076`	`1146`	`}`
`@@ -1090,8 +1160,25 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)`
`1090`	`1160`	`&key,`
`1091`	`1161`	`HASH_ENTER,`
`1092`	`1162`	`&found);`
`1093`		`-if (!found)/* new entry, so initialize it */`
`1094`		`-entry->failures=0;`
	`1163`	`+/* if new or previously canceled entry, initialize it */`
	`1164`	`+if (!found\|\|entry->canceled)`
	`1165`	`+{`
	`1166`	`+entry->canceled= false;`
	`1167`	`+entry->cycle_ctr=mdsync_cycle_ctr;`
	`1168`	`+}`
	`1169`	`+/*`
	`1170`	`+ * NB: it's intentional that we don't change cycle_ctr if the entry`
	`1171`	`+ * already exists. The fsync request must be treated as old, even`
	`1172`	`+ * though the new request will be satisfied too by any subsequent`
	`1173`	`+ * fsync.`
	`1174`	`+ *`
	`1175`	`+ * However, if the entry is present but is marked canceled, we should`
	`1176`	`+ * act just as though it wasn't there. The only case where this could`
	`1177`	`+ * happen would be if a file had been deleted, we received but did not`
	`1178`	`+ * yet act on the cancel request, and the same relfilenode was then`
	`1179`	`+ * assigned to a new file. We mustn't lose the new request, but`
	`1180`	`+ * it should be considered new not old.`
	`1181`	`+ */`
`1095`	`1182`	`}`
`1096`	`1183`	`}`
`1097`	`1184`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit995ba28

File tree

1 file changed

1 file changed

`‎src/backend/storage/smgr/md.c`

0 commit comments