88 *
99 *
1010 * IDENTIFICATION
11- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.127 2007/01/17 16:25:01 tgl Exp $
11+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.128 2007/04/12 17:10:55 tgl Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
@@ -122,14 +122,19 @@ typedef struct
122122BlockNumber segno ;/* which segment */
123123}PendingOperationTag ;
124124
125+ typedef uint16 CycleCtr ;/* can be any convenient integer size */
126+
125127typedef struct
126128{
127129PendingOperationTag tag ;/* hash table key (must be first!) */
128- int failures ;/* number of failed attempts to fsync */
130+ bool canceled ;/* T => request canceled, not yet removed */
131+ CycleCtr cycle_ctr ;/* mdsync_cycle_ctr when request was made */
129132}PendingOperationEntry ;
130133
131134static HTAB * pendingOpsTable = NULL ;
132135
136+ static CycleCtr mdsync_cycle_ctr = 0 ;
137+
133138
134139typedef enum /* behavior for mdopen & _mdfd_getseg */
135140{
@@ -856,70 +861,125 @@ mdimmedsync(SMgrRelation reln)
856861
857862/*
858863 *mdsync() -- Sync previous writes to stable storage.
859- *
860- * This is only called during checkpoints, and checkpoints should only
861- * occur in processes that have created a pendingOpsTable.
862864 */
863865void
864866mdsync (void )
865867{
866- bool need_retry ;
868+ static bool mdsync_in_progress = false;
869+
870+ HASH_SEQ_STATUS hstat ;
871+ PendingOperationEntry * entry ;
872+ int absorb_counter ;
867873
874+ /*
875+ * This is only called during checkpoints, and checkpoints should only
876+ * occur in processes that have created a pendingOpsTable.
877+ */
868878if (!pendingOpsTable )
869879elog (ERROR ,"cannot sync without a pendingOpsTable" );
870880
871881/*
872- * The fsync table could contain requests to fsync relations that have
873- * been deleted (unlinked) by the time we get to them. Rather than
874- * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
875- * what we will do is retry the whole process after absorbing fsync
876- * request messages again. Since mdunlink() queues a "revoke" message
877- * before actually unlinking, the fsync request is guaranteed to be gone
878- * the second time if it really was this case. DROP DATABASE likewise
879- * has to tell us to forget fsync requests before it starts deletions.
882+ * If we are in the bgwriter, the sync had better include all fsync
883+ * requests that were queued by backends before the checkpoint REDO
884+ * point was determined. We go that a little better by accepting all
885+ * requests queued up to the point where we start fsync'ing.
880886 */
881- do {
882- HASH_SEQ_STATUS hstat ;
883- PendingOperationEntry * entry ;
884- int absorb_counter ;
887+ AbsorbFsyncRequests ();
888+
889+ /*
890+ * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
891+ * checkpoint), we want to ignore fsync requests that are entered into the
892+ * hashtable after this point --- they should be processed next time,
893+ * instead. We use mdsync_cycle_ctr to tell old entries apart from new
894+ * ones: new ones will have cycle_ctr equal to the incremented value of
895+ * mdsync_cycle_ctr.
896+ *
897+ * In normal circumstances, all entries present in the table at this
898+ * point will have cycle_ctr exactly equal to the current (about to be old)
899+ * value of mdsync_cycle_ctr. However, if we fail partway through the
900+ * fsync'ing loop, then older values of cycle_ctr might remain when we
901+ * come back here to try again. Repeated checkpoint failures would
902+ * eventually wrap the counter around to the point where an old entry
903+ * might appear new, causing us to skip it, possibly allowing a checkpoint
904+ * to succeed that should not have. To forestall wraparound, any time
905+ * the previous mdsync() failed to complete, run through the table and
906+ * forcibly set cycle_ctr = mdsync_cycle_ctr.
907+ *
908+ * Think not to merge this loop with the main loop, as the problem is
909+ * exactly that that loop may fail before having visited all the entries.
910+ * From a performance point of view it doesn't matter anyway, as this
911+ * path will never be taken in a system that's functioning normally.
912+ */
913+ if (mdsync_in_progress )
914+ {
915+ /* prior try failed, so update any stale cycle_ctr values */
916+ hash_seq_init (& hstat ,pendingOpsTable );
917+ while ((entry = (PendingOperationEntry * )hash_seq_search (& hstat ))!= NULL )
918+ {
919+ entry -> cycle_ctr = mdsync_cycle_ctr ;
920+ }
921+ }
885922
886- need_retry = false;
923+ /* Advance counter so that new hashtable entries are distinguishable */
924+ mdsync_cycle_ctr ++ ;
887925
926+ /* Set flag to detect failure if we don't reach the end of the loop */
927+ mdsync_in_progress = true;
928+
929+ /* Now scan the hashtable for fsync requests to process */
930+ absorb_counter = FSYNCS_PER_ABSORB ;
931+ hash_seq_init (& hstat ,pendingOpsTable );
932+ while ((entry = (PendingOperationEntry * )hash_seq_search (& hstat ))!= NULL )
933+ {
888934/*
889- * If we are in the bgwriter, the sync had better include all fsync
890- * requests that were queued by backends before the checkpoint REDO
891- * point was determined. We go that a little better by accepting all
892- * requests queued up to the point where we start fsync'ing.
935+ * If the entry is new then don't process it this time. Note that
936+ * "continue" bypasses the hash-remove call at the bottom of the loop.
893937 */
894- AbsorbFsyncRequests ();
938+ if (entry -> cycle_ctr == mdsync_cycle_ctr )
939+ continue ;
895940
896- absorb_counter = FSYNCS_PER_ABSORB ;
897- hash_seq_init (& hstat ,pendingOpsTable );
898- while ((entry = (PendingOperationEntry * )hash_seq_search (& hstat ))!= NULL )
941+ /* Else assert we haven't missed it */
942+ Assert ((CycleCtr ) (entry -> cycle_ctr + 1 )== mdsync_cycle_ctr );
943+
944+ /*
945+ * If fsync is off then we don't have to bother opening the file
946+ * at all. (We delay checking until this point so that changing
947+ * fsync on the fly behaves sensibly.) Also, if the entry is
948+ * marked canceled, fall through to delete it.
949+ */
950+ if (enableFsync && !entry -> canceled )
899951{
952+ int failures ;
953+
900954/*
901- * If fsync is off then we don't have to bother opening the file
902- * at all. (We delay checking until this point so that changing
903- * fsync on the fly behaves sensibly.)
955+ * If in bgwriter, we want to absorb pending requests every so
956+ * often to prevent overflow of the fsync request queue. It is
957+ * unspecified whether newly-added entries will be visited by
958+ * hash_seq_search, but we don't care since we don't need to
959+ * process them anyway.
904960 */
905- if (enableFsync )
961+ if (-- absorb_counter <=0 )
962+ {
963+ AbsorbFsyncRequests ();
964+ absorb_counter = FSYNCS_PER_ABSORB ;
965+ }
966+
967+ /*
968+ * The fsync table could contain requests to fsync segments that
969+ * have been deleted (unlinked) by the time we get to them.
970+ * Rather than just hoping an ENOENT (or EACCES on Windows) error
971+ * can be ignored, what we do on error is absorb pending requests
972+ * and then retry. Since mdunlink() queues a "revoke" message
973+ * before actually unlinking, the fsync request is guaranteed to
974+ * be marked canceled after the absorb if it really was this case.
975+ * DROP DATABASE likewise has to tell us to forget fsync requests
976+ * before it starts deletions.
977+ */
978+ for (failures = 0 ; ;failures ++ )/* loop exits at "break" */
906979{
907980SMgrRelation reln ;
908981MdfdVec * seg ;
909982
910- /*
911- * If in bgwriter, we want to absorb pending requests every so
912- * often to prevent overflow of the fsync request queue. This
913- * could result in deleting the current entry out from under
914- * our hashtable scan, so the procedure is to fall out of the
915- * scan and start over from the top of the function.
916- */
917- if (-- absorb_counter <=0 )
918- {
919- need_retry = true;
920- break ;
921- }
922-
923983/*
924984 * Find or create an smgr hash entry for this relation. This
925985 * may seem a bit unclean -- md calling smgr? But it's really
@@ -940,50 +1000,64 @@ mdsync(void)
9401000/*
9411001 * It is possible that the relation has been dropped or
9421002 * truncated since the fsync request was entered. Therefore,
943- * allow ENOENT, but only if we didn't failonce already on
1003+ * allow ENOENT, but only if we didn't fail already on
9441004 * this file. This applies both during _mdfd_getseg() and
9451005 * during FileSync, since fd.c might have closed the file
9461006 * behind our back.
9471007 */
9481008seg = _mdfd_getseg (reln ,
9491009entry -> tag .segno * ((BlockNumber )RELSEG_SIZE ),
9501010 false,EXTENSION_RETURN_NULL );
951- if (seg == NULL ||
952- FileSync (seg -> mdfd_vfd )< 0 )
953- {
954- /*
955- * XXX is there any point in allowing more than one try?
956- * Don't see one at the moment, but easy to change the
957- * test here if so.
958- */
959- if (!FILE_POSSIBLY_DELETED (errno )||
960- ++ (entry -> failures )> 1 )
961- ereport (ERROR ,
962- (errcode_for_file_access (),
963- errmsg ("could not fsync segment %u of relation %u/%u/%u: %m" ,
964- entry -> tag .segno ,
965- entry -> tag .rnode .spcNode ,
966- entry -> tag .rnode .dbNode ,
967- entry -> tag .rnode .relNode )));
968- else
969- ereport (DEBUG1 ,
970- (errcode_for_file_access (),
971- errmsg ("could not fsync segment %u of relation %u/%u/%u, but retrying: %m" ,
972- entry -> tag .segno ,
973- entry -> tag .rnode .spcNode ,
974- entry -> tag .rnode .dbNode ,
975- entry -> tag .rnode .relNode )));
976- need_retry = true;
977- continue ;/* don't delete the hashtable entry */
978- }
979- }
1011+ if (seg != NULL &&
1012+ FileSync (seg -> mdfd_vfd ) >=0 )
1013+ break ;/* success; break out of retry loop */
9801014
981- /* Okay, delete this entry */
982- if (hash_search (pendingOpsTable ,& entry -> tag ,
983- HASH_REMOVE ,NULL )== NULL )
984- elog (ERROR ,"pendingOpsTable corrupted" );
1015+ /*
1016+ * XXX is there any point in allowing more than one retry?
1017+ * Don't see one at the moment, but easy to change the
1018+ * test here if so.
1019+ */
1020+ if (!FILE_POSSIBLY_DELETED (errno )||
1021+ failures > 0 )
1022+ ereport (ERROR ,
1023+ (errcode_for_file_access (),
1024+ errmsg ("could not fsync segment %u of relation %u/%u/%u: %m" ,
1025+ entry -> tag .segno ,
1026+ entry -> tag .rnode .spcNode ,
1027+ entry -> tag .rnode .dbNode ,
1028+ entry -> tag .rnode .relNode )));
1029+ else
1030+ ereport (DEBUG1 ,
1031+ (errcode_for_file_access (),
1032+ errmsg ("could not fsync segment %u of relation %u/%u/%u, but retrying: %m" ,
1033+ entry -> tag .segno ,
1034+ entry -> tag .rnode .spcNode ,
1035+ entry -> tag .rnode .dbNode ,
1036+ entry -> tag .rnode .relNode )));
1037+
1038+ /*
1039+ * Absorb incoming requests and check to see if canceled.
1040+ */
1041+ AbsorbFsyncRequests ();
1042+ absorb_counter = FSYNCS_PER_ABSORB ;/* might as well... */
1043+
1044+ if (entry -> canceled )
1045+ break ;
1046+ }/* end retry loop */
9851047}
986- }while (need_retry );
1048+
1049+ /*
1050+ * If we get here, either we fsync'd successfully, or we don't have
1051+ * to because enableFsync is off, or the entry is (now) marked
1052+ * canceled. Okay to delete it.
1053+ */
1054+ if (hash_search (pendingOpsTable ,& entry -> tag ,
1055+ HASH_REMOVE ,NULL )== NULL )
1056+ elog (ERROR ,"pendingOpsTable corrupted" );
1057+ }/* end loop over hashtable entries */
1058+
1059+ /* Flag successful completion of mdsync */
1060+ mdsync_in_progress = false;
9871061}
9881062
9891063/*
@@ -1027,8 +1101,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
10271101 *
10281102 * The range of possible segment numbers is way less than the range of
10291103 * BlockNumber, so we can reserve high values of segno for special purposes.
1030- * We define two: FORGET_RELATION_FSYNC means todrop pending fsyncs for
1031- * a relation, and FORGET_DATABASE_FSYNC means todrop pending fsyncs for
1104+ * We define two: FORGET_RELATION_FSYNC means tocancel pending fsyncs for
1105+ * a relation, and FORGET_DATABASE_FSYNC means tocancel pending fsyncs for
10321106 * a whole database. (These are a tad slow because the hash table has to be
10331107 * searched linearly, but it doesn't seem worth rethinking the table structure
10341108 * for them.)
@@ -1049,10 +1123,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
10491123{
10501124if (RelFileNodeEquals (entry -> tag .rnode ,rnode ))
10511125{
1052- /* Okay, delete this entry */
1053- if (hash_search (pendingOpsTable ,& entry -> tag ,
1054- HASH_REMOVE ,NULL )== NULL )
1055- elog (ERROR ,"pendingOpsTable corrupted" );
1126+ /* Okay, cancel this entry */
1127+ entry -> canceled = true;
10561128}
10571129}
10581130}
@@ -1067,10 +1139,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
10671139{
10681140if (entry -> tag .rnode .dbNode == rnode .dbNode )
10691141{
1070- /* Okay, delete this entry */
1071- if (hash_search (pendingOpsTable ,& entry -> tag ,
1072- HASH_REMOVE ,NULL )== NULL )
1073- elog (ERROR ,"pendingOpsTable corrupted" );
1142+ /* Okay, cancel this entry */
1143+ entry -> canceled = true;
10741144}
10751145}
10761146}
@@ -1090,8 +1160,25 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
10901160& key ,
10911161HASH_ENTER ,
10921162& found );
1093- if (!found )/* new entry, so initialize it */
1094- entry -> failures = 0 ;
1163+ /* if new or previously canceled entry, initialize it */
1164+ if (!found || entry -> canceled )
1165+ {
1166+ entry -> canceled = false;
1167+ entry -> cycle_ctr = mdsync_cycle_ctr ;
1168+ }
1169+ /*
1170+ * NB: it's intentional that we don't change cycle_ctr if the entry
1171+ * already exists. The fsync request must be treated as old, even
1172+ * though the new request will be satisfied too by any subsequent
1173+ * fsync.
1174+ *
1175+ * However, if the entry is present but is marked canceled, we should
1176+ * act just as though it wasn't there. The only case where this could
1177+ * happen would be if a file had been deleted, we received but did not
1178+ * yet act on the cancel request, and the same relfilenode was then
1179+ * assigned to a new file. We mustn't lose the new request, but
1180+ * it should be considered new not old.
1181+ */
10951182}
10961183}
10971184