88 *
99 *
1010 * IDENTIFICATION
11- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.126 2007/01/17 00:17:21 tgl Exp $
11+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.127 2007/01/17 16:25:01 tgl Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
3131/* interval for calling AbsorbFsyncRequests in mdsync */
3232#define FSYNCS_PER_ABSORB 10
3333
34+ /* special values for the segno arg to RememberFsyncRequest; any other segno is handled as the normal case, a request to fsync that segment */
35+ #define FORGET_RELATION_FSYNC (InvalidBlockNumber) /* drop pending fsyncs for one relation */
36+ #define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1) /* drop pending fsyncs for a whole database */
37+
3438/*
3539 * On Windows, we have to interpret EACCES as possibly meaning the same as
3640 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
@@ -258,30 +262,7 @@ mdunlink(RelFileNode rnode, bool isRedo)
258262 * We have to clean out any pending fsync requests for the doomed relation,
259263 * else the next mdsync() will fail.
260264 */
261- if (pendingOpsTable )
262- {
263- /* standalone backend or startup process: fsync state is local */
264- RememberFsyncRequest (rnode ,InvalidBlockNumber );
265- }
266- else if (IsUnderPostmaster )
267- {
268- /*
269- * Notify the bgwriter about it. If we fail to queue the revoke
270- * message, we have to sleep and try again ... ugly, but hopefully
271- * won't happen often.
272- *
273- * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with
274- * an error would leave the no-longer-used file still present on
275- * disk, which would be bad, so I'm inclined to assume that the
276- * bgwriter will always empty the queue soon.
277- */
278- while (!ForwardFsyncRequest (rnode ,InvalidBlockNumber ))
279- pg_usleep (10000L );/* 10 msec seems a good number */
280- /*
281- * Note we don't wait for the bgwriter to actually absorb the
282- * revoke message; see mdsync() for the implications.
283- */
284- }
265+ ForgetRelationFsyncRequests (rnode );
285266
286267path = relpath (rnode );
287268
@@ -894,7 +875,8 @@ mdsync(void)
894875 * what we will do is retry the whole process after absorbing fsync
895876 * request messages again. Since mdunlink() queues a "revoke" message
896877 * before actually unlinking, the fsync request is guaranteed to be gone
897- * the second time if it really was this case.
878+ * the second time if it really was this case. DROP DATABASE likewise
879+ * has to tell us to forget fsync requests before it starts deletions.
898880 */
899881do {
900882HASH_SEQ_STATUS hstat ;
@@ -1043,17 +1025,58 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
10431025 * We stuff the fsync request into the local hash table for execution
10441026 * during the bgwriter's next checkpoint.
10451027 *
1046- * segno == InvalidBlockNumber is a "revoke" request: remove any pending
1047- * fsync requests for the whole relation.
1028+ * The range of possible segment numbers is way less than the range of
1029+ * BlockNumber, so we can reserve high values of segno for special purposes.
1030+ * We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for
1031+ * a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for
1032+ * a whole database. (These are a tad slow because the hash table has to be
1033+ * searched linearly, but it doesn't seem worth rethinking the table structure
1034+ * for them.)
10481035 */
10491036void
10501037RememberFsyncRequest (RelFileNode rnode ,BlockNumber segno )
10511038{
10521039Assert (pendingOpsTable );
10531040
1054- if (segno != InvalidBlockNumber )
1041+ if (segno == FORGET_RELATION_FSYNC )
1042+ {
1043+ /* Remove any pending requests for the entire relation */
1044+ HASH_SEQ_STATUS hstat ;
1045+ PendingOperationEntry * entry ;
1046+
1047+ hash_seq_init (& hstat ,pendingOpsTable );
1048+ while ((entry = (PendingOperationEntry * )hash_seq_search (& hstat ))!= NULL )
1049+ {
1050+ if (RelFileNodeEquals (entry -> tag .rnode ,rnode ))
1051+ {
1052+ /* Okay, delete this entry */
1053+ if (hash_search (pendingOpsTable ,& entry -> tag ,
1054+ HASH_REMOVE ,NULL )== NULL )
1055+ elog (ERROR ,"pendingOpsTable corrupted" );
1056+ }
1057+ }
1058+ }
1059+ else if (segno == FORGET_DATABASE_FSYNC )
1060+ {
1061+ /* Remove any pending requests for the entire database */
1062+ HASH_SEQ_STATUS hstat ;
1063+ PendingOperationEntry * entry ;
1064+
1065+ hash_seq_init (& hstat ,pendingOpsTable );
1066+ while ((entry = (PendingOperationEntry * )hash_seq_search (& hstat ))!= NULL )
1067+ {
1068+ if (entry -> tag .rnode .dbNode == rnode .dbNode )
1069+ {
1070+ /* Okay, delete this entry */
1071+ if (hash_search (pendingOpsTable ,& entry -> tag ,
1072+ HASH_REMOVE ,NULL )== NULL )
1073+ elog (ERROR ,"pendingOpsTable corrupted" );
1074+ }
1075+ }
1076+ }
1077+ else
10551078{
1056- /*Enter a request to fsync this segment */
1079+ /*Normal case: enter a request to fsync this segment */
10571080PendingOperationTag key ;
10581081PendingOperationEntry * entry ;
10591082bool found ;
@@ -1070,29 +1093,66 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
10701093if (!found )/* new entry, so initialize it */
10711094entry -> failures = 0 ;
10721095}
1073- else
1096+ }
1097+
1098+ /*
1099+ * ForgetRelationFsyncRequests -- ensure any fsyncs for a rel are forgotten. When a local pendingOpsTable exists the revoke is applied to it directly via RememberFsyncRequest; otherwise, when IsUnderPostmaster, a FORGET_RELATION_FSYNC request is forwarded, retrying until it is queued. If neither condition holds, nothing is done.
1100+ */
1101+ void
1102+ ForgetRelationFsyncRequests (RelFileNode rnode )
1103+ {
1104+ if (pendingOpsTable )
1105+ {
1106+ /* standalone backend or startup process: fsync state is local */
1107+ RememberFsyncRequest (rnode ,FORGET_RELATION_FSYNC );
1108+ }
1109+ else if (IsUnderPostmaster )
10741110{
10751111/*
1076- * Remove any pending requests for the entire relation. (This is a
1077- * tad slow but it doesn't seem worth rethinking the table structure.)
1112+ * Notify the bgwriter about it. If we fail to queue the revoke
1113+ * message, we have to sleep and try again ... ugly, but hopefully
1114+ * won't happen often.
1115+ *
1116+ * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with
1117+ * an error would leave the no-longer-used file still present on
1118+ * disk, which would be bad, so I'm inclined to assume that the
1119+ * bgwriter will always empty the queue soon.
10781120 */
1079- HASH_SEQ_STATUS hstat ;
1080- PendingOperationEntry * entry ;
1121+ while (!ForwardFsyncRequest (rnode ,FORGET_RELATION_FSYNC ))
1122+ pg_usleep (10000L );/* 10 msec seems a good number */
1123+ /*
1124+ * Note we don't wait for the bgwriter to actually absorb the
1125+ * revoke message; see mdsync() for the implications.
1126+ */
1127+ }
1128+ }
10811129
1082- hash_seq_init (& hstat ,pendingOpsTable );
1083- while ((entry = (PendingOperationEntry * )hash_seq_search (& hstat ))!= NULL )
1084- {
1085- if (RelFileNodeEquals (entry -> tag .rnode ,rnode ))
1086- {
1087- /* Okay, delete this entry */
1088- if (hash_search (pendingOpsTable ,& entry -> tag ,
1089- HASH_REMOVE ,NULL )== NULL )
1090- elog (ERROR ,"pendingOpsTable corrupted" );
1091- }
1092- }
1130+ /*
1131+ * ForgetDatabaseFsyncRequests -- ensure any fsyncs for a DB are forgotten. The request is carried in a RelFileNode whose dbNode is dbid; it is applied to the local pendingOpsTable when one exists, else forwarded (retrying until queued) when IsUnderPostmaster. If neither condition holds, nothing is done.
1132+ */
1133+ void
1134+ ForgetDatabaseFsyncRequests (Oid dbid )
1135+ {
1136+ RelFileNode rnode ;
1137+
1138+ rnode .dbNode = dbid ;
1139+ rnode .spcNode = 0 ; /* placeholder: the FORGET_DATABASE_FSYNC branch of RememberFsyncRequest compares dbNode only */
1140+ rnode .relNode = 0 ; /* placeholder, likewise */
1141+
1142+ if (pendingOpsTable )
1143+ {
1144+ /* standalone backend or startup process: fsync state is local */
1145+ RememberFsyncRequest (rnode ,FORGET_DATABASE_FSYNC );
1146+ }
1147+ else if (IsUnderPostmaster )
1148+ {
1149+ /* keep retrying until the forget-database request is queued; see notes in ForgetRelationFsyncRequests */
1150+ while (!ForwardFsyncRequest (rnode ,FORGET_DATABASE_FSYNC ))
1151+ pg_usleep (10000L );/* 10 msec seems a good number */
10931152}
10941153}
10951154
1155+
10961156/*
10971157 *_fdvec_alloc() -- Make a MdfdVec object.
10981158 */