88 *
99 *
1010 * IDENTIFICATION
11- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.129 2007/07/03 14:51:24 tgl Exp $
11+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.130 2007/11/15 20:36:40 tgl Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
3434/* special values for the segno arg to RememberFsyncRequest */
3535#define FORGET_RELATION_FSYNC (InvalidBlockNumber)
3636#define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
37+ #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
3738
3839/*
3940 * On Windows, we have to interpret EACCES as possibly meaning the same as
@@ -113,6 +114,10 @@ static MemoryContext MdCxt;/* context for all md.c allocations */
113114 * table remembers the pending operations.We use a hash table mostly as
114115 * a convenient way of eliminating duplicate requests.
115116 *
117+ * We use a similar mechanism to remember no-longer-needed files that can
118+ * be deleted after the next checkpoint, but we use a linked list instead of
119+ * a hash table, because we don't expect there to be any duplicate requests.
120+ *
116121 * (Regular backends do not track pending operations locally, but forward
117122 * them to the bgwriter.)
118123 */
@@ -131,9 +136,17 @@ typedef struct
131136CycleCtr cycle_ctr ;/* mdsync_cycle_ctr when request was made */
132137}PendingOperationEntry ;
133138
139+ typedef struct
140+ {
141+ RelFileNode rnode ;/* the dead relation to delete */
142+ CycleCtr cycle_ctr ;/* mdckpt_cycle_ctr when request was made */
143+ }PendingUnlinkEntry ;
144+
134145static HTAB * pendingOpsTable = NULL ;
146+ static List * pendingUnlinks = NIL ;
135147
136148static CycleCtr mdsync_cycle_ctr = 0 ;
149+ static CycleCtr mdckpt_cycle_ctr = 0 ;
137150
138151
139152typedef enum /* behavior for mdopen & _mdfd_getseg */
@@ -146,6 +159,7 @@ typedef enum/* behavior for mdopen & _mdfd_getseg */
146159/* local routines */
147160static MdfdVec * mdopen (SMgrRelation reln ,ExtensionBehavior behavior );
148161static void register_dirty_segment (SMgrRelation reln ,MdfdVec * seg );
162+ static void register_unlink (RelFileNode rnode );
149163static MdfdVec * _fdvec_alloc (void );
150164
151165#ifndef LET_OS_MANAGE_FILESIZE
@@ -188,6 +202,7 @@ mdinit(void)
188202100L ,
189203& hash_ctl ,
190204HASH_ELEM |HASH_FUNCTION |HASH_CONTEXT );
205+ pendingUnlinks = NIL ;
191206}
192207}
193208
@@ -254,14 +269,37 @@ mdcreate(SMgrRelation reln, bool isRedo)
254269 * Note that we're passed a RelFileNode --- by the time this is called,
255270 * there won't be an SMgrRelation hashtable entry anymore.
256271 *
272+ * Actually, we don't unlink the first segment file of the relation, but
273+ * just truncate it to zero length, and record a request to unlink it after
274+ * the next checkpoint. Additional segments can be unlinked immediately,
275+ * however. Leaving the empty file in place prevents that relfilenode
276+ * number from being reused. The scenario this protects us from is:
277+ * 1. We delete a relation (and commit, and actually remove its file).
278+ * 2. We create a new relation, which by chance gets the same relfilenode as
279+ * the just-deleted one (OIDs must've wrapped around for that to happen).
280+ * 3. We crash before another checkpoint occurs.
281+ * During replay, we would delete the file and then recreate it, which is fine
282+ * if the contents of the file were repopulated by subsequent WAL entries.
283+ * But if we didn't WAL-log insertions, but instead relied on fsyncing the
284+ * file after populating it (as for instance CLUSTER and CREATE INDEX do),
285+ * the contents of the file would be lost forever. By leaving the empty file
286+ * until after the next checkpoint, we prevent reassignment of the relfilenode
287+ * number until it's safe, because relfilenode assignment skips over any
288+ * existing file.
289+ *
257290 * If isRedo is true, it's okay for the relation to be already gone.
258- * Also, any failure should be reported as WARNING not ERROR, because
291+ * Also, we should remove the file immediately instead of queuing a request
292+ * for later, since during redo there's no possibility of creating a
293+ * conflicting relation.
294+ *
295+ * Note: any failure should be reported as WARNING not ERROR, because
259296 * we are usually not in a transaction anymore when this is called.
260297 */
261298void
262299mdunlink (RelFileNode rnode ,bool isRedo )
263300{
264301char * path ;
302+ int ret ;
265303
266304/*
267305 * We have to clean out any pending fsync requests for the doomed relation,
@@ -271,8 +309,15 @@ mdunlink(RelFileNode rnode, bool isRedo)
271309
272310path = relpath (rnode );
273311
274- /* Delete the first segment, or only segment if not doing segmenting */
275- if (unlink (path )< 0 )
312+ /*
313+ * Delete or truncate the first segment, or only segment if not doing
314+ * segmenting
315+ */
316+ if (isRedo )
317+ ret = unlink (path );
318+ else
319+ ret = truncate (path ,0 );
320+ if (ret < 0 )
276321{
277322if (!isRedo || errno != ENOENT )
278323ereport (WARNING ,
@@ -316,6 +361,10 @@ mdunlink(RelFileNode rnode, bool isRedo)
316361#endif
317362
318363pfree (path );
364+
365+ /* Register request to unlink first segment later */
366+ if (!isRedo )
367+ register_unlink (rnode );
319368}
320369
321370/*
@@ -1063,6 +1112,91 @@ mdsync(void)
10631112mdsync_in_progress = false;
10641113}
10651114
1115+ /*
1116+ * mdpreckpt() -- Do pre-checkpoint work
1117+ *
1118+ * To distinguish unlink requests that arrived before this checkpoint
1119+ * started from those that arrived during the checkpoint, we use a cycle
1120+ * counter similar to the one we use for fsync requests. That cycle
1121+ * counter is incremented here.
1122+ *
1123+ * This must be called *before* the checkpoint REDO point is determined.
1124+ * That ensures that we won't delete files too soon.
1125+ *
1126+ * Note that we can't do anything here that depends on the assumption
1127+ * that the checkpoint will be completed.
1128+ */
1129+ void
1130+ mdpreckpt (void )
1131+ {
1132+ ListCell * cell ;
1133+
1134+ /*
1135+ * In case the prior checkpoint wasn't completed, stamp all entries in
1136+ * the list with the current cycle counter. Anything that's in the
1137+ * list at the start of checkpoint can surely be deleted after the
1138+ * checkpoint is finished, regardless of when the request was made.
1139+ */
1140+ foreach (cell ,pendingUnlinks )
1141+ {
1142+ PendingUnlinkEntry * entry = (PendingUnlinkEntry * )lfirst (cell );
1143+
1144+ entry -> cycle_ctr = mdckpt_cycle_ctr ;
1145+ }
1146+
1147+ /*
1148+ * Any unlink requests arriving after this point will be assigned the
1149+ * next cycle counter, and won't be unlinked until next checkpoint.
1150+ */
1151+ mdckpt_cycle_ctr ++ ;
1152+ }
1153+
1154+ /*
1155+ * mdpostckpt() -- Do post-checkpoint work
1156+ *
1157+ * Remove any lingering files that can now be safely removed.
1158+ */
1159+ void
1160+ mdpostckpt (void )
1161+ {
1162+ while (pendingUnlinks != NIL )
1163+ {
1164+ PendingUnlinkEntry * entry = (PendingUnlinkEntry * )linitial (pendingUnlinks );
1165+ char * path ;
1166+
1167+ /*
1168+ * New entries are appended to the end, so if the entry is new
1169+ * we've reached the end of old entries.
1170+ */
1171+ if (entry -> cycle_ctr == mdsync_cycle_ctr )
1172+ break ;
1173+
1174+ /* Else assert we haven't missed it */
1175+ Assert ((CycleCtr ) (entry -> cycle_ctr + 1 )== mdckpt_cycle_ctr );
1176+
1177+ /* Unlink the file */
1178+ path = relpath (entry -> rnode );
1179+ if (unlink (path )< 0 )
1180+ {
1181+ /*
1182+ * ENOENT shouldn't happen either, but it doesn't really matter
1183+ * because we would've deleted it now anyway.
1184+ */
1185+ if (errno != ENOENT )
1186+ ereport (WARNING ,
1187+ (errcode_for_file_access (),
1188+ errmsg ("could not remove relation %u/%u/%u: %m" ,
1189+ entry -> rnode .spcNode ,
1190+ entry -> rnode .dbNode ,
1191+ entry -> rnode .relNode )));
1192+ }
1193+ pfree (path );
1194+
1195+ pendingUnlinks = list_delete_first (pendingUnlinks );
1196+ pfree (entry );
1197+ }
1198+ }
1199+
10661200/*
10671201 * register_dirty_segment() -- Mark a relation segment as needing fsync
10681202 *
@@ -1096,19 +1230,53 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
10961230}
10971231}
10981232
1233+ /*
1234+ * register_unlink() -- Schedule a file to be deleted after next checkpoint
1235+ *
1236+ * As with register_dirty_segment, this could involve either a local or
1237+ * a remote pending-ops table.
1238+ */
1239+ static void
1240+ register_unlink (RelFileNode rnode )
1241+ {
1242+ if (pendingOpsTable )
1243+ {
1244+ /* push it into local pending-ops table */
1245+ RememberFsyncRequest (rnode ,UNLINK_RELATION_REQUEST );
1246+ }
1247+ else
1248+ {
1249+ /*
1250+ * Notify the bgwriter about it. If we fail to queue the request
1251+ * message, we have to sleep and try again, because we can't simply
1252+ * delete the file now. Ugly, but hopefully won't happen often.
1253+ *
1254+ * XXX should we just leave the file orphaned instead?
1255+ */
1256+ Assert (IsUnderPostmaster );
1257+ while (!ForwardFsyncRequest (rnode ,UNLINK_RELATION_REQUEST ))
1258+ pg_usleep (10000L );/* 10 msec seems a good number */
1259+ }
1260+ }
1261+
10991262/*
11001263 * RememberFsyncRequest() -- callback from bgwriter side of fsync request
11011264 *
1102- * We stuff the fsync request into the local hash table for execution
1103- * during the bgwriter's next checkpoint.
1265+ * We stuff most fsync requests into the local hash table for execution
1266+ * during the bgwriter's next checkpoint. UNLINK requests go into a
1267+ * separate linked list, however, because they get processed separately.
11041268 *
11051269 * The range of possible segment numbers is way less than the range of
11061270 * BlockNumber, so we can reserve high values of segno for special purposes.
1107- * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
1108- * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
1109- * a whole database. (These are a tad slow because the hash table has to be
1110- * searched linearly, but it doesn't seem worth rethinking the table structure
1111- * for them.)
1271+ * We define three:
1272+ * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1273+ * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1274+ * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1275+ * checkpoint.
1276+ *
1277+ * (Handling the FORGET_* requests is a tad slow because the hash table has
1278+ * to be searched linearly, but it doesn't seem worth rethinking the table
1279+ * structure for them.)
11121280 */
11131281void
11141282RememberFsyncRequest (RelFileNode rnode ,BlockNumber segno )
@@ -1147,6 +1315,20 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
11471315}
11481316}
11491317}
1318+ else if (segno == UNLINK_RELATION_REQUEST )
1319+ {
1320+ /* Unlink request: put it in the linked list */
1321+ MemoryContext oldcxt = MemoryContextSwitchTo (MdCxt );
1322+ PendingUnlinkEntry * entry ;
1323+
1324+ entry = palloc (sizeof (PendingUnlinkEntry ));
1325+ entry -> rnode = rnode ;
1326+ entry -> cycle_ctr = mdckpt_cycle_ctr ;
1327+
1328+ pendingUnlinks = lappend (pendingUnlinks ,entry );
1329+
1330+ MemoryContextSwitchTo (oldcxt );
1331+ }
11501332else
11511333{
11521334/* Normal case: enter a request to fsync this segment */