NotificationsYou must be signed in to change notification settings
Fork5
Star27

Commit6cc4451

committed

Prevent re-use of a deleted relation's relfilenode until after the next

checkpoint. This guards against an unlikely data-loss scenario in which we re-use the relfilenode, then crash, then replay the deletion and recreation of the file. Even then we'd be OK if all insertions into the new relation had been WAL-logged ... but that's not guaranteed given all the no-WAL-logging optimizations that have recently been added. Patch by Heikki Linnakangas, per a discussion last month.

1 parent7a550cb commit6cc4451Copy full SHA for 6cc4451

File tree

5 files changed

+274

-25

lines changed

src
- backend
  - access/transam
    - xlog.c
  - commands
    - tablespace.c
  - storage/smgr
    - md.c
    - smgr.c
- include/storage
  - smgr.h

5 files changed

+274

-25

lines changed

`‎src/backend/access/transam/xlog.c‎`

Lines changed: 15 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@`
`7`	`7`	`* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group`
`8`	`8`	`* Portions Copyright (c) 1994, Regents of the University of California`
`9`	`9`	`*`
`10`		`- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.286 2007/10/12 19:39:59 tgl Exp $`
	`10`	`+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.287 2007/11/15 20:36:40 tgl Exp $`
`11`	`11`	`*`
`12`	`12`	`*-------------------------------------------------------------------------`
`13`	`13`	`*/`
`@@ -45,6 +45,7 @@`
`45`	`45`	`#include"storage/fd.h"`
`46`	`46`	`#include"storage/pmsignal.h"`
`47`	`47`	`#include"storage/procarray.h"`
	`48`	`+#include"storage/smgr.h"`
`48`	`49`	`#include"storage/spin.h"`
`49`	`50`	`#include"utils/builtins.h"`
`50`	`51`	`#include"utils/pg_locale.h"`
`@@ -5663,6 +5664,14 @@ CreateCheckPoint(int flags)`
`5663`	`5664`	`UpdateControlFile();`
`5664`	`5665`	`}`
`5665`	`5666`
	`5667`	`+/*`
	`5668`	`+ * Let smgr prepare for checkpoint; this has to happen before we`
	`5669`	`+ * determine the REDO pointer. Note that smgr must not do anything`
	`5670`	`+ * that'd have to be undone if we decide no checkpoint is needed.`
	`5671`	`+ */`
	`5672`	`+smgrpreckpt();`
	`5673`	`+`
	`5674`	`+/* Begin filling in the checkpoint WAL record */`
`5666`	`5675`	`MemSet(&checkPoint,0,sizeof(checkPoint));`
`5667`	`5676`	`checkPoint.ThisTimeLineID=ThisTimeLineID;`
`5668`	`5677`	`checkPoint.time=time(NULL);`
`@@ -5886,6 +5895,11 @@ CreateCheckPoint(int flags)`
`5886`	`5895`	`*/`
`5887`	`5896`	`END_CRIT_SECTION();`
`5888`	`5897`
	`5898`	`+/*`
	`5899`	`+ * Let smgr do post-checkpoint cleanup (eg, deleting old files).`
	`5900`	`+ */`
	`5901`	`+smgrpostckpt();`
	`5902`	`+`
`5889`	`5903`	`/*`
`5890`	`5904`	`* Delete old log files (those no longer needed even for previous`
`5891`	`5905`	`* checkpoint).`

`‎src/backend/commands/tablespace.c‎`

Lines changed: 23 additions & 6 deletions

Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@`
`37`	`37`	`*`
`38`	`38`	`*`
`39`	`39`	`* IDENTIFICATION`
`40`		`- * $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.49 2007/08/01 22:45:08 tgl Exp $`
	`40`	`+ * $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.50 2007/11/15 20:36:40 tgl Exp $`
`41`	`41`	`*`
`42`	`42`	`*-------------------------------------------------------------------------`
`43`	`43`	`*/`
`@@ -57,6 +57,7 @@`
`57`	`57`	`#include"commands/comment.h"`
`58`	`58`	`#include"commands/tablespace.h"`
`59`	`59`	`#include"miscadmin.h"`
	`60`	`+#include"postmaster/bgwriter.h"`
`60`	`61`	`#include"storage/fd.h"`
`61`	`62`	`#include"utils/acl.h"`
`62`	`63`	`#include"utils/builtins.h"`
`@@ -460,13 +461,29 @@ DropTableSpace(DropTableSpaceStmt *stmt)`
`460`	`461`	`LWLockAcquire(TablespaceCreateLock,LW_EXCLUSIVE);`
`461`	`462`
`462`	`463`	`/*`
`463`		`- * Try to remove the physical infrastructure`
	`464`	`+ * Try to remove the physical infrastructure.`
`464`	`465`	`*/`
`465`	`466`	`if (!remove_tablespace_directories(tablespaceoid, false))`
`466`		`-ereport(ERROR,`
`467`		`-(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),`
`468`		`-errmsg("tablespace \"%s\" is not empty",`
`469`		`-tablespacename)));`
	`467`	`+{`
	`468`	`+/*`
	`469`	`+ * Not all files deleted? However, there can be lingering empty files`
	`470`	`+ * in the directories, left behind by for example DROP TABLE, that`
	`471`	`+ * have been scheduled for deletion at next checkpoint (see comments`
	`472`	`+ * in mdunlink() for details). We could just delete them immediately,`
	`473`	`+ * but we can't tell them apart from important data files that we`
	`474`	`+ * mustn't delete. So instead, we force a checkpoint which will clean`
	`475`	`+ * out any lingering files, and try again.`
	`476`	`+ */`
	`477`	`+RequestCheckpoint(CHECKPOINT_IMMEDIATE \|CHECKPOINT_FORCE \|CHECKPOINT_WAIT);`
	`478`	`+if (!remove_tablespace_directories(tablespaceoid, false))`
	`479`	`+{`
	`480`	`+/* Still not empty, the files must be important then */`
	`481`	`+ereport(ERROR,`
	`482`	`+(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),`
	`483`	`+errmsg("tablespace \"%s\" is not empty",`
	`484`	`+tablespacename)));`
	`485`	`+}`
	`486`	`+}`
`470`	`487`
`471`	`488`	`/* Record the filesystem change in XLOG */`
`472`	`489`	`{`

`‎src/backend/storage/smgr/md.c‎`

Lines changed: 193 additions & 11 deletions

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`	`*`
`9`	`9`	`*`
`10`	`10`	`* IDENTIFICATION`
`11`		`- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.129 2007/07/03 14:51:24 tgl Exp $`
	`11`	`+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.130 2007/11/15 20:36:40 tgl Exp $`
`12`	`12`	`*`
`13`	`13`	`*-------------------------------------------------------------------------`
`14`	`14`	`*/`
`@@ -34,6 +34,7 @@`
`34`	`34`	`/* special values for the segno arg to RememberFsyncRequest */`
`35`	`35`	`#defineFORGET_RELATION_FSYNC(InvalidBlockNumber)`
`36`	`36`	`#defineFORGET_DATABASE_FSYNC(InvalidBlockNumber-1)`
	`37`	`+#defineUNLINK_RELATION_REQUEST(InvalidBlockNumber-2)`
`37`	`38`
`38`	`39`	`/*`
`39`	`40`	`* On Windows, we have to interpret EACCES as possibly meaning the same as`
`@@ -113,6 +114,10 @@ static MemoryContext MdCxt;/* context for all md.c allocations */`
`113`	`114`	`* table remembers the pending operations.We use a hash table mostly as`
`114`	`115`	`* a convenient way of eliminating duplicate requests.`
`115`	`116`	`*`
	`117`	`+ * We use a similar mechanism to remember no-longer-needed files that can`
	`118`	`+ * be deleted after the next checkpoint, but we use a linked list instead of`
	`119`	`+ * a hash table, because we don't expect there to be any duplicate requests.`
	`120`	`+ *`
`116`	`121`	`* (Regular backends do not track pending operations locally, but forward`
`117`	`122`	`* them to the bgwriter.)`
`118`	`123`	`*/`
`@@ -131,9 +136,17 @@ typedef struct`
`131`	`136`	`CycleCtrcycle_ctr;/* mdsync_cycle_ctr when request was made */`
`132`	`137`	`}PendingOperationEntry;`
`133`	`138`
	`139`	`+typedefstruct`
	`140`	`+{`
	`141`	`+RelFileNodernode;/* the dead relation to delete */`
	`142`	`+CycleCtrcycle_ctr;/* mdckpt_cycle_ctr when request was made */`
	`143`	`+}PendingUnlinkEntry;`
	`144`	`+`
`134`	`145`	`staticHTAB*pendingOpsTable=NULL;`
	`146`	`+staticList*pendingUnlinks=NIL;`
`135`	`147`
`136`	`148`	`staticCycleCtrmdsync_cycle_ctr=0;`
	`149`	`+staticCycleCtrmdckpt_cycle_ctr=0;`
`137`	`150`
`138`	`151`
`139`	`152`	`typedefenum/* behavior for mdopen & _mdfd_getseg */`
`@@ -146,6 +159,7 @@ typedef enum/* behavior for mdopen & _mdfd_getseg */`
`146`	`159`	`/* local routines */`
`147`	`160`	`staticMdfdVec*mdopen(SMgrRelationreln,ExtensionBehaviorbehavior);`
`148`	`161`	`staticvoidregister_dirty_segment(SMgrRelationreln,MdfdVec*seg);`
	`162`	`+staticvoidregister_unlink(RelFileNodernode);`
`149`	`163`	`staticMdfdVec*_fdvec_alloc(void);`
`150`	`164`
`151`	`165`	`#ifndefLET_OS_MANAGE_FILESIZE`
`@@ -188,6 +202,7 @@ mdinit(void)`
`188`	`202`	`100L,`
`189`	`203`	`&hash_ctl,`
`190`	`204`	`HASH_ELEM \|HASH_FUNCTION \|HASH_CONTEXT);`
	`205`	`+pendingUnlinks=NIL;`
`191`	`206`	`}`
`192`	`207`	`}`
`193`	`208`
`@@ -254,14 +269,37 @@ mdcreate(SMgrRelation reln, bool isRedo)`
`254`	`269`	`* Note that we're passed a RelFileNode --- by the time this is called,`
`255`	`270`	`* there won't be an SMgrRelation hashtable entry anymore.`
`256`	`271`	`*`
	`272`	`+ * Actually, we don't unlink the first segment file of the relation, but`
	`273`	`+ * just truncate it to zero length, and record a request to unlink it after`
	`274`	`+ * the next checkpoint. Additional segments can be unlinked immediately,`
	`275`	`+ * however. Leaving the empty file in place prevents that relfilenode`
	`276`	`+ * number from being reused. The scenario this protects us from is:`
	`277`	`+ * 1. We delete a relation (and commit, and actually remove its file).`
	`278`	`+ * 2. We create a new relation, which by chance gets the same relfilenode as`
	`279`	`+ * the just-deleted one (OIDs must've wrapped around for that to happen).`
	`280`	`+ * 3. We crash before another checkpoint occurs.`
	`281`	`+ * During replay, we would delete the file and then recreate it, which is fine`
	`282`	`+ * if the contents of the file were repopulated by subsequent WAL entries.`
	`283`	`+ * But if we didn't WAL-log insertions, but instead relied on fsyncing the`
	`284`	`+ * file after populating it (as for instance CLUSTER and CREATE INDEX do),`
	`285`	`+ * the contents of the file would be lost forever. By leaving the empty file`
	`286`	`+ * until after the next checkpoint, we prevent reassignment of the relfilenode`
	`287`	`+ * number until it's safe, because relfilenode assignment skips over any`
	`288`	`+ * existing file.`
	`289`	`+ *`
`257`	`290`	`* If isRedo is true, it's okay for the relation to be already gone.`
`258`		`- * Also, any failure should be reported as WARNING not ERROR, because`
	`291`	`+ * Also, we should remove the file immediately instead of queuing a request`
	`292`	`+ * for later, since during redo there's no possibility of creating a`
	`293`	`+ * conflicting relation.`
	`294`	`+ *`
	`295`	`+ * Note: any failure should be reported as WARNING not ERROR, because`
`259`	`296`	`* we are usually not in a transaction anymore when this is called.`
`260`	`297`	`*/`
`261`	`298`	`void`
`262`	`299`	`mdunlink(RelFileNodernode,boolisRedo)`
`263`	`300`	`{`
`264`	`301`	`char*path;`
	`302`	`+intret;`
`265`	`303`
`266`	`304`	`/*`
`267`	`305`	`* We have to clean out any pending fsync requests for the doomed relation,`
`@@ -271,8 +309,15 @@ mdunlink(RelFileNode rnode, bool isRedo)`
`271`	`309`
`272`	`310`	`path=relpath(rnode);`
`273`	`311`
`274`		`-/* Delete the first segment, or only segment if not doing segmenting */`
`275`		`-if (unlink(path)<0)`
	`312`	`+/*`
	`313`	`+ * Delete or truncate the first segment, or only segment if not doing`
	`314`	`+ * segmenting`
	`315`	`+ */`
	`316`	`+if (isRedo)`
	`317`	`+ret=unlink(path);`
	`318`	`+else`
	`319`	`+ret=truncate(path,0);`
	`320`	`+if (ret<0)`
`276`	`321`	`{`
`277`	`322`	`if (!isRedo\|\|errno!=ENOENT)`
`278`	`323`	`ereport(WARNING,`
`@@ -316,6 +361,10 @@ mdunlink(RelFileNode rnode, bool isRedo)`
`316`	`361`	`#endif`
`317`	`362`
`318`	`363`	`pfree(path);`
	`364`	`+`
	`365`	`+/* Register request to unlink first segment later */`
	`366`	`+if (!isRedo)`
	`367`	`+register_unlink(rnode);`
`319`	`368`	`}`
`320`	`369`
`321`	`370`	`/*`
`@@ -1063,6 +1112,91 @@ mdsync(void)`
`1063`	`1112`	`mdsync_in_progress= false;`
`1064`	`1113`	`}`
`1065`	`1114`
	`1115`	`+/*`
	`1116`	`+ * mdpreckpt() -- Do pre-checkpoint work`
	`1117`	`+ *`
	`1118`	`+ * To distinguish unlink requests that arrived before this checkpoint`
	`1119`	`+ * started from those that arrived during the checkpoint, we use a cycle`
	`1120`	`+ * counter similar to the one we use for fsync requests. That cycle`
	`1121`	`+ * counter is incremented here.`
	`1122`	`+ *`
	`1123`	`+ * This must be called before the checkpoint REDO point is determined.`
	`1124`	`+ * That ensures that we won't delete files too soon.`
	`1125`	`+ *`
	`1126`	`+ * Note that we can't do anything here that depends on the assumption`
	`1127`	`+ * that the checkpoint will be completed.`
	`1128`	`+ */`
	`1129`	`+void`
	`1130`	`+mdpreckpt(void)`
	`1131`	`+{`
	`1132`	`+ListCell*cell;`
	`1133`	`+`
	`1134`	`+/*`
	`1135`	`+ * In case the prior checkpoint wasn't completed, stamp all entries in`
	`1136`	`+ * the list with the current cycle counter. Anything that's in the`
	`1137`	`+ * list at the start of checkpoint can surely be deleted after the`
	`1138`	`+ * checkpoint is finished, regardless of when the request was made.`
	`1139`	`+ */`
	`1140`	`+foreach(cell,pendingUnlinks)`
	`1141`	`+{`
	`1142`	`+PendingUnlinkEntryentry= (PendingUnlinkEntry)lfirst(cell);`
	`1143`	`+`
	`1144`	`+entry->cycle_ctr=mdckpt_cycle_ctr;`
	`1145`	`+}`
	`1146`	`+`
	`1147`	`+/*`
	`1148`	`+ * Any unlink requests arriving after this point will be assigned the`
	`1149`	`+ * next cycle counter, and won't be unlinked until next checkpoint.`
	`1150`	`+ */`
	`1151`	`+mdckpt_cycle_ctr++;`
	`1152`	`+}`
	`1153`	`+`
	`1154`	`+/*`
	`1155`	`+ * mdpostckpt() -- Do post-checkpoint work`
	`1156`	`+ *`
	`1157`	`+ * Remove any lingering files that can now be safely removed.`
	`1158`	`+ */`
	`1159`	`+void`
	`1160`	`+mdpostckpt(void)`
	`1161`	`+{`
	`1162`	`+while (pendingUnlinks!=NIL)`
	`1163`	`+{`
	`1164`	`+PendingUnlinkEntryentry= (PendingUnlinkEntry)linitial(pendingUnlinks);`
	`1165`	`+char*path;`
	`1166`	`+`
	`1167`	`+/*`
	`1168`	`+ * New entries are appended to the end, so if the entry is new`
	`1169`	`+ * we've reached the end of old entries.`
	`1170`	`+ */`
	`1171`	`+if (entry->cycle_ctr==mdsync_cycle_ctr)`
	`1172`	`+break;`
	`1173`	`+`
	`1174`	`+/* Else assert we haven't missed it */`
	`1175`	`+Assert((CycleCtr) (entry->cycle_ctr+1)==mdckpt_cycle_ctr);`
	`1176`	`+`
	`1177`	`+/* Unlink the file */`
	`1178`	`+path=relpath(entry->rnode);`
	`1179`	`+if (unlink(path)<0)`
	`1180`	`+{`
	`1181`	`+/*`
	`1182`	`+ * ENOENT shouldn't happen either, but it doesn't really matter`
	`1183`	`+ * because we would've deleted it now anyway.`
	`1184`	`+ */`
	`1185`	`+if (errno!=ENOENT)`
	`1186`	`+ereport(WARNING,`
	`1187`	`+(errcode_for_file_access(),`
	`1188`	`+errmsg("could not remove relation %u/%u/%u: %m",`
	`1189`	`+entry->rnode.spcNode,`
	`1190`	`+entry->rnode.dbNode,`
	`1191`	`+entry->rnode.relNode)));`
	`1192`	`+}`
	`1193`	`+pfree(path);`
	`1194`	`+`
	`1195`	`+pendingUnlinks=list_delete_first(pendingUnlinks);`
	`1196`	`+pfree(entry);`
	`1197`	`+}`
	`1198`	`+}`
	`1199`	`+`
`1066`	`1200`	`/*`
`1067`	`1201`	`* register_dirty_segment() -- Mark a relation segment as needing fsync`
`1068`	`1202`	`*`
`@@ -1096,19 +1230,53 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)`
`1096`	`1230`	`}`
`1097`	`1231`	`}`
`1098`	`1232`
	`1233`	`+/*`
	`1234`	`+ * register_unlink() -- Schedule a file to be deleted after next checkpoint`
	`1235`	`+ *`
	`1236`	`+ * As with register_dirty_segment, this could involve either a local or`
	`1237`	`+ * a remote pending-ops table.`
	`1238`	`+ */`
	`1239`	`+staticvoid`
	`1240`	`+register_unlink(RelFileNodernode)`
	`1241`	`+{`
	`1242`	`+if (pendingOpsTable)`
	`1243`	`+{`
	`1244`	`+/* push it into local pending-ops table */`
	`1245`	`+RememberFsyncRequest(rnode,UNLINK_RELATION_REQUEST);`
	`1246`	`+}`
	`1247`	`+else`
	`1248`	`+{`
	`1249`	`+/*`
	`1250`	`+ * Notify the bgwriter about it. If we fail to queue the request`
	`1251`	`+ * message, we have to sleep and try again, because we can't simply`
	`1252`	`+ * delete the file now. Ugly, but hopefully won't happen often.`
	`1253`	`+ *`
	`1254`	`+ * XXX should we just leave the file orphaned instead?`
	`1255`	`+ */`
	`1256`	`+Assert(IsUnderPostmaster);`
	`1257`	`+while (!ForwardFsyncRequest(rnode,UNLINK_RELATION_REQUEST))`
	`1258`	`+pg_usleep(10000L);/* 10 msec seems a good number */`
	`1259`	`+}`
	`1260`	`+}`
	`1261`	`+`
`1099`	`1262`	`/*`
`1100`	`1263`	`* RememberFsyncRequest() -- callback from bgwriter side of fsync request`
`1101`	`1264`	`*`
`1102`		`- * We stuff the fsync request into the local hash table for execution`
`1103`		`- * during the bgwriter's next checkpoint.`
	`1265`	`+ * We stuff most fsync requests into the local hash table for execution`
	`1266`	`+ * during the bgwriter's next checkpoint. UNLINK requests go into a`
	`1267`	`+ * separate linked list, however, because they get processed separately.`
`1104`	`1268`	`*`
`1105`	`1269`	`* The range of possible segment numbers is way less than the range of`
`1106`	`1270`	`* BlockNumber, so we can reserve high values of segno for special purposes.`
`1107`		`- * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for`
`1108`		`- * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for`
`1109`		`- * a whole database. (These are a tad slow because the hash table has to be`
`1110`		`- * searched linearly, but it doesn't seem worth rethinking the table structure`
`1111`		`- * for them.)`
	`1271`	`+ * We define three:`
	`1272`	`+ * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation`
	`1273`	`+ * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database`
	`1274`	`+ * - UNLINK_RELATION_REQUEST is a request to delete the file after the next`
	`1275`	`+ * checkpoint.`
	`1276`	`+ *`
	`1277`	`+ * (Handling the FORGET_* requests is a tad slow because the hash table has`
	`1278`	`+ * to be searched linearly, but it doesn't seem worth rethinking the table`
	`1279`	`+ * structure for them.)`
`1112`	`1280`	`*/`
`1113`	`1281`	`void`
`1114`	`1282`	`RememberFsyncRequest(RelFileNodernode,BlockNumbersegno)`
`@@ -1147,6 +1315,20 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)`
`1147`	`1315`	`}`
`1148`	`1316`	`}`
`1149`	`1317`	`}`
	`1318`	`+elseif (segno==UNLINK_RELATION_REQUEST)`
	`1319`	`+{`
	`1320`	`+/* Unlink request: put it in the linked list */`
	`1321`	`+MemoryContextoldcxt=MemoryContextSwitchTo(MdCxt);`
	`1322`	`+PendingUnlinkEntry*entry;`
	`1323`	`+`
	`1324`	`+entry=palloc(sizeof(PendingUnlinkEntry));`
	`1325`	`+entry->rnode=rnode;`
	`1326`	`+entry->cycle_ctr=mdckpt_cycle_ctr;`
	`1327`	`+`
	`1328`	`+pendingUnlinks=lappend(pendingUnlinks,entry);`
	`1329`	`+`
	`1330`	`+MemoryContextSwitchTo(oldcxt);`
	`1331`	`+}`
`1150`	`1332`	`else`
`1151`	`1333`	`{`
`1152`	`1334`	`/* Normal case: enter a request to fsync this segment */`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit6cc4451

File tree

5 files changed

5 files changed

`‎src/backend/access/transam/xlog.c‎`

`‎src/backend/commands/tablespace.c‎`

`‎src/backend/storage/smgr/md.c‎`

0 commit comments