Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit6cc4451

Browse files
committed
Prevent re-use of a deleted relation's relfilenode until after the next
checkpoint. This guards against an unlikely data-loss scenario in whichwe re-use the relfilenode, then crash, then replay the deletion andrecreation of the file. Even then we'd be OK if all insertions into thenew relation had been WAL-logged ... but that's not guaranteed given allthe no-WAL-logging optimizations that have recently been added.Patch by Heikki Linnakangas, per a discussion last month.
1 parent7a550cb commit6cc4451

File tree

5 files changed

+274
-25
lines changed

5 files changed

+274
-25
lines changed

‎src/backend/access/transam/xlog.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.286 2007/10/12 19:39:59 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.287 2007/11/15 20:36:40 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -45,6 +45,7 @@
4545
#include"storage/fd.h"
4646
#include"storage/pmsignal.h"
4747
#include"storage/procarray.h"
48+
#include"storage/smgr.h"
4849
#include"storage/spin.h"
4950
#include"utils/builtins.h"
5051
#include"utils/pg_locale.h"
@@ -5663,6 +5664,14 @@ CreateCheckPoint(int flags)
56635664
UpdateControlFile();
56645665
}
56655666

5667+
/*
5668+
* Let smgr prepare for checkpoint; this has to happen before we
5669+
* determine the REDO pointer. Note that smgr must not do anything
5670+
* that'd have to be undone if we decide no checkpoint is needed.
5671+
*/
5672+
smgrpreckpt();
5673+
5674+
/* Begin filling in the checkpoint WAL record */
56665675
MemSet(&checkPoint,0,sizeof(checkPoint));
56675676
checkPoint.ThisTimeLineID=ThisTimeLineID;
56685677
checkPoint.time=time(NULL);
@@ -5886,6 +5895,11 @@ CreateCheckPoint(int flags)
58865895
*/
58875896
END_CRIT_SECTION();
58885897

5898+
/*
5899+
* Let smgr do post-checkpoint cleanup (eg, deleting old files).
5900+
*/
5901+
smgrpostckpt();
5902+
58895903
/*
58905904
* Delete old log files (those no longer needed even for previous
58915905
* checkpoint).

‎src/backend/commands/tablespace.c

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
*
3838
*
3939
* IDENTIFICATION
40-
* $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.49 2007/08/01 22:45:08 tgl Exp $
40+
* $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.50 2007/11/15 20:36:40 tgl Exp $
4141
*
4242
*-------------------------------------------------------------------------
4343
*/
@@ -57,6 +57,7 @@
5757
#include"commands/comment.h"
5858
#include"commands/tablespace.h"
5959
#include"miscadmin.h"
60+
#include"postmaster/bgwriter.h"
6061
#include"storage/fd.h"
6162
#include"utils/acl.h"
6263
#include"utils/builtins.h"
@@ -460,13 +461,29 @@ DropTableSpace(DropTableSpaceStmt *stmt)
460461
LWLockAcquire(TablespaceCreateLock,LW_EXCLUSIVE);
461462

462463
/*
463-
* Try to remove the physical infrastructure
464+
* Try to remove the physical infrastructure.
464465
*/
465466
if (!remove_tablespace_directories(tablespaceoid, false))
466-
ereport(ERROR,
467-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
468-
errmsg("tablespace \"%s\" is not empty",
469-
tablespacename)));
467+
{
468+
/*
469+
* Not all files deleted? However, there can be lingering empty files
470+
* in the directories, left behind by for example DROP TABLE, that
471+
* have been scheduled for deletion at next checkpoint (see comments
472+
* in mdunlink() for details). We could just delete them immediately,
473+
* but we can't tell them apart from important data files that we
474+
* mustn't delete. So instead, we force a checkpoint which will clean
475+
* out any lingering files, and try again.
476+
*/
477+
RequestCheckpoint(CHECKPOINT_IMMEDIATE |CHECKPOINT_FORCE |CHECKPOINT_WAIT);
478+
if (!remove_tablespace_directories(tablespaceoid, false))
479+
{
480+
/* Still not empty, the files must be important then */
481+
ereport(ERROR,
482+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
483+
errmsg("tablespace \"%s\" is not empty",
484+
tablespacename)));
485+
}
486+
}
470487

471488
/* Record the filesystem change in XLOG */
472489
{

‎src/backend/storage/smgr/md.c

Lines changed: 193 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.129 2007/07/03 14:51:24 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.130 2007/11/15 20:36:40 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -34,6 +34,7 @@
3434
/* special values for the segno arg to RememberFsyncRequest */
3535
#defineFORGET_RELATION_FSYNC(InvalidBlockNumber)
3636
#defineFORGET_DATABASE_FSYNC(InvalidBlockNumber-1)
37+
#defineUNLINK_RELATION_REQUEST(InvalidBlockNumber-2)
3738

3839
/*
3940
* On Windows, we have to interpret EACCES as possibly meaning the same as
@@ -113,6 +114,10 @@ static MemoryContext MdCxt;/* context for all md.c allocations */
113114
* table remembers the pending operations.We use a hash table mostly as
114115
* a convenient way of eliminating duplicate requests.
115116
*
117+
* We use a similar mechanism to remember no-longer-needed files that can
118+
* be deleted after the next checkpoint, but we use a linked list instead of
119+
* a hash table, because we don't expect there to be any duplicate requests.
120+
*
116121
* (Regular backends do not track pending operations locally, but forward
117122
* them to the bgwriter.)
118123
*/
@@ -131,9 +136,17 @@ typedef struct
131136
CycleCtrcycle_ctr;/* mdsync_cycle_ctr when request was made */
132137
}PendingOperationEntry;
133138

139+
typedefstruct
140+
{
141+
RelFileNodernode;/* the dead relation to delete */
142+
CycleCtrcycle_ctr;/* mdckpt_cycle_ctr when request was made */
143+
}PendingUnlinkEntry;
144+
134145
staticHTAB*pendingOpsTable=NULL;
146+
staticList*pendingUnlinks=NIL;
135147

136148
staticCycleCtrmdsync_cycle_ctr=0;
149+
staticCycleCtrmdckpt_cycle_ctr=0;
137150

138151

139152
typedefenum/* behavior for mdopen & _mdfd_getseg */
@@ -146,6 +159,7 @@ typedef enum/* behavior for mdopen & _mdfd_getseg */
146159
/* local routines */
147160
staticMdfdVec*mdopen(SMgrRelationreln,ExtensionBehaviorbehavior);
148161
staticvoidregister_dirty_segment(SMgrRelationreln,MdfdVec*seg);
162+
staticvoidregister_unlink(RelFileNodernode);
149163
staticMdfdVec*_fdvec_alloc(void);
150164

151165
#ifndefLET_OS_MANAGE_FILESIZE
@@ -188,6 +202,7 @@ mdinit(void)
188202
100L,
189203
&hash_ctl,
190204
HASH_ELEM |HASH_FUNCTION |HASH_CONTEXT);
205+
pendingUnlinks=NIL;
191206
}
192207
}
193208

@@ -254,14 +269,37 @@ mdcreate(SMgrRelation reln, bool isRedo)
254269
* Note that we're passed a RelFileNode --- by the time this is called,
255270
* there won't be an SMgrRelation hashtable entry anymore.
256271
*
272+
* Actually, we don't unlink the first segment file of the relation, but
273+
* just truncate it to zero length, and record a request to unlink it after
274+
* the next checkpoint. Additional segments can be unlinked immediately,
275+
* however. Leaving the empty file in place prevents that relfilenode
276+
* number from being reused. The scenario this protects us from is:
277+
* 1. We delete a relation (and commit, and actually remove its file).
278+
* 2. We create a new relation, which by chance gets the same relfilenode as
279+
* the just-deleted one (OIDs must've wrapped around for that to happen).
280+
* 3. We crash before another checkpoint occurs.
281+
* During replay, we would delete the file and then recreate it, which is fine
282+
* if the contents of the file were repopulated by subsequent WAL entries.
283+
* But if we didn't WAL-log insertions, but instead relied on fsyncing the
284+
* file after populating it (as for instance CLUSTER and CREATE INDEX do),
285+
* the contents of the file would be lost forever. By leaving the empty file
286+
* until after the next checkpoint, we prevent reassignment of the relfilenode
287+
* number until it's safe, because relfilenode assignment skips over any
288+
* existing file.
289+
*
257290
* If isRedo is true, it's okay for the relation to be already gone.
258-
* Also, any failure should be reported as WARNING not ERROR, because
291+
* Also, we should remove the file immediately instead of queuing a request
292+
* for later, since during redo there's no possibility of creating a
293+
* conflicting relation.
294+
*
295+
* Note: any failure should be reported as WARNING not ERROR, because
259296
* we are usually not in a transaction anymore when this is called.
260297
*/
261298
void
262299
mdunlink(RelFileNodernode,boolisRedo)
263300
{
264301
char*path;
302+
intret;
265303

266304
/*
267305
* We have to clean out any pending fsync requests for the doomed relation,
@@ -271,8 +309,15 @@ mdunlink(RelFileNode rnode, bool isRedo)
271309

272310
path=relpath(rnode);
273311

274-
/* Delete the first segment, or only segment if not doing segmenting */
275-
if (unlink(path)<0)
312+
/*
313+
* Delete or truncate the first segment, or only segment if not doing
314+
* segmenting
315+
*/
316+
if (isRedo)
317+
ret=unlink(path);
318+
else
319+
ret=truncate(path,0);
320+
if (ret<0)
276321
{
277322
if (!isRedo||errno!=ENOENT)
278323
ereport(WARNING,
@@ -316,6 +361,10 @@ mdunlink(RelFileNode rnode, bool isRedo)
316361
#endif
317362

318363
pfree(path);
364+
365+
/* Register request to unlink first segment later */
366+
if (!isRedo)
367+
register_unlink(rnode);
319368
}
320369

321370
/*
@@ -1063,6 +1112,91 @@ mdsync(void)
10631112
mdsync_in_progress= false;
10641113
}
10651114

1115+
/*
1116+
* mdpreckpt() -- Do pre-checkpoint work
1117+
*
1118+
* To distinguish unlink requests that arrived before this checkpoint
1119+
* started from those that arrived during the checkpoint, we use a cycle
1120+
* counter similar to the one we use for fsync requests. That cycle
1121+
* counter is incremented here.
1122+
*
1123+
* This must be called *before* the checkpoint REDO point is determined.
1124+
* That ensures that we won't delete files too soon.
1125+
*
1126+
* Note that we can't do anything here that depends on the assumption
1127+
* that the checkpoint will be completed.
1128+
*/
1129+
void
1130+
mdpreckpt(void)
1131+
{
1132+
ListCell*cell;
1133+
1134+
/*
1135+
* In case the prior checkpoint wasn't completed, stamp all entries in
1136+
* the list with the current cycle counter. Anything that's in the
1137+
* list at the start of checkpoint can surely be deleted after the
1138+
* checkpoint is finished, regardless of when the request was made.
1139+
*/
1140+
foreach(cell,pendingUnlinks)
1141+
{
1142+
PendingUnlinkEntry*entry= (PendingUnlinkEntry*)lfirst(cell);
1143+
1144+
entry->cycle_ctr=mdckpt_cycle_ctr;
1145+
}
1146+
1147+
/*
1148+
* Any unlink requests arriving after this point will be assigned the
1149+
* next cycle counter, and won't be unlinked until next checkpoint.
1150+
*/
1151+
mdckpt_cycle_ctr++;
1152+
}
1153+
1154+
/*
1155+
* mdpostckpt() -- Do post-checkpoint work
1156+
*
1157+
* Remove any lingering files that can now be safely removed.
1158+
*/
1159+
void
1160+
mdpostckpt(void)
1161+
{
1162+
while (pendingUnlinks!=NIL)
1163+
{
1164+
PendingUnlinkEntry*entry= (PendingUnlinkEntry*)linitial(pendingUnlinks);
1165+
char*path;
1166+
1167+
/*
1168+
* New entries are appended to the end, so if the entry is new
1169+
* we've reached the end of old entries.
1170+
*/
1171+
if (entry->cycle_ctr==mdsync_cycle_ctr)
1172+
break;
1173+
1174+
/* Else assert we haven't missed it */
1175+
Assert((CycleCtr) (entry->cycle_ctr+1)==mdckpt_cycle_ctr);
1176+
1177+
/* Unlink the file */
1178+
path=relpath(entry->rnode);
1179+
if (unlink(path)<0)
1180+
{
1181+
/*
1182+
* ENOENT shouldn't happen either, but it doesn't really matter
1183+
* because we would've deleted it now anyway.
1184+
*/
1185+
if (errno!=ENOENT)
1186+
ereport(WARNING,
1187+
(errcode_for_file_access(),
1188+
errmsg("could not remove relation %u/%u/%u: %m",
1189+
entry->rnode.spcNode,
1190+
entry->rnode.dbNode,
1191+
entry->rnode.relNode)));
1192+
}
1193+
pfree(path);
1194+
1195+
pendingUnlinks=list_delete_first(pendingUnlinks);
1196+
pfree(entry);
1197+
}
1198+
}
1199+
10661200
/*
10671201
* register_dirty_segment() -- Mark a relation segment as needing fsync
10681202
*
@@ -1096,19 +1230,53 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
10961230
}
10971231
}
10981232

1233+
/*
1234+
* register_unlink() -- Schedule a file to be deleted after next checkpoint
1235+
*
1236+
* As with register_dirty_segment, this could involve either a local or
1237+
* a remote pending-ops table.
1238+
*/
1239+
staticvoid
1240+
register_unlink(RelFileNodernode)
1241+
{
1242+
if (pendingOpsTable)
1243+
{
1244+
/* push it into local pending-ops table */
1245+
RememberFsyncRequest(rnode,UNLINK_RELATION_REQUEST);
1246+
}
1247+
else
1248+
{
1249+
/*
1250+
* Notify the bgwriter about it. If we fail to queue the request
1251+
* message, we have to sleep and try again, because we can't simply
1252+
* delete the file now. Ugly, but hopefully won't happen often.
1253+
*
1254+
* XXX should we just leave the file orphaned instead?
1255+
*/
1256+
Assert(IsUnderPostmaster);
1257+
while (!ForwardFsyncRequest(rnode,UNLINK_RELATION_REQUEST))
1258+
pg_usleep(10000L);/* 10 msec seems a good number */
1259+
}
1260+
}
1261+
10991262
/*
11001263
* RememberFsyncRequest() -- callback from bgwriter side of fsync request
11011264
*
1102-
* We stuff the fsync request into the local hash table for execution
1103-
* during the bgwriter's next checkpoint.
1265+
* We stuff most fsync requests into the local hash table for execution
1266+
* during the bgwriter's next checkpoint. UNLINK requests go into a
1267+
* separate linked list, however, because they get processed separately.
11041268
*
11051269
* The range of possible segment numbers is way less than the range of
11061270
* BlockNumber, so we can reserve high values of segno for special purposes.
1107-
* We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
1108-
* a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
1109-
* a whole database. (These are a tad slow because the hash table has to be
1110-
* searched linearly, but it doesn't seem worth rethinking the table structure
1111-
* for them.)
1271+
* We define three:
1272+
* - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1273+
* - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1274+
* - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1275+
* checkpoint.
1276+
*
1277+
* (Handling the FORGET_* requests is a tad slow because the hash table has
1278+
* to be searched linearly, but it doesn't seem worth rethinking the table
1279+
* structure for them.)
11121280
*/
11131281
void
11141282
RememberFsyncRequest(RelFileNodernode,BlockNumbersegno)
@@ -1147,6 +1315,20 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
11471315
}
11481316
}
11491317
}
1318+
elseif (segno==UNLINK_RELATION_REQUEST)
1319+
{
1320+
/* Unlink request: put it in the linked list */
1321+
MemoryContextoldcxt=MemoryContextSwitchTo(MdCxt);
1322+
PendingUnlinkEntry*entry;
1323+
1324+
entry=palloc(sizeof(PendingUnlinkEntry));
1325+
entry->rnode=rnode;
1326+
entry->cycle_ctr=mdckpt_cycle_ctr;
1327+
1328+
pendingUnlinks=lappend(pendingUnlinks,entry);
1329+
1330+
MemoryContextSwitchTo(oldcxt);
1331+
}
11501332
else
11511333
{
11521334
/* Normal case: enter a request to fsync this segment */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp