Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit e26b0ab

Browse files
committed
Arrange to fsync two-phase-commit state files only during checkpoints;
given reasonably short lifespans for prepared transactions, this should mean that only a small minority of state files ever need to be fsynced at all. Per discussion with Heikki Linnakangas.
1 parent ba90268 commit e26b0ab

File tree

4 files changed

+197
-58
lines changed

4 files changed

+197
-58
lines changed

‎src/backend/access/transam/twophase.c

Lines changed: 189 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
99
* IDENTIFICATION
10-
*$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.3 2005/06/18 19:33:41 tgl Exp $
10+
*$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.4 2005/06/19 20:00:38 tgl Exp $
1111
*
1212
* NOTES
1313
*Each global transaction is associated with a global transaction
@@ -49,12 +49,12 @@
4949
#include"catalog/pg_type.h"
5050
#include"funcapi.h"
5151
#include"miscadmin.h"
52+
#include"pgstat.h"
5253
#include"storage/fd.h"
5354
#include"storage/proc.h"
5455
#include"storage/procarray.h"
5556
#include"storage/smgr.h"
5657
#include"utils/builtins.h"
57-
#include"pgstat.h"
5858

5959

6060
/*
@@ -105,6 +105,7 @@ typedef struct GlobalTransactionData
105105
{
106106
PGPROCproc;/* dummy proc */
107107
TimestampTzprepared_at;/* time of preparation */
108+
XLogRecPtrprepare_lsn;/* XLOG offset of prepare record */
108109
AclIdowner;/* ID of user that executed the xact */
109110
TransactionIdlocking_xid;/* top-level XID of backend working on xact */
110111
boolvalid;/* TRUE if fully prepared */
@@ -281,6 +282,9 @@ MarkAsPreparing(TransactionId xid, const char *gid,
281282
gxact->proc.subxids.nxids=0;
282283

283284
gxact->prepared_at=prepared_at;
285+
/* initialize LSN to 0 (start of WAL) */
286+
gxact->prepare_lsn.xlogid=0;
287+
gxact->prepare_lsn.xrecoff=0;
284288
gxact->owner=owner;
285289
gxact->locking_xid=xid;
286290
gxact->valid= false;
@@ -324,7 +328,7 @@ GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts,
324328
* MarkAsPrepared
325329
*Mark the GXACT as fully valid, and enter it into the global ProcArray.
326330
*/
327-
void
331+
staticvoid
328332
MarkAsPrepared(GlobalTransactiongxact)
329333
{
330334
/* Lock here may be overkill, but I'm not convinced of that ... */
@@ -433,6 +437,40 @@ RemoveGXact(GlobalTransaction gxact)
433437
elog(ERROR,"failed to find %p in GlobalTransaction array",gxact);
434438
}
435439

440+
/*
441+
* TransactionIdIsPrepared
442+
*True iff transaction associated with the identifier is prepared
443+
* for two-phase commit
444+
*
445+
* Note: only gxacts marked "valid" are considered; but notice we do not
446+
* check the locking status.
447+
*
448+
* This is not currently exported, because it is only needed internally.
449+
*/
450+
staticbool
451+
TransactionIdIsPrepared(TransactionIdxid)
452+
{
453+
boolresult= false;
454+
inti;
455+
456+
LWLockAcquire(TwoPhaseStateLock,LW_SHARED);
457+
458+
for (i=0;i<TwoPhaseState->numPrepXacts;i++)
459+
{
460+
GlobalTransactiongxact=TwoPhaseState->prepXacts[i];
461+
462+
if (gxact->valid&&gxact->proc.xid==xid)
463+
{
464+
result= true;
465+
break;
466+
}
467+
}
468+
469+
LWLockRelease(TwoPhaseStateLock);
470+
471+
returnresult;
472+
}
473+
436474
/*
437475
* Returns an array of all prepared transactions for the user-level
438476
* function pg_prepared_xact.
@@ -790,7 +828,6 @@ EndPrepare(GlobalTransaction gxact)
790828
TwoPhaseFileHeader*hdr;
791829
charpath[MAXPGPATH];
792830
XLogRecData*record;
793-
XLogRecPtrrecptr;
794831
pg_crc32statefile_crc;
795832
pg_crc32bogus_crc;
796833
intfd;
@@ -841,14 +878,9 @@ EndPrepare(GlobalTransaction gxact)
841878
FIN_CRC32(statefile_crc);
842879

843880
/*
844-
* Write a deliberately bogus CRC to the state file, and flush it to disk.
845-
* This is to minimize the odds of failure within the critical section
846-
* below --- in particular, running out of disk space.
847-
*
848-
* On most filesystems, write() rather than fsync() detects out-of-space,
849-
* so the fsync might be considered optional. Using it means there
850-
* are three fsyncs not two associated with preparing a transaction; is
851-
* the risk of an error from fsync high enough to justify that?
881+
* Write a deliberately bogus CRC to the state file; this is just
882+
* paranoia to catch the case where four more bytes will run us out of
883+
* disk space.
852884
*/
853885
bogus_crc= ~statefile_crc;
854886

@@ -860,14 +892,6 @@ EndPrepare(GlobalTransaction gxact)
860892
errmsg("could not write twophase state file: %m")));
861893
}
862894

863-
if (pg_fsync(fd)!=0)
864-
{
865-
close(fd);
866-
ereport(ERROR,
867-
(errcode_for_file_access(),
868-
errmsg("could not fsync twophase state file: %m")));
869-
}
870-
871895
/* Back up to prepare for rewriting the CRC */
872896
if (lseek(fd,-((off_t)sizeof(pg_crc32)),SEEK_CUR)<0)
873897
{
@@ -881,28 +905,34 @@ EndPrepare(GlobalTransaction gxact)
881905
* The state file isn't valid yet, because we haven't written the correct
882906
* CRC yet. Before we do that, insert entry in WAL and flush it to disk.
883907
*
884-
* Between the time we have written the WAL entry and the time we
885-
* flush the correct state file CRC to disk, we have an inconsistency:
886-
* the xact is prepared according to WAL but not according to our on-disk
887-
* state. We use a critical section to force a PANIC if we are unable to
888-
* complete the flush --- then, WAL replay should repair the
889-
* inconsistency.
908+
* Between the time we have written the WAL entry and the time we write
909+
* out the correct state file CRC, we have an inconsistency: the xact is
910+
* prepared according to WAL but not according to our on-disk state.
911+
* We use a critical section to force a PANIC if we are unable to complete
912+
* the write --- then, WAL replay should repair the inconsistency. The
913+
* odds of a PANIC actually occurring should be very tiny given that we
914+
* were able to write the bogus CRC above.
890915
*
891916
* We have to lock out checkpoint start here, too; otherwise a checkpoint
892917
* starting immediately after the WAL record is inserted could complete
893-
* before we've finished flushing, meaning that the WAL record would not
894-
* get replayed if a crash follows.
918+
* without fsync'ing our state file. (This is essentially the same kind
919+
* of race condition as the COMMIT-to-clog-write case that
920+
* RecordTransactionCommit uses CheckpointStartLock for; see notes there.)
921+
*
922+
* We save the PREPARE record's location in the gxact for later use by
923+
* CheckPointTwoPhase.
895924
*/
896925
START_CRIT_SECTION();
897926

898927
LWLockAcquire(CheckpointStartLock,LW_SHARED);
899928

900-
recptr=XLogInsert(RM_XACT_ID,XLOG_XACT_PREPARE,records.head);
901-
XLogFlush(recptr);
929+
gxact->prepare_lsn=XLogInsert(RM_XACT_ID,XLOG_XACT_PREPARE,
930+
records.head);
931+
XLogFlush(gxact->prepare_lsn);
902932

903933
/* If we crash now, we have prepared: WAL replay will fix things */
904934

905-
/* write correct CRC, flush, and close file */
935+
/* write correct CRC and close file */
906936
if ((write(fd,&statefile_crc,sizeof(pg_crc32)))!=sizeof(pg_crc32))
907937
{
908938
close(fd);
@@ -911,19 +941,29 @@ EndPrepare(GlobalTransaction gxact)
911941
errmsg("could not write twophase state file: %m")));
912942
}
913943

914-
if (pg_fsync(fd)!=0)
915-
{
916-
close(fd);
917-
ereport(ERROR,
918-
(errcode_for_file_access(),
919-
errmsg("could not fsync twophase state file: %m")));
920-
}
921-
922944
if (close(fd)!=0)
923945
ereport(ERROR,
924946
(errcode_for_file_access(),
925947
errmsg("could not close twophase state file: %m")));
926948

949+
/*
950+
* Mark the prepared transaction as valid. As soon as xact.c marks
951+
* MyProc as not running our XID (which it will do immediately after
952+
* this function returns), others can commit/rollback the xact.
953+
*
954+
* NB: a side effect of this is to make a dummy ProcArray entry for the
955+
* prepared XID. This must happen before we clear the XID from MyProc,
956+
* else there is a window where the XID is not running according to
957+
* TransactionIdInProgress, and onlookers would be entitled to assume
958+
* the xact crashed. Instead we have a window where the same XID
959+
* appears twice in ProcArray, which is OK.
960+
*/
961+
MarkAsPrepared(gxact);
962+
963+
/*
964+
* Now we can release the checkpoint start lock: a checkpoint starting
965+
* after this will certainly see the gxact as a candidate for fsyncing.
966+
*/
927967
LWLockRelease(CheckpointStartLock);
928968

929969
END_CRIT_SECTION();
@@ -1119,6 +1159,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
11191159
* In case we fail while running the callbacks, mark the gxact invalid
11201160
* so no one else will try to commit/rollback, and so it can be recycled
11211161
* properly later. It is still locked by our XID so it won't go away yet.
1162+
*
1163+
* (We assume it's safe to do this without taking TwoPhaseStateLock.)
11221164
*/
11231165
gxact->valid= false;
11241166

@@ -1248,7 +1290,10 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
12481290
errmsg("could not write twophase state file: %m")));
12491291
}
12501292

1251-
/* Sync and close the file */
1293+
/*
1294+
* We must fsync the file because the end-of-replay checkpoint will
1295+
* not do so, there being no GXACT in shared memory yet to tell it to.
1296+
*/
12521297
if (pg_fsync(fd)!=0)
12531298
{
12541299
close(fd);
@@ -1263,6 +1308,103 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
12631308
errmsg("could not close twophase state file: %m")));
12641309
}
12651310

1311+
/*
1312+
* CheckPointTwoPhase -- handle 2PC component of checkpointing.
1313+
*
1314+
* We must fsync the state file of any GXACT that is valid and has a PREPARE
1315+
* LSN <= the checkpoint's redo horizon. (If the gxact isn't valid yet or
1316+
* has a later LSN, this checkpoint is not responsible for fsyncing it.)
1317+
*
1318+
* This is deliberately run as late as possible in the checkpoint sequence,
1319+
* because GXACTs ordinarily have short lifespans, and so it is quite
1320+
* possible that GXACTs that were valid at checkpoint start will no longer
1321+
* exist if we wait a little bit.
1322+
*
1323+
* If a GXACT remains valid across multiple checkpoints, it'll be fsynced
1324+
* each time. This is considered unusual enough that we don't bother to
1325+
* expend any extra code to avoid the redundant fsyncs. (They should be
1326+
* reasonably cheap anyway, since they won't cause I/O.)
1327+
*/
1328+
void
1329+
CheckPointTwoPhase(XLogRecPtrredo_horizon)
1330+
{
1331+
TransactionId*xids;
1332+
intnxids;
1333+
charpath[MAXPGPATH];
1334+
inti;
1335+
1336+
/*
1337+
* We don't want to hold the TwoPhaseStateLock while doing I/O,
1338+
* so we grab it just long enough to make a list of the XIDs that
1339+
* require fsyncing, and then do the I/O afterwards.
1340+
*
1341+
* This approach creates a race condition: someone else could delete
1342+
* a GXACT between the time we release TwoPhaseStateLock and the time
1343+
* we try to open its state file. We handle this by special-casing
1344+
* ENOENT failures: if we see that, we verify that the GXACT is no
1345+
* longer valid, and if so ignore the failure.
1346+
*/
1347+
if (max_prepared_xacts <=0)
1348+
return;/* nothing to do */
1349+
xids= (TransactionId*)palloc(max_prepared_xacts*sizeof(TransactionId));
1350+
nxids=0;
1351+
1352+
LWLockAcquire(TwoPhaseStateLock,LW_SHARED);
1353+
1354+
for (i=0;i<TwoPhaseState->numPrepXacts;i++)
1355+
{
1356+
GlobalTransactiongxact=TwoPhaseState->prepXacts[i];
1357+
1358+
if (gxact->valid&&
1359+
XLByteLE(gxact->prepare_lsn,redo_horizon))
1360+
xids[nxids++]=gxact->proc.xid;
1361+
}
1362+
1363+
LWLockRelease(TwoPhaseStateLock);
1364+
1365+
for (i=0;i<nxids;i++)
1366+
{
1367+
TransactionIdxid=xids[i];
1368+
intfd;
1369+
1370+
TwoPhaseFilePath(path,xid);
1371+
1372+
fd=BasicOpenFile(path,O_RDWR |PG_BINARY,0);
1373+
if (fd<0)
1374+
{
1375+
if (errno==ENOENT)
1376+
{
1377+
/* OK if gxact is no longer valid */
1378+
if (!TransactionIdIsPrepared(xid))
1379+
continue;
1380+
/* Restore errno in case it was changed */
1381+
errno=ENOENT;
1382+
}
1383+
ereport(ERROR,
1384+
(errcode_for_file_access(),
1385+
errmsg("could not open twophase state file \"%s\": %m",
1386+
path)));
1387+
}
1388+
1389+
if (pg_fsync(fd)!=0)
1390+
{
1391+
close(fd);
1392+
ereport(ERROR,
1393+
(errcode_for_file_access(),
1394+
errmsg("could not fsync twophase state file \"%s\": %m",
1395+
path)));
1396+
}
1397+
1398+
if (close(fd)!=0)
1399+
ereport(ERROR,
1400+
(errcode_for_file_access(),
1401+
errmsg("could not close twophase state file \"%s\": %m",
1402+
path)));
1403+
}
1404+
1405+
pfree(xids);
1406+
}
1407+
12661408
/*
12671409
* PrescanPreparedTransactions
12681410
*
@@ -1492,6 +1634,13 @@ RecoverPreparedTransactions(void)
14921634

14931635
/*
14941636
* Recreate its GXACT and dummy PGPROC
1637+
*
1638+
* Note: since we don't have the PREPARE record's WAL location
1639+
* at hand, we leave prepare_lsn zeroes. This means the GXACT
1640+
* will be fsync'd on every future checkpoint. We assume this
1641+
* situation is infrequent enough that the performance cost is
1642+
* negligible (especially since we know the state file has
1643+
* already been fsynced).
14951644
*/
14961645
gxact=MarkAsPreparing(xid,hdr->gid,
14971646
hdr->prepared_at,

‎src/backend/access/transam/xact.c

Lines changed: 1 addition & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
*
1111
*
1212
* IDENTIFICATION
13-
* $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.206 2005/06/18 19:33:41 tgl Exp $
13+
* $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.207 2005/06/19 20:00:38 tgl Exp $
1414
*
1515
*-------------------------------------------------------------------------
1616
*/
@@ -1741,19 +1741,6 @@ PrepareTransaction(void)
17411741
*/
17421742
EndPrepare(gxact);
17431743

1744-
/*
1745-
* Mark the prepared transaction as valid. As soon as we mark ourselves
1746-
* not running in MyProc below, others can commit/rollback the xact.
1747-
*
1748-
* NB: a side effect of this is to make a dummy ProcArray entry for the
1749-
* prepared XID. This must happen before we clear the XID from MyProc,
1750-
* else there is a window where the XID is not running according to
1751-
* TransactionIdInProgress, and onlookers would be entitled to assume
1752-
* the xact crashed. Instead we have a window where the same XID
1753-
* appears twice in ProcArray, which is OK.
1754-
*/
1755-
MarkAsPrepared(gxact);
1756-
17571744
/*
17581745
* Now we clean up backend-internal state and release internal
17591746
* resources.

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp