77 * Portions Copyright (c) 1994, Regents of the University of California
88 *
99 * IDENTIFICATION
10- *$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.3 2005/06/18 19:33:41 tgl Exp $
10+ *$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.4 2005/06/19 20:00:38 tgl Exp $
1111 *
1212 * NOTES
1313 *Each global transaction is associated with a global transaction
4949#include "catalog/pg_type.h"
5050#include "funcapi.h"
5151#include "miscadmin.h"
52+ #include "pgstat.h"
5253#include "storage/fd.h"
5354#include "storage/proc.h"
5455#include "storage/procarray.h"
5556#include "storage/smgr.h"
5657#include "utils/builtins.h"
57- #include "pgstat.h"
5858
5959
6060/*
@@ -105,6 +105,7 @@ typedef struct GlobalTransactionData
105105{
106106PGPROC proc ;/* dummy proc */
107107TimestampTz prepared_at ;/* time of preparation */
108+ XLogRecPtr prepare_lsn ;/* XLOG offset of prepare record */
108109AclId owner ;/* ID of user that executed the xact */
109110TransactionId locking_xid ;/* top-level XID of backend working on xact */
110111bool valid ;/* TRUE if fully prepared */
@@ -281,6 +282,9 @@ MarkAsPreparing(TransactionId xid, const char *gid,
281282gxact -> proc .subxids .nxids = 0 ;
282283
283284gxact -> prepared_at = prepared_at ;
285+ /* initialize LSN to 0 (start of WAL) */
286+ gxact -> prepare_lsn .xlogid = 0 ;
287+ gxact -> prepare_lsn .xrecoff = 0 ;
284288gxact -> owner = owner ;
285289gxact -> locking_xid = xid ;
286290gxact -> valid = false;
@@ -324,7 +328,7 @@ GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts,
324328 * MarkAsPrepared
325329 *Mark the GXACT as fully valid, and enter it into the global ProcArray.
326330 */
327- void
331+ static void
328332MarkAsPrepared (GlobalTransaction gxact )
329333{
330334/* Lock here may be overkill, but I'm not convinced of that ... */
@@ -433,6 +437,40 @@ RemoveGXact(GlobalTransaction gxact)
433437elog (ERROR ,"failed to find %p in GlobalTransaction array" ,gxact );
434438}
435439
440+ /*
441+ * TransactionIdIsPrepared
442+ *True iff transaction associated with the identifier is prepared
443+ * for two-phase commit
444+ *
445+ * Note: only gxacts marked "valid" are considered; but notice we do not
446+ * check the locking status.
447+ *
448+ * This is not currently exported, because it is only needed internally.
449+ */
450+ static bool
451+ TransactionIdIsPrepared (TransactionId xid )
452+ {
453+ bool result = false;
454+ int i ;
455+
456+ LWLockAcquire (TwoPhaseStateLock ,LW_SHARED );
457+
458+ for (i = 0 ;i < TwoPhaseState -> numPrepXacts ;i ++ )
459+ {
460+ GlobalTransaction gxact = TwoPhaseState -> prepXacts [i ];
461+
462+ if (gxact -> valid && gxact -> proc .xid == xid )
463+ {
464+ result = true;
465+ break ;
466+ }
467+ }
468+
469+ LWLockRelease (TwoPhaseStateLock );
470+
471+ return result ;
472+ }
473+
436474/*
437475 * Returns an array of all prepared transactions for the user-level
438476 * function pg_prepared_xact.
@@ -790,7 +828,6 @@ EndPrepare(GlobalTransaction gxact)
790828TwoPhaseFileHeader * hdr ;
791829char path [MAXPGPATH ];
792830XLogRecData * record ;
793- XLogRecPtr recptr ;
794831pg_crc32 statefile_crc ;
795832pg_crc32 bogus_crc ;
796833int fd ;
@@ -841,14 +878,9 @@ EndPrepare(GlobalTransaction gxact)
841878FIN_CRC32 (statefile_crc );
842879
843880/*
844- * Write a deliberately bogus CRC to the state file, and flush it to disk.
845- * This is to minimize the odds of failure within the critical section
846- * below --- in particular, running out of disk space.
847- *
848- * On most filesystems, write() rather than fsync() detects out-of-space,
849- * so the fsync might be considered optional. Using it means there
850- * are three fsyncs not two associated with preparing a transaction; is
851- * the risk of an error from fsync high enough to justify that?
881+ * Write a deliberately bogus CRC to the state file; this is just
882+ * paranoia to catch the case where four more bytes will run us out of
883+ * disk space.
852884 */
853885bogus_crc = ~statefile_crc ;
854886
@@ -860,14 +892,6 @@ EndPrepare(GlobalTransaction gxact)
860892errmsg ("could not write twophase state file: %m" )));
861893}
862894
863- if (pg_fsync (fd )!= 0 )
864- {
865- close (fd );
866- ereport (ERROR ,
867- (errcode_for_file_access (),
868- errmsg ("could not fsync twophase state file: %m" )));
869- }
870-
871895/* Back up to prepare for rewriting the CRC */
872896if (lseek (fd ,- ((off_t )sizeof (pg_crc32 )),SEEK_CUR )< 0 )
873897{
@@ -881,28 +905,34 @@ EndPrepare(GlobalTransaction gxact)
881905 * The state file isn't valid yet, because we haven't written the correct
882906 * CRC yet. Before we do that, insert entry in WAL and flush it to disk.
883907 *
884- * Between the time we have written the WAL entry and the time we
885- * flush the correct state file CRC to disk, we have an inconsistency:
886- * the xact is prepared according to WAL but not according to our on-disk
887- * state. We use a critical section to force a PANIC if we are unable to
888- * complete the flush --- then, WAL replay should repair the
889- * inconsistency.
908+ * Between the time we have written the WAL entry and the time we write
909+ * out the correct state file CRC, we have an inconsistency: the xact is
910+ * prepared according to WAL but not according to our on-disk state.
911+ * We use a critical section to force a PANIC if we are unable to complete
912+ * the write --- then, WAL replay should repair the inconsistency. The
913+ * odds of a PANIC actually occurring should be very tiny given that we
914+ * were able to write the bogus CRC above.
890915 *
891916 * We have to lock out checkpoint start here, too; otherwise a checkpoint
892917 * starting immediately after the WAL record is inserted could complete
893- * before we've finished flushing, meaning that the WAL record would not
894- * get replayed if a crash follows.
918+ * without fsync'ing our state file. (This is essentially the same kind
919+ * of race condition as the COMMIT-to-clog-write case that
920+ * RecordTransactionCommit uses CheckpointStartLock for; see notes there.)
921+ *
922+ * We save the PREPARE record's location in the gxact for later use by
923+ * CheckPointTwoPhase.
895924 */
896925START_CRIT_SECTION ();
897926
898927LWLockAcquire (CheckpointStartLock ,LW_SHARED );
899928
900- recptr = XLogInsert (RM_XACT_ID ,XLOG_XACT_PREPARE ,records .head );
901- XLogFlush (recptr );
929+ gxact -> prepare_lsn = XLogInsert (RM_XACT_ID ,XLOG_XACT_PREPARE ,
930+ records .head );
931+ XLogFlush (gxact -> prepare_lsn );
902932
903933/* If we crash now, we have prepared: WAL replay will fix things */
904934
905- /* write correct CRC, flush, and close file */
935+ /* write correct CRC and close file */
906936if ((write (fd ,& statefile_crc ,sizeof (pg_crc32 )))!= sizeof (pg_crc32 ))
907937{
908938close (fd );
@@ -911,19 +941,29 @@ EndPrepare(GlobalTransaction gxact)
911941errmsg ("could not write twophase state file: %m" )));
912942}
913943
914- if (pg_fsync (fd )!= 0 )
915- {
916- close (fd );
917- ereport (ERROR ,
918- (errcode_for_file_access (),
919- errmsg ("could not fsync twophase state file: %m" )));
920- }
921-
922944if (close (fd )!= 0 )
923945ereport (ERROR ,
924946(errcode_for_file_access (),
925947errmsg ("could not close twophase state file: %m" )));
926948
949+ /*
950+ * Mark the prepared transaction as valid. As soon as xact.c marks
951+ * MyProc as not running our XID (which it will do immediately after
952+ * this function returns), others can commit/rollback the xact.
953+ *
954+ * NB: a side effect of this is to make a dummy ProcArray entry for the
955+ * prepared XID. This must happen before we clear the XID from MyProc,
956+ * else there is a window where the XID is not running according to
957+ * TransactionIdInProgress, and onlookers would be entitled to assume
958+ * the xact crashed. Instead we have a window where the same XID
959+ * appears twice in ProcArray, which is OK.
960+ */
961+ MarkAsPrepared (gxact );
962+
963+ /*
964+ * Now we can release the checkpoint start lock: a checkpoint starting
965+ * after this will certainly see the gxact as a candidate for fsyncing.
966+ */
927967LWLockRelease (CheckpointStartLock );
928968
929969END_CRIT_SECTION ();
@@ -1119,6 +1159,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
11191159 * In case we fail while running the callbacks, mark the gxact invalid
11201160 * so no one else will try to commit/rollback, and so it can be recycled
11211161 * properly later. It is still locked by our XID so it won't go away yet.
1162+ *
1163+ * (We assume it's safe to do this without taking TwoPhaseStateLock.)
11221164 */
11231165gxact -> valid = false;
11241166
@@ -1248,7 +1290,10 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
12481290errmsg ("could not write twophase state file: %m" )));
12491291}
12501292
1251- /* Sync and close the file */
1293+ /*
1294+ * We must fsync the file because the end-of-replay checkpoint will
1295+ * not do so, there being no GXACT in shared memory yet to tell it to.
1296+ */
12521297if (pg_fsync (fd )!= 0 )
12531298{
12541299close (fd );
@@ -1263,6 +1308,103 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
12631308errmsg ("could not close twophase state file: %m" )));
12641309}
12651310
1311+ /*
1312+ * CheckPointTwoPhase -- handle 2PC component of checkpointing.
1313+ *
1314+ * We must fsync the state file of any GXACT that is valid and has a PREPARE
1315+ * LSN <= the checkpoint's redo horizon. (If the gxact isn't valid yet or
1316+ * has a later LSN, this checkpoint is not responsible for fsyncing it.)
1317+ *
1318+ * This is deliberately run as late as possible in the checkpoint sequence,
1319+ * because GXACTs ordinarily have short lifespans, and so it is quite
1320+ * possible that GXACTs that were valid at checkpoint start will no longer
1321+ * exist if we wait a little bit.
1322+ *
1323+ * If a GXACT remains valid across multiple checkpoints, it'll be fsynced
1324+ * each time. This is considered unusual enough that we don't bother to
1325+ * expend any extra code to avoid the redundant fsyncs. (They should be
1326+ * reasonably cheap anyway, since they won't cause I/O.)
1327+ */
1328+ void
1329+ CheckPointTwoPhase (XLogRecPtr redo_horizon )
1330+ {
1331+ TransactionId * xids ;
1332+ int nxids ;
1333+ char path [MAXPGPATH ];
1334+ int i ;
1335+
1336+ /*
1337+ * We don't want to hold the TwoPhaseStateLock while doing I/O,
1338+ * so we grab it just long enough to make a list of the XIDs that
1339+ * require fsyncing, and then do the I/O afterwards.
1340+ *
1341+ * This approach creates a race condition: someone else could delete
1342+ * a GXACT between the time we release TwoPhaseStateLock and the time
1343+ * we try to open its state file. We handle this by special-casing
1344+ * ENOENT failures: if we see that, we verify that the GXACT is no
1345+ * longer valid, and if so ignore the failure.
1346+ */
1347+ if (max_prepared_xacts <=0 )
1348+ return ;/* nothing to do */
1349+ xids = (TransactionId * )palloc (max_prepared_xacts * sizeof (TransactionId ));
1350+ nxids = 0 ;
1351+
1352+ LWLockAcquire (TwoPhaseStateLock ,LW_SHARED );
1353+
1354+ for (i = 0 ;i < TwoPhaseState -> numPrepXacts ;i ++ )
1355+ {
1356+ GlobalTransaction gxact = TwoPhaseState -> prepXacts [i ];
1357+
1358+ if (gxact -> valid &&
1359+ XLByteLE (gxact -> prepare_lsn ,redo_horizon ))
1360+ xids [nxids ++ ]= gxact -> proc .xid ;
1361+ }
1362+
1363+ LWLockRelease (TwoPhaseStateLock );
1364+
1365+ for (i = 0 ;i < nxids ;i ++ )
1366+ {
1367+ TransactionId xid = xids [i ];
1368+ int fd ;
1369+
1370+ TwoPhaseFilePath (path ,xid );
1371+
1372+ fd = BasicOpenFile (path ,O_RDWR |PG_BINARY ,0 );
1373+ if (fd < 0 )
1374+ {
1375+ if (errno == ENOENT )
1376+ {
1377+ /* OK if gxact is no longer valid */
1378+ if (!TransactionIdIsPrepared (xid ))
1379+ continue ;
1380+ /* Restore errno in case it was changed */
1381+ errno = ENOENT ;
1382+ }
1383+ ereport (ERROR ,
1384+ (errcode_for_file_access (),
1385+ errmsg ("could not open twophase state file \"%s\": %m" ,
1386+ path )));
1387+ }
1388+
1389+ if (pg_fsync (fd )!= 0 )
1390+ {
1391+ close (fd );
1392+ ereport (ERROR ,
1393+ (errcode_for_file_access (),
1394+ errmsg ("could not fsync twophase state file \"%s\": %m" ,
1395+ path )));
1396+ }
1397+
1398+ if (close (fd )!= 0 )
1399+ ereport (ERROR ,
1400+ (errcode_for_file_access (),
1401+ errmsg ("could not close twophase state file \"%s\": %m" ,
1402+ path )));
1403+ }
1404+
1405+ pfree (xids );
1406+ }
1407+
12661408/*
12671409 * PrescanPreparedTransactions
12681410 *
@@ -1492,6 +1634,13 @@ RecoverPreparedTransactions(void)
14921634
14931635/*
14941636 * Recreate its GXACT and dummy PGPROC
1637+ *
1638+ * Note: since we don't have the PREPARE record's WAL location
1639+ * at hand, we leave prepare_lsn zeroes. This means the GXACT
1640+ * will be fsync'd on every future checkpoint. We assume this
1641+ * situation is infrequent enough that the performance cost is
1642+ * negligible (especially since we know the state file has
1643+ * already been fsynced).
14951644 */
14961645gxact = MarkAsPreparing (xid ,hdr -> gid ,
14971646hdr -> prepared_at ,