77 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
88 * Portions Copyright (c) 1994, Regents of the University of California
99 *
10- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.210 2005/07/23 15:31:16 momjian Exp $
10+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.211 2005/07/29 03:22:33 momjian Exp $
1111 *
1212 *-------------------------------------------------------------------------
1313 */
4747#include "utils/relcache.h"
4848
4949
/*
 *	Because O_DIRECT bypasses the kernel buffers, and because we never
 *	read those buffers except during crash recovery, it is a win to use
 *	it in all cases where we sync on each write().  We could allow O_DIRECT
 *	with fsync(), but because skipping the kernel buffer forces writes out
 *	quickly, it seems best just to use it for O_SYNC.  It is hard to imagine
 *	how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
 *
 *	PG_O_DIRECT expands to O_DIRECT where the platform defines it and to 0
 *	otherwise, so it can be OR'ed into open() flags unconditionally.
 */
#ifdef O_DIRECT
#define PG_O_DIRECT				O_DIRECT
#else
#define PG_O_DIRECT				0
#endif
63+
/*
 * This chunk of hackery attempts to determine which file sync methods
 * are available on the current platform, and to choose an appropriate
 * default method.  We assume that fsync() is always available, and that
 * configure determined whether fdatasync() is.
 *
 * CMP_OPEN_SYNC_FLAG is the bare platform flag (O_SYNC, else O_FSYNC) and
 * is what O_DSYNC gets compared against.  OPEN_SYNC_FLAG and
 * OPEN_DATASYNC_FLAG are the bare flags with PG_O_DIRECT OR'ed in.
 */
#if defined(O_SYNC)
#define CMP_OPEN_SYNC_FLAG		O_SYNC
#elif defined(O_FSYNC)
#define CMP_OPEN_SYNC_FLAG		O_FSYNC
#endif

/*
 * Only define OPEN_SYNC_FLAG when a bare sync flag actually exists.
 * Defining it unconditionally would make the defined(OPEN_SYNC_FLAG) test
 * below always true (so the O_DSYNC-only branch could never be taken) and
 * would leave OPEN_SYNC_FLAG expanding to an undefined identifier.
 */
#ifdef CMP_OPEN_SYNC_FLAG
#define OPEN_SYNC_FLAG			(CMP_OPEN_SYNC_FLAG | PG_O_DIRECT)
#endif

#if defined(O_DSYNC)
#if defined(OPEN_SYNC_FLAG)
/* O_DSYNC is a separate method only if it differs from the bare sync flag */
#if O_DSYNC != CMP_OPEN_SYNC_FLAG
#define OPEN_DATASYNC_FLAG		(O_DSYNC | PG_O_DIRECT)
#endif
#else							/* !defined(OPEN_SYNC_FLAG) */
/* Win32 only has O_DSYNC */
#define OPEN_DATASYNC_FLAG		(O_DSYNC | PG_O_DIRECT)
#endif
#endif
7489
/*
 * Limitation of buffer-alignment for direct I/O depends on OS and filesystem,
 * but BLCKSZ is assumed to be enough for it.
 */
#ifdef O_DIRECT
#define ALIGNOF_XLOG_BUFFER		BLCKSZ
#else
#define ALIGNOF_XLOG_BUFFER		MAXIMUM_ALIGNOF
#endif

/*
 * Switch the alignment routine because ShmemAlloc() returns a max-aligned
 * buffer and ALIGNOF_XLOG_BUFFER may be greater than MAXIMUM_ALIGNOF.
 *
 * In the second case XLOG_BUFFER_ALIGN does not round LEN up; it adds
 * ALIGNOF_XLOG_BUFFER bytes of slack, leaving room for a pointer into the
 * area to be rounded up afterwards with XLOG_BUFFER_POINTERALIGN.
 */
#if ALIGNOF_XLOG_BUFFER <= MAXIMUM_ALIGNOF
#define XLOG_BUFFER_ALIGN(LEN)	MAXALIGN((LEN))
#else
#define XLOG_BUFFER_ALIGN(LEN)	((LEN) + (ALIGNOF_XLOG_BUFFER))
#endif

/*
 * Round PTR up to the next multiple of ALIGNVAL and return it as a char *.
 * The mask arithmetic is only correct when ALIGNVAL is a power of 2.
 * ALIGNVAL is evaluated twice, so it must be free of side effects.
 *
 * Assumes sizeof(ptrdiff_t) == sizeof(void *).
 */
#define POINTERALIGN(ALIGNVAL, PTR) \
	((char *) (((ptrdiff_t) (PTR) + ((ALIGNVAL) - 1)) & ~((ptrdiff_t) ((ALIGNVAL) - 1))))
#define XLOG_BUFFER_POINTERALIGN(PTR) \
	POINTERALIGN((ALIGNOF_XLOG_BUFFER), (PTR))
114+
75115#if defined(OPEN_DATASYNC_FLAG )
76116#define DEFAULT_SYNC_METHOD_STR "open_datasync"
77117#define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN
@@ -469,6 +509,17 @@ static void ReadControlFile(void);
469509static char * str_time (time_t tnow );
470510static void issue_xlog_fsync (void );
471511
/*
 * XLog gather-write stuff
 *
 * An XLogPages value describes one pending run of WAL buffer pages that are
 * adjacent both in memory and in the segment file, so that the whole run can
 * be handed to a single write() call.  head == NULL means the run is empty.
 */
typedef struct XLogPages
{
	char	   *head;			/* Head of first page */
	int			size;			/* Total bytes of pages == count(pages) * BLCKSZ */
	int			offset;			/* Offset in xlog segment file */
} XLogPages;

static void XLogPageReset(XLogPages *pages);
static void XLogPageWrite(XLogPages *pages, int index);
static void XLogPageFlush(XLogPages *pages, int index);
522+
472523#ifdef WAL_DEBUG
473524static void xlog_outrec (char * buf ,XLogRecord * record );
474525#endif
@@ -1245,9 +1296,10 @@ static void
12451296XLogWrite (XLogwrtRqst WriteRqst )
12461297{
12471298XLogCtlWrite * Write = & XLogCtl -> Write ;
1248- char * from ;
12491299bool ispartialpage ;
12501300bool use_existent ;
1301+ int currentIndex = Write -> curridx ;
1302+ XLogPages pages ;
12511303
12521304/* We should always be inside a critical section here */
12531305Assert (CritSectionCount > 0 );
@@ -1258,6 +1310,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
12581310 */
12591311LogwrtResult = Write -> LogwrtResult ;
12601312
1313+ XLogPageReset (& pages );
1314+
12611315while (XLByteLT (LogwrtResult .Write ,WriteRqst .Write ))
12621316{
12631317/*
@@ -1266,21 +1320,22 @@ XLogWrite(XLogwrtRqst WriteRqst)
12661320 * end of the last page that's been initialized by
12671321 * AdvanceXLInsertBuffer.
12681322 */
1269- if (!XLByteLT (LogwrtResult .Write ,XLogCtl -> xlblocks [Write -> curridx ]))
1323+ if (!XLByteLT (LogwrtResult .Write ,XLogCtl -> xlblocks [currentIndex ]))
12701324elog (PANIC ,"xlog write request %X/%X is past end of log %X/%X" ,
12711325LogwrtResult .Write .xlogid ,LogwrtResult .Write .xrecoff ,
1272- XLogCtl -> xlblocks [Write -> curridx ].xlogid ,
1273- XLogCtl -> xlblocks [Write -> curridx ].xrecoff );
1326+ XLogCtl -> xlblocks [currentIndex ].xlogid ,
1327+ XLogCtl -> xlblocks [currentIndex ].xrecoff );
12741328
12751329/* Advance LogwrtResult.Write to end of current buffer page */
1276- LogwrtResult .Write = XLogCtl -> xlblocks [Write -> curridx ];
1330+ LogwrtResult .Write = XLogCtl -> xlblocks [currentIndex ];
12771331ispartialpage = XLByteLT (WriteRqst .Write ,LogwrtResult .Write );
12781332
12791333if (!XLByteInPrevSeg (LogwrtResult .Write ,openLogId ,openLogSeg ))
12801334{
12811335/*
12821336 * Switch to new logfile segment.
12831337 */
1338+ XLogPageFlush (& pages ,currentIndex );
12841339if (openLogFile >=0 )
12851340{
12861341if (close (openLogFile ))
@@ -1354,31 +1409,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
13541409openLogOff = 0 ;
13551410}
13561411
1357- /* Need to seek in the file? */
1358- if (openLogOff != (LogwrtResult .Write .xrecoff - BLCKSZ ) %XLogSegSize )
1359- {
1360- openLogOff = (LogwrtResult .Write .xrecoff - BLCKSZ ) %XLogSegSize ;
1361- if (lseek (openLogFile , (off_t )openLogOff ,SEEK_SET )< 0 )
1362- ereport (PANIC ,
1363- (errcode_for_file_access (),
1364- errmsg ("could not seek in log file %u, segment %u to offset %u: %m" ,
1365- openLogId ,openLogSeg ,openLogOff )));
1366- }
1367-
1368- /* OK to write the page */
1369- from = XLogCtl -> pages + Write -> curridx * BLCKSZ ;
1370- errno = 0 ;
1371- if (write (openLogFile ,from ,BLCKSZ )!= BLCKSZ )
1372- {
1373- /* if write didn't set errno, assume problem is no disk space */
1374- if (errno == 0 )
1375- errno = ENOSPC ;
1376- ereport (PANIC ,
1377- (errcode_for_file_access (),
1378- errmsg ("could not write to log file %u, segment %u at offset %u: %m" ,
1379- openLogId ,openLogSeg ,openLogOff )));
1380- }
1381- openLogOff += BLCKSZ ;
1412+ /* Add a page to buffer */
1413+ XLogPageWrite (& pages ,currentIndex );
13821414
13831415/*
13841416 * If we just wrote the whole last page of a logfile segment,
@@ -1390,8 +1422,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
13901422 * This is also the right place to notify the Archiver that the
13911423 * segment is ready to copy to archival storage.
13921424 */
1393- if (openLogOff >=XLogSegSize && !ispartialpage )
1425+ if (openLogOff + pages . size >=XLogSegSize && !ispartialpage )
13941426{
1427+ XLogPageFlush (& pages ,currentIndex );
13951428issue_xlog_fsync ();
13961429LogwrtResult .Flush = LogwrtResult .Write ;/* end of current page */
13971430
@@ -1405,8 +1438,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
14051438LogwrtResult .Write = WriteRqst .Write ;
14061439break ;
14071440}
1408- Write -> curridx = NextBufIdx (Write -> curridx );
1441+ currentIndex = NextBufIdx (currentIndex );
14091442}
1443+ XLogPageFlush (& pages ,currentIndex );
14101444
14111445/*
14121446 * If asked to flush, do so
@@ -3584,7 +3618,7 @@ XLOGShmemSize(void)
35843618if (XLOGbuffers < MinXLOGbuffers )
35853619XLOGbuffers = MinXLOGbuffers ;
35863620
3587- return MAXALIGN (sizeof (XLogCtlData )+ sizeof (XLogRecPtr )* XLOGbuffers )
3621+ return XLOG_BUFFER_ALIGN (sizeof (XLogCtlData )+ sizeof (XLogRecPtr )* XLOGbuffers )
35883622+ BLCKSZ * XLOGbuffers +
35893623MAXALIGN (sizeof (ControlFileData ));
35903624}
@@ -3601,7 +3635,7 @@ XLOGShmemInit(void)
36013635
36023636XLogCtl = (XLogCtlData * )
36033637ShmemInitStruct ("XLOG Ctl" ,
3604- MAXALIGN (sizeof (XLogCtlData )+
3638+ XLOG_BUFFER_ALIGN (sizeof (XLogCtlData )+
36053639sizeof (XLogRecPtr )* XLOGbuffers )
36063640+ BLCKSZ * XLOGbuffers ,
36073641& foundXLog );
@@ -3630,9 +3664,9 @@ XLOGShmemInit(void)
36303664 * Here, on the other hand, we must MAXALIGN to ensure the page
36313665 * buffers have worst-case alignment.
36323666 */
3633- XLogCtl -> pages =
3634- ((char * )XLogCtl )+ MAXALIGN ( sizeof ( XLogCtlData ) +
3635- sizeof (XLogRecPtr )* XLOGbuffers );
3667+ XLogCtl -> pages = XLOG_BUFFER_POINTERALIGN (
3668+ ((char * )XLogCtl )
3669+ + sizeof ( XLogCtlData ) + sizeof (XLogRecPtr )* XLOGbuffers );
36363670memset (XLogCtl -> pages ,0 ,BLCKSZ * XLOGbuffers );
36373671
36383672/*
@@ -3690,10 +3724,9 @@ BootStrapXLOG(void)
36903724/* First timeline ID is always 1 */
36913725ThisTimeLineID = 1 ;
36923726
3693- /* Use malloc() to ensure buffer is MAXALIGNED */
3694- buffer = (char * )malloc (BLCKSZ );
3695- page = (XLogPageHeader )buffer ;
3696- memset (buffer ,0 ,BLCKSZ );
3727+ buffer = (char * )malloc (BLCKSZ + ALIGNOF_XLOG_BUFFER );
3728+ page = (XLogPageHeader )XLOG_BUFFER_POINTERALIGN (buffer );
3729+ memset (page ,0 ,BLCKSZ );
36973730
36983731/* Set up information for the initial checkpoint record */
36993732checkPoint .redo .xlogid = 0 ;
@@ -3745,7 +3778,7 @@ BootStrapXLOG(void)
37453778
37463779/* Write the first page with the initial record */
37473780errno = 0 ;
3748- if (write (openLogFile ,buffer ,BLCKSZ )!= BLCKSZ )
3781+ if (write (openLogFile ,page ,BLCKSZ )!= BLCKSZ )
37493782{
37503783/* if write didn't set errno, assume problem is no disk space */
37513784if (errno == 0 )
@@ -5837,3 +5870,71 @@ remove_backup_label(void)
58375870errmsg ("could not remove file \"%s\": %m" ,
58385871BACKUP_LABEL_FILE )));
58395872}
5873+
5874+
5875+ /* XLog gather-write staffs */
5876+
5877+ static void
5878+ XLogPageReset (XLogPages * pages )
5879+ {
5880+ memset (pages ,0 ,sizeof (* pages ));
5881+ }
5882+
5883+ static void
5884+ XLogPageWrite (XLogPages * pages ,int index )
5885+ {
5886+ char * page = XLogCtl -> pages + index * BLCKSZ ;
5887+ int size = BLCKSZ ;
5888+ int offset = (LogwrtResult .Write .xrecoff - BLCKSZ ) %XLogSegSize ;
5889+
5890+ if (pages -> head + pages -> size == page
5891+ && pages -> offset + pages -> size == offset )
5892+ {/* Pages are continuous. Append new page. */
5893+ pages -> size += size ;
5894+ }
5895+ else
5896+ {/* Pages are not continuous. Flush and clear. */
5897+ XLogPageFlush (pages ,PrevBufIdx (index ));
5898+ pages -> head = page ;
5899+ pages -> size = size ;
5900+ pages -> offset = offset ;
5901+ }
5902+ }
5903+
5904+ static void
5905+ XLogPageFlush (XLogPages * pages ,int index )
5906+ {
5907+ if (!pages -> head )
5908+ {/* No needs to write pages. */
5909+ XLogCtl -> Write .curridx = index ;
5910+ return ;
5911+ }
5912+
5913+ /* Need to seek in the file? */
5914+ if (openLogOff != pages -> offset )
5915+ {
5916+ openLogOff = pages -> offset ;
5917+ if (lseek (openLogFile , (off_t )openLogOff ,SEEK_SET )< 0 )
5918+ ereport (PANIC ,
5919+ (errcode_for_file_access (),
5920+ errmsg ("could not seek in log file %u, segment %u to offset %u: %m" ,
5921+ openLogId ,openLogSeg ,openLogOff )));
5922+ }
5923+
5924+ /* OK to write the page */
5925+ errno = 0 ;
5926+ if (write (openLogFile ,pages -> head ,pages -> size )!= pages -> size )
5927+ {
5928+ /* if write didn't set errno, assume problem is no disk space */
5929+ if (errno == 0 )
5930+ errno = ENOSPC ;
5931+ ereport (PANIC ,
5932+ (errcode_for_file_access (),
5933+ errmsg ("could not write to log file %u, segment %u at offset %u: %m" ,
5934+ openLogId ,openLogSeg ,openLogOff )));
5935+ }
5936+
5937+ openLogOff += pages -> size ;
5938+ XLogCtl -> Write .curridx = index ;
5939+ XLogPageReset (pages );
5940+ }