Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitc34bb00

Browse files
committed
Use O_DIRECT if available when using O_SYNC for wal_sync_method.Also, write multiple WAL buffers out in one write() operation.ITAGAKI Takahiro---------------------------------------------------------------------------> If we disable writeback-cache and use open_sync, the per-page writing> behavior in WAL module will show up as bad result. O_DIRECT is similar> to O_DSYNC (at least on linux), so that the benefit of it will disappear> behind the slow disk revolution.>> In the current source, WAL is written as:> for (i = 0; i < N; i++) { write(&buffers[i], BLCKSZ); }> Is this intentional? Can we rewrite it as follows?> write(&buffers[0], N * BLCKSZ);>> In order to achieve it, I wrote a 'gather-write' patch (xlog.gw.diff).> Aside from this, I'll also send the fixed direct io patch (xlog.dio.diff).> These two patches are independent, so they can be applied either or both.>>> I tested them on my machine and the results as follows. It shows that> direct-io and gather-write is the best choice when writeback-cache is off.> Are these two patches worth trying if they are used together?>>> | writeback | fsync= | fdata | open_ | fsync_ | open_> patch | cache | false | sync | sync | direct | direct> ------------+-----------+--------+-------+-------+--------+---------> direct io | off | 124.2 | 105.7 | 48.3 | 48.3 | 48.2> direct io | on | 129.1 | 112.3 | 114.1 | 142.9 | 144.5> gather-write| off | 124.3 | 108.7 | 105.4 | (N/A) | (N/A)> both | off | 131.5 | 115.5 | 114.4 | 145.4 | 145.2>> - 20runs * pgbench -s 100 -c 50 -t 200> - with tuning (wal_buffers=64, commit_delay=500, checkpoint_segments=8)> - using 2 ATA disks:> - hda(reiserfs) includes system and wal.> - hdc(jfs) includes database files. writeback-cache is always on.>> ---> ITAGAKI Takahiro
1 parent722f31f commitc34bb00

File tree

1 file changed

+149
-48
lines changed
  • src/backend/access/transam

1 file changed

+149
-48
lines changed

‎src/backend/access/transam/xlog.c

Lines changed: 149 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.210 2005/07/23 15:31:16 momjian Exp $
10+
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.211 2005/07/29 03:22:33 momjian Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -47,31 +47,71 @@
4747
#include"utils/relcache.h"
4848

4949

50+
/*
51+
*Because O_DIRECT bypasses the kernel buffers, and because we never
52+
*read those buffers except during crash recovery, it is a win to use
53+
*it in all cases where we sync on each write(). We could allow O_DIRECT
54+
*with fsync(), but because skipping the kernel buffer forces writes out
55+
*quickly, it seems best just to use it for O_SYNC. It is hard to imagine
56+
*how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
57+
*/
58+
#ifdefO_DIRECT
59+
#definePG_O_DIRECTO_DIRECT
60+
#else
61+
#definePG_O_DIRECT0
62+
#endif
63+
5064
/*
5165
* This chunk of hackery attempts to determine which file sync methods
5266
* are available on the current platform, and to choose an appropriate
5367
* default method. We assume that fsync() is always available, and that
5468
* configure determined whether fdatasync() is.
5569
*/
5670
#if defined(O_SYNC)
57-
#defineOPEN_SYNC_FLAGO_SYNC
71+
#defineCMP_OPEN_SYNC_FLAGO_SYNC
5872
#else
5973
#if defined(O_FSYNC)
60-
#defineOPEN_SYNC_FLAGO_FSYNC
74+
#defineCMP_OPEN_SYNC_FLAGO_FSYNC
6175
#endif
6276
#endif
77+
#defineOPEN_SYNC_FLAG(CMP_OPEN_SYNC_FLAG | PG_O_DIRECT)
6378

6479
#if defined(O_DSYNC)
6580
#if defined(OPEN_SYNC_FLAG)
66-
#ifO_DSYNC!=OPEN_SYNC_FLAG
67-
#defineOPEN_DATASYNC_FLAGO_DSYNC
81+
#ifO_DSYNC!=CMP_OPEN_SYNC_FLAG
82+
#defineOPEN_DATASYNC_FLAG(O_DSYNC | PG_O_DIRECT)
6883
#endif
6984
#else/* !defined(OPEN_SYNC_FLAG) */
7085
/* Win32 only has O_DSYNC */
71-
#defineOPEN_DATASYNC_FLAGO_DSYNC
86+
#defineOPEN_DATASYNC_FLAG(O_DSYNC | PG_O_DIRECT)
7287
#endif
7388
#endif
7489

90+
/*
91+
* Buffer-alignment requirements for direct I/O depend on the OS and filesystem,
92+
* but BLCKSZ is assumed to be enough for it.
93+
*/
94+
#ifdefO_DIRECT
95+
#defineALIGNOF_XLOG_BUFFERBLCKSZ
96+
#else
97+
#defineALIGNOF_XLOG_BUFFERMAXIMUM_ALIGNOF
98+
#endif
99+
100+
/*
101+
* Switch the alignment routine because ShmemAlloc() returns a max-aligned
102+
* buffer and ALIGNOF_XLOG_BUFFER may be greater than MAXIMUM_ALIGNOF.
103+
*/
104+
#ifALIGNOF_XLOG_BUFFER <=MAXIMUM_ALIGNOF
105+
#defineXLOG_BUFFER_ALIGN(LEN)MAXALIGN((LEN))
106+
#else
107+
#defineXLOG_BUFFER_ALIGN(LEN)((LEN) + (ALIGNOF_XLOG_BUFFER))
108+
#endif
109+
/* assume sizeof(ptrdiff_t) == sizeof(void*) */
110+
#definePOINTERALIGN(ALIGNVAL,PTR)\
111+
((char *)(((ptrdiff_t) (PTR) + (ALIGNVAL-1)) & ~((ptrdiff_t) (ALIGNVAL-1))))
112+
#defineXLOG_BUFFER_POINTERALIGN(PTR)\
113+
POINTERALIGN((ALIGNOF_XLOG_BUFFER), (PTR))
114+
75115
#if defined(OPEN_DATASYNC_FLAG)
76116
#defineDEFAULT_SYNC_METHOD_STR"open_datasync"
77117
#defineDEFAULT_SYNC_METHODSYNC_METHOD_OPEN
@@ -469,6 +509,17 @@ static void ReadControlFile(void);
469509
staticchar*str_time(time_ttnow);
470510
staticvoidissue_xlog_fsync(void);
471511

512+
/* XLog gather-write stuff */
513+
typedefstructXLogPages
514+
{
515+
char*head;/* Head of first page */
516+
intsize;/* Total bytes of pages == count(pages) * BLCKSZ */
517+
intoffset;/* Offset in xlog segment file */
518+
}XLogPages;
519+
staticvoidXLogPageReset(XLogPages*pages);
520+
staticvoidXLogPageWrite(XLogPages*pages,intindex);
521+
staticvoidXLogPageFlush(XLogPages*pages,intindex);
522+
472523
#ifdefWAL_DEBUG
473524
staticvoidxlog_outrec(char*buf,XLogRecord*record);
474525
#endif
@@ -1245,9 +1296,10 @@ static void
12451296
XLogWrite(XLogwrtRqstWriteRqst)
12461297
{
12471298
XLogCtlWrite*Write=&XLogCtl->Write;
1248-
char*from;
12491299
boolispartialpage;
12501300
booluse_existent;
1301+
intcurrentIndex=Write->curridx;
1302+
XLogPagespages;
12511303

12521304
/* We should always be inside a critical section here */
12531305
Assert(CritSectionCount>0);
@@ -1258,6 +1310,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
12581310
*/
12591311
LogwrtResult=Write->LogwrtResult;
12601312

1313+
XLogPageReset(&pages);
1314+
12611315
while (XLByteLT(LogwrtResult.Write,WriteRqst.Write))
12621316
{
12631317
/*
@@ -1266,21 +1320,22 @@ XLogWrite(XLogwrtRqst WriteRqst)
12661320
* end of the last page that's been initialized by
12671321
* AdvanceXLInsertBuffer.
12681322
*/
1269-
if (!XLByteLT(LogwrtResult.Write,XLogCtl->xlblocks[Write->curridx]))
1323+
if (!XLByteLT(LogwrtResult.Write,XLogCtl->xlblocks[currentIndex]))
12701324
elog(PANIC,"xlog write request %X/%X is past end of log %X/%X",
12711325
LogwrtResult.Write.xlogid,LogwrtResult.Write.xrecoff,
1272-
XLogCtl->xlblocks[Write->curridx].xlogid,
1273-
XLogCtl->xlblocks[Write->curridx].xrecoff);
1326+
XLogCtl->xlblocks[currentIndex].xlogid,
1327+
XLogCtl->xlblocks[currentIndex].xrecoff);
12741328

12751329
/* Advance LogwrtResult.Write to end of current buffer page */
1276-
LogwrtResult.Write=XLogCtl->xlblocks[Write->curridx];
1330+
LogwrtResult.Write=XLogCtl->xlblocks[currentIndex];
12771331
ispartialpage=XLByteLT(WriteRqst.Write,LogwrtResult.Write);
12781332

12791333
if (!XLByteInPrevSeg(LogwrtResult.Write,openLogId,openLogSeg))
12801334
{
12811335
/*
12821336
* Switch to new logfile segment.
12831337
*/
1338+
XLogPageFlush(&pages,currentIndex);
12841339
if (openLogFile >=0)
12851340
{
12861341
if (close(openLogFile))
@@ -1354,31 +1409,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
13541409
openLogOff=0;
13551410
}
13561411

1357-
/* Need to seek in the file? */
1358-
if (openLogOff!= (LogwrtResult.Write.xrecoff-BLCKSZ) %XLogSegSize)
1359-
{
1360-
openLogOff= (LogwrtResult.Write.xrecoff-BLCKSZ) %XLogSegSize;
1361-
if (lseek(openLogFile, (off_t)openLogOff,SEEK_SET)<0)
1362-
ereport(PANIC,
1363-
(errcode_for_file_access(),
1364-
errmsg("could not seek in log file %u, segment %u to offset %u: %m",
1365-
openLogId,openLogSeg,openLogOff)));
1366-
}
1367-
1368-
/* OK to write the page */
1369-
from=XLogCtl->pages+Write->curridx*BLCKSZ;
1370-
errno=0;
1371-
if (write(openLogFile,from,BLCKSZ)!=BLCKSZ)
1372-
{
1373-
/* if write didn't set errno, assume problem is no disk space */
1374-
if (errno==0)
1375-
errno=ENOSPC;
1376-
ereport(PANIC,
1377-
(errcode_for_file_access(),
1378-
errmsg("could not write to log file %u, segment %u at offset %u: %m",
1379-
openLogId,openLogSeg,openLogOff)));
1380-
}
1381-
openLogOff+=BLCKSZ;
1412+
/* Add a page to buffer */
1413+
XLogPageWrite(&pages,currentIndex);
13821414

13831415
/*
13841416
* If we just wrote the whole last page of a logfile segment,
@@ -1390,8 +1422,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
13901422
* This is also the right place to notify the Archiver that the
13911423
* segment is ready to copy to archival storage.
13921424
*/
1393-
if (openLogOff >=XLogSegSize&& !ispartialpage)
1425+
if (openLogOff+pages.size>=XLogSegSize&& !ispartialpage)
13941426
{
1427+
XLogPageFlush(&pages,currentIndex);
13951428
issue_xlog_fsync();
13961429
LogwrtResult.Flush=LogwrtResult.Write;/* end of current page */
13971430

@@ -1405,8 +1438,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
14051438
LogwrtResult.Write=WriteRqst.Write;
14061439
break;
14071440
}
1408-
Write->curridx=NextBufIdx(Write->curridx);
1441+
currentIndex=NextBufIdx(currentIndex);
14091442
}
1443+
XLogPageFlush(&pages,currentIndex);
14101444

14111445
/*
14121446
* If asked to flush, do so
@@ -3584,7 +3618,7 @@ XLOGShmemSize(void)
35843618
if (XLOGbuffers<MinXLOGbuffers)
35853619
XLOGbuffers=MinXLOGbuffers;
35863620

3587-
returnMAXALIGN(sizeof(XLogCtlData)+sizeof(XLogRecPtr)*XLOGbuffers)
3621+
returnXLOG_BUFFER_ALIGN(sizeof(XLogCtlData)+sizeof(XLogRecPtr)*XLOGbuffers)
35883622
+BLCKSZ*XLOGbuffers+
35893623
MAXALIGN(sizeof(ControlFileData));
35903624
}
@@ -3601,7 +3635,7 @@ XLOGShmemInit(void)
36013635

36023636
XLogCtl= (XLogCtlData*)
36033637
ShmemInitStruct("XLOG Ctl",
3604-
MAXALIGN(sizeof(XLogCtlData)+
3638+
XLOG_BUFFER_ALIGN(sizeof(XLogCtlData)+
36053639
sizeof(XLogRecPtr)*XLOGbuffers)
36063640
+BLCKSZ*XLOGbuffers,
36073641
&foundXLog);
@@ -3630,9 +3664,9 @@ XLOGShmemInit(void)
36303664
* Here, on the other hand, we must MAXALIGN to ensure the page
36313665
* buffers have worst-case alignment.
36323666
*/
3633-
XLogCtl->pages=
3634-
((char*)XLogCtl)+MAXALIGN(sizeof(XLogCtlData)+
3635-
sizeof(XLogRecPtr)*XLOGbuffers);
3667+
XLogCtl->pages=XLOG_BUFFER_POINTERALIGN(
3668+
((char*)XLogCtl)
3669+
+sizeof(XLogCtlData)+sizeof(XLogRecPtr)*XLOGbuffers);
36363670
memset(XLogCtl->pages,0,BLCKSZ*XLOGbuffers);
36373671

36383672
/*
@@ -3690,10 +3724,9 @@ BootStrapXLOG(void)
36903724
/* First timeline ID is always 1 */
36913725
ThisTimeLineID=1;
36923726

3693-
/* Use malloc() to ensure buffer is MAXALIGNED */
3694-
buffer= (char*)malloc(BLCKSZ);
3695-
page= (XLogPageHeader)buffer;
3696-
memset(buffer,0,BLCKSZ);
3727+
buffer= (char*)malloc(BLCKSZ+ALIGNOF_XLOG_BUFFER);
3728+
page= (XLogPageHeader)XLOG_BUFFER_POINTERALIGN(buffer);
3729+
memset(page,0,BLCKSZ);
36973730

36983731
/* Set up information for the initial checkpoint record */
36993732
checkPoint.redo.xlogid=0;
@@ -3745,7 +3778,7 @@ BootStrapXLOG(void)
37453778

37463779
/* Write the first page with the initial record */
37473780
errno=0;
3748-
if (write(openLogFile,buffer,BLCKSZ)!=BLCKSZ)
3781+
if (write(openLogFile,page,BLCKSZ)!=BLCKSZ)
37493782
{
37503783
/* if write didn't set errno, assume problem is no disk space */
37513784
if (errno==0)
@@ -5837,3 +5870,71 @@ remove_backup_label(void)
58375870
errmsg("could not remove file \"%s\": %m",
58385871
BACKUP_LABEL_FILE)));
58395872
}
5873+
5874+
5875+
/* XLog gather-write stuff */
5876+
5877+
staticvoid
5878+
XLogPageReset(XLogPages*pages)
5879+
{
5880+
memset(pages,0,sizeof(*pages));
5881+
}
5882+
5883+
staticvoid
5884+
XLogPageWrite(XLogPages*pages,intindex)
5885+
{
5886+
char*page=XLogCtl->pages+index*BLCKSZ;
5887+
intsize=BLCKSZ;
5888+
intoffset= (LogwrtResult.Write.xrecoff-BLCKSZ) %XLogSegSize;
5889+
5890+
if (pages->head+pages->size==page
5891+
&&pages->offset+pages->size==offset)
5892+
{/* Pages are continuous. Append new page. */
5893+
pages->size+=size;
5894+
}
5895+
else
5896+
{/* Pages are not continuous. Flush and clear. */
5897+
XLogPageFlush(pages,PrevBufIdx(index));
5898+
pages->head=page;
5899+
pages->size=size;
5900+
pages->offset=offset;
5901+
}
5902+
}
5903+
5904+
staticvoid
5905+
XLogPageFlush(XLogPages*pages,intindex)
5906+
{
5907+
if (!pages->head)
5908+
{/* No need to write pages. */
5909+
XLogCtl->Write.curridx=index;
5910+
return;
5911+
}
5912+
5913+
/* Need to seek in the file? */
5914+
if (openLogOff!=pages->offset)
5915+
{
5916+
openLogOff=pages->offset;
5917+
if (lseek(openLogFile, (off_t)openLogOff,SEEK_SET)<0)
5918+
ereport(PANIC,
5919+
(errcode_for_file_access(),
5920+
errmsg("could not seek in log file %u, segment %u to offset %u: %m",
5921+
openLogId,openLogSeg,openLogOff)));
5922+
}
5923+
5924+
/* OK to write the page */
5925+
errno=0;
5926+
if (write(openLogFile,pages->head,pages->size)!=pages->size)
5927+
{
5928+
/* if write didn't set errno, assume problem is no disk space */
5929+
if (errno==0)
5930+
errno=ENOSPC;
5931+
ereport(PANIC,
5932+
(errcode_for_file_access(),
5933+
errmsg("could not write to log file %u, segment %u at offset %u: %m",
5934+
openLogId,openLogSeg,openLogOff)));
5935+
}
5936+
5937+
openLogOff+=pages->size;
5938+
XLogCtl->Write.curridx=index;
5939+
XLogPageReset(pages);
5940+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp