Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitb966dd6

Browse files
committed
Add fsync capability to initdb, and use sync_file_range() if available.
Historically we have not worried about fsync'ing anything during initdb (in fact, initdb intentionally passes -F to each backend launch to prevent it from fsync'ing). But with filesystems getting more aggressive about caching data, that's not such a good plan anymore. Make initdb do a pass over the finished data directory tree to fsync everything. For testing purposes, the -N/--nosync flag can be used to restore the old behavior. Also, testing shows that on Linux, sync_file_range() is much faster than posix_fadvise() for hinting to the kernel that an fsync is coming, apparently because the latter blocks on a rather small request queue while the former doesn't. So use this function if available in initdb, and also in the backend's pg_flush_data() (where it currently will affect only the speed of CREATE DATABASE's cloning step). We will later make pg_regress invoke initdb with the --nosync flag to avoid slowing down cases such as "make check" in contrib. But let's not do so until we've shaken out any portability issues in this patch. Jeff Davis, reviewed by Andres Freund
1 parent1a9405d commitb966dd6

File tree

7 files changed

+258
-5
lines changed

7 files changed

+258
-5
lines changed

‎configure

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19254,7 +19254,8 @@ fi
1925419254

1925519255

1925619256

19257-
for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink towlower utime utimes wcstombs wcstombs_l
19257+
19258+
for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
1925819259
do
1925919260
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
1926019261
{ $as_echo "$as_me:$LINENO: checking for $ac_func" >&5

‎configure.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1207,7 +1207,7 @@ PGAC_VAR_INT_TIMEZONE
12071207
AC_FUNC_ACCEPT_ARGTYPES
12081208
PGAC_FUNC_GETTIMEOFDAY_1ARG
12091209

1210-
AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink towlower utime utimes wcstombs wcstombs_l])
1210+
AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l])
12111211

12121212
AC_REPLACE_FUNCS(fseeko)
12131213
case $host_os in

‎doc/src/sgml/ref/initdb.sgml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,21 @@ PostgreSQL documentation
219219
</listitem>
220220
</varlistentry>
221221

222+
<varlistentry>
223+
<term><option>-N</option></term>
224+
<term><option>--nosync</option></term>
225+
<listitem>
226+
<para>
227+
By default, <command>initdb</command> will wait for all files to be
228+
written safely to disk. This option causes <command>initdb</command>
229+
to return without waiting, which is faster, but means that a
230+
subsequent operating system crash can leave the data directory
231+
corrupt. Generally, this option is useful for testing, but should not
232+
be used when creating a production installation.
233+
</para>
234+
</listitem>
235+
</varlistentry>
236+
222237
<varlistentry>
223238
<term><option>--pwfile=<replaceable>filename</></option></term>
224239
<listitem>

‎src/backend/storage/file/fd.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -336,12 +336,15 @@ pg_fdatasync(int fd)
336336
/*
337337
* pg_flush_data --- advise OS that the data described won't be needed soon
338338
*
339-
* Not all platforms have posix_fadvise; treat as noop if not available.
339+
* Not all platforms have sync_file_range or posix_fadvise; treat as no-op
340+
* if not available.
340341
*/
341342
int
342343
pg_flush_data(int fd, off_t offset, off_t amount)
343344
{
344-
#if defined(USE_POSIX_FADVISE)&& defined(POSIX_FADV_DONTNEED)
345+
#if defined(HAVE_SYNC_FILE_RANGE)
346+
return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE);
347+
#elif defined(USE_POSIX_FADVISE)&& defined(POSIX_FADV_DONTNEED)
345348
return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
346349
#else
347350
return 0;

‎src/bin/initdb/initdb.c

Lines changed: 229 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
#include"postgres_fe.h"
5050

5151
#include<dirent.h>
52+
#include<fcntl.h>
5253
#include<sys/stat.h>
5354
#include<unistd.h>
5455
#include<locale.h>
@@ -116,6 +117,7 @@ static const char *authmethodhost = "";
116117
staticconstchar*authmethodlocal="";
117118
staticbooldebug= false;
118119
staticboolnoclean= false;
120+
staticbooldo_sync= true;
119121
staticboolshow_setting= false;
120122
staticchar*xlog_dir="";
121123

@@ -160,6 +162,9 @@ static char *authwarning = NULL;
160162
/*
161163
* Centralized knowledge of switches to pass to backend
162164
*
165+
* Note: we run the backend with -F (fsync disabled) and then do a single
166+
* pass of fsync'ing at the end. This is faster than fsync'ing each step.
167+
*
163168
* Note: in the shell-script version, we also passed PGDATA as a -D switch,
164169
* but here it is more convenient to pass it as an environment variable
165170
* (no quoting to worry about).
@@ -182,6 +187,9 @@ static char **filter_lines_with_token(char **lines, const char *token);
182187
#endif
183188
staticchar**readfile(constchar*path);
184189
staticvoidwritefile(char*path,char**lines);
190+
staticvoidwalkdir(char*path,void (*action)(char*fname,boolisdir));
191+
staticvoidpre_sync_fname(char*fname,boolisdir);
192+
staticvoidfsync_fname(char*fname,boolisdir);
185193
staticFILE*popen_check(constchar*command,constchar*mode);
186194
staticvoidexit_nicely(void);
187195
staticchar*get_id(void);
@@ -209,6 +217,7 @@ static void load_plpgsql(void);
209217
staticvoidvacuum_db(void);
210218
staticvoidmake_template0(void);
211219
staticvoidmake_postgres(void);
220+
staticvoidperform_fsync(void);
212221
staticvoidtrapsig(intsignum);
213222
staticvoidcheck_ok(void);
214223
staticchar*escape_quotes(constchar*src);
@@ -489,6 +498,174 @@ writefile(char *path, char **lines)
489498
}
490499
}
491500

501+
/*
502+
* walkdir: recursively walk a directory, applying the action to each
503+
* regular file and directory (including the named directory itself).
504+
*
505+
* Adapted from copydir() in copydir.c.
506+
*/
507+
staticvoid
508+
walkdir(char*path,void (*action) (char*fname,boolisdir))
509+
{
510+
DIR*dir;
511+
structdirent*direntry;
512+
charsubpath[MAXPGPATH];
513+
514+
dir=opendir(path);
515+
if (dir==NULL)
516+
{
517+
fprintf(stderr,_("%s: could not open directory \"%s\": %s\n"),
518+
progname,path,strerror(errno));
519+
exit_nicely();
520+
}
521+
522+
while (errno=0, (direntry=readdir(dir))!=NULL)
523+
{
524+
structstatfst;
525+
526+
if (strcmp(direntry->d_name,".")==0||
527+
strcmp(direntry->d_name,"..")==0)
528+
continue;
529+
530+
snprintf(subpath,MAXPGPATH,"%s/%s",path,direntry->d_name);
531+
532+
if (lstat(subpath,&fst)<0)
533+
{
534+
fprintf(stderr,_("%s: could not stat file \"%s\": %s\n"),
535+
progname,subpath,strerror(errno));
536+
exit_nicely();
537+
}
538+
539+
if (S_ISDIR(fst.st_mode))
540+
walkdir(subpath,action);
541+
elseif (S_ISREG(fst.st_mode))
542+
(*action) (subpath, false);
543+
}
544+
545+
#ifdefWIN32
546+
/*
547+
* This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in
548+
* released version
549+
*/
550+
if (GetLastError()==ERROR_NO_MORE_FILES)
551+
errno=0;
552+
#endif
553+
554+
if (errno)
555+
{
556+
fprintf(stderr,_("%s: could not read directory \"%s\": %s\n"),
557+
progname,path,strerror(errno));
558+
exit_nicely();
559+
}
560+
561+
closedir(dir);
562+
563+
/*
564+
* It's important to fsync the destination directory itself as individual
565+
* file fsyncs don't guarantee that the directory entry for the file is
566+
* synced. Recent versions of ext4 have made the window much wider but
567+
* it's been an issue for ext3 and other filesystems in the past.
568+
*/
569+
(*action) (path, true);
570+
}
571+
572+
/*
573+
* Hint to the OS that it should get ready to fsync() this file.
574+
*/
575+
staticvoid
576+
pre_sync_fname(char*fname,boolisdir)
577+
{
578+
#if defined(HAVE_SYNC_FILE_RANGE)|| \
579+
(defined(USE_POSIX_FADVISE)&& defined(POSIX_FADV_DONTNEED))
580+
intfd;
581+
582+
fd=open(fname,O_RDONLY |PG_BINARY);
583+
584+
/*
585+
* Some OSs don't allow us to open directories at all (Windows returns
586+
* EACCES)
587+
*/
588+
if (fd<0&&isdir&& (errno==EISDIR||errno==EACCES))
589+
return;
590+
591+
if (fd<0)
592+
{
593+
fprintf(stderr,_("%s: could not open file \"%s\": %s\n"),
594+
progname,fname,strerror(errno));
595+
exit_nicely();
596+
}
597+
598+
/*
599+
* Prefer sync_file_range, else use posix_fadvise. We ignore any error
600+
* here since this operation is only a hint anyway.
601+
*/
602+
#if defined(HAVE_SYNC_FILE_RANGE)
603+
sync_file_range(fd,0,0,SYNC_FILE_RANGE_WRITE);
604+
#elif defined(USE_POSIX_FADVISE)&& defined(POSIX_FADV_DONTNEED)
605+
posix_fadvise(fd,0,0,POSIX_FADV_DONTNEED);
606+
#endif
607+
608+
close(fd);
609+
#endif
610+
}
611+
612+
/*
613+
* fsync a file or directory
614+
*
615+
* Try to fsync directories but ignore errors that indicate the OS
616+
* just doesn't allow/require fsyncing directories.
617+
*
618+
* Adapted from fsync_fname() in copydir.c.
619+
*/
620+
staticvoid
621+
fsync_fname(char*fname,boolisdir)
622+
{
623+
intfd;
624+
intreturncode;
625+
626+
/*
627+
* Some OSs require directories to be opened read-only whereas other
628+
* systems don't allow us to fsync files opened read-only; so we need both
629+
* cases here
630+
*/
631+
if (!isdir)
632+
fd=open(fname,O_RDWR |PG_BINARY);
633+
else
634+
fd=open(fname,O_RDONLY |PG_BINARY);
635+
636+
/*
637+
* Some OSs don't allow us to open directories at all (Windows returns
638+
* EACCES)
639+
*/
640+
if (fd<0&&isdir&& (errno==EISDIR||errno==EACCES))
641+
return;
642+
643+
elseif (fd<0)
644+
{
645+
fprintf(stderr,_("%s: could not open file \"%s\": %s\n"),
646+
progname,fname,strerror(errno));
647+
exit_nicely();
648+
}
649+
650+
returncode=fsync(fd);
651+
652+
/* Some OSs don't allow us to fsync directories at all */
653+
if (returncode!=0&&isdir&&errno==EBADF)
654+
{
655+
close(fd);
656+
return;
657+
}
658+
659+
if (returncode!=0)
660+
{
661+
fprintf(stderr,_("%s: could not fsync file \"%s\": %s\n"),
662+
progname,fname,strerror(errno));
663+
exit_nicely();
664+
}
665+
666+
close(fd);
667+
}
668+
492669
/*
493670
* Open a subcommand with suitable error messaging
494671
*/
@@ -2092,6 +2269,47 @@ make_postgres(void)
20922269
check_ok();
20932270
}
20942271

2272+
/*
2273+
* fsync everything down to disk
2274+
*/
2275+
staticvoid
2276+
perform_fsync(void)
2277+
{
2278+
charpdir[MAXPGPATH];
2279+
2280+
fputs(_("syncing data to disk ... "),stdout);
2281+
fflush(stdout);
2282+
2283+
/*
2284+
* We need to name the parent of PGDATA. get_parent_directory() isn't
2285+
* enough here, because it can result in an empty string.
2286+
*/
2287+
snprintf(pdir,MAXPGPATH,"%s/..",pg_data);
2288+
canonicalize_path(pdir);
2289+
2290+
/*
2291+
* Hint to the OS so that we're going to fsync each of these files soon.
2292+
*/
2293+
2294+
/* first the parent of the PGDATA directory */
2295+
pre_sync_fname(pdir, true);
2296+
2297+
/* then recursively through the directory */
2298+
walkdir(pg_data,pre_sync_fname);
2299+
2300+
/*
2301+
* Now, do the fsync()s in the same order.
2302+
*/
2303+
2304+
/* first the parent of the PGDATA directory */
2305+
fsync_fname(pdir, true);
2306+
2307+
/* then recursively through the directory */
2308+
walkdir(pg_data,fsync_fname);
2309+
2310+
check_ok();
2311+
}
2312+
20952313

20962314
/*
20972315
* signal handler in case we are interrupted.
@@ -2532,6 +2750,7 @@ usage(const char *progname)
25322750
printf(_(" -d, --debug generate lots of debugging output\n"));
25332751
printf(_(" -L DIRECTORY where to find the input files\n"));
25342752
printf(_(" -n, --noclean do not clean up after errors\n"));
2753+
printf(_(" -N, --nosync do not wait for changes to be written safely to disk\n"));
25352754
printf(_(" -s, --show show internal settings\n"));
25362755
printf(_("\nOther options:\n"));
25372756
printf(_(" -V, --version output version information, then exit\n"));
@@ -2621,6 +2840,7 @@ main(int argc, char *argv[])
26212840
{"debug",no_argument,NULL,'d'},
26222841
{"show",no_argument,NULL,'s'},
26232842
{"noclean",no_argument,NULL,'n'},
2843+
{"nosync",no_argument,NULL,'N'},
26242844
{"xlogdir",required_argument,NULL,'X'},
26252845
{NULL,0,NULL,0}
26262846
};
@@ -2676,7 +2896,7 @@ main(int argc, char *argv[])
26762896

26772897
/* process command-line options */
26782898

2679-
while ((c=getopt_long(argc,argv,"dD:E:L:nU:WA:sT:X:",long_options,&option_index))!=-1)
2899+
while ((c=getopt_long(argc,argv,"dD:E:L:nNU:WA:sT:X:",long_options,&option_index))!=-1)
26802900
{
26812901
switch (c)
26822902
{
@@ -2719,6 +2939,9 @@ main(int argc, char *argv[])
27192939
noclean= true;
27202940
printf(_("Running in noclean mode. Mistakes will not be cleaned up.\n"));
27212941
break;
2942+
case'N':
2943+
do_sync= false;
2944+
break;
27222945
case'L':
27232946
share_path=xstrdup(optarg);
27242947
break;
@@ -3310,6 +3533,11 @@ main(int argc, char *argv[])
33103533

33113534
make_postgres();
33123535

3536+
if (do_sync)
3537+
perform_fsync();
3538+
else
3539+
printf(_("\nSync to disk skipped.\nThe data directory might become corrupt if the operating system crashes.\n"));
3540+
33133541
if (authwarning!=NULL)
33143542
fprintf(stderr,"%s",authwarning);
33153543

‎src/include/pg_config.h.in

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,9 @@
511511
/* Define to 1 if you have the `symlink' function. */
512512
#undef HAVE_SYMLINK
513513

514+
/* Define to 1 if you have the `sync_file_range' function. */
515+
#undef HAVE_SYNC_FILE_RANGE
516+
514517
/* Define to 1 if you have the syslog interface. */
515518
#undef HAVE_SYSLOG
516519

‎src/include/pg_config.h.win32

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,9 @@
420420
/* Define to 1 if you have the `symlink' function. */
421421
#define HAVE_SYMLINK 1
422422

423+
/* Define to 1 if you have the `sync_file_range' function. */
424+
/* #undef HAVE_SYNC_FILE_RANGE */
425+
423426
/* Define to 1 if you have the `sysconf' function. */
424427
/* #undef HAVE_SYSCONF */
425428

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp