6464#ifndef WIN32
6565#include <sys/mman.h>
6666#endif
67+ #include <limits.h>
6768#include <unistd.h>
6869#include <fcntl.h>
6970#ifdef HAVE_SYS_RESOURCE_H
@@ -391,34 +392,36 @@ pg_fdatasync(int fd)
391392/*
392393 * pg_flush_data --- advise OS that the described dirty data should be flushed
393394 *
394- * An offset of 0 with an nbytes 0 means that the entire file should be
395- * flushed.
395+ * offset of 0 with nbytes 0 means that the entire file should be flushed;
396+ * in this case, this function may have side-effects on the file's
397+ * seek position!
396398 */
397399void
398400pg_flush_data (int fd ,off_t offset ,off_t nbytes )
399401{
400402/*
401403 * Right now file flushing is primarily used to avoid making later
402- * fsync()/fdatasync() calls havea less impact. Thus don't trigger
403- *flushes if fsyncs are disabled - that's a decision we might want to
404- *make configurable at some point.
404+ * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
405+ * if fsyncs are disabled - that's a decision we might want to make
406+ * configurable at some point.
405407 */
406408if (!enableFsync )
407409return ;
408410
409411/*
410- * XXX: compile all alternatives, to find portability problems more easily
412+ * We compile all alternatives that are supported on the current platform,
413+ * to find portability problems more easily.
411414 */
412415#if defined(HAVE_SYNC_FILE_RANGE )
413416{
414- int rc = 0 ;
417+ int rc ;
415418
416419/*
417420 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
418- * tells the OS that writeback for thepassed in blocks should be
421+ * tells the OS that writeback for the specified blocks should be
419422 * started, but that we don't want to wait for completion. Note that
420423 * this call might block if too much dirty data exists in the range.
421- * This is thepreferrable method on OSs supporting it, as it works
424+ * This is the preferable method on OSs supporting it, as it works
422425 * reliably when available (contrast to msync()) and doesn't flush out
423426 * clean data (like FADV_DONTNEED).
424427 */
@@ -438,72 +441,107 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
438441#endif
439442#if !defined(WIN32 )&& defined(MS_ASYNC )
440443{
441- int rc = 0 ;
442444void * p ;
445+ static int pagesize = 0 ;
443446
444447/*
445448 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
446- * writeback. On linux it only does sowith MS_SYNC is specified, but
449+ * writeback. On linux it only does so if MS_SYNC is specified, but
447450 * then it does the writeback synchronously. Luckily all common linux
448- * systems have sync_file_range(). This ispreferrable over
451+ * systems have sync_file_range(). This is preferable over
449452 * FADV_DONTNEED because it doesn't flush out clean data.
450453 *
451454 * We map the file (mmap()), tell the kernel to sync back the contents
452455 * (msync()), and then remove the mapping again (munmap()).
453456 */
454- p = mmap (NULL ,nbytes ,
455- PROT_READ |PROT_WRITE ,MAP_SHARED ,
456- fd ,offset );
457- if (p == MAP_FAILED )
458- {
459- ereport (WARNING ,
460- (errcode_for_file_access (),
461- errmsg ("could not mmap while flushing dirty data: %m" )));
462- return ;
463- }
464457
465- rc = msync ( p , nbytes , MS_ASYNC );
466- if (rc ! =0 )
458+ /* mmap() needs actual length if we want to map whole file */
459+ if (offset == 0 && nbytes = =0 )
467460{
468- ereport (WARNING ,
469- (errcode_for_file_access (),
470- errmsg ("could not flush dirty data: %m" )));
471- /* NB: need to fall through to munmap()! */
461+ nbytes = lseek (fd ,0 ,SEEK_END );
462+ if (nbytes < 0 )
463+ {
464+ ereport (WARNING ,
465+ (errcode_for_file_access (),
466+ errmsg ("could not determine dirty data size: %m" )));
467+ return ;
468+ }
472469}
473470
474- rc = munmap (p ,nbytes );
475- if (rc != 0 )
471+ /*
472+ * Some platforms reject partial-page mmap() attempts. To deal with
473+ * that, just truncate the request to a page boundary. If any extra
474+ * bytes don't get flushed, well, it's only a hint anyway.
475+ */
476+
477+ /* fetch pagesize only once */
478+ if (pagesize == 0 )
479+ pagesize = sysconf (_SC_PAGESIZE );
480+
481+ /* align length to pagesize, dropping any fractional page */
482+ if (pagesize > 0 )
483+ nbytes = (nbytes /pagesize )* pagesize ;
484+
485+ /* fractional-page request is a no-op */
486+ if (nbytes <=0 )
487+ return ;
488+
489+ /*
490+ * mmap could well fail, particularly on 32-bit platforms where there
491+ * may simply not be enough address space. If so, silently fall
492+ * through to the next implementation.
493+ */
494+ if (nbytes <= (off_t )SSIZE_MAX )
495+ p = mmap (NULL ,nbytes ,PROT_READ ,MAP_SHARED ,fd ,offset );
496+ else
497+ p = MAP_FAILED ;
498+
499+ if (p != MAP_FAILED )
476500{
477- /* FATAL error because mapping would remain */
478- ereport (FATAL ,
479- (errcode_for_file_access (),
480- errmsg ("could not munmap while flushing blocks: %m" )));
481- }
501+ int rc ;
482502
483- return ;
503+ rc = msync (p , (size_t )nbytes ,MS_ASYNC );
504+ if (rc != 0 )
505+ {
506+ ereport (WARNING ,
507+ (errcode_for_file_access (),
508+ errmsg ("could not flush dirty data: %m" )));
509+ /* NB: need to fall through to munmap()! */
510+ }
511+
512+ rc = munmap (p , (size_t )nbytes );
513+ if (rc != 0 )
514+ {
515+ /* FATAL error because mapping would remain */
516+ ereport (FATAL ,
517+ (errcode_for_file_access (),
518+ errmsg ("could not munmap() while flushing data: %m" )));
519+ }
520+
521+ return ;
522+ }
484523}
485524#endif
486525#if defined(USE_POSIX_FADVISE )&& defined(POSIX_FADV_DONTNEED )
487526{
488- int rc = 0 ;
527+ int rc ;
489528
490529/*
491530 * Signal the kernel that the passed in range should not be cached
492531 * anymore. This has the, desired, side effect of writing out dirty
493532 * data, and the, undesired, side effect of likely discarding useful
494533 * clean cached blocks. For the latter reason this is the least
495- *preferrable method.
534+ * preferable method.
496535 */
497536
498537rc = posix_fadvise (fd ,offset ,nbytes ,POSIX_FADV_DONTNEED );
499538
500- /* don't error out, this is just a performance optimization */
501539if (rc != 0 )
502540{
541+ /* don't error out, this is just a performance optimization */
503542ereport (WARNING ,
504543(errcode_for_file_access (),
505544errmsg ("could not flush dirty data: %m" )));
506- return ;
507545}
508546
509547return ;
@@ -1510,6 +1548,13 @@ FileWriteback(File file, off_t offset, int amount)
15101548file ,VfdCache [file ].fileName ,
15111549 (int64 )offset ,amount ));
15121550
1551+ /*
1552+ * Caution: do not call pg_flush_data with amount = 0, it could trash the
1553+ * file's seek position.
1554+ */
1555+ if (amount <=0 )
1556+ return ;
1557+
15131558returnCode = FileAccess (file );
15141559if (returnCode < 0 )
15151560return ;
@@ -2904,11 +2949,15 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
29042949{
29052950int fd ;
29062951
2952+ /* Don't try to flush directories, it'll likely just fail */
2953+ if (isdir )
2954+ return ;
2955+
29072956fd = OpenTransientFile ((char * )fname ,O_RDONLY |PG_BINARY ,0 );
29082957
29092958if (fd < 0 )
29102959{
2911- if (errno == EACCES || ( isdir && errno == EISDIR ) )
2960+ if (errno == EACCES )
29122961return ;
29132962ereport (elevel ,
29142963(errcode_for_file_access (),