@@ -132,19 +132,69 @@ smgr_bulk_finish(BulkWriteState *bulkstate)
132132smgr_bulk_flush (bulkstate );
133133
134134/*
135- * When we wrote out the pages, we passed skipFsync=true to avoid the
136- * overhead of registering all the writes with the checkpointer. Register
137- * the whole relation now.
138- *
139- * There is one hole in that idea: If a checkpoint occurred while we were
140- * writing the pages, it already missed fsyncing the pages we had written
141- * before the checkpoint started. A crash later on would replay the WAL
142- * starting from the checkpoint, therefore it wouldn't replay our earlier
143- * WAL records. So if a checkpoint started after the bulk write, fsync
144- * the files now.
135+ * Fsync the relation, or register it for the next checkpoint, if
136+ * necessary.
145137 */
146- if (! SmgrIsTemp (bulkstate -> smgr ))
138+ if (SmgrIsTemp (bulkstate -> smgr ))
147139{
140+ /* Temporary relations don't need to be fsync'd, ever */
141+ }
142+ else if (!bulkstate -> use_wal )
143+ {
144+ /*----------
145+ * This is either an unlogged relation, or a permanent relation but we
146+ * skipped WAL-logging because wal_level=minimal:
147+ *
148+ * A) Unlogged relation
149+ *
150+ * Unlogged relations will go away on crash, but they need to be
151+ * fsync'd on a clean shutdown. It's sufficient to call
152+ * smgrregistersync(), which ensures that the checkpointer will
153+ * flush it at the shutdown checkpoint. (It will flush it on the
154+ * next online checkpoint too, which is not strictly necessary.)
155+ *
156+ * Note that the init-fork of an unlogged relation is not
157+ * considered unlogged for our purposes. It's treated like a
158+ * regular permanent relation. The callers will pass use_wal=true
159+ * for the init fork.
160+ *
161+ * B) Permanent relation, WAL-logging skipped because wal_level=minimal
162+ *
163+ * This is a new relation, and we didn't WAL-log the pages as we
164+ * wrote, but they need to be fsync'd before commit.
165+ *
166+ * We don't need to do that here, however. The fsync() is done at
167+ * commit, by smgrDoPendingSyncs() (*).
168+ *
169+ * (*) smgrDoPendingSyncs() might decide to WAL-log the whole
170+ * relation at commit instead of fsyncing it, if the relation was
171+ * very small, but it's smgrDoPendingSyncs()'s responsibility in any
172+ * case.
173+ *
174+ * We cannot distinguish the two here, so conservatively assume it's
175+ * an unlogged relation. A permanent relation with wal_level=minimal
176+ * would require no action; see above.
177+ */
178+ smgrregistersync (bulkstate -> smgr ,bulkstate -> forknum );
179+ }
180+ else
181+ {
182+ /*
183+ * Permanent relation, WAL-logged normally.
184+ *
185+ * We already WAL-logged all the pages, so they will be replayed from
186+ * WAL on crash. However, when we wrote out the pages, we passed
187+ * skipFsync=true to avoid the overhead of registering all the writes
188+ * with the checkpointer. Register the whole relation now.
189+ *
190+ * There is one hole in that idea: If a checkpoint occurred while we
191+ * were writing the pages, it already missed fsyncing the pages we had
192+ * written before the checkpoint started. A crash later on would
193+ * replay the WAL starting from the checkpoint, therefore it wouldn't
194+ * replay our earlier WAL records. So if a checkpoint started after
195+ * the bulk write, fsync the files now.
196+ */
197+
148198/*
149199 * Prevent a checkpoint from starting between the GetRedoRecPtr() and
150200 * smgrregistersync() calls.