Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit0650ff2

Browse files
committed
Add GUC to log long wait times on recovery conflicts.
This commit adds GUC log_recovery_conflict_waits that controls whethera log message is produced when the startup process is waiting longer thandeadlock_timeout for recovery conflicts. This is useful in determiningif recovery conflicts prevent the recovery from applying WAL.Note that currently a log message is produced only when recovery conflicthas not been resolved yet even after deadlock_timeout passes, i.e.,only when the startup process is still waiting for recovery conflicteven after deadlock_timeout.Author: Bertrand Drouvot, Masahiko SawadaReviewed-by: Alvaro Herrera, Kyotaro Horiguchi, Fujii MasaoDiscussion:https://postgr.es/m/9a60178c-a853-1440-2cdc-c3af916cff59@amazon.com
1 parentf7a1a80 commit0650ff2

File tree

8 files changed

+292
-30
lines changed

8 files changed

+292
-30
lines changed

‎doc/src/sgml/config.sgml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6952,6 +6952,28 @@ log_line_prefix = '%m [%p] %q%u@%d/%a '
69526952
</listitem>
69536953
</varlistentry>
69546954

6955+
<varlistentry id="guc-log-recovery-conflict-waits" xreflabel="log_recovery_conflict_waits">
6956+
<term><varname>log_recovery_conflict_waits</varname> (<type>boolean</type>)
6957+
<indexterm>
6958+
<primary><varname>log_recovery_conflict_waits</varname> configuration parameter</primary>
6959+
</indexterm>
6960+
</term>
6961+
<listitem>
6962+
<para>
6963+
Controls whether a log message is produced when the startup process
6964+
is waiting longer than <varname>deadlock_timeout</varname>
6965+
for recovery conflicts. This is useful in determining if recovery
6966+
conflicts prevent the recovery from applying WAL.
6967+
</para>
6968+
6969+
<para>
6970+
The default is <literal>off</literal>. This parameter can only be set
6971+
in the <filename>postgresql.conf</filename> file or on the server
6972+
command line.
6973+
</para>
6974+
</listitem>
6975+
</varlistentry>
6976+
69556977
<varlistentry id="guc-log-parameter-max-length" xreflabel="log_parameter_max_length">
69566978
<term><varname>log_parameter_max_length</varname> (<type>integer</type>)
69576979
<indexterm>

‎doc/src/sgml/high-availability.sgml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2071,6 +2071,12 @@ if (!triggered)
20712071
server. The <structname>pg_stat_database</structname> system view also contains
20722072
summary information.
20732073
</para>
2074+
2075+
<para>
2076+
Users can control whether a log message is produced when WAL replay is waiting
2077+
longer than <varname>deadlock_timeout</varname> for conflicts. This
2078+
is controlled by the <xref linkend="guc-log-recovery-conflict-waits"/> parameter.
2079+
</para>
20742080
</sect2>
20752081

20762082
<sect2 id="hot-standby-admin">

‎src/backend/storage/buffer/bufmgr.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3809,6 +3809,8 @@ LockBufferForCleanup(Buffer buffer)
38093809
{
38103810
BufferDesc*bufHdr;
38113811
char*new_status=NULL;
3812+
TimestampTzwaitStart=0;
3813+
boollogged_recovery_conflict= false;
38123814

38133815
Assert(BufferIsPinned(buffer));
38143816
Assert(PinCountWaitBuf==NULL);
@@ -3882,6 +3884,34 @@ LockBufferForCleanup(Buffer buffer)
38823884
new_status[len]='\0';/* truncate off " waiting" */
38833885
}
38843886

3887+
/*
3888+
* Emit the log message if the startup process is waiting longer
3889+
* than deadlock_timeout for recovery conflict on buffer pin.
3890+
*
3891+
* Skip this if first time through because the startup process has
3892+
* not started waiting yet in this case. So, the wait start
3893+
* timestamp is set after this logic.
3894+
*/
3895+
if (waitStart!=0&& !logged_recovery_conflict)
3896+
{
3897+
TimestampTznow=GetCurrentTimestamp();
3898+
3899+
if (TimestampDifferenceExceeds(waitStart,now,
3900+
DeadlockTimeout))
3901+
{
3902+
LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
3903+
waitStart,now,NULL);
3904+
logged_recovery_conflict= true;
3905+
}
3906+
}
3907+
3908+
/*
3909+
* Set the wait start timestamp if logging is enabled and first
3910+
* time through.
3911+
*/
3912+
if (log_recovery_conflict_waits&&waitStart==0)
3913+
waitStart=GetCurrentTimestamp();
3914+
38853915
/* Publish the bufid that Startup process waits on */
38863916
SetStartupBufferPinWaitBufId(buffer-1);
38873917
/* Set alarm and then wait to be signaled by UnpinBuffer() */

‎src/backend/storage/ipc/standby.c

Lines changed: 173 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
intvacuum_defer_cleanup_age;
4040
intmax_standby_archive_delay=30*1000;
4141
intmax_standby_streaming_delay=30*1000;
42+
boollog_recovery_conflict_waits= false;
4243

4344
staticHTAB*RecoveryLockLists;
4445

@@ -53,6 +54,7 @@ static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlis
5354
staticvoidSendRecoveryConflictWithBufferPin(ProcSignalReasonreason);
5455
staticXLogRecPtrLogCurrentRunningXacts(RunningTransactionsCurrRunningXacts);
5556
staticvoidLogAccessExclusiveLocks(intnlocks,xl_standby_lock*locks);
57+
staticconstchar*get_recovery_conflict_desc(ProcSignalReasonreason);
5658

5759
/*
5860
* Keep track of all the locks owned by a given transaction.
@@ -218,31 +220,100 @@ WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
218220
return false;
219221
}
220222

223+
/*
224+
* Log the recovery conflict.
225+
*
226+
* wait_start is the timestamp when the caller started to wait.
227+
* now is the timestamp when this function has been called.
228+
* wait_list is the list of virtual transaction ids assigned to
229+
* conflicting processes.
230+
*/
231+
void
232+
LogRecoveryConflict(ProcSignalReasonreason,TimestampTzwait_start,
233+
TimestampTznow,VirtualTransactionId*wait_list)
234+
{
235+
longsecs;
236+
intusecs;
237+
longmsecs;
238+
StringInfoDatabuf;
239+
intnprocs=0;
240+
241+
TimestampDifference(wait_start,now,&secs,&usecs);
242+
msecs=secs*1000+usecs /1000;
243+
usecs=usecs %1000;
244+
245+
if (wait_list)
246+
{
247+
VirtualTransactionId*vxids;
248+
249+
/* Construct a string of list of the conflicting processes */
250+
vxids=wait_list;
251+
while (VirtualTransactionIdIsValid(*vxids))
252+
{
253+
PGPROC*proc=BackendIdGetProc(vxids->backendId);
254+
255+
/* proc can be NULL if the target backend is not active */
256+
if (proc)
257+
{
258+
if (nprocs==0)
259+
{
260+
initStringInfo(&buf);
261+
appendStringInfo(&buf,"%d",proc->pid);
262+
}
263+
else
264+
appendStringInfo(&buf,", %d",proc->pid);
265+
266+
nprocs++;
267+
}
268+
269+
vxids++;
270+
}
271+
}
272+
273+
/*
274+
* If wait_list is specified, report the list of PIDs of active
275+
* conflicting backends in a detail message. Note that if all the backends
276+
* in the list are not active, no detail message is logged.
277+
*/
278+
ereport(LOG,
279+
errmsg("recovery still waiting after %ld.%03d ms: %s",
280+
msecs,usecs,_(get_recovery_conflict_desc(reason))),
281+
nprocs>0 ?errdetail_log_plural("Conflicting process: %s.",
282+
"Conflicting processes: %s.",
283+
nprocs,buf.data) :0);
284+
285+
if (nprocs>0)
286+
pfree(buf.data);
287+
}
288+
221289
/*
222290
* This is the main executioner for any query backend that conflicts with
223291
* recovery processing. Judgement has already been passed on it within
224292
* a specific rmgr. Here we just issue the orders to the procs. The procs
225293
* then throw the required error as instructed.
226294
*
227-
* If report_waiting is true, "waiting" is reported in PS display if necessary.
228-
* If the caller has already reported that, report_waiting should be false.
229-
* Otherwise, "waiting" is reported twice unexpectedly.
295+
* If report_waiting is true, "waiting" is reported in PS display and the
296+
* wait for recovery conflict is reported in the log, if necessary. If
297+
* the caller is responsible for reporting them, report_waiting should be
298+
* false. Otherwise, both the caller and this function report the same
299+
* thing unexpectedly.
230300
*/
231301
staticvoid
232302
ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId*waitlist,
233303
ProcSignalReasonreason,uint32wait_event_info,
234304
boolreport_waiting)
235305
{
236306
TimestampTzwaitStart=0;
237-
char*new_status;
307+
char*new_status=NULL;
308+
boollogged_recovery_conflict= false;
238309

239310
/* Fast exit, to avoid a kernel call if there's no work to be done. */
240311
if (!VirtualTransactionIdIsValid(*waitlist))
241312
return;
242313

243-
if (report_waiting)
314+
/* Set the wait start timestamp for reporting */
315+
if (report_waiting&& (log_recovery_conflict_waits||update_process_title))
244316
waitStart=GetCurrentTimestamp();
245-
new_status=NULL;/* we haven't changed the ps display */
246317

247318
while (VirtualTransactionIdIsValid(*waitlist))
248319
{
@@ -252,25 +323,6 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
252323
/* wait until the virtual xid is gone */
253324
while (!VirtualXactLock(*waitlist, false))
254325
{
255-
/*
256-
* Report via ps if we have been waiting for more than 500 msec
257-
* (should that be configurable?)
258-
*/
259-
if (update_process_title&&new_status==NULL&&report_waiting&&
260-
TimestampDifferenceExceeds(waitStart,GetCurrentTimestamp(),
261-
500))
262-
{
263-
constchar*old_status;
264-
intlen;
265-
266-
old_status=get_ps_display(&len);
267-
new_status= (char*)palloc(len+8+1);
268-
memcpy(new_status,old_status,len);
269-
strcpy(new_status+len," waiting");
270-
set_ps_display(new_status);
271-
new_status[len]='\0';/* truncate off " waiting" */
272-
}
273-
274326
/* Is it time to kill it? */
275327
if (WaitExceedsMaxStandbyDelay(wait_event_info))
276328
{
@@ -289,6 +341,50 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
289341
if (pid!=0)
290342
pg_usleep(5000L);
291343
}
344+
345+
if (waitStart!=0&& (!logged_recovery_conflict||new_status==NULL))
346+
{
347+
TimestampTznow=0;
348+
boolmaybe_log_conflict;
349+
boolmaybe_update_title;
350+
351+
maybe_log_conflict= (log_recovery_conflict_waits&& !logged_recovery_conflict);
352+
maybe_update_title= (update_process_title&&new_status==NULL);
353+
354+
/* Get the current timestamp if not report yet */
355+
if (maybe_log_conflict||maybe_update_title)
356+
now=GetCurrentTimestamp();
357+
358+
/*
359+
* Report via ps if we have been waiting for more than 500
360+
* msec (should that be configurable?)
361+
*/
362+
if (maybe_update_title&&
363+
TimestampDifferenceExceeds(waitStart,now,500))
364+
{
365+
constchar*old_status;
366+
intlen;
367+
368+
old_status=get_ps_display(&len);
369+
new_status= (char*)palloc(len+8+1);
370+
memcpy(new_status,old_status,len);
371+
strcpy(new_status+len," waiting");
372+
set_ps_display(new_status);
373+
new_status[len]='\0';/* truncate off " waiting" */
374+
}
375+
376+
/*
377+
* Emit the log message if the startup process is waiting
378+
* longer than deadlock_timeout for recovery conflict on
379+
* buffer pin.
380+
*/
381+
if (maybe_log_conflict&&
382+
TimestampDifferenceExceeds(waitStart,now,DeadlockTimeout))
383+
{
384+
LogRecoveryConflict(reason,waitStart,now,waitlist);
385+
logged_recovery_conflict= true;
386+
}
387+
}
292388
}
293389

294390
/* The virtual transaction is gone now, wait for the next one */
@@ -405,9 +501,18 @@ ResolveRecoveryConflictWithDatabase(Oid dbid)
405501
* hot-standby backend processes. If deadlock_timeout is reached in
406502
* this function, all the backends holding the conflicting locks are
407503
* requested to check themselves for deadlocks.
504+
*
505+
* logging_conflict should be true if the recovery conflict has not been
506+
* logged yet even though logging is enabled. After deadlock_timeout is
507+
* reached and the request for deadlock check is sent, we wait again to
508+
* be signaled by the release of the lock if logging_conflict is false.
509+
* Otherwise we return without waiting again so that the caller can report
510+
* the recovery conflict. In this case, then, this function is called again
511+
* with logging_conflict=false (because the recovery conflict has already
512+
* been logged) and we will wait again for the lock to be released.
408513
*/
409514
void
410-
ResolveRecoveryConflictWithLock(LOCKTAGlocktag)
515+
ResolveRecoveryConflictWithLock(LOCKTAGlocktag,boollogging_conflict)
411516
{
412517
TimestampTzltime;
413518

@@ -494,6 +599,15 @@ ResolveRecoveryConflictWithLock(LOCKTAG locktag)
494599
backends++;
495600
}
496601

602+
/*
603+
* Exit if the recovery conflict has not been logged yet even though
604+
* logging is enabled, so that the caller can log that. Then
605+
* RecoveryConflictWithLock() is called again and we will wait again
606+
* for the lock to be released.
607+
*/
608+
if (logging_conflict)
609+
gotocleanup;
610+
497611
/*
498612
* Wait again here to be signaled by the release of the Relation Lock,
499613
* to prevent the subsequent RecoveryConflictWithLock() from causing
@@ -1209,3 +1323,36 @@ LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
12091323
nmsgs*sizeof(SharedInvalidationMessage));
12101324
XLogInsert(RM_STANDBY_ID,XLOG_INVALIDATIONS);
12111325
}
1326+
1327+
/* Return the description of recovery conflict */
1328+
staticconstchar*
1329+
get_recovery_conflict_desc(ProcSignalReasonreason)
1330+
{
1331+
constchar*reasonDesc=gettext_noop("unknown reason");
1332+
1333+
switch (reason)
1334+
{
1335+
casePROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
1336+
reasonDesc=gettext_noop("recovery conflict on buffer pin");
1337+
break;
1338+
casePROCSIG_RECOVERY_CONFLICT_LOCK:
1339+
reasonDesc=gettext_noop("recovery conflict on lock");
1340+
break;
1341+
casePROCSIG_RECOVERY_CONFLICT_TABLESPACE:
1342+
reasonDesc=gettext_noop("recovery conflict on tablespace");
1343+
break;
1344+
casePROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
1345+
reasonDesc=gettext_noop("recovery conflict on snapshot");
1346+
break;
1347+
casePROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
1348+
reasonDesc=gettext_noop("recovery conflict on buffer deadlock");
1349+
break;
1350+
casePROCSIG_RECOVERY_CONFLICT_DATABASE:
1351+
reasonDesc=gettext_noop("recovery conflict on database");
1352+
break;
1353+
default:
1354+
break;
1355+
}
1356+
1357+
returnreasonDesc;
1358+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp