Commit fe0972e

Add further debug info to help debug 019_replslot_limit.pl failures.
See also afdeff1. Failures after that commit provided a few more hints, but not yet enough to understand what's going on.

In 019_replslot_limit.pl shut down nodes with fast instead of immediate mode if we observe the failure mode. That should tell us whether the failures we're observing are just a timing issue under high load. PGCTLTIMEOUT should prevent buildfarm animals from hanging endlessly.

Also adds a bit more logging to replication slot drop and ShutdownPostgres().

Discussion: https://postgr.es/m/20220225192941.hqnvefgdzaro6gzg@alap3.anarazel.de
1 parent 638300f · commit fe0972e

5 files changed: +55 -1


src/backend/replication/slot.c

Lines changed: 13 additions & 0 deletions
@@ -569,6 +569,10 @@ ReplicationSlotCleanup(void)
 		if (!s->in_use)
 			continue;
 
+		/* unlocked read of active_pid is ok for debugging purposes */
+		elog(DEBUG3, "temporary replication slot cleanup: %d in use, active_pid: %d",
+			 i, s->active_pid);
+
 		SpinLockAcquire(&s->mutex);
 		if (s->active_pid == MyProcPid)
 		{
@@ -629,6 +633,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 	char		path[MAXPGPATH];
 	char		tmppath[MAXPGPATH];
 
+	/* temp debugging aid to analyze 019_replslot_limit failures */
+	elog(DEBUG3, "replication slot drop: %s: begin", NameStr(slot->data.name));
+
 	/*
 	 * If some other backend ran this code concurrently with us, we might try
 	 * to delete a slot with a certain name while someone else was trying to
@@ -679,6 +686,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 						path, tmppath)));
 	}
 
+	elog(DEBUG3, "replication slot drop: %s: removed on-disk",
+		 NameStr(slot->data.name));
+
 	/*
 	 * The slot is definitely gone.  Lock out concurrent scans of the array
 	 * long enough to kill it.  It's OK to clear the active PID here without
@@ -734,6 +744,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 	 * a slot while we're still cleaning up the detritus of the old one.
 	 */
 	LWLockRelease(ReplicationSlotAllocationLock);
+
+	elog(DEBUG3, "replication slot drop: %s: done",
+		 NameStr(slot->data.name));
 }
 
 /*

src/backend/storage/lmgr/lwlock.c

Lines changed: 7 additions & 0 deletions
@@ -1945,3 +1945,10 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
 	}
 	return false;
 }
+
+/* temp debugging aid to analyze 019_replslot_limit failures */
+int
+LWLockHeldCount(void)
+{
+	return num_held_lwlocks;
+}

src/backend/utils/init/postinit.c

Lines changed: 17 additions & 0 deletions
@@ -1262,6 +1262,23 @@ ShutdownPostgres(int code, Datum arg)
 	 * them explicitly.
 	 */
 	LockReleaseAll(USER_LOCKMETHOD, true);
+
+	/*
+	 * temp debugging aid to analyze 019_replslot_limit failures
+	 *
+	 * If an error were thrown outside of a transaction nothing up to now
+	 * would have released lwlocks. We probably will add an
+	 * LWLockReleaseAll(). But for now make it easier to understand such cases
+	 * by warning if any lwlocks are held.
+	 */
+#ifdef USE_ASSERT_CHECKING
+	{
+		int			held_lwlocks = LWLockHeldCount();
+		if (held_lwlocks)
+			elog(WARNING, "holding %d lwlocks at the end of ShutdownPostgres()",
+				 held_lwlocks);
+	}
+#endif
 }
 
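Note that this check is compiled only in assert-enabled builds (USE_ASSERT_CHECKING), and if the WARNING ever fires it ends up in the node's server log rather than in the TAP output. A sketch of how a test could scan for it, assuming the standard TAP utilities (slurp_file from PostgreSQL::Test::Utils) and reusing the message text from the elog() above:

	# Sketch, not part of this commit: fail if any backend warned about
	# lwlocks still held at the end of ShutdownPostgres().
	use PostgreSQL::Test::Utils;
	my $log = slurp_file($node_primary3->logfile);
	unlike($log, qr/holding \d+ lwlocks at the end of ShutdownPostgres/,
		"no lwlocks held at backend shutdown");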
src/include/storage/lwlock.h

Lines changed: 1 addition & 0 deletions
@@ -121,6 +121,7 @@ extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
 extern void LWLockReleaseAll(void);
 extern bool LWLockHeldByMe(LWLock *lock);
 extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
+extern int	LWLockHeldCount(void);
 
 extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
 extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value);
src/test/recovery/t/019_replslot_limit.pl

Lines changed: 17 additions & 1 deletion
@@ -335,7 +335,23 @@
 $node_primary3->wait_for_catchup($node_standby3);
 my $senderpid = $node_primary3->safe_psql('postgres',
 	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
-like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+
+# We've seen occasional cases where multiple walsender pids are active. An
+# immediate shutdown may hide evidence of a locking bug. So if multiple
+# walsenders are observed, shut down in fast mode, and collect some more
+# information.
+if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"))
+{
+	my ($stdout, $stderr);
+	$node_primary3->psql('postgres',
+		"\\a\\t\nSELECT * FROM pg_stat_activity",
+		stdout => \$stdout, stderr => \$stderr);
+	diag $stdout, $stderr;
+	$node_primary3->stop('fast');
+	$node_standby3->stop('fast');
+	die "could not determine walsender pid, can't continue";
+}
+
 my $receiverpid = $node_standby3->safe_psql('postgres',
 	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
 like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");
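The if (not like(...)) pattern above works because Test::More's like() returns the test's pass/fail result in addition to reporting it, so the failure branch can collect diagnostics before dying. A self-contained sketch of the same idea (the pid value is illustrative):

	use Test::More;
	my $pid = "1234 5678";    # illustrative: two walsender pids at once
	# like() reports the test and returns true only if the match passed,
	# so a failed match can gate extra diagnostics.
	if (not like($pid, qr/^[0-9]+$/, "have walsender pid $pid"))
	{
		diag "unexpected walsender pid value: $pid";
	}
	done_testing();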
