Commit fe0972e

Add further debug info to help debug 019_replslot_limit.pl failures.
See also afdeff1. Failures after that commit provided a few more hints, but not yet enough to understand what's going on.

In 019_replslot_limit.pl shut down nodes with fast instead of immediate mode if we observe the failure mode. That should tell us whether the failures we're observing are just a timing issue under high load. PGCTLTIMEOUT should prevent buildfarm animals from hanging endlessly.

Also adds a bit more logging to replication slot drop and ShutdownPostgres().

Discussion: https://postgr.es/m/20220225192941.hqnvefgdzaro6gzg@alap3.anarazel.de
1 parent 638300f · commit fe0972e

5 files changed: +55 -1


src/backend/replication/slot.c

Lines changed: 13 additions & 0 deletions
@@ -569,6 +569,10 @@ ReplicationSlotCleanup(void)
 		if (!s->in_use)
 			continue;
 
+		/* unlocked read of active_pid is ok for debugging purposes */
+		elog(DEBUG3, "temporary replication slot cleanup: %d in use, active_pid: %d",
+			 i, s->active_pid);
+
 		SpinLockAcquire(&s->mutex);
 		if (s->active_pid == MyProcPid)
 		{
@@ -629,6 +633,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 	char		path[MAXPGPATH];
 	char		tmppath[MAXPGPATH];
 
+	/* temp debugging aid to analyze 019_replslot_limit failures */
+	elog(DEBUG3, "replication slot drop: %s: begin", NameStr(slot->data.name));
+
 	/*
 	 * If some other backend ran this code concurrently with us, we might try
 	 * to delete a slot with a certain name while someone else was trying to
@@ -679,6 +686,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 						path, tmppath)));
 	}
 
+	elog(DEBUG3, "replication slot drop: %s: removed on-disk",
+		 NameStr(slot->data.name));
+
 	/*
 	 * The slot is definitely gone.  Lock out concurrent scans of the array
 	 * long enough to kill it.  It's OK to clear the active PID here without
@@ -734,6 +744,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 	 * a slot while we're still cleaning up the detritus of the old one.
 	 */
 	LWLockRelease(ReplicationSlotAllocationLock);
+
+	elog(DEBUG3, "replication slot drop: %s: done",
+		 NameStr(slot->data.name));
 }
 
 /*

src/backend/storage/lmgr/lwlock.c

Lines changed: 7 additions & 0 deletions
@@ -1945,3 +1945,10 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
 	}
 	return false;
 }
+
+/* temp debugging aid to analyze 019_replslot_limit failures */
+int
+LWLockHeldCount(void)
+{
+	return num_held_lwlocks;
+}

src/backend/utils/init/postinit.c

Lines changed: 17 additions & 0 deletions
@@ -1262,6 +1262,23 @@ ShutdownPostgres(int code, Datum arg)
 	 * them explicitly.
 	 */
 	LockReleaseAll(USER_LOCKMETHOD, true);
+
+	/*
+	 * temp debugging aid to analyze 019_replslot_limit failures
+	 *
+	 * If an error were thrown outside of a transaction nothing up to now
+	 * would have released lwlocks. We probably will add an
+	 * LWLockReleaseAll(). But for now make it easier to understand such cases
+	 * by warning if any lwlocks are held.
+	 */
+#ifdef USE_ASSERT_CHECKING
+	{
+		int			held_lwlocks = LWLockHeldCount();
+		if (held_lwlocks)
+			elog(WARNING, "holding %d lwlocks at the end of ShutdownPostgres()",
+				 held_lwlocks);
+	}
+#endif
 }
 
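Note that this check is compiled only in assert-enabled builds (USE_ASSERT_CHECKING), and if the WARNING ever fires it ends up in the node's server log rather than in the TAP output. A sketch of how a test could scan for it, assuming the standard TAP utilities (slurp_file from PostgreSQL::Test::Utils) and reusing the message text from the elog() above:

	# Sketch, not part of this commit: fail if any backend warned about
	# lwlocks still held at the end of ShutdownPostgres().
	use PostgreSQL::Test::Utils;
	my $log = slurp_file($node_primary3->logfile);
	unlike($log, qr/holding \d+ lwlocks at the end of ShutdownPostgres/,
		"no lwlocks held at backend shutdown");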
src/include/storage/lwlock.h

Lines changed: 1 addition & 0 deletions
@@ -121,6 +121,7 @@ extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
 extern void LWLockReleaseAll(void);
 extern bool LWLockHeldByMe(LWLock *lock);
 extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
+extern int	LWLockHeldCount(void);
 
 extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
 extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value);
src/test/recovery/t/019_replslot_limit.pl

Lines changed: 17 additions & 1 deletion
@@ -335,7 +335,23 @@
 $node_primary3->wait_for_catchup($node_standby3);
 my $senderpid = $node_primary3->safe_psql('postgres',
 	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
-like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+
+# We've seen occasional cases where multiple walsender pids are active. An
+# immediate shutdown may hide evidence of a locking bug. So if multiple
+# walsenders are observed, shut down in fast mode, and collect some more
+# information.
+if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"))
+{
+	my ($stdout, $stderr);
+	$node_primary3->psql('postgres',
+		"\\a\\t\nSELECT * FROM pg_stat_activity",
+		stdout => \$stdout, stderr => \$stderr);
+	diag $stdout, $stderr;
+	$node_primary3->stop('fast');
+	$node_standby3->stop('fast');
+	die "could not determine walsender pid, can't continue";
+}
+
 my $receiverpid = $node_standby3->safe_psql('postgres',
 	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
 like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");
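The if (not like(...)) pattern above works because Test::More's like() returns the test's pass/fail result in addition to reporting it, so the failure branch can collect diagnostics before dying. A self-contained sketch of the same idea (the pid value is illustrative):

	use Test::More;
	my $pid = "1234 5678";    # illustrative: two walsender pids at once
	# like() reports the test and returns true only if the match passed,
	# so a failed match can gate extra diagnostics.
	if (not like($pid, qr/^[0-9]+$/, "have walsender pid $pid"))
	{
		diag "unexpected walsender pid value: $pid";
	}
	done_testing();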
