use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use Test::More;

# Test that vacuum prunes away all dead tuples killed before OldestXmin
#
# This test creates a table on a primary, updates the table to generate dead
# tuples for vacuum, and then, during the vacuum, uses the replica to force
# GlobalVisState->maybe_needed on the primary to move backwards and precede
# the value of OldestXmin set at the beginning of vacuuming the table.

# Set up nodes
my $node_primary = PostgreSQL::Test::Cluster->new('primary');
$node_primary->init(allows_streaming => 'physical');

$node_primary->append_conf(
	'postgresql.conf', qq[
hot_standby_feedback = on
log_recovery_conflict_waits = true
autovacuum = off
log_min_messages = INFO
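# Minimum value; forces multiple rounds of index vacuuming for the test table.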
maintenance_work_mem = 1024
]);
$node_primary->start;

my $node_replica = PostgreSQL::Test::Cluster->new('standby');

$node_primary->backup('my_backup');
$node_replica->init_from_backup($node_primary, 'my_backup',
	has_streaming => 1);

$node_replica->start;

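# Create a dedicated test database used by the sessions and queries below.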
my $test_db = "test_db";
$node_primary->safe_psql('postgres', "CREATE DATABASE $test_db");

# Save the original connection info for later use
my $orig_conninfo = $node_primary->connstr();

my $table1 = "vac_horizon_floor_table";

# Long-running Primary Session A
my $psql_primaryA =
	$node_primary->background_psql($test_db, on_error_stop => 1);

# Long-running Primary Session B
my $psql_primaryB =
	$node_primary->background_psql($test_db, on_error_stop => 1);

# The TIDStore vacuum uses to store dead items is optimized for its target
# system. On a 32-bit system, our example requires twice as many pages with
# the same number of dead items per page to fill the TIDStore and trigger a
# second round of index vacuuming.
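# int8 is pass-by-value (typbyval = 't') only on builds with 8-byte Datums,
# so this tells us whether the server is a 64-bit build.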
my $is_64bit = $node_primary->safe_psql($test_db,
	qq[SELECT typbyval FROM pg_type WHERE typname = 'int8';]);

my $nrows = $is_64bit eq 't' ? 400000 : 800000;

# Because vacuum's first pass, pruning, is where we use the GlobalVisState to
# check tuple visibility, GlobalVisState->maybe_needed must move backwards
# during pruning before checking the visibility for a tuple which would have
# been considered HEAPTUPLE_DEAD prior to maybe_needed moving backwards but
# HEAPTUPLE_RECENTLY_DEAD compared to the new, older value of maybe_needed.
#
# We must not only force the horizon on the primary to move backwards but also
# force the vacuuming backend's GlobalVisState to be updated. GlobalVisState
# is forced to update during index vacuuming.
#
# _bt_pendingfsm_finalize() calls GetOldestNonRemovableTransactionId() at the
# end of a round of index vacuuming, updating the backend's GlobalVisState
# and, in our case, moving maybe_needed backwards.
#
# Then vacuum's first (pruning) pass will continue and pruning will find our
# later inserted and updated tuple HEAPTUPLE_RECENTLY_DEAD when compared to
# maybe_needed but HEAPTUPLE_DEAD when compared to OldestXmin.
#
# Thus, we must force at least two rounds of index vacuuming to ensure that
# some tuple visibility checks will happen after a round of index vacuuming.
# To accomplish this, we set maintenance_work_mem to its minimum value and
# insert and update enough rows that we force at least one round of index
# vacuuming before getting to a dead tuple which was killed after the standby
# is disconnected.
$node_primary->safe_psql($test_db, qq[
	CREATE TABLE ${table1}(col1 int)
		WITH (autovacuum_enabled=false, fillfactor=10);
	INSERT INTO $table1 VALUES(7);
	INSERT INTO $table1 SELECT generate_series(1, $nrows) % 3;
	CREATE INDEX on ${table1}(col1);
	UPDATE $table1 SET col1 = 3 WHERE col1 = 0;
	INSERT INTO $table1 VALUES(7);
]);

# We will later move the primary forward while the standby is disconnected.
# For now, however, there is no reason not to wait for the standby to catch
# up.
my $primary_lsn = $node_primary->lsn('flush');
$node_primary->wait_for_catchup($node_replica, 'replay', $primary_lsn);

# Test that the WAL receiver is up and running.
$node_replica->poll_query_until($test_db, qq[
	select exists (select * from pg_stat_wal_receiver);], 't');

# Set primary_conninfo to something invalid on the replica and reload the
# config. Once the config is reloaded, the startup process will force the WAL
# receiver to restart and it will be unable to reconnect because of the
# invalid connection information.
$node_replica->safe_psql($test_db, qq[
	ALTER SYSTEM SET primary_conninfo = '';
	SELECT pg_reload_conf();
]);

# Wait until the WAL receiver has shut down and been unable to start up again.
$node_replica->poll_query_until($test_db, qq[
	select exists (select * from pg_stat_wal_receiver);], 'f');

# Now insert and update a tuple which will be visible to the vacuum on the
# primary but which will have xmax newer than the oldest xmin on the standby
# that was recently disconnected.
my $res = $psql_primaryA->query_safe(
	qq[
	INSERT INTO $table1 VALUES (99);
	UPDATE $table1 SET col1 = 100 WHERE col1 = 99;
	SELECT 'after_update';
	]
);

# Make sure the UPDATE finished
like($res, qr/^after_update$/m, "UPDATE occurred on primary session A");

# Open a cursor on the primary whose pin will keep VACUUM from getting a
# cleanup lock on the first page of the relation. We want VACUUM to be able to
# start, calculate initial values for OldestXmin and GlobalVisState and then
# be unable to proceed with pruning our dead tuples. This will allow us to
# reconnect the standby and push the horizon back before we start actual
# pruning and vacuuming.
my $primary_cursor1 = "vac_horizon_floor_cursor1";

# The first value inserted into the table was a 7, so FETCH FORWARD should
# return a 7. That's how we know the cursor has a pin.
$res = $psql_primaryB->query_safe(
	qq[
	BEGIN;
	DECLARE $primary_cursor1 CURSOR FOR SELECT * FROM $table1 WHERE col1 = 7;
	FETCH $primary_cursor1;
	]
);

is($res, 7, qq[Cursor query returned $res. Expected value 7.]);

# Get the PID of the session which will run the VACUUM FREEZE so that we can
# use it to filter pg_stat_activity later.
my $vacuum_pid = $psql_primaryA->query_safe("SELECT pg_backend_pid();");

# Now start a VACUUM FREEZE on the primary. It will call vacuum_get_cutoffs()
# and establish values of OldestXmin and GlobalVisState which are newer than
# all of our dead tuples. Then it will be unable to get a cleanup lock to
# start pruning, so it will hang.
#
# We use VACUUM FREEZE because it will wait for a cleanup lock instead of
# skipping the page pinned by the cursor. Note that this works because the
# target tuple's xmax precedes OldestXmin, which ensures that
# lazy_scan_noprune() will return false and we will wait for the cleanup lock.
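#
# Write the VACUUM command directly to the session's stdin rather than using
# query_safe(), since the VACUUM is expected to block on the pinned buffer and
# we must not wait here for it to complete.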
$psql_primaryA->{stdin} .= qq[
	VACUUM (VERBOSE, FREEZE) $table1;
	\\echo VACUUM
];

# Make sure the VACUUM command makes it to the server.
$psql_primaryA->{run}->pump_nb();

# Make sure that the VACUUM has already called vacuum_get_cutoffs() and is
# just waiting on the lock to start vacuuming. We don't want the standby to
# re-establish a connection to the primary and push the horizon back until
# we've saved initial values in GlobalVisState and calculated OldestXmin.
$node_primary->poll_query_until($test_db,
	qq[
	SELECT count(*) >= 1 FROM pg_stat_activity
		WHERE pid = $vacuum_pid
		AND wait_event = 'BufferPin';
	],
	't');

# Ensure the WAL receiver is still not active on the replica.
$node_replica->poll_query_until($test_db, qq[
	SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);], 'f');

# Allow the WAL receiver connection to re-establish.
$node_replica->safe_psql(
	$test_db, qq[
	ALTER SYSTEM SET primary_conninfo = '$orig_conninfo';
	SELECT pg_reload_conf();
]);

# Ensure the new WAL receiver has connected.
$node_replica->poll_query_until($test_db, qq[
	SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);], 't');

# Once the WAL sender is shown on the primary, the replica should have
# connected with the primary and pushed the horizon backward. Primary Session
# A won't see that until the VACUUM FREEZE proceeds and does its first round
# of index vacuuming.
$node_primary->poll_query_until($test_db, qq[
	SELECT EXISTS (SELECT * FROM pg_stat_replication);], 't');

# Move the cursor forward to the next 7. We inserted the 7 much later, so
# advancing the cursor should allow vacuum to proceed vacuuming most pages of
# the relation. Because we set maintenance_work_mem sufficiently low, we
# expect that a round of index vacuuming has happened and that the vacuum is
# now waiting for the cursor to release its pin on the last page of the
# relation.
$res = $psql_primaryB->query_safe("FETCH $primary_cursor1");
is($res, 7,
	qq[Cursor query returned $res from second fetch. Expected value 7.]);

# Prevent the test from incorrectly passing by confirming that we did indeed
# do a pass of index vacuuming.
$node_primary->poll_query_until($test_db, qq[
	SELECT index_vacuum_count > 0
	FROM pg_stat_progress_vacuum
	WHERE datname='$test_db' AND relid::regclass = '$table1'::regclass;
], 't');

# Commit the transaction with the open cursor so that the VACUUM can finish.
$psql_primaryB->query_until(
	qr/^commit$/m,
	qq[
	COMMIT;
	\\echo commit
	]
);

# VACUUM proceeds with pruning and does a visibility check on each tuple. In
# older versions of Postgres, pruning found our final dead tuple
# non-removable (HEAPTUPLE_RECENTLY_DEAD) since its xmax is after the new
# value of maybe_needed. Then heap_prepare_freeze_tuple() would decide the
# tuple xmax should be frozen because it precedes OldestXmin. Vacuum would
# then error out in heap_pre_freeze_checks() with "cannot freeze committed
| 239 | +# xmax". This was fixed by changing pruning to find all |
| 240 | +# HEAPTUPLE_RECENTLY_DEAD tuples with xmaxes preceding OldestXmin |
| 241 | +# HEAPTUPLE_DEAD and removing them. |

# With the fix, VACUUM should finish successfully, incrementing the table
# vacuum_count.
$node_primary->poll_query_until($test_db,
	qq[
	SELECT vacuum_count > 0
	FROM pg_stat_all_tables WHERE relname = '${table1}';
	],
	't');

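# Capture the primary's current flush LSN so that we can confirm below that
# the replica replays past it without a recovery conflict.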
$primary_lsn = $node_primary->lsn('flush');

# Make sure something causes us to flush
$node_primary->safe_psql($test_db, "INSERT INTO $table1 VALUES (1);");

# Nothing on the replica should cause a recovery conflict, so this should
# finish successfully.
$node_primary->wait_for_catchup($node_replica, 'replay', $primary_lsn);

## Shut down psqls
$psql_primaryA->quit;
$psql_primaryB->quit;

$node_replica->stop();
$node_primary->stop();

done_testing();