From ee895583126f6991ae885b03c2b832b4dd8ab24c Mon Sep 17 00:00:00 2001 From: Brandon Nesterenko Date: Thu, 8 Feb 2024 09:55:02 -0700 Subject: [PATCH] MDEV-14357: rpl.rpl_domain_id_filter_io_crash failed in buildbot with wrong result MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There was a race condition with the SQL thread: depending on whether it was killed before or after it had executed the fake/generated IGN_GTIDS Gtid_list_log_event, gtid_slave_pos may or may not have been updated with the position of the ignored events. The slave would then be restarted with IGNORE_DOMAIN_IDS reset to be empty, which would result in the slave requesting different starting locations, depending on whether or not gtid_slave_pos was updated. Because previously ignored events could now be requested and executed (no longer ignored), their presence would fail the test. This patch fixes this in two ways. First, it uses GTID positions for synchronization rather than binlog file positions. Second, it synchronizes the SQL thread’s gtid_slave_pos with the ignored events before killing the SQL thread. 
To consistently reproduce the test failure, the following patch can be applied: diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc index f51f5b7deec..de62233acff 100644 --- a/sql/log_event_server.cc +++ b/sql/log_event_server.cc @@ -3686,6 +3686,12 @@ Gtid_list_log_event::do_apply_event(rpl_group_info *rgi) void *hton= NULL; uint32 i; + sleep(1); + if (rli->sql_driver_thd->killed || rli->abort_slave) + { + return 0; + } + Reviewed By: ============ Kristian Nielsen --- .../r/rpl_domain_id_filter_io_crash.result | 27 ++++++++ .../rpl/t/rpl_domain_id_filter_io_crash.test | 66 ++++++++++++++----- 2 files changed, 75 insertions(+), 18 deletions(-) diff --git a/mysql-test/suite/rpl/r/rpl_domain_id_filter_io_crash.result b/mysql-test/suite/rpl/r/rpl_domain_id_filter_io_crash.result index 5250c4bb36a..9d3b2ccdab7 100644 --- a/mysql-test/suite/rpl/r/rpl_domain_id_filter_io_crash.result +++ b/mysql-test/suite/rpl/r/rpl_domain_id_filter_io_crash.result @@ -11,6 +11,7 @@ SELECT * FROM t1; i 1 connection slave; +include/save_master_gtid.inc connection slave; call mtr.add_suppression("Slave I/O: Relay log write failure: could not queue event from master.*"); # Case 0 : Start slave with IGNORE_DOMAIN_IDS=(), then restart @@ -24,6 +25,7 @@ DO_DOMAIN_IDS (BEFORE) : IGNORE_DOMAIN_IDS (BEFORE) : CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos; include/start_slave.inc +include/sync_with_master_gtid.inc DO_DOMAIN_IDS (AFTER) : IGNORE_DOMAIN_IDS (AFTER) : SET @saved_dbug = @@GLOBAL.debug_dbug; @@ -33,6 +35,7 @@ START TRANSACTION; INSERT INTO t1 VALUES(2); INSERT INTO t1 VALUES(3); COMMIT; +include/save_master_gtid.inc SELECT * FROM t1; i 1 @@ -46,6 +49,7 @@ i SET @@global.debug_dbug=@saved_dbug; START SLAVE io_thread; include/wait_for_slave_io_to_start.inc +include/sync_with_master_gtid.inc SELECT * FROM t1; i 1 @@ -59,6 +63,7 @@ DO_DOMAIN_IDS (BEFORE) : IGNORE_DOMAIN_IDS (BEFORE) : CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos; 
include/start_slave.inc +include/sync_with_master_gtid.inc DO_DOMAIN_IDS (AFTER) : IGNORE_DOMAIN_IDS (AFTER) : 1 SET @@global.debug_dbug="d,kill_slave_io_before_commit"; @@ -67,6 +72,7 @@ START TRANSACTION; INSERT INTO t1 VALUES(4); INSERT INTO t1 VALUES(5); COMMIT; +include/save_master_gtid.inc SELECT * FROM t1; i 1 @@ -84,6 +90,7 @@ i SET @@global.debug_dbug=@saved_dbug; START SLAVE io_thread; include/wait_for_slave_io_to_start.inc +include/sync_with_master_gtid.inc SELECT * FROM t1; i 1 @@ -97,6 +104,7 @@ DO_DOMAIN_IDS (BEFORE) : IGNORE_DOMAIN_IDS (BEFORE) : 1 CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos; include/start_slave.inc +include/sync_with_master_gtid.inc DO_DOMAIN_IDS (AFTER) : IGNORE_DOMAIN_IDS (AFTER) : SET @@global.debug_dbug="d,kill_slave_io_before_commit"; @@ -114,6 +122,7 @@ START TRANSACTION; INSERT INTO t1 VALUES(10); INSERT INTO t1 VALUES(11); COMMIT; +include/save_master_gtid.inc SELECT * FROM t1; i 1 @@ -140,6 +149,7 @@ DO_DOMAIN_IDS (BEFORE) : IGNORE_DOMAIN_IDS (BEFORE) : CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos; include/start_slave.inc +include/sync_with_master_gtid.inc DO_DOMAIN_IDS (AFTER) : IGNORE_DOMAIN_IDS (AFTER) : 1 SELECT * FROM t1; @@ -157,6 +167,7 @@ DO_DOMAIN_IDS (BEFORE) : IGNORE_DOMAIN_IDS (BEFORE) : 1 CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos; include/start_slave.inc +include/sync_with_master_gtid.inc DO_DOMAIN_IDS (AFTER) : IGNORE_DOMAIN_IDS (AFTER) : 1 SET @@global.debug_dbug="d,kill_slave_io_before_commit"; @@ -166,6 +177,7 @@ START TRANSACTION; INSERT INTO t1 VALUES(12); INSERT INTO t1 VALUES(13); COMMIT; +include/save_master_gtid.inc START TRANSACTION; INSERT INTO t1 VALUES(14); INSERT INTO t1 VALUES(15); @@ -204,11 +216,16 @@ i 10 11 SET @@global.debug_dbug=@saved_dbug; +include/sync_with_master_gtid.inc include/stop_slave_sql.inc DO_DOMAIN_IDS (BEFORE) : IGNORE_DOMAIN_IDS (BEFORE) : 1 CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos; 
+connection master; +include/save_master_gtid.inc +connection slave; include/start_slave.inc +include/sync_with_master_gtid.inc DO_DOMAIN_IDS (AFTER) : IGNORE_DOMAIN_IDS (AFTER) : SELECT * FROM t1; @@ -230,6 +247,7 @@ DO_DOMAIN_IDS (BEFORE) : IGNORE_DOMAIN_IDS (BEFORE) : CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos; include/start_slave.inc +include/sync_with_master_gtid.inc DO_DOMAIN_IDS (AFTER) : IGNORE_DOMAIN_IDS (AFTER) : 1 SET @@global.debug_dbug="d,kill_slave_io_after_2_events"; @@ -239,6 +257,7 @@ START TRANSACTION; INSERT INTO t1 VALUES(18); INSERT INTO t1 VALUES(19); COMMIT; +include/save_master_gtid.inc START TRANSACTION; INSERT INTO t1 VALUES(20); INSERT INTO t1 VALUES(21); @@ -287,11 +306,16 @@ i 16 17 SET @@global.debug_dbug=@saved_dbug; +include/sync_with_master_gtid.inc include/stop_slave_sql.inc DO_DOMAIN_IDS (BEFORE) : IGNORE_DOMAIN_IDS (BEFORE) : 1 CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos; +connection master; +include/save_master_gtid.inc +connection slave; include/start_slave.inc +include/sync_with_master_gtid.inc DO_DOMAIN_IDS (AFTER) : IGNORE_DOMAIN_IDS (AFTER) : SELECT * FROM t1; @@ -317,6 +341,7 @@ DO_DOMAIN_IDS (BEFORE) : IGNORE_DOMAIN_IDS (BEFORE) : CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos; include/start_slave.inc +include/sync_with_master_gtid.inc DO_DOMAIN_IDS (AFTER) : IGNORE_DOMAIN_IDS (AFTER) : SET @@global.debug_dbug="d,kill_slave_io_after_2_events"; @@ -335,6 +360,7 @@ START TRANSACTION; INSERT INTO t1 VALUES(28); INSERT INTO t1 VALUES(29); COMMIT; +include/save_master_gtid.inc SELECT * FROM t1; i 1 @@ -389,6 +415,7 @@ DO_DOMAIN_IDS (BEFORE) : IGNORE_DOMAIN_IDS (BEFORE) : CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos; include/start_slave.inc +include/sync_with_master_gtid.inc DO_DOMAIN_IDS (AFTER) : IGNORE_DOMAIN_IDS (AFTER) : 1 SELECT * FROM t1; diff --git a/mysql-test/suite/rpl/t/rpl_domain_id_filter_io_crash.test 
b/mysql-test/suite/rpl/t/rpl_domain_id_filter_io_crash.test index 95fac6c2edb..c7ef3a4eff9 100644 --- a/mysql-test/suite/rpl/t/rpl_domain_id_filter_io_crash.test +++ b/mysql-test/suite/rpl/t/rpl_domain_id_filter_io_crash.test @@ -9,6 +9,7 @@ CREATE TABLE t1(i INT) ENGINE=INNODB; INSERT INTO t1 VALUES(1); SELECT * FROM t1; sync_slave_with_master; +--source include/save_master_gtid.inc connection slave; @@ -28,7 +29,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos; --source include/start_slave.inc -sync_with_master; +--source include/sync_with_master_gtid.inc let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1); let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1); @@ -44,7 +45,7 @@ START TRANSACTION; INSERT INTO t1 VALUES(2); INSERT INTO t1 VALUES(3); COMMIT; -save_master_pos; +--source include/save_master_gtid.inc SELECT * FROM t1; connection slave; @@ -55,7 +56,7 @@ SET @@global.debug_dbug=@saved_dbug; START SLAVE io_thread; --source include/wait_for_slave_io_to_start.inc -sync_with_master; +--source include/sync_with_master_gtid.inc SELECT * FROM t1; --echo # Case 1 : Start slave with IGNORE_DOMAIN_IDS=(1), then restart @@ -70,7 +71,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos; --source include/start_slave.inc -sync_with_master; +--source include/sync_with_master_gtid.inc let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1); let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1); @@ -86,7 +87,7 @@ INSERT INTO t1 VALUES(4); INSERT INTO t1 VALUES(5); COMMIT; -save_master_pos; +--source include/save_master_gtid.inc SELECT * FROM t1; connection slave; @@ -97,7 +98,7 @@ SET @@global.debug_dbug=@saved_dbug; 
START SLAVE io_thread; --source include/wait_for_slave_io_to_start.inc -sync_with_master; +--source include/sync_with_master_gtid.inc SELECT * FROM t1; --echo # Case 2 : Start slave with IGNORE_DOMAIN_IDS=(), then restart @@ -112,7 +113,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos; --source include/start_slave.inc -sync_with_master; +--source include/sync_with_master_gtid.inc let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1); let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1); @@ -140,7 +141,7 @@ INSERT INTO t1 VALUES(10); INSERT INTO t1 VALUES(11); COMMIT; -save_master_pos; +--source include/save_master_gtid.inc SELECT * FROM t1; connection slave; @@ -157,7 +158,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos; --source include/start_slave.inc -sync_with_master; +--source include/sync_with_master_gtid.inc let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1); let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1); @@ -178,7 +179,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos; --source include/start_slave.inc -sync_with_master; +--source include/sync_with_master_gtid.inc let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1); let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1); @@ -196,6 +197,15 @@ INSERT INTO t1 VALUES(12); INSERT INTO t1 VALUES(13); COMMIT; # IO thread gets killed here. 
+# MDEV-14357 +# As the prior transaction will be ignored on slave because its domain id is +# ignored, the replica's gtid_slave_pos will be updated to have seen it, +# despite its eventual failure to queue the whole transaction to the relay log. +# So for test consistency, we need to synchronize the SQL thread with this +# position; otherwise, when restarting the server after resetting +# IGNORE_DOMAIN_IDS, we will re-fetch this event and execute it. +--source include/save_master_gtid.inc + START TRANSACTION; INSERT INTO t1 VALUES(14); INSERT INTO t1 VALUES(15); @@ -207,7 +217,6 @@ INSERT INTO t1 VALUES(16); INSERT INTO t1 VALUES(17); COMMIT; -save_master_pos; SELECT * FROM t1; connection slave; @@ -217,6 +226,11 @@ SELECT * FROM t1; SET @@global.debug_dbug=@saved_dbug; +# MDEV-14357 +# Ensure the SQL thread is updated with the GTID of the ignored transaction +# so we don't fetch it and execute it after restarting without any ignored +# domain ids. +--source include/sync_with_master_gtid.inc --source include/stop_slave_sql.inc let $do_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1); let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1); @@ -224,8 +238,12 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno --echo IGNORE_DOMAIN_IDS (BEFORE) : $ignore_domain_ids_before CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos; + +--connection master +--source include/save_master_gtid.inc +--connection slave --source include/start_slave.inc -sync_with_master; +--source include/sync_with_master_gtid.inc let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1); let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1); @@ -246,7 +264,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos; 
--source include/start_slave.inc -sync_with_master; +--source include/sync_with_master_gtid.inc let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1); let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1); @@ -264,6 +282,11 @@ INSERT INTO t1 VALUES(18); INSERT INTO t1 VALUES(19); # IO thread gets killed here. COMMIT; +# MDEV-14357 +# Synchronize gtid_slave_pos with the ignored event. See prior comments about +# MDEV-14357 for details. +--source include/save_master_gtid.inc + START TRANSACTION; INSERT INTO t1 VALUES(20); INSERT INTO t1 VALUES(21); @@ -275,7 +298,6 @@ INSERT INTO t1 VALUES(22); INSERT INTO t1 VALUES(23); COMMIT; -save_master_pos; SELECT * FROM t1; connection slave; @@ -285,6 +307,10 @@ SELECT * FROM t1; SET @@global.debug_dbug=@saved_dbug; +# MDEV-14357 +# Synchronize gtid_slave_pos with the ignored event. See prior comments about +# MDEV-14357 for details. +--source include/sync_with_master_gtid.inc --source include/stop_slave_sql.inc let $do_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1); let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1); @@ -292,8 +318,12 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno --echo IGNORE_DOMAIN_IDS (BEFORE) : $ignore_domain_ids_before CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos; + +--connection master +--source include/save_master_gtid.inc +--connection slave --source include/start_slave.inc -sync_with_master; +--source include/sync_with_master_gtid.inc let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1); let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1); @@ -314,7 +344,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos; 
--source include/start_slave.inc -sync_with_master; +--source include/sync_with_master_gtid.inc let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1); let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1); @@ -343,7 +373,7 @@ INSERT INTO t1 VALUES(28); INSERT INTO t1 VALUES(29); COMMIT; -save_master_pos; +--source include/save_master_gtid.inc SELECT * FROM t1; connection slave; @@ -361,7 +391,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos; --source include/start_slave.inc -sync_with_master; +--source include/sync_with_master_gtid.inc let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1); let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);