diff --git a/mysql-test/suite/rpl/r/rpl_gtid_crash.result b/mysql-test/suite/rpl/r/rpl_gtid_crash.result index 45482e3b03e..debd107221f 100644 --- a/mysql-test/suite/rpl/r/rpl_gtid_crash.result +++ b/mysql-test/suite/rpl/r/rpl_gtid_crash.result @@ -112,5 +112,98 @@ SHOW VARIABLES like 'gtid_strict_mode'; Variable_name Value gtid_strict_mode ON include/start_slave.inc +*** MDEV-6462: Incorrect recovery on a slave reconnecting to crashed master *** +set sql_log_bin= 0; +call mtr.add_suppression("Error writing file 'master-bin'"); +set sql_log_bin= 1; +set sql_log_bin= 0; +call mtr.add_suppression("The server_id of master server changed in the middle of GTID"); +call mtr.add_suppression("Unexpected change of master binlog file name in the middle of GTID"); +set sql_log_bin= 1; +SET GLOBAL debug_dbug="+d,inject_error_writing_xid"; +BEGIN; +INSERT INTO t1 VALUES (11); +COMMIT; +ERROR HY000: Error writing file 'master-bin' (errno: 11 "Resource temporarily unavailable") +SET GLOBAL debug_dbug="+d,crash_dispatch_command_before"; +COMMIT; +Got one of the listed errors +SELECT @@GLOBAL.server_id; +@@GLOBAL.server_id +3 +SELECT * from t1 WHERE a > 10 ORDER BY a; +a +# Wait 30 seconds for SQL thread to catch up with IO thread +SELECT * from t1 WHERE a > 10 ORDER BY a; +a +# Repeat this with additional transactions on the master +SET GLOBAL debug_dbug="+d,inject_error_writing_xid"; +BEGIN; +INSERT INTO t1 VALUES (12); +COMMIT; +ERROR HY000: Error writing file 'master-bin' (errno: 11 "Resource temporarily unavailable") +SET GLOBAL debug_dbug="+d,crash_dispatch_command_before"; +COMMIT; +Got one of the listed errors +SELECT @@GLOBAL.server_id; +@@GLOBAL.server_id +1 +INSERT INTO t1 VALUES (13); +INSERT INTO t1 VALUES (14); +SELECT * from t1 WHERE a > 10 ORDER BY a; +a +13 +14 +include/save_master_gtid.inc +include/sync_with_master_gtid.inc +SELECT * from t1 WHERE a > 10 ORDER BY a; +a +13 +14 +SET GLOBAL debug_dbug="+d,inject_error_writing_xid"; +BEGIN; +INSERT INTO t1 VALUES (21); +COMMIT; +ERROR HY000: Error writing file 'master-bin' (errno: 11 "Resource temporarily unavailable") +SET GLOBAL debug_dbug="+d,crash_dispatch_command_before"; +COMMIT; +Got one of the listed errors +SELECT @@GLOBAL.server_id; +@@GLOBAL.server_id +1 +SELECT * from t1 WHERE a > 10 ORDER BY a; +a +13 +14 +# Wait 30 seconds for SQL thread to catch up with IO thread +SELECT * from t1 WHERE a > 10 ORDER BY a; +a +13 +14 +# Repeat this with additional transactions on the master +SET GLOBAL debug_dbug="+d,inject_error_writing_xid"; +BEGIN; +INSERT INTO t1 VALUES (22); +COMMIT; +ERROR HY000: Error writing file 'master-bin' (errno: 11 "Resource temporarily unavailable") +SET GLOBAL debug_dbug="+d,crash_dispatch_command_before"; +COMMIT; +Got one of the listed errors +INSERT INTO t1 VALUES (23); +INSERT INTO t1 VALUES (24); +SELECT * from t1 WHERE a > 10 ORDER BY a; +a +13 +14 +23 +24 +include/save_master_gtid.inc +include/sync_with_master_gtid.inc +SELECT * from t1 WHERE a > 10 ORDER BY a; +a +13 +14 +23 +24 DROP TABLE t1; include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_gtid_crash.test b/mysql-test/suite/rpl/t/rpl_gtid_crash.test index 82667b31e57..0caad2a12fe 100644 --- a/mysql-test/suite/rpl/t/rpl_gtid_crash.test +++ b/mysql-test/suite/rpl/t/rpl_gtid_crash.test @@ -294,6 +294,192 @@ SHOW VARIABLES like 'gtid_strict_mode'; eval SET GLOBAL gtid_strict_mode= $old_gtid_strict; --enable_query_log + +--echo *** MDEV-6462: Incorrect recovery on a slave reconnecting to crashed master *** + +--connection server_1 +set sql_log_bin= 0; +call mtr.add_suppression("Error writing file 'master-bin'"); +set sql_log_bin= 1; +--connection server_2 +set sql_log_bin= 0; +call mtr.add_suppression("The server_id of master server changed in the middle of GTID"); +call mtr.add_suppression("Unexpected change of master binlog file name in the middle of GTID"); +set sql_log_bin= 1; + +--connection server_1 +--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +wait +EOF + +SET GLOBAL debug_dbug="+d,inject_error_writing_xid"; +BEGIN; +INSERT INTO t1 VALUES (11); +--error ER_ERROR_ON_WRITE +COMMIT; +SET GLOBAL debug_dbug="+d,crash_dispatch_command_before"; +--error 2006,2013 +COMMIT; + +--source include/wait_until_disconnected.inc + +# Simulate that we reconnect to a different server (new server_id). +--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +restart: --server-id=3 +EOF + +--enable_reconnect +--source include/wait_until_connected_again.inc + +SELECT @@GLOBAL.server_id; +SELECT * from t1 WHERE a > 10 ORDER BY a; + +--echo # Wait 30 seconds for SQL thread to catch up with IO thread +--connection server_2 +--let $wait_timeout= 300 +while ($wait_timeout != 0) +{ + --let $read_log_pos= query_get_value('SHOW SLAVE STATUS', Read_Master_Log_Pos, 1) + --let $exec_log_pos= query_get_value('SHOW SLAVE STATUS', Exec_Master_Log_Pos, 1) + if ($read_log_pos == $exec_log_pos) + { + --let $wait_timeout= 0 + } + if ($read_log_pos != $exec_log_pos) + { + --sleep 0.1 + --dec $wait_timeout + } +} +if ($read_log_pos != $exec_log_pos) +{ + --die Timeout wait for SQL thread to catch up with IO thread +} + +SELECT * from t1 WHERE a > 10 ORDER BY a; + +--echo # Repeat this with additional transactions on the master + +--connection server_1 +--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +wait +EOF + +SET GLOBAL debug_dbug="+d,inject_error_writing_xid"; +BEGIN; +INSERT INTO t1 VALUES (12); +--error ER_ERROR_ON_WRITE +COMMIT; +SET GLOBAL debug_dbug="+d,crash_dispatch_command_before"; +--error 2006,2013 +COMMIT; + +--source include/wait_until_disconnected.inc + +# Simulate that we reconnect to a different server (new server_id). +--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +restart: --server-id=1 +EOF + +--enable_reconnect +--source include/wait_until_connected_again.inc + +SELECT @@GLOBAL.server_id; +INSERT INTO t1 VALUES (13); +INSERT INTO t1 VALUES (14); +SELECT * from t1 WHERE a > 10 ORDER BY a; +--source include/save_master_gtid.inc + +--connection server_2 +--source include/sync_with_master_gtid.inc +SELECT * from t1 WHERE a > 10 ORDER BY a; + +--connection server_1 +--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +wait +EOF + +SET GLOBAL debug_dbug="+d,inject_error_writing_xid"; +BEGIN; +INSERT INTO t1 VALUES (21); +--error ER_ERROR_ON_WRITE +COMMIT; +SET GLOBAL debug_dbug="+d,crash_dispatch_command_before"; +--error 2006,2013 +COMMIT; + +--source include/wait_until_disconnected.inc + +# Simulate that we reconnect to the same server (same server_id). +--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +restart +EOF + +--enable_reconnect +--source include/wait_until_connected_again.inc + +SELECT @@GLOBAL.server_id; +SELECT * from t1 WHERE a > 10 ORDER BY a; + +--echo # Wait 30 seconds for SQL thread to catch up with IO thread +--connection server_2 +--let $wait_timeout= 300 +while ($wait_timeout != 0) +{ + --let $read_log_pos= query_get_value('SHOW SLAVE STATUS', Read_Master_Log_Pos, 1) + --let $exec_log_pos= query_get_value('SHOW SLAVE STATUS', Exec_Master_Log_Pos, 1) + if ($read_log_pos == $exec_log_pos) + { + --let $wait_timeout= 0 + } + if ($read_log_pos != $exec_log_pos) + { + --sleep 0.1 + --dec $wait_timeout + } +} +if ($read_log_pos != $exec_log_pos) +{ + --die Timeout wait for SQL thread to catch up with IO thread +} + +SELECT * from t1 WHERE a > 10 ORDER BY a; + +--echo # Repeat this with additional transactions on the master + +--connection server_1 +--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +wait +EOF + +SET GLOBAL debug_dbug="+d,inject_error_writing_xid"; +BEGIN; +INSERT INTO t1 VALUES (22); +--error ER_ERROR_ON_WRITE +COMMIT; +SET GLOBAL debug_dbug="+d,crash_dispatch_command_before"; +--error 2006,2013 +COMMIT; + +--source include/wait_until_disconnected.inc + +--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +restart +EOF + +--enable_reconnect +--source include/wait_until_connected_again.inc + +INSERT INTO t1 VALUES (23); +INSERT INTO t1 VALUES (24); +SELECT * from t1 WHERE a > 10 ORDER BY a; +--source include/save_master_gtid.inc + +--connection server_2 +--source include/sync_with_master_gtid.inc +SELECT * from t1 WHERE a > 10 ORDER BY a; + + --connection server_1 DROP TABLE t1; diff --git a/sql/log.cc b/sql/log.cc index 71975cd83b7..e6d9b6d15cb 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -7469,6 +7469,13 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry, } } + DBUG_EXECUTE_IF("inject_error_writing_xid", + { + entry->error_cache= NULL; + entry->commit_errno= 28; + DBUG_RETURN(ER_ERROR_ON_WRITE); + }); + if (entry->end_event->write(&log_file)) { entry->error_cache= NULL; diff --git a/sql/rpl_mi.cc b/sql/rpl_mi.cc index 7764400becb..f2c39ea7f8d 100644 --- a/sql/rpl_mi.cc +++ b/sql/rpl_mi.cc @@ -38,6 +38,7 @@ Master_info::Master_info(LEX_STRING *connection_name_arg, connect_retry(DEFAULT_CONNECT_RETRY), inited(0), abort_slave(0), slave_running(0), slave_run_id(0), sync_counter(0), heartbeat_period(0), received_heartbeats(0), master_id(0), + prev_master_id(0), using_gtid(USE_GTID_NO), events_queued_since_last_gtid(0), gtid_reconnect_event_skip_count(0), gtid_event_seen(false) { diff --git a/sql/rpl_mi.h b/sql/rpl_mi.h index af739b1dad4..ebb1b541728 100644 --- a/sql/rpl_mi.h +++ b/sql/rpl_mi.h @@ -135,6 +135,12 @@ class Master_info : public Slave_reporting_capability ulonglong received_heartbeats; // counter of received heartbeat events DYNAMIC_ARRAY ignore_server_ids; ulong master_id; + /* + At reconnect and until the first rotate event is seen, prev_master_id is + the value of master_id during the previous connection, used to detect + silent change of master server during reconnects. + */ + ulong prev_master_id; /* Which kind of GTID position (if any) is used when connecting to master. diff --git a/sql/slave.cc b/sql/slave.cc index ae3cb86e8be..9422ecc7f22 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -5265,6 +5265,86 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len) event_len - BINLOG_CHECKSUM_LEN : event_len, mi->rli.relay_log.description_event_for_queue); + if (unlikely(mi->gtid_reconnect_event_skip_count) && + unlikely(!mi->gtid_event_seen) && + rev.is_artificial_event() && + (mi->prev_master_id != mi->master_id || + strcmp(rev.new_log_ident, mi->master_log_name) != 0)) + { + /* + Artificial Rotate_log_event is the first event we receive at the start + of each master binlog file. It gives the name of the new binlog file. + + Normally, we already have this name from the real rotate event at the + end of the previous binlog file (unless we are making a new connection + using GTID). But if the master server restarted/crashed, there is no + rotate event at the end of the prior binlog file, so the name is new. + + We use this fact to handle a special case of master crashing. If the + master crashed while writing the binlog, it might end with a partial + event group lacking the COMMIT/XID event, which must be rolled + back. If the slave IO thread happens to get a disconnect in the middle + of exactly this event group, it will try to reconnect at the same GTID + and skip already fetched events. However, that GTID did not commit on + the master before the crash, so it does not really exist, and the + master will connect the slave at the next following GTID starting in + the next binlog. This could confuse the slave and make it mix the + start of one event group with the end of another. + + But we detect this case here, by noticing the change of binlog name + which detects the missing rotate event at the end of the previous + binlog file. In this case, we reset the counters to make us not skip + the next event group, and queue an artificial Format Description + event. The previously fetched incomplete event group will then be + rolled back when the Format Description event is executed by the SQL + thread. + + A similar case is if the reconnect somehow connects to a different + master server (like due to a network proxy or IP address takeover). + We detect this case by noticing a change of server_id and in this + case likewise rollback the partially received event group. + */ + Format_description_log_event fdle(4); + + if (mi->prev_master_id != mi->master_id) + sql_print_warning("The server_id of master server changed in the " + "middle of GTID %u-%u-%llu. Assuming a change of " + "master server, so rolling back the previously " + "received partial transaction. Expected: %lu, " + "received: %lu", mi->last_queued_gtid.domain_id, + mi->last_queued_gtid.server_id, + mi->last_queued_gtid.seq_no, + mi->prev_master_id, mi->master_id); + else if (strcmp(rev.new_log_ident, mi->master_log_name) != 0) + sql_print_warning("Unexpected change of master binlog file name in the " + "middle of GTID %u-%u-%llu, assuming that master has " + "crashed and rolling back the transaction. Expected: " + "'%s', received: '%s'", + mi->last_queued_gtid.domain_id, + mi->last_queued_gtid.server_id, + mi->last_queued_gtid.seq_no, + mi->master_log_name, rev.new_log_ident); + + mysql_mutex_lock(log_lock); + if (likely(!fdle.write(rli->relay_log.get_log_file()) && + !rli->relay_log.flush_and_sync(NULL))) + { + rli->relay_log.harvest_bytes_written(&rli->log_space_total); + } + else + { + error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE; + mysql_mutex_unlock(log_lock); + goto err; + } + rli->relay_log.signal_update(); + mysql_mutex_unlock(log_lock); + + mi->gtid_reconnect_event_skip_count= 0; + mi->events_queued_since_last_gtid= 0; + } + mi->prev_master_id= mi->master_id; + if (unlikely(process_io_rotate(mi, &rev))) { error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;