From 3dcd01e5e68c08dea6e898fc30dfee15b0a8207c Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Thu, 13 Nov 2014 10:46:09 +0100 Subject: [PATCH] MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction The retry of an event group in parallel replication set the wrong value for the end log position of the event that was retried (qev->future_event_relay_log_pos). It was too large by the size of the event, so it pointed into the middle of the following event. If the retry happened in the very last event of the event group, _and_ the SQL thread was stopped just after successfully retrying that event, then the SQL threads's relay log position would be left incorrect. Restarting the SQL thread could then try to read events from a garbage offset in the relay log, usually leading to an error about not being able to read the event. --- .../suite/rpl/r/rpl_parallel_retry.result | 46 ++++++++++++++++ .../suite/rpl/t/rpl_parallel_retry.test | 53 ++++++++++++++++++- sql/rpl_parallel.cc | 14 ++++- 3 files changed, 111 insertions(+), 2 deletions(-) diff --git a/mysql-test/suite/rpl/r/rpl_parallel_retry.result b/mysql-test/suite/rpl/r/rpl_parallel_retry.result index cd12d92430b..12e630b3bfa 100644 --- a/mysql-test/suite/rpl/r/rpl_parallel_retry.result +++ b/mysql-test/suite/rpl/r/rpl_parallel_retry.result @@ -188,6 +188,52 @@ a LENGTH(b) 3 5012 4 5000 SET GLOBAL max_relay_log_size=@old_max; +*** MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction *** +include/stop_slave.inc +BEGIN; +INSERT INTO t1 VALUES (100, 0); +INSERT INTO t1 VALUES (101, 0); +INSERT INTO t1 VALUES (102, 0); +INSERT INTO t1 VALUES (103, 0); +COMMIT; +SELECT * FROM t1 WHERE a >= 100 ORDER BY a; +a b +100 0 +101 0 +102 0 +103 0 +SET @old_dbug= @@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_xid"; +include/start_slave.inc +SET GLOBAL debug_dbug=@old_dbug; +retries +1 +SELECT * FROM t1 WHERE a >= 100 ORDER BY a; +a b +100 0 +101 0 +102 0 +103 0 +include/stop_slave_sql.inc +INSERT INTO t1 VALUES (104, 1); +INSERT INTO t1 VALUES (105, 1); +INSERT INTO t1 VALUES (106, 1); +INSERT INTO t1 VALUES (107, 1); +INSERT INTO t1 VALUES (108, 1); +INSERT INTO t1 VALUES (109, 1); +include/start_slave.inc +SELECT * FROM t1 WHERE a >= 100 ORDER BY a; +a b +100 0 +101 0 +102 0 +103 0 +104 1 +105 1 +106 1 +107 1 +108 1 +109 1 include/stop_slave.inc SET GLOBAL slave_parallel_threads=@old_parallel_threads; include/start_slave.inc diff --git a/mysql-test/suite/rpl/t/rpl_parallel_retry.test b/mysql-test/suite/rpl/t/rpl_parallel_retry.test index d3be6262cb0..6afe11dfa2e 100644 --- a/mysql-test/suite/rpl/t/rpl_parallel_retry.test +++ b/mysql-test/suite/rpl/t/rpl_parallel_retry.test @@ -135,7 +135,6 @@ SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100,rpl_parall let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1); START SLAVE; --let $slave_sql_errno= 1213 ---let $slave_timeout= 10 --source include/wait_for_slave_sql_error.inc SET GLOBAL debug_dbug=@old_dbug; let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1); @@ -208,6 +207,58 @@ SELECT a, LENGTH(b) FROM t2 ORDER BY a; SET GLOBAL max_relay_log_size=@old_max; +--echo *** MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction *** + +--connection server_2 +--source include/stop_slave.inc + +--connection server_1 +BEGIN; +INSERT INTO t1 VALUES (100, 0); +INSERT INTO t1 VALUES (101, 0); +INSERT INTO t1 VALUES (102, 0); +INSERT INTO t1 VALUES (103, 0); +COMMIT; +SELECT * FROM t1 WHERE a >= 100 ORDER BY a; +--save_master_pos + +--connection server_2 +# Inject a DBUG error insert to cause the XID event of the single transaction +# from the master to fail with a deadlock error and be retried. +# The bug was that the retry of the XID would leave the relay log position +# incorrect (off by the size of XID event). +SET @old_dbug= @@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_xid"; +let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1); +--source include/start_slave.inc +--sync_with_master +SET GLOBAL debug_dbug=@old_dbug; +let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1); +--disable_query_log +eval SELECT $new_retry - $old_retry AS retries; +--enable_query_log + +SELECT * FROM t1 WHERE a >= 100 ORDER BY a; +# Stop the SQL thread. When the bug was there to give the incorrect relay log +# position, the restart of the SQL thread would read garbage data from the +# middle of an event and fail with relay log IO error. +--source include/stop_slave_sql.inc + +--connection server_1 +INSERT INTO t1 VALUES (104, 1); +INSERT INTO t1 VALUES (105, 1); +INSERT INTO t1 VALUES (106, 1); +INSERT INTO t1 VALUES (107, 1); +INSERT INTO t1 VALUES (108, 1); +INSERT INTO t1 VALUES (109, 1); +--save_master_pos + +--connection server_2 +--source include/start_slave.inc +--sync_with_master +SELECT * FROM t1 WHERE a >= 100 ORDER BY a; + + --connection server_2 --source include/stop_slave.inc SET GLOBAL slave_parallel_threads=@old_parallel_threads; diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 154a8c0f98f..267f40b7cf2 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -446,7 +446,7 @@ do_retry: ev->thd= thd; mysql_mutex_lock(&rpt->LOCK_rpl_thread); - qev= rpt->retry_get_qev(ev, orig_qev, log_name, cur_offset, + qev= rpt->retry_get_qev(ev, orig_qev, log_name, old_offset, cur_offset - old_offset); mysql_mutex_unlock(&rpt->LOCK_rpl_thread); if (!qev) @@ -776,6 +776,18 @@ handle_rpl_parallel_thread(void *arg) if (likely(!rgi->worker_error) && !skip_event_group) { ++rgi->retry_event_count; +#ifndef DBUG_OFF + err= 0; + DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_xid", + if (event_type == XID_EVENT) + { + thd->clear_error(); + thd->get_stmt_da()->reset_diagnostics_area(); + my_error(ER_LOCK_DEADLOCK, MYF(0)); + err= 1; + }); + if (!err) +#endif err= rpt_handle_event(qev, rpt); delete_or_keep_event_post_apply(rgi, event_type, qev->ev); DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100",