MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction

The retry of an event group in parallel replication set the wrong value for
the end log position of the event that was retried
(qev->future_event_relay_log_pos). It was too large by the size of the event,
so it pointed into the middle of the following event.

If the retry happened in the very last event of the event group, _and_ the SQL
thread was stopped just after successfully retrying that event, then the SQL
threads's relay log position would be left incorrect. Restarting the SQL
thread could then try to read events from a garbage offset in the relay log,
usually leading to an error about not being able to read the event.
This commit is contained in:
Kristian Nielsen 2014-11-13 10:46:09 +01:00
parent d08b893b39
commit 3dcd01e5e6
3 changed files with 111 additions and 2 deletions

View File

@ -188,6 +188,52 @@ a LENGTH(b)
3 5012
4 5000
SET GLOBAL max_relay_log_size=@old_max;
*** MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction ***
include/stop_slave.inc
BEGIN;
INSERT INTO t1 VALUES (100, 0);
INSERT INTO t1 VALUES (101, 0);
INSERT INTO t1 VALUES (102, 0);
INSERT INTO t1 VALUES (103, 0);
COMMIT;
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
a b
100 0
101 0
102 0
103 0
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_xid";
include/start_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
retries
1
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
a b
100 0
101 0
102 0
103 0
include/stop_slave_sql.inc
INSERT INTO t1 VALUES (104, 1);
INSERT INTO t1 VALUES (105, 1);
INSERT INTO t1 VALUES (106, 1);
INSERT INTO t1 VALUES (107, 1);
INSERT INTO t1 VALUES (108, 1);
INSERT INTO t1 VALUES (109, 1);
include/start_slave.inc
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
a b
100 0
101 0
102 0
103 0
104 1
105 1
106 1
107 1
108 1
109 1
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
include/start_slave.inc

View File

@ -135,7 +135,6 @@ SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100,rpl_parall
let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
START SLAVE;
--let $slave_sql_errno= 1213
--let $slave_timeout= 10
--source include/wait_for_slave_sql_error.inc
SET GLOBAL debug_dbug=@old_dbug;
let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
@ -208,6 +207,58 @@ SELECT a, LENGTH(b) FROM t2 ORDER BY a;
SET GLOBAL max_relay_log_size=@old_max;
--echo *** MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction ***
--connection server_2
--source include/stop_slave.inc
--connection server_1
BEGIN;
INSERT INTO t1 VALUES (100, 0);
INSERT INTO t1 VALUES (101, 0);
INSERT INTO t1 VALUES (102, 0);
INSERT INTO t1 VALUES (103, 0);
COMMIT;
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
--save_master_pos
--connection server_2
# Inject a DBUG error insert to cause the XID event of the single transaction
# from the master to fail with a deadlock error and be retried.
# The bug was that the retry of the XID would leave the relay log position
# incorrect (off by the size of XID event).
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_xid";
let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
--source include/start_slave.inc
--sync_with_master
SET GLOBAL debug_dbug=@old_dbug;
let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
--disable_query_log
eval SELECT $new_retry - $old_retry AS retries;
--enable_query_log
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
# Stop the SQL thread. When the bug was there to give the incorrect relay log
# position, the restart of the SQL thread would read garbage data from the
# middle of an event and fail with relay log IO error.
--source include/stop_slave_sql.inc
--connection server_1
INSERT INTO t1 VALUES (104, 1);
INSERT INTO t1 VALUES (105, 1);
INSERT INTO t1 VALUES (106, 1);
INSERT INTO t1 VALUES (107, 1);
INSERT INTO t1 VALUES (108, 1);
INSERT INTO t1 VALUES (109, 1);
--save_master_pos
--connection server_2
--source include/start_slave.inc
--sync_with_master
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;

View File

@ -446,7 +446,7 @@ do_retry:
ev->thd= thd;
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
qev= rpt->retry_get_qev(ev, orig_qev, log_name, cur_offset,
qev= rpt->retry_get_qev(ev, orig_qev, log_name, old_offset,
cur_offset - old_offset);
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
if (!qev)
@ -776,6 +776,18 @@ handle_rpl_parallel_thread(void *arg)
if (likely(!rgi->worker_error) && !skip_event_group)
{
++rgi->retry_event_count;
#ifndef DBUG_OFF
err= 0;
DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_xid",
if (event_type == XID_EVENT)
{
thd->clear_error();
thd->get_stmt_da()->reset_diagnostics_area();
my_error(ER_LOCK_DEADLOCK, MYF(0));
err= 1;
});
if (!err)
#endif
err= rpt_handle_event(qev, rpt);
delete_or_keep_event_post_apply(rgi, event_type, qev->ev);
DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100",