diff --git a/mysql-test/suite/rpl/r/rpl_parallel.result b/mysql-test/suite/rpl/r/rpl_parallel.result index 7ceb5ee6622..3c66a541cc1 100644 --- a/mysql-test/suite/rpl/r/rpl_parallel.result +++ b/mysql-test/suite/rpl/r/rpl_parallel.result @@ -1136,6 +1136,80 @@ SET GLOBAL debug_dbug=@old_dbug; SET GLOBAL slave_parallel_threads=0; SET GLOBAL slave_parallel_threads=10; include/start_slave.inc +*** MDEV-7335: Potential parallel slave deadlock with specific binlog corruption *** +include/stop_slave.inc +SET GLOBAL slave_parallel_threads=1; +SET @old_dbug= @@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000"; +INSERT INTO t2 VALUES (101); +INSERT INTO t2 VALUES (102); +INSERT INTO t2 VALUES (103); +INSERT INTO t2 VALUES (104); +INSERT INTO t2 VALUES (105); +SET gtid_seq_no=1000; +INSERT INTO t2 VALUES (106); +INSERT INTO t2 VALUES (107); +INSERT INTO t2 VALUES (108); +INSERT INTO t2 VALUES (109); +INSERT INTO t2 VALUES (110); +INSERT INTO t2 VALUES (111); +INSERT INTO t2 VALUES (112); +INSERT INTO t2 VALUES (113); +INSERT INTO t2 VALUES (114); +INSERT INTO t2 VALUES (115); +INSERT INTO t2 VALUES (116); +INSERT INTO t2 VALUES (117); +INSERT INTO t2 VALUES (118); +INSERT INTO t2 VALUES (119); +INSERT INTO t2 VALUES (120); +INSERT INTO t2 VALUES (121); +INSERT INTO t2 VALUES (122); +INSERT INTO t2 VALUES (123); +INSERT INTO t2 VALUES (124); +INSERT INTO t2 VALUES (125); +INSERT INTO t2 VALUES (126); +INSERT INTO t2 VALUES (127); +INSERT INTO t2 VALUES (128); +INSERT INTO t2 VALUES (129); +INSERT INTO t2 VALUES (130); +include/save_master_gtid.inc +include/start_slave.inc +include/sync_with_master_gtid.inc +SELECT * FROM t2 WHERE a >= 100 ORDER BY a; +a +101 +102 +103 +104 +105 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +include/stop_slave.inc +SET GLOBAL debug_dbug=@old_dbug; +SET GLOBAL slave_parallel_threads=10; +include/start_slave.inc include/stop_slave.inc SET GLOBAL slave_parallel_threads=@old_parallel_threads; include/start_slave.inc diff --git a/mysql-test/suite/rpl/t/rpl_parallel.test b/mysql-test/suite/rpl/t/rpl_parallel.test index d4b99d4b0f7..7397ede14b3 100644 --- a/mysql-test/suite/rpl/t/rpl_parallel.test +++ b/mysql-test/suite/rpl/t/rpl_parallel.test @@ -1843,6 +1843,62 @@ SET GLOBAL slave_parallel_threads=10; --source include/start_slave.inc +--echo *** MDEV-7335: Potential parallel slave deadlock with specific binlog corruption *** + +--connection server_2 +--source include/stop_slave.inc +SET GLOBAL slave_parallel_threads=1; +SET @old_dbug= @@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000"; + +--connection server_1 +INSERT INTO t2 VALUES (101); +INSERT INTO t2 VALUES (102); +INSERT INTO t2 VALUES (103); +INSERT INTO t2 VALUES (104); +INSERT INTO t2 VALUES (105); +# Inject a partial event group (missing XID at the end). The bug was that such +# partial group was not handled appropriately, leading to server deadlock. +SET gtid_seq_no=1000; +INSERT INTO t2 VALUES (106); +INSERT INTO t2 VALUES (107); +INSERT INTO t2 VALUES (108); +INSERT INTO t2 VALUES (109); +INSERT INTO t2 VALUES (110); +INSERT INTO t2 VALUES (111); +INSERT INTO t2 VALUES (112); +INSERT INTO t2 VALUES (113); +INSERT INTO t2 VALUES (114); +INSERT INTO t2 VALUES (115); +INSERT INTO t2 VALUES (116); +INSERT INTO t2 VALUES (117); +INSERT INTO t2 VALUES (118); +INSERT INTO t2 VALUES (119); +INSERT INTO t2 VALUES (120); +INSERT INTO t2 VALUES (121); +INSERT INTO t2 VALUES (122); +INSERT INTO t2 VALUES (123); +INSERT INTO t2 VALUES (124); +INSERT INTO t2 VALUES (125); +INSERT INTO t2 VALUES (126); +INSERT INTO t2 VALUES (127); +INSERT INTO t2 VALUES (128); +INSERT INTO t2 VALUES (129); +INSERT INTO t2 VALUES (130); +--source include/save_master_gtid.inc + +--connection server_2 +--source include/start_slave.inc +--source include/sync_with_master_gtid.inc +# The partial event group (a=106) should be rolled back and thus missing. +SELECT * FROM t2 WHERE a >= 100 ORDER BY a; + +--source include/stop_slave.inc +SET GLOBAL debug_dbug=@old_dbug; +SET GLOBAL slave_parallel_threads=10; +--source include/start_slave.inc + + # Clean up. --connection server_2 --source include/stop_slave.inc diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 46c3e4aaaf4..c6bb974f62f 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -640,7 +640,7 @@ handle_rpl_parallel_thread(void *arg) } DBUG_ASSERT(qev->typ==rpl_parallel_thread::queued_event::QUEUED_EVENT); - thd->rgi_slave= group_rgi= rgi; + thd->rgi_slave= rgi; gco= rgi->gco; /* Handle a new event group, which will be initiated by a GTID event. */ if ((event_type= qev->ev->get_type_code()) == GTID_EVENT) @@ -657,6 +657,21 @@ handle_rpl_parallel_thread(void *arg) } }); + if(unlikely(thd->wait_for_commit_ptr) && group_rgi != NULL) + { + /* + This indicates that we get a new GTID event in the middle of + a not completed event group. This is corrupt binlog (the master + will never write such binlog), so it does not happen unless + someone tries to inject wrong crafted binlog, but let us still + try to handle it somewhat nicely. + */ + group_rgi->cleanup_context(thd, true); + finish_event_group(rpt, group_rgi->gtid_sub_id, + group_rgi->parallel_entry, group_rgi); + rpt->loc_free_rgi(group_rgi); + } + in_event_group= true; /* If the standalone flag is set, then this event group consists of a @@ -742,19 +757,6 @@ handle_rpl_parallel_thread(void *arg) unlock_or_exit_cond(thd, &entry->LOCK_parallel_entry, &did_enter_cond, &old_stage); - if(thd->wait_for_commit_ptr) - { - /* - This indicates that we get a new GTID event in the middle of - a not completed event group. This is corrupt binlog (the master - will never write such binlog), so it does not happen unless - someone tries to inject wrong crafted binlog, but let us still - try to handle it somewhat nicely. - */ - rgi->cleanup_context(thd, true); - thd->wait_for_commit_ptr->unregister_wait_for_prior_commit(); - thd->wait_for_commit_ptr->wakeup_subsequent_commits(rgi->worker_error); - } thd->wait_for_commit_ptr= &rgi->commit_orderer; if (opt_gtid_ignore_duplicates) @@ -780,6 +782,7 @@ handle_rpl_parallel_thread(void *arg) } } + group_rgi= rgi; group_ending= is_group_ending(qev->ev, event_type); if (group_ending && likely(!rgi->worker_error)) { diff --git a/sql/slave.cc b/sql/slave.cc index ba56ff54d5a..4635b575204 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -5648,6 +5648,18 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len) } break; +#ifndef DBUG_OFF + case XID_EVENT: + DBUG_EXECUTE_IF("slave_discard_xid_for_gtid_0_x_1000", + { + /* Inject an event group that is missing its XID commit event. */ + if (mi->last_queued_gtid.domain_id == 0 && + mi->last_queued_gtid.seq_no == 1000) + goto skip_relay_logging; + }); + /* Fall through to default case ... */ +#endif + default: default_action: if (mi->using_gtid != Master_info::USE_GTID_NO && mi->gtid_event_seen)