MDEV-7335: Potential parallel slave deadlock with specific binlog corruption
If the COMMIT or XID event at the end of an event group was somehow missing, the parallel replication code meant to handle this case was insufficient, leading to a server deadlock.
This commit is contained in:
parent
79e9ff44d1
commit
a227cf8046
@ -1136,6 +1136,80 @@ SET GLOBAL debug_dbug=@old_dbug;
|
||||
SET GLOBAL slave_parallel_threads=0;
|
||||
SET GLOBAL slave_parallel_threads=10;
|
||||
include/start_slave.inc
|
||||
*** MDEV-7335: Potential parallel slave deadlock with specific binlog corruption ***
|
||||
include/stop_slave.inc
|
||||
SET GLOBAL slave_parallel_threads=1;
|
||||
SET @old_dbug= @@GLOBAL.debug_dbug;
|
||||
SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
|
||||
INSERT INTO t2 VALUES (101);
|
||||
INSERT INTO t2 VALUES (102);
|
||||
INSERT INTO t2 VALUES (103);
|
||||
INSERT INTO t2 VALUES (104);
|
||||
INSERT INTO t2 VALUES (105);
|
||||
SET gtid_seq_no=1000;
|
||||
INSERT INTO t2 VALUES (106);
|
||||
INSERT INTO t2 VALUES (107);
|
||||
INSERT INTO t2 VALUES (108);
|
||||
INSERT INTO t2 VALUES (109);
|
||||
INSERT INTO t2 VALUES (110);
|
||||
INSERT INTO t2 VALUES (111);
|
||||
INSERT INTO t2 VALUES (112);
|
||||
INSERT INTO t2 VALUES (113);
|
||||
INSERT INTO t2 VALUES (114);
|
||||
INSERT INTO t2 VALUES (115);
|
||||
INSERT INTO t2 VALUES (116);
|
||||
INSERT INTO t2 VALUES (117);
|
||||
INSERT INTO t2 VALUES (118);
|
||||
INSERT INTO t2 VALUES (119);
|
||||
INSERT INTO t2 VALUES (120);
|
||||
INSERT INTO t2 VALUES (121);
|
||||
INSERT INTO t2 VALUES (122);
|
||||
INSERT INTO t2 VALUES (123);
|
||||
INSERT INTO t2 VALUES (124);
|
||||
INSERT INTO t2 VALUES (125);
|
||||
INSERT INTO t2 VALUES (126);
|
||||
INSERT INTO t2 VALUES (127);
|
||||
INSERT INTO t2 VALUES (128);
|
||||
INSERT INTO t2 VALUES (129);
|
||||
INSERT INTO t2 VALUES (130);
|
||||
include/save_master_gtid.inc
|
||||
include/start_slave.inc
|
||||
include/sync_with_master_gtid.inc
|
||||
SELECT * FROM t2 WHERE a >= 100 ORDER BY a;
|
||||
a
|
||||
101
|
||||
102
|
||||
103
|
||||
104
|
||||
105
|
||||
107
|
||||
108
|
||||
109
|
||||
110
|
||||
111
|
||||
112
|
||||
113
|
||||
114
|
||||
115
|
||||
116
|
||||
117
|
||||
118
|
||||
119
|
||||
120
|
||||
121
|
||||
122
|
||||
123
|
||||
124
|
||||
125
|
||||
126
|
||||
127
|
||||
128
|
||||
129
|
||||
130
|
||||
include/stop_slave.inc
|
||||
SET GLOBAL debug_dbug=@old_dbug;
|
||||
SET GLOBAL slave_parallel_threads=10;
|
||||
include/start_slave.inc
|
||||
include/stop_slave.inc
|
||||
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
|
||||
include/start_slave.inc
|
||||
|
@ -1843,6 +1843,62 @@ SET GLOBAL slave_parallel_threads=10;
|
||||
--source include/start_slave.inc
|
||||
|
||||
|
||||
--echo *** MDEV-7335: Potential parallel slave deadlock with specific binlog corruption ***
|
||||
|
||||
--connection server_2
|
||||
--source include/stop_slave.inc
|
||||
SET GLOBAL slave_parallel_threads=1;
|
||||
SET @old_dbug= @@GLOBAL.debug_dbug;
|
||||
SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
|
||||
|
||||
--connection server_1
|
||||
INSERT INTO t2 VALUES (101);
|
||||
INSERT INTO t2 VALUES (102);
|
||||
INSERT INTO t2 VALUES (103);
|
||||
INSERT INTO t2 VALUES (104);
|
||||
INSERT INTO t2 VALUES (105);
|
||||
# Inject a partial event group (missing XID at the end). The bug was that such
|
||||
# partial group was not handled appropriately, leading to server deadlock.
|
||||
SET gtid_seq_no=1000;
|
||||
INSERT INTO t2 VALUES (106);
|
||||
INSERT INTO t2 VALUES (107);
|
||||
INSERT INTO t2 VALUES (108);
|
||||
INSERT INTO t2 VALUES (109);
|
||||
INSERT INTO t2 VALUES (110);
|
||||
INSERT INTO t2 VALUES (111);
|
||||
INSERT INTO t2 VALUES (112);
|
||||
INSERT INTO t2 VALUES (113);
|
||||
INSERT INTO t2 VALUES (114);
|
||||
INSERT INTO t2 VALUES (115);
|
||||
INSERT INTO t2 VALUES (116);
|
||||
INSERT INTO t2 VALUES (117);
|
||||
INSERT INTO t2 VALUES (118);
|
||||
INSERT INTO t2 VALUES (119);
|
||||
INSERT INTO t2 VALUES (120);
|
||||
INSERT INTO t2 VALUES (121);
|
||||
INSERT INTO t2 VALUES (122);
|
||||
INSERT INTO t2 VALUES (123);
|
||||
INSERT INTO t2 VALUES (124);
|
||||
INSERT INTO t2 VALUES (125);
|
||||
INSERT INTO t2 VALUES (126);
|
||||
INSERT INTO t2 VALUES (127);
|
||||
INSERT INTO t2 VALUES (128);
|
||||
INSERT INTO t2 VALUES (129);
|
||||
INSERT INTO t2 VALUES (130);
|
||||
--source include/save_master_gtid.inc
|
||||
|
||||
--connection server_2
|
||||
--source include/start_slave.inc
|
||||
--source include/sync_with_master_gtid.inc
|
||||
# The partial event group (a=106) should be rolled back and thus missing.
|
||||
SELECT * FROM t2 WHERE a >= 100 ORDER BY a;
|
||||
|
||||
--source include/stop_slave.inc
|
||||
SET GLOBAL debug_dbug=@old_dbug;
|
||||
SET GLOBAL slave_parallel_threads=10;
|
||||
--source include/start_slave.inc
|
||||
|
||||
|
||||
# Clean up.
|
||||
--connection server_2
|
||||
--source include/stop_slave.inc
|
||||
|
@ -640,7 +640,7 @@ handle_rpl_parallel_thread(void *arg)
|
||||
}
|
||||
DBUG_ASSERT(qev->typ==rpl_parallel_thread::queued_event::QUEUED_EVENT);
|
||||
|
||||
thd->rgi_slave= group_rgi= rgi;
|
||||
thd->rgi_slave= rgi;
|
||||
gco= rgi->gco;
|
||||
/* Handle a new event group, which will be initiated by a GTID event. */
|
||||
if ((event_type= qev->ev->get_type_code()) == GTID_EVENT)
|
||||
@ -657,6 +657,21 @@ handle_rpl_parallel_thread(void *arg)
|
||||
}
|
||||
});
|
||||
|
||||
if(unlikely(thd->wait_for_commit_ptr) && group_rgi != NULL)
|
||||
{
|
||||
/*
|
||||
This indicates that we get a new GTID event in the middle of
|
||||
a not completed event group. This is corrupt binlog (the master
|
||||
will never write such binlog), so it does not happen unless
|
||||
someone tries to inject wrong crafted binlog, but let us still
|
||||
try to handle it somewhat nicely.
|
||||
*/
|
||||
group_rgi->cleanup_context(thd, true);
|
||||
finish_event_group(rpt, group_rgi->gtid_sub_id,
|
||||
group_rgi->parallel_entry, group_rgi);
|
||||
rpt->loc_free_rgi(group_rgi);
|
||||
}
|
||||
|
||||
in_event_group= true;
|
||||
/*
|
||||
If the standalone flag is set, then this event group consists of a
|
||||
@ -742,19 +757,6 @@ handle_rpl_parallel_thread(void *arg)
|
||||
unlock_or_exit_cond(thd, &entry->LOCK_parallel_entry,
|
||||
&did_enter_cond, &old_stage);
|
||||
|
||||
if(thd->wait_for_commit_ptr)
|
||||
{
|
||||
/*
|
||||
This indicates that we get a new GTID event in the middle of
|
||||
a not completed event group. This is corrupt binlog (the master
|
||||
will never write such binlog), so it does not happen unless
|
||||
someone tries to inject wrong crafted binlog, but let us still
|
||||
try to handle it somewhat nicely.
|
||||
*/
|
||||
rgi->cleanup_context(thd, true);
|
||||
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
|
||||
thd->wait_for_commit_ptr->wakeup_subsequent_commits(rgi->worker_error);
|
||||
}
|
||||
thd->wait_for_commit_ptr= &rgi->commit_orderer;
|
||||
|
||||
if (opt_gtid_ignore_duplicates)
|
||||
@ -780,6 +782,7 @@ handle_rpl_parallel_thread(void *arg)
|
||||
}
|
||||
}
|
||||
|
||||
group_rgi= rgi;
|
||||
group_ending= is_group_ending(qev->ev, event_type);
|
||||
if (group_ending && likely(!rgi->worker_error))
|
||||
{
|
||||
|
12
sql/slave.cc
12
sql/slave.cc
@ -5648,6 +5648,18 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
|
||||
}
|
||||
break;
|
||||
|
||||
#ifndef DBUG_OFF
|
||||
case XID_EVENT:
|
||||
DBUG_EXECUTE_IF("slave_discard_xid_for_gtid_0_x_1000",
|
||||
{
|
||||
/* Inject an event group that is missing its XID commit event. */
|
||||
if (mi->last_queued_gtid.domain_id == 0 &&
|
||||
mi->last_queued_gtid.seq_no == 1000)
|
||||
goto skip_relay_logging;
|
||||
});
|
||||
/* Fall through to default case ... */
|
||||
#endif
|
||||
|
||||
default:
|
||||
default_action:
|
||||
if (mi->using_gtid != Master_info::USE_GTID_NO && mi->gtid_event_seen)
|
||||
|
Loading…
x
Reference in New Issue
Block a user