MDEV-31448: Killing a replica thread awaiting its GCO can hang/crash a parallel replica
The problem was an incorrect unmark_start_commit() in signal_error_to_sql_driver_thread(). If an event group gets an error, this unmark could run after the following GCO started, and the subsequent re-marking could access a de-allocated GCO.

The offending unmark_start_commit() looks obviously incorrect, and the fix is to just remove it. It was introduced in the MDEV-8302 patch, the commit message of which suggests it was added there solely to satisfy an assertion in ha_rollback_trans(). So update this assertion instead to not trigger for event groups that experienced an error (rgi->worker_error). When an error occurs in an event group, all following event groups are skipped anyway, so the unmark should never be needed in this case.

Reviewed-by: Andrei Elkin <andrei.elkin@mariadb.com>
Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
This commit is contained in:
parent
60bec1d54d
commit
a8ea6627a4
@@ -1898,13 +1898,22 @@ int ha_rollback_trans(THD *thd, bool all)
|
|||||||
attempt. Otherwise those following transactions can run too early, and
|
attempt. Otherwise those following transactions can run too early, and
|
||||||
possibly cause replication to fail. See comments in retry_event_group().
|
possibly cause replication to fail. See comments in retry_event_group().
|
||||||
|
|
||||||
|
(This concerns rollbacks due to temporary errors where the transaction
|
||||||
|
will be retried afterwards. For non-recoverable errors, following
|
||||||
|
transactions will not start but just be skipped as the worker threads
|
||||||
|
perform the error stop).
|
||||||
|
|
||||||
There were several bugs with this in the past that were very hard to
|
There were several bugs with this in the past that were very hard to
|
||||||
track down (MDEV-7458, MDEV-8302). So we add here an assertion for
|
track down (MDEV-7458, MDEV-8302). So we add here an assertion for
|
||||||
rollback without signalling following transactions. And in release
|
rollback without signalling following transactions. And in release
|
||||||
builds, we explicitly do the signalling before rolling back.
|
builds, we explicitly do the signalling before rolling back.
|
||||||
*/
|
*/
|
||||||
DBUG_ASSERT(!(thd->rgi_slave && thd->rgi_slave->did_mark_start_commit));
|
DBUG_ASSERT( !(thd->rgi_slave &&
|
||||||
if (thd->rgi_slave && thd->rgi_slave->did_mark_start_commit)
|
!thd->rgi_slave->worker_error &&
|
||||||
|
thd->rgi_slave->did_mark_start_commit));
|
||||||
|
if (thd->rgi_slave &&
|
||||||
|
!thd->rgi_slave->worker_error &&
|
||||||
|
thd->rgi_slave->did_mark_start_commit)
|
||||||
thd->rgi_slave->unmark_start_commit();
|
thd->rgi_slave->unmark_start_commit();
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@@ -286,16 +286,11 @@ static void
|
|||||||
signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err)
|
signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err)
|
||||||
{
|
{
|
||||||
rgi->worker_error= err;
|
rgi->worker_error= err;
|
||||||
/*
|
|
||||||
In case we get an error during commit, inform following transactions that
|
|
||||||
we aborted our commit.
|
|
||||||
*/
|
|
||||||
DBUG_EXECUTE_IF("hold_worker2_favor_worker3", {
|
DBUG_EXECUTE_IF("hold_worker2_favor_worker3", {
|
||||||
if (rgi->current_gtid.seq_no == 2002) {
|
if (rgi->current_gtid.seq_no == 2002) {
|
||||||
debug_sync_set_action(thd, STRING_WITH_LEN("now WAIT_FOR cont_worker2"));
|
debug_sync_set_action(thd, STRING_WITH_LEN("now WAIT_FOR cont_worker2"));
|
||||||
}});
|
}});
|
||||||
|
|
||||||
rgi->unmark_start_commit();
|
|
||||||
rgi->cleanup_context(thd, true);
|
rgi->cleanup_context(thd, true);
|
||||||
rgi->rli->abort_slave= true;
|
rgi->rli->abort_slave= true;
|
||||||
rgi->rli->stop_for_until= false;
|
rgi->rli->stop_for_until= false;
|
||||||
|
@@ -91,6 +91,10 @@ struct group_commit_orderer {
|
|||||||
};
|
};
|
||||||
uint8 flags;
|
uint8 flags;
|
||||||
#ifndef DBUG_OFF
|
#ifndef DBUG_OFF
|
||||||
|
/*
|
||||||
|
Flag set when the GCO has been freed and entered the free list, to catch
|
||||||
|
(in debug) errors in the complex lifetime of this object.
|
||||||
|
*/
|
||||||
bool gc_done;
|
bool gc_done;
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
Loading…
x
Reference in New Issue
Block a user