From a8ea6627a4d3a099e69a88962b71649da8dadfa4 Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Sun, 11 Jun 2023 17:44:58 +0200 Subject: [PATCH] MDEV-31448: Killing a replica thread awaiting its GCO can hang/crash a parallel replica The problem was an incorrect unmark_start_commit() in signal_error_to_sql_driver_thread(). If an event group gets an error, this unmark could run after the following GCO started, and the subsequent re-marking could access de-allocated GCO. The offending unmark_start_commit() looks obviously incorrect, and the fix is to just remove it. It was introduced in the MDEV-8302 patch, the commit message of which suggests it was added there solely to satisfy an assertion in ha_rollback_trans(). So update this assertion instead to not trigger for event groups that experienced an error (rgi->worker_error). When an error occurs in an event group, all following event groups are skipped anyway, so the unmark should never be needed in this case. Reviewed-by: Andrei Elkin Signed-off-by: Kristian Nielsen --- sql/handler.cc | 13 +++++++++++-- sql/rpl_parallel.cc | 5 ----- sql/rpl_parallel.h | 4 ++++ 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/sql/handler.cc b/sql/handler.cc index f7d781eb82f..48ce7b3f1f8 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -1898,13 +1898,22 @@ int ha_rollback_trans(THD *thd, bool all) attempt. Otherwise those following transactions can run too early, and possibly cause replication to fail. See comments in retry_event_group(). + (This concerns rollbacks due to temporary errors where the transaction + will be retried afterwards. For non-recoverable errors, following + transactions will not start but just be skipped as the worker threads + perform the error stop). + There were several bugs with this in the past that were very hard to track down (MDEV-7458, MDEV-8302). So we add here an assertion for rollback without signalling following transactions. And in release builds, we explicitly do the signalling before rolling back. */ - DBUG_ASSERT(!(thd->rgi_slave && thd->rgi_slave->did_mark_start_commit)); - if (thd->rgi_slave && thd->rgi_slave->did_mark_start_commit) + DBUG_ASSERT( !(thd->rgi_slave && + !thd->rgi_slave->worker_error && + thd->rgi_slave->did_mark_start_commit)); + if (thd->rgi_slave && + !thd->rgi_slave->worker_error && + thd->rgi_slave->did_mark_start_commit) thd->rgi_slave->unmark_start_commit(); } #endif diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 98256118909..1dab1224286 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -286,16 +286,11 @@ static void signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err) { rgi->worker_error= err; - /* - In case we get an error during commit, inform following transactions that - we aborted our commit. - */ DBUG_EXECUTE_IF("hold_worker2_favor_worker3", { if (rgi->current_gtid.seq_no == 2002) { debug_sync_set_action(thd, STRING_WITH_LEN("now WAIT_FOR cont_worker2")); }}); - rgi->unmark_start_commit(); rgi->cleanup_context(thd, true); rgi->rli->abort_slave= true; rgi->rli->stop_for_until= false; diff --git a/sql/rpl_parallel.h b/sql/rpl_parallel.h index b7304d204ee..6b03306692b 100644 --- a/sql/rpl_parallel.h +++ b/sql/rpl_parallel.h @@ -91,6 +91,10 @@ struct group_commit_orderer { }; uint8 flags; #ifndef DBUG_OFF + /* + Flag set when the GCO has been freed and entered the free list, to catch + (in debug) errors in the complex lifetime of this object. + */ bool gc_done; #endif };