From f219fb84893e9837ce09a8811929650347aea4f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 7 Nov 2024 09:04:16 +0200 Subject: [PATCH] MDEV-35355 : Galera test failure on galera_sr.mysql-wsrep-features#165 Problem was that in DeadlockChecker::trx_rollback() we hold lock_sys before we call wsrep_handle_SR_rollback() where THD::LOCK_thd_data (and some cases THD::LOCK_thd_kill) are acquired. This is against current mutex ordering rules. However, acquiring THD::LOCK_thd_data is not necessary because we always are in victim_thd context, either client session is rolling back or rollbacker thread should be in control. Therefore, we should always use wsrep_thd_self_abort() and then no additional mutexes are required. Fixed by removing locking of THD::LOCK_thd_data and using only wsrep_thd_self_abort(). In debug builds added assertions to verify that we are always in victim_thd context. This fix is for MariaDB 10.5 and we already have a test case that sporadically fail in Jenkins before this fix. Signed-off-by: Julius Goryavsky --- sql/service_wsrep.cc | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/sql/service_wsrep.cc b/sql/service_wsrep.cc index c131e3c7fb5..6aab1bec99b 100644 --- a/sql/service_wsrep.cc +++ b/sql/service_wsrep.cc @@ -186,42 +186,29 @@ extern "C" my_bool wsrep_thd_is_SR(const THD *thd) return thd && thd->wsrep_cs().transaction().is_streaming(); } -extern "C" void wsrep_handle_SR_rollback(THD *bf_thd, +extern "C" void wsrep_handle_SR_rollback(THD *bf_thd __attribute__((unused)), THD *victim_thd) { + /* + We should always be in victim_thd context, either client session is + rolling back or rollbacker thread should be in control. + */ DBUG_ASSERT(victim_thd); + DBUG_ASSERT(current_thd == victim_thd); DBUG_ASSERT(wsrep_thd_is_SR(victim_thd)); - if (!victim_thd || !wsrep_on(bf_thd)) return; - wsrep_thd_LOCK(victim_thd); + /* Defensive measure to avoid crash in production. */ + if (!victim_thd) return; - WSREP_DEBUG("handle rollback, for deadlock: thd %llu trx_id %" PRIu64 " frags %zu conf %s", + WSREP_DEBUG("Handle SR rollback, for deadlock: thd %llu trx_id %" PRIu64 " frags %zu conf %s", victim_thd->thread_id, victim_thd->wsrep_trx_id(), victim_thd->wsrep_sr().fragments_certified(), wsrep_thd_transaction_state_str(victim_thd)); - /* Note: do not store/reset globals before wsrep_bf_abort() call - to avoid losing BF thd context. */ - if (!(bf_thd && bf_thd != victim_thd)) - { - DEBUG_SYNC(victim_thd, "wsrep_before_SR_rollback"); - } - if (bf_thd) - { - wsrep_bf_abort(bf_thd, victim_thd); - } - else - { - wsrep_thd_self_abort(victim_thd); - } + DEBUG_SYNC(victim_thd, "wsrep_before_SR_rollback"); - wsrep_thd_UNLOCK(victim_thd); - - if (bf_thd) - { - wsrep_store_threadvars(bf_thd); - } + wsrep_thd_self_abort(victim_thd); } extern "C" my_bool wsrep_thd_bf_abort(THD *bf_thd, THD *victim_thd,