MDEV-25114: Crash: WSREP: invalid state ROLLED_BACK (FATAL)

Revert "MDEV-23328 Server hang due to Galera lock conflict resolution"

This reverts commit eac8341df4c3c7b98360f4e9498acf393dc055e3.
This commit is contained in:
Jan Lindström 2021-10-22 09:50:11 +03:00
parent e10838268e
commit aa7ca987db
11 changed files with 102 additions and 191 deletions

View File

@ -84,7 +84,7 @@ SET DEBUG_SYNC = "now SIGNAL continue_kill";
--reap
--connection node_2a
--error 0,1213,2013
--error 0,1213
select * from t1;
--connection node_2

View File

@ -8985,20 +8985,8 @@ err:
}
#endif /* MYSQL_CLIENT */
#if defined(HAVE_REPLICATION) && !defined(MYSQL_CLIENT)
static bool wsrep_must_replay(THD *thd)
{
#ifdef WITH_WSREP
mysql_mutex_lock(&thd->LOCK_thd_data);
bool res= WSREP(thd) && thd->wsrep_trx().state() == wsrep::transaction::s_must_replay;
mysql_mutex_unlock(&thd->LOCK_thd_data);
return res;
#else
return false;
#endif
}
int Xid_log_event::do_apply_event(rpl_group_info *rgi)
{
bool res;
@ -9063,8 +9051,14 @@ int Xid_log_event::do_apply_event(rpl_group_info *rgi)
res= trans_commit(thd); /* Automatically rolls back on error. */
thd->release_transactional_locks();
if (sub_id && (!res || wsrep_must_replay(thd)))
mysql_mutex_lock(&thd->LOCK_thd_data);
#ifdef WITH_WSREP
if (sub_id && (!res || (WSREP(thd) && thd->wsrep_trx().state() == wsrep::transaction::s_must_replay)))
#else
if (sub_id && !res)
#endif /* WITH_WSREP */
rpl_global_gtid_slave_state->update_state_hash(sub_id, &gtid, hton, rgi);
mysql_mutex_unlock(&thd->LOCK_thd_data);
/*
Increment the global status commit count variable
*/

View File

@ -218,8 +218,16 @@ extern "C" void wsrep_handle_SR_rollback(THD *bf_thd,
extern "C" my_bool wsrep_thd_bf_abort(THD *bf_thd, THD *victim_thd,
my_bool signal)
{
mysql_mutex_assert_owner(&victim_thd->LOCK_thd_kill);
mysql_mutex_assert_not_owner(&victim_thd->LOCK_thd_data);
DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort",
{
const char act[]=
"now "
"SIGNAL sync.before_wsrep_thd_abort_reached "
"WAIT_FOR signal.before_wsrep_thd_abort";
DBUG_ASSERT(!debug_sync_set_action(bf_thd,
STRING_WITH_LEN(act)));
};);
my_bool ret= wsrep_bf_abort(bf_thd, victim_thd);
/*
Send awake signal if victim was BF aborted or does not
@ -228,6 +236,8 @@ extern "C" my_bool wsrep_thd_bf_abort(THD *bf_thd, THD *victim_thd,
*/
if ((ret || !wsrep_on(victim_thd)) && signal)
{
mysql_mutex_assert_not_owner(&victim_thd->LOCK_thd_data);
mysql_mutex_assert_not_owner(&victim_thd->LOCK_thd_kill);
mysql_mutex_lock(&victim_thd->LOCK_thd_data);
if (victim_thd->wsrep_aborter && victim_thd->wsrep_aborter != bf_thd->thread_id)
@ -238,8 +248,10 @@ extern "C" my_bool wsrep_thd_bf_abort(THD *bf_thd, THD *victim_thd,
return false;
}
mysql_mutex_lock(&victim_thd->LOCK_thd_kill);
victim_thd->wsrep_aborter= bf_thd->thread_id;
victim_thd->awake_no_mutex(KILL_QUERY);
mysql_mutex_unlock(&victim_thd->LOCK_thd_kill);
mysql_mutex_unlock(&victim_thd->LOCK_thd_data);
} else {
WSREP_DEBUG("wsrep_thd_bf_abort skipped awake");

View File

@ -1069,8 +1069,8 @@ terminate_slave_thread(THD *thd,
int error __attribute__((unused));
DBUG_PRINT("loop", ("killing slave thread"));
mysql_mutex_lock(&thd->LOCK_thd_kill);
mysql_mutex_lock(&thd->LOCK_thd_data);
mysql_mutex_lock(&thd->LOCK_thd_kill);
#ifndef DONT_USE_THR_ALARM
/*
Error codes from pthread_kill are:

View File

@ -788,7 +788,6 @@ THD::THD(my_thread_id id, bool is_wsrep_applier)
mysql_mutex_init(key_LOCK_wakeup_ready, &LOCK_wakeup_ready, MY_MUTEX_INIT_FAST);
mysql_mutex_init(key_LOCK_thd_kill, &LOCK_thd_kill, MY_MUTEX_INIT_FAST);
mysql_cond_init(key_COND_wakeup_ready, &COND_wakeup_ready, 0);
mysql_mutex_record_order(&LOCK_thd_kill, &LOCK_thd_data);
/* Variables with default values */
proc_info="login";
@ -5108,13 +5107,11 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)
#ifdef WITH_WSREP
/* wsrep applier, replayer and TOI processing threads are ordered
by replication provider, relaxed GAP locking protocol can be used
between high priority wsrep threads.
Note that wsrep_thd_is_BF() doesn't take LOCK_thd_data for either thd,
the caller should guarantee that the BF state won't change.
(e.g. InnoDB does it by keeping lock_sys.mutex locked)
between high priority wsrep threads
*/
if (WSREP_ON && wsrep_thd_is_BF(thd, false) &&
wsrep_thd_is_BF(other_thd, false))
if (WSREP_ON &&
wsrep_thd_is_BF(const_cast<THD *>(thd), false) &&
wsrep_thd_is_BF(const_cast<THD *>(other_thd), true))
return 0;
#endif /* WITH_WSREP */
rgi= thd->rgi_slave;

View File

@ -3327,11 +3327,11 @@ public:
void awake_no_mutex(killed_state state_to_set);
void awake(killed_state state_to_set)
{
mysql_mutex_lock(&LOCK_thd_kill);
mysql_mutex_lock(&LOCK_thd_data);
mysql_mutex_lock(&LOCK_thd_kill);
awake_no_mutex(state_to_set);
mysql_mutex_unlock(&LOCK_thd_data);
mysql_mutex_unlock(&LOCK_thd_kill);
mysql_mutex_unlock(&LOCK_thd_data);
}
void abort_current_cond_wait(bool force);

View File

@ -9144,6 +9144,7 @@ THD *find_thread_by_id(longlong id, bool query_id)
return arg.thd;
}
/**
kill one thread.
@ -9267,8 +9268,8 @@ static my_bool kill_threads_callback(THD *thd, kill_threads_callback_arg *arg)
return 1;
if (!arg->threads_to_kill.push_back(thd, arg->thd->mem_root))
{
mysql_mutex_lock(&thd->LOCK_thd_kill); // Lock from delete
mysql_mutex_lock(&thd->LOCK_thd_data);
mysql_mutex_lock(&thd->LOCK_thd_kill); // Lock from delete
}
}
}

View File

@ -3497,8 +3497,8 @@ static my_bool kill_callback(THD *thd, kill_callback_arg *arg)
thd->variables.server_id == arg->slave_server_id)
{
arg->thd= thd;
mysql_mutex_lock(&thd->LOCK_thd_kill); // Lock from delete
mysql_mutex_lock(&thd->LOCK_thd_data);
mysql_mutex_lock(&thd->LOCK_thd_kill); // Lock from delete
return 1;
}
return 0;

View File

@ -69,13 +69,20 @@ bool Wsrep_client_service::interrupted(
wsrep::unique_lock<wsrep::mutex>& lock WSREP_UNUSED) const
{
DBUG_ASSERT(m_thd == current_thd);
/* Underlying mutex in lock object points to LOCK_thd_data, which
protects m_thd->wsrep_trx(), LOCK_thd_kill protects m_thd->killed.
Locking order is:
1) LOCK_thd_data
2) LOCK_thd_kill */
mysql_mutex_assert_owner(static_cast<mysql_mutex_t*>(lock.mutex()->native()));
mysql_mutex_lock(&m_thd->LOCK_thd_kill);
bool ret= (m_thd->killed != NOT_KILLED);
if (ret)
{
WSREP_DEBUG("wsrep state is interrupted, THD::killed %d trx state %d",
m_thd->killed, m_thd->wsrep_trx().state());
}
mysql_mutex_unlock(&m_thd->LOCK_thd_kill);
return ret;
}

View File

@ -41,7 +41,6 @@ static void init_service_thd(THD* thd, char* thread_stack)
thd->prior_thr_create_utime= thd->start_utime= microsecond_interval_timer();
thd->set_command(COM_SLEEP);
thd->reset_for_next_command(true);
server_threads.insert(thd); // as wsrep_innobase_kill_one_trx() uses find_thread_by_id()
}
Wsrep_storage_service*
@ -81,7 +80,6 @@ void Wsrep_server_service::release_storage_service(
static_cast<Wsrep_storage_service*>(storage_service);
THD* thd= ss->m_thd;
wsrep_reset_threadvars(thd);
server_threads.erase(thd);
delete ss;
delete thd;
}
@ -95,8 +93,7 @@ wsrep_create_streaming_applier(THD *orig_thd, const char *ctx)
streaming transaction is BF aborted and streaming applier
is created from BF aborter context. */
Wsrep_threadvars saved_threadvars(wsrep_save_threadvars());
if (saved_threadvars.cur_thd)
wsrep_reset_threadvars(saved_threadvars.cur_thd);
wsrep_reset_threadvars(saved_threadvars.cur_thd);
THD *thd= 0;
Wsrep_applier_service *ret= 0;
if (!wsrep_create_threadvars() &&
@ -113,8 +110,7 @@ wsrep_create_streaming_applier(THD *orig_thd, const char *ctx)
}
/* Restore original thread local storage state before returning. */
wsrep_restore_threadvars(saved_threadvars);
if (saved_threadvars.cur_thd)
wsrep_store_threadvars(saved_threadvars.cur_thd);
wsrep_store_threadvars(saved_threadvars.cur_thd);
return ret;
}
@ -143,7 +139,6 @@ void Wsrep_server_service::release_high_priority_service(wsrep::high_priority_se
THD* thd= hps->m_thd;
delete hps;
wsrep_store_threadvars(thd);
server_threads.erase(thd);
delete thd;
wsrep_delete_threadvars();
}

View File

@ -62,7 +62,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include <my_service_manager.h>
#include <key.h>
#include <sql_manager.h>
/* Include necessary InnoDB headers */
#include "btr0btr.h"
@ -5204,8 +5203,6 @@ static void innobase_kill_query(handlerton*, THD *thd, enum thd_kill_levels)
if (lock_t *lock= trx->lock.wait_lock)
{
trx_mutex_enter(trx);
if (trx->is_wsrep() && wsrep_thd_is_aborting(thd))
trx->lock.was_chosen_as_deadlock_victim= TRUE;
lock_cancel_waiting_and_release(lock);
trx_mutex_exit(trx);
}
@ -18607,67 +18604,62 @@ static struct st_mysql_storage_engine innobase_storage_engine=
#ifdef WITH_WSREP
struct bg_wsrep_kill_trx_arg {
my_thread_id thd_id, bf_thd_id;
trx_id_t trx_id, bf_trx_id;
bool signal;
};
/** This function is used to kill one transaction.
/** Kill one transaction from a background manager thread
This transaction was open on this node (not-yet-committed), and a
conflicting writeset from some other node that was being applied
caused a locking conflict. First committed (from other node)
wins, thus open transaction is rolled back. BF stands for
brute-force: any transaction can get aborted by galera any time
it is necessary.
wsrep_innobase_kill_one_trx() is invoked when lock_sys.mutex and trx mutex
are taken, wsrep_thd_bf_abort() cannot be used there as it takes THD mutexes
that must be taken before lock_sys.mutex and trx mutex. That's why
wsrep_innobase_kill_one_trx only posts the killing task to the manager thread
and the actual killing happens asynchronously here.
This conflict can happen only when the replicated writeset (from
other node) is being applied, not when its waiting in the queue.
If our local transaction reached its COMMIT and this conflicting
writeset was in the queue, then it should fail the local
certification test instead.
As no mutexes were held we don't know whether THD or trx pointers are still
valid, so we need to pass thread/trx ids and perform a lookup.
*/
static void bg_wsrep_kill_trx(void *void_arg)
A brute force abort is only triggered by a locking conflict
between a writeset being applied by an applier thread (slave thread)
and an open transaction on the node, not by a Galera writeset
comparison as in the local certification failure.
@param[in] bf_thd Brute force (BF) thread
@param[in,out] victim_trx Vimtim trx to be killed
@param[in] signal Should victim be signaled */
UNIV_INTERN
void
wsrep_innobase_kill_one_trx(
THD* bf_thd,
trx_t *victim_trx,
bool signal)
{
bg_wsrep_kill_trx_arg *arg= (bg_wsrep_kill_trx_arg *)void_arg;
THD *thd, *bf_thd;
trx_t *victim_trx;
bool aborting= false;
ut_ad(bf_thd);
ut_ad(victim_trx);
ut_ad(lock_mutex_own());
ut_ad(trx_mutex_own(victim_trx));
if ((bf_thd= find_thread_by_id(arg->bf_thd_id)))
wsrep_thd_LOCK(bf_thd);
if ((thd= find_thread_by_id(arg->thd_id)))
wsrep_thd_LOCK(thd);
if (!thd || !bf_thd || !(victim_trx= thd_to_trx(thd)))
goto ret0;
lock_mutex_enter();
trx_mutex_enter(victim_trx);
if (victim_trx->id != arg->trx_id
|| victim_trx->state == TRX_STATE_COMMITTED_IN_MEMORY)
{
/* Apparently victim trx was meanwhile rolled back or
committed. Tell bf thd not to wait, in case it already
started to. */
trx_t *trx= thd_to_trx(bf_thd);
if (!trx) {
/* bf_thd might not be associated with a
transaction, in case of MDL conflict */
} else if (lock_t *lock = trx->lock.wait_lock) {
trx_mutex_enter(trx);
lock_cancel_waiting_and_release(lock);
trx_mutex_exit(trx);
}
goto ret1;
}
DBUG_ENTER("wsrep_innobase_kill_one_trx");
THD *thd= (THD *) victim_trx->mysql_thd;
ut_ad(thd);
/* Note that bf_trx might not exist here e.g. on MDL conflict
case (test: galera_concurrent_ctas). Similarly, BF thread
could be also acquiring MDL-lock causing victim to be
aborted. However, we have not yet called innobase_trx_init()
for BF transaction (test: galera_many_columns)*/
trx_t* bf_trx= thd_to_trx(bf_thd);
DBUG_ASSERT(wsrep_on(bf_thd));
wsrep_thd_LOCK(thd);
WSREP_LOG_CONFLICT(bf_thd, thd, TRUE);
WSREP_DEBUG("Aborter %s trx_id: " TRX_ID_FMT " thread: %ld "
"seqno: %lld client_state: %s client_mode: %s transaction_mode: %s "
"query: %s",
wsrep_thd_is_BF(bf_thd, false) ? "BF" : "normal",
arg->bf_trx_id,
bf_trx ? bf_trx->id : TRX_ID_MAX,
thd_get_thread_id(bf_thd),
wsrep_thd_trx_seqno(bf_thd),
wsrep_thd_client_state_str(bf_thd),
@ -18692,84 +18684,28 @@ static void bg_wsrep_kill_trx(void *void_arg)
if (wsrep_thd_set_wsrep_aborter(bf_thd, thd))
{
WSREP_DEBUG("innodb kill transaction skipped due to wsrep_aborter set");
goto ret1;
wsrep_thd_UNLOCK(thd);
DBUG_VOID_RETURN;
}
aborting= true;
/* Note that we need to release this as it will be acquired
below in wsrep-lib */
wsrep_thd_UNLOCK(thd);
DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort");
ret1:
trx_mutex_exit(victim_trx);
lock_mutex_exit();
ret0:
if (thd) {
wsrep_thd_UNLOCK(thd);
if (aborting) {
DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort");
wsrep_thd_bf_abort(bf_thd, thd, arg->signal);
if (wsrep_thd_bf_abort(bf_thd, thd, signal))
{
lock_t* wait_lock = victim_trx->lock.wait_lock;
if (wait_lock) {
DBUG_ASSERT(victim_trx->is_wsrep());
WSREP_DEBUG("victim has wait flag: %lu",
thd_get_thread_id(thd));
WSREP_DEBUG("canceling wait lock");
victim_trx->lock.was_chosen_as_deadlock_victim= TRUE;
lock_cancel_waiting_and_release(wait_lock);
}
wsrep_thd_kill_UNLOCK(thd);
}
if (bf_thd) {
wsrep_thd_UNLOCK(bf_thd);
wsrep_thd_kill_UNLOCK(bf_thd);
}
free(arg);
}
/** This function is used to kill one transaction.
This transaction was open on this node (not-yet-committed), and a
conflicting writeset from some other node that was being applied
caused a locking conflict. First committed (from other node)
wins, thus open transaction is rolled back. BF stands for
brute-force: any transaction can get aborted by galera any time
it is necessary.
This conflict can happen only when the replicated writeset (from
other node) is being applied, not when its waiting in the queue.
If our local transaction reached its COMMIT and this conflicting
writeset was in the queue, then it should fail the local
certification test instead.
A brute force abort is only triggered by a locking conflict
between a writeset being applied by an applier thread (slave thread)
and an open transaction on the node, not by a Galera writeset
comparison as in the local certification failure.
@param[in] bf_thd Brute force (BF) thread
@param[in,out] victim_trx Transaction to be killed
@param[in] signal Should victim be signaled */
void
wsrep_innobase_kill_one_trx(
THD* bf_thd,
trx_t *victim_trx,
bool signal)
{
ut_ad(bf_thd);
ut_ad(victim_trx);
ut_ad(lock_mutex_own());
ut_ad(trx_mutex_own(victim_trx));
DBUG_ENTER("wsrep_innobase_kill_one_trx");
DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort",
{
const char act[]=
"now "
"SIGNAL sync.before_wsrep_thd_abort_reached "
"WAIT_FOR signal.before_wsrep_thd_abort";
DBUG_ASSERT(!debug_sync_set_action(bf_thd,
STRING_WITH_LEN(act)));
};);
trx_t* bf_trx= thd_to_trx(bf_thd);
bg_wsrep_kill_trx_arg *arg = (bg_wsrep_kill_trx_arg*)malloc(sizeof(*arg));
arg->thd_id = thd_get_thread_id(victim_trx->mysql_thd);
arg->trx_id = victim_trx->id;
arg->bf_thd_id = thd_get_thread_id(bf_thd);
arg->bf_trx_id = bf_trx ? bf_trx->id : TRX_ID_MAX;
arg->signal = signal;
mysql_manager_submit(bg_wsrep_kill_trx, arg);
DBUG_VOID_RETURN;
}
@ -18805,44 +18741,13 @@ wsrep_abort_transaction(
if (victim_trx) {
lock_mutex_enter();
trx_mutex_enter(victim_trx);
victim_trx->lock.was_chosen_as_wsrep_victim= true;
wsrep_innobase_kill_one_trx(bf_thd, victim_trx, signal);
trx_mutex_exit(victim_trx);
lock_mutex_exit();
wsrep_thd_kill_LOCK(victim_thd);
wsrep_thd_LOCK(victim_thd);
bool aborting= !wsrep_thd_set_wsrep_aborter(bf_thd, victim_thd);
wsrep_thd_UNLOCK(victim_thd);
if (aborting) {
DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort");
DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort",
{
const char act[]=
"now "
"SIGNAL sync.before_wsrep_thd_abort_reached "
"WAIT_FOR signal.before_wsrep_thd_abort";
DBUG_ASSERT(!debug_sync_set_action(bf_thd,
STRING_WITH_LEN(act)));
};);
wsrep_thd_bf_abort(bf_thd, victim_thd, signal);
}
wsrep_thd_kill_UNLOCK(victim_thd);
wsrep_srv_conc_cancel_wait(victim_trx);
DBUG_VOID_RETURN;
} else {
DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort",
{
const char act[]=
"now "
"SIGNAL sync.before_wsrep_thd_abort_reached "
"WAIT_FOR signal.before_wsrep_thd_abort";
DBUG_ASSERT(!debug_sync_set_action(bf_thd,
STRING_WITH_LEN(act)));
};);
wsrep_thd_kill_LOCK(victim_thd);
wsrep_thd_bf_abort(bf_thd, victim_thd, signal);
wsrep_thd_kill_UNLOCK(victim_thd);
}
DBUG_VOID_RETURN;