From b54e7b0cea5dffe57d71b4ec7486ecce700086df Mon Sep 17 00:00:00 2001 From: Vlad Lesin Date: Fri, 12 May 2023 12:11:53 +0300 Subject: [PATCH] MDEV-31185 rw_trx_hash_t::find() unpins pins too early MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rw_trx_hash_t::find() acquires element->mutex, then unpins pins, used for lf_hash element search. After that the "element" can be deallocated and reused by some other thread. If we take a look rw_trx_hash_t::insert()->lf_hash_insert()->lf_alloc_new() calls, we will not find any element->mutex acquisition, as it was not initialized yet before it's allocation. rw_trx_hash_t::insert() can reuse the chunk, unpinned in rw_trx_hash_t::find(). The scenario is the following: 1. Thread 1 have just executed lf_hash_search() in rw_trx_hash_t::find(), but have not acquired element->mutex yet. 2. Thread 2 have removed the element from hash table with rw_trx_hash_t::erase() call. 3. Thread 1 acquired element->mutex and unpinned pin 2 pin with lf_hash_search_unpin(pins) call. 4. Some thread purged memory of the element. 5. Thread 3 reused the memory for the element, filled element->id, element->trx. 6. Thread 1 crashes with failed "DBUG_ASSERT(trx_id == trx->id)" assertion. Note that trx_t objects are also reused, see the code around trx_pools for details. The fix is to invoke "lf_hash_search_unpin(pins);" after element->trx is stored in local variable in rw_trx_hash_t::find(). Reviewed by: Nikita Malyavin, Marko Mäkelä. --- .../r/trx_sys_t_find_lf_hash_error.result | 25 ++++++ .../t/trx_sys_t_find_lf_hash_error.test | 87 +++++++++++++++++++ mysys/lf_alloc-pin.c | 3 + sql/table_cache.cc | 4 + sql/xa.cc | 5 ++ storage/innobase/include/trx0sys.h | 21 ++++- 6 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 mysql-test/suite/innodb/r/trx_sys_t_find_lf_hash_error.result create mode 100644 mysql-test/suite/innodb/t/trx_sys_t_find_lf_hash_error.test diff --git a/mysql-test/suite/innodb/r/trx_sys_t_find_lf_hash_error.result b/mysql-test/suite/innodb/r/trx_sys_t_find_lf_hash_error.result new file mode 100644 index 00000000000..11d3fb4a7a3 --- /dev/null +++ b/mysql-test/suite/innodb/r/trx_sys_t_find_lf_hash_error.result @@ -0,0 +1,25 @@ +create table t1 (a int) engine=innodb STATS_PERSISTENT=0; +create table t2 (a int) engine=innodb STATS_PERSISTENT=0; +BEGIN; +insert into t1 values(1); +connect con_1, localhost, root,,; +SET DEBUG_SYNC="before_trx_hash_find_element_mutex_enter SIGNAL before_mutex_enter WAIT_FOR cont1"; +SET DEBUG_SYNC="after_trx_hash_find_element_mutex_enter SIGNAL after_mutex_enter WAIT_FOR cont2"; +SELECT * FROM t1 WHERE a = 1 FOR UPDATE; +connection default; +SET DEBUG_SYNC="now WAIT_FOR before_mutex_enter"; +COMMIT; +SET DEBUG_SYNC="now SIGNAL cont1"; +SET DEBUG_SYNC="now WAIT_FOR after_mutex_enter"; +insert into t2 values(1); +BEGIN; +INSERT INTO t2 VALUES(2); +SET DEBUG_SYNC="now SIGNAL cont2"; +connection con_1; +a +1 +disconnect con_1; +connection default; +DROP TABLE t1; +DROP TABLE t2; +SET DEBUG_SYNC="reset"; diff --git a/mysql-test/suite/innodb/t/trx_sys_t_find_lf_hash_error.test b/mysql-test/suite/innodb/t/trx_sys_t_find_lf_hash_error.test new file mode 100644 index 00000000000..833919f9ad6 --- /dev/null +++ b/mysql-test/suite/innodb/t/trx_sys_t_find_lf_hash_error.test @@ -0,0 +1,87 @@ +--source include/have_innodb.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/count_sessions.inc + +create table t1 (a int) engine=innodb STATS_PERSISTENT=0; +create table t2 (a int) engine=innodb STATS_PERSISTENT=0; + +BEGIN; # trx1 +# register rw-transaction in trx_sys.rw_trx_hash +insert into t1 values(1); + +--connect (con_1, localhost, root,,) +SET DEBUG_SYNC="before_trx_hash_find_element_mutex_enter SIGNAL before_mutex_enter WAIT_FOR cont1"; +SET DEBUG_SYNC="after_trx_hash_find_element_mutex_enter SIGNAL after_mutex_enter WAIT_FOR cont2"; + +# trx2 is converting implicit lock of trx1 to explicit one, it's invoking +# ▾ l_search +# ▾ lf_hash_search_using_hash_value +# ▾ lf_hash_search +# ▾ rw_trx_hash_t::find +# ▾ trx_sys_t::find +# ▾ lock_rec_convert_impl_to_expl +# rw_trx_hash_t::find returns lf_hash element, pin 2 is pinned, +# but element->mutex has not been acquired yet, what allows trx1 element to be +# removed from trx_sys.rw_trx_hash at one hand, and at the other hand, the +# content of the element is still valid as it's pinned. +# +# trx2 +--send SELECT * FROM t1 WHERE a = 1 FOR UPDATE +--connection default +SET DEBUG_SYNC="now WAIT_FOR before_mutex_enter"; +--disable_query_log +SET @saved_dbug = @@debug_dbug; + +# Usually pinbox purgatory is purged either when the number of elements in +# purgatory is greater then some limit(see lf_pinbox_free()), or when thread +# invokes rw_trx_hash_t::put_pins() explicitly. For this test the first +# variant was choosen. The following option makes lf_pinbox_free() to purge +# pinbox purgatory on each call, ignoring pins->purgatory_count. +SET DEBUG_DBUG='+d,unconditional_pinbox_free'; +--enable_query_log + +# trx1 is committed and removed from trx_sys.rw_trx_hash. It can be done as +# trx2 has not been acquired element->mutex yet. +COMMIT; +--disable_query_log +SET DEBUG_DBUG = @saved_dbug; +--enable_query_log + +# Let trx2 to acquire element->mutex and unpin pin 2 +SET DEBUG_SYNC="now SIGNAL cont1"; +SET DEBUG_SYNC="now WAIT_FOR after_mutex_enter"; + +--disable_query_log +SET @saved_dbug = @@debug_dbug; +SET DEBUG_DBUG='+d,unconditional_pinbox_free'; +--enable_query_log +# trx3 commits and invokes lf_pinbox_free(), which purges pin 2 of trx2 and +# places its pointer on trx_sys.rw_trx_hash.hash.alloc.top. +insert into t2 values(1); +--disable_query_log +SET DEBUG_DBUG = @saved_dbug; +--enable_query_log + +BEGIN; # trx4 +# trx_sys.rw_trx_hash.hash.alloc.top points to "freed" trx2 lf_hash element, +# lf_alloc_new() gets the pointer from trx_sys.rw_trx_hash.hash.alloc.top, +# so the memory for lf_hash element will be reused for trx4 if MDEV-31185 is +# not fixed +INSERT INTO t2 VALUES(2); + +# let trx2 to invoke DBUG_ASSERT(trx_id == trx->id) and crash if MDEV-31185 +# is not fixed +SET DEBUG_SYNC="now SIGNAL cont2"; + +--connection con_1 +# trx 2 assertion failure if MDEV-31185 is not fixed +--reap +--disconnect con_1 +--connection default +DROP TABLE t1; +DROP TABLE t2; + +SET DEBUG_SYNC="reset"; + +--source include/wait_until_count_sessions.inc diff --git a/mysys/lf_alloc-pin.c b/mysys/lf_alloc-pin.c index f844920a664..6d80b381e5e 100644 --- a/mysys/lf_alloc-pin.c +++ b/mysys/lf_alloc-pin.c @@ -270,6 +270,9 @@ void lf_pinbox_free(LF_PINS *pins, void *addr) add_to_purgatory(pins, addr); if (pins->purgatory_count % LF_PURGATORY_SIZE == 0) lf_pinbox_real_free(pins); + DBUG_EXECUTE_IF("unconditional_pinbox_free", + if (pins->purgatory_count % LF_PURGATORY_SIZE) + lf_pinbox_real_free(pins);); } struct st_harvester { diff --git a/sql/table_cache.cc b/sql/table_cache.cc index 15255c56083..9b28e7b7b0f 100644 --- a/sql/table_cache.cc +++ b/sql/table_cache.cc @@ -824,6 +824,10 @@ retry: element= (TDC_element*) lf_hash_search_using_hash_value(&tdc_hash, thd->tdc_hash_pins, hash_value, (uchar*) key, key_length); + /* It's safe to unpin the pins here, because an empty element was inserted + above, "empty" means at least element->share = 0. Some other thread can't + delete it while element->share == 0. And element->share is also protected + with element->LOCK_table_share mutex. */ lf_hash_search_unpin(thd->tdc_hash_pins); DBUG_ASSERT(element); diff --git a/sql/xa.cc b/sql/xa.cc index 85fbe9358a3..7b5296f20f9 100644 --- a/sql/xa.cc +++ b/sql/xa.cc @@ -246,9 +246,14 @@ static XID_cache_element *xid_cache_search(THD *thd, XID *xid) xid->key(), xid->key_length()); if (element) { + /* The element can be removed from lf_hash by other thread, but + element->acquire_recovered() will return false in this case. */ if (!element->acquire_recovered()) element= 0; lf_hash_search_unpin(thd->xid_hash_pins); + /* Once the element is acquired (i.e. got the ACQUIRED bit) by this thread, + only this thread can delete it. The deletion happens in xid_cache_delete(). + See also the XID_cache_element documentation. */ DEBUG_SYNC(thd, "xa_after_search"); } return element; diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 246af942419..0f12cd90495 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -632,9 +632,21 @@ public: sizeof(trx_id_t))); if (element) { + /* rw_trx_hash_t::erase() sets element->trx to nullptr under + element->mutex protection before removing the element from hash table. + If the element was removed before the mutex acquisition, element->trx + will be equal to nullptr. */ + DEBUG_SYNC_C("before_trx_hash_find_element_mutex_enter"); mutex_enter(&element->mutex); + /* element_trx can't point to reused object now. If transaction was + deregistered before element->mutex acquisition, element->trx is nullptr. + It can't be deregistered while element->mutex is held. */ + trx_t *element_trx = element->trx; lf_hash_search_unpin(pins); - if ((trx= element->trx)) { + /* The *element can be reused now, as element->trx value is stored + locally in element_trx. */ + DEBUG_SYNC_C("after_trx_hash_find_element_mutex_enter"); + if ((trx= element_trx)) { DBUG_ASSERT(trx_id == trx->id); ut_d(validate_element(trx)); if (do_ref_count) @@ -650,12 +662,19 @@ public: trx_mutex_enter(trx); const trx_state_t state= trx->state; trx_mutex_exit(trx); + /* trx_t::commit_in_memory() sets the state to + TRX_STATE_COMMITTED_IN_MEMORY before deregistering the transaction. + It also waits for any implicit-to-explicit lock conversions to cease + after deregistering. */ if (state == TRX_STATE_COMMITTED_IN_MEMORY) trx= NULL; else trx->reference(); } } + /* element's lifetime is equal to the hash lifetime, that's why + element->mutex is valid here despite the element is unpinned. In the + worst case some thread will wait for element->mutex releasing. */ mutex_exit(&element->mutex); } if (!caller_trx)