From a474e3278c202d5b429134071cedada3d525af95 Mon Sep 17 00:00:00 2001 From: Vlad Lesin Date: Thu, 9 Feb 2023 13:51:57 +0300 Subject: [PATCH 1/4] MDEV-27701 Race on trx->lock.wait_lock between lock_rec_move() and lock_sys_t::cancel() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The initial issue was in assertion failure, which checked the equality of lock to cancel with trx->lock.wait_lock in lock_sys_t::cancel(). If we analyze lock_sys_t::cancel() code from the perspective of trx->lock.wait_lock racing, we won't find the error there, except the cases when we need to reload it after the corresponding latches acquiring. So the fix is just to remove the assertion and reload trx->lock.wait_lock after acquiring necessary latches. Reviewed by: Marko Mäkelä --- .../innodb/r/lock_move_wait_lock_race.result | 34 +++++++++++ .../innodb/t/lock_move_wait_lock_race.test | 58 +++++++++++++++++++ storage/innobase/include/lock0lock.h | 2 +- storage/innobase/lock/lock0lock.cc | 48 ++++++++++++++- 4 files changed, 138 insertions(+), 4 deletions(-) create mode 100644 mysql-test/suite/innodb/r/lock_move_wait_lock_race.result create mode 100644 mysql-test/suite/innodb/t/lock_move_wait_lock_race.test diff --git a/mysql-test/suite/innodb/r/lock_move_wait_lock_race.result b/mysql-test/suite/innodb/r/lock_move_wait_lock_race.result new file mode 100644 index 00000000000..572fbc9b1d1 --- /dev/null +++ b/mysql-test/suite/innodb/r/lock_move_wait_lock_race.result @@ -0,0 +1,34 @@ +CREATE TABLE t (pk int PRIMARY KEY, c varchar(10)) ENGINE=InnoDB; +INSERT INTO t VALUES (10, "0123456789"); +connection default; +BEGIN; +SELECT * FROM t WHERE c = 10 FOR UPDATE; +pk c +connect trx2, localhost,root,,; +BEGIN; +SET DEBUG_SYNC="lock_wait_start SIGNAL trx2_start_waiting"; +SET DEBUG_SYNC="lock_wait_end SIGNAL trx2_wait_end WAIT_FOR trx2_cont_upd"; +SET DEBUG_SYNC="lock_rec_store_on_page_infimum_end SIGNAL trx2_moved_locks WAIT_FOR trx2_cont"; +UPDATE t SET c = NULL WHERE pk = 10; +connect trx3, localhost,root,,; +SET DEBUG_SYNC="now WAIT_FOR trx2_start_waiting"; +SET innodb_lock_wait_timeout=1; +BEGIN; +SET DEBUG_SYNC="lock_wait_start SIGNAL trx3_start_waiting WAIT_FOR trx3_cont_waiting"; +SET DEBUG_SYNC="lock_sys_t_cancel_enter SIGNAL trx3_cancel_enter WAIT_FOR trx3_cont_cancel_waiting"; +UPDATE t SET c = "abcdefghij" WHERE pk = 10; +connection default; +SET DEBUG_SYNC="now WAIT_FOR trx3_start_waiting"; +COMMIT; +SET DEBUG_SYNC="now WAIT_FOR trx2_wait_end"; +SET DEBUG_SYNC="now SIGNAL trx3_cont_waiting"; +SET DEBUG_SYNC="now WAIT_FOR trx3_cancel_enter"; +SET DEBUG_SYNC="now SIGNAL trx2_cont_upd"; +SET DEBUG_SYNC="now WAIT_FOR trx2_moved_locks"; +SET DEBUG_SYNC="now SIGNAL trx3_cont_cancel_waiting"; +SET DEBUG_SYNC="now SIGNAL trx2_cont"; +disconnect trx2; +disconnect trx3; +connection default; +SET DEBUG_SYNC="RESET"; +DROP TABLE t; diff --git a/mysql-test/suite/innodb/t/lock_move_wait_lock_race.test b/mysql-test/suite/innodb/t/lock_move_wait_lock_race.test new file mode 100644 index 00000000000..3a04c7127c8 --- /dev/null +++ b/mysql-test/suite/innodb/t/lock_move_wait_lock_race.test @@ -0,0 +1,58 @@ +--source include/have_innodb.inc +--source include/count_sessions.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc + +CREATE TABLE t (pk int PRIMARY KEY, c varchar(10)) ENGINE=InnoDB; +INSERT INTO t VALUES (10, "0123456789"); + +--connection default +BEGIN; +SELECT * FROM t WHERE c = 10 FOR UPDATE; + +--connect(trx2, localhost,root,,) +BEGIN; +SET DEBUG_SYNC="lock_wait_start SIGNAL trx2_start_waiting"; +SET DEBUG_SYNC="lock_wait_end SIGNAL trx2_wait_end WAIT_FOR trx2_cont_upd"; +SET DEBUG_SYNC="lock_rec_store_on_page_infimum_end SIGNAL trx2_moved_locks WAIT_FOR trx2_cont"; +################# +# We need to update clustered record without changing ordering fields and +# changing the size of non-ordering fields to cause locks moving from deleted +# record to infimum. +### +--send UPDATE t SET c = NULL WHERE pk = 10 + + +--connect(trx3, localhost,root,,) +SET DEBUG_SYNC="now WAIT_FOR trx2_start_waiting"; +################# +# The condition wariable waiting in lock_wait() must be finished by timeout +### +SET innodb_lock_wait_timeout=1; +BEGIN; +SET DEBUG_SYNC="lock_wait_start SIGNAL trx3_start_waiting WAIT_FOR trx3_cont_waiting"; +SET DEBUG_SYNC="lock_sys_t_cancel_enter SIGNAL trx3_cancel_enter WAIT_FOR trx3_cont_cancel_waiting"; +--send UPDATE t SET c = "abcdefghij" WHERE pk = 10 + +--connection default +SET DEBUG_SYNC="now WAIT_FOR trx3_start_waiting"; +COMMIT; +SET DEBUG_SYNC="now WAIT_FOR trx2_wait_end"; +SET DEBUG_SYNC="now SIGNAL trx3_cont_waiting"; +SET DEBUG_SYNC="now WAIT_FOR trx3_cancel_enter"; +SET DEBUG_SYNC="now SIGNAL trx2_cont_upd"; +SET DEBUG_SYNC="now WAIT_FOR trx2_moved_locks"; +################# +# If the bug is not fixed, there will be assertion failure here, because trx2 +# moved trx3 lock from deleted record to infimum when trx3 tried to cancel the +# lock. +### +SET DEBUG_SYNC="now SIGNAL trx3_cont_cancel_waiting"; +SET DEBUG_SYNC="now SIGNAL trx2_cont"; + +--disconnect trx2 +--disconnect trx3 +--connection default +SET DEBUG_SYNC="RESET"; +DROP TABLE t; +--source include/wait_until_count_sessions.inc diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index b67a1011f6b..16acd031177 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -891,8 +891,8 @@ public: /** Cancel a waiting lock request. @tparam check_victim whether to check for DB_DEADLOCK - @param lock waiting lock request @param trx active transaction + @param lock waiting lock request @retval DB_SUCCESS if no lock existed @retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set @retval DB_LOCK_WAIT if the lock was canceled */ diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 9577dfc62aa..2b30b9b1a03 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -46,12 +46,12 @@ Created 5/7/1996 Heikki Tuuri #include "srv0mon.h" #include "que0que.h" #include "scope.h" +#include #include #ifdef WITH_WSREP #include -#include #endif /* WITH_WSREP */ /** The value of innodb_deadlock_detect */ @@ -1882,6 +1882,7 @@ check_trx_error: if (row_lock_wait) lock_sys.wait_resume(trx->mysql_thd, suspend_time, my_hrtime_coarse()); + /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */ if (lock_t *lock= trx->lock.wait_lock) { lock_sys_t::cancel(trx, lock); @@ -1905,6 +1906,12 @@ void lock_wait_end(trx_t *trx) ut_d(const auto state= trx->state); ut_ad(state == TRX_STATE_COMMITTED_IN_MEMORY || state == TRX_STATE_ACTIVE || state == TRX_STATE_PREPARED); + /* lock_wait() checks trx->lock.was_chosen_as_deadlock_victim flag before + requesting lock_sys.wait_mutex, and if the flag is set, it returns error, + what causes transaction rollback, which can reset trx->lock.wait_thr before + deadlock resolution starts cancelling victim's waiting lock. That's why we + don't check trx->lock.wait_thr here if the function was called from deadlock + resolution function. */ ut_ad(from_deadlock || trx->lock.wait_thr); if (trx->lock.was_chosen_as_deadlock_victim) @@ -3193,6 +3200,8 @@ lock_rec_store_on_page_infimum( ut_ad(block->page.frame == page_align(rec)); const page_id_t id{block->page.id()}; + ut_d(SCOPE_EXIT( + []() { DEBUG_SYNC_C("lock_rec_store_on_page_infimum_end"); })); LockGuard g{lock_sys.rec_hash, id}; lock_rec_move(g.cell(), *block, id, g.cell(), id, @@ -5770,17 +5779,30 @@ void lock_sys_t::cancel_lock_wait_for_trx(trx_t *trx) /** Cancel a waiting lock request. @tparam check_victim whether to check for DB_DEADLOCK -@param lock waiting lock request @param trx active transaction +@param lock waiting lock request @retval DB_SUCCESS if no lock existed @retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set @retval DB_LOCK_WAIT if the lock was canceled */ template dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock) { + DEBUG_SYNC_C("lock_sys_t_cancel_enter"); mysql_mutex_assert_owner(&lock_sys.wait_mutex); - ut_ad(trx->lock.wait_lock == lock); ut_ad(trx->state == TRX_STATE_ACTIVE); + /* trx->lock.wait_lock may be changed by other threads as long as + we are not holding lock_sys.latch. + + So, trx->lock.wait_lock==lock does not necessarily hold, but both + pointers should be valid, because other threads cannot assign + trx->lock.wait_lock=nullptr (or invalidate *lock) while we are + holding lock_sys.wait_mutex. Also, the type of trx->lock.wait_lock + (record or table lock) cannot be changed by other threads. So, it is + safe to call lock->is_table() while not holding lock_sys.latch. If + we have to release and reacquire lock_sys.wait_mutex, we must reread + trx->lock.wait_lock. We must also reread trx->lock.wait_lock after + lock_sys.latch acquiring, as it can be changed to not-null in lock moving + functions even if we hold lock_sys.wait_mutex. */ dberr_t err= DB_SUCCESS; /* This would be too large for a memory transaction, except in the DB_DEADLOCK case, which was already tested in lock_trx_handle_wait(). */ @@ -5802,6 +5824,15 @@ dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock) } else { + /* This function is invoked from the thread which executes the + transaction. Table locks are requested before record locks. Some other + transaction can't change trx->lock.wait_lock from table to record for the + current transaction at this point, because the current transaction has not + requested record locks yet. There is no need to move any table locks by + other threads. And trx->lock.wait_lock can't be set to null while we are + holding lock_sys.wait_mutex. That's why there is no need to reload + trx->lock.wait_lock here. */ + ut_ad(lock == trx->lock.wait_lock); resolve_table_lock: dict_table_t *table= lock->un_member.tab_lock.table; if (!table->lock_mutex_trylock()) @@ -5812,6 +5843,7 @@ resolve_table_lock: mysql_mutex_unlock(&lock_sys.wait_mutex); table->lock_mutex_lock(); mysql_mutex_lock(&lock_sys.wait_mutex); + /* Cache trx->lock.wait_lock under the corresponding latches. */ lock= trx->lock.wait_lock; if (!lock) goto retreat; @@ -5821,6 +5853,10 @@ resolve_table_lock: goto retreat; } } + else + /* Cache trx->lock.wait_lock under the corresponding latches if + it was not cached yet */ + lock= trx->lock.wait_lock; if (lock->is_waiting()) lock_cancel_waiting_and_release(lock); /* Even if lock->is_waiting() did not hold above, we must return @@ -5844,6 +5880,7 @@ retreat: mysql_mutex_unlock(&lock_sys.wait_mutex); lock_sys.wr_lock(SRW_LOCK_CALL); mysql_mutex_lock(&lock_sys.wait_mutex); + /* Cache trx->lock.wait_lock under the corresponding latches. */ lock= trx->lock.wait_lock; /* Even if waiting lock was cancelled while lock_sys.wait_mutex was unlocked, we need to return deadlock error if transaction was chosen @@ -5855,6 +5892,9 @@ retreat: } else { + /* Cache trx->lock.wait_lock under the corresponding latches if + it was not cached yet */ + lock= trx->lock.wait_lock; resolve_record_lock: if (lock->is_waiting()) lock_cancel_waiting_and_release(lock); @@ -5876,6 +5916,7 @@ resolve_record_lock: void lock_sys_t::cancel(trx_t *trx) { mysql_mutex_lock(&lock_sys.wait_mutex); + /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */ if (lock_t *lock= trx->lock.wait_lock) { /* Dictionary transactions must be immune to KILL, because they @@ -5943,6 +5984,7 @@ dberr_t lock_trx_handle_wait(trx_t *trx) mysql_mutex_lock(&lock_sys.wait_mutex); if (trx->lock.was_chosen_as_deadlock_victim) err= DB_DEADLOCK; + /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */ else if (lock_t *wait_lock= trx->lock.wait_lock) err= lock_sys_t::cancel(trx, wait_lock); lock_sys.deadlock_check(); From df9f9ba12bc0f2a564d05319173efc1602d2edd9 Mon Sep 17 00:00:00 2001 From: Thirunarayanan Balathandayuthapani Date: Tue, 21 Feb 2023 10:57:52 +0530 Subject: [PATCH 2/4] MDEV-29871 innodb_fts.fulltext_misc unexpectedly reports a result - match()+0 returns the floating result and converts into integer value and it leads to sporadic failure. --- mysql-test/suite/innodb_fts/r/fulltext_misc.result | 3 ++- mysql-test/suite/innodb_fts/t/fulltext_misc.test | 5 +---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/mysql-test/suite/innodb_fts/r/fulltext_misc.result b/mysql-test/suite/innodb_fts/r/fulltext_misc.result index c58cf5ba62c..69812ff8b72 100644 --- a/mysql-test/suite/innodb_fts/r/fulltext_misc.result +++ b/mysql-test/suite/innodb_fts/r/fulltext_misc.result @@ -129,8 +129,9 @@ test select * from t1 where a like "te_t"; a test -select * from t1 where match a against ("te*" in boolean mode)+0; +select * from t1 where match a against ("te*" in boolean mode); a +test drop table t1; # # Bug #49734: Crash on EXPLAIN EXTENDED UNION ... ORDER BY diff --git a/mysql-test/suite/innodb_fts/t/fulltext_misc.test b/mysql-test/suite/innodb_fts/t/fulltext_misc.test index 7a1ddd98d2b..083953a44ce 100644 --- a/mysql-test/suite/innodb_fts/t/fulltext_misc.test +++ b/mysql-test/suite/innodb_fts/t/fulltext_misc.test @@ -152,10 +152,7 @@ insert into t1 values ("a"),("abc"),("abcd"),("hello"),("test"); select * from t1 where a like "abc%"; select * from t1 where a like "test%"; select * from t1 where a like "te_t"; -# InnoDB_FTS: we don't support the postfix "+0" -# Work around MDEV-29871 (FIXME: remove this) ---echo select * from t1 where match a against ("te*" in boolean mode)+0; ---echo a +select * from t1 where match a against ("te*" in boolean mode); drop table t1; From db245e11404463a2d0d38de76a7c89cdf672f613 Mon Sep 17 00:00:00 2001 From: Thirunarayanan Balathandayuthapani Date: Wed, 22 Feb 2023 16:57:18 +0530 Subject: [PATCH 3/4] MDEV-25984 Assertion `max_doc_id > 0' failed in fts_init_doc_id() - rollback_inplace_alter_table() locks the fts internal tables. At the time, insert tries to fetch the doc id from config table, fails to lock the config table and returns doc id as 0. fts_cmp_set_sync_doc_id(): Retry to fetch the doc id again if it encounter DB_LOCK_WAIT_TIMEOUT error --- .../innodb_fts/r/concurrent_insert.result | 30 ++++++++++++++++++- .../suite/innodb_fts/t/concurrent_insert.test | 30 ++++++++++++++++++- storage/innobase/fts/fts0fts.cc | 7 +++-- storage/innobase/handler/handler0alter.cc | 1 + 4 files changed, 63 insertions(+), 5 deletions(-) diff --git a/mysql-test/suite/innodb_fts/r/concurrent_insert.result b/mysql-test/suite/innodb_fts/r/concurrent_insert.result index 3cb48d22df1..2335982816b 100644 --- a/mysql-test/suite/innodb_fts/r/concurrent_insert.result +++ b/mysql-test/suite/innodb_fts/r/concurrent_insert.result @@ -31,5 +31,33 @@ set DEBUG_SYNC= 'now SIGNAL fts_drop_index'; connection con1; drop table t1, t2; connection default; -set DEBUG_SYNC=RESET; SET @@GLOBAL.debug_dbug = @saved_dbug; +disconnect con1; +# +# MDEV-25984 Assertion `max_doc_id > 0' failed in fts_init_doc_id() +# +call mtr.add_suppression("InnoDB: \\(Lock wait timeout\\) while getting next doc id for table `test`.`t1`"); +CREATE TABLE t1(f1 CHAR(100), f2 INT, fulltext(f1))ENGINE=InnoDB; +INSERT INTO t1 VALUES("mariadb", 1), ("innodb", 1); +# restart +SET DEBUG_SYNC='innodb_rollback_after_fts_lock SIGNAL insert_dml WAIT_FOR ddl_continue'; +ALTER TABLE t1 ADD UNIQUE INDEX(f2); +connect con1,localhost,root,,,; +SET DEBUG_SYNC='now WAIT_FOR insert_dml'; +SET DEBUG_SYNC='fts_cmp_set_sync_doc_id_retry SIGNAL ddl_continue WAIT_FOR dml_finish'; +INSERT INTO t1 VALUES("index", 2); +connection default; +ERROR 23000: Duplicate entry '1' for key 'f2' +SET DEBUG_SYNC="now SIGNAL dml_finish"; +connection con1; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `f1` char(100) DEFAULT NULL, + `f2` int(11) DEFAULT NULL, + FULLTEXT KEY `f1` (`f1`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +connection default; +disconnect con1; +DROP TABLE t1; +set DEBUG_SYNC=RESET; diff --git a/mysql-test/suite/innodb_fts/t/concurrent_insert.test b/mysql-test/suite/innodb_fts/t/concurrent_insert.test index d70fc0f63c4..b6991f6e503 100644 --- a/mysql-test/suite/innodb_fts/t/concurrent_insert.test +++ b/mysql-test/suite/innodb_fts/t/concurrent_insert.test @@ -48,5 +48,33 @@ connection con1; reap; drop table t1, t2; connection default; -set DEBUG_SYNC=RESET; SET @@GLOBAL.debug_dbug = @saved_dbug; +disconnect con1; + +--echo # +--echo # MDEV-25984 Assertion `max_doc_id > 0' failed in fts_init_doc_id() +--echo # +call mtr.add_suppression("InnoDB: \\(Lock wait timeout\\) while getting next doc id for table `test`.`t1`"); + +CREATE TABLE t1(f1 CHAR(100), f2 INT, fulltext(f1))ENGINE=InnoDB; +INSERT INTO t1 VALUES("mariadb", 1), ("innodb", 1); +--source include/restart_mysqld.inc +SET DEBUG_SYNC='innodb_rollback_after_fts_lock SIGNAL insert_dml WAIT_FOR ddl_continue'; +SEND ALTER TABLE t1 ADD UNIQUE INDEX(f2); + +connect(con1,localhost,root,,,); +SET DEBUG_SYNC='now WAIT_FOR insert_dml'; +SET DEBUG_SYNC='fts_cmp_set_sync_doc_id_retry SIGNAL ddl_continue WAIT_FOR dml_finish'; +send INSERT INTO t1 VALUES("index", 2); + +connection default; +--error ER_DUP_ENTRY +reap; +SET DEBUG_SYNC="now SIGNAL dml_finish"; +connection con1; +reap; +SHOW CREATE TABLE t1; +connection default; +disconnect con1; +DROP TABLE t1; +set DEBUG_SYNC=RESET; diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc index e26aebedfaa..9afbd8604a0 100644 --- a/storage/innobase/fts/fts0fts.cc +++ b/storage/innobase/fts/fts0fts.cc @@ -2575,7 +2575,6 @@ fts_cmp_set_sync_doc_id( que_t* graph = NULL; fts_cache_t* cache = table->fts->cache; char table_name[MAX_FULL_NAME_LEN]; -retry: ut_a(table->fts->doc_col != ULINT_UNDEFINED); fts_table.suffix = "CONFIG"; @@ -2583,7 +2582,8 @@ retry: fts_table.type = FTS_COMMON_TABLE; fts_table.table = table; - trx = trx_create(); + trx= trx_create(); +retry: trx_start_internal(trx); trx->op_info = "update the next FTS document id"; @@ -2663,7 +2663,8 @@ func_exit: "for table " << table->name; fts_sql_rollback(trx); - if (error == DB_DEADLOCK) { + if (error == DB_DEADLOCK || error == DB_LOCK_WAIT_TIMEOUT) { + DEBUG_SYNC_C("fts_cmp_set_sync_doc_id_retry"); std::this_thread::sleep_for(FTS_DEADLOCK_RETRY_WAIT); goto retry; } diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index f2a2ae7008b..9e9c0a17a39 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -9019,6 +9019,7 @@ inline bool rollback_inplace_alter_table(Alter_inplace_info *ha_alter_info, ut_a(!lock_table_for_trx(dict_sys.sys_fields, ctx->trx, LOCK_X)); } innodb_lock_wait_timeout= save_timeout; + DEBUG_SYNC_C("innodb_rollback_after_fts_lock"); row_mysql_lock_data_dictionary(ctx->trx); ctx->rollback_instant(); innobase_rollback_sec_index(ctx->old_table, table, From 0de3be8cfdfc26f5c236eaefe12d03c7b4af22c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Fri, 24 Feb 2023 14:24:44 +0200 Subject: [PATCH 4/4] MDEV-30671 InnoDB undo log truncation fails to wait for purge of history It is not safe to invoke trx_purge_free_segment() or execute innodb_undo_log_truncate=ON before all undo log records in the rollback segment has been processed. A prominent failure that would occur due to premature freeing of undo log pages is that trx_undo_get_undo_rec() would crash when trying to copy an undo log record to fetch the previous version of a record. If trx_undo_get_undo_rec() was not invoked in the unlucky time frame, then the symptom would be that some committed transaction history is never removed. This would be detected by CHECK TABLE...EXTENDED that was impleented in commit ab0190101b0587e0e03b2d75a967050b9a85fd1b. Such a garbage collection leak should be possible even when using innodb_undo_log_truncate=OFF, just involving trx_purge_free_segment(). trx_rseg_t::needs_purge: Change the type from Boolean to a transaction identifier, noting the most recent non-purged transaction, or 0 if everything has been purged. On transaction start, we initialize this to 1 more than the transaction start ID. On recovery, the field may be adjusted to the transaction end ID (TRX_UNDO_TRX_NO) if it is larger. The field TRX_UNDO_NEEDS_PURGE becomes write-only; only some debug assertions that would validate the value. The field reflects the old inaccurate Boolean field trx_rseg_t::needs_purge. trx_undo_mem_create_at_db_start(), trx_undo_lists_init(), trx_rseg_mem_restore(): Remove the parameter max_trx_id. Instead, store the maximum in trx_rseg_t::needs_purge, where trx_rseg_array_init() will find it. trx_purge_free_segment(): Contiguously hold a lock on trx_rseg_t to prevent any concurrent allocation of undo log. trx_purge_truncate_rseg_history(): Only invoke trx_purge_free_segment() if the rollback segment is empty and there are no pending transactions associated with it. trx_purge_truncate_history(): Only proceed with innodb_undo_log_truncate=ON if trx_rseg_t::needs_purge indicates that all history has been purged. Tested by: Matthias Leich --- mysql-test/suite/gcol/r/gcol_purge.result | 7 +- mysql-test/suite/gcol/t/gcol_purge.test | 8 +- .../innodb/r/cursor-restore-locking.result | 4 + mysql-test/suite/innodb/r/dml_purge.result | 5 + .../suite/innodb/r/gap_lock_split.result | 1 + .../suite/innodb/r/innodb_bug84958.result | 11 +- .../innodb/t/cursor-restore-locking.test | 4 + mysql-test/suite/innodb/t/dml_purge.test | 6 + mysql-test/suite/innodb/t/gap_lock_split.test | 1 + .../suite/innodb/t/innodb_bug84958.test | 11 +- storage/innobase/include/trx0rseg.h | 18 ++- storage/innobase/include/trx0undo.h | 6 +- storage/innobase/trx/trx0purge.cc | 120 +++++++----------- storage/innobase/trx/trx0rseg.cc | 48 ++++--- storage/innobase/trx/trx0trx.cc | 92 ++++++-------- storage/innobase/trx/trx0undo.cc | 60 +++++---- 16 files changed, 204 insertions(+), 198 deletions(-) diff --git a/mysql-test/suite/gcol/r/gcol_purge.result b/mysql-test/suite/gcol/r/gcol_purge.result index ea8369ad8e5..11063c7cd6f 100644 --- a/mysql-test/suite/gcol/r/gcol_purge.result +++ b/mysql-test/suite/gcol/r/gcol_purge.result @@ -1,7 +1,11 @@ +SET @save_frequency=@@GLOBAL.innodb_purge_rseg_truncate_frequency; +SET @save_dbug=@@GLOBAL.debug_dbug; +SET GLOBAL innodb_purge_rseg_truncate_frequency=1; CREATE TABLE t1(f1 INT NOT NULL, f2 int not null, f3 int generated always as (f2 * 2) VIRTUAL, primary key(f1), INDEX (f3))ENGINE=InnoDB; connect con1,localhost,root,,,; +InnoDB 0 transactions not purged START TRANSACTION WITH CONSISTENT SNAPSHOT; connection default; INSERT INTO t1(f1, f2) VALUES(1,2); @@ -18,5 +22,6 @@ commit; disconnect con1; disconnect con2; connection default; -set global debug_dbug=default; +SET GLOBAL innodb_purge_rseg_truncate_frequency=@save_frequency; +SET GLOBAL debug_dbug=@save_dbug; DROP TABLE t1; diff --git a/mysql-test/suite/gcol/t/gcol_purge.test b/mysql-test/suite/gcol/t/gcol_purge.test index 3696b41b3d8..ecfd89f4469 100644 --- a/mysql-test/suite/gcol/t/gcol_purge.test +++ b/mysql-test/suite/gcol/t/gcol_purge.test @@ -1,9 +1,14 @@ --source include/have_innodb.inc --source include/have_debug.inc + +SET @save_frequency=@@GLOBAL.innodb_purge_rseg_truncate_frequency; +SET @save_dbug=@@GLOBAL.debug_dbug; +SET GLOBAL innodb_purge_rseg_truncate_frequency=1; CREATE TABLE t1(f1 INT NOT NULL, f2 int not null, f3 int generated always as (f2 * 2) VIRTUAL, primary key(f1), INDEX (f3))ENGINE=InnoDB; connect(con1,localhost,root,,,); +--source ../innodb/include/wait_all_purged.inc START TRANSACTION WITH CONSISTENT SNAPSHOT; connection default; @@ -26,5 +31,6 @@ commit; disconnect con1; disconnect con2; connection default; -set global debug_dbug=default; +SET GLOBAL innodb_purge_rseg_truncate_frequency=@save_frequency; +SET GLOBAL debug_dbug=@save_dbug; DROP TABLE t1; diff --git a/mysql-test/suite/innodb/r/cursor-restore-locking.result b/mysql-test/suite/innodb/r/cursor-restore-locking.result index da95c57ce64..fc56f0935fa 100644 --- a/mysql-test/suite/innodb/r/cursor-restore-locking.result +++ b/mysql-test/suite/innodb/r/cursor-restore-locking.result @@ -1,4 +1,7 @@ +SET @save_freq=@@GLOBAL.innodb_purge_rseg_truncate_frequency; +SET GLOBAL innodb_purge_rseg_truncate_frequency=1; CREATE TABLE t (a int PRIMARY KEY, b int NOT NULL UNIQUE) engine = InnoDB; +InnoDB 0 transactions not purged connect prevent_purge,localhost,root,,; start transaction with consistent snapshot; connect con_del_1,localhost,root,,; @@ -34,3 +37,4 @@ disconnect con_del_2; connection default; SET DEBUG_SYNC = 'RESET'; DROP TABLE t; +SET GLOBAL innodb_purge_rseg_truncate_frequency=@save_freq; diff --git a/mysql-test/suite/innodb/r/dml_purge.result b/mysql-test/suite/innodb/r/dml_purge.result index 95330b80d33..38273d571c0 100644 --- a/mysql-test/suite/innodb/r/dml_purge.result +++ b/mysql-test/suite/innodb/r/dml_purge.result @@ -7,6 +7,7 @@ SET GLOBAL innodb_purge_rseg_truncate_frequency = 1; SET GLOBAL innodb_purge_rseg_truncate_frequency = 1; CREATE TABLE t1(a INT PRIMARY KEY, b INT NOT NULL) ROW_FORMAT=REDUNDANT ENGINE=InnoDB; +InnoDB 0 transactions not purged connect prevent_purge,localhost,root; START TRANSACTION WITH CONSISTENT SNAPSHOT; connection default; @@ -19,7 +20,11 @@ UPDATE t1 SET b=4 WHERE a=3; disconnect prevent_purge; connection default; InnoDB 0 transactions not purged +connection con1; +ROLLBACK; disconnect con1; +connection default; +InnoDB 0 transactions not purged FLUSH TABLE t1 FOR EXPORT; Clustered index root page contents: N_RECS=3; LEVEL=0 diff --git a/mysql-test/suite/innodb/r/gap_lock_split.result b/mysql-test/suite/innodb/r/gap_lock_split.result index 25a3cf711f9..a5765cb5694 100644 --- a/mysql-test/suite/innodb/r/gap_lock_split.result +++ b/mysql-test/suite/innodb/r/gap_lock_split.result @@ -3,6 +3,7 @@ SET GLOBAL innodb_purge_rseg_truncate_frequency=1; CREATE TABLE t1(id INT PRIMARY key, val VARCHAR(16000)) ENGINE=InnoDB; INSERT INTO t1 (id,val) SELECT 2*seq,'x' FROM seq_0_to_1023; connect con1,localhost,root,,; +InnoDB 0 transactions not purged START TRANSACTION WITH CONSISTENT SNAPSHOT; connection default; DELETE FROM t1 WHERE id=1788; diff --git a/mysql-test/suite/innodb/r/innodb_bug84958.result b/mysql-test/suite/innodb/r/innodb_bug84958.result index b721c73a0fc..9a4129647b9 100644 --- a/mysql-test/suite/innodb/r/innodb_bug84958.result +++ b/mysql-test/suite/innodb/r/innodb_bug84958.result @@ -9,12 +9,10 @@ SET GLOBAL innodb_purge_rseg_truncate_frequency= 1; CREATE PROCEDURE insert_n(start int, end int) BEGIN DECLARE i INT DEFAULT start; -START TRANSACTION; WHILE i <= end do INSERT INTO t1 VALUES (1, 2, 3) ON DUPLICATE KEY UPDATE c = i; SET i = i + 1; END WHILE; -COMMIT; END~~ CREATE FUNCTION num_pages_get() RETURNS INT @@ -29,7 +27,8 @@ END~~ # Create a table with one record in it and start an RR transaction # CREATE TABLE t1 (a INT, b INT, c INT, PRIMARY KEY(a,b), KEY (b,c)) -ENGINE=InnoDB; +ENGINE=InnoDB STATS_PERSISTENT=0; +InnoDB 0 transactions not purged BEGIN; SELECT * FROM t1; a b c @@ -38,20 +37,24 @@ a b c # connect con2, localhost, root,,; connection con2; +BEGIN; INSERT INTO t1 VALUES (1, 2, 3) ON DUPLICATE KEY UPDATE c = NULL; CALL insert_n(1, 50);; connect con3, localhost, root,,; connection con3; +BEGIN; CALL insert_n(51, 100);; connection con2; +COMMIT; connection con3; INSERT INTO t1 VALUES (1, 2, 1) ON DUPLICATE KEY UPDATE c = NULL; +COMMIT; connection default; # # Connect to default and record how many pages were accessed # when selecting the record using the secondary key. # -InnoDB 4 transactions not purged +InnoDB 2 transactions not purged SET @num_pages_1 = num_pages_get(); SELECT * FROM t1 force index (b); a b c diff --git a/mysql-test/suite/innodb/t/cursor-restore-locking.test b/mysql-test/suite/innodb/t/cursor-restore-locking.test index f3a60f25568..0f083f9295b 100644 --- a/mysql-test/suite/innodb/t/cursor-restore-locking.test +++ b/mysql-test/suite/innodb/t/cursor-restore-locking.test @@ -3,8 +3,11 @@ source include/have_debug.inc; source include/have_debug_sync.inc; +SET @save_freq=@@GLOBAL.innodb_purge_rseg_truncate_frequency; +SET GLOBAL innodb_purge_rseg_truncate_frequency=1; CREATE TABLE t (a int PRIMARY KEY, b int NOT NULL UNIQUE) engine = InnoDB; +--source include/wait_all_purged.inc --connect(prevent_purge,localhost,root,,) start transaction with consistent snapshot; @@ -80,4 +83,5 @@ INSERT INTO t VALUES(30, 20); SET DEBUG_SYNC = 'RESET'; DROP TABLE t; +SET GLOBAL innodb_purge_rseg_truncate_frequency=@save_freq; --source include/wait_until_count_sessions.inc diff --git a/mysql-test/suite/innodb/t/dml_purge.test b/mysql-test/suite/innodb/t/dml_purge.test index 37178982c8d..7034939aa4e 100644 --- a/mysql-test/suite/innodb/t/dml_purge.test +++ b/mysql-test/suite/innodb/t/dml_purge.test @@ -14,6 +14,7 @@ SET GLOBAL innodb_purge_rseg_truncate_frequency = 1; CREATE TABLE t1(a INT PRIMARY KEY, b INT NOT NULL) ROW_FORMAT=REDUNDANT ENGINE=InnoDB; +--source include/wait_all_purged.inc --connect (prevent_purge,localhost,root) START TRANSACTION WITH CONSISTENT SNAPSHOT; @@ -33,7 +34,12 @@ UPDATE t1 SET b=4 WHERE a=3; # Initiate a full purge, which should reset the DB_TRX_ID except for a=3. --source include/wait_all_purged.inc # Initiate a ROLLBACK of the update, which should reset the DB_TRX_ID for a=3. +--connection con1 +ROLLBACK; --disconnect con1 +--connection default +# Reset the DB_TRX_ID for the hidden ADD COLUMN metadata record. +--source include/wait_all_purged.inc FLUSH TABLE t1 FOR EXPORT; # The following is based on innodb.table_flags: diff --git a/mysql-test/suite/innodb/t/gap_lock_split.test b/mysql-test/suite/innodb/t/gap_lock_split.test index 462f073ddce..8211a612d35 100644 --- a/mysql-test/suite/innodb/t/gap_lock_split.test +++ b/mysql-test/suite/innodb/t/gap_lock_split.test @@ -9,6 +9,7 @@ CREATE TABLE t1(id INT PRIMARY key, val VARCHAR(16000)) ENGINE=InnoDB; INSERT INTO t1 (id,val) SELECT 2*seq,'x' FROM seq_0_to_1023; connect(con1,localhost,root,,); +source include/wait_all_purged.inc; # Prevent purge. START TRANSACTION WITH CONSISTENT SNAPSHOT; connection default; diff --git a/mysql-test/suite/innodb/t/innodb_bug84958.test b/mysql-test/suite/innodb/t/innodb_bug84958.test index cbcc5d1abc6..b42f7bd639e 100644 --- a/mysql-test/suite/innodb/t/innodb_bug84958.test +++ b/mysql-test/suite/innodb/t/innodb_bug84958.test @@ -13,12 +13,10 @@ DELIMITER ~~; CREATE PROCEDURE insert_n(start int, end int) BEGIN DECLARE i INT DEFAULT start; - START TRANSACTION; WHILE i <= end do INSERT INTO t1 VALUES (1, 2, 3) ON DUPLICATE KEY UPDATE c = i; SET i = i + 1; END WHILE; - COMMIT; END~~ CREATE FUNCTION num_pages_get() @@ -36,7 +34,8 @@ DELIMITER ;~~ --echo # Create a table with one record in it and start an RR transaction --echo # CREATE TABLE t1 (a INT, b INT, c INT, PRIMARY KEY(a,b), KEY (b,c)) -ENGINE=InnoDB; +ENGINE=InnoDB STATS_PERSISTENT=0; +--source include/wait_all_purged.inc BEGIN; SELECT * FROM t1; @@ -45,18 +44,22 @@ SELECT * FROM t1; --echo # connect (con2, localhost, root,,); connection con2; +BEGIN; INSERT INTO t1 VALUES (1, 2, 3) ON DUPLICATE KEY UPDATE c = NULL; --send CALL insert_n(1, 50); connect (con3, localhost, root,,); connection con3; +BEGIN; --send CALL insert_n(51, 100); connection con2; reap; +COMMIT; connection con3; reap; INSERT INTO t1 VALUES (1, 2, 1) ON DUPLICATE KEY UPDATE c = NULL; +COMMIT; connection default; @@ -64,7 +67,7 @@ connection default; --echo # Connect to default and record how many pages were accessed --echo # when selecting the record using the secondary key. --echo # ---let $wait_all_purged=4 +--let $wait_all_purged=2 --source include/wait_all_purged.inc SET @num_pages_1 = num_pages_get(); SELECT * FROM t1 force index (b); diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h index d08ed709b14..96655c7020f 100644 --- a/storage/innobase/include/trx0rseg.h +++ b/storage/innobase/include/trx0rseg.h @@ -128,15 +128,19 @@ struct trx_rseg_t { /** trx_t::no | last_offset << 48 */ uint64_t last_commit_and_offset; - /** Whether the log segment needs purge */ - bool needs_purge; + /** Last known transaction that has not been purged yet, + or 0 if everything has been purged. */ + trx_id_t needs_purge; - /** Reference counter to track rseg allocated transactions. */ - ulint trx_ref_count; + /** Number of active (non-committed) transactions associated with a + an is_persistent() rollback segment. Needed for protecting + trx->rsegs.m_redo.rseg assignments + before trx->rsegs.m_redo.undo has been assigned. */ + ulint trx_ref_count; - /** If true, then skip allocating this rseg as it reside in - UNDO-tablespace marked for truncate. */ - bool skip_allocation; + /** whether undo log truncation was initiated, and transactions + cannot be allocated in this is_persistent() rollback segment */ + bool skip_allocation; /** @return the commit ID of the last committed transaction */ trx_id_t last_trx_no() const diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h index 1ae23856087..a4578d61fe2 100644 --- a/storage/innobase/include/trx0undo.h +++ b/storage/innobase/include/trx0undo.h @@ -258,12 +258,10 @@ trx_undo_free_at_shutdown(trx_t *trx); @param[in,out] rseg rollback segment @param[in] id rollback segment slot @param[in] page_no undo log segment page number -@param[in,out] max_trx_id the largest observed transaction ID @return the undo log @retval nullptr on error */ trx_undo_t * -trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no, - trx_id_t &max_trx_id); +trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no); #endif /* !UNIV_INNOCHECKSUM */ @@ -407,6 +405,8 @@ or 0 if the transaction has not been committed */ /** Before MariaDB 10.3.1, when purge did not reset DB_TRX_ID of surviving user records, this used to be called TRX_UNDO_DEL_MARKS. +This field is redundant; it is only being read by some debug assertions. + The value 1 indicates that purge needs to process the undo log segment. The value 0 indicates that all of it has been processed, and trx_purge_free_segment() has been invoked, so the log is not safe to access. diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index 4d84f295c0b..f273903ef93 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -221,6 +221,7 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr) trx_ulogf_t* undo_header = undo_page->frame + undo->hdr_offset; ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1); + ut_ad(rseg->needs_purge > trx->id); if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rseg_header->frame))) { @@ -309,7 +310,6 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr) rseg->last_page_no = undo->hdr_page_no; rseg->set_last_commit(undo->hdr_offset, trx->rw_trx_hash_element->no); - rseg->needs_purge = true; } trx_sys.rseg_history_len++; @@ -339,16 +339,15 @@ static void trx_purge_remove_log_hdr(buf_block_t *rseg, buf_block_t* log, } /** Free an undo log segment, and remove the header from the history list. +@param[in,out] mtr mini-transaction @param[in,out] rseg rollback segment @param[in] hdr_addr file address of log_hdr */ static -void -trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr) +void trx_purge_free_segment(mtr_t &mtr, trx_rseg_t* rseg, fil_addr_t hdr_addr) { - mtr_t mtr; - + mtr.commit(); mtr.start(); - mutex_enter(&rseg->mutex); + ut_ad(mutex_own(&rseg->mutex)); buf_block_t* rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr); buf_block_t* block = trx_undo_page_get( @@ -365,13 +364,9 @@ trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr) while (!fseg_free_step_not_header( TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER + block->frame, &mtr)) { - mutex_exit(&rseg->mutex); - mtr.commit(); mtr.start(); - mutex_enter(&rseg->mutex); - rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr); block = trx_undo_page_get( @@ -410,10 +405,6 @@ trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr) ut_ad(rseg->curr_size >= seg_size); rseg->curr_size -= seg_size; - - mutex_exit(&(rseg->mutex)); - - mtr_commit(&mtr); } /** Remove unnecessary history data from a rollback segment. @@ -431,8 +422,6 @@ trx_purge_truncate_rseg_history( trx_id_t undo_trx_no; mtr.start(); - ut_ad(rseg.is_persistent()); - mutex_enter(&rseg.mutex); buf_block_t* rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr); @@ -444,7 +433,6 @@ trx_purge_truncate_rseg_history( loop: if (hdr_addr.page == FIL_NULL) { func_exit: - mutex_exit(&rseg.mutex); mtr.commit(); return; } @@ -470,30 +458,26 @@ func_exit: prev_hdr_addr.boffset = static_cast(prev_hdr_addr.boffset - TRX_UNDO_HISTORY_NODE); - if (mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->frame) + if (!rseg.trx_ref_count + && rseg.needs_purge <= (purge_sys.head.trx_no + ? purge_sys.head.trx_no + : purge_sys.tail.trx_no) + && mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + + block->frame) == TRX_UNDO_TO_PURGE && !mach_read_from_2(block->frame + hdr_addr.boffset + TRX_UNDO_NEXT_LOG)) { - - /* We can free the whole log segment */ - - mutex_exit(&rseg.mutex); - mtr.commit(); - - /* calls the trx_purge_remove_log_hdr() - inside trx_purge_free_segment(). */ - trx_purge_free_segment(&rseg, hdr_addr); + /* We can free the whole log segment. + This will call trx_purge_remove_log_hdr(). */ + trx_purge_free_segment(mtr, &rseg, hdr_addr); } else { /* Remove the log hdr from the rseg history. */ trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset, &mtr); - - mutex_exit(&rseg.mutex); - mtr.commit(); } + mtr.commit(); mtr.start(); - mutex_enter(&rseg.mutex); rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr); @@ -568,7 +552,10 @@ static void trx_purge_truncate_history() if (trx_rseg_t *rseg= trx_sys.rseg_array[i]) { ut_ad(rseg->id == i); + ut_ad(rseg->is_persistent()); + mutex_enter(&rseg->mutex); trx_purge_truncate_rseg_history(*rseg, head); + mutex_exit(&rseg->mutex); } } @@ -611,48 +598,44 @@ static void trx_purge_truncate_history() DBUG_LOG("undo", "marking for truncate: " << file->name); - for (ulint i= 0; i < TRX_SYS_N_RSEGS; ++i) - if (trx_rseg_t *rseg= trx_sys.rseg_array[i]) - if (rseg->space == &space) - /* Once set, this rseg will not be allocated to subsequent - transactions, but we will wait for existing active - transactions to finish. */ - rseg->skip_allocation= true; - for (ulint i= 0; i < TRX_SYS_N_RSEGS; ++i) { trx_rseg_t *rseg= trx_sys.rseg_array[i]; if (!rseg || rseg->space != &space) continue; - mutex_enter(&rseg->mutex); - ut_ad(rseg->skip_allocation); ut_ad(rseg->is_persistent()); - if (rseg->trx_ref_count) + + mutex_enter(&rseg->mutex); + /* Once set, this rseg will not be allocated to subsequent + transactions, but we will wait for existing active + transactions to finish and to be purged. */ + rseg->skip_allocation = true; + + if (rseg->trx_ref_count || rseg->needs_purge > head.trx_no) { -not_free: + not_free: mutex_exit(&rseg->mutex); return; } - if (rseg->curr_size != 1) + ut_ad(UT_LIST_GET_LEN(rseg->undo_list) == 0); + /* Check if all segments are cached and safe to remove. */ + ulint cached= 0; + + for (const trx_undo_t *undo= UT_LIST_GET_FIRST(rseg->undo_cached); undo; + undo= UT_LIST_GET_NEXT(undo_list, undo)) { - /* Check if all segments are cached and safe to remove. */ - ulint cached= 0; - for (trx_undo_t *undo= UT_LIST_GET_FIRST(rseg->undo_cached); undo; - undo= UT_LIST_GET_NEXT(undo_list, undo)) - { - if (head.trx_no < undo->trx_id) - goto not_free; - else - cached+= undo->size; - } - - ut_ad(rseg->curr_size > cached); - - if (rseg->curr_size > cached + 1) + if (head.trx_no < undo->trx_id) goto not_free; + else + cached+= undo->size; } + ut_ad(rseg->curr_size > cached); + + if (rseg->curr_size > cached + 1) + goto not_free; + mutex_exit(&rseg->mutex); } @@ -753,6 +736,8 @@ not_free: ut_ad(rseg->id == i); ut_ad(rseg->is_persistent()); + ut_ad(!rseg->trx_ref_count); + ut_ad(rseg->needs_purge <= head.trx_no); ut_d(const auto old_page= rseg->page_no); buf_block_t *rblock= trx_rseg_header_create(&space, i, @@ -775,9 +760,6 @@ not_free: ut_free(undo); } - UT_LIST_INIT(rseg->undo_list, &trx_undo_t::undo_list); - UT_LIST_INIT(rseg->undo_cached, &trx_undo_t::undo_list); - /* These were written by trx_rseg_header_create(). */ ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rblock->frame)); ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE + @@ -786,9 +768,10 @@ not_free: the rseg header */ rseg->curr_size= 1; rseg->trx_ref_count= 0; + rseg->needs_purge= 0; + rseg->skip_allocation= false; rseg->last_page_no= FIL_NULL; rseg->last_commit_and_offset= 0; - rseg->needs_purge= false; } mtr.commit_shrink(space); @@ -812,17 +795,6 @@ not_free: log_buffer_flush_to_disk(); DBUG_SUICIDE();); - for (ulint i= 0; i < TRX_SYS_N_RSEGS; ++i) - { - if (trx_rseg_t *rseg= trx_sys.rseg_array[i]) - { - ut_ad(rseg->id == i); - ut_ad(rseg->is_persistent()); - if (rseg->space == &space) - rseg->skip_allocation= false; - } - } - ib::info() << "Truncated " << file->name; purge_sys.truncate.last= purge_sys.truncate.current; ut_ad(&space == purge_sys.truncate.current); @@ -891,7 +863,6 @@ static void trx_purge_rseg_get_next_history_log( trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO); ut_ad(mach_read_from_2(log_hdr + TRX_UNDO_NEEDS_PURGE) <= 1); - const byte needs_purge = log_hdr[TRX_UNDO_NEEDS_PURGE + 1]; mtr.commit(); @@ -899,7 +870,6 @@ static void trx_purge_rseg_get_next_history_log( purge_sys.rseg->last_page_no = prev_log_addr.page; purge_sys.rseg->set_last_commit(prev_log_addr.boffset, trx_no); - purge_sys.rseg->needs_purge = needs_purge != 0; /* Purge can also produce events, however these are already ordered in the rollback segment and any user generated event will be greater diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc index 34e1ccfc277..35690fb1775 100644 --- a/storage/innobase/trx/trx0rseg.cc +++ b/storage/innobase/trx/trx0rseg.cc @@ -421,10 +421,9 @@ trx_rseg_mem_create(ulint id, fil_space_t* space, uint32_t page_no) /** Read the undo log lists. @param[in,out] rseg rollback segment -@param[in,out] max_trx_id maximum observed transaction identifier @param[in] rseg_header rollback segment header @return error code */ -static dberr_t trx_undo_lists_init(trx_rseg_t *rseg, trx_id_t &max_trx_id, +static dberr_t trx_undo_lists_init(trx_rseg_t *rseg, const buf_block_t *rseg_header) { ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN); @@ -434,8 +433,8 @@ static dberr_t trx_undo_lists_init(trx_rseg_t *rseg, trx_id_t &max_trx_id, uint32_t page_no= trx_rsegf_get_nth_undo(rseg_header, i); if (page_no != FIL_NULL) { - const trx_undo_t *undo= trx_undo_mem_create_at_db_start(rseg, i, page_no, - max_trx_id); + const trx_undo_t *undo= + trx_undo_mem_create_at_db_start(rseg, i, page_no); if (!undo) return DB_CORRUPTION; rseg->curr_size+= undo->size; @@ -448,11 +447,9 @@ static dberr_t trx_undo_lists_init(trx_rseg_t *rseg, trx_id_t &max_trx_id, /** Restore the state of a persistent rollback segment. @param[in,out] rseg persistent rollback segment -@param[in,out] max_trx_id maximum observed transaction identifier @param[in,out] mtr mini-transaction @return error code */ -static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, trx_id_t &max_trx_id, - mtr_t *mtr) +static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, mtr_t *mtr) { buf_block_t* rseg_hdr = trx_rsegf_get_new( rseg->space->id, rseg->page_no, mtr); @@ -460,9 +457,8 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, trx_id_t &max_trx_id, if (!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rseg_hdr->frame)) { trx_id_t id = mach_read_from_8(TRX_RSEG + TRX_RSEG_MAX_TRX_ID + rseg_hdr->frame); - - if (id > max_trx_id) { - max_trx_id = id; + if (id > rseg->needs_purge) { + rseg->needs_purge = id; } const byte* binlog_name = TRX_RSEG + TRX_RSEG_BINLOG_NAME @@ -505,7 +501,7 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, trx_id_t &max_trx_id, rseg->curr_size = mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE + rseg_hdr->frame) + 1; - if (dberr_t err = trx_undo_lists_init(rseg, max_trx_id, rseg_hdr)) { + if (dberr_t err = trx_undo_lists_init(rseg, rseg_hdr)) { return err; } @@ -524,23 +520,20 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, trx_id_t &max_trx_id, const buf_block_t* block = trx_undo_page_get( page_id_t(rseg->space->id, node_addr.page), mtr); - trx_id_t id = mach_read_from_8(block->frame + node_addr.boffset - + TRX_UNDO_TRX_ID); - if (id > max_trx_id) { - max_trx_id = id; - } + trx_id_t trx_id, id; + trx_id = mach_read_from_8(block->frame + node_addr.boffset + + TRX_UNDO_TRX_ID); id = mach_read_from_8(block->frame + node_addr.boffset + TRX_UNDO_TRX_NO); - if (id > max_trx_id) { - max_trx_id = id; + trx_id = std::max(trx_id, id); + + if (trx_id > rseg->needs_purge) { + rseg->needs_purge = trx_id; } rseg->set_last_commit(node_addr.boffset, id); - unsigned purge = mach_read_from_2(block->frame - + node_addr.boffset - + TRX_UNDO_NEEDS_PURGE); - ut_ad(purge <= 1); - rseg->needs_purge = purge != 0; + ut_ad(mach_read_from_2(block->frame + node_addr.boffset + + TRX_UNDO_NEEDS_PURGE) <= 1); if (rseg->last_page_no != FIL_NULL) { @@ -618,9 +611,12 @@ dberr_t trx_rseg_array_init() ut_ad(rseg->id == rseg_id); ut_ad(!trx_sys.rseg_array[rseg_id]); trx_sys.rseg_array[rseg_id] = rseg; - if ((err = trx_rseg_mem_restore( - rseg, max_trx_id, &mtr)) - != DB_SUCCESS) { + err = trx_rseg_mem_restore(rseg, &mtr); + if (rseg->needs_purge > max_trx_id) { + max_trx_id = rseg->needs_purge; + } + + if (err != DB_SUCCESS) { mtr.commit(); break; } diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 6effe925050..4ccf67c3bf9 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -665,6 +665,7 @@ static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg, uint64_t *rows_to_undo) { trx_state_t state; + ut_ad(rseg->needs_purge >= undo->trx_id); /* This is single-threaded startup code, we do not need the protection of trx->mutex here. @@ -688,6 +689,7 @@ static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg, return; } + ++rseg->trx_ref_count; trx_t *trx= trx_create(); trx->state= state; ut_d(trx->start_file= __FILE__); @@ -696,12 +698,6 @@ static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg, trx->rsegs.m_redo.undo= undo; trx->undo_no= undo->top_undo_no + 1; trx->rsegs.m_redo.rseg= rseg; - /* - For transactions with active data will not have rseg size = 1 - or will not qualify for purge limit criteria. So it is safe to increment - this trx_ref_count w/o mutex protection. - */ - ++trx->rsegs.m_redo.rseg->trx_ref_count; *trx->xid= undo->xid; trx->id= undo->trx_id; trx->is_recovered= true; @@ -776,7 +772,8 @@ dberr_t trx_lists_init_at_db_start() ut_ad(trx->start_time == start_time); ut_ad(trx->is_recovered); ut_ad(trx->rsegs.m_redo.rseg == rseg); - ut_ad(trx->rsegs.m_redo.rseg->trx_ref_count); + ut_ad(rseg->trx_ref_count); + ut_ad(rseg->needs_purge); trx->rsegs.m_redo.undo = undo; if (undo->top_undo_no >= trx->undo_no) { @@ -808,20 +805,18 @@ dberr_t trx_lists_init_at_db_start() /** Assign a persistent rollback segment in a round-robin fashion, evenly distributed between 0 and innodb_undo_logs-1 -@return persistent rollback segment -@retval NULL if innodb_read_only */ -static trx_rseg_t* trx_assign_rseg_low() +@param trx transaction */ +static void trx_assign_rseg_low(trx_t *trx) { - if (high_level_read_only) { - ut_ad(!srv_available_undo_logs); - return(NULL); - } - + ut_ad(!trx->rsegs.m_redo.rseg); ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS); /* The first slot is always assigned to the system tablespace. */ ut_ad(trx_sys.rseg_array[0]->space == fil_system.sys_space); + trx_sys.register_rw(trx); + ut_ad(trx->id); + /* Choose a rollback segment evenly distributed between 0 and innodb_undo_logs-1 in a round-robin fashion, skipping those undo tablespaces that are scheduled for truncation. */ @@ -835,7 +830,7 @@ static trx_rseg_t* trx_assign_rseg_low() bool look_for_rollover = false; #endif /* UNIV_DEBUG */ - bool allocated = false; + bool skip_allocation; do { for (;;) { @@ -879,20 +874,18 @@ static trx_rseg_t* trx_assign_rseg_low() break; } - /* By now we have only selected the rseg but not marked it - allocated. By marking it allocated we are ensuring that it will - never be selected for UNDO truncate purge. */ mutex_enter(&rseg->mutex); - if (!rseg->skip_allocation) { - rseg->trx_ref_count++; - allocated = true; + ut_ad(rseg->is_persistent()); + skip_allocation = rseg->skip_allocation; + if (!skip_allocation) { + /* Ensure that the allocation remains valid until + trx_undo_reuse_cached() is invoked. */ + ++rseg->trx_ref_count; } mutex_exit(&rseg->mutex); - } while (!allocated); + } while (skip_allocation); - ut_ad(rseg->trx_ref_count > 0); - ut_ad(rseg->is_persistent()); - return(rseg); + trx->rsegs.m_redo.rseg = rseg; } /** Assign a rollback segment for modifying temporary tables. @@ -976,15 +969,11 @@ trx_start_low( if (!trx->read_only && (trx->mysql_thd == 0 || read_write || trx->ddl)) { - /* Temporary rseg is assigned only if the transaction updates a temporary table */ - trx->rsegs.m_redo.rseg = trx_assign_rseg_low(); - ut_ad(trx->rsegs.m_redo.rseg != 0 - || srv_read_only_mode - || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); - - trx_sys.register_rw(trx); + if (!high_level_read_only) { + trx_assign_rseg_low(trx); + } } else { if (!trx->is_autocommit_non_locking()) { @@ -1081,25 +1070,22 @@ trx_write_serialisation_history( trx_undo_t*& undo = trx->rsegs.m_redo.undo; - if (!undo) { - return; - } - ut_ad(!trx->read_only); - ut_ad(!undo || undo->rseg == rseg); mutex_enter(&rseg->mutex); + ut_ad(rseg->trx_ref_count); + --rseg->trx_ref_count; /* Assign the transaction serialisation number and add any undo log to the purge queue. */ - trx_serialise(trx); if (undo) { + ut_ad(undo->rseg == rseg); + trx_serialise(trx); UT_LIST_REMOVE(rseg->undo_list, undo); trx_purge_add_undo_to_history(trx, undo, mtr); + MONITOR_INC(MONITOR_TRX_COMMIT_UNDO); } mutex_exit(&rseg->mutex); - - MONITOR_INC(MONITOR_TRX_COMMIT_UNDO); } /******************************************************************** @@ -1401,16 +1387,6 @@ inline void trx_t::commit_in_memory(const mtr_t *mtr) ut_ad(!rsegs.m_noredo.undo); - /* Only after trx_undo_commit_cleanup() it is safe to release - our rseg reference. */ - if (trx_rseg_t *rseg= rsegs.m_redo.rseg) - { - mutex_enter(&rseg->mutex); - ut_ad(rseg->trx_ref_count > 0); - --rseg->trx_ref_count; - mutex_exit(&rseg->mutex); - } - /* Free all savepoints, starting from the first. */ trx_named_savept_t *savep= UT_LIST_GET_FIRST(trx_savepoints); @@ -1491,6 +1467,15 @@ void trx_t::commit_low(mtr_t *mtr) mtr->commit(); } + else if (trx_rseg_t *rseg= rsegs.m_redo.rseg) + { + ut_ad(id); + ut_ad(!rsegs.m_redo.undo); + mutex_enter(&rseg->mutex); + --rseg->trx_ref_count; + mutex_exit(&rseg->mutex); + } + #ifndef DBUG_OFF if (debug_sync) DEBUG_SYNC_C("before_trx_state_committed_in_memory"); @@ -2295,10 +2280,7 @@ trx_set_rw_mode( return; } - trx->rsegs.m_redo.rseg = trx_assign_rseg_low(); - ut_ad(trx->rsegs.m_redo.rseg != 0); - - trx_sys.register_rw(trx); + trx_assign_rseg_low(trx); /* So that we can see our own changes. */ if (trx->read_view.is_open()) { diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc index 8a9bf2c7732..a69b748d78b 100644 --- a/storage/innobase/trx/trx0undo.cc +++ b/storage/innobase/trx/trx0undo.cc @@ -837,12 +837,10 @@ static void trx_undo_seg_free(const trx_undo_t *undo) @param[in,out] rseg rollback segment @param[in] id rollback segment slot @param[in] page_no undo log segment page number -@param[in,out] max_trx_id the largest observed transaction ID @return the undo log @retval nullptr on error */ trx_undo_t * -trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no, - trx_id_t &max_trx_id) +trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no) { mtr_t mtr; XID xid; @@ -876,10 +874,20 @@ corrupted: const trx_ulogf_t* const undo_header = block->frame + offset; uint16_t state = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->frame); + const trx_id_t trx_id= mach_read_from_8(undo_header + TRX_UNDO_TRX_ID); + if (trx_id >> 48) { + sql_print_error("InnoDB: corrupted TRX_ID %llx", trx_id); + goto corrupted; + } + /* We will increment rseg->needs_purge, like trx_undo_reuse_cached() + would do it, to avoid trouble on rollback or XA COMMIT. */ + trx_id_t trx_no = trx_id + 1; + switch (state) { case TRX_UNDO_ACTIVE: case TRX_UNDO_PREPARED: if (UNIV_LIKELY(type != 1)) { + trx_no = trx_id + 1; break; } sql_print_error("InnoDB: upgrade from older version than" @@ -902,13 +910,14 @@ corrupted: goto corrupted_type; } read_trx_no: - trx_id_t id = mach_read_from_8(TRX_UNDO_TRX_NO + undo_header); - if (id >> 48) { - sql_print_error("InnoDB: corrupted TRX_NO %llx", id); + trx_no = mach_read_from_8(TRX_UNDO_TRX_NO + undo_header); + if (trx_no >> 48) { + sql_print_error("InnoDB: corrupted TRX_NO %llx", + trx_no); goto corrupted; } - if (id > max_trx_id) { - max_trx_id = id; + if (trx_no < trx_id) { + trx_no = trx_id; } } @@ -921,16 +930,10 @@ corrupted: xid.null(); } - trx_id_t trx_id = mach_read_from_8(undo_header + TRX_UNDO_TRX_ID); - if (trx_id >> 48) { - sql_print_error("InnoDB: corrupted TRX_ID %llx", trx_id); - goto corrupted; - } - if (trx_id > max_trx_id) { - max_trx_id = trx_id; - } - mutex_enter(&rseg->mutex); + if (trx_no > rseg->needs_purge) { + rseg->needs_purge = trx_no; + } trx_undo_t* undo = trx_undo_mem_create( rseg, id, trx_id, &xid, page_no, offset); mutex_exit(&rseg->mutex); @@ -1128,6 +1131,22 @@ trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo, { ut_ad(mutex_own(&rseg->mutex)); + if (rseg->is_persistent()) { + ut_ad(rseg->trx_ref_count); + if (rseg->needs_purge <= trx->id) { + /* trx_purge_truncate_history() compares + rseg->needs_purge <= head.trx_no + so we need to compensate for that. + The rseg->needs_purge after crash + recovery would be at least trx->id + 1, + because that is the minimum possible value + assigned by trx_serialise() on commit. */ + rseg->needs_purge = trx->id + 1; + } + } else { + ut_ad(!rseg->trx_ref_count); + } + trx_undo_t* undo = UT_LIST_GET_FIRST(rseg->undo_cached); if (!undo) { return NULL; @@ -1236,10 +1255,8 @@ buf_block_t* trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo, dberr_t* err, mtr_t* mtr) { - const bool is_temp __attribute__((unused)) = rseg == trx->rsegs.m_noredo.rseg; - - ut_ad(rseg == trx->rsegs.m_redo.rseg - || rseg == trx->rsegs.m_noredo.rseg); + ut_d(const bool is_temp = rseg == trx->rsegs.m_noredo.rseg); + ut_ad(is_temp || rseg == trx->rsegs.m_redo.rseg); ut_ad(undo == (is_temp ? &trx->rsegs.m_noredo.undo : &trx->rsegs.m_redo.undo)); @@ -1259,7 +1276,6 @@ trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo, ); mutex_enter(&rseg->mutex); - buf_block_t* block = trx_undo_reuse_cached(trx, rseg, undo, mtr); if (!block) {