From 727b549310869b83a1566f7b5b06e178a2308b48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 22 May 2024 12:30:30 +0300 Subject: [PATCH] MDEV-34212 InnoDB transaction recovery is incorrect trx_undo_mem_create_at_db_start(): Invoke recv_sys_t::recover() instead of buf_page_get_gen(), so that all undo log pages will be recovered correctly. Failure to do this could prevent InnoDB from starting up due to "Data structure corruption", or it could potentially lead to a situation where InnoDB starts up but some transactions were recovered incorrectly. recv_sys_t::recover(): Only acquire a buffer-fix on the pages, not a shared latch. This is adequate protection, because this function is only being invoked during early startup when no "users" are modifying buffer pool pages. The only writes are due to server bootstrap (the data files being created) or crash recovery (changes from ib_logfile0 being applied). buf_page_get_gen(): Assert that the function is not invoked while crash recovery is in progress, and that the special mode BUF_GET_RECOVER is only invoked during crash recovery or server bootstrap. All this should really have been part of commit 850d61736deba354377634cf344256ee850b93b7 (MDEV-32042). --- storage/innobase/buf/buf0buf.cc | 10 ++++++++++ storage/innobase/include/buf0buf.h | 1 + storage/innobase/log/log0recv.cc | 6 +++--- storage/innobase/trx/trx0undo.cc | 7 +++---- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 43401e9df2c..bdeaae23da9 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -2587,6 +2587,15 @@ buf_page_get_gen( { ulint retries = 0; + /* BUF_GET_RECOVER is only used by recv_sys_t::recover(), + which must be invoked during early server startup when crash + recovery may be in progress. The only case when it may be + invoked outside recovery is when dict_create() has initialized + a new database and is invoking dict_boot(). In this case, the + LSN will be small. */ + ut_ad(mode == BUF_GET_RECOVER + ? recv_recovery_is_on() || log_sys.get_lsn() < 50000 + : !recv_recovery_is_on() || recv_sys.after_apply); ut_ad(!mtr || mtr->is_active()); ut_ad(mtr || mode == BUF_PEEK_IF_IN_POOL); ut_ad((rw_latch == RW_S_LATCH) @@ -2608,6 +2617,7 @@ buf_page_get_gen( /* The caller may pass a dummy page size, because it does not really matter. */ break; + case BUF_GET_RECOVER: case BUF_GET: ut_ad(!mtr->is_freeing_tree()); fil_space_t* s = fil_space_get(page_id.space()); diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index e154f788820..b30763fa232 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -45,6 +45,7 @@ Created 11/5/1995 Heikki Tuuri /** @name Modes for buf_page_get_gen */ /* @{ */ #define BUF_GET 10 /*!< get always */ +#define BUF_GET_RECOVER 9 /*!< like BUF_GET, but in recv_sys.recover() */ #define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */ #define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make the block young in the LRU list */ diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index f73b409e85d..44ba50e5130 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -3678,8 +3678,8 @@ recv_sys_t::recover(const page_id_t page_id, mtr_t *mtr, dberr_t *err) { if (!recovery_on) must_read: - return buf_page_get_gen(page_id, 0, RW_S_LATCH, nullptr, BUF_GET, mtr, - err); + return buf_page_get_gen(page_id, 0, RW_NO_LATCH, nullptr, BUF_GET_RECOVER, + mtr, err); mysql_mutex_lock(&mutex); map::iterator p= pages.find(page_id); @@ -3728,7 +3728,7 @@ recv_sys_t::recover(const page_id_t page_id, mtr_t *mtr, dberr_t *err) goto corrupted; } - mtr->page_lock(block, RW_S_LATCH); + mtr->page_lock(block, RW_NO_LATCH); return block; } diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc index a249fbaf14e..0801ddb2b26 100644 --- a/storage/innobase/trx/trx0undo.cc +++ b/storage/innobase/trx/trx0undo.cc @@ -980,7 +980,7 @@ trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no) mtr.start(); const page_id_t page_id{rseg->space->id, page_no}; - const buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr); + const buf_block_t* block = recv_sys.recover(page_id, &mtr, nullptr); if (UNIV_UNLIKELY(!block)) { corrupted: mtr.commit(); @@ -1094,9 +1094,8 @@ corrupted_type: undo->last_page_no = last_addr.page; undo->top_page_no = last_addr.page; - const buf_block_t* last = buf_page_get( - page_id_t(rseg->space->id, undo->last_page_no), 0, - RW_X_LATCH, &mtr); + const buf_block_t* last = recv_sys.recover( + page_id_t(rseg->space->id, undo->last_page_no), &mtr, nullptr); if (UNIV_UNLIKELY(!last)) { goto corrupted_undo;