MDEV-30148 Race condition between non-persistent statistics and purge

btr_cur_t::open_random_leaf(): Replaces btr_cur_open_at_rnd_pos().
Acquire a shared latch on each page, and finally release all
latches except the one on the leaf page.

This fixes a race condition between the purge of history and
btr_estimate_number_of_different_key_vals(), which turned out
to only hold a buffer-fix on the randomly chosen leaf page.
Typically, an assertion would fail in page_rec_is_supremum().

ibuf_contract(): Start from the beginning of the change buffer,
to simplify the logic. Starting with
commit b42294bc6409794bdbd2051b32fa079d81cea61d
it does not matter much where the change buffer merge is being initiated.

The race condition may have been introduced as early as
mysql/mysql-server@ac74632293
from where it was copied to
commit 2e814d4702d71a04388386a9f591d14a35980bfe.

Reviewed by: Vladislav Lesin
Tested by: Matthias Leich
This commit is contained in:
Marko Mäkelä 2022-12-05 18:00:22 +02:00
parent 95d71272ef
commit 0a7d85c97f
4 changed files with 76 additions and 318 deletions

View File

@ -2629,288 +2629,6 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index,
return err;
}
/**********************************************************************//**
Positions a cursor at a randomly chosen position within a B-tree.
The tree is descended from the root page; on every visited page a random
user record is chosen by page_cur_open_on_rnd_user_rec(), and on non-leaf
pages the chosen node pointer determines the child page to visit next.
@return true if the descent ended with DB_SUCCESS and the cursor was
positioned; false if the index is unavailable (index->page == FIL_NULL)
or a page could not be fetched */
bool
btr_cur_open_at_rnd_pos(
dict_index_t* index, /*!< in: index */
btr_latch_mode latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_cur_t* cursor, /*!< in/out: B-tree cursor */
mtr_t* mtr) /*!< in: mtr */
{
page_cur_t* page_cursor;
/* Default bound on the node pointer size; replaced by
btr_node_ptr_max_size(index) below when the root-as-leaf latch is X. */
ulint node_ptr_max_size = srv_page_size / 2;
/* Level of the page being visited; ULINT_UNDEFINED until the root
page has been read. */
ulint height;
rec_t* node_ptr;
btr_intention_t lock_intention;
/* Blocks fetched on the way down, and the mini-transaction savepoint
taken just before each fetch, so that they can be released or
re-latched individually later. */
buf_block_t* tree_blocks[BTR_MAX_LEVELS];
ulint tree_savepoints[BTR_MAX_LEVELS];
/* Index of the current level in tree_blocks[] */
ulint n_blocks = 0;
/* Number of leading entries of tree_blocks[] already processed by
the release loops below */
ulint n_releases = 0;
/* Allocated on demand by rec_get_offsets(); freed before returning
from the end of the function */
mem_heap_t* heap = NULL;
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs* offsets = offsets_;
rec_offs_init(offsets_);
ut_ad(!index->is_spatial());
/* Split the intention out of latch_mode; latch_mode is modified
through the pointer. */
lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
/* Savepoint taken before the index lock is acquired; it is used below
to release the index S-latch once the leaf has been reached. */
ulint savepoint = mtr_set_savepoint(mtr);
/* Latch type requested for pages above the leaf level */
rw_lock_type_t upper_rw_latch;
switch (latch_mode) {
case BTR_MODIFY_TREE:
/* Most delete-intended operations are purging.
Free blocks and read I/O bandwidth should be prioritized
for them when the history list is growing huge. */
if (lock_intention == BTR_INTENTION_DELETE
&& buf_pool.n_pend_reads
&& trx_sys.history_size_approx()
> BTR_CUR_FINE_HISTORY_LENGTH) {
mtr_x_lock_index(index, mtr);
} else {
mtr_sx_lock_index(index, mtr);
}
upper_rw_latch = RW_X_LATCH;
break;
case BTR_SEARCH_PREV:
case BTR_MODIFY_PREV:
/* This function doesn't support left uncle
page lock for left leaf page lock, when
needed. */
case BTR_SEARCH_TREE:
case BTR_CONT_MODIFY_TREE:
case BTR_CONT_SEARCH_TREE:
/* Unsupported modes: assert in debug builds, and otherwise
handle them like the default case. */
ut_ad(0);
/* fall through */
default:
/* In read-only mode neither the index lock nor latches on the
upper pages are requested. */
if (!srv_read_only_mode) {
mtr_s_lock_index(index, mtr);
upper_rw_latch = RW_S_LATCH;
} else {
upper_rw_latch = RW_NO_LATCH;
}
}
/* Debug injection point that simulates an unavailable index */
DBUG_EXECUTE_IF("test_index_is_unavailable",
return(false););
if (index->page == FIL_NULL) {
/* Since we don't hold index lock until just now, the index
could be modified by others, for example, if this is a
statistics updater for referenced table, it could be marked
as unavailable by 'DROP TABLE' in the mean time, since
we don't hold lock for statistics updater */
return(false);
}
/* Latch type that applies when the root page is itself a leaf */
const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
latch_mode);
page_cursor = btr_cur_get_page_cur(cursor);
page_cursor->index = index;
/* Start the descent from the root page of the index */
page_id_t page_id(index->table->space_id, index->page);
const ulint zip_size = index->table->space->zip_size();
dberr_t err;
if (root_leaf_rw_latch == RW_X_LATCH) {
node_ptr_max_size = btr_node_ptr_max_size(index);
}
height = ULINT_UNDEFINED;
/* One iteration per visited page (or per retry of the root page) */
for (;;) {
page_t* page;
ut_ad(n_blocks < BTR_MAX_LEVELS);
tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
/* The condition groups as (height && latch_mode != BTR_MODIFY_TREE).
Hence a leaf page (height == 0), and every page in BTR_MODIFY_TREE
mode, is requested with RW_NO_LATCH. Note that height is
ULINT_UNDEFINED (nonzero) while the root page is being fetched. */
const rw_lock_type_t rw_latch = height
&& latch_mode != BTR_MODIFY_TREE
? upper_rw_latch : RW_NO_LATCH;
/* NOTE(review): the last argument is true only for leaf pages of
non-clustered indexes; presumably it requests a change buffer
merge -- confirm against buf_page_get_gen(). */
buf_block_t* block = buf_page_get_gen(page_id, zip_size,
rw_latch, NULL, BUF_GET,
mtr, &err,
height == 0
&& !index->is_clust());
tree_blocks[n_blocks] = block;
ut_ad((block != NULL) == (err == DB_SUCCESS));
if (!block) {
if (err == DB_DECRYPTION_FAILED) {
btr_decryption_failed(*index);
}
/* Leave the loop; err != DB_SUCCESS makes the function return
false. */
break;
}
page = buf_block_get_frame(block);
if (height == ULINT_UNDEFINED
&& page_is_leaf(page)
&& rw_latch != RW_NO_LATCH
&& rw_latch != root_leaf_rw_latch) {
/* The root page turned out to be a leaf, but it was latched
with the latch type meant for upper levels. Release it and
fetch it again with the latch type for a root leaf. */
ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
ut_ad(rw_latch == RW_S_LATCH);
ut_ad(n_blocks == 0);
mtr_release_block_at_savepoint(
mtr, tree_savepoints[n_blocks],
tree_blocks[n_blocks]);
upper_rw_latch = root_leaf_rw_latch;
continue;
}
ut_ad(fil_page_index_page_check(page));
ut_ad(index->id == btr_page_get_index_id(page));
if (height == ULINT_UNDEFINED) {
/* We are in the root node */
height = btr_page_get_level(page);
}
if (height == 0) {
/* The leaf was only fetched without a latch (or we are in
read-only mode): latching is delegated to
btr_cur_latch_leaves(). */
if (rw_latch == RW_NO_LATCH
|| srv_read_only_mode) {
btr_cur_latch_leaves(block, latch_mode, cursor,
mtr);
}
/* btr_cur_t::open_leaf() and
btr_cur_search_to_nth_level() release
tree s-latch here.*/
switch (latch_mode) {
case BTR_MODIFY_TREE:
case BTR_CONT_MODIFY_TREE:
case BTR_CONT_SEARCH_TREE:
/* These modes keep the index lock and the upper blocks. */
break;
default:
/* Release the tree s-latch */
if (!srv_read_only_mode) {
mtr_release_s_latch_at_savepoint(
mtr, savepoint,
&index->lock);
}
/* release upper blocks */
for (; n_releases < n_blocks; n_releases++) {
mtr_release_block_at_savepoint(
mtr,
tree_savepoints[n_releases],
tree_blocks[n_releases]);
}
}
}
/* Position the page cursor on a random user record of this page */
page_cursor->block = block;
page_cur_open_on_rnd_user_rec(page_cursor);
if (height == 0) {
/* Leaf level reached: the cursor is positioned. */
break;
}
ut_ad(height > 0);
height--;
/* On a non-leaf page the chosen record is a node pointer. */
node_ptr = page_cur_get_rec(page_cursor);
offsets = rec_get_offsets(node_ptr, page_cursor->index,
offsets, 0, ULINT_UNDEFINED, &heap);
/* If the rec is the first or last in the page for
pessimistic delete intention, it might cause node_ptr insert
for the upper level. We should change the intention and retry.
*/
if (latch_mode == BTR_MODIFY_TREE
&& btr_cur_need_opposite_intention(
page, lock_intention, node_ptr)) {
ut_ad(upper_rw_latch == RW_X_LATCH);
/* release all blocks, including the current one
(note the <= in the loop condition) */
for (; n_releases <= n_blocks; n_releases++) {
mtr_release_block_at_savepoint(
mtr, tree_savepoints[n_releases],
tree_blocks[n_releases]);
}
/* Restart the descent from the root with the widened
intention. */
lock_intention = BTR_INTENTION_BOTH;
page_id.set_page_no(dict_index_get_page(index));
height = ULINT_UNDEFINED;
n_blocks = 0;
n_releases = 0;
continue;
}
if (latch_mode == BTR_MODIFY_TREE
&& !btr_cur_will_modify_tree(
page_cursor->index, page, lock_intention,
node_ptr, node_ptr_max_size, zip_size, mtr)) {
ut_ad(upper_rw_latch == RW_X_LATCH);
ut_ad(n_releases <= n_blocks);
/* we can release upper blocks */
for (; n_releases < n_blocks; n_releases++) {
if (n_releases == 0) {
/* we should not release root page
to pin to same block. */
continue;
}
/* release unused blocks to unpin */
mtr_release_block_at_savepoint(
mtr, tree_savepoints[n_releases],
tree_blocks[n_releases]);
}
}
/* height was already decremented above, so height == 0 here means
that the child page to be fetched next is a leaf. */
if (height == 0
&& latch_mode == BTR_MODIFY_TREE) {
ut_ad(upper_rw_latch == RW_X_LATCH);
/* we should sx-latch root page, if released already.
It contains seg_header. */
if (n_releases > 0) {
mtr->sx_latch_at_savepoint(
tree_savepoints[0],
tree_blocks[0]);
}
/* x-latch the branch blocks not released yet. */
for (ulint i = n_releases; i <= n_blocks; i++) {
mtr->x_latch_at_savepoint(
tree_savepoints[i],
tree_blocks[i]);
}
}
/* Go to the child node */
page_id.set_page_no(
btr_node_ptr_get_child_page_no(node_ptr, offsets));
n_blocks++;
}
/* Free the heap if rec_get_offsets() had to allocate one */
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
return err == DB_SUCCESS;
}
/*==================== B-TREE INSERT =========================*/
/*************************************************************//**

View File

@ -1079,6 +1079,60 @@ btr_record_not_null_field_in_rec(
}
}
/** Position the cursor on the page infimum of a randomly chosen leaf page.
The index is S-locked and every page on the path from the root is fetched
with RW_S_LATCH. On each non-leaf page a random user record is picked and
its child page is visited next. Once a leaf below the root is reached,
the mini-transaction is rolled back so that all latches except the one on
the leaf page are released. If the root page is itself the leaf, nothing
is released.
@param offsets  temporary memory for rec_get_offsets()
@param heap     memory heap for rec_get_offsets()
@param mtr      mini-transaction without any savepoints so far
@return error code
@retval DB_SUCCESS    if page_cur.rec points to the infimum of a leaf page
@retval DB_CORRUPTION if the index has no root page, or the level stored
                      in the root page exceeds BTR_MAX_LEVELS */
inline dberr_t
btr_cur_t::open_random_leaf(rec_offs *&offsets, mem_heap_t *&heap, mtr_t &mtr)
{
  ut_ad(!index()->is_spatial());
  ut_ad(!mtr.get_savepoint());

  mtr_s_lock_index(index(), &mtr);

  if (index()->page == FIL_NULL)
    return DB_CORRUPTION;

  dberr_t err;
  auto page_no= index()->page;
  bool merge= false;
  ulint level= ULINT_UNDEFINED;

  for (;;)
  {
    buf_block_t *const block=
      btr_block_get(*index(), page_no, RW_S_LATCH, merge, &mtr, &err);
    if (!block)
      return err;

    page_cur.block= block;

    /* The level is unknown only while the root page is being visited. */
    const bool at_root= level == ULINT_UNDEFINED;
    if (at_root)
    {
      level= btr_page_get_level(block->page.frame);
      if (level > BTR_MAX_LEVELS)
        return DB_CORRUPTION;
    }

    if (level == 0)
    {
      /* Release everything that was acquired before the leaf page,
      unless the root page is the leaf. */
      if (!at_root)
        mtr.rollback_to_savepoint(0, mtr.get_savepoint() - 1);
      page_cur.rec= page_get_infimum_rec(block->page.frame);
      return DB_SUCCESS;
    }

    /* The next page to fetch is a leaf when the level drops to 0;
    the merge flag is passed only for leaf pages of secondary indexes. */
    level--;
    if (level == 0)
      merge= !index()->is_clust();

    /* Choose a random node pointer record and descend to its child. */
    page_cur_open_on_rnd_user_rec(&page_cur);
    offsets= rec_get_offsets(page_cur.rec, page_cur.index, offsets, 0,
                             ULINT_UNDEFINED, &heap);
    page_no= btr_node_ptr_get_child_page_no(page_cur.rec, offsets);
  }
}
/** Estimated table level stats from sampled value.
@param value sampled stats
@param index index being sampled
@ -1107,7 +1161,6 @@ std::vector<index_field_stats_t>
btr_estimate_number_of_different_key_vals(dict_index_t* index,
trx_id_t bulk_trx_id)
{
btr_cur_t cursor;
page_t* page;
rec_t* rec;
ulint n_cols;
@ -1222,14 +1275,15 @@ btr_estimate_number_of_different_key_vals(dict_index_t* index,
ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
/* We sample some pages in the index to get an estimate */
btr_cur_t cursor;
cursor.page_cur.index = index;
for (ulint i = 0; i < n_sample_pages; i++) {
mtr.start();
if (!btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
&cursor, &mtr)
|| index->table->bulk_trx_id != bulk_trx_id
|| !index->is_readable()) {
if (cursor.open_random_leaf(offsets_rec, heap, mtr) !=
DB_SUCCESS
|| index->table->bulk_trx_id != bulk_trx_id) {
mtr.commit();
goto exit_loop;
}
@ -1242,9 +1296,8 @@ btr_estimate_number_of_different_key_vals(dict_index_t* index,
page = btr_cur_get_page(&cursor);
rec = page_rec_get_next(page_get_infimum_rec(page));
const ulint n_core = page_is_leaf(page)
? index->n_core_fields : 0;
rec = page_rec_get_next(cursor.page_cur.rec);
const ulint n_core = index->n_core_fields;
if (rec && !page_rec_is_supremum(rec)) {
not_empty_flag = 1;

View File

@ -2416,36 +2416,26 @@ will be merged from ibuf trees to the pages read
ulint ibuf_contract()
{
mtr_t mtr;
btr_pcur_t pcur;
btr_cur_t cur;
ulint sum_sizes;
uint32_t page_nos[IBUF_MAX_N_PAGES_MERGED];
uint32_t space_ids[IBUF_MAX_N_PAGES_MERGED];
ibuf_mtr_start(&mtr);
/* Open a cursor to a randomly chosen leaf of the tree, at a random
position within the leaf */
pcur.pos_state = BTR_PCUR_IS_POSITIONED;
pcur.old_rec = nullptr;
pcur.trx_if_known = nullptr;
pcur.search_mode = PAGE_CUR_G;
pcur.latch_mode = BTR_SEARCH_LEAF;
btr_pcur_init(&pcur);
if (!btr_cur_open_at_rnd_pos(ibuf.index, BTR_SEARCH_LEAF,
btr_pcur_get_btr_cur(&pcur), &mtr)) {
if (cur.open_leaf(true, ibuf.index, BTR_SEARCH_LEAF, &mtr) !=
DB_SUCCESS) {
return 0;
}
ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
ut_ad(page_validate(btr_cur_get_page(&cur), ibuf.index));
if (page_is_empty(btr_pcur_get_page(&pcur))) {
if (page_is_empty(btr_cur_get_page(&cur))) {
/* If a B-tree page is empty, it must be the root page
and the whole B-tree must be empty. InnoDB does not
allow empty B-tree pages other than the root. */
ut_ad(ibuf.empty);
ut_ad(btr_pcur_get_block(&pcur)->page.id()
ut_ad(btr_cur_get_block(&cur)->page.id()
== page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
ibuf_mtr_commit(&mtr);
@ -2455,7 +2445,7 @@ ulint ibuf_contract()
ulint n_pages = 0;
sum_sizes = ibuf_get_merge_page_nos(TRUE,
btr_pcur_get_rec(&pcur), &mtr,
btr_cur_get_rec(&cur), &mtr,
space_ids,
page_nos, &n_pages);
ibuf_mtr_commit(&mtr);

View File

@ -172,17 +172,6 @@ dberr_t btr_cur_search_to_nth_level(ulint level,
btr_cur_t *cursor, mtr_t *mtr,
ib_uint64_t autoinc= 0);
/**********************************************************************//**
Positions a cursor at a randomly chosen position within a B-tree.
The tree is descended from the root page, choosing a random user record
on every page that is visited.
@return true if the cursor was positioned; false if the index is
unavailable (it has no root page) or a page of the index could not be
fetched */
bool
btr_cur_open_at_rnd_pos(
dict_index_t* index, /*!< in: index */
btr_latch_mode latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_cur_t* cursor, /*!< in/out: B-tree cursor */
mtr_t* mtr) /*!< in: mtr */
MY_ATTRIBUTE((nonnull,warn_unused_result));
/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
@ -813,6 +802,14 @@ struct btr_cur_t {
@return error code */
dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
mtr_t *mtr);
/** Open the cursor at a random leaf page record.
On success, the cursor points to the page infimum record of a randomly
chosen leaf page, and the page is held with a shared latch.
@param offsets temporary memory for rec_get_offsets()
@param heap memory heap for rec_get_offsets()
@param mtr mini-transaction; must not contain any savepoints yet
@return error code
@retval DB_SUCCESS if the cursor was positioned on a leaf page
@retval DB_CORRUPTION if the index has no root page or the level stored
in the root page exceeds BTR_MAX_LEVELS */
inline dberr_t open_random_leaf(rec_offs *&offsets, mem_heap_t *& heap,
mtr_t &mtr);
};
/** Modify the delete-mark flag of a record.