From f127fb98076c47929e8581d0d99eed886d4307ab Mon Sep 17 00:00:00 2001 From: Yasuhiro Horimoto Date: Tue, 12 Nov 2019 20:05:17 +0900 Subject: [PATCH 1/7] Fix a typo in mariadb-plugin-mroonga.prerm Closes #1407 --- debian/mariadb-plugin-mroonga.prerm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 debian/mariadb-plugin-mroonga.prerm diff --git a/debian/mariadb-plugin-mroonga.prerm b/debian/mariadb-plugin-mroonga.prerm old mode 100644 new mode 100755 index b64ea064142..cdd26ebbc45 --- a/debian/mariadb-plugin-mroonga.prerm +++ b/debian/mariadb-plugin-mroonga.prerm @@ -2,7 +2,7 @@ set -e -# Install Mroonga +# Uninstall Mroonga mysql --defaults-file=/etc/mysql/debian.cnf < /usr/share/mysql/mroonga/uninstall.sql || true # Always exit with success instead of leaving dpkg in a broken state From abd45cdc38e72ce329365ffe0df4c6f8c319b407 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 13 Nov 2019 09:26:10 +0200 Subject: [PATCH 2/7] MDEV-20934: Correct a debug assertion A search with PAGE_CUR_GE may land on the supremum record on a leaf page that is not the rightmost leaf page. This could occur when all keys on the current page are smaller than the search key, and the smallest key on the successor page is larger than the search key. ibuf_delete_recs(): Correct the debug assertion accordingly. --- storage/innobase/ibuf/ibuf0ibuf.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc index 0377bdfe57f..ce6bfe02351 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.cc +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -4349,7 +4349,7 @@ loop: &pcur, &mtr); if (!btr_pcur_is_on_user_rec(&pcur)) { - ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr)); + ut_ad(btr_pcur_is_after_last_on_page(&pcur)); goto func_exit; } From 3b573c0783eded1662760dd7ec56404ceb5f2975 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 13 Nov 2019 09:51:28 +0200 Subject: [PATCH 3/7] Clean up mtr_t::commit() further memo_block_unfix(), memo_latch_release(): Merge to ReleaseLatches. memo_slot_release(), ReleaseAll: Clean up the formatting. --- storage/innobase/mtr/mtr0mtr.cc | 197 +++++++++++--------------------- 1 file changed, 68 insertions(+), 129 deletions(-) diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 533b10b861e..b75a9c4cf02 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -30,9 +30,7 @@ Created 11/26/1995 Heikki Tuuri #include "buf0flu.h" #include "page0types.h" #include "mtr0log.h" -#include "log0log.h" #include "row0trunc.h" - #include "log0recv.h" /** Iterate over a memo block in reverse. */ @@ -204,143 +202,84 @@ private: /** Release latches and decrement the buffer fix count. @param slot memo slot */ -static -void -memo_slot_release(mtr_memo_slot_t* slot) +static void memo_slot_release(mtr_memo_slot_t *slot) { - switch (slot->type) { - case MTR_MEMO_BUF_FIX: - case MTR_MEMO_PAGE_S_FIX: - case MTR_MEMO_PAGE_SX_FIX: - case MTR_MEMO_PAGE_X_FIX: { - - buf_block_t* block; - - block = reinterpret_cast(slot->object); - - buf_block_unfix(block); - buf_page_release_latch(block, slot->type); - break; - } - - case MTR_MEMO_S_LOCK: - rw_lock_s_unlock(reinterpret_cast(slot->object)); - break; - - case MTR_MEMO_SX_LOCK: - rw_lock_sx_unlock(reinterpret_cast(slot->object)); - break; - - case MTR_MEMO_X_LOCK: - rw_lock_x_unlock(reinterpret_cast(slot->object)); - break; - + switch (slot->type) { #ifdef UNIV_DEBUG - default: - ut_ad(slot->type == MTR_MEMO_MODIFY); + default: + ut_ad(!"invalid type"); + break; + case MTR_MEMO_MODIFY: + break; #endif /* UNIV_DEBUG */ - } - - slot->object = NULL; -} - -/** Unfix a page, do not release the latches on the page. -@param slot memo slot */ -static -void -memo_block_unfix(mtr_memo_slot_t* slot) -{ - switch (slot->type) { - case MTR_MEMO_BUF_FIX: - case MTR_MEMO_PAGE_S_FIX: - case MTR_MEMO_PAGE_X_FIX: - case MTR_MEMO_PAGE_SX_FIX: { - buf_block_unfix(reinterpret_cast(slot->object)); - break; - } - - case MTR_MEMO_S_LOCK: - case MTR_MEMO_X_LOCK: - case MTR_MEMO_SX_LOCK: - break; -#ifdef UNIV_DEBUG - default: -#endif /* UNIV_DEBUG */ - break; - } -} -/** Release latches represented by a slot. -@param slot memo slot */ -static -void -memo_latch_release(mtr_memo_slot_t* slot) -{ - switch (slot->type) { - case MTR_MEMO_BUF_FIX: - case MTR_MEMO_PAGE_S_FIX: - case MTR_MEMO_PAGE_SX_FIX: - case MTR_MEMO_PAGE_X_FIX: { - buf_block_t* block; - - block = reinterpret_cast(slot->object); - - memo_block_unfix(slot); - - buf_page_release_latch(block, slot->type); - - slot->object = NULL; - break; - } - - case MTR_MEMO_S_LOCK: - rw_lock_s_unlock(reinterpret_cast(slot->object)); - slot->object = NULL; - break; - - case MTR_MEMO_X_LOCK: - rw_lock_x_unlock(reinterpret_cast(slot->object)); - slot->object = NULL; - break; - - case MTR_MEMO_SX_LOCK: - rw_lock_sx_unlock(reinterpret_cast(slot->object)); - slot->object = NULL; - break; - -#ifdef UNIV_DEBUG - default: - ut_ad(slot->type == MTR_MEMO_MODIFY); - - slot->object = NULL; -#endif /* UNIV_DEBUG */ - } + case MTR_MEMO_S_LOCK: + rw_lock_s_unlock(reinterpret_cast(slot->object)); + break; + case MTR_MEMO_SX_LOCK: + rw_lock_sx_unlock(reinterpret_cast(slot->object)); + break; + case MTR_MEMO_X_LOCK: + rw_lock_x_unlock(reinterpret_cast(slot->object)); + break; + case MTR_MEMO_BUF_FIX: + case MTR_MEMO_PAGE_S_FIX: + case MTR_MEMO_PAGE_SX_FIX: + case MTR_MEMO_PAGE_X_FIX: + buf_block_t *block= reinterpret_cast(slot->object); + buf_block_unfix(block); + buf_page_release_latch(block, slot->type); + break; + } + slot->object= NULL; } /** Release the latches acquired by the mini-transaction. */ struct ReleaseLatches { - - /** @return true always. */ - bool operator()(mtr_memo_slot_t* slot) const - { - if (slot->object != NULL) { - memo_latch_release(slot); - } - - return(true); - } + /** @return true always. */ + bool operator()(mtr_memo_slot_t *slot) const + { + if (!slot->object) + return true; + switch (slot->type) { +#ifdef UNIV_DEBUG + default: + ut_ad(!"invalid type"); + break; + case MTR_MEMO_MODIFY: + break; +#endif /* UNIV_DEBUG */ + case MTR_MEMO_S_LOCK: + rw_lock_s_unlock(reinterpret_cast(slot->object)); + break; + case MTR_MEMO_X_LOCK: + rw_lock_x_unlock(reinterpret_cast(slot->object)); + break; + case MTR_MEMO_SX_LOCK: + rw_lock_sx_unlock(reinterpret_cast(slot->object)); + break; + case MTR_MEMO_BUF_FIX: + case MTR_MEMO_PAGE_S_FIX: + case MTR_MEMO_PAGE_SX_FIX: + case MTR_MEMO_PAGE_X_FIX: + buf_block_t *block= reinterpret_cast(slot->object); + buf_block_unfix(block); + buf_page_release_latch(block, slot->type); + break; + } + slot->object= NULL; + return true; + } }; /** Release the latches and blocks acquired by the mini-transaction. */ struct ReleaseAll { - /** @return true always. */ - bool operator()(mtr_memo_slot_t* slot) const - { - if (slot->object != NULL) { - memo_slot_release(slot); - } - - return(true); - } + /** @return true always. */ + bool operator()(mtr_memo_slot_t *slot) const + { + if (slot->object) + memo_slot_release(slot); + return true; + } }; #ifdef UNIV_DEBUG @@ -349,7 +288,7 @@ struct DebugCheck { /** @return true always. */ bool operator()(const mtr_memo_slot_t* slot) const { - ut_a(slot->object == NULL); + ut_ad(!slot->object); return(true); } }; From 98694ab0cbaf623c6ad67dd45d6f90c5c6214fd1 Mon Sep 17 00:00:00 2001 From: Eugene Kosov Date: Sun, 3 Nov 2019 00:15:29 +0300 Subject: [PATCH 4/7] MDEV-20949 Stop issuing 'row size' error on DML Move row size check to early CREATE/ALTER TABLE phase. Stop checking on table open. dict_index_add_to_cache(): remove parameter 'strict', stop checking row size dict_index_t::record_size_info_t: this is a result of row size check operation create_table_info_t::row_size_is_acceptable(): performs row size check. Issues error or warning. Writes first overflow field to InnoDB log. create_table_info_t::create_table(): add row size check dict_index_t::record_size_info(): this is a refactored version of dict_index_t::rec_potentially_too_big(). New version doesn't change global state of a program but return all interesting info. And it's callers who decide how to handle row size overflow. dict_index_t::rec_potentially_too_big(): removed --- .../suite/innodb/r/innodb-32k-crash.result | 2 - mysql-test/suite/innodb_zip/r/bug53591.result | 1 + .../r/prefix_index_liftedlimit.result | 1 + mysql-test/suite/innodb_zip/t/bug53591.test | 2 + .../t/prefix_index_liftedlimit.test | 2 + storage/innobase/dict/dict0boot.cc | 18 +- storage/innobase/dict/dict0crea.cc | 3 +- storage/innobase/dict/dict0dict.cc | 203 +------------ storage/innobase/handler/ha_innodb.cc | 270 ++++++++++++++++-- storage/innobase/handler/ha_innodb.h | 7 +- storage/innobase/handler/handler0alter.cc | 9 + storage/innobase/include/dict0dict.h | 6 +- storage/innobase/include/dict0mem.h | 66 ++++- storage/innobase/include/page0zip.ic | 3 + storage/innobase/row/row0mysql.cc | 3 +- 15 files changed, 332 insertions(+), 264 deletions(-) diff --git a/mysql-test/suite/innodb/r/innodb-32k-crash.result b/mysql-test/suite/innodb/r/innodb-32k-crash.result index 25b19310481..6c6b7c7fd35 100644 --- a/mysql-test/suite/innodb/r/innodb-32k-crash.result +++ b/mysql-test/suite/innodb/r/innodb-32k-crash.result @@ -136,8 +136,6 @@ v=@h,w=@h,x=@b,y=@h,z=@h, aa=@h,ba=@h,ca=@h,da=@h,ea=@h,fa=@h,ga=@h,ha=@h,ia=@h,ja=@h, ka=@h,la=@h,ma=@h,na=@h,oa=@h,pa=@h,qa=@h,ra=@h,sa=@h,ta=@b,ua=@h, va=@h,wa=@h,xa=@h,ya=@h,za=@h; -Warnings: -Warning 139 Row size too large (> 16318). Changing some columns to TEXT or BLOB or using ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED may help. In current row format, BLOB prefix of 768 bytes is stored inline. BEGIN; UPDATE t1 SET a=@g,b=@g,c=@g,d=@g,e=@g; UPDATE t1 SET f=@g,g=@g,h=@g,i=@g,j=@g; diff --git a/mysql-test/suite/innodb_zip/r/bug53591.result b/mysql-test/suite/innodb_zip/r/bug53591.result index e14a1942750..0e0a5a05e4b 100644 --- a/mysql-test/suite/innodb_zip/r/bug53591.result +++ b/mysql-test/suite/innodb_zip/r/bug53591.result @@ -1,3 +1,4 @@ +call mtr.add_suppression('InnoDB: Cannot add field.*because after adding it, the row size is'); SET GLOBAL innodb_file_per_table=on; SET GLOBAL innodb_strict_mode=on; set old_alter_table=0; diff --git a/mysql-test/suite/innodb_zip/r/prefix_index_liftedlimit.result b/mysql-test/suite/innodb_zip/r/prefix_index_liftedlimit.result index a7a9917b6f4..92a5cd51c8b 100644 --- a/mysql-test/suite/innodb_zip/r/prefix_index_liftedlimit.result +++ b/mysql-test/suite/innodb_zip/r/prefix_index_liftedlimit.result @@ -1,3 +1,4 @@ +call mtr.add_suppression('InnoDB: Cannot add field.*because after adding it, the row size is'); SET @large_prefix_orig = @@GLOBAL.innodb_large_prefix; CREATE TABLE worklog5743 ( col_1_varchar VARCHAR (4000) , col_2_varchar VARCHAR (4000) , diff --git a/mysql-test/suite/innodb_zip/t/bug53591.test b/mysql-test/suite/innodb_zip/t/bug53591.test index 67223027bad..17c79e0f6f8 100644 --- a/mysql-test/suite/innodb_zip/t/bug53591.test +++ b/mysql-test/suite/innodb_zip/t/bug53591.test @@ -1,5 +1,7 @@ -- source include/innodb_page_size_small.inc +call mtr.add_suppression('InnoDB: Cannot add field.*because after adding it, the row size is'); + let $file_per_table=`select @@innodb_file_per_table`; SET GLOBAL innodb_file_per_table=on; diff --git a/mysql-test/suite/innodb_zip/t/prefix_index_liftedlimit.test b/mysql-test/suite/innodb_zip/t/prefix_index_liftedlimit.test index ac4946e08c6..c52ef09fe90 100644 --- a/mysql-test/suite/innodb_zip/t/prefix_index_liftedlimit.test +++ b/mysql-test/suite/innodb_zip/t/prefix_index_liftedlimit.test @@ -15,6 +15,8 @@ --source include/have_innodb.inc --source include/have_innodb_16k.inc +call mtr.add_suppression('InnoDB: Cannot add field.*because after adding it, the row size is'); + SET @large_prefix_orig = @@GLOBAL.innodb_large_prefix; # Prefix index with VARCHAR data type , primary/secondary index and DML ops diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc index 58bcaf29f89..9294cf6263c 100644 --- a/storage/innobase/dict/dict0boot.cc +++ b/storage/innobase/dict/dict0boot.cc @@ -361,8 +361,7 @@ dict_boot(void) error = dict_index_add_to_cache(table, index, mach_read_from_4(dict_hdr - + DICT_HDR_TABLES), - FALSE); + + DICT_HDR_TABLES)); ut_a(error == DB_SUCCESS); /*-------------------------*/ @@ -371,10 +370,8 @@ dict_boot(void) dict_mem_index_add_field(index, "ID", 0); index->id = DICT_TABLE_IDS_ID; - error = dict_index_add_to_cache(table, index, - mach_read_from_4(dict_hdr - + DICT_HDR_TABLE_IDS), - FALSE); + error = dict_index_add_to_cache( + table, index, mach_read_from_4(dict_hdr + DICT_HDR_TABLE_IDS)); ut_a(error == DB_SUCCESS); /*-------------------------*/ @@ -405,8 +402,7 @@ dict_boot(void) index->id = DICT_COLUMNS_ID; error = dict_index_add_to_cache(table, index, mach_read_from_4(dict_hdr - + DICT_HDR_COLUMNS), - FALSE); + + DICT_HDR_COLUMNS)); ut_a(error == DB_SUCCESS); /*-------------------------*/ @@ -438,8 +434,7 @@ dict_boot(void) index->id = DICT_INDEXES_ID; error = dict_index_add_to_cache(table, index, mach_read_from_4(dict_hdr - + DICT_HDR_INDEXES), - FALSE); + + DICT_HDR_INDEXES)); ut_a(error == DB_SUCCESS); /*-------------------------*/ @@ -465,8 +460,7 @@ dict_boot(void) index->id = DICT_FIELDS_ID; error = dict_index_add_to_cache(table, index, mach_read_from_4(dict_hdr - + DICT_HDR_FIELDS), - FALSE); + + DICT_HDR_FIELDS)); ut_a(error == DB_SUCCESS); mtr_commit(&mtr); diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc index 74c57fdac6f..0687875211a 100644 --- a/storage/innobase/dict/dict0crea.cc +++ b/storage/innobase/dict/dict0crea.cc @@ -1477,8 +1477,7 @@ dict_create_index_step( if (node->state == INDEX_ADD_TO_CACHE) { err = dict_index_add_to_cache( - node->table, node->index, FIL_NULL, - trx_is_strict(trx), node->add_v); + node->table, node->index, FIL_NULL, node->add_v); ut_ad((node->index == NULL) == (err != DB_SUCCESS)); diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index eded73bad10..4b1549c419d 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -45,11 +45,6 @@ dict_index_t* dict_ind_redundant; extern uint ibuf_debug; #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ -/********************************************************************** -Issue a warning that the row is too big. */ -void -ib_warn_row_too_big(const dict_table_t* table); - #include "btr0btr.h" #include "btr0cur.h" #include "btr0sea.h" @@ -2126,184 +2121,6 @@ dict_col_name_is_reserved( return(FALSE); } -bool -dict_index_t::rec_potentially_too_big(const dict_table_t* candidate_table, - bool strict) const -{ - ut_ad(!table); - - ulint comp; - ulint i; - /* maximum possible storage size of a record */ - ulint rec_max_size; - /* maximum allowed size of a record on a leaf page */ - ulint page_rec_max; - /* maximum allowed size of a node pointer record */ - ulint page_ptr_max; - - /* FTS index consists of auxiliary tables, they shall be excluded from - index row size check */ - if (type & DICT_FTS) { - return false; - } - - DBUG_EXECUTE_IF( - "ib_force_create_table", - return(FALSE);); - - comp = dict_table_is_comp(candidate_table); - - const page_size_t page_size(dict_table_page_size(candidate_table)); - - if (page_size.is_compressed() - && page_size.physical() < univ_page_size.physical()) { - /* On a compressed page, two records must fit in the - uncompressed page modification log. On compressed pages - with size.physical() == univ_page_size.physical(), - this limit will never be reached. */ - ut_ad(comp); - /* The maximum allowed record size is the size of - an empty page, minus a byte for recoding the heap - number in the page modification log. The maximum - allowed node pointer size is half that. */ - page_rec_max = page_zip_empty_size(n_fields, - page_size.physical()); - if (page_rec_max) { - page_rec_max--; - } - page_ptr_max = page_rec_max / 2; - /* On a compressed page, there is a two-byte entry in - the dense page directory for every record. But there - is no record header. */ - rec_max_size = 2; - } else { - /* The maximum allowed record size is half a B-tree - page(16k for 64k page size). No additional sparse - page directory entry will be generated for the first - few user records. */ - page_rec_max = (comp || srv_page_size < UNIV_PAGE_SIZE_MAX) - ? page_get_free_space_of_empty(comp) / 2 - : REDUNDANT_REC_MAX_DATA_SIZE; - - page_ptr_max = page_rec_max; - /* Each record has a header. */ - rec_max_size = comp - ? REC_N_NEW_EXTRA_BYTES - : REC_N_OLD_EXTRA_BYTES; - } - - if (comp) { - /* Include the "null" flags in the - maximum possible record size. */ - rec_max_size += UT_BITS_IN_BYTES(n_nullable); - } else { - /* For each column, include a 2-byte offset and a - "null" flag. The 1-byte format is only used in short - records that do not contain externally stored columns. - Such records could never exceed the page limit, even - when using the 2-byte format. */ - rec_max_size += 2 * n_fields; - } - - const ulint max_local_len - = candidate_table->get_overflow_field_local_len(); - - /* Compute the maximum possible record size. */ - for (i = 0; i < n_fields; i++) { - const dict_field_t* field - = dict_index_get_nth_field(this, i); - const dict_col_t* col - = dict_field_get_col(field); - - /* In dtuple_convert_big_rec(), variable-length columns - that are longer than BTR_EXTERN_LOCAL_STORED_MAX_SIZE - may be chosen for external storage. - - Fixed-length columns, and all columns of secondary - index records are always stored inline. */ - - /* Determine the maximum length of the index field. - The field_ext_max_size should be computed as the worst - case in rec_get_converted_size_comp() for - REC_STATUS_ORDINARY records. */ - - size_t field_max_size = dict_col_get_fixed_size(col, comp); - if (field_max_size && field->fixed_len != 0) { - /* dict_index_add_col() should guarantee this */ - ut_ad(!field->prefix_len - || field->fixed_len == field->prefix_len); - /* Fixed lengths are not encoded - in ROW_FORMAT=COMPACT. */ - goto add_field_size; - } - - field_max_size = dict_col_get_max_size(col); - - if (field->prefix_len) { - if (field->prefix_len < field_max_size) { - field_max_size = field->prefix_len; - } - - // those conditions were copied from dtuple_convert_big_rec() - } else if (field_max_size > max_local_len - && field_max_size > BTR_EXTERN_LOCAL_STORED_MAX_SIZE - && DATA_BIG_COL(col) - && dict_index_is_clust(this)) { - - /* In the worst case, we have a locally stored - column of BTR_EXTERN_LOCAL_STORED_MAX_SIZE bytes. - The length can be stored in one byte. If the - column were stored externally, the lengths in - the clustered index page would be - BTR_EXTERN_FIELD_REF_SIZE and 2. */ - field_max_size = max_local_len; - } - - if (comp) { - /* Add the extra size for ROW_FORMAT=COMPACT. - For ROW_FORMAT=REDUNDANT, these bytes were - added to rec_max_size before this loop. */ - rec_max_size += field_max_size < 256 ? 1 : 2; - } -add_field_size: - rec_max_size += field_max_size; - - /* Check the size limit on leaf pages. */ - if (rec_max_size >= page_rec_max) { - // with 4k page size innodb_index_stats becomes too big - // this crutch allows server bootstrapping to continue - if (candidate_table->is_system_db) { - return false; - } - - ib::error_or_warn(strict) - << "Cannot add field " << field->name - << " in table " << candidate_table->name - << " because after adding it, the row size is " - << rec_max_size - << " which is greater than maximum allowed" - " size (" << page_rec_max - << ") for a record on index leaf page."; - - return true; - } - - /* Check the size limit on non-leaf pages. Records - stored in non-leaf B-tree pages consist of the unique - columns of the record (the key columns of the B-tree) - and a node pointer field. When we have processed the - unique columns, rec_max_size equals the size of the - node pointer record minus the node pointer column. */ - if (i + 1 == dict_index_get_n_unique_in_tree(this) - && rec_max_size + REC_NODE_PTR_SIZE >= page_ptr_max) { - - return true; - } - } - - return false; -} - /** Clears the virtual column's index list before index is being freed. @param[in] index Index being freed */ @@ -2348,17 +2165,13 @@ added column. @param[in,out] index index; NOTE! The index memory object is freed in this function! @param[in] page_no root page number of the index -@param[in] strict true=refuse to create the index - if records could be too big to fit in - an B-tree page @param[in] add_v virtual columns being added along with ADD INDEX -@return DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */ +@return DB_SUCCESS, or DB_CORRUPTION */ dberr_t dict_index_add_to_cache( dict_table_t* table, dict_index_t*& index, ulint page_no, - bool strict, const dict_add_v_col_t* add_v) { dict_index_t* new_index; @@ -2404,20 +2217,6 @@ dict_index_add_to_cache( new_index->disable_ahi = index->disable_ahi; #endif - if (new_index->rec_potentially_too_big(table, strict)) { - - if (strict) { - dict_mem_index_free(new_index); - dict_mem_index_free(index); - index = NULL; - return DB_TOO_BIG_RECORD; - } else if (current_thd != NULL) { - /* Avoid the warning to be printed - during recovery. */ - ib_warn_row_too_big((const dict_table_t*)table); - } - } - n_ord = new_index->n_uniq; /* Flag the ordering columns and also set column max_prefix */ diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 8a6a3e3d8fb..caa0b1e861c 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -5578,7 +5578,7 @@ normalize_table_name_c_low( create_table_info_t::create_table_info_t( THD* thd, - TABLE* form, + const TABLE* form, HA_CREATE_INFO* create_info, char* table_name, char* remote_path, @@ -12738,16 +12738,250 @@ int create_table_info_t::create_table(bool create_fk) } } - innobase_table = dict_table_open_on_name( - m_table_name, TRUE, FALSE, DICT_ERR_IGNORE_NONE); + innobase_table = dict_table_open_on_name(m_table_name, true, false, + DICT_ERR_IGNORE_NONE); + ut_ad(innobase_table); - if (innobase_table != NULL) { - dict_table_close(innobase_table, TRUE, FALSE); + const bool is_acceptable = row_size_is_acceptable(*innobase_table); + + dict_table_close(innobase_table, true, false); + + if (!is_acceptable) { + DBUG_RETURN(convert_error_code_to_mysql( + DB_TOO_BIG_RECORD, m_flags, NULL)); } DBUG_RETURN(0); } +bool create_table_info_t::row_size_is_acceptable( + const dict_table_t &table) const +{ + for (dict_index_t *index= dict_table_get_first_index(&table); index; + index= dict_table_get_next_index(index)) + { + + if (!row_size_is_acceptable(*index)) + { + return false; + } + } + + return true; +} + +/* FIXME: row size check has some flaws and should be improved */ +dict_index_t::record_size_info_t dict_index_t::record_size_info() const +{ + ut_ad(!(type & DICT_FTS)); + + /* maximum allowed size of a node pointer record */ + ulint page_ptr_max; + const bool comp= dict_table_is_comp(table); + const page_size_t page_size(dict_table_page_size(table)); + record_size_info_t result; + + if (page_size.is_compressed() && + page_size.physical() < univ_page_size.physical()) + { + /* On a ROW_FORMAT=COMPRESSED page, two records must fit in the + uncompressed page modification log. On compressed pages + with size.physical() == univ_page_size.physical(), + this limit will never be reached. */ + ut_ad(comp); + /* The maximum allowed record size is the size of + an empty page, minus a byte for recoding the heap + number in the page modification log. The maximum + allowed node pointer size is half that. */ + result.max_leaf_size= page_zip_empty_size(n_fields, page_size.physical()); + if (result.max_leaf_size) + { + result.max_leaf_size--; + } + page_ptr_max= result.max_leaf_size / 2; + /* On a compressed page, there is a two-byte entry in + the dense page directory for every record. But there + is no record header. */ + result.shortest_size= 2; + } + else + { + /* The maximum allowed record size is half a B-tree + page(16k for 64k page size). No additional sparse + page directory entry will be generated for the first + few user records. */ + result.max_leaf_size= (comp || srv_page_size < UNIV_PAGE_SIZE_MAX) + ? page_get_free_space_of_empty(comp) / 2 + : REDUNDANT_REC_MAX_DATA_SIZE; + + page_ptr_max= result.max_leaf_size; + /* Each record has a header. */ + result.shortest_size= comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES; + } + + if (comp) + { + /* Include the "null" flags in the + maximum possible record size. */ + result.shortest_size+= UT_BITS_IN_BYTES(n_nullable); + } + else + { + /* For each column, include a 2-byte offset and a + "null" flag. The 1-byte format is only used in short + records that do not contain externally stored columns. + Such records could never exceed the page limit, even + when using the 2-byte format. */ + result.shortest_size+= 2 * n_fields; + } + + const ulint max_local_len= table->get_overflow_field_local_len(); + + /* Compute the maximum possible record size. */ + for (unsigned i= 0; i < n_fields; i++) + { + const dict_field_t &f= fields[i]; + const dict_col_t &col= *f.col; + + /* In dtuple_convert_big_rec(), variable-length columns + that are longer than BTR_EXTERN_LOCAL_STORED_MAX_SIZE + may be chosen for external storage. + + Fixed-length columns, and all columns of secondary + index records are always stored inline. */ + + /* Determine the maximum length of the index field. + The field_ext_max_size should be computed as the worst + case in rec_get_converted_size_comp() for + REC_STATUS_ORDINARY records. */ + + size_t field_max_size= dict_col_get_fixed_size(&col, comp); + if (field_max_size && f.fixed_len != 0) + { + /* dict_index_add_col() should guarantee this */ + ut_ad(!f.prefix_len || f.fixed_len == f.prefix_len); + /* Fixed lengths are not encoded + in ROW_FORMAT=COMPACT. */ + goto add_field_size; + } + + field_max_size= dict_col_get_max_size(&col); + + if (f.prefix_len) + { + if (f.prefix_len < field_max_size) + { + field_max_size= f.prefix_len; + } + + /* those conditions were copied from dtuple_convert_big_rec()*/ + } + else if (field_max_size > max_local_len && + field_max_size > BTR_EXTERN_LOCAL_STORED_MAX_SIZE && + DATA_BIG_COL(&col) && dict_index_is_clust(this)) + { + + /* In the worst case, we have a locally stored + column of BTR_EXTERN_LOCAL_STORED_MAX_SIZE bytes. + The length can be stored in one byte. If the + column were stored externally, the lengths in + the clustered index page would be + BTR_EXTERN_FIELD_REF_SIZE and 2. */ + field_max_size= max_local_len; + } + + if (comp) + { + /* Add the extra size for ROW_FORMAT=COMPACT. + For ROW_FORMAT=REDUNDANT, these bytes were + added to result.shortest_size before this loop. */ + result.shortest_size+= field_max_size < 256 ? 1 : 2; + } + add_field_size: + result.shortest_size+= field_max_size; + + /* Check the size limit on leaf pages. */ + if (result.shortest_size >= result.max_leaf_size) + { + result.set_too_big(i); + } + + /* Check the size limit on non-leaf pages. Records + stored in non-leaf B-tree pages consist of the unique + columns of the record (the key columns of the B-tree) + and a node pointer field. When we have processed the + unique columns, result.shortest_size equals the size of the + node pointer record minus the node pointer column. */ + if (i + 1 == dict_index_get_n_unique_in_tree(this) && + result.shortest_size + REC_NODE_PTR_SIZE >= page_ptr_max) + { + result.set_too_big(i); + } + } + + return result; +} + +/** Issue a warning that the row is too big. */ +static void ib_warn_row_too_big(THD *thd, const dict_table_t *table) +{ + /* FIXME: this row size check should be improved */ + /* If prefix is true then a 768-byte prefix is stored + locally for BLOB fields. Refer to dict_table_get_format() */ + const bool prefix= (dict_tf_get_format(table->flags) == UNIV_FORMAT_A); + + const ulint free_space= + page_get_free_space_of_empty(table->flags & DICT_TF_COMPACT) / 2; + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_TO_BIG_ROW, + "Row size too large (> " ULINTPF "). Changing some columns to TEXT" + " or BLOB %smay help. In current row format, BLOB prefix of" + " %d bytes is stored inline.", + free_space, + prefix ? "or using ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED " : "", + prefix ? DICT_MAX_FIXED_COL_LEN : 0); +} + +bool create_table_info_t::row_size_is_acceptable( + const dict_index_t &index) const +{ + if ((index.type & DICT_FTS) || index.table->is_system_db) + { + /* Ignore system tables check because innodb_table_stats + maximum row size can not fit on 4k page. */ + return true; + } + + const bool strict= THDVAR(m_thd, strict_mode); + dict_index_t::record_size_info_t info= index.record_size_info(); + + if (info.row_is_too_big()) + { + ut_ad(info.get_overrun_size() != 0); + ut_ad(info.max_leaf_size != 0); + + const size_t idx= info.get_first_overrun_field_index(); + const dict_field_t *field= dict_index_get_nth_field(&index, idx); + + ib::error_or_warn(strict) + << "Cannot add field " << field->name << " in table " + << index.table->name << " because after adding it, the row size is " + << info.get_overrun_size() + << " which is greater than maximum allowed size (" + << info.max_leaf_size << " bytes) for a record on index leaf page."; + + if (strict) + { + return false; + } + + ib_warn_row_too_big(m_thd, index.table); + } + + return true; +} + /** Update a new table in an InnoDB database. @return error number */ int @@ -22165,32 +22399,6 @@ innobase_convert_to_system_charset( cs2, to, static_cast(len), errors))); } -/********************************************************************** -Issue a warning that the row is too big. */ -void -ib_warn_row_too_big(const dict_table_t* table) -{ - /* If prefix is true then a 768-byte prefix is stored - locally for BLOB fields. Refer to dict_table_get_format() */ - const bool prefix = (dict_tf_get_format(table->flags) - == UNIV_FORMAT_A); - - const ulint free_space = page_get_free_space_of_empty( - table->flags & DICT_TF_COMPACT) / 2; - - THD* thd = current_thd; - - push_warning_printf( - thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_TO_BIG_ROW, - "Row size too large (> " ULINTPF ")." - " Changing some columns to TEXT" - " or BLOB %smay help. In current row format, BLOB prefix of" - " %d bytes is stored inline.", free_space - , prefix ? "or using ROW_FORMAT=DYNAMIC or" - " ROW_FORMAT=COMPRESSED ": "" - , prefix ? DICT_MAX_FIXED_COL_LEN : 0); -} - /** Validate the requested buffer pool size. Also, reserve the necessary memory needed for buffer pool resize. @param[in] thd thread handle diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h index b812a6f3d59..312da6451f0 100644 --- a/storage/innobase/handler/ha_innodb.h +++ b/storage/innobase/handler/ha_innodb.h @@ -631,7 +631,7 @@ public: - all but name/path is used, when validating options and using flags. */ create_table_info_t( THD* thd, - TABLE* form, + const TABLE* form, HA_CREATE_INFO* create_info, char* table_name, char* remote_path, @@ -679,6 +679,11 @@ public: void allocate_trx(); + /** Checks that every index have sane size. Depends on strict mode */ + bool row_size_is_acceptable(const dict_table_t& table) const; + /** Checks that given index have sane size. Depends on strict mode */ + bool row_size_is_acceptable(const dict_index_t& index) const; + /** Determines InnoDB table flags. If strict_mode=OFF, this will adjust the flags to what should be assumed. @retval true if successful, false if error */ diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index 9ed13f5ac0d..bae93e399e9 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -4400,6 +4400,10 @@ prepare_inplace_alter_table_dict( new_clustered = DICT_CLUSTERED & index_defs[0].ind_type; + create_table_info_t info(ctx->prebuilt->trx->mysql_thd, altered_table, + ha_alter_info->create_info, NULL, NULL, + srv_file_per_table); + if (num_fts_index > 1) { my_error(ER_INNODB_FT_LIMIT, MYF(0)); goto error_handled; @@ -4840,6 +4844,11 @@ index_created: goto error_handling; } + if (!info.row_size_is_acceptable(*ctx->add_index[a])) { + error = DB_TOO_BIG_RECORD; + goto error_handling; + } + DBUG_ASSERT(ctx->add_index[a]->is_committed() == !!new_clustered); diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index bf4e4b9e289..565ea77374d 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -1099,17 +1099,13 @@ added column. @param[in,out] index index; NOTE! The index memory object is freed in this function! @param[in] page_no root page number of the index -@param[in] strict true=refuse to create the index - if records could be too big to fit in - an B-tree page @param[in] add_v virtual columns being added along with ADD INDEX -@return DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */ +@return DB_SUCCESS, or DB_CORRUPTION */ dberr_t dict_index_add_to_cache( dict_table_t* table, dict_index_t*& index, ulint page_no, - bool strict = false, const dict_add_v_col_t* add_v = NULL) MY_ATTRIBUTE((warn_unused_result)); /********************************************************************//** diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index a6047d6f8c3..0f96cd4e3ba 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -1029,13 +1029,65 @@ struct dict_index_t{ } } - /** If a record of this index might not fit on a single B-tree page, - return true. - @param[in] candidate_table where we're goint to attach this index - @param[in] strict issue error or warning - @return true if the index record could become too big */ - bool rec_potentially_too_big(const dict_table_t* candidate_table, - bool strict) const; + /** This ad-hoc class is used by record_size_info only. */ + class record_size_info_t { + public: + record_size_info_t() + : max_leaf_size(0), shortest_size(0), too_big(false), + first_overrun_field_index(SIZE_T_MAX), overrun_size(0) + { + } + + /** Mark row potentially too big for page and set up first + overflow field index. */ + void set_too_big(size_t field_index) + { + ut_ad(field_index != SIZE_T_MAX); + + too_big = true; + if (first_overrun_field_index > field_index) { + first_overrun_field_index = field_index; + overrun_size = shortest_size; + } + } + + /** @return overrun field index or SIZE_T_MAX if nothing + overflowed*/ + size_t get_first_overrun_field_index() const + { + ut_ad(row_is_too_big()); + ut_ad(first_overrun_field_index != SIZE_T_MAX); + return first_overrun_field_index; + } + + size_t get_overrun_size() const + { + ut_ad(row_is_too_big()); + return overrun_size; + } + + bool row_is_too_big() const { return too_big; } + + size_t max_leaf_size; /** Bigger row size this index can + produce */ + size_t shortest_size; /** shortest because it counts everything + as in overflow pages */ + + private: + bool too_big; /** This one is true when maximum row size this + index can produce is bigger than maximum row + size given page can hold. */ + size_t first_overrun_field_index; /** After adding this field + index row overflowed maximum + allowed size. Useful for + reporting back to user. */ + size_t overrun_size; /** Just overrun row size */ + }; + + /** Returns max possibly record size for that index, size of a shortest + everything in overflow) size of the longest possible row and index + of a field which made index records too big to fit on a page.*/ + inline record_size_info_t record_size_info() const; }; /** Detach a column from an index. diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic index 1258c0d8d4d..5345aa19dd5 100644 --- a/storage/innobase/include/page0zip.ic +++ b/storage/innobase/include/page0zip.ic @@ -164,6 +164,9 @@ page_zip_rec_needs_ext( ulint n_fields, const page_size_t& page_size) { + /* FIXME: row size check is this function seems to be the most correct. + Put it in a separate function and use in more places of InnoDB */ + ut_ad(rec_size > ulint(comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES)); ut_ad(comp || !page_size.is_compressed()); diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index a5795ff90e8..dabdeeec68b 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -2450,8 +2450,7 @@ row_create_index_for_mysql( } else { dict_build_index_def(table, index, trx); - err = dict_index_add_to_cache( - table, index, FIL_NULL, trx_is_strict(trx)); + err = dict_index_add_to_cache(table, index, FIL_NULL); ut_ad((index == NULL) == (err != DB_SUCCESS)); if (err != DB_SUCCESS) { From d4edb0510ec1189f65850bb47977e94ed98b1f71 Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Wed, 13 Nov 2019 18:53:59 +0300 Subject: [PATCH 5/7] MDEV-20646: 10.3.18 is slower than 10.3.17 Fix incorrect change introduced in the fix for MDEV-20109. The patch tried to compute a more precise estimate for the record_count value in SJ-Materialization-Scan strategy (in Sj_materialization_picker::check_qep). However the new formula is worse as it produces extremely optimistic results in common cases where SJ-Materialization-Scan should be used) The old formula produces pessimistic results in cases when Sj-Materialization- Scan is unlikely to be a good choice anyway. So, the old behavior is better. --- mysql-test/main/subselect_sj2_mat.result | 14 +++++++------- mysql-test/main/subselect_sj_jcl6.result | 8 ++++---- sql/opt_subselect.cc | 17 ++++++++++++++++- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/mysql-test/main/subselect_sj2_mat.result b/mysql-test/main/subselect_sj2_mat.result index dd9f560aeed..ab3e4652deb 100644 --- a/mysql-test/main/subselect_sj2_mat.result +++ b/mysql-test/main/subselect_sj2_mat.result @@ -1855,18 +1855,18 @@ AND t3.id_product IN (SELECT id_product FROM t2 t2_3 WHERE t2_3.id_t2 = 18 OR t2 AND t3.id_product IN (SELECT id_product FROM t2 t2_4 WHERE t2_4.id_t2 = 34 OR t2_4.id_t2 = 23) AND t3.id_product IN (SELECT id_product FROM t2 t2_5 WHERE t2_5.id_t2 = 29 OR t2_5.id_t2 = 28 OR t2_5.id_t2 = 26); id select_type table type possible_keys key key_len ref rows Extra -1 PRIMARY ALL distinct_key NULL NULL NULL 12 -1 PRIMARY t3 eq_ref PRIMARY PRIMARY 4 test.t2_2.id_product 1 Using where; Using index +1 PRIMARY t1 index NULL PRIMARY 8 NULL 73 Using index +1 PRIMARY t3 eq_ref PRIMARY PRIMARY 4 test.t1.id_product 1 Using index 1 PRIMARY eq_ref distinct_key distinct_key 4 func 1 Using where 1 PRIMARY eq_ref distinct_key distinct_key 4 func 1 Using where -1 PRIMARY eq_ref distinct_key distinct_key 4 func 1 Using where +1 PRIMARY eq_ref distinct_key distinct_key 4 func 1 Using where 1 PRIMARY eq_ref distinct_key distinct_key 4 func 1 Using where +1 PRIMARY t4 eq_ref PRIMARY PRIMARY 8 test.t1.id_product,const 1 Using where; Using index +1 PRIMARY eq_ref distinct_key distinct_key 4 func 1 Using where 1 PRIMARY t5 ALL NULL NULL NULL NULL 18 Using where; Using join buffer (flat, BNL join) -1 PRIMARY t4 eq_ref PRIMARY PRIMARY 8 test.t3.id_product,const 1 Using where; Using index -1 PRIMARY t1 index NULL PRIMARY 8 NULL 73 Using where; Using index; Using join buffer (flat, BNL join) -3 MATERIALIZED t2_2 ref id_t2,id_product id_t2 5 const 12 Using where 5 MATERIALIZED t2_4 range id_t2,id_product id_t2 5 NULL 18 Using index condition; Using where 4 MATERIALIZED t2_3 range id_t2,id_product id_t2 5 NULL 32 Using index condition; Using where -6 MATERIALIZED t2_5 range id_t2,id_product id_t2 5 NULL 30 Using index condition; Using where +3 MATERIALIZED t2_2 ref id_t2,id_product id_t2 5 const 12 2 MATERIALIZED t2_1 ref id_t2,id_product id_t2 5 const 50 +6 MATERIALIZED t2_5 range id_t2,id_product id_t2 5 NULL 30 Using index condition; Using where drop table t1,t2,t3,t4,t5; diff --git a/mysql-test/main/subselect_sj_jcl6.result b/mysql-test/main/subselect_sj_jcl6.result index 6278b5a0cf5..e9a19b2a1c3 100644 --- a/mysql-test/main/subselect_sj_jcl6.result +++ b/mysql-test/main/subselect_sj_jcl6.result @@ -3527,8 +3527,8 @@ EXPLAIN SELECT a FROM t1 t WHERE a IN (SELECT b FROM t1, t2 WHERE b = a) GROUP BY a HAVING a != 'z'; id select_type table type possible_keys key key_len ref rows Extra -1 PRIMARY ALL distinct_key NULL NULL NULL 2 Using temporary; Using filesort -1 PRIMARY t ref idx_a idx_a 4 test.t2.b 2 Using index +1 PRIMARY t index idx_a idx_a 4 NULL 3 Using index +1 PRIMARY eq_ref distinct_key distinct_key 4 func 1 2 MATERIALIZED t2 ALL NULL NULL NULL NULL 2 Using where 2 MATERIALIZED t1 ref idx_a idx_a 4 test.t2.b 2 Using index SELECT a FROM t1 t WHERE a IN (SELECT b FROM t1, t2 WHERE b = a) @@ -3541,8 +3541,8 @@ EXPLAIN SELECT a FROM t1 t WHERE a IN (SELECT b FROM t1, t2 WHERE b = a) GROUP BY a HAVING a != 'z'; id select_type table type possible_keys key key_len ref rows Extra -1 PRIMARY ALL distinct_key NULL NULL NULL 2 Using temporary; Using filesort -1 PRIMARY t ref idx_a idx_a 4 test.t2.b 2 Using index +1 PRIMARY t index idx_a idx_a 4 NULL 3 Using index +1 PRIMARY eq_ref distinct_key distinct_key 4 func 1 2 MATERIALIZED t2 ALL NULL NULL NULL NULL 2 Using where 2 MATERIALIZED t1 ref idx_a idx_a 4 test.t2.b 2 Using index SELECT a FROM t1 t WHERE a IN (SELECT b FROM t1, t2 WHERE b = a) diff --git a/sql/opt_subselect.cc b/sql/opt_subselect.cc index aeafc13998a..a8afd952a4d 100644 --- a/sql/opt_subselect.cc +++ b/sql/opt_subselect.cc @@ -3029,7 +3029,22 @@ bool Sj_materialization_picker::check_qep(JOIN *join, *strategy= SJ_OPT_MATERIALIZE_SCAN; *read_time= prefix_cost; - *record_count= prefix_rec_count / mat_info->rows_with_duplicates; + /* + Note: the next line means we did not remove the subquery's fanout from + *record_count. It needs to be removed, as the join prefix is + + ntX SJM-SCAN(it1 ... itN) | (ot1 ... otN) ... + + here, the SJM-SCAN may have introduced subquery's fanout (duplicate rows, + rows that don't have matches in ot1_i). All this fanout is gone after + table otN (or earlier) but taking it into account is hard. + + Some consolation here is that SJM-Scan strategy is applicable when the + subquery is smaller than tables otX. If the subquery has large cardinality, + we can greatly overestimate *record_count here, but it doesn't matter as + SJ-Materialization-Lookup is a better strategy anyway. + */ + *record_count= prefix_rec_count; *handled_fanout= mat_nest->sj_inner_tables; return TRUE; } From caa79081c31d15d23f790c2cf32eab662a5a5d8e Mon Sep 17 00:00:00 2001 From: Sujatha Date: Mon, 14 Oct 2019 12:08:28 +0530 Subject: [PATCH 6/7] MDEV-20707: Missing memory barrier in parallel replication error handler in wait_for_prior_commit() revision-id: 673e253724979fd9fe43a4a22bd7e1b2c3a5269e Author: Kristian Nielsen Fix missing memory barrier in wait_for_commit. The function wait_for_commit::wait_for_prior_commit() has a fast path where it checks without locks if wakeup_subsequent_commits() has already been called. This check was missing a memory barrier. The waitee thread does two writes to variables `waitee' and `wakeup_error', and if the waiting thread sees the first write it _must_ also see the second or incorrect behavior will occur. This requires memory barriers between both the writes (release semantics) and the reads (acquire semantics) of those two variables. Other accesses to these variables are done under lock or where only one thread will be accessing them, and can be done without barriers (relaxed semantics). --- sql/log.cc | 19 +++++++++++++------ sql/sql_class.cc | 25 ++++++++++++++----------- sql/sql_class.h | 17 ++++++++++++----- 3 files changed, 39 insertions(+), 22 deletions(-) diff --git a/sql/log.cc b/sql/log.cc index b90804ed9f0..d7878db6901 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -7481,8 +7481,10 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry) */ wfc= orig_entry->thd->wait_for_commit_ptr; orig_entry->queued_by_other= false; - if (wfc && wfc->waitee) + if (wfc && wfc->waitee.load(std::memory_order_acquire)) { + wait_for_commit *loc_waitee; + mysql_mutex_lock(&wfc->LOCK_wait_commit); /* Do an extra check here, this time safely under lock. @@ -7494,10 +7496,10 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry) before setting the flag, so there is no risk that we can queue ahead of it. */ - if (wfc->waitee && !wfc->waitee->commit_started) + if ((loc_waitee= wfc->waitee.load(std::memory_order_relaxed)) && + !loc_waitee->commit_started) { PSI_stage_info old_stage; - wait_for_commit *loc_waitee; /* By setting wfc->opaque_pointer to our own entry, we mark that we are @@ -7519,7 +7521,8 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry) &wfc->LOCK_wait_commit, &stage_waiting_for_prior_transaction_to_commit, &old_stage); - while ((loc_waitee= wfc->waitee) && !orig_entry->thd->check_killed(1)) + while ((loc_waitee= wfc->waitee.load(std::memory_order_relaxed)) && + !orig_entry->thd->check_killed(1)) mysql_cond_wait(&wfc->COND_wait_commit, &wfc->LOCK_wait_commit); wfc->opaque_pointer= NULL; DBUG_PRINT("info", ("After waiting for prior commit, queued_by_other=%d", @@ -7537,14 +7540,18 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry) do { mysql_cond_wait(&wfc->COND_wait_commit, &wfc->LOCK_wait_commit); - } while (wfc->waitee); + } while (wfc->waitee.load(std::memory_order_relaxed)); } else { /* We were killed, so remove us from the list of waitee. */ wfc->remove_from_list(&loc_waitee->subsequent_commits_list); mysql_mutex_unlock(&loc_waitee->LOCK_wait_commit); - wfc->waitee= NULL; + /* + This is the thread clearing its own status, it is no longer on + the list of waiters. So no memory barriers are needed here. + */ + wfc->waitee.store(NULL, std::memory_order_relaxed); orig_entry->thd->EXIT_COND(&old_stage); /* Interrupted by kill. */ diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 5fbe5704649..1e276d257da 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -7228,7 +7228,7 @@ wait_for_commit::reinit() { subsequent_commits_list= NULL; next_subsequent_commit= NULL; - waitee= NULL; + waitee.store(NULL, std::memory_order_relaxed); opaque_pointer= NULL; wakeup_error= 0; wakeup_subsequent_commits_running= false; @@ -7306,8 +7306,9 @@ wait_for_commit::wakeup(int wakeup_error) */ mysql_mutex_lock(&LOCK_wait_commit); - waitee= NULL; this->wakeup_error= wakeup_error; + /* Memory barrier to make wakeup_error visible to the waiter thread. */ + waitee.store(NULL, std::memory_order_release); /* Note that it is critical that the mysql_cond_signal() here is done while still holding the mutex. As soon as we release the mutex, the waiter might @@ -7338,9 +7339,10 @@ wait_for_commit::wakeup(int wakeup_error) void wait_for_commit::register_wait_for_prior_commit(wait_for_commit *waitee) { - DBUG_ASSERT(!this->waitee /* No prior registration allowed */); + DBUG_ASSERT(!this->waitee.load(std::memory_order_relaxed) + /* No prior registration allowed */); wakeup_error= 0; - this->waitee= waitee; + this->waitee.store(waitee, std::memory_order_relaxed); mysql_mutex_lock(&waitee->LOCK_wait_commit); /* @@ -7349,7 +7351,7 @@ wait_for_commit::register_wait_for_prior_commit(wait_for_commit *waitee) see comments on wakeup_subsequent_commits2() for details. */ if (waitee->wakeup_subsequent_commits_running) - this->waitee= NULL; + this->waitee.store(NULL, std::memory_order_relaxed); else { /* @@ -7379,7 +7381,8 @@ wait_for_commit::wait_for_prior_commit2(THD *thd) thd->ENTER_COND(&COND_wait_commit, &LOCK_wait_commit, &stage_waiting_for_prior_transaction_to_commit, &old_stage); - while ((loc_waitee= this->waitee) && likely(!thd->check_killed(1))) + while ((loc_waitee= this->waitee.load(std::memory_order_relaxed)) && + likely(!thd->check_killed(1))) mysql_cond_wait(&COND_wait_commit, &LOCK_wait_commit); if (!loc_waitee) { @@ -7402,14 +7405,14 @@ wait_for_commit::wait_for_prior_commit2(THD *thd) do { mysql_cond_wait(&COND_wait_commit, &LOCK_wait_commit); - } while (this->waitee); + } while (this->waitee.load(std::memory_order_relaxed)); if (wakeup_error) my_error(ER_PRIOR_COMMIT_FAILED, MYF(0)); goto end; } remove_from_list(&loc_waitee->subsequent_commits_list); mysql_mutex_unlock(&loc_waitee->LOCK_wait_commit); - this->waitee= NULL; + this->waitee.store(NULL, std::memory_order_relaxed); wakeup_error= thd->killed_errno(); if (!wakeup_error) @@ -7511,7 +7514,7 @@ wait_for_commit::unregister_wait_for_prior_commit2() wait_for_commit *loc_waitee; mysql_mutex_lock(&LOCK_wait_commit); - if ((loc_waitee= this->waitee)) + if ((loc_waitee= this->waitee.load(std::memory_order_relaxed))) { mysql_mutex_lock(&loc_waitee->LOCK_wait_commit); if (loc_waitee->wakeup_subsequent_commits_running) @@ -7524,7 +7527,7 @@ wait_for_commit::unregister_wait_for_prior_commit2() See comments on wakeup_subsequent_commits2() for more details. */ mysql_mutex_unlock(&loc_waitee->LOCK_wait_commit); - while (this->waitee) + while (this->waitee.load(std::memory_order_relaxed)) mysql_cond_wait(&COND_wait_commit, &LOCK_wait_commit); } else @@ -7532,7 +7535,7 @@ wait_for_commit::unregister_wait_for_prior_commit2() /* Remove ourselves from the list in the waitee. */ remove_from_list(&loc_waitee->subsequent_commits_list); mysql_mutex_unlock(&loc_waitee->LOCK_wait_commit); - this->waitee= NULL; + this->waitee.store(NULL, std::memory_order_relaxed); } } wakeup_error= 0; diff --git a/sql/sql_class.h b/sql/sql_class.h index 3f732a76e53..36cb42c6314 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -20,6 +20,7 @@ /* Classes in mysql */ +#include #include "dur_prop.h" #include #include "sql_const.h" @@ -2011,7 +2012,7 @@ struct wait_for_commit /* The LOCK_wait_commit protects the fields subsequent_commits_list and wakeup_subsequent_commits_running (for a waitee), and the pointer - waiterr and associated COND_wait_commit (for a waiter). + waitee and associated COND_wait_commit (for a waiter). */ mysql_mutex_t LOCK_wait_commit; mysql_cond_t COND_wait_commit; @@ -2025,8 +2026,14 @@ struct wait_for_commit When this is cleared for wakeup, the COND_wait_commit condition is signalled. + + This pointer is protected by LOCK_wait_commit. But there is also a "fast + path" where the waiter compares this to NULL without holding the lock. + Such read must be done with acquire semantics (and all corresponding + writes done with release semantics). This ensures that a wakeup with error + is reliably detected as (waitee==NULL && wakeup_error != 0). */ - wait_for_commit *waitee; + std::atomic waitee; /* Generic pointer for use by the transaction coordinator to optimise the waiting for improved group commit. @@ -2061,7 +2068,7 @@ struct wait_for_commit Quick inline check, to avoid function call and locking in the common case where no wakeup is registered, or a registered wait was already signalled. */ - if (waitee) + if (waitee.load(std::memory_order_acquire)) return wait_for_prior_commit2(thd); else { @@ -2089,7 +2096,7 @@ struct wait_for_commit } void unregister_wait_for_prior_commit() { - if (waitee) + if (waitee.load(std::memory_order_relaxed)) unregister_wait_for_prior_commit2(); else wakeup_error= 0; @@ -2111,7 +2118,7 @@ struct wait_for_commit } next_ptr_ptr= &cur->next_subsequent_commit; } - waitee= NULL; + waitee.store(NULL, std::memory_order_relaxed); } void wakeup(int wakeup_error); From 3d4a80153345209bad736235d4f66dcaa51a9d51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 14 Nov 2019 11:40:33 +0200 Subject: [PATCH 7/7] MDEV-12353 preparation: Replace mtr_x_lock() and friends Apart from page latches (buf_block_t::lock), mini-transactions are keeping track of at most one dict_index_t::lock and fil_space_t::latch at a time, and in a rare case, purge_sys.latch. Let us introduce interfaces for acquiring an index latch or a tablespace latch. In a later version, we may want to introduce mtr_t members for holding a latched dict_index_t* and fil_space_t*, and replace the remaining use of mtr_t::m_memo with std::set or with a map pointing to log records. --- storage/innobase/btr/btr0btr.cc | 24 +++---- storage/innobase/btr/btr0bulk.cc | 2 +- storage/innobase/btr/btr0cur.cc | 30 ++++----- storage/innobase/btr/btr0defragment.cc | 2 +- storage/innobase/btr/btr0scrub.cc | 2 +- storage/innobase/dict/dict0defrag_bg.cc | 6 +- storage/innobase/dict/dict0stats.cc | 32 ++++----- storage/innobase/fsp/fsp0fsp.cc | 10 +-- storage/innobase/gis/gis0rtree.cc | 2 +- storage/innobase/gis/gis0sea.cc | 2 +- storage/innobase/ibuf/ibuf0ibuf.cc | 8 +-- storage/innobase/include/mtr0mtr.h | 89 ++++++++++++++++--------- storage/innobase/include/mtr0mtr.ic | 33 --------- storage/innobase/mtr/mtr0mtr.cc | 5 +- storage/innobase/row/row0ins.cc | 10 +-- storage/innobase/row/row0purge.cc | 24 +++---- storage/innobase/row/row0uins.cc | 6 +- storage/innobase/row/row0umod.cc | 14 ++-- storage/innobase/row/row0upd.cc | 4 +- storage/innobase/trx/trx0purge.cc | 2 +- storage/innobase/trx/trx0rseg.cc | 2 +- storage/innobase/trx/trx0sys.cc | 2 +- 22 files changed, 148 insertions(+), 163 deletions(-) diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 9a38b0ebf27..2f353c9e7e7 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -4727,13 +4727,13 @@ btr_validate_level( ulint parent_right_page_no = FIL_NULL; bool rightmost_child = false; - mtr_start(&mtr); + mtr.start(); if (!srv_read_only_mode) { if (lockout) { - mtr_x_lock(dict_index_get_lock(index), &mtr); + mtr_x_lock_index(index, &mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } } @@ -4822,9 +4822,9 @@ loop: offsets = offsets2 = NULL; if (!srv_read_only_mode) { if (lockout) { - mtr_x_lock(dict_index_get_lock(index), &mtr); + mtr_x_lock_index(index, &mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } } @@ -5122,13 +5122,13 @@ node_ptr_fails: /* Commit the mini-transaction to release the latch on 'page'. Re-acquire the latch on right_page, which will become 'page' on the next loop. The page has already been checked. */ - mtr_commit(&mtr); + mtr.commit(); if (trx_is_interrupted(trx)) { /* On interrupt, return the current status. */ } else if (right_page_no != FIL_NULL) { - mtr_start(&mtr); + mtr.start(); if (!lockout) { if (rightmost_child) { @@ -5178,9 +5178,9 @@ btr_validate_spatial_index( mtr_t mtr; bool ok = true; - mtr_start(&mtr); + mtr.start(); - mtr_x_lock(dict_index_get_lock(index), &mtr); + mtr_x_lock_index(index, &mtr); page_t* root = btr_root_get(index, &mtr); ulint n = btr_page_get_level(root); @@ -5200,7 +5200,7 @@ btr_validate_spatial_index( } } - mtr_commit(&mtr); + mtr.commit(); return(ok); } @@ -5236,9 +5236,9 @@ btr_validate_index( if (!srv_read_only_mode) { if (lockout) { - mtr_x_lock(dict_index_get_lock(index), &mtr); + mtr_x_lock_index(index, &mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } } diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc index 338c9fc3c8d..6f11796487d 100644 --- a/storage/innobase/btr/btr0bulk.cc +++ b/storage/innobase/btr/btr0bulk.cc @@ -1014,7 +1014,7 @@ BtrBulk::finish(dberr_t err) mtr.start(); m_index->set_modified(mtr); - mtr_x_lock(&m_index->lock, &mtr); + mtr_x_lock_index(m_index, &mtr); ut_ad(last_page_no != FIL_NULL); last_block = btr_block_get( diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index d904e92935c..3465ff2edcb 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -1316,16 +1316,16 @@ btr_cur_search_to_nth_level_func( if (lock_intention == BTR_INTENTION_DELETE && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH && buf_get_n_pending_read_ios()) { - mtr_x_lock(dict_index_get_lock(index), mtr); - } else if (dict_index_is_spatial(index) +x_latch_index: + mtr_x_lock_index(index, mtr); + } else if (index->is_spatial() && lock_intention <= BTR_INTENTION_BOTH) { /* X lock the if there is possibility of pessimistic delete on spatial index. As we could lock upward for the tree */ - - mtr_x_lock(dict_index_get_lock(index), mtr); + goto x_latch_index; } else { - mtr_sx_lock(dict_index_get_lock(index), mtr); + mtr_sx_lock_index(index, mtr); } upper_rw_latch = RW_X_LATCH; break; @@ -1357,10 +1357,10 @@ btr_cur_search_to_nth_level_func( BTR_ALREADY_S_LATCHED */ ut_ad(latch_mode != BTR_SEARCH_TREE); - mtr_s_lock(dict_index_get_lock(index), mtr); + mtr_s_lock_index(index, mtr); } else { /* BTR_MODIFY_EXTERNAL needs to be excluded */ - mtr_sx_lock(dict_index_get_lock(index), mtr); + mtr_sx_lock_index(index, mtr); } upper_rw_latch = RW_S_LATCH; } else { @@ -2450,9 +2450,9 @@ btr_cur_open_at_index_side_func( if (lock_intention == BTR_INTENTION_DELETE && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH && buf_get_n_pending_read_ios()) { - mtr_x_lock(dict_index_get_lock(index), mtr); + mtr_x_lock_index(index, mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), mtr); + mtr_sx_lock_index(index, mtr); } upper_rw_latch = RW_X_LATCH; break; @@ -2468,7 +2468,7 @@ btr_cur_open_at_index_side_func( BTR_ALREADY_S_LATCHED */ ut_ad(latch_mode != BTR_SEARCH_TREE); - mtr_s_lock(dict_index_get_lock(index), mtr); + mtr_s_lock_index(index, mtr); } upper_rw_latch = RW_S_LATCH; } else { @@ -2779,7 +2779,7 @@ btr_cur_open_at_rnd_pos_func( ulint* offsets = offsets_; rec_offs_init(offsets_); - ut_ad(!dict_index_is_spatial(index)); + ut_ad(!index->is_spatial()); lock_intention = btr_cur_get_and_clear_intention(&latch_mode); @@ -2795,9 +2795,9 @@ btr_cur_open_at_rnd_pos_func( if (lock_intention == BTR_INTENTION_DELETE && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH && buf_get_n_pending_read_ios()) { - mtr_x_lock(dict_index_get_lock(index), mtr); + mtr_x_lock_index(index, mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), mtr); + mtr_sx_lock_index(index, mtr); } upper_rw_latch = RW_X_LATCH; break; @@ -2813,7 +2813,7 @@ btr_cur_open_at_rnd_pos_func( /* fall through */ default: if (!srv_read_only_mode) { - mtr_s_lock(dict_index_get_lock(index), mtr); + mtr_s_lock_index(index, mtr); upper_rw_latch = RW_S_LATCH; } else { upper_rw_latch = RW_NO_LATCH; @@ -4892,7 +4892,7 @@ btr_cur_pessimistic_update( MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); - mtr_sx_lock(dict_index_get_lock(index), mtr); + mtr_sx_lock_index(index, mtr); } /* Was the record to be updated positioned as the first user diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc index 86e4cc3d6b5..39007c34258 100644 --- a/storage/innobase/btr/btr0defragment.cc +++ b/storage/innobase/btr/btr0defragment.cc @@ -751,7 +751,7 @@ DECLARE_THREAD(btr_defragment_thread)(void*) index->set_modified(mtr); /* To follow the latching order defined in WL#6326, acquire index->lock X-latch. This entitles us to acquire page latches in any order for the index. */ - mtr_x_lock(&index->lock, &mtr); + mtr_x_lock_index(index, &mtr); /* This will acquire index->lock SX-latch, which per WL#6363 is allowed when we are already holding the X-latch. */ btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr); diff --git a/storage/innobase/btr/btr0scrub.cc b/storage/innobase/btr/btr0scrub.cc index 5c6eeff2d6a..975520220c0 100644 --- a/storage/innobase/btr/btr0scrub.cc +++ b/storage/innobase/btr/btr0scrub.cc @@ -742,7 +742,7 @@ btr_scrub_recheck_page( } mtr_start(mtr); - mtr_x_lock(dict_index_get_lock(scrub_data->current_index), mtr); + mtr_x_lock_index(scrub_data->current_index, mtr); /** set savepoint for X-latch of block */ scrub_data->savepoint = mtr_set_savepoint(mtr); return BTR_SCRUB_PAGE; diff --git a/storage/innobase/dict/dict0defrag_bg.cc b/storage/innobase/dict/dict0defrag_bg.cc index 73f55cc8667..7c6f5d75b5d 100644 --- a/storage/innobase/dict/dict0defrag_bg.cc +++ b/storage/innobase/dict/dict0defrag_bg.cc @@ -281,11 +281,11 @@ dict_stats_save_defrag_stats( mtr_t mtr; ulint n_leaf_pages; ulint n_leaf_reserved; - mtr_start(&mtr); - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr.start(); + mtr_s_lock_index(index, &mtr); n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, &n_leaf_pages, &mtr); - mtr_commit(&mtr); + mtr.commit(); if (n_leaf_reserved == ULINT_UNDEFINED) { // The index name is different during fast index creation, diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index 0310f275196..faf6483e983 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -854,10 +854,8 @@ dict_stats_update_transient_for_index( mtr_t mtr; ulint size; - mtr_start(&mtr); - - mtr_s_lock(dict_index_get_lock(index), &mtr); - + mtr.start(); + mtr_s_lock_index(index, &mtr); size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr); if (size != ULINT_UNDEFINED) { @@ -867,7 +865,7 @@ dict_stats_update_transient_for_index( index, BTR_N_LEAF_PAGES, &mtr); } - mtr_commit(&mtr); + mtr.commit(); switch (size) { case ULINT_UNDEFINED: @@ -1929,10 +1927,8 @@ dict_stats_analyze_index( dict_stats_empty_index(index, false); - mtr_start(&mtr); - - mtr_s_lock(dict_index_get_lock(index), &mtr); - + mtr.start(); + mtr_s_lock_index(index, &mtr); size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr); if (size != ULINT_UNDEFINED) { @@ -1941,7 +1937,7 @@ dict_stats_analyze_index( } /* Release the X locks on the root page taken by btr_get_size() */ - mtr_commit(&mtr); + mtr.commit(); switch (size) { case ULINT_UNDEFINED: @@ -1954,10 +1950,8 @@ dict_stats_analyze_index( index->stat_n_leaf_pages = size; - mtr_start(&mtr); - - mtr_sx_lock(dict_index_get_lock(index), &mtr); - + mtr.start(); + mtr_sx_lock_index(index, &mtr); root_level = btr_height_get(index, &mtr); n_uniq = dict_index_get_n_unique(index); @@ -1997,7 +1991,7 @@ dict_stats_analyze_index( index->stat_n_sample_sizes[i] = total_pages; } - mtr_commit(&mtr); + mtr.commit(); dict_stats_assert_initialized_index(index); DBUG_VOID_RETURN; @@ -2043,9 +2037,9 @@ dict_stats_analyze_index( /* Commit the mtr to release the tree S lock to allow other threads to do some work too. */ - mtr_commit(&mtr); - mtr_start(&mtr); - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr.commit(); + mtr.start(); + mtr_sx_lock_index(index, &mtr); if (root_level != btr_height_get(index, &mtr)) { /* Just quit if the tree has changed beyond recognition here. The old stats from previous @@ -2183,7 +2177,7 @@ found_level: data, &mtr); } - mtr_commit(&mtr); + mtr.commit(); UT_DELETE_ARRAY(n_diff_boundaries); diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index 594adfb0a94..6942b1f4730 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -657,7 +657,7 @@ void fsp_header_init(fil_space_t* space, ulint size, mtr_t* mtr) const page_id_t page_id(space->id, 0); const page_size_t page_size(space->flags); - mtr_x_lock(&space->latch, mtr); + mtr_x_lock_space(space, mtr); buf_block_t* block = buf_page_create(page_id, page_size, mtr); buf_page_get(page_id, page_size, RW_SX_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_FSP_PAGE); @@ -1944,7 +1944,7 @@ fseg_create( ut_ad(byte_offset + FSEG_HEADER_SIZE <= srv_page_size - FIL_PAGE_DATA_END); - mtr_x_lock(&space->latch, mtr); + mtr_x_lock_space(space, mtr); const page_size_t page_size(space->flags); ut_d(space->modify_check(*mtr)); @@ -2649,7 +2649,7 @@ fsp_reserve_free_extents( ut_ad(mtr); *n_reserved = n_ext; - mtr_x_lock(&space->latch, mtr); + mtr_x_lock_space(space, mtr); const page_size_t page_size(space->flags); space_header = fsp_get_space_header(space, page_size, mtr); @@ -2940,7 +2940,7 @@ fseg_free_page_func( DBUG_ENTER("fseg_free_page"); fseg_inode_t* seg_inode; buf_block_t* iblock; - mtr_x_lock(&space->latch, mtr); + mtr_x_lock_space(space, mtr); const page_size_t page_size(space->flags); DBUG_LOG("fseg_free_page", "space_id: " << space->id @@ -2970,7 +2970,7 @@ fseg_page_is_free(fil_space_t* space, unsigned page) page_no_t dpage = xdes_calc_descriptor_page(page_size, page); mtr.start(); - mtr_s_lock(&space->latch, &mtr); + mtr_s_lock_space(space, &mtr); if (page >= space->free_limit || page >= space->size_in_header) { is_free = true; diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc index 35bfe3f701f..94256abfe71 100644 --- a/storage/innobase/gis/gis0rtree.cc +++ b/storage/innobase/gis/gis0rtree.cc @@ -1846,7 +1846,7 @@ rtr_estimate_n_rows_in_range( mtr.start(); index->set_modified(mtr); - mtr_s_lock(&index->lock, &mtr); + mtr_s_lock_index(index, &mtr); buf_block_t* block = btr_block_get( page_id_t(index->table->space_id, index->page), diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc index 2a42d78938d..3b3de0b2514 100644 --- a/storage/innobase/gis/gis0sea.cc +++ b/storage/innobase/gis/gis0sea.cc @@ -137,7 +137,7 @@ rtr_pcur_getnext_from_path( if (!index_locked) { ut_ad(latch_mode & BTR_SEARCH_LEAF || latch_mode & BTR_MODIFY_LEAF); - mtr_s_lock(dict_index_get_lock(index), mtr); + mtr_s_lock_index(index, mtr); } else { ut_ad(mtr_memo_contains_flagged(mtr, &index->lock, MTR_MEMO_SX_LOCK diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc index c86abda7166..b113fb35817 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.cc +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -361,7 +361,7 @@ ibuf_tree_root_get( ut_ad(ibuf_inside(mtr)); ut_ad(mutex_own(&ibuf_mutex)); - mtr_sx_lock(dict_index_get_lock(ibuf->index), mtr); + mtr_sx_lock_index(ibuf->index, mtr); /* only segment list access is exclusive each other */ block = buf_page_get( @@ -459,7 +459,7 @@ ibuf_init_at_db_start(void) compile_time_assert(IBUF_SPACE_ID == TRX_SYS_SPACE); compile_time_assert(IBUF_SPACE_ID == 0); - mtr_x_lock(&fil_system.sys_space->latch, &mtr); + mtr_x_lock_space(fil_system.sys_space, &mtr); mutex_enter(&ibuf_mutex); @@ -1977,7 +1977,7 @@ ibuf_add_free_page(void) mtr_start(&mtr); /* Acquire the fsp latch before the ibuf header, obeying the latching order */ - mtr_x_lock(&fil_system.sys_space->latch, &mtr); + mtr_x_lock_space(fil_system.sys_space, &mtr); header_page = ibuf_header_page_get(&mtr); /* Allocate a new page: NOTE that if the page has been a part of a @@ -2056,7 +2056,7 @@ ibuf_remove_free_page(void) /* Acquire the fsp latch before the ibuf header, obeying the latching order */ - mtr_x_lock(&fil_system.sys_space->latch, &mtr); + mtr_x_lock_space(fil_system.sys_space, &mtr); header_page = ibuf_header_page_get(&mtr); /* Prevent pessimistic inserts to insert buffer trees for a while */ diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 3ca753d06e4..074f55971b3 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -85,17 +85,12 @@ savepoint. */ /** Push an object to an mtr memo stack. */ #define mtr_memo_push(m, o, t) (m)->memo_push(o, t) -/** Lock an rw-lock in s-mode. */ -#define mtr_s_lock(l, m) (m)->s_lock((l), __FILE__, __LINE__) - -/** Lock an rw-lock in x-mode. */ -#define mtr_x_lock(l, m) (m)->x_lock((l), __FILE__, __LINE__) - -/** Lock a tablespace in x-mode. */ +#define mtr_s_lock_space(s, m) (m)->s_lock_space((s), __FILE__, __LINE__) #define mtr_x_lock_space(s, m) (m)->x_lock_space((s), __FILE__, __LINE__) -/** Lock an rw-lock in sx-mode. */ -#define mtr_sx_lock(l, m) (m)->sx_lock((l), __FILE__, __LINE__) +#define mtr_s_lock_index(i, m) (m)->s_lock(&(i)->lock, __FILE__, __LINE__) +#define mtr_x_lock_index(i, m) (m)->x_lock(&(i)->lock, __FILE__, __LINE__) +#define mtr_sx_lock_index(i, m) (m)->sx_lock(&(i)->lock, __FILE__, __LINE__) #define mtr_memo_contains_flagged(m, p, l) \ (m)->memo_contains_flagged((p), (l)) @@ -251,29 +246,7 @@ struct mtr_t { inline ulint read_ulint(const byte* ptr, mlog_id_t type) const MY_ATTRIBUTE((warn_unused_result)); - /** Locks a rw-latch in S mode. - NOTE: use mtr_s_lock(). - @param lock rw-lock - @param file file name from where called - @param line line number in file */ - inline void s_lock(rw_lock_t* lock, const char* file, unsigned line); - - /** Locks a rw-latch in X mode. - NOTE: use mtr_x_lock(). - @param lock rw-lock - @param file file name from where called - @param line line number in file */ - inline void x_lock(rw_lock_t* lock, const char* file, unsigned line); - - /** Locks a rw-latch in X mode. - NOTE: use mtr_sx_lock(). - @param lock rw-lock - @param file file name from where called - @param line line number in file */ - inline void sx_lock(rw_lock_t* lock, const char* file, unsigned line); - /** Acquire a tablespace X-latch. - NOTE: use mtr_x_lock_space(). @param[in] space_id tablespace ID @param[in] file file name from where called @param[in] line line number in file @@ -283,6 +256,60 @@ struct mtr_t { const char* file, unsigned line); + /** Acquire a shared rw-latch. + @param[in] lock rw-latch + @param[in] file file name from where called + @param[in] line line number in file */ + void s_lock(rw_lock_t* lock, const char* file, unsigned line) + { + rw_lock_s_lock_inline(lock, 0, file, line); + memo_push(lock, MTR_MEMO_S_LOCK); + } + + /** Acquire an exclusive rw-latch. + @param[in] lock rw-latch + @param[in] file file name from where called + @param[in] line line number in file */ + void x_lock(rw_lock_t* lock, const char* file, unsigned line) + { + rw_lock_x_lock_inline(lock, 0, file, line); + memo_push(lock, MTR_MEMO_X_LOCK); + } + + /** Acquire an shared/exclusive rw-latch. + @param[in] lock rw-latch + @param[in] file file name from where called + @param[in] line line number in file */ + void sx_lock(rw_lock_t* lock, const char* file, unsigned line) + { + rw_lock_sx_lock_inline(lock, 0, file, line); + memo_push(lock, MTR_MEMO_SX_LOCK); + } + + /** Acquire a tablespace S-latch. + @param[in] space tablespace + @param[in] file file name from where called + @param[in] line line number in file */ + void s_lock_space(fil_space_t* space, const char* file, unsigned line) + { + ut_ad(space->purpose == FIL_TYPE_TEMPORARY + || space->purpose == FIL_TYPE_IMPORT + || space->purpose == FIL_TYPE_TABLESPACE); + s_lock(&space->latch, file, line); + } + + /** Acquire a tablespace X-latch. + @param[in] space tablespace + @param[in] file file name from where called + @param[in] line line number in file */ + void x_lock_space(fil_space_t* space, const char* file, unsigned line) + { + ut_ad(space->purpose == FIL_TYPE_TEMPORARY + || space->purpose == FIL_TYPE_IMPORT + || space->purpose == FIL_TYPE_TABLESPACE); + x_lock(&space->latch, file, line); + } + /** Release an object in the memo stack. @param object object @param type object type: MTR_MEMO_S_LOCK, ... diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic index 7175ede0d6a..4cc55ed13ec 100644 --- a/storage/innobase/include/mtr0mtr.ic +++ b/storage/innobase/include/mtr0mtr.ic @@ -228,39 +228,6 @@ mtr_t::set_log_mode(mtr_log_t mode) return(old_mode); } -/** -Locks a lock in s-mode. */ - -void -mtr_t::s_lock(rw_lock_t* lock, const char* file, unsigned line) -{ - rw_lock_s_lock_inline(lock, 0, file, line); - - memo_push(lock, MTR_MEMO_S_LOCK); -} - -/** -Locks a lock in x-mode. */ - -void -mtr_t::x_lock(rw_lock_t* lock, const char* file, unsigned line) -{ - rw_lock_x_lock_inline(lock, 0, file, line); - - memo_push(lock, MTR_MEMO_X_LOCK); -} - -/** -Locks a lock in sx-mode. */ - -void -mtr_t::sx_lock(rw_lock_t* lock, const char* file, unsigned line) -{ - rw_lock_sx_lock_inline(lock, 0, file, line); - - memo_push(lock, MTR_MEMO_SX_LOCK); -} - /** Reads 1 - 4 bytes from a file page buffered in the buffer pool. @return value read */ diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 06b0413447a..14c274b5b40 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -570,10 +570,7 @@ mtr_t::x_lock_space(ulint space_id, const char* file, unsigned line) ut_ad(space); ut_ad(space->id == space_id); - x_lock(&space->latch, file, line); - ut_ad(space->purpose == FIL_TYPE_TEMPORARY - || space->purpose == FIL_TYPE_IMPORT - || space->purpose == FIL_TYPE_TABLESPACE); + x_lock_space(space, file, line); return(space); } diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index 4760960a441..0fb994b547a 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -2633,7 +2633,7 @@ row_ins_clust_index_entry_low( if (mode == BTR_MODIFY_LEAF && dict_index_is_online_ddl(index)) { mode = BTR_MODIFY_LEAF_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } if (unsigned ai = index->table->persistent_autoinc) { @@ -2866,9 +2866,9 @@ row_ins_sec_mtr_start_and_check_if_aborted( } if (search_mode & BTR_ALREADY_S_LATCHED) { - mtr_s_lock(dict_index_get_lock(index), mtr); + mtr_s_lock_index(index, mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), mtr); + mtr_sx_lock_index(index, mtr); } switch (index->online_status) { @@ -2954,9 +2954,9 @@ row_ins_sec_index_entry_low( DEBUG_SYNC_C("row_ins_sec_index_enter"); if (mode == BTR_MODIFY_LEAF) { search_mode |= BTR_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } else { - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } if (row_log_online_op_try( diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc index 8c83482f53e..b71f8491787 100644 --- a/storage/innobase/row/row0purge.cc +++ b/storage/innobase/row/row0purge.cc @@ -385,14 +385,14 @@ row_purge_remove_sec_if_poss_tree( enum row_search_result search_result; log_free_check(); - mtr_start(&mtr); + mtr.start(); index->set_modified(mtr); if (!index->is_committed()) { /* The index->online_status may change if the index is or was being created online, but not committed yet. It is protected by index->lock. */ - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); if (dict_index_is_online_ddl(index)) { /* Online secondary index creation will not @@ -487,9 +487,9 @@ row_purge_remove_sec_if_poss_tree( } func_exit: - btr_pcur_close(&pcur); + btr_pcur_close(&pcur); // FIXME: need this? func_exit_no_pcur: - mtr_commit(&mtr); + mtr.commit(); return(success); } @@ -516,7 +516,7 @@ row_purge_remove_sec_if_poss_leaf( log_free_check(); ut_ad(index->table == node->table); ut_ad(!index->table->is_temporary()); - mtr_start(&mtr); + mtr.start(); index->set_modified(mtr); if (!index->is_committed()) { @@ -528,7 +528,7 @@ row_purge_remove_sec_if_poss_leaf( /* The index->online_status may change if the the index is or was being created online, but not committed yet. It is protected by index->lock. */ - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); if (dict_index_is_online_ddl(index)) { /* Online secondary index creation will not @@ -632,7 +632,7 @@ row_purge_remove_sec_if_poss_leaf( ->page.id); btr_pcur_close(&pcur); - mtr_commit(&mtr); + mtr.commit(); return(success); } } @@ -658,9 +658,9 @@ row_purge_remove_sec_if_poss_leaf( /* The deletion was buffered. */ case ROW_NOT_FOUND: /* The index entry does not exist, nothing to do. */ - btr_pcur_close(&pcur); + btr_pcur_close(&pcur); // FIXME: do we need these? when is btr_cur->rtr_info set? func_exit_no_pcur: - mtr_commit(&mtr); + mtr.commit(); return(success); } @@ -950,12 +950,12 @@ skip_secondaries: ut_ad(rseg->id == rseg_id); ut_ad(rseg->is_persistent()); - mtr_start(&mtr); + mtr.start(); /* We have to acquire an SX-latch to the clustered index tree (exclude other tree changes) */ - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); index->set_modified(mtr); @@ -986,7 +986,7 @@ skip_secondaries: data_field + dfield_get_len(&ufield->new_val) - BTR_EXTERN_FIELD_REF_SIZE, NULL, NULL, NULL, 0, false, &mtr); - mtr_commit(&mtr); + mtr.commit(); } } diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc index 47b4a956a29..db7bf9bdcb5 100644 --- a/storage/innobase/row/row0uins.cc +++ b/storage/innobase/row/row0uins.cc @@ -95,7 +95,7 @@ row_undo_ins_remove_clust_rec( ut_ad(node->trx->dict_operation_lock_mode != RW_X_LATCH); ut_ad(node->table->id != DICT_INDEXES_ID); - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } success = btr_pcur_restore_position( @@ -275,10 +275,10 @@ row_undo_ins_remove_sec_low( if (modify_leaf) { mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } else { ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE)); - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } if (row_log_online_op_try(index, entry, 0)) { diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc index 2a0779a03ea..0172d47b8e2 100644 --- a/storage/innobase/row/row0umod.cc +++ b/storage/innobase/row/row0umod.cc @@ -187,7 +187,7 @@ static bool row_undo_mod_must_purge(undo_node_t* node, mtr_t* mtr) btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&node->pcur); ut_ad(btr_cur->index->is_primary()); - mtr_s_lock(&purge_sys.latch, mtr); + mtr->s_lock(&purge_sys.latch, __FILE__, __LINE__); if (!purge_sys.view.changes_visible(node->new_trx_id, node->table->name)) { @@ -238,7 +238,7 @@ row_undo_mod_clust( online = dict_index_is_online_ddl(index); if (online) { ut_ad(node->trx->dict_operation_lock_mode != RW_X_LATCH); - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } mem_heap_t* heap = mem_heap_create(1024); @@ -393,7 +393,7 @@ row_undo_mod_clust( goto mtr_commit_exit; } rec_t* rec = btr_pcur_get_rec(pcur); - mtr_s_lock(&purge_sys.latch, &mtr); + mtr.s_lock(&purge_sys.latch, __FILE__, __LINE__); if (!purge_sys.view.changes_visible(node->new_trx_id, node->table->name)) { goto mtr_commit_exit; @@ -476,10 +476,10 @@ row_undo_mod_del_mark_or_remove_sec_low( is protected by index->lock. */ if (modify_leaf) { mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } else { ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE)); - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } if (row_log_online_op_try(index, entry, 0)) { @@ -672,10 +672,10 @@ try_again: is protected by index->lock. */ if (mode == BTR_MODIFY_LEAF) { mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } else { ut_ad(mode == BTR_MODIFY_TREE); - mtr_sx_lock(dict_index_get_lock(index), &mtr); + mtr_sx_lock_index(index, &mtr); } if (row_log_online_op_try(index, entry, trx->id)) { diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index d3d1c9a0fac..29b6e6ec086 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -2341,7 +2341,7 @@ row_upd_sec_index_entry( or was being created online, but not committed yet. It is protected by index->lock. */ - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); switch (dict_index_get_online_status(index)) { case ONLINE_INDEX_COMPLETE: @@ -3109,7 +3109,7 @@ row_upd_clust_step( if (dict_index_is_online_ddl(index)) { ut_ad(node->table->id != DICT_INDEXES_ID); mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mtr_s_lock_index(index, &mtr); } else { mode = BTR_MODIFY_LEAF; } diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index 3ae8db63121..cd1a57998f3 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -968,7 +968,7 @@ not_found: mtr_t mtr; const ulint size = SRV_UNDO_TABLESPACE_SIZE_IN_PAGES; mtr.start(); - mtr_x_lock(&space->latch, &mtr); + mtr_x_lock_space(space, &mtr); fil_truncate_log(space, size, &mtr); fsp_header_init(space, size, &mtr); mutex_enter(&fil_system.mutex); diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc index ea52df9cb5c..46fb2680371 100644 --- a/storage/innobase/trx/trx0rseg.cc +++ b/storage/innobase/trx/trx0rseg.cc @@ -694,7 +694,7 @@ trx_temp_rseg_create() for (ulong i = 0; i < TRX_SYS_N_RSEGS; i++) { mtr.start(); mtr.set_log_mode(MTR_LOG_NO_REDO); - mtr_x_lock(&fil_system.temp_space->latch, &mtr); + mtr_x_lock_space(fil_system.temp_space, &mtr); buf_block_t* rblock = trx_rseg_header_create( fil_system.temp_space, i, NULL, &mtr); diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc index 3a09d6929fa..3b7d6dab4eb 100644 --- a/storage/innobase/trx/trx0sys.cc +++ b/storage/innobase/trx/trx0sys.cc @@ -156,7 +156,7 @@ trx_sysf_create( then enter the kernel: we must do it in this order to conform to the latching order rules. */ - mtr_x_lock(&fil_system.sys_space->latch, mtr); + mtr_x_lock_space(fil_system.sys_space, mtr); compile_time_assert(TRX_SYS_SPACE == 0); /* Create the trx sys file block in a new allocated file segment */