From 7056812ed15abb398089b9c6aa6d7bf5c3cb8c0e Mon Sep 17 00:00:00 2001 From: Varun Gupta Date: Mon, 20 May 2019 00:35:30 +0530 Subject: [PATCH 1/7] MDEV-16214: Incorrect plan taken by the optimizer , uses INDEX instead of ref access with ORDER BY The issue in this case is that we take in account the estimates from quick keys instead of rec_per_key. The estimates for quick keys are better than rec_per_key only if we have ref(const), so we need to check that all keyparts in the ref key are of the type ref(const). --- mysql-test/main/order_by.result | 57 +++++++++++++++++++++++++++++++++ mysql-test/main/order_by.test | 37 +++++++++++++++++++++ sql/sql_select.cc | 49 ++++++++++++++++++++-------- 3 files changed, 129 insertions(+), 14 deletions(-) diff --git a/mysql-test/main/order_by.result b/mysql-test/main/order_by.result index db096acb162..8d1e471f618 100644 --- a/mysql-test/main/order_by.result +++ b/mysql-test/main/order_by.result @@ -3266,3 +3266,60 @@ NULLIF(GROUP_CONCAT(v1), null) C B DROP TABLE t1; +# +# MDEV-16214: Incorrect plan taken by the optimizer , uses INDEX instead of ref access with ORDER BY +# +create table t1(a int); +insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t2( +id int primary key, +key1 int,key2 int, +col1 int, +key(key1), key(key2) +); +insert into t2 +select +A.a + B.a*10 + C.a*100, +A.a + 10*B.a, A.a + 10*B.a, +123456 +from t1 A, t1 B, t1 C; +# here type should show ref not index +explain select +(SELECT concat(id, '-', key1, '-', col1) +FROM t2 +WHERE +t2.key1 = t1.a and t2.key1 IS NOT NULL +ORDER BY +t2.key2 ASC +LIMIT 1) +from t1; +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t1 ALL NULL NULL NULL NULL 10 +2 DEPENDENT SUBQUERY t2 ref key1 key1 5 test.t1.a 10 Using index condition; Using where; Using filesort +select +(SELECT concat(id, '-', key1, '-', col1) +FROM t2 +WHERE +t2.key1 = t1.a and t2.key1 IS NOT NULL +ORDER BY +t2.key2 ASC +LIMIT 1) +from t1; +(SELECT concat(id, '-', key1, '-', col1) +FROM t2 +WHERE +t2.key1 = t1.a and t2.key1 IS NOT NULL +ORDER BY +t2.key2 ASC +LIMIT 1) +900-0-123456 +901-1-123456 +902-2-123456 +903-3-123456 +904-4-123456 +905-5-123456 +906-6-123456 +907-7-123456 +908-8-123456 +909-9-123456 +drop table t1,t2; diff --git a/mysql-test/main/order_by.test b/mysql-test/main/order_by.test index d67c67de89c..58b91fbda91 100644 --- a/mysql-test/main/order_by.test +++ b/mysql-test/main/order_by.test @@ -2201,3 +2201,40 @@ GROUP BY id ORDER BY id+1 DESC; DROP TABLE t1; + +--echo # +--echo # MDEV-16214: Incorrect plan taken by the optimizer , uses INDEX instead of ref access with ORDER BY +--echo # + +create table t1(a int); +insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t2( + id int primary key, + key1 int,key2 int, + col1 int, + key(key1), key(key2) +); + +insert into t2 + select + A.a + B.a*10 + C.a*100, + A.a + 10*B.a, A.a + 10*B.a, + 123456 +from t1 A, t1 B, t1 C; + +let $query= select + (SELECT concat(id, '-', key1, '-', col1) + FROM t2 + WHERE + t2.key1 = t1.a and t2.key1 IS NOT NULL + ORDER BY + t2.key2 ASC + LIMIT 1) + from t1; + +--echo # here type should show ref not index +eval explain $query; +eval $query; + +drop table t1,t2; diff --git a/sql/sql_select.cc b/sql/sql_select.cc index f36a68bc7ae..6b1706e5451 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -9986,31 +9986,35 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, j->ref.null_rejecting|= (key_part_map)1 << i; keyuse_uses_no_tables= keyuse_uses_no_tables && !keyuse->used_tables; /* - Todo: we should remove this check for thd->lex->describe on the next - line. With SHOW EXPLAIN code, EXPLAIN printout code no longer depends - on it. However, removing the check caused change in lots of query - plans! Does the optimizer depend on the contents of - table_ref->key_copy ? If yes, do we produce incorrect EXPLAINs? + We don't want to compute heavy expressions in EXPLAIN, an example would + select * from t1 where t1.key=(select thats very heavy); + + (select thats very heavy) => is a constant here + eg: (select avg(order_cost) from orders) => constant but expensive */ if (!keyuse->val->used_tables() && !thd->lex->describe) { // Compare against constant - store_key_item tmp(thd, + store_key_item tmp(thd, keyinfo->key_part[i].field, key_buff + maybe_null, maybe_null ? key_buff : 0, keyinfo->key_part[i].length, keyuse->val, FALSE); - if (unlikely(thd->is_fatal_error)) - DBUG_RETURN(TRUE); - tmp.copy(); + if (unlikely(thd->is_fatal_error)) + DBUG_RETURN(TRUE); + tmp.copy(); j->ref.const_ref_part_map |= key_part_map(1) << i ; } else - *ref_key++= get_store_key(thd, - keyuse,join->const_table_map, - &keyinfo->key_part[i], - key_buff, maybe_null); + { + *ref_key++= get_store_key(thd, + keyuse,join->const_table_map, + &keyinfo->key_part[i], + key_buff, maybe_null); + if (!keyuse->val->used_tables()) + j->ref.const_ref_part_map |= key_part_map(1) << i ; + } /* Remember if we are going to use REF_OR_NULL But only if field _really_ can be null i.e. we force JT_REF @@ -25256,6 +25260,15 @@ bool JOIN_TAB::save_explain_data(Explain_table_access *eta, { if (!(eta->ref_list.append_str(thd->mem_root, "const"))) return 1; + /* + create_ref_for_key() handles keypart=const equalities as follows: + - non-EXPLAIN execution will copy the "const" to lookup tuple + immediately and will not add an element to ref.key_copy + - EXPLAIN will put an element into ref.key_copy. Since we've + just printed "const" for it, we should skip it here + */ + if (thd->lex->describe) + key_ref++; } else { @@ -26921,7 +26934,15 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table, */ if (ref_key >= 0 && ref_key != MAX_KEY && tab->type == JT_REF) { - if (table->quick_keys.is_set(ref_key)) + /* + If ref access uses keypart=const for all its key parts, + and quick select uses the same # of key parts, then they are equivalent. + Reuse #rows estimate from quick select as it is more precise. + */ + if (tab->ref.const_ref_part_map == + make_prev_keypart_map(tab->ref.key_parts) && + table->quick_keys.is_set(ref_key) && + table->quick_key_parts[ref_key] == tab->ref.key_parts) refkey_rows_estimate= table->quick_rows[ref_key]; else { From 48a662dae5cd5dcc2d3cd9cd03104b5152c1d23c Mon Sep 17 00:00:00 2001 From: Eugene Kosov Date: Sun, 19 May 2019 23:15:55 +0300 Subject: [PATCH 2/7] MDEV-19486 Server crashes in row_upd or row_upd_del_mark_clust_rec on REPLACE into a versioned table row_insert_for_mysql(): InnoDB sets values for row_start and row_end. And this function used to return those values to server in ha_innobase::write_row(). This buggy behavior was removed. Also, a piece of code in this function was reformatted. upd_node_t::make_versioned_helper(): Assert that the preallocated size of the update vector is not exceeded. --- mysql-test/suite/versioning/r/replace.result | 14 +++++++++++++ mysql-test/suite/versioning/t/replace.test | 17 +++++++++++++++ storage/innobase/row/row0mysql.cc | 22 +++++++++----------- storage/innobase/row/row0upd.cc | 6 ++++-- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/mysql-test/suite/versioning/r/replace.result b/mysql-test/suite/versioning/r/replace.result index 1711be5212e..4173432ff45 100644 --- a/mysql-test/suite/versioning/r/replace.result +++ b/mysql-test/suite/versioning/r/replace.result @@ -32,5 +32,19 @@ insert into t1 values (1,1); create or replace table t2 (c int); create or replace view v as select t1.* from t1 join t2; replace into v (a, b) select a, b from t1; +drop table t1; +CREATE TABLE t1 ( +pk INT AUTO_INCREMENT, +f INT, +row_start SYS_DATATYPE AS ROW START INVISIBLE, +row_end SYS_DATATYPE AS ROW END INVISIBLE, +PRIMARY KEY(pk), +UNIQUE(f), +PERIOD FOR SYSTEM_TIME(row_start, row_end) +) WITH SYSTEM VERSIONING; +INSERT INTO t1 () VALUES (),(),(),(),(),(); +UPDATE IGNORE t1 SET f = 1; +REPLACE t1 SELECT * FROM t1; +DROP TABLE t1; drop database test; create database test; diff --git a/mysql-test/suite/versioning/t/replace.test b/mysql-test/suite/versioning/t/replace.test index bc7e084758d..391cd2a12fc 100644 --- a/mysql-test/suite/versioning/t/replace.test +++ b/mysql-test/suite/versioning/t/replace.test @@ -35,6 +35,23 @@ insert into t1 values (1,1); create or replace table t2 (c int); create or replace view v as select t1.* from t1 join t2; replace into v (a, b) select a, b from t1; +drop table t1; + +--replace_result $sys_datatype_expl SYS_DATATYPE +eval CREATE TABLE t1 ( + pk INT AUTO_INCREMENT, + f INT, + row_start $sys_datatype_expl AS ROW START INVISIBLE, + row_end $sys_datatype_expl AS ROW END INVISIBLE, + PRIMARY KEY(pk), + UNIQUE(f), + PERIOD FOR SYSTEM_TIME(row_start, row_end) +) WITH SYSTEM VERSIONING; +INSERT INTO t1 () VALUES (),(),(),(),(),(); +UPDATE IGNORE t1 SET f = 1; +REPLACE t1 SELECT * FROM t1; +DROP TABLE t1; + drop database test; create database test; diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index 50a1d7ae568..679aa63f9d5 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -1417,26 +1417,24 @@ row_insert_for_mysql( row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec, &blob_heap); - if (ins_mode != ROW_INS_NORMAL) - { + if (ins_mode != ROW_INS_NORMAL) { ut_ad(table->vers_start != table->vers_end); - /* Return back modified fields into mysql_rec, so that - upper logic may benefit from it (f.ex. 'on duplicate key'). */ - const mysql_row_templ_t* t = prebuilt->get_template_by_col(table->vers_end); + const mysql_row_templ_t* t + = prebuilt->get_template_by_col(table->vers_end); ut_ad(t); ut_ad(t->mysql_col_len == 8); if (ins_mode == ROW_INS_HISTORICAL) { - set_tuple_col_8(node->row, table->vers_end, trx->id, node->vers_end_buf); - } - else /* ROW_INS_VERSIONED */ { - set_tuple_col_8(node->row, table->vers_end, TRX_ID_MAX, node->vers_end_buf); - int8store(&mysql_rec[t->mysql_col_offset], TRX_ID_MAX); + set_tuple_col_8(node->row, table->vers_end, trx->id, + node->vers_end_buf); + } else /* ROW_INS_VERSIONED */ { + set_tuple_col_8(node->row, table->vers_end, TRX_ID_MAX, + node->vers_end_buf); t = prebuilt->get_template_by_col(table->vers_start); ut_ad(t); ut_ad(t->mysql_col_len == 8); - set_tuple_col_8(node->row, table->vers_start, trx->id, node->vers_start_buf); - int8store(&mysql_rec[t->mysql_col_offset], trx->id); + set_tuple_col_8(node->row, table->vers_start, trx->id, + node->vers_start_buf); } } diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index 234a33b844e..15f213d0711 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -3485,9 +3485,11 @@ void upd_node_t::make_versioned_helper(const trx_t* trx, ulint idx) dict_index_t* clust_index = dict_table_get_first_index(table); + /* row_create_update_node_for_mysql() pre-allocated this much */ + ut_ad(update->n_fields < table->n_cols + table->n_v_cols); + update->n_fields++; - upd_field_t* ufield = - upd_get_nth_field(update, upd_get_n_fields(update) - 1); + upd_field_t* ufield = upd_get_nth_field(update, update->n_fields - 1); const dict_col_t* col = dict_table_get_nth_col(table, idx); upd_field_set_field_no(ufield, dict_col_get_clust_pos(col, clust_index), From b77460508eb9b82054ac92a42bbb7c54f34273ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 20 May 2019 13:02:22 +0300 Subject: [PATCH 3/7] MDEV-19486: Fix -Wsign-compare --- storage/innobase/row/row0upd.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index 15f213d0711..c9ad752e96d 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -3486,7 +3486,7 @@ void upd_node_t::make_versioned_helper(const trx_t* trx, ulint idx) dict_index_t* clust_index = dict_table_get_first_index(table); /* row_create_update_node_for_mysql() pre-allocated this much */ - ut_ad(update->n_fields < table->n_cols + table->n_v_cols); + ut_ad(update->n_fields < ulint(table->n_cols + table->n_v_cols)); update->n_fields++; upd_field_t* ufield = upd_get_nth_field(update, update->n_fields - 1); From 6473641b9a04d65b7480831e926aa4150571a617 Mon Sep 17 00:00:00 2001 From: Eugene Kosov Date: Thu, 21 Feb 2019 18:59:28 +0300 Subject: [PATCH 4/7] MDEV-18512 using DATETIME(6) as row_start/row_end crashes server Disallow DATETIME for SYSTEM VERSIONING tables. --- mysql-test/suite/versioning/r/create.result | 7 +++++++ mysql-test/suite/versioning/t/create.test | 9 +++++++++ sql/handler.cc | 3 +-- sql/table.cc | 1 - 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/mysql-test/suite/versioning/r/create.result b/mysql-test/suite/versioning/r/create.result index 6cacdd1fd54..0c68624355b 100644 --- a/mysql-test/suite/versioning/r/create.result +++ b/mysql-test/suite/versioning/r/create.result @@ -508,5 +508,12 @@ row_end bigint as row end, period for system_time (row_start, row_end) ) engine=myisam with system versioning; ERROR HY000: `row_start` must be of type TIMESTAMP(6) for system-versioned table `t1` +create table t ( +a int, +row_start datetime(6) generated always as row start, +row_end datetime(6) generated always as row end, +period for system_time(row_start, row_end) +) with system versioning; +ERROR HY000: `row_start` must be of type TIMESTAMP(6) for system-versioned table `t` drop database test; create database test; diff --git a/mysql-test/suite/versioning/t/create.test b/mysql-test/suite/versioning/t/create.test index 7c7be3e02e3..e7bf3fda803 100644 --- a/mysql-test/suite/versioning/t/create.test +++ b/mysql-test/suite/versioning/t/create.test @@ -387,5 +387,14 @@ create or replace table t1 ( period for system_time (row_start, row_end) ) engine=myisam with system versioning; +--error ER_VERS_FIELD_WRONG_TYPE +create table t ( + a int, + row_start datetime(6) generated always as row start, + row_end datetime(6) generated always as row end, + period for system_time(row_start, row_end) +) with system versioning; + + drop database test; create database test; diff --git a/sql/handler.cc b/sql/handler.cc index 826b6f34746..993ba5947dc 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -7433,8 +7433,7 @@ bool Vers_parse_info::check_conditions(const Lex_table_name &table_name, static bool is_versioning_timestamp(const Create_field *f) { - return (f->type_handler() == &type_handler_datetime2 || - f->type_handler() == &type_handler_timestamp2) && + return f->type_handler() == &type_handler_timestamp2 && f->length == MAX_DATETIME_FULL_WIDTH; } diff --git a/sql/table.cc b/sql/table.cc index 1253710c677..bd651673388 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -2025,7 +2025,6 @@ int TABLE_SHARE::init_from_binary_frm_image(THD *thd, bool write, switch (field_type) { case MYSQL_TYPE_TIMESTAMP2: - case MYSQL_TYPE_DATETIME2: break; case MYSQL_TYPE_LONGLONG: if (vers_can_native) From c86773f46fa80599c5571b62147dde34135f5851 Mon Sep 17 00:00:00 2001 From: Aleksey Midenkov Date: Wed, 6 Feb 2019 22:26:52 +0300 Subject: [PATCH 5/7] MDEV-18136 Server crashes in Item_func_dyncol_create::prepare_arguments [Closes tempesta-tech/mariadb#572] --- .../suite/versioning/r/partition.result | 19 +++++++++++++++++++ mysql-test/suite/versioning/t/partition.test | 14 ++++++++++++++ sql/partition_info.h | 14 ++++++++++++-- sql/sql_yacc.yy | 7 +------ sql/sql_yacc_ora.yy | 7 +------ 5 files changed, 47 insertions(+), 14 deletions(-) diff --git a/mysql-test/suite/versioning/r/partition.result b/mysql-test/suite/versioning/r/partition.result index 3c33967b780..3fcb59bdb40 100644 --- a/mysql-test/suite/versioning/r/partition.result +++ b/mysql-test/suite/versioning/r/partition.result @@ -522,6 +522,25 @@ set timestamp=1523466002.799571; insert into t1 values (11),(12); set timestamp=1523466004.169435; delete from t1 where pk in (11, 12); +# +# MDEV-18136 Server crashes in Item_func_dyncol_create::prepare_arguments +# +create or replace table t1 (pk int) with system versioning +partition by system_time interval 7 second ( +partition ver_p1 history, +partition ver_pn current); +alter table t1 +partition by system_time interval column_get(column_create(7,7), 7 as int) second ( +partition ver_p1 history, +partition ver_pn current); +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `pk` int(11) DEFAULT NULL +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING + PARTITION BY SYSTEM_TIME INTERVAL 7 SECOND +(PARTITION `ver_p1` HISTORY ENGINE = DEFAULT_ENGINE, + PARTITION `ver_pn` CURRENT ENGINE = DEFAULT_ENGINE) # Test cleanup drop database test; create database test; diff --git a/mysql-test/suite/versioning/t/partition.test b/mysql-test/suite/versioning/t/partition.test index 88411468516..d5c83b4d3bb 100644 --- a/mysql-test/suite/versioning/t/partition.test +++ b/mysql-test/suite/versioning/t/partition.test @@ -475,6 +475,20 @@ insert into t1 values (11),(12); set timestamp=1523466004.169435; delete from t1 where pk in (11, 12); +--echo # +--echo # MDEV-18136 Server crashes in Item_func_dyncol_create::prepare_arguments +--echo # +create or replace table t1 (pk int) with system versioning +partition by system_time interval 7 second ( + partition ver_p1 history, + partition ver_pn current); +alter table t1 +partition by system_time interval column_get(column_create(7,7), 7 as int) second ( + partition ver_p1 history, + partition ver_pn current); +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; + --echo # Test cleanup drop database test; create database test; diff --git a/sql/partition_info.h b/sql/partition_info.h index 5c7dec4c05e..d6a6214c172 100644 --- a/sql/partition_info.h +++ b/sql/partition_info.h @@ -395,16 +395,26 @@ public: bool field_in_partition_expr(Field *field) const; bool vers_init_info(THD *thd); - bool vers_set_interval(Item *item, interval_type int_type, my_time_t start) + bool vers_set_interval(THD *thd, Item *item, + interval_type int_type, my_time_t start) { DBUG_ASSERT(part_type == VERSIONING_PARTITION); vers_info->interval.type= int_type; vers_info->interval.start= start; - return get_interval_value(item, int_type, &vers_info->interval.step) || + if (item->fix_fields_if_needed_for_scalar(thd, &item)) + return true; + bool error= get_interval_value(item, int_type, &vers_info->interval.step) || vers_info->interval.step.neg || vers_info->interval.step.second_part || !(vers_info->interval.step.year || vers_info->interval.step.month || vers_info->interval.step.day || vers_info->interval.step.hour || vers_info->interval.step.minute || vers_info->interval.step.second); + if (error) + { + my_error(ER_PART_WRONG_VALUE, MYF(0), + thd->lex->create_last_non_select_table->table_name.str, + "INTERVAL"); + } + return error; } bool vers_set_limit(ulonglong limit) { diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy index f89984d848f..a333bd79dd3 100644 --- a/sql/sql_yacc.yy +++ b/sql/sql_yacc.yy @@ -6064,13 +6064,8 @@ opt_versioning_rotation: | INTERVAL_SYM expr interval opt_versioning_interval_start { partition_info *part_info= Lex->part_info; - if (unlikely(part_info->vers_set_interval($2, $3, $4))) - { - my_error(ER_PART_WRONG_VALUE, MYF(0), - Lex->create_last_non_select_table->table_name.str, - "INTERVAL"); + if (unlikely(part_info->vers_set_interval(thd, $2, $3, $4))) MYSQL_YYABORT; - } } | LIMIT ulonglong_num { diff --git a/sql/sql_yacc_ora.yy b/sql/sql_yacc_ora.yy index 6a05423ebc3..7d561e7c34a 100644 --- a/sql/sql_yacc_ora.yy +++ b/sql/sql_yacc_ora.yy @@ -5910,13 +5910,8 @@ opt_versioning_rotation: | INTERVAL_SYM expr interval opt_versioning_interval_start { partition_info *part_info= Lex->part_info; - if (unlikely(part_info->vers_set_interval($2, $3, $4))) - { - my_error(ER_PART_WRONG_VALUE, MYF(0), - Lex->create_last_non_select_table->table_name.str, - "INTERVAL"); + if (unlikely(part_info->vers_set_interval(thd, $2, $3, $4))) MYSQL_YYABORT; - } } | LIMIT ulonglong_num { From d0ef948d708a33a7adcb96d73746e53dd2856a42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Fri, 22 Feb 2019 17:06:12 +0200 Subject: [PATCH 6/7] Non-functional change: Remove #ifdef UNIV_DEBUG --- storage/innobase/trx/trx0roll.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc index d5accd076be..4247dd1c005 100644 --- a/storage/innobase/trx/trx0roll.cc +++ b/storage/innobase/trx/trx0roll.cc @@ -809,12 +809,10 @@ void trx_rollback_recovered(bool all) trx_t *trx= trx_list.back(); trx_list.pop_back(); -#ifdef UNIV_DEBUG ut_ad(trx); - trx_mutex_enter(trx); + ut_d(trx_mutex_enter(trx)); ut_ad(trx->is_recovered && trx_state_eq(trx, TRX_STATE_ACTIVE)); - trx_mutex_exit(trx); -#endif + ut_d(trx_mutex_exit(trx)); if (!srv_is_being_started && !srv_undo_sources && srv_fast_shutdown) goto discard; From 592dc59d7a5f9bd80bffdd9d0f003243ff639767 Mon Sep 17 00:00:00 2001 From: Daniele Sciascia Date: Thu, 9 May 2019 09:36:43 +0200 Subject: [PATCH 7/7] MDEV-17458 Unable to start galera node Bootstrapping a new cluster from a backup created from a MariaDB version prior to 10.3.5 may result in error "SST position can't be set in past" when attempting to join additional nodes. The problem stems from the fact that when reading the wsrep position from InnoDB, the position is looked up in two places: the TRX_SYS page, where versions prior to 10.3.5 used to store WSREP's position; and rollback segments, this is where newer versions store the position. When starting a new cluster, the starting seqno is 0 and a new cluster UUID is generated. This is persisted in rollback segments, but the old UUID and seqno are not cleared from TRX_SYS page. Subsequently, when reading back the position, trx_rseg_read_wsrep_checkpoint() is going to return the maximum seqno found in both TRX_SYS page and rollback segments. So in the case of a newly bootstrapped cluster, it's always going to return the old cluster information. The fix consists of changing trx_rseg_read_wsrep_checkpoint() so that only rollback segments are looked up. On startup, position is read from the TRX_SYS page, and if present, it is copied to rollback segments (unless a newer position is already present in the rollback segments). Finally the position stored in TRX_SYS page is cleared. --- storage/innobase/trx/trx0rseg.cc | 184 +++++++++++++++++++++---------- 1 file changed, 125 insertions(+), 59 deletions(-) diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc index 6cdc704e5e2..ea52df9cb5c 100644 --- a/storage/innobase/trx/trx0rseg.cc +++ b/storage/innobase/trx/trx0rseg.cc @@ -43,31 +43,16 @@ static long long wsrep_seqno = -1; /** The latest known WSREP XID UUID */ static unsigned char wsrep_uuid[16]; -/** Update the WSREP XID information in rollback segment header. +/** Write the WSREP XID information into rollback segment header. @param[in,out] rseg_header rollback segment header @param[in] xid WSREP XID -@param[in,out] mtr mini-transaction */ -void -trx_rseg_update_wsrep_checkpoint( +@param[in,out] mtr mini transaction */ +static void +trx_rseg_write_wsrep_checkpoint( trx_rsegf_t* rseg_header, const XID* xid, mtr_t* mtr) { - ut_ad(wsrep_is_wsrep_xid(xid)); - -#ifdef UNIV_DEBUG - /* Check that seqno is monotonically increasing */ - long long xid_seqno = wsrep_xid_seqno(xid); - const byte* xid_uuid = wsrep_xid_uuid(xid); - - if (!memcmp(xid_uuid, wsrep_uuid, sizeof wsrep_uuid)) { - ut_ad(xid_seqno > wsrep_seqno); - } else { - memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid); - } - wsrep_seqno = xid_seqno; -#endif /* UNIV_DEBUG */ - mlog_write_ulint(TRX_RSEG_WSREP_XID_FORMAT + rseg_header, uint32_t(xid->formatID), MLOG_4BYTES, mtr); @@ -85,6 +70,83 @@ trx_rseg_update_wsrep_checkpoint( XIDDATASIZE, mtr); } +/** Update the WSREP XID information in rollback segment header. +@param[in,out] rseg_header rollback segment header +@param[in] xid WSREP XID +@param[in,out] mtr mini-transaction */ +void +trx_rseg_update_wsrep_checkpoint( + trx_rsegf_t* rseg_header, + const XID* xid, + mtr_t* mtr) +{ + ut_ad(wsrep_is_wsrep_xid(xid)); + +#ifdef UNIV_DEBUG + /* Check that seqno is monotonically increasing */ + long long xid_seqno = wsrep_xid_seqno(xid); + const byte* xid_uuid = wsrep_xid_uuid(xid); + + if (xid_seqno != -1 + && !memcmp(xid_uuid, wsrep_uuid, sizeof wsrep_uuid)) { + ut_ad(xid_seqno > wsrep_seqno); + } else { + memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid); + } + wsrep_seqno = xid_seqno; +#endif /* UNIV_DEBUG */ + trx_rseg_write_wsrep_checkpoint(rseg_header, xid, mtr); +} + +/** Clear the WSREP XID information from rollback segment header. +@param[in,out] rseg_header Rollback segment header +@param[in,out] mtr mini-transaction */ +static void +trx_rseg_clear_wsrep_checkpoint( + trx_rsegf_t* rseg_header, + mtr_t* mtr) +{ + mlog_write_ulint(TRX_RSEG_WSREP_XID_FORMAT + rseg_header, + 0, MLOG_4BYTES, mtr); +} + +static void +trx_rseg_update_wsrep_checkpoint(const XID* xid, mtr_t* mtr) +{ + const byte* xid_uuid = wsrep_xid_uuid(xid); + /* We must make check against wsrep_uuid here, the + trx_rseg_update_wsrep_checkpoint() writes over wsrep_uuid with + xid contents in debug mode and the memcmp() will never give nonzero + result. */ + const bool must_clear_rsegs = memcmp(wsrep_uuid, xid_uuid, + sizeof wsrep_uuid); + const trx_rseg_t* rseg = trx_sys.rseg_array[0]; + + trx_rsegf_t* rseg_header = trx_rsegf_get(rseg->space, rseg->page_no, + mtr); + if (UNIV_UNLIKELY(mach_read_from_4(rseg_header + TRX_RSEG_FORMAT))) { + trx_rseg_format_upgrade(rseg_header, mtr); + } + + trx_rseg_update_wsrep_checkpoint(rseg_header, xid, mtr); + + if (must_clear_rsegs) { + /* Because the UUID part of the WSREP XID differed + from current_xid_uuid, the WSREP group UUID was + changed, and we must reset the XID in all rollback + segment headers. */ + for (ulint rseg_id = 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id) { + if (const trx_rseg_t* rseg = + trx_sys.rseg_array[rseg_id]) { + trx_rseg_clear_wsrep_checkpoint( + trx_rsegf_get(rseg->space, + rseg->page_no, mtr), + mtr); + } + } + } +} + /** Update WSREP checkpoint XID in first rollback segment header as part of wsrep_set_SE_checkpoint() when it is guaranteed that there are no wsrep transactions committing. @@ -96,36 +158,7 @@ void trx_rseg_update_wsrep_checkpoint(const XID* xid) { mtr_t mtr; mtr.start(); - - const trx_rseg_t* rseg = trx_sys.rseg_array[0]; - - trx_rsegf_t* rseg_header = trx_rsegf_get(rseg->space, rseg->page_no, - &mtr); - if (UNIV_UNLIKELY(mach_read_from_4(rseg_header + TRX_RSEG_FORMAT))) { - trx_rseg_format_upgrade(rseg_header, &mtr); - } - - trx_rseg_update_wsrep_checkpoint(rseg_header, xid, &mtr); - - const byte* xid_uuid = wsrep_xid_uuid(xid); - if (memcmp(wsrep_uuid, xid_uuid, sizeof wsrep_uuid)) { - memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid); - - /* Because the UUID part of the WSREP XID differed - from current_xid_uuid, the WSREP group UUID was - changed, and we must reset the XID in all rollback - segment headers. */ - for (ulint rseg_id = 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id) { - if (const trx_rseg_t* rseg = - trx_sys.rseg_array[rseg_id]) { - trx_rseg_update_wsrep_checkpoint( - trx_rsegf_get(rseg->space, - rseg->page_no, &mtr), - xid, &mtr); - } - } - } - + trx_rseg_update_wsrep_checkpoint(xid, &mtr); mtr.commit(); } @@ -201,16 +234,6 @@ bool trx_rseg_read_wsrep_checkpoint(XID& xid) rseg_id++, mtr.commit()) { mtr.start(); const buf_block_t* sys = trx_sysf_get(&mtr, false); - if (rseg_id == 0) { - found = trx_rseg_init_wsrep_xid(sys->frame, xid); - ut_ad(!found || xid.formatID == 1); - if (found) { - max_xid_seqno = wsrep_xid_seqno(&xid); - memcpy(wsrep_uuid, wsrep_xid_uuid(&xid), - sizeof wsrep_uuid); - } - } - const uint32_t page_no = trx_sysf_rseg_get_page_no( sys, rseg_id); @@ -542,6 +565,9 @@ trx_rseg_array_init() trx_sys.recovered_binlog_offset = 0; #ifdef WITH_WSREP trx_sys.recovered_wsrep_xid.null(); + XID wsrep_sys_xid; + wsrep_sys_xid.null(); + bool wsrep_xid_in_rseg_found = false; #endif for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) { @@ -556,6 +582,9 @@ trx_rseg_array_init() TRX_SYS + TRX_SYS_TRX_ID_STORE + sys->frame); trx_rseg_init_binlog_info(sys->frame); +#ifdef WITH_WSREP + wsrep_sys_xid.set(&trx_sys.recovered_wsrep_xid); +#endif } const uint32_t page_no = trx_sysf_rseg_get_page_no( @@ -571,12 +600,49 @@ trx_rseg_array_init() ut_ad(!trx_sys.rseg_array[rseg_id]); trx_sys.rseg_array[rseg_id] = rseg; trx_rseg_mem_restore(rseg, max_trx_id, &mtr); +#ifdef WITH_WSREP + if (!wsrep_sys_xid.is_null() && + !wsrep_sys_xid.eq(&trx_sys.recovered_wsrep_xid)) { + wsrep_xid_in_rseg_found = true; + ut_ad(memcmp(wsrep_xid_uuid(&wsrep_sys_xid), + wsrep_xid_uuid(&trx_sys.recovered_wsrep_xid), + sizeof wsrep_uuid) + || wsrep_xid_seqno( + &wsrep_sys_xid) + <= wsrep_xid_seqno( + &trx_sys.recovered_wsrep_xid)); + } +#endif } } mtr.commit(); } +#ifdef WITH_WSREP + if (!wsrep_sys_xid.is_null()) { + /* Upgrade from a version prior to 10.3.5, + where WSREP XID was stored in TRX_SYS page. + If no rollback segment has a WSREP XID set, + we must copy the XID found in TRX_SYS page + to rollback segments. */ + mtr_t mtr; + mtr.start(); + + if (!wsrep_xid_in_rseg_found) { + trx_rseg_update_wsrep_checkpoint(&wsrep_sys_xid, &mtr); + } + + /* Finally, clear WSREP XID in TRX_SYS page. */ + const buf_block_t* sys = trx_sysf_get(&mtr); + mlog_write_ulint(TRX_SYS + TRX_SYS_WSREP_XID_INFO + + + TRX_SYS_WSREP_XID_MAGIC_N_FLD + sys->frame, + 0, MLOG_4BYTES, &mtr); + + mtr.commit(); + } +#endif + trx_sys.init_max_trx_id(max_trx_id + 1); }