diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index 67c7c09859c..46b040d61bd 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -2100,7 +2100,7 @@ static int prepare_export() IF_WIN("\"","") "\"%s\" --mysqld \"%s\"" " --defaults-extra-file=./backup-my.cnf --defaults-group-suffix=%s --datadir=." " --innodb --innodb-fast-shutdown=0 --loose-partition" - " --innodb_purge_rseg_truncate_frequency=1 --innodb-buffer-pool-size=%llu" + " --innodb-buffer-pool-size=%llu" " --console --skip-log-error --skip-log-bin --bootstrap %s< " BOOTSTRAP_FILENAME IF_WIN("\"",""), mariabackup_exe, @@ -2114,7 +2114,7 @@ static int prepare_export() IF_WIN("\"","") "\"%s\" --mysqld" " --defaults-file=./backup-my.cnf --defaults-group-suffix=%s --datadir=." " --innodb --innodb-fast-shutdown=0 --loose-partition" - " --innodb_purge_rseg_truncate_frequency=1 --innodb-buffer-pool-size=%llu" + " --innodb-buffer-pool-size=%llu" " --console --log-error= --skip-log-bin --bootstrap %s< " BOOTSTRAP_FILENAME IF_WIN("\"",""), mariabackup_exe, diff --git a/mysql-test/main/item_types.result b/mysql-test/main/item_types.result index 0193d33be6d..a0068772cea 100644 --- a/mysql-test/main/item_types.result +++ b/mysql-test/main/item_types.result @@ -42,5 +42,14 @@ SELECT * FROM v WHERE f = '10.5.20'; f drop view v; # +# MDEV-34785: Assertion failure in Item_func_or_sum::do_build_clone +# (Item_func_not_all) +# +CREATE VIEW t AS SELECT 0 AS a; +SELECT * FROM t WHERE a=ALL (SELECT 0); +a +0 +DROP VIEW t; +# # End of 10.5 tests # diff --git a/mysql-test/main/item_types.test b/mysql-test/main/item_types.test index 2818ae582af..0a4100e9163 100644 --- a/mysql-test/main/item_types.test +++ b/mysql-test/main/item_types.test @@ -46,6 +46,15 @@ CREATE VIEW v AS SELECT version() AS f; SELECT * FROM v WHERE f = '10.5.20'; drop view v; +--echo # +--echo # MDEV-34785: Assertion failure in Item_func_or_sum::do_build_clone +--echo # (Item_func_not_all) +--echo # 
+ +CREATE VIEW t AS SELECT 0 AS a; +SELECT * FROM t WHERE a=ALL (SELECT 0); +DROP VIEW t; + --echo # --echo # End of 10.5 tests --echo # diff --git a/mysql-test/main/subselect_innodb.result b/mysql-test/main/subselect_innodb.result index ea0affd575f..c8c0fd693f2 100644 --- a/mysql-test/main/subselect_innodb.result +++ b/mysql-test/main/subselect_innodb.result @@ -554,6 +554,7 @@ id select_type table type possible_keys key key_len ref rows Extra # # MDEV-6081: ORDER BY+ref(const): selectivity is very incorrect (MySQL Bug#14338686) # +insert into t2 select seq,seq,seq from seq_10000_to_11000; alter table t2 add key2 int; update t2 set key2=key1; alter table t2 add key(key2); diff --git a/mysql-test/main/subselect_innodb.test b/mysql-test/main/subselect_innodb.test index f675dda91b4..12ce5cabbeb 100644 --- a/mysql-test/main/subselect_innodb.test +++ b/mysql-test/main/subselect_innodb.test @@ -558,7 +558,9 @@ from --echo # --echo # MDEV-6081: ORDER BY+ref(const): selectivity is very incorrect (MySQL Bug#14338686) --echo # +--source include/have_sequence.inc +insert into t2 select seq,seq,seq from seq_10000_to_11000; alter table t2 add key2 int; update t2 set key2=key1; alter table t2 add key(key2); diff --git a/mysql-test/suite/innodb/r/innodb_defrag_concurrent.result b/mysql-test/suite/innodb/r/innodb_defrag_concurrent.result index 07c96e76213..942c9c94026 100644 --- a/mysql-test/suite/innodb/r/innodb_defrag_concurrent.result +++ b/mysql-test/suite/innodb/r/innodb_defrag_concurrent.result @@ -75,7 +75,7 @@ disconnect con4; optimize table t1; Table Op Msg_type Msg_text test.t1 optimize status OK -check table t1 extended; +check table t1; Table Op Msg_type Msg_text test.t1 check status OK select count(*) from t1; @@ -97,6 +97,7 @@ select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like count(stat_value) > 0 1 drop table t1; +delete from mysql.innodb_index_stats where table_name='t1'; SET GLOBAL innodb_defragment_n_pages = @n_pages; SET GLOBAL 
innodb_defragment_stats_accuracy = @accuracy; SET GLOBAL innodb_stats_persistent = @sp; diff --git a/mysql-test/suite/innodb/t/innodb_defrag_concurrent.test b/mysql-test/suite/innodb/t/innodb_defrag_concurrent.test index 1e4e14eb7c6..f5b7448032a 100644 --- a/mysql-test/suite/innodb/t/innodb_defrag_concurrent.test +++ b/mysql-test/suite/innodb/t/innodb_defrag_concurrent.test @@ -124,7 +124,7 @@ disconnect con3; disconnect con4; optimize table t1; -check table t1 extended; +check table t1; select count(*) from t1; select count(*) from t1 force index (second); @@ -136,6 +136,7 @@ select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag'); drop table t1; +delete from mysql.innodb_index_stats where table_name='t1'; # reset system SET GLOBAL innodb_defragment_n_pages = @n_pages; diff --git a/mysql-test/suite/mariabackup/slave_provision_nolock.test b/mysql-test/suite/mariabackup/slave_provision_nolock.test index 618f313290c..0253a6c0c2d 100644 --- a/mysql-test/suite/mariabackup/slave_provision_nolock.test +++ b/mysql-test/suite/mariabackup/slave_provision_nolock.test @@ -1,5 +1,7 @@ --source include/have_innodb.inc --source include/have_log_bin.inc +# Test does a lot of queries that take a lot of CPU under Valgrind. 
+--source include/not_valgrind.inc call mtr.add_suppression("Can't init tc log"); call mtr.add_suppression("Aborting"); diff --git a/mysql-test/suite/rpl/r/rpl_old_master.result b/mysql-test/suite/rpl/r/rpl_old_master.result index 5e9d8a88a20..11da61e09d0 100644 --- a/mysql-test/suite/rpl/r/rpl_old_master.result +++ b/mysql-test/suite/rpl/r/rpl_old_master.result @@ -9,10 +9,7 @@ connection slave; SET @old_parallel= @@GLOBAL.slave_parallel_threads; SET GLOBAL slave_parallel_threads=10; CHANGE MASTER TO master_host='127.0.0.1', master_port=SERVER_MYPORT_1, master_user='root', master_log_file='master-bin.000001', master_log_pos=4, master_use_gtid=no; -FLUSH TABLES WITH READ LOCK; include/start_slave.inc -include/wait_for_slave_param.inc [Seconds_Behind_Master] -UNLOCK TABLES; connection master; CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB; INSERT INTO t2 VALUES (1); diff --git a/mysql-test/suite/rpl/t/rpl_create_drop_event.test b/mysql-test/suite/rpl/t/rpl_create_drop_event.test index 96a7e82d6f7..79bb0ffec90 100644 --- a/mysql-test/suite/rpl/t/rpl_create_drop_event.test +++ b/mysql-test/suite/rpl/t/rpl_create_drop_event.test @@ -14,6 +14,12 @@ SET GLOBAL event_scheduler=on; let $wait_condition= SELECT count(*)>0 FROM t1; --source include/wait_condition.inc SET GLOBAL event_scheduler=off; +# If the time rolls to the next whole second just at this point, a new event +# run may be scheduled. Wait for this to disappear, otherwise we see occasional +# test failures if the table gets dropped before the extra event run completes. 
+# Expect 5 connections: default, master, master1, server_1, binlog dump thread +--let $wait_condition= SELECT COUNT(*) = 5 FROM INFORMATION_SCHEMA.PROCESSLIST; +--source include/wait_condition.inc SELECT DISTINCT a FROM t1; DELETE FROM t1; diff --git a/mysql-test/suite/rpl/t/rpl_mdev6020.test b/mysql-test/suite/rpl/t/rpl_mdev6020.test index 06f03be1430..ba67265b5e5 100644 --- a/mysql-test/suite/rpl/t/rpl_mdev6020.test +++ b/mysql-test/suite/rpl/t/rpl_mdev6020.test @@ -1,3 +1,5 @@ +# Test applies a large binlog, takes long under Valgrind with little benefit. +--source include/not_valgrind.inc --source include/have_innodb.inc --source include/have_partition.inc --source include/have_binlog_format_mixed_or_row.inc diff --git a/mysql-test/suite/rpl/t/rpl_old_master.test b/mysql-test/suite/rpl/t/rpl_old_master.test index 6faa8212d66..993ef977542 100644 --- a/mysql-test/suite/rpl/t/rpl_old_master.test +++ b/mysql-test/suite/rpl/t/rpl_old_master.test @@ -28,14 +28,7 @@ SET GLOBAL slave_parallel_threads=10; --replace_result $SERVER_MYPORT_1 SERVER_MYPORT_1 eval CHANGE MASTER TO master_host='127.0.0.1', master_port=$SERVER_MYPORT_1, master_user='root', master_log_file='master-bin.000001', master_log_pos=4, master_use_gtid=no; -# Block execution yet when the blocked query timestamp has been already accounted -FLUSH TABLES WITH READ LOCK; --source include/start_slave.inc ---let $slave_param = Seconds_Behind_Master ---let $slave_param_value = 1 ---let $slave_param_comparison= >= ---source include/wait_for_slave_param.inc -UNLOCK TABLES; --connection master CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB; diff --git a/mysql-test/suite/rpl/t/rpl_start_stop_slave.test b/mysql-test/suite/rpl/t/rpl_start_stop_slave.test index 23b25b1bf85..ce7d51ca43d 100644 --- a/mysql-test/suite/rpl/t/rpl_start_stop_slave.test +++ b/mysql-test/suite/rpl/t/rpl_start_stop_slave.test @@ -19,7 +19,17 @@ --source include/master-slave.inc connection slave; ---let $connection_id=`SELECT id FROM 
information_schema.processlist where state LIKE 'Waiting for master to send event'` +--let $i= 100 +while ($i > 0) { + dec $i; + --let $connection_id=`SELECT id FROM information_schema.processlist where state LIKE 'Waiting for master to send event'` + if ($connection_id) { + let $i= 0; + } + if ($i > 0) { + --sleep 0.1 + } +} if(!$connection_id) { diff --git a/mysql-test/suite/sys_vars/r/innodb_purge_batch_size_basic.result b/mysql-test/suite/sys_vars/r/innodb_purge_batch_size_basic.result index 442d44e7fb2..f5b01aa8016 100644 --- a/mysql-test/suite/sys_vars/r/innodb_purge_batch_size_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_purge_batch_size_basic.result @@ -1,19 +1,19 @@ SET @global_start_value = @@global.innodb_purge_batch_size; SELECT @global_start_value; @global_start_value -1000 +127 '#--------------------FN_DYNVARS_046_01------------------------#' SET @@global.innodb_purge_batch_size = 1; SET @@global.innodb_purge_batch_size = DEFAULT; SELECT @@global.innodb_purge_batch_size; @@global.innodb_purge_batch_size -1000 +127 '#---------------------FN_DYNVARS_046_02-------------------------#' SET innodb_purge_batch_size = 1; ERROR HY000: Variable 'innodb_purge_batch_size' is a GLOBAL variable and should be set with SET GLOBAL SELECT @@innodb_purge_batch_size; @@innodb_purge_batch_size -1000 +127 SELECT local.innodb_purge_batch_size; ERROR 42S02: Unknown table 'local' in field list SET global innodb_purge_batch_size = 1; @@ -112,4 +112,4 @@ SELECT @@global.innodb_purge_batch_size; SET @@global.innodb_purge_batch_size = @global_start_value; SELECT @@global.innodb_purge_batch_size; @@global.innodb_purge_batch_size -1000 +127 diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff index 81db8629d19..47c5b70feb4 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff @@ -221,7 +221,7 @@ VARIABLE_SCOPE GLOBAL 
-VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED - VARIABLE_COMMENT How many pages to flush on LRU eviction + VARIABLE_COMMENT Deprecated parameter with no effect NUMERIC_MIN_VALUE 1 -NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_MAX_VALUE 4294967295 diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index f86a67bbb7b..a1228716b1d 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -1068,13 +1068,13 @@ SESSION_VALUE NULL DEFAULT_VALUE 32 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BIGINT UNSIGNED -VARIABLE_COMMENT How many pages to flush on LRU eviction +VARIABLE_COMMENT Deprecated parameter with no effect NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 18446744073709551615 NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -COMMAND_LINE_ARGUMENT REQUIRED +COMMAND_LINE_ARGUMENT NULL VARIABLE_NAME INNODB_LRU_SCAN_DEPTH SESSION_VALUE NULL DEFAULT_VALUE 1536 @@ -1317,7 +1317,7 @@ READ_ONLY NO COMMAND_LINE_ARGUMENT OPTIONAL VARIABLE_NAME INNODB_PURGE_BATCH_SIZE SESSION_VALUE NULL -DEFAULT_VALUE 1000 +DEFAULT_VALUE 127 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BIGINT UNSIGNED VARIABLE_COMMENT Number of UNDO log pages to purge in one batch from the history list. 
@@ -1338,7 +1338,7 @@ NUMERIC_MAX_VALUE 128 NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -COMMAND_LINE_ARGUMENT OPTIONAL +COMMAND_LINE_ARGUMENT NULL VARIABLE_NAME INNODB_PURGE_THREADS SESSION_VALUE NULL DEFAULT_VALUE 4 diff --git a/mysys/crc32/crc32c_x86.cc b/mysys/crc32/crc32c_x86.cc index 3ddddf1303c..fb5dc19f7a5 100644 --- a/mysys/crc32/crc32c_x86.cc +++ b/mysys/crc32/crc32c_x86.cc @@ -39,7 +39,7 @@ extern "C" unsigned crc32c_sse42(unsigned crc, const void* buf, size_t size); constexpr uint32_t cpuid_ecx_SSE42= 1U << 20; constexpr uint32_t cpuid_ecx_SSE42_AND_PCLMUL= cpuid_ecx_SSE42 | 1U << 1; -constexpr uint32_t cpuid_ecx_XSAVE= 1U << 26; +constexpr uint32_t cpuid_ecx_AVX_AND_XSAVE= 1U << 28 | 1U << 27; static uint32_t cpuid_ecx() { @@ -395,7 +395,7 @@ static bool os_have_avx512() static ATTRIBUTE_NOINLINE bool have_vpclmulqdq(uint32_t cpuid_ecx) { - if (!(cpuid_ecx & cpuid_ecx_XSAVE) || !os_have_avx512()) + if ((~cpuid_ecx & cpuid_ecx_AVX_AND_XSAVE) || !os_have_avx512()) return false; # ifdef _MSC_VER int regs[4]; diff --git a/sql/item.h b/sql/item.h index 1723a1bf3fe..236350e7613 100644 --- a/sql/item.h +++ b/sql/item.h @@ -4912,7 +4912,6 @@ public: } Item *do_get_copy(THD *thd) const override { return get_item_copy(thd, this); } - Item *do_build_clone(THD *thd) const override { return get_copy(thd); } }; @@ -4927,7 +4926,6 @@ public: { } Item *do_get_copy(THD *thd) const override { return get_item_copy(thd, this); } - Item *do_build_clone(THD *thd) const override { return get_copy(thd); } }; @@ -4944,7 +4942,6 @@ public: { } Item *do_get_copy(THD *thd) const override { return get_item_copy(thd, this); } - Item *do_build_clone(THD *thd) const override { return get_copy(thd); } }; @@ -4983,7 +4980,6 @@ public: } Item *do_get_copy(THD *thd) const override { return get_item_copy(thd, this); } - Item *do_build_clone(THD *thd) const override { return get_copy(thd); } }; @@ -5003,7 +4999,6 @@ public: } Item *do_get_copy(THD *thd) const override { return 
get_item_copy(thd, this); } - Item *do_build_clone(THD *thd) const override { return get_copy(thd); } }; @@ -5167,7 +5162,6 @@ public: void print(String *str, enum_query_type query_type) override; Item *do_get_copy(THD *thd) const override { return get_item_copy(thd, this); } - Item *do_build_clone(THD *thd) const override { return get_copy(thd); } }; diff --git a/sql/item_cmpfunc.h b/sql/item_cmpfunc.h index 6ef66034aa4..3c8a974037e 100644 --- a/sql/item_cmpfunc.h +++ b/sql/item_cmpfunc.h @@ -751,6 +751,8 @@ public: void set_sub_test(Item_maxmin_subselect *item) { test_sub_item= item; test_sum_item= 0;}; bool empty_underlying_subquery(); Item *neg_transformer(THD *thd) override; + Item *do_get_copy(THD *thd) const override + { return get_item_copy(thd, this); } }; diff --git a/sql/mysqld.cc b/sql/mysqld.cc index fa0edb22ed4..a63d7acd6fb 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -5379,7 +5379,9 @@ static int init_server_components() MARIADB_REMOVED_OPTION("innodb-log-compressed-pages"), MARIADB_REMOVED_OPTION("innodb-log-files-in-group"), MARIADB_REMOVED_OPTION("innodb-log-optimize-ddl"), + MARIADB_REMOVED_OPTION("innodb-lru-flush-size"), MARIADB_REMOVED_OPTION("innodb-page-cleaners"), + MARIADB_REMOVED_OPTION("innodb-purge-truncate-frequency"), MARIADB_REMOVED_OPTION("innodb-replication-delay"), MARIADB_REMOVED_OPTION("innodb-scrub-log"), MARIADB_REMOVED_OPTION("innodb-scrub-log-speed"), diff --git a/sql/net_serv.cc b/sql/net_serv.cc index 48d523d53ea..283a768dcd3 100644 --- a/sql/net_serv.cc +++ b/sql/net_serv.cc @@ -780,7 +780,6 @@ net_real_write(NET *net,const uchar *packet, size_t len) { sql_print_warning("Could not write packet: fd: %lld state: %d " "errno: %d vio_errno: %d length: %ld", - MYF(ME_ERROR_LOG | ME_WARNING), (longlong) vio_fd(net->vio), (int) net->vio->state, vio_errno(net->vio), net->last_errno, (ulong) (end-pos)); diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index a75c0e49650..86c1d6d3ca3 100644 --- a/sql/rpl_parallel.cc +++ 
b/sql/rpl_parallel.cc @@ -1496,11 +1496,23 @@ handle_rpl_parallel_thread(void *arg) after mark_start_commit(), we have to unmark, which has at least a theoretical possibility of leaving a window where it looks like all transactions in a GCO have started committing, while in fact one - will need to rollback and retry. This is not supposed to be possible - (since there is a deadlock, at least one transaction should be - blocked from reaching commit), but this seems a fragile ensurance, - and there were historically a number of subtle bugs in this area. + will need to rollback and retry. + + Normally this will not happen, since the kill is there to resolve a + deadlock that is preventing at least one transaction from proceeding. + One case it can happen is with InnoDB dict stats update, which can + temporarily cause transactions to block each other, but locks are + released immediately, they don't linger until commit. There could be + other similar cases, there were historically a number of subtle bugs + in this area. + + But once we start the commit, we can expect that no new lock + conflicts will be introduced. So by handling any lingering deadlock + kill at this point just before mark_start_commit(), we should be + robust even towards spurious deadlock kills. */ + if (rgi->killed_for_retry != rpl_group_info::RETRY_KILL_NONE) + wait_for_pending_deadlock_kill(thd, rgi); if (!thd->killed) { DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit"); diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc index eefc7d75446..b10b810edc9 100644 --- a/sql/rpl_rli.cc +++ b/sql/rpl_rli.cc @@ -2529,6 +2529,23 @@ rpl_group_info::unmark_start_commit() e= this->parallel_entry; mysql_mutex_lock(&e->LOCK_parallel_entry); + /* + Assert that we have not already wrongly completed this GCO and signalled + the next one to start, only to now unmark and make the signal invalid. + This is to catch problems like MDEV-34696. 
+ + The error inject rpl_parallel_simulate_temp_err_xid is used to test this + precise situation, that we handle it gracefully if it somehow occurs in a + release build. So disable the assert in this case. + */ +#ifndef DBUG_OFF + bool allow_unmark_after_complete= false; + DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_xid", + allow_unmark_after_complete= true;); + DBUG_ASSERT(!gco->next_gco || + gco->next_gco->wait_count > e->count_committing_event_groups || + allow_unmark_after_complete); +#endif --e->count_committing_event_groups; mysql_mutex_unlock(&e->LOCK_parallel_entry); } diff --git a/sql/sql_select.cc b/sql/sql_select.cc index f37d1e26c97..b78f296be1c 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -30876,7 +30876,8 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table, else { const KEY *ref_keyinfo= table->key_info + ref_key; - refkey_rows_estimate= ref_keyinfo->rec_per_key[tab->ref.key_parts - 1]; + refkey_rows_estimate= + (ha_rows)ref_keyinfo->actual_rec_per_key(tab->ref.key_parts - 1); } set_if_bigger(refkey_rows_estimate, 1); } diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index 12802bc98e2..0a890a7d2e2 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -610,7 +610,7 @@ public: bool avg_frequency_is_inited() { return avg_frequency != NULL; } - double get_avg_frequency(uint i) + double get_avg_frequency(uint i) const { return (double) avg_frequency[i] / Scale_factor_avg_frequency; } diff --git a/sql/structs.h b/sql/structs.h index 318df056359..df71e405562 100644 --- a/sql/structs.h +++ b/sql/structs.h @@ -167,7 +167,7 @@ typedef struct st_key { engine_option_value *option_list; ha_index_option_struct *option_struct; /* structure with parsed options */ - double actual_rec_per_key(uint i); + double actual_rec_per_key(uint i) const; bool without_overlaps; /* diff --git a/sql/table.cc b/sql/table.cc index 69ed55625ac..b8ccec37ade 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -10106,7 +10106,7 @@ 
uint TABLE_SHARE::actual_n_key_parts(THD *thd) } -double KEY::actual_rec_per_key(uint i) +double KEY::actual_rec_per_key(uint i) const { if (rec_per_key == 0) return 0; diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index 564170baadf..30cbc24986c 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -1278,7 +1278,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH); auto& chain = buf_pool.page_hash.cell_get(page_id.fold()); - if (!row_purge_poss_sec(purge_node, index(), tuple)) + if (!row_purge_poss_sec(purge_node, index(), tuple, mtr)) /* The record cannot be purged yet. */ flag= BTR_CUR_DELETE_REF; else if (ibuf_insert(IBUF_OP_DELETE, tuple, index(), diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index a6eadc9c053..2c94cc0718c 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -2702,6 +2702,123 @@ err_exit: return(FALSE); } +ATTRIBUTE_COLD +/** Try to merge buffered changes to a buffer pool page. 
+@param block buffer-fixed and latched block +@param rw_latch RW_X_LATCH, RW_SX_LATCH, RW_S_LATCH held on block +@param err error code +@return whether the page is invalid (corrupted) */ +static bool buf_page_ibuf_merge_try(buf_block_t *block, ulint rw_latch, + dberr_t *err) +{ + ut_ad(block->page.lock.have_any()); + ut_ad(block->page.buf_fix_count()); + + if (fil_page_get_type(block->page.frame) != FIL_PAGE_INDEX || + !page_is_leaf(block->page.frame)) + return false; + + if (rw_latch != RW_X_LATCH) + { + if (rw_latch == RW_S_LATCH) + { + if (!block->page.lock.s_x_upgrade()) + { + uint32_t state; + state= block->page.state(); + if (state < buf_page_t::UNFIXED) + { + fail: + block->page.lock.x_unlock(); + return true; + } + ut_ad(state & ~buf_page_t::LRU_MASK); + ut_ad(state < buf_page_t::READ_FIX); + if (state < buf_page_t::IBUF_EXIST || state >= buf_page_t::REINIT) + /* ibuf_merge_or_delete_for_page() was already invoked in + another thread. */ + goto downgrade_to_s; + } + } + else + { + ut_ad(rw_latch == RW_SX_LATCH); + block->page.lock.u_x_upgrade(); + } + } + + ut_ad(block->page.lock.have_x()); + block->page.clear_ibuf_exist(); + if (dberr_t e= ibuf_merge_or_delete_for_page(block, block->page.id(), + block->zip_size())) + { + if (err) + *err= e; + goto fail; + } + + switch (rw_latch) { + default: + ut_ad(rw_latch == RW_X_LATCH); + break; + case RW_SX_LATCH: + block->page.lock.x_u_downgrade(); + break; + case RW_S_LATCH: + downgrade_to_s: + block->page.lock.x_u_downgrade(); + block->page.lock.u_s_downgrade(); + break; + } + + return false; +} + +buf_block_t* buf_pool_t::page_fix(const page_id_t id) +{ + ha_handler_stats *const stats= mariadb_stats; + buf_inc_get(stats); + auto& chain= page_hash.cell_get(id.fold()); + page_hash_latch &hash_lock= page_hash.lock_get(chain); + for (;;) + { + hash_lock.lock_shared(); + buf_page_t *b= page_hash.get(id, chain); + if (b) + { + uint32_t state= b->fix(); + hash_lock.unlock_shared(); + ut_ad(!b->in_zip_hash); + 
ut_ad(b->frame); + ut_ad(state >= buf_page_t::FREED); + if (state >= buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) + { + b->lock.s_lock(); + state= b->state(); + ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX); + b->lock.s_unlock(); + } + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) + { + /* The page was marked as freed or corrupted. */ + b->unfix(); + b= nullptr; + } + return reinterpret_cast(b); + } + + hash_lock.unlock_shared(); + switch (buf_read_page(id, 0)) { + default: + return nullptr; + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + mariadb_increment_pages_read(stats); + buf_read_ahead_random(id, 0, false); + } + } +} + /** Low level function used to get access to a database page. @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @@ -2741,6 +2858,7 @@ buf_page_get_low( || (rw_latch == RW_X_LATCH) || (rw_latch == RW_SX_LATCH) || (rw_latch == RW_NO_LATCH)); + ut_ad(rw_latch != RW_NO_LATCH || !allow_ibuf_merge); if (err) { *err = DB_SUCCESS; @@ -3142,89 +3260,50 @@ re_evict_fail: state to FREED). Therefore, after acquiring the page latch we must recheck the state. 
*/ - if (state >= buf_page_t::UNFIXED - && allow_ibuf_merge - && fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX - && page_is_leaf(block->page.frame)) { - block->page.lock.x_lock(); - state = block->page.state(); - ut_ad(state < buf_page_t::READ_FIX); - - if (state >= buf_page_t::IBUF_EXIST - && state < buf_page_t::REINIT) { - block->page.clear_ibuf_exist(); - if (dberr_t local_err = - ibuf_merge_or_delete_for_page(block, page_id, - block->zip_size())) { - if (err) { - *err = local_err; - } - goto release_and_ignore_block; - } - } else if (state < buf_page_t::UNFIXED) { -release_and_ignore_block: - block->page.lock.x_unlock(); - goto ignore_block; - } - -#ifdef BTR_CUR_HASH_ADAPT - btr_search_drop_page_hash_index(block, true); -#endif /* BTR_CUR_HASH_ADAPT */ - - switch (rw_latch) { - case RW_NO_LATCH: - block->page.lock.x_unlock(); - break; - case RW_S_LATCH: - block->page.lock.x_unlock(); - block->page.lock.s_lock(); - break; - case RW_SX_LATCH: - block->page.lock.x_u_downgrade(); - break; - default: - ut_ad(rw_latch == RW_X_LATCH); - } - - mtr->memo_push(block, mtr_memo_type_t(rw_latch)); - } else { - switch (rw_latch) { - case RW_NO_LATCH: - mtr->memo_push(block, MTR_MEMO_BUF_FIX); + switch (rw_latch) { + case RW_NO_LATCH: + ut_ad(!allow_ibuf_merge); + mtr->memo_push(block, MTR_MEMO_BUF_FIX); + return block; + case RW_S_LATCH: + block->page.lock.s_lock(); + break; + case RW_SX_LATCH: + block->page.lock.u_lock(); + ut_ad(!block->page.is_io_fixed()); + break; + default: + ut_ad(rw_latch == RW_X_LATCH); + if (block->page.lock.x_lock_upgraded()) { + ut_ad(block->page.id() == page_id); + block->unfix(); + mtr->page_lock_upgrade(*block); return block; - case RW_S_LATCH: - block->page.lock.s_lock(); - break; - case RW_SX_LATCH: - block->page.lock.u_lock(); - ut_ad(!block->page.is_io_fixed()); - break; - default: - ut_ad(rw_latch == RW_X_LATCH); - if (block->page.lock.x_lock_upgraded()) { - ut_ad(block->page.id() == page_id); - block->unfix(); - 
mtr->page_lock_upgrade(*block); - return block; - } } - - mtr->memo_push(block, mtr_memo_type_t(rw_latch)); - state = block->page.state(); - - if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) { - mtr->release_last_page(); - goto ignore_unfixed; - } - - ut_ad(state < buf_page_t::READ_FIX - || state > buf_page_t::WRITE_FIX); - -#ifdef BTR_CUR_HASH_ADAPT - btr_search_drop_page_hash_index(block, true); -#endif /* BTR_CUR_HASH_ADAPT */ } + mtr->memo_push(block, mtr_memo_type_t(rw_latch)); + state = block->page.state(); + + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) { + corrupted: + mtr->release_last_page(); + goto ignore_unfixed; + } + + ut_ad(state < buf_page_t::READ_FIX + || state > buf_page_t::WRITE_FIX); + if (state >= buf_page_t::IBUF_EXIST && state < buf_page_t::REINIT + && allow_ibuf_merge + && buf_page_ibuf_merge_try(block, rw_latch, err)) { + ut_ad(block == mtr->at_savepoint(mtr->get_savepoint() - 1)); + mtr->lock_register(mtr->get_savepoint() - 1, MTR_MEMO_BUF_FIX); + goto corrupted; + } +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif /* BTR_CUR_HASH_ADAPT */ + ut_ad(page_id_t(page_get_space_id(block->page.frame), page_get_page_no(block->page.frame)) == page_id); return block; diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 32f6b523b56..2076322179a 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -39,9 +39,6 @@ Created 11/5/1995 Heikki Tuuri #include "srv0mon.h" #include "my_cpu.h" -/** Flush this many pages in buf_LRU_get_free_block() */ -size_t innodb_lru_flush_size; - /** The number of blocks from the LRU_old pointer onward, including the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the whole LRU list length, except that the tolerance defined below @@ -369,17 +366,13 @@ block to read in a page. Note that we only ever get a block from the free list. 
Even when we flush a page or find a page in LRU scan we put it to free list to be used. * iteration 0: - * get a block from the buf_pool.free list, success:done + * get a block from the buf_pool.free list * if buf_pool.try_LRU_scan is set * scan LRU up to 100 pages to free a clean block * success:retry the free list - * flush up to innodb_lru_flush_size LRU blocks to data files - (until UT_LIST_GET_GEN(buf_pool.free) < innodb_lru_scan_depth) - * on buf_page_write_complete() the blocks will put on buf_pool.free list - * success: retry the free list + * invoke buf_pool.page_cleaner_wakeup(true) and wait its completion * subsequent iterations: same as iteration 0 except: - * scan whole LRU list - * scan LRU list even if buf_pool.try_LRU_scan is not set + * scan the entire LRU list @param have_mutex whether buf_pool.mutex is already being held @return the free control block, in state BUF_BLOCK_MEMORY */ diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 0b8015e5283..798caa5db61 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -18936,7 +18936,7 @@ static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size, PLUGIN_VAR_OPCMDARG, "Number of UNDO log pages to purge in one batch from the history list.", NULL, NULL, - 1000, /* Default setting */ + 127, /* Default setting */ 1, /* Minimum value */ innodb_purge_batch_size_MAX, 0); @@ -19254,11 +19254,6 @@ static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth, "How deep to scan LRU to keep it clean", NULL, NULL, 1536, 100, ~0UL, 0); -static MYSQL_SYSVAR_SIZE_T(lru_flush_size, innodb_lru_flush_size, - PLUGIN_VAR_RQCMDARG, - "How many pages to flush on LRU eviction", - NULL, NULL, 32, 1, SIZE_T_MAX, 0); - static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors, PLUGIN_VAR_OPCMDARG, "Set to 0 (don't flush neighbors from buffer pool)," @@ -19508,14 +19503,21 @@ static MYSQL_SYSVAR_ULONGLONG(max_undo_log_size, 
srv_max_undo_log_size, 10 << 20, 10 << 20, 1ULL << (32 + UNIV_PAGE_SIZE_SHIFT_MAX), 0); -static ulong innodb_purge_rseg_truncate_frequency; +static ulong innodb_purge_rseg_truncate_frequency= 128; static MYSQL_SYSVAR_ULONG(purge_rseg_truncate_frequency, innodb_purge_rseg_truncate_frequency, - PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_DEPRECATED, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_DEPRECATED | PLUGIN_VAR_NOCMDOPT, "Deprecated parameter with no effect", NULL, NULL, 128, 1, 128, 0); +static size_t innodb_lru_flush_size; + +static MYSQL_SYSVAR_SIZE_T(lru_flush_size, innodb_lru_flush_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_DEPRECATED | PLUGIN_VAR_NOCMDOPT, + "Deprecated parameter with no effect", + NULL, NULL, 32, 1, SIZE_T_MAX, 0); + static void innodb_undo_log_truncate_update(THD *thd, struct st_mysql_sys_var*, void*, const void *save) { diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index f6abc9f5e52..96b6d212168 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -78,14 +78,10 @@ page_zip_des_t* btr_cur_get_page_zip( /*=================*/ btr_cur_t* cursor);/*!< in: tree cursor */ -/*********************************************************//** -Returns the page of a tree cursor. +/** Returns the page of a tree cursor. @return pointer to page */ -UNIV_INLINE -page_t* -btr_cur_get_page( -/*=============*/ - btr_cur_t* cursor);/*!< in: tree cursor */ +#define btr_cur_get_page(cursor) (cursor)->block()->page.frame + /*********************************************************//** Returns the index of a cursor. 
@param cursor b-tree cursor diff --git a/storage/innobase/include/btr0cur.inl b/storage/innobase/include/btr0cur.inl index 955cf34288e..5981b1465c9 100644 --- a/storage/innobase/include/btr0cur.inl +++ b/storage/innobase/include/btr0cur.inl @@ -48,18 +48,6 @@ btr_cur_get_page_zip( return(buf_block_get_page_zip(btr_cur_get_block(cursor))); } -/*********************************************************//** -Returns the page of a tree cursor. -@return pointer to page */ -UNIV_INLINE -page_t* -btr_cur_get_page( -/*=============*/ - btr_cur_t* cursor) /*!< in: tree cursor */ -{ - return(page_align(page_cur_get_rec(&(cursor->page_cur)))); -} - /*********************************************************//** Positions a tree cursor at a given record. */ UNIV_INLINE diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 5b2cb622c53..568d1073d46 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1426,6 +1426,12 @@ public: } public: + /** Look up and buffer-fix a page. + @param id page identifier + @return undo log page, buffer-fixed + @retval nullptr if the undo page was corrupted or freed */ + buf_block_t *page_fix(const page_id_t id); + /** @return whether the buffer pool contains a page @tparam allow_watch whether to allow watch_is_sentinel() @param page_id page identifier diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index 28410276217..53a5f136fd2 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -33,9 +33,6 @@ Created 11/5/1995 Heikki Tuuri struct trx_t; struct fil_space_t; -/** Flush this many pages in buf_LRU_get_free_block() */ -extern size_t innodb_lru_flush_size; - /*####################################################################### These are low-level functions #########################################################################*/ @@ -71,17 +68,13 @@ block to read in a page. 
Note that we only ever get a block from the free list. Even when we flush a page or find a page in LRU scan we put it to free list to be used. * iteration 0: - * get a block from the buf_pool.free list, success:done + * get a block from the buf_pool.free list * if buf_pool.try_LRU_scan is set * scan LRU up to 100 pages to free a clean block * success:retry the free list - * flush up to innodb_lru_flush_size LRU blocks to data files - (until UT_LIST_GET_GEN(buf_pool.free) < innodb_lru_scan_depth) - * on buf_page_write_complete() the blocks will put on buf_pool.free list - * success: retry the free list + * invoke buf_pool.page_cleaner_wakeup(true) and wait its completion * subsequent iterations: same as iteration 0 except: - * scan whole LRU list - * scan LRU list even if buf_pool.try_LRU_scan is not set + * scan the entire LRU list @param have_mutex whether buf_pool.mutex is already being held @return the free control block, in state BUF_BLOCK_MEMORY */ diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h index 28aa30565e4..bef8a679ea0 100644 --- a/storage/innobase/include/page0cur.h +++ b/storage/innobase/include/page0cur.h @@ -31,14 +31,6 @@ Created 10/4/1994 Heikki Tuuri #ifdef UNIV_DEBUG /*********************************************************//** -Gets pointer to the page frame where the cursor is positioned. -@return page */ -UNIV_INLINE -page_t* -page_cur_get_page( -/*==============*/ - page_cur_t* cur); /*!< in: page cursor */ -/*********************************************************//** Gets pointer to the buffer block where the cursor is positioned. 
@return page */ UNIV_INLINE @@ -60,12 +52,12 @@ page_cur_get_page_zip( UNIV_INLINE rec_t *page_cur_get_rec(const page_cur_t *cur); #else /* UNIV_DEBUG */ -# define page_cur_get_page(cur) page_align((cur)->rec) # define page_cur_get_block(cur) (cur)->block # define page_cur_get_page_zip(cur) buf_block_get_page_zip((cur)->block) # define page_cur_get_rec(cur) (cur)->rec #endif /* UNIV_DEBUG */ -# define is_page_cur_get_page_zip(cur) is_buf_block_get_page_zip((cur)->block) +#define page_cur_get_page(cur) page_cur_get_block(cur)->page.frame +#define is_page_cur_get_page_zip(cur) is_buf_block_get_page_zip((cur)->block) /*********************************************************//** Sets the cursor object to point before the first user record on the page. */ diff --git a/storage/innobase/include/page0cur.inl b/storage/innobase/include/page0cur.inl index 7c4eafa266a..9242ff428d0 100644 --- a/storage/innobase/include/page0cur.inl +++ b/storage/innobase/include/page0cur.inl @@ -25,18 +25,6 @@ Created 10/4/1994 Heikki Tuuri *************************************************************************/ #ifdef UNIV_DEBUG -/*********************************************************//** -Gets pointer to the page frame where the cursor is positioned. -@return page */ -UNIV_INLINE -page_t* -page_cur_get_page( -/*==============*/ - page_cur_t* cur) /*!< in: page cursor */ -{ - return page_align(page_cur_get_rec(cur)); -} - /*********************************************************//** Gets pointer to the buffer block where the cursor is positioned. @return page */ diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h index c60f390a092..6485e21e7fc 100644 --- a/storage/innobase/include/que0que.h +++ b/storage/innobase/include/que0que.h @@ -209,17 +209,6 @@ que_eval_sql( const char* sql, /*!< in: SQL string */ trx_t* trx); /*!< in: trx */ -/**********************************************************************//** -Round robin scheduler. 
-@return a query thread of the graph moved to QUE_THR_RUNNING state, or -NULL; the query thread should be executed by que_run_threads by the -caller */ -que_thr_t* -que_fork_scheduler_round_robin( -/*===========================*/ - que_fork_t* fork, /*!< in: a query fork */ - que_thr_t* thr); /*!< in: current pos */ - /** Query thread states */ enum que_thr_state_t { /** in selects this means that the thread is at the end of its diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h index 1daf4d4abe7..baa7777e6c8 100644 --- a/storage/innobase/include/row0purge.h +++ b/storage/innobase/include/row0purge.h @@ -50,26 +50,13 @@ inserts a record that the secondary index entry would refer to. However, in that case, the user transaction would also re-insert the secondary index entry after purge has removed it and released the leaf page latch. -@param[in,out] node row purge node -@param[in] index secondary index -@param[in] entry secondary index entry -@param[in,out] sec_pcur secondary index cursor or NULL - if it is called for purge buffering - operation. -@param[in,out] sec_mtr mini-transaction which holds - secondary index entry or NULL if it is - called for purge buffering operation. -@param[in] is_tree true=pessimistic purge, - false=optimistic (leaf-page only) -@return true if the secondary index record can be purged */ -bool -row_purge_poss_sec( - purge_node_t* node, - dict_index_t* index, - const dtuple_t* entry, - btr_pcur_t* sec_pcur=NULL, - mtr_t* sec_mtr=NULL, - bool is_tree=false); +@param node row purge node +@param index secondary index +@param entry secondary index entry +@param mtr mini-transaction for looking up clustered index +@return whether the secondary index record can be purged */ +bool row_purge_poss_sec(purge_node_t *node, dict_index_t *index, + const dtuple_t *entry, mtr_t *mtr); /*************************************************************** Does the purge operation. 
diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h index 60f310e1b0f..2ddffa41af1 100644 --- a/storage/innobase/include/row0vers.h +++ b/storage/innobase/include/row0vers.h @@ -54,32 +54,47 @@ row_vers_impl_x_locked( dict_index_t* index, const rec_offs* offsets); -/** Finds out if a version of the record, where the version >= the current -purge_sys.view, should have ientry as its secondary index entry. We check -if there is any not delete marked version of the record where the trx -id >= purge view, and the secondary index entry == ientry; exactly in -this case we return TRUE. -@param[in] also_curr TRUE if also rec is included in the versions - to search; otherwise only versions prior - to it are searched -@param[in] rec record in the clustered index; the caller - must have a latch on the page -@param[in] mtr mtr holding the latch on rec; it will - also hold the latch on purge_view -@param[in] index secondary index -@param[in] ientry secondary index entry -@param[in] roll_ptr roll_ptr for the purge record -@param[in] trx_id transaction ID on the purging record -@return TRUE if earlier version should have */ +/** Find out whether data tuple has missing data type +for indexed virtual column. +@param tuple data tuple +@param index virtual index +@return true if tuple has missing column type */ +bool dtuple_vcol_data_missing(const dtuple_t &tuple, + const dict_index_t &index); +/** build virtual column value from current cluster index record data +@param[in,out] row the cluster index row in dtuple form +@param[in] clust_index clustered index +@param[in] index the secondary index +@param[in] heap heap used to build virtual dtuple. 
*/ bool -row_vers_old_has_index_entry( - bool also_curr, - const rec_t* rec, - mtr_t* mtr, +row_vers_build_clust_v_col( + dtuple_t* row, + dict_index_t* clust_index, dict_index_t* index, - const dtuple_t* ientry, + mem_heap_t* heap); +/** Build a dtuple contains virtual column data for current cluster index +@param[in] rec cluster index rec +@param[in] clust_index cluster index +@param[in] clust_offsets cluster rec offset +@param[in] index secondary index +@param[in] trx_id transaction ID on the purging record, + or 0 if called outside purge +@param[in] roll_ptr roll_ptr for the purge record +@param[in,out] heap heap memory +@param[in,out] v_heap heap memory to keep virtual column tuple +@param[in,out] mtr mini-transaction +@return dtuple contains virtual column data */ +dtuple_t* +row_vers_build_cur_vrow( + const rec_t* rec, + dict_index_t* clust_index, + rec_offs** clust_offsets, + dict_index_t* index, + trx_id_t trx_id, roll_ptr_t roll_ptr, - trx_id_t trx_id); + mem_heap_t* heap, + mem_heap_t* v_heap, + mtr_t* mtr); /*****************************************************************//** Constructs the version of a clustered index record which a consistent diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 0fe42e277a3..3fe76ac77e8 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -568,6 +568,15 @@ Complete the shutdown tasks such as background DROP TABLE, and optionally change buffer merge (on innodb_fast_shutdown=0). */ void srv_shutdown(bool ibuf_merge); +/** + Fetches and executes tasks from the purge work queue, + until this queue is empty. + This is main part of purge worker task, but also + executed in coordinator. + @note needs current_thd to be set beforehand. 
+*/ +void srv_purge_worker_task_low(); + } /* extern "C" */ #ifdef UNIV_DEBUG diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h index e65756097a8..10edc2ad0a0 100644 --- a/storage/innobase/include/srw_lock.h +++ b/storage/innobase/include/srw_lock.h @@ -280,6 +280,8 @@ public: #endif } + bool rd_u_upgrade_try() { return writer.wr_lock_try(); } + void u_wr_upgrade() { DBUG_ASSERT(writer.is_locked()); @@ -294,6 +296,13 @@ public: readers.store(0, std::memory_order_release); /* Note: Any pending rd_lock() will not be woken up until u_unlock() */ } + void u_rd_downgrade() + { + DBUG_ASSERT(writer.is_locked()); + ut_d(uint32_t lk=) readers.fetch_add(1, std::memory_order_relaxed); + ut_ad(lk < WRITER); + u_unlock(); + } void rd_unlock() { diff --git a/storage/innobase/include/sux_lock.h b/storage/innobase/include/sux_lock.h index a8f51fdcdba..ea9487b943c 100644 --- a/storage/innobase/include/sux_lock.h +++ b/storage/innobase/include/sux_lock.h @@ -198,6 +198,30 @@ public: /** Upgrade an update lock */ inline void u_x_upgrade(); inline void u_x_upgrade(const char *file, unsigned line); + /** @return whether a shared lock was upgraded to exclusive */ + bool s_x_upgrade_try() + { + ut_ad(have_s()); + ut_ad(!have_u_or_x()); + if (!lock.rd_u_upgrade_try()) + return false; + claim_ownership(); + s_unlock(); + lock.u_wr_upgrade(); + recursive= RECURSIVE_X; + return true; + } + __attribute__((warn_unused_result)) + /** @return whether the operation succeeded without waiting */ + bool s_x_upgrade() + { + if (s_x_upgrade_try()) + return true; + s_unlock(); + x_lock(); + return false; + } + /** Downgrade a single exclusive lock to an update lock */ void x_u_downgrade() { @@ -206,6 +230,16 @@ public: recursive*= RECURSIVE_U; lock.wr_u_downgrade(); } + /** Downgrade a single update lock to a shared lock */ + void u_s_downgrade() + { + ut_ad(have_u_or_x()); + ut_ad(recursive == RECURSIVE_U); + recursive= 0; + set_new_owner(0); + 
lock.u_rd_downgrade(); + ut_d(s_lock_register()); + } /** Acquire an exclusive lock or upgrade an update lock @return whether U locks were upgraded to X */ diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index 1fb6cd68538..1dcc7845b96 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -149,10 +149,11 @@ public: private: /** number of pending stop() calls without resume() */ Atomic_counter m_paused; - /** number of stop_SYS() calls without resume_SYS() */ - Atomic_counter m_SYS_paused; - /** number of stop_FTS() calls without resume_FTS() */ - Atomic_counter m_FTS_paused; + /** PAUSED_SYS * number of stop_SYS() calls without resume_SYS() + + number of stop_FTS() calls without resume_FTS() */ + Atomic_relaxed m_FTS_paused; + /** The stop_SYS() multiplier in m_FTS_paused */ + static constexpr const uint32_t PAUSED_SYS= 1U << 16; /** latch protecting end_view */ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock_low end_latch; @@ -321,16 +322,21 @@ private: void wait_FTS(bool also_sys); public: /** Suspend purge in data dictionary tables */ - void stop_SYS() { m_SYS_paused++; } + void stop_SYS() + { + ut_d(const auto p=) m_FTS_paused.fetch_add(PAUSED_SYS); + ut_ad(p < p + PAUSED_SYS); + } /** Resume purge in data dictionary tables */ static void resume_SYS(void *); /** Pause purge during a DDL operation that could drop FTS_ tables. */ void stop_FTS(); /** Resume purge after stop_FTS(). 
*/ - void resume_FTS() { ut_d(const auto p=) m_FTS_paused--; ut_ad(p); } + void resume_FTS() + { ut_d(const auto p=) m_FTS_paused.fetch_sub(1); ut_ad(p & ~PAUSED_SYS); } /** @return whether stop_SYS() is in effect */ - bool must_wait_FTS() const { return m_FTS_paused; } + bool must_wait_FTS() const { return m_FTS_paused & ~PAUSED_SYS; } private: /** @@ -432,10 +438,17 @@ public: struct view_guard { - inline view_guard(); + enum guard { END_VIEW= -1, PURGE= 0, VIEW= 1}; + guard latch; + inline view_guard(guard latch); inline ~view_guard(); + /** Fetch an undo log page. + @param id page identifier + @param mtr mini-transaction + @return reference to buffer page, possibly buffer-fixed in mtr */ + inline const buf_block_t *get(const page_id_t id, mtr_t *mtr); - /** @return purge_sys.view */ + /** @return purge_sys.view or purge_sys.end_view */ inline const ReadViewBase &view() const; }; @@ -464,14 +477,39 @@ public: /** The global data structure coordinating a purge */ extern purge_sys_t purge_sys; -purge_sys_t::view_guard::view_guard() -{ purge_sys.latch.rd_lock(SRW_LOCK_CALL); } +purge_sys_t::view_guard::view_guard(purge_sys_t::view_guard::guard latch) : + latch(latch) +{ + switch (latch) { + case VIEW: + purge_sys.latch.rd_lock(SRW_LOCK_CALL); + break; + case END_VIEW: + purge_sys.end_latch.rd_lock(); + break; + case PURGE: + /* the access is within a purge batch; purge_coordinator_task + will wait for all workers to complete before updating the views */ + break; + } +} purge_sys_t::view_guard::~view_guard() -{ purge_sys.latch.rd_unlock(); } +{ + switch (latch) { + case VIEW: + purge_sys.latch.rd_unlock(); + break; + case END_VIEW: + purge_sys.end_latch.rd_unlock(); + break; + case PURGE: + break; + } +} const ReadViewBase &purge_sys_t::view_guard::view() const -{ return purge_sys.view; } +{ return latch == END_VIEW ? 
purge_sys.end_view : purge_sys.view; } purge_sys_t::end_view_guard::end_view_guard() { purge_sys.end_latch.rd_lock(); } diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h index 3d9b18689a7..609dd530498 100644 --- a/storage/innobase/include/trx0rec.h +++ b/storage/innobase/include/trx0rec.h @@ -157,50 +157,44 @@ trx_undo_report_row_operation( /** TRX_UNDO_PREV_IN_PURGE tells trx_undo_prev_version_build() that it is being called purge view and we would like to get the purge record even it is in the purge view (in normal case, it will return without -fetching the purge record */ +fetching the purge record) */ static constexpr ulint TRX_UNDO_PREV_IN_PURGE = 1; /** This tells trx_undo_prev_version_build() to fetch the old value in the undo log (which is the after image for an update) */ static constexpr ulint TRX_UNDO_GET_OLD_V_VALUE = 2; -/** indicate a call from row_vers_old_has_index_entry() */ +/** indicate a call from row_undo_mod_sec_is_unsafe() */ static constexpr ulint TRX_UNDO_CHECK_PURGEABILITY = 4; +/** indicate a call from row_purge_is_unsafe() */ +static constexpr ulint TRX_UNDO_CHECK_PURGE_PAGES = 8; + /** Build a previous version of a clustered index record. The caller must hold a latch on the index page of the clustered index record. -@param rec version of a clustered index record -@param index clustered index -@param offsets rec_get_offsets(rec, index) -@param heap memory heap from which the memory needed is - allocated -@param old_vers previous version or NULL if rec is the - first inserted version, or if history data - has been deleted (an error), or if the purge - could have removed the version - though it has not yet done so -@param v_heap memory heap used to create vrow - dtuple if it is not yet created. 
This heap - diffs from "heap" above in that it could be - prebuilt->old_vers_heap for selection -@param vrow virtual column info, if any -@param v_status status determine if it is going into this - function by purge thread or not. - And if we read "after image" of undo log +@param rec version of a clustered index record +@param index clustered index +@param offsets rec_get_offsets(rec, index) +@param heap memory heap from which the memory needed is allocated +@param old_vers previous version, or NULL if rec is the first inserted + version, or if history data has been deleted (an error), + or if the purge could have removed the version though + it has not yet done so +@param mtr mini-transaction +@param v_status TRX_UNDO_PREV_IN_PURGE, ... +@param v_heap memory heap used to create vrow dtuple if it is not yet + created. This heap diffs from "heap" above in that it could be + prebuilt->old_vers_heap for selection +@param vrow virtual column info, if any @return error code @retval DB_SUCCESS if previous version was successfully built, or if it was an insert or the undo record refers to the table before rebuild @retval DB_MISSING_HISTORY if the history is missing */ -dberr_t -trx_undo_prev_version_build( - const rec_t *rec, - dict_index_t *index, - rec_offs *offsets, - mem_heap_t *heap, - rec_t **old_vers, - mem_heap_t *v_heap, - dtuple_t **vrow, - ulint v_status); +dberr_t trx_undo_prev_version_build(const rec_t *rec, dict_index_t *index, + rec_offs *offsets, mem_heap_t *heap, + rec_t **old_vers, mtr_t *mtr, + ulint v_status, + mem_heap_t *v_heap, dtuple_t **vrow); /** Read from an undo log record a non-virtual column value. 
@param ptr pointer to remaining part of the undo record diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index ea67e4a895f..c267e45ebe6 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -500,9 +500,8 @@ void lock_sys_t::close() requesting record lock are brute force (BF). If they are check is this BF-BF wait correct and if not report BF wait and assert. -@param[in] lock_rec other waiting record lock -@param[in] trx trx requesting conflicting record lock -@param[in] type_mode lock type mode of requesting trx +@param lock other waiting lock +@param trx transaction requesting conflicting lock */ static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx, const unsigned type_mode = LOCK_NONE) diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index d593c51696a..2b70501dc11 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -4584,7 +4584,7 @@ dberr_t recv_recovery_from_checkpoint_start() ut_ad(recv_sys.pages.empty()); if (log_sys.format == log_t::FORMAT_3_23) { -early_exit: +func_exit: log_sys.latch.wr_unlock(); return err; } @@ -4600,7 +4600,7 @@ read_only_recovery: sql_print_warning("InnoDB: innodb_read_only" " prevents crash recovery"); err = DB_READ_ONLY; - goto early_exit; + goto func_exit; } if (recv_sys.is_corrupt_log()) { sql_print_error("InnoDB: Log scan aborted at LSN " @@ -4638,7 +4638,7 @@ read_only_recovery: rescan, missing_tablespace); if (err != DB_SUCCESS) { - goto early_exit; + goto func_exit; } if (missing_tablespace) { @@ -4660,7 +4660,7 @@ read_only_recovery: rescan, missing_tablespace); if (err != DB_SUCCESS) { - goto early_exit; + goto func_exit; } } while (missing_tablespace); @@ -4719,7 +4719,7 @@ read_only_recovery: if (recv_sys.lsn < log_sys.next_checkpoint_lsn) { err_exit: err = DB_ERROR; - goto early_exit; + goto func_exit; } if (!srv_read_only_mode && log_sys.is_latest()) { @@ 
-4743,7 +4743,7 @@ err_exit: ut_ad("log parsing error" == 0); mysql_mutex_unlock(&recv_sys.mutex); err = DB_CORRUPTION; - goto early_exit; + goto func_exit; } recv_sys.apply_log_recs = true; recv_no_ibuf_operations = false; @@ -4752,9 +4752,9 @@ err_exit: if (srv_operation == SRV_OPERATION_NORMAL) { err = recv_rename_files(); } - mysql_mutex_unlock(&recv_sys.mutex); recv_lsn_checks_on = true; + mysql_mutex_unlock(&recv_sys.mutex); /* The database is now ready to start almost normal processing of user transactions: transaction rollbacks and the application of the log @@ -4764,8 +4764,7 @@ err_exit: err = DB_CORRUPTION; } - log_sys.latch.wr_unlock(); - return err; + goto func_exit; } bool recv_dblwr_t::validate_page(const page_id_t page_id, diff --git a/storage/innobase/que/que0que.cc b/storage/innobase/que/que0que.cc index d910ee2a881..5e1e0686c97 100644 --- a/storage/innobase/que/que0que.cc +++ b/storage/innobase/que/que0que.cc @@ -166,40 +166,6 @@ que_thr_init_command( thr->state = QUE_THR_RUNNING; } -/**********************************************************************//** -Round robin scheduler. -@return a query thread of the graph moved to QUE_THR_RUNNING state, or -NULL; the query thread should be executed by que_run_threads by the -caller */ -que_thr_t* -que_fork_scheduler_round_robin( -/*===========================*/ - que_fork_t* fork, /*!< in: a query fork */ - que_thr_t* thr) /*!< in: current pos */ -{ - fork->trx->mutex_lock(); - - /* If no current, start first available. */ - if (thr == NULL) { - thr = UT_LIST_GET_FIRST(fork->thrs); - } else { - thr = UT_LIST_GET_NEXT(thrs, thr); - } - - if (thr) { - - fork->state = QUE_FORK_ACTIVE; - - fork->last_sel_node = NULL; - ut_ad(thr->state == QUE_THR_COMPLETED); - que_thr_init_command(thr); - } - - fork->trx->mutex_unlock(); - - return(thr); -} - /**********************************************************************//** Starts execution of a command in a query fork. 
Picks a query thread which is not in the QUE_THR_RUNNING state and moves it to that state. If none diff --git a/storage/innobase/read/read0read.cc b/storage/innobase/read/read0read.cc index 97eda7dba32..46d58326edf 100644 --- a/storage/innobase/read/read0read.cc +++ b/storage/innobase/read/read0read.cc @@ -160,7 +160,7 @@ may be pointing to garbage (an undo log record discarded by purge), but it will never be dereferenced, because the purge view is older than any active transaction. -For details see: row_vers_old_has_index_entry() and row_purge_poss_sec() +For details see: row_undo_mod_sec_is_unsafe() and row_purge_poss_sec() */ diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc index c4f463041a3..5d0446e4553 100644 --- a/storage/innobase/row/row0log.cc +++ b/storage/innobase/row/row0log.cc @@ -3857,7 +3857,7 @@ UndorecApplier::get_old_rec(const dtuple_t &tuple, dict_index_t *index, if (is_same(roll_ptr)) return version; trx_undo_prev_version_build(version, index, *offsets, heap, &prev_version, - nullptr, nullptr, 0); + &mtr, 0, nullptr, nullptr); version= prev_version; } while (version); @@ -4026,7 +4026,7 @@ void UndorecApplier::log_update(const dtuple_t &tuple, copy_rec= rec_copy(mem_heap_alloc( heap, rec_offs_size(offsets)), match_rec, offsets); trx_undo_prev_version_build(match_rec, clust_index, offsets, heap, - &prev_version, nullptr, nullptr, 0); + &prev_version, &mtr, 0, nullptr, nullptr); prev_offsets= rec_get_offsets(prev_version, clust_index, prev_offsets, clust_index->n_core_fields, diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc index d83ab86124e..629f4177552 100644 --- a/storage/innobase/row/row0purge.cc +++ b/storage/innobase/row/row0purge.cc @@ -271,6 +271,448 @@ row_purge_remove_clust_if_poss( return(false); } +/** Check a virtual column value index secondary virtual index matches +that of current cluster index record, which is recreated from information +stored in undo log +@param[in] 
rec record in the clustered index +@param[in] icentry the index entry built from a cluster row +@param[in] clust_index cluster index +@param[in] clust_offsets offsets on the cluster record +@param[in] index the secondary index +@param[in] ientry the secondary index entry +@param[in] roll_ptr the rollback pointer for the purging record +@param[in] trx_id trx id for the purging record +@param[in,out] mtr mini-transaction +@param[in,out] v_row dtuple holding the virtual rows (if needed) +@return true if matches, false otherwise */ +static +bool +row_purge_vc_matches_cluster( + const rec_t* rec, + const dtuple_t* icentry, + dict_index_t* clust_index, + rec_offs* clust_offsets, + dict_index_t* index, + const dtuple_t* ientry, + roll_ptr_t roll_ptr, + trx_id_t trx_id, + mtr_t* mtr, + dtuple_t** vrow) +{ + const rec_t* version; + rec_t* prev_version; + mem_heap_t* heap2; + mem_heap_t* heap = NULL; + mem_heap_t* tuple_heap; + ulint num_v = dict_table_get_n_v_cols(index->table); + bool compare[REC_MAX_N_FIELDS]; + ulint n_fields = dtuple_get_n_fields(ientry); + ulint n_non_v_col = 0; + ulint n_cmp_v_col = 0; + const dfield_t* field1; + dfield_t* field2; + ulint i; + + /* First compare non-virtual columns (primary keys) */ + ut_ad(index->n_fields == n_fields); + ut_ad(n_fields == dtuple_get_n_fields(icentry)); + ut_ad(mtr->memo_contains_page_flagged(rec, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); + + { + const dfield_t* a = ientry->fields; + const dfield_t* b = icentry->fields; + + for (const dict_field_t *ifield = index->fields, + *const end = &index->fields[index->n_fields]; + ifield != end; ifield++, a++, b++) { + if (!ifield->col->is_virtual()) { + if (cmp_dfield_dfield(a, b)) { + return false; + } + n_non_v_col++; + } + } + } + + tuple_heap = mem_heap_create(1024); + + ut_ad(n_fields > n_non_v_col); + + *vrow = dtuple_create_with_vcol(tuple_heap, 0, num_v); + dtuple_init_v_fld(*vrow); + + for (i = 0; i < num_v; i++) { + 
dfield_get_type(dtuple_get_nth_v_field(*vrow, i))->mtype + = DATA_MISSING; + compare[i] = false; + } + + version = rec; + + while (n_cmp_v_col < n_fields - n_non_v_col) { + heap2 = heap; + heap = mem_heap_create(1024); + roll_ptr_t cur_roll_ptr = row_get_rec_roll_ptr( + version, clust_index, clust_offsets); + + ut_ad(cur_roll_ptr != 0); + ut_ad(roll_ptr != 0); + + trx_undo_prev_version_build( + version, clust_index, clust_offsets, + heap, &prev_version, mtr, + TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE, + nullptr, vrow); + + if (heap2) { + mem_heap_free(heap2); + } + + if (!prev_version) { + /* Versions end here */ + goto func_exit; + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + ulint entry_len = dict_index_get_n_fields(index); + + for (i = 0; i < entry_len; i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + const dict_col_t* col = ind_field->col; + field1 = dtuple_get_nth_field(ientry, i); + + if (!col->is_virtual()) { + continue; + } + + const dict_v_col_t* v_col + = reinterpret_cast(col); + field2 + = dtuple_get_nth_v_field(*vrow, v_col->v_pos); + + if ((dfield_get_type(field2)->mtype != DATA_MISSING) + && (!compare[v_col->v_pos])) { + + if (ind_field->prefix_len != 0 + && !dfield_is_null(field2)) { + field2->len = unsigned( + dtype_get_at_most_n_mbchars( + field2->type.prtype, + field2->type.mbminlen, + field2->type.mbmaxlen, + ind_field->prefix_len, + field2->len, + static_cast + (field2->data))); + } + + /* The index field mismatch */ + if (cmp_dfield_dfield(field2, field1)) { + mem_heap_free(tuple_heap); + mem_heap_free(heap); + return(false); + } + + compare[v_col->v_pos] = true; + n_cmp_v_col++; + } + } + + trx_id_t rec_trx_id = row_get_rec_trx_id( + prev_version, clust_index, clust_offsets); + + if (rec_trx_id < trx_id || roll_ptr == cur_roll_ptr) { + break; + } + + version = prev_version; + } + +func_exit: + if (n_cmp_v_col == 0) 
{ + *vrow = NULL; + } + + mem_heap_free(tuple_heap); + mem_heap_free(heap); + + /* FIXME: In case n_cmp_v_col is not the same as + n_fields - n_non_v_col, a callback is needed to compare the remaining + columns. For the time being, we will need to return true */ + return (true); +} + +/** @return whether two data tuples are equal */ +bool dtuple_coll_eq(const dtuple_t &tuple1, const dtuple_t &tuple2) +{ + ut_ad(tuple1.magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(tuple2.magic_n == DATA_TUPLE_MAGIC_N); + ut_ad(dtuple_check_typed(&tuple1)); + ut_ad(dtuple_check_typed(&tuple2)); + ut_ad(tuple1.n_fields == tuple2.n_fields); + + for (ulint i= 0; i < tuple1.n_fields; i++) + if (cmp_dfield_dfield(&tuple1.fields[i], &tuple2.fields[i])) + return false; + return true; +} + +/** Finds out if a version of the record, where the version >= the current +purge_sys.view, should have ientry as its secondary index entry. We check +if there is any not delete marked version of the record where the trx +id >= purge view, and the secondary index entry == ientry; exactly in +this case we return TRUE.
+@param node purge node +@param index secondary index +@param ientry secondary index entry +@param mtr mini-transaction +@return whether ientry cannot be purged */ +static bool row_purge_is_unsafe(const purge_node_t &node, + dict_index_t *index, + const dtuple_t *ientry, mtr_t *mtr) +{ + const rec_t* rec = btr_pcur_get_rec(&node.pcur); + roll_ptr_t roll_ptr = node.roll_ptr; + trx_id_t trx_id = node.trx_id; + const rec_t* version; + rec_t* prev_version; + dict_index_t* clust_index = node.pcur.index(); + rec_offs* clust_offsets; + mem_heap_t* heap; + dtuple_t* row; + const dtuple_t* entry; + dtuple_t* vrow = NULL; + mem_heap_t* v_heap = NULL; + dtuple_t* cur_vrow = NULL; + + ut_ad(index->table == clust_index->table); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_index_has_virtual(index)) { + v_heap = mem_heap_create(100); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(clust_offsets))) { + row_ext_t* ext; + + /* The top of the stack of versions is locked by the + mtr holding a latch on the page containing the + clustered index record. The bottom of the stack is + locked by the fact that the purge_sys.view must + 'overtake' any read view of an active transaction. + Thus, it is safe to fetch the prefixes for + externally stored columns. 
*/ + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, clust_offsets, + NULL, NULL, NULL, &ext, heap); + + if (dict_index_has_virtual(index)) { + + +#ifdef DBUG_OFF +# define dbug_v_purge false +#else /* DBUG_OFF */ + bool dbug_v_purge = false; +#endif /* DBUG_OFF */ + + DBUG_EXECUTE_IF( + "ib_purge_virtual_index_callback", + dbug_v_purge = true;); + + roll_ptr_t t_roll_ptr = row_get_rec_roll_ptr( + rec, clust_index, clust_offsets); + + /* if the row is newly inserted, then the virtual + columns need to be computed */ + if (trx_undo_roll_ptr_is_insert(t_roll_ptr) + || dbug_v_purge) { + + if (!row_vers_build_clust_v_col( + row, clust_index, index, heap)) { + goto unsafe_to_purge; + } + + entry = row_build_index_entry( + row, ext, index, heap); + if (entry && dtuple_coll_eq(*ientry, *entry)) { + goto unsafe_to_purge; + } + } else { + /* Build index entry out of row */ + entry = row_build_index_entry(row, ext, index, heap); + /* entry could only be NULL if + the clustered index record is an uncommitted + inserted record whose BLOBs have not been + written yet. The secondary index record + can be safely removed, because it cannot + possibly refer to this incomplete + clustered index record. (Insert would + always first be completed for the + clustered index record, then proceed to + secondary indexes.) */ + + if (entry && row_purge_vc_matches_cluster( + rec, entry, + clust_index, clust_offsets, + index, ientry, roll_ptr, + trx_id, mtr, &vrow)) { + goto unsafe_to_purge; + } + } + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index + ->n_core_fields, + ULINT_UNDEFINED, &heap); + } else { + + entry = row_build_index_entry( + row, ext, index, heap); + + /* If entry == NULL, the record contains unset BLOB + pointers. This must be a freshly inserted record. If + this is called from + row_purge_remove_sec_if_poss_low(), the thread will + hold latches on the clustered index and the secondary + index. 
Because the insert works in three steps: + + (1) insert the record to clustered index + (2) store the BLOBs and update BLOB pointers + (3) insert records to secondary indexes + + the purge thread can safely ignore freshly inserted + records and delete the secondary index record. The + thread that inserted the new record will be inserting + the secondary index records. */ + + /* NOTE that we cannot do the comparison as binary + fields because the row is maybe being modified so that + the clustered index record has already been updated to + a different binary value in a char field, but the + collation identifies the old and new value anyway! */ + if (entry && dtuple_coll_eq(*ientry, *entry)) { +unsafe_to_purge: + mem_heap_free(heap); + + if (v_heap) { + mem_heap_free(v_heap); + } + return true; + } + } + } else if (dict_index_has_virtual(index)) { + /* The current cluster index record could be + deleted, but the previous version of it might not. We will + need to get the virtual column data from undo record + associated with current cluster index */ + + cur_vrow = row_vers_build_cur_vrow( + rec, clust_index, &clust_offsets, + index, trx_id, roll_ptr, heap, v_heap, mtr); + } + + version = rec; + + for (;;) { + mem_heap_t* heap2 = heap; + heap = mem_heap_create(1024); + vrow = NULL; + + trx_undo_prev_version_build(version, + clust_index, clust_offsets, + heap, &prev_version, mtr, + TRX_UNDO_CHECK_PURGE_PAGES, + nullptr, + dict_index_has_virtual(index) + ? 
&vrow : nullptr); + mem_heap_free(heap2); /* free version and clust_offsets */ + + if (!prev_version) { + /* Versions end here */ + mem_heap_free(heap); + + if (v_heap) { + mem_heap_free(v_heap); + } + + return false; + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_index_has_virtual(index)) { + if (vrow) { + if (dtuple_vcol_data_missing(*vrow, *index)) { + goto nochange_index; + } + /* Keep the virtual row info for the next + version, unless it is changed */ + mem_heap_empty(v_heap); + cur_vrow = dtuple_copy(vrow, v_heap); + dtuple_dup_v_fld(cur_vrow, v_heap); + } + + if (!cur_vrow) { + /* Nothing for this index has changed, + continue */ +nochange_index: + version = prev_version; + continue; + } + } + + if (!rec_get_deleted_flag(prev_version, + rec_offs_comp(clust_offsets))) { + row_ext_t* ext; + + /* The stack of versions is locked by mtr. + Thus, it is safe to fetch the prefixes for + externally stored columns. */ + row = row_build(ROW_COPY_POINTERS, clust_index, + prev_version, clust_offsets, + NULL, NULL, NULL, &ext, heap); + + if (dict_index_has_virtual(index)) { + ut_ad(cur_vrow); + ut_ad(row->n_v_fields == cur_vrow->n_v_fields); + dtuple_copy_v_fields(row, cur_vrow); + } + + entry = row_build_index_entry(row, ext, index, heap); + + /* If entry == NULL, the record contains unset + BLOB pointers. This must be a freshly + inserted record that we can safely ignore. + For the justification, see the comments after + the previous row_build_index_entry() call. */ + + /* NOTE that we cannot do the comparison as binary + fields because maybe the secondary index record has + already been updated to a different binary value in + a char field, but the collation identifies the old + and new value anyway! 
*/ + + if (entry && dtuple_coll_eq(*ientry, *entry)) { + goto unsafe_to_purge; + } + } + + version = prev_version; + } +} + /** Determines if it is possible to remove a secondary index entry. Removal is possible if the secondary index entry does not refer to any not delete marked version of a clustered index record where DB_TRX_ID @@ -284,66 +726,45 @@ inserts a record that the secondary index entry would refer to. However, in that case, the user transaction would also re-insert the secondary index entry after purge has removed it and released the leaf page latch. -@param[in,out] node row purge node -@param[in] index secondary index -@param[in] entry secondary index entry -@param[in,out] sec_pcur secondary index cursor or NULL - if it is called for purge buffering - operation. -@param[in,out] sec_mtr mini-transaction which holds - secondary index entry or NULL if it is - called for purge buffering operation. -@param[in] is_tree true=pessimistic purge, - false=optimistic (leaf-page only) -@return true if the secondary index record can be purged */ -bool -row_purge_poss_sec( - purge_node_t* node, - dict_index_t* index, - const dtuple_t* entry, - btr_pcur_t* sec_pcur, - mtr_t* sec_mtr, - bool is_tree) +@param node row purge node +@param index secondary index +@param entry secondary index entry +@param mtr mini-transaction for looking up clustered index +@return whether the secondary index record can be purged */ +bool row_purge_poss_sec(purge_node_t *node, dict_index_t *index, + const dtuple_t *entry, mtr_t *mtr) { - bool can_delete; - mtr_t mtr; + ut_ad(!index->is_clust()); + const auto savepoint= mtr->get_savepoint(); + bool can_delete= !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, mtr); - ut_ad(!dict_index_is_clust(index)); + if (!can_delete) + { + ut_ad(node->pcur.pos_state == BTR_PCUR_IS_POSITIONED); + can_delete= !row_purge_is_unsafe(*node, index, entry, mtr); + node->pcur.pos_state = BTR_PCUR_WAS_POSITIONED; + node->pcur.latch_mode= BTR_NO_LATCHES; + } - 
mtr_start(&mtr); - - can_delete = !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr) - || !row_vers_old_has_index_entry(true, - btr_pcur_get_rec(&node->pcur), - &mtr, index, entry, - node->roll_ptr, node->trx_id); - - /* Persistent cursor is closed if reposition fails. */ - if (node->found_clust) { - btr_pcur_commit_specify_mtr(&node->pcur, &mtr); - } else { - mtr.commit(); - } - - ut_ad(mtr.has_committed()); - - return can_delete; + mtr->rollback_to_savepoint(savepoint); + return can_delete; } -/*************************************************************** -Removes a secondary index entry if possible, by modifying the -index tree. Does not try to buffer the delete. -@return TRUE if success or if not found */ -static MY_ATTRIBUTE((nonnull, warn_unused_result)) -ibool -row_purge_remove_sec_if_poss_tree( -/*==============================*/ - purge_node_t* node, /*!< in: row purge node */ - dict_index_t* index, /*!< in: index */ - const dtuple_t* entry) /*!< in: index entry */ +__attribute__((nonnull, warn_unused_result)) +/** Remove a secondary index entry if possible, by modifying the index tree. +@param node purge node +@param index secondary index +@param entry index entry +@param page_max_trx_id the PAGE_MAX_TRX_ID + when row_purge_remove_sec_if_poss_leaf() was invoked +@return whether the operation succeeded */ +static bool row_purge_remove_sec_if_poss_tree(purge_node_t *node, + dict_index_t *index, + const dtuple_t *entry, + trx_id_t page_max_trx_id) { btr_pcur_t pcur; - ibool success = TRUE; + bool success = true; dberr_t err; mtr_t mtr; @@ -389,7 +810,9 @@ row_purge_remove_sec_if_poss_tree( we should do nothing. */ found: - if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, true)) { + if (page_max_trx_id + == page_get_max_trx_id(btr_cur_get_page(&pcur.btr_cur)) + || row_purge_poss_sec(node, index, entry, &mtr)) { /* Remove the index record, which should have been marked for deletion. 
*/ @@ -428,26 +851,23 @@ found: func_exit: btr_pcur_close(&pcur); // FIXME: need this? mtr.commit(); - - return(success); + return success; } -/*************************************************************** -Removes a secondary index entry without modifying the index tree, -if possible. -@retval true if success or if not found -@retval false if row_purge_remove_sec_if_poss_tree() should be invoked */ -static MY_ATTRIBUTE((nonnull, warn_unused_result)) -bool -row_purge_remove_sec_if_poss_leaf( -/*==============================*/ - purge_node_t* node, /*!< in: row purge node */ - dict_index_t* index, /*!< in: index */ - const dtuple_t* entry) /*!< in: index entry */ +__attribute__((nonnull, warn_unused_result)) +/** Remove a secondary index entry if possible, without modifying the tree. +@param node purge node +@param index secondary index +@param entry index entry +@return PAGE_MAX_TRX_ID for row_purge_remove_sec_if_poss_tree() +@retval 0 if success or if not found */ +static trx_id_t row_purge_remove_sec_if_poss_leaf(purge_node_t *node, + dict_index_t *index, + const dtuple_t *entry) { mtr_t mtr; btr_pcur_t pcur; - bool success = true; + trx_id_t page_max_trx_id = 0; log_free_check(); ut_ad(index->table == node->table); @@ -478,7 +898,7 @@ row_purge_remove_sec_if_poss_leaf( found: /* Before attempting to purge a record, check if it is safe to do so. */ - if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, false)) { + if (row_purge_poss_sec(node, index, entry, &mtr)) { btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); /* Only delete-marked records should be purged. */ @@ -526,8 +946,11 @@ found: } } - success = btr_cur_optimistic_delete(btr_cur, 0, &mtr) - != DB_FAIL; + if (btr_cur_optimistic_delete(btr_cur, 0, &mtr) + == DB_FAIL) { + page_max_trx_id = page_get_max_trx_id( + btr_cur_get_page(btr_cur)); + } } /* (The index entry is still needed, @@ -539,15 +962,15 @@ found: /* The deletion was buffered. 
*/ case ROW_NOT_FOUND: /* The index entry does not exist, nothing to do. */ -func_exit: - mtr.commit(); -cleanup: - btr_pcur_close(&pcur); // FIXME: do we need these? when is btr_cur->rtr_info set? - return(success); + goto func_exit; } - ut_error; - return(false); + ut_ad("invalid state" == 0); +func_exit: + mtr.commit(); +cleanup: + btr_pcur_close(&pcur); // FIXME: remove? when is btr_cur->rtr_info set? + return page_max_trx_id; } /***********************************************************//** @@ -560,38 +983,21 @@ row_purge_remove_sec_if_poss( dict_index_t* index, /*!< in: index */ const dtuple_t* entry) /*!< in: index entry */ { - ibool success; - ulint n_tries = 0; + if (UNIV_UNLIKELY(!entry)) + /* The node->row must have lacked some fields of this index. This + is possible when the undo log record was written before this index + was created. */ + return; - /* fputs("Purge: Removing secondary record\n", stderr); */ - - if (!entry) { - /* The node->row must have lacked some fields of this - index. This is possible when the undo log record was - written before this index was created. */ - return; - } - - if (row_purge_remove_sec_if_poss_leaf(node, index, entry)) { - - return; - } -retry: - success = row_purge_remove_sec_if_poss_tree(node, index, entry); - /* The delete operation may fail if we have little - file space left: TODO: easiest to crash the database - and restart with more file space */ - - if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { - - n_tries++; - - std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME); - - goto retry; - } - - ut_a(success); + if (trx_id_t page_max_trx_id= + row_purge_remove_sec_if_poss_leaf(node, index, entry)) + for (auto n_tries= BTR_CUR_RETRY_DELETE_N_TIMES; + !row_purge_remove_sec_if_poss_tree(node, index, entry, + page_max_trx_id); + std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME)) + /* The delete operation may fail if we have little + file space left (if innodb_file_per_table=0?) 
*/ + ut_a(--n_tries); } /***********************************************************//** diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index 1c68312df78..1245bf073ed 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -6610,7 +6610,7 @@ rec_loop: err= trx_undo_prev_version_build(clust_rec, clust_index, clust_offsets, vers_heap, &old_vers, - nullptr, nullptr, 0); + &mtr, 0, nullptr, nullptr); if (prev_heap) mem_heap_free(prev_heap); if (err != DB_SUCCESS) diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc index 52f54443911..054049baaa2 100644 --- a/storage/innobase/row/row0umod.cc +++ b/storage/innobase/row/row0umod.cc @@ -470,6 +470,146 @@ func_exit: return(err); } +bool dtuple_coll_eq(const dtuple_t &tuple1, const dtuple_t &tuple2); + +/** Find out if an accessible version of a clustered index record +corresponds to a secondary index entry. +@param rec record in a latched clustered index page +@param index secondary index +@param ientry secondary index entry +@param mtr mini-transaction +@return whether an accessible non-dete-marked version of rec +corresponds to ientry */ +static bool row_undo_mod_sec_is_unsafe(const rec_t *rec, dict_index_t *index, + const dtuple_t *ientry, mtr_t *mtr) +{ + const rec_t* version; + rec_t* prev_version; + dict_index_t* clust_index; + rec_offs* clust_offsets; + mem_heap_t* heap; + mem_heap_t* heap2; + dtuple_t* row; + const dtuple_t* entry; + ulint comp; + dtuple_t* vrow = NULL; + mem_heap_t* v_heap = NULL; + dtuple_t* cur_vrow = NULL; + + clust_index = dict_table_get_first_index(index->table); + + comp = page_rec_is_comp(rec); + ut_ad(!dict_table_is_comp(index->table) == !comp); + heap = mem_heap_create(1024); + clust_offsets = rec_get_offsets(rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_index_has_virtual(index)) { + v_heap = mem_heap_create(100); + /* The current cluster index record 
could be + deleted, but the previous version of it might not. We will + need to get the virtual column data from undo record + associated with current cluster index */ + + cur_vrow = row_vers_build_cur_vrow( + rec, clust_index, &clust_offsets, + index, 0, 0, heap, v_heap, mtr); + } + + version = rec; + + for (;;) { + heap2 = heap; + heap = mem_heap_create(1024); + vrow = NULL; + + trx_undo_prev_version_build(version, + clust_index, clust_offsets, + heap, &prev_version, + mtr, TRX_UNDO_CHECK_PURGEABILITY, + nullptr, + dict_index_has_virtual(index) + ? &vrow : nullptr); + mem_heap_free(heap2); /* free version and clust_offsets */ + + if (!prev_version) { + break; + } + + clust_offsets = rec_get_offsets(prev_version, clust_index, + NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + if (dict_index_has_virtual(index)) { + if (vrow) { + if (dtuple_vcol_data_missing(*vrow, *index)) { + goto nochange_index; + } + /* Keep the virtual row info for the next + version, unless it is changed */ + mem_heap_empty(v_heap); + cur_vrow = dtuple_copy(vrow, v_heap); + dtuple_dup_v_fld(cur_vrow, v_heap); + } + + if (!cur_vrow) { + /* Nothing for this index has changed, + continue */ +nochange_index: + version = prev_version; + continue; + } + } + + if (!rec_get_deleted_flag(prev_version, comp)) { + row_ext_t* ext; + + /* The stack of versions is locked by mtr. + Thus, it is safe to fetch the prefixes for + externally stored columns. */ + row = row_build(ROW_COPY_POINTERS, clust_index, + prev_version, clust_offsets, + NULL, NULL, NULL, &ext, heap); + + if (dict_index_has_virtual(index)) { + ut_ad(cur_vrow); + ut_ad(row->n_v_fields == cur_vrow->n_v_fields); + dtuple_copy_v_fields(row, cur_vrow); + } + + entry = row_build_index_entry(row, ext, index, heap); + + /* If entry == NULL, the record contains unset + BLOB pointers. This must be a freshly + inserted record that we can safely ignore. 
+ For the justification, see the comments after + the previous row_build_index_entry() call. */ + + /* NOTE that we cannot do the comparison as binary + fields because maybe the secondary index record has + already been updated to a different binary value in + a char field, but the collation identifies the old + and new value anyway! */ + + if (entry && dtuple_coll_eq(*ientry, *entry)) { + break; + } + } + + version = prev_version; + } + + mem_heap_free(heap); + + if (v_heap) { + mem_heap_free(v_heap); + } + + return !!prev_version; +} + /***********************************************************//** Delete marks or removes a secondary index entry if found. @return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ @@ -488,7 +628,6 @@ row_undo_mod_del_mark_or_remove_sec_low( btr_cur_t* btr_cur; dberr_t err = DB_SUCCESS; mtr_t mtr; - mtr_t mtr_vers; const bool modify_leaf = mode == BTR_MODIFY_LEAF; row_mtr_start(&mtr, index, !modify_leaf); @@ -555,17 +694,14 @@ found: which cannot be purged yet, requires its existence. If some requires, we should delete mark the record. */ - mtr_vers.start(); - - ut_a(node->pcur.restore_position(BTR_SEARCH_LEAF, &mtr_vers) == - btr_pcur_t::SAME_ALL); + ut_a(node->pcur.restore_position(BTR_SEARCH_LEAF, &mtr) == + btr_pcur_t::SAME_ALL); /* For temporary table, we can skip to check older version of clustered index entry, because there is no MVCC or purge. 
*/ if (node->table->is_temporary() - || row_vers_old_has_index_entry( - false, btr_pcur_get_rec(&node->pcur), - &mtr_vers, index, entry, 0, 0)) { + || row_undo_mod_sec_is_unsafe( + btr_pcur_get_rec(&node->pcur), index, entry, &mtr)) { btr_rec_set_deleted(btr_cur_get_block(btr_cur), btr_cur_get_rec(btr_cur), &mtr); } else { @@ -599,7 +735,9 @@ found: } } - btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers); + ut_ad(node->pcur.pos_state == BTR_PCUR_IS_POSITIONED); + node->pcur.pos_state = BTR_PCUR_WAS_POSITIONED; + node->pcur.latch_mode = BTR_NO_LATCHES; func_exit: btr_pcur_close(&pcur); diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index a39574d2f64..03118fb25f2 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -702,7 +702,7 @@ fetch; output: fetched length of the prefix @param[in,out] heap heap where to allocate @return BLOB prefix @retval NULL if the record is incomplete (should only happen -in row_vers_vc_matches_cluster() executed concurrently with another purge) */ +in row_purge_vc_matches_cluster() executed concurrently with another purge) */ static byte* row_upd_ext_fetch( diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc index c3acf325f5c..896b7def41d 100644 --- a/storage/innobase/row/row0vers.cc +++ b/storage/innobase/row/row0vers.cc @@ -194,8 +194,8 @@ row_vers_impl_x_locked_low( trx_undo_prev_version_build( version, clust_index, clust_offsets, - heap, &prev_version, NULL, - dict_index_has_virtual(index) ? &vrow : NULL, 0); + heap, &prev_version, mtr, 0, NULL, + dict_index_has_virtual(index) ? &vrow : NULL); ut_d(trx->mutex_lock()); const bool committed = trx_state_eq( @@ -446,7 +446,6 @@ row_vers_impl_x_locked( @param[in] clust_index clustered index @param[in] index the secondary index @param[in] heap heap used to build virtual dtuple. 
*/ -static bool row_vers_build_clust_v_col( dtuple_t* row, @@ -490,26 +489,25 @@ row_vers_build_clust_v_col( } /** Build latest virtual column data from undo log -@param[in] in_purge whether this is the purge thread @param[in] rec clustered index record @param[in] clust_index clustered index @param[in,out] clust_offsets offsets on the clustered index record @param[in] index the secondary index +@param[in] trx_id transaction ID on the purging record, + or 0 if called outside purge @param[in] roll_ptr the rollback pointer for the purging record -@param[in] trx_id trx id for the purging record @param[in,out] v_heap heap used to build vrow @param[out] v_row dtuple holding the virtual rows @param[in,out] mtr mtr holding the latch on rec */ static void row_vers_build_cur_vrow_low( - bool in_purge, const rec_t* rec, dict_index_t* clust_index, rec_offs* clust_offsets, dict_index_t* index, - roll_ptr_t roll_ptr, trx_id_t trx_id, + roll_ptr_t roll_ptr, mem_heap_t* v_heap, dtuple_t** vrow, mtr_t* mtr) @@ -539,7 +537,7 @@ row_vers_build_cur_vrow_low( /* If this is called by purge thread, set TRX_UNDO_PREV_IN_PURGE bit to search the undo log until we hit the current undo log with roll_ptr */ - const ulint status = in_purge + const ulint status = trx_id ? 
TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE : TRX_UNDO_GET_OLD_V_VALUE; @@ -551,7 +549,7 @@ row_vers_build_cur_vrow_low( trx_undo_prev_version_build( version, clust_index, clust_offsets, - heap, &prev_version, NULL, vrow, status); + heap, &prev_version, mtr, status, nullptr, vrow); if (heap2) { mem_heap_free(heap2); @@ -603,212 +601,27 @@ row_vers_build_cur_vrow_low( mem_heap_free(heap); } -/** Check a virtual column value index secondary virtual index matches -that of current cluster index record, which is recreated from information -stored in undo log -@param[in] rec record in the clustered index -@param[in] icentry the index entry built from a cluster row -@param[in] clust_index cluster index -@param[in] clust_offsets offsets on the cluster record -@param[in] index the secondary index -@param[in] ientry the secondary index entry -@param[in] roll_ptr the rollback pointer for the purging record -@param[in] trx_id trx id for the purging record -@param[in,out] v_heap heap used to build virtual dtuple -@param[in,out] v_row dtuple holding the virtual rows (if needed) -@param[in] mtr mtr holding the latch on rec -@return true if matches, false otherwise */ -static -bool -row_vers_vc_matches_cluster( - const rec_t* rec, - const dtuple_t* icentry, - dict_index_t* clust_index, - rec_offs* clust_offsets, - dict_index_t* index, - const dtuple_t* ientry, - roll_ptr_t roll_ptr, - trx_id_t trx_id, - mem_heap_t* v_heap, - dtuple_t** vrow, - mtr_t* mtr) -{ - const rec_t* version; - rec_t* prev_version; - mem_heap_t* heap2; - mem_heap_t* heap = NULL; - mem_heap_t* tuple_heap; - ulint num_v = dict_table_get_n_v_cols(index->table); - bool compare[REC_MAX_N_FIELDS]; - ulint n_fields = dtuple_get_n_fields(ientry); - ulint n_non_v_col = 0; - ulint n_cmp_v_col = 0; - const dfield_t* field1; - dfield_t* field2; - ulint i; - - /* First compare non-virtual columns (primary keys) */ - ut_ad(index->n_fields == n_fields); - ut_ad(n_fields == dtuple_get_n_fields(icentry)); - 
ut_ad(mtr->memo_contains_page_flagged(rec, - MTR_MEMO_PAGE_S_FIX - | MTR_MEMO_PAGE_X_FIX)); - - { - const dfield_t* a = ientry->fields; - const dfield_t* b = icentry->fields; - - for (const dict_field_t *ifield = index->fields, - *const end = &index->fields[index->n_fields]; - ifield != end; ifield++, a++, b++) { - if (!ifield->col->is_virtual()) { - if (cmp_dfield_dfield(a, b)) { - return false; - } - n_non_v_col++; - } - } - } - - tuple_heap = mem_heap_create(1024); - - ut_ad(n_fields > n_non_v_col); - - *vrow = dtuple_create_with_vcol(v_heap ? v_heap : tuple_heap, 0, num_v); - dtuple_init_v_fld(*vrow); - - for (i = 0; i < num_v; i++) { - dfield_get_type(dtuple_get_nth_v_field(*vrow, i))->mtype - = DATA_MISSING; - compare[i] = false; - } - - version = rec; - - while (n_cmp_v_col < n_fields - n_non_v_col) { - heap2 = heap; - heap = mem_heap_create(1024); - roll_ptr_t cur_roll_ptr = row_get_rec_roll_ptr( - version, clust_index, clust_offsets); - - ut_ad(cur_roll_ptr != 0); - ut_ad(roll_ptr != 0); - - trx_undo_prev_version_build( - version, clust_index, clust_offsets, - heap, &prev_version, NULL, vrow, - TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE); - - if (heap2) { - mem_heap_free(heap2); - } - - if (!prev_version) { - /* Versions end here */ - goto func_exit; - } - - clust_offsets = rec_get_offsets(prev_version, clust_index, - NULL, - clust_index->n_core_fields, - ULINT_UNDEFINED, &heap); - - ulint entry_len = dict_index_get_n_fields(index); - - for (i = 0; i < entry_len; i++) { - const dict_field_t* ind_field - = dict_index_get_nth_field(index, i); - const dict_col_t* col = ind_field->col; - field1 = dtuple_get_nth_field(ientry, i); - - if (!col->is_virtual()) { - continue; - } - - const dict_v_col_t* v_col - = reinterpret_cast(col); - field2 - = dtuple_get_nth_v_field(*vrow, v_col->v_pos); - - if ((dfield_get_type(field2)->mtype != DATA_MISSING) - && (!compare[v_col->v_pos])) { - - if (ind_field->prefix_len != 0 - && !dfield_is_null(field2)) { - 
field2->len = unsigned( - dtype_get_at_most_n_mbchars( - field2->type.prtype, - field2->type.mbminlen, - field2->type.mbmaxlen, - ind_field->prefix_len, - field2->len, - static_cast - (field2->data))); - } - - /* The index field mismatch */ - if (v_heap - || cmp_dfield_dfield(field2, field1)) { - if (v_heap) { - dtuple_dup_v_fld(*vrow, v_heap); - } - - mem_heap_free(tuple_heap); - mem_heap_free(heap); - return(false); - } - - compare[v_col->v_pos] = true; - n_cmp_v_col++; - } - } - - trx_id_t rec_trx_id = row_get_rec_trx_id( - prev_version, clust_index, clust_offsets); - - if (rec_trx_id < trx_id || roll_ptr == cur_roll_ptr) { - break; - } - - version = prev_version; - } - -func_exit: - if (n_cmp_v_col == 0) { - *vrow = NULL; - } - - mem_heap_free(tuple_heap); - mem_heap_free(heap); - - /* FIXME: In the case of n_cmp_v_col is not the same as - n_fields - n_non_v_col, callback is needed to compare the rest - columns. At the timebeing, we will need to return true */ - return (true); -} - /** Build a dtuple contains virtual column data for current cluster index @param[in] in_purge called by purge thread @param[in] rec cluster index rec @param[in] clust_index cluster index @param[in] clust_offsets cluster rec offset @param[in] index secondary index +@param[in] trx_id transaction ID on the purging record, + or 0 if called outside purge @param[in] roll_ptr roll_ptr for the purge record -@param[in] trx_id transaction ID on the purging record @param[in,out] heap heap memory -@param[in,out] v_heap heap memory to keep virtual colum dtuple -@param[in] mtr mtr holding the latch on rec +@param[in,out] v_heap heap memory to keep virtual column tuple +@param[in,out] mtr mini-transaction @return dtuple contains virtual column data */ -static dtuple_t* row_vers_build_cur_vrow( - bool in_purge, const rec_t* rec, dict_index_t* clust_index, rec_offs** clust_offsets, dict_index_t* index, - roll_ptr_t roll_ptr, trx_id_t trx_id, + roll_ptr_t roll_ptr, mem_heap_t* heap, mem_heap_t* 
v_heap, mtr_t* mtr) @@ -841,8 +654,8 @@ row_vers_build_cur_vrow( } else { /* Try to fetch virtual column data from undo log */ row_vers_build_cur_vrow_low( - in_purge, rec, clust_index, *clust_offsets, - index, roll_ptr, trx_id, v_heap, &cur_vrow, mtr); + rec, clust_index, *clust_offsets, + index, trx_id, roll_ptr, v_heap, &cur_vrow, mtr); } *clust_offsets = rec_get_offsets(rec, clust_index, NULL, @@ -851,312 +664,28 @@ row_vers_build_cur_vrow( return(cur_vrow); } -/** @return whether two data tuples are equal */ -static bool dtuple_coll_eq(const dtuple_t &tuple1, const dtuple_t &tuple2) -{ - ut_ad(tuple1.magic_n == DATA_TUPLE_MAGIC_N); - ut_ad(tuple2.magic_n == DATA_TUPLE_MAGIC_N); - ut_ad(dtuple_check_typed(&tuple1)); - ut_ad(dtuple_check_typed(&tuple2)); - ut_ad(tuple1.n_fields == tuple2.n_fields); - - for (ulint i= 0; i < tuple1.n_fields; i++) - if (cmp_dfield_dfield(&tuple1.fields[i], &tuple2.fields[i])) - return false; - return true; -} - /** Find out whether data tuple has missing data type for indexed virtual column. 
@param tuple data tuple @param index virtual index @return true if tuple has missing column type */ -static bool dtuple_vcol_data_missing(const dtuple_t &tuple, - dict_index_t *index) +bool dtuple_vcol_data_missing(const dtuple_t &tuple, + const dict_index_t &index) { - for (ulint i= 0; i < index->n_uniq; i++) + for (ulint i= 0; i < index.n_uniq; i++) { - dict_col_t *col= index->fields[i].col; + dict_col_t *col= index.fields[i].col; if (!col->is_virtual()) continue; dict_v_col_t *vcol= reinterpret_cast(col); - for (ulint j= 0; j < index->table->n_v_cols; j++) - { - if (vcol == &index->table->v_cols[j] - && tuple.v_fields[j].type.mtype == DATA_MISSING) + for (ulint j= 0; j < index.table->n_v_cols; j++) + if (vcol == &index.table->v_cols[j] && + tuple.v_fields[j].type.mtype == DATA_MISSING) return true; - } } return false; } -/** Finds out if a version of the record, where the version >= the current -purge_sys.view, should have ientry as its secondary index entry. We check -if there is any not delete marked version of the record where the trx -id >= purge view, and the secondary index entry == ientry; exactly in -this case we return TRUE. 
-@param[in] also_curr TRUE if also rec is included in the versions - to search; otherwise only versions prior - to it are searched -@param[in] rec record in the clustered index; the caller - must have a latch on the page -@param[in] mtr mtr holding the latch on rec; it will - also hold the latch on purge_view -@param[in] index secondary index -@param[in] ientry secondary index entry -@param[in] roll_ptr roll_ptr for the purge record -@param[in] trx_id transaction ID on the purging record -@return TRUE if earlier version should have */ -bool -row_vers_old_has_index_entry( - bool also_curr, - const rec_t* rec, - mtr_t* mtr, - dict_index_t* index, - const dtuple_t* ientry, - roll_ptr_t roll_ptr, - trx_id_t trx_id) -{ - const rec_t* version; - rec_t* prev_version; - dict_index_t* clust_index; - rec_offs* clust_offsets; - mem_heap_t* heap; - mem_heap_t* heap2; - dtuple_t* row; - const dtuple_t* entry; - ulint comp; - dtuple_t* vrow = NULL; - mem_heap_t* v_heap = NULL; - dtuple_t* cur_vrow = NULL; - - ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX - | MTR_MEMO_PAGE_S_FIX)); - clust_index = dict_table_get_first_index(index->table); - - comp = page_rec_is_comp(rec); - ut_ad(!dict_table_is_comp(index->table) == !comp); - heap = mem_heap_create(1024); - clust_offsets = rec_get_offsets(rec, clust_index, NULL, - clust_index->n_core_fields, - ULINT_UNDEFINED, &heap); - - if (dict_index_has_virtual(index)) { - v_heap = mem_heap_create(100); - } - - DBUG_EXECUTE_IF("ib_purge_virtual_index_crash", - DBUG_SUICIDE();); - - if (also_curr && !rec_get_deleted_flag(rec, comp)) { - row_ext_t* ext; - - /* The top of the stack of versions is locked by the - mtr holding a latch on the page containing the - clustered index record. The bottom of the stack is - locked by the fact that the purge_sys.view must - 'overtake' any read view of an active transaction. - Thus, it is safe to fetch the prefixes for - externally stored columns. 
*/ - row = row_build(ROW_COPY_POINTERS, clust_index, - rec, clust_offsets, - NULL, NULL, NULL, &ext, heap); - - if (dict_index_has_virtual(index)) { - - -#ifdef DBUG_OFF -# define dbug_v_purge false -#else /* DBUG_OFF */ - bool dbug_v_purge = false; -#endif /* DBUG_OFF */ - - DBUG_EXECUTE_IF( - "ib_purge_virtual_index_callback", - dbug_v_purge = true;); - - roll_ptr_t t_roll_ptr = row_get_rec_roll_ptr( - rec, clust_index, clust_offsets); - - /* if the row is newly inserted, then the virtual - columns need to be computed */ - if (trx_undo_roll_ptr_is_insert(t_roll_ptr) - || dbug_v_purge) { - - if (!row_vers_build_clust_v_col( - row, clust_index, index, heap)) { - goto unsafe_to_purge; - } - - entry = row_build_index_entry( - row, ext, index, heap); - if (entry && dtuple_coll_eq(*ientry, *entry)) { - goto unsafe_to_purge; - } - } else { - /* Build index entry out of row */ - entry = row_build_index_entry(row, ext, index, heap); - /* entry could only be NULL if - the clustered index record is an uncommitted - inserted record whose BLOBs have not been - written yet. The secondary index record - can be safely removed, because it cannot - possibly refer to this incomplete - clustered index record. (Insert would - always first be completed for the - clustered index record, then proceed to - secondary indexes.) */ - - if (entry && row_vers_vc_matches_cluster( - rec, entry, - clust_index, clust_offsets, - index, ientry, roll_ptr, - trx_id, NULL, &vrow, mtr)) { - goto unsafe_to_purge; - } - } - clust_offsets = rec_get_offsets(rec, clust_index, NULL, - clust_index - ->n_core_fields, - ULINT_UNDEFINED, &heap); - } else { - - entry = row_build_index_entry( - row, ext, index, heap); - - /* If entry == NULL, the record contains unset BLOB - pointers. This must be a freshly inserted record. If - this is called from - row_purge_remove_sec_if_poss_low(), the thread will - hold latches on the clustered index and the secondary - index. 
Because the insert works in three steps: - - (1) insert the record to clustered index - (2) store the BLOBs and update BLOB pointers - (3) insert records to secondary indexes - - the purge thread can safely ignore freshly inserted - records and delete the secondary index record. The - thread that inserted the new record will be inserting - the secondary index records. */ - - /* NOTE that we cannot do the comparison as binary - fields because the row is maybe being modified so that - the clustered index record has already been updated to - a different binary value in a char field, but the - collation identifies the old and new value anyway! */ - if (entry && dtuple_coll_eq(*ientry, *entry)) { -unsafe_to_purge: - mem_heap_free(heap); - - if (v_heap) { - mem_heap_free(v_heap); - } - return true; - } - } - } else if (dict_index_has_virtual(index)) { - /* The current cluster index record could be - deleted, but the previous version of it might not. We will - need to get the virtual column data from undo record - associated with current cluster index */ - - cur_vrow = row_vers_build_cur_vrow( - also_curr, rec, clust_index, &clust_offsets, - index, roll_ptr, trx_id, heap, v_heap, mtr); - } - - version = rec; - - for (;;) { - heap2 = heap; - heap = mem_heap_create(1024); - vrow = NULL; - - trx_undo_prev_version_build(version, - clust_index, clust_offsets, - heap, &prev_version, nullptr, - dict_index_has_virtual(index) - ? 
&vrow : nullptr, - TRX_UNDO_CHECK_PURGEABILITY); - mem_heap_free(heap2); /* free version and clust_offsets */ - - if (!prev_version) { - /* Versions end here */ - mem_heap_free(heap); - - if (v_heap) { - mem_heap_free(v_heap); - } - - return false; - } - - clust_offsets = rec_get_offsets(prev_version, clust_index, - NULL, - clust_index->n_core_fields, - ULINT_UNDEFINED, &heap); - - if (dict_index_has_virtual(index)) { - if (vrow) { - if (dtuple_vcol_data_missing(*vrow, index)) { - goto nochange_index; - } - /* Keep the virtual row info for the next - version, unless it is changed */ - mem_heap_empty(v_heap); - cur_vrow = dtuple_copy(vrow, v_heap); - dtuple_dup_v_fld(cur_vrow, v_heap); - } - - if (!cur_vrow) { - /* Nothing for this index has changed, - continue */ -nochange_index: - version = prev_version; - continue; - } - } - - if (!rec_get_deleted_flag(prev_version, comp)) { - row_ext_t* ext; - - /* The stack of versions is locked by mtr. - Thus, it is safe to fetch the prefixes for - externally stored columns. */ - row = row_build(ROW_COPY_POINTERS, clust_index, - prev_version, clust_offsets, - NULL, NULL, NULL, &ext, heap); - - if (dict_index_has_virtual(index)) { - ut_ad(cur_vrow); - ut_ad(row->n_v_fields == cur_vrow->n_v_fields); - dtuple_copy_v_fields(row, cur_vrow); - } - - entry = row_build_index_entry(row, ext, index, heap); - - /* If entry == NULL, the record contains unset - BLOB pointers. This must be a freshly - inserted record that we can safely ignore. - For the justification, see the comments after - the previous row_build_index_entry() call. */ - - /* NOTE that we cannot do the comparison as binary - fields because maybe the secondary index record has - already been updated to a different binary value in - a char field, but the collation identifies the old - and new value anyway! 
*/ - - if (entry && dtuple_coll_eq(*ientry, *entry)) { - goto unsafe_to_purge; - } - } - - version = prev_version; - } -} - /*****************************************************************//** Constructs the version of a clustered index record which a consistent read should see. We assume that the trx id stored in rec is such that @@ -1223,7 +752,7 @@ row_vers_build_for_consistent_read( err = trx_undo_prev_version_build( version, index, *offsets, heap, - &prev_version, NULL, vrow, 0); + &prev_version, mtr, 0, NULL, vrow); if (prev_heap != NULL) { mem_heap_free(prev_heap); @@ -1385,8 +914,8 @@ committed_version_trx: heap = mem_heap_create(1024); if (trx_undo_prev_version_build(version, index, *offsets, heap, - &prev_version, in_heap, vrow, - 0) != DB_SUCCESS) { + &prev_version, mtr, 0, + in_heap, vrow) != DB_SUCCESS) { mem_heap_free(heap); heap = heap2; heap2 = NULL; diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 2bbc3d1daf8..f15523dc0ae 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -1166,10 +1166,9 @@ bool purge_sys_t::running() void purge_sys_t::stop_FTS() { - latch.rd_lock(SRW_LOCK_CALL); - m_FTS_paused++; - latch.rd_unlock(); - while (m_active) + ut_d(const auto paused=) m_FTS_paused.fetch_add(1); + ut_ad((paused + 1) & ~PAUSED_SYS); + while (m_active.load(std::memory_order_acquire)) std::this_thread::sleep_for(std::chrono::seconds(1)); } @@ -1203,8 +1202,8 @@ void purge_sys_t::stop() /** Resume purge in data dictionary tables */ void purge_sys_t::resume_SYS(void *) { - ut_d(auto paused=) purge_sys.m_SYS_paused--; - ut_ad(paused); + ut_d(auto paused=) purge_sys.m_FTS_paused.fetch_sub(PAUSED_SYS); + ut_ad(paused >= PAUSED_SYS); } /** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */ @@ -1425,7 +1424,6 @@ static bool srv_purge_should_exit(size_t old_history_size) /*********************************************************************//** Fetch and execute a task from the 
work queue. -@param [in,out] slot purge worker thread slot @return true if a task was executed */ static bool srv_task_execute() { @@ -1566,6 +1564,13 @@ static void release_thd(THD *thd, void *ctx) set_current_thd(0); } +void srv_purge_worker_task_low() +{ + ut_ad(current_thd); + while (srv_task_execute()) + ut_ad(purge_sys.running()); +} + static void purge_worker_callback(void*) { ut_ad(!current_thd); @@ -1573,8 +1578,7 @@ static void purge_worker_callback(void*) ut_ad(srv_force_recovery < SRV_FORCE_NO_BACKGROUND); void *ctx; THD *thd= acquire_thd(&ctx); - while (srv_task_execute()) - ut_ad(purge_sys.running()); + srv_purge_worker_task_low(); release_thd(thd,ctx); } diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index f32f4de5173..65583c284e6 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -777,26 +777,18 @@ not_free: buf_block_t *purge_sys_t::get_page(page_id_t id) { + ut_ad(!recv_sys.recovery_on); + buf_block_t*& undo_page= pages[id]; - if (undo_page) - return undo_page; - - mtr_t mtr; - mtr.start(); - undo_page= - buf_page_get_gen(id, 0, RW_S_LATCH, nullptr, BUF_GET_POSSIBLY_FREED, &mtr); - - if (UNIV_LIKELY(undo_page != nullptr)) + if (!undo_page) { - undo_page->fix(); - mtr.commit(); - return undo_page; + undo_page= buf_pool.page_fix(id); // batch_cleanup() will unfix() + if (!undo_page) + pages.erase(id); } - mtr.commit(); - pages.erase(id); - return nullptr; + return undo_page; } bool purge_sys_t::rseg_get_next_history_log() @@ -1066,15 +1058,8 @@ static void trx_purge_close_tables(purge_node_t *node, THD *thd) void purge_sys_t::wait_FTS(bool also_sys) { - bool paused; - do - { - latch.wr_lock(SRW_LOCK_CALL); - paused= m_FTS_paused || (also_sys && m_SYS_paused); - latch.wr_unlock(); + for (const uint32_t mask= also_sys ? 
~0U : ~PAUSED_SYS; m_FTS_paused & mask;) std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - while (paused); } __attribute__((nonnull)) @@ -1215,123 +1200,108 @@ dict_table_t *purge_sys_t::close_and_reopen(table_id_t id, THD *thd, /** Run a purge batch. @param n_purge_threads number of purge threads +@param thd purge coordinator thread handle +@param n_work_items number of work items (currently tables) to process @return new purge_sys.head */ -static purge_sys_t::iterator -trx_purge_attach_undo_recs(ulint n_purge_threads, THD *thd) +static purge_sys_t::iterator trx_purge_attach_undo_recs(THD *thd, + ulint *n_work_items) { - que_thr_t* thr; - ulint i; + que_thr_t *thr; + purge_sys_t::iterator head= purge_sys.tail; - ut_a(n_purge_threads > 0); - ut_a(UT_LIST_GET_LEN(purge_sys.query->thrs) >= n_purge_threads); + /* Fetch and parse the UNDO records. The UNDO records are added + to a per purge node vector. */ + thr= nullptr; - purge_sys_t::iterator head = purge_sys.tail; + std::unordered_map + table_id_map(TRX_PURGE_TABLE_BUCKETS); + purge_sys.m_active= true; + + MDL_context *const mdl_context= + static_cast(thd_mdl_context(thd)); + ut_ad(mdl_context); + + const size_t max_pages= + std::min(buf_pool.curr_size * 3 / 4, size_t{srv_purge_batch_size}); + + while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) + { + /* Track the max {trx_id, undo_no} for truncating the + UNDO logs once we have purged the records. */ + + if (head <= purge_sys.tail) + head= purge_sys.tail; + + /* Fetch the next record, and advance the purge_sys.tail. 
*/ + trx_purge_rec_t purge_rec= purge_sys.fetch_next_rec(); + + if (!purge_rec.undo_rec) + { + if (!purge_rec.roll_ptr) + break; + ut_ad(purge_rec.roll_ptr == 1); + continue; + } + + table_id_t table_id= trx_undo_rec_get_table_id(purge_rec.undo_rec); + + purge_node_t *&table_node= table_id_map[table_id]; + if (table_node) + ut_ad(!table_node->in_progress); + if (!table_node) + { + std::pair p; + p.first= trx_purge_table_open(table_id, mdl_context, &p.second); + if (p.first == reinterpret_cast(-1)) + p.first= purge_sys.close_and_reopen(table_id, thd, &p.second); + + if (!thr || !(thr= UT_LIST_GET_NEXT(thrs, thr))) + thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); + ++*n_work_items; + table_node= static_cast(thr->child); + + ut_a(que_node_get_type(table_node) == QUE_NODE_PURGE); + ut_d(auto pair=) table_node->tables.emplace(table_id, p); + ut_ad(pair.second); + if (p.first) + goto enqueue; + } + else if (table_node->tables[table_id].first) + { + enqueue: + table_node->undo_recs.push(purge_rec); + ut_ad(!table_node->in_progress); + } + + if (purge_sys.n_pages_handled() >= max_pages) + break; + } + + purge_sys.m_active= false; #ifdef UNIV_DEBUG - i = 0; - /* Debug code to validate some pre-requisites and reset done flag. */ - for (thr = UT_LIST_GET_FIRST(purge_sys.query->thrs); - thr != NULL && i < n_purge_threads; - thr = UT_LIST_GET_NEXT(thrs, thr), ++i) { + thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); + for (ulint i= 0; thr && i < *n_work_items; + i++, thr= UT_LIST_GET_NEXT(thrs, thr)) + { + purge_node_t *node= static_cast(thr->child); + ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); + ut_ad(!node->in_progress); + node->in_progress= true; + } - purge_node_t* node; - - /* Get the purge node. 
*/ - node = (purge_node_t*) thr->child; - - ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); - ut_ad(node->undo_recs.empty()); - ut_ad(!node->in_progress); - ut_d(node->in_progress = true); - } - - /* There should never be fewer nodes than threads, the inverse - however is allowed because we only use purge threads as needed. */ - ut_ad(i == n_purge_threads); + for (; thr; thr= UT_LIST_GET_NEXT(thrs, thr)) + { + purge_node_t *node= static_cast(thr->child); + ut_ad(que_node_get_type(node) == QUE_NODE_PURGE); + ut_ad(!node->in_progress); + ut_ad(node->undo_recs.empty()); + } #endif - /* Fetch and parse the UNDO records. The UNDO records are added - to a per purge node vector. */ - thr = UT_LIST_GET_FIRST(purge_sys.query->thrs); + ut_ad(head <= purge_sys.tail); - ut_ad(head <= purge_sys.tail); - - i = 0; - - std::unordered_map - table_id_map(TRX_PURGE_TABLE_BUCKETS); - purge_sys.m_active = true; - - MDL_context* const mdl_context - = static_cast(thd_mdl_context(thd)); - ut_ad(mdl_context); - - const size_t max_pages = std::min(buf_pool.curr_size * 3 / 4, - size_t{srv_purge_batch_size}); - - while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) { - /* Track the max {trx_id, undo_no} for truncating the - UNDO logs once we have purged the records. */ - - if (head <= purge_sys.tail) { - head = purge_sys.tail; - } - - /* Fetch the next record, and advance the purge_sys.tail. 
*/ - trx_purge_rec_t purge_rec = purge_sys.fetch_next_rec(); - - if (!purge_rec.undo_rec) { - if (!purge_rec.roll_ptr) { - break; - } - ut_ad(purge_rec.roll_ptr == 1); - continue; - } - - table_id_t table_id = trx_undo_rec_get_table_id( - purge_rec.undo_rec); - - purge_node_t*& table_node = table_id_map[table_id]; - - if (!table_node) { - std::pair p; - p.first = trx_purge_table_open(table_id, mdl_context, - &p.second); - if (p.first == reinterpret_cast(-1)) { - p.first = purge_sys.close_and_reopen( - table_id, thd, &p.second); - } - - thr = UT_LIST_GET_NEXT(thrs, thr); - - if (!(++i % n_purge_threads)) { - thr = UT_LIST_GET_FIRST( - purge_sys.query->thrs); - } - - table_node = static_cast(thr->child); - ut_a(que_node_get_type(table_node) == QUE_NODE_PURGE); - ut_d(auto i=) - table_node->tables.emplace(table_id, p); - ut_ad(i.second); - if (p.first) { - goto enqueue; - } - } else if (table_node->tables[table_id].first) { -enqueue: - table_node->undo_recs.push(purge_rec); - } - - if (purge_sys.n_pages_handled() >= max_pages) { - break; - } - } - - purge_sys.m_active = false; - - ut_ad(head <= purge_sys.tail); - - return head; + return head; } extern tpool::waitable_task purge_worker_task; @@ -1389,68 +1359,89 @@ Run a purge batch. @return number of undo log pages handled in the batch */ TRANSACTIONAL_TARGET ulint trx_purge(ulint n_tasks, ulint history_size) { - ut_ad(n_tasks > 0); + ut_ad(n_tasks > 0); - purge_sys.clone_oldest_view(); + purge_sys.clone_oldest_view(); -#ifdef UNIV_DEBUG - if (srv_purge_view_update_only_debug) { - return(0); - } -#endif /* UNIV_DEBUG */ + ut_d(if (srv_purge_view_update_only_debug) return 0); - THD* const thd = current_thd; + THD *const thd= current_thd; - /* Fetch the UNDO recs that need to be purged. */ - const purge_sys_t::iterator head - = trx_purge_attach_undo_recs(n_tasks, thd); - const size_t n_pages = purge_sys.n_pages_handled(); + /* Fetch the UNDO recs that need to be purged. 
*/ + ulint n_work= 0; + const purge_sys_t::iterator head= trx_purge_attach_undo_recs(thd, &n_work); + const size_t n_pages= purge_sys.n_pages_handled(); - { - ulint delay = n_pages ? srv_max_purge_lag : 0; - if (UNIV_UNLIKELY(delay)) { - if (delay >= history_size) { - no_throttle: - delay = 0; - } else if (const ulint max_delay = - srv_max_purge_lag_delay) { - delay = std::min(max_delay, - 10000 * history_size / delay - - 5000); - } else { - goto no_throttle; - } - } - srv_dml_needed_delay = delay; - } + { + ulint delay= n_pages ? srv_max_purge_lag : 0; + if (UNIV_UNLIKELY(delay)) + { + if (delay >= history_size) + no_throttle: + delay= 0; + else if (const ulint max_delay= srv_max_purge_lag_delay) + delay= std::min(max_delay, 10000 * history_size / delay - 5000); + else + goto no_throttle; + } + srv_dml_needed_delay= delay; + } - que_thr_t* thr = nullptr; + ut_ad(n_tasks); + que_thr_t *thr= nullptr; - /* Submit tasks to workers queue if using multi-threaded purge. */ - for (ulint i = n_tasks; --i; ) { - thr = que_fork_scheduler_round_robin(purge_sys.query, thr); - ut_a(thr); - srv_que_task_enqueue_low(thr); - srv_thread_pool->submit_task(&purge_worker_task); - } + if (n_work) + { + for (auto i= n_work; i--; ) + { + if (!thr) + thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); + else + thr= UT_LIST_GET_NEXT(thrs, thr); - thr = que_fork_scheduler_round_robin(purge_sys.query, thr); + if (!thr) + break; - que_run_threads(thr); + ut_ad(thr->state == QUE_THR_COMPLETED); + thr->state= QUE_THR_RUNNING; + thr->run_node= thr; + thr->prev_node= thr->common.parent; + purge_sys.query->state= QUE_FORK_ACTIVE; + purge_sys.query->last_sel_node= nullptr; + srv_que_task_enqueue_low(thr); + } - trx_purge_wait_for_workers_to_complete(); + /* + To reduce context switches we only submit at most n_tasks-1 worker task. 
+ (we can use less tasks, if there is not enough work) - for (thr = UT_LIST_GET_FIRST(purge_sys.query->thrs); thr; - thr = UT_LIST_GET_NEXT(thrs, thr)) { - purge_node_t* node = static_cast(thr->child); - trx_purge_close_tables(node, thd); - node->tables.clear(); - } + The coordinator does worker's job, instead of waiting and sitting idle, + then waits for all others to finish. - purge_sys.batch_cleanup(head); + This also means if innodb_purge_threads=1, the coordinator does all + the work alone. + */ + const ulint workers{std::min(n_work, n_tasks) - 1}; + for (ulint i= 0; i < workers; i++) + srv_thread_pool->submit_task(&purge_worker_task); + srv_purge_worker_task_low(); - MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1); - MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages); + if (workers) + trx_purge_wait_for_workers_to_complete(); - return n_pages; + for (thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); thr && n_work--; + thr= UT_LIST_GET_NEXT(thrs, thr)) + { + purge_node_t *node= static_cast(thr->child); + trx_purge_close_tables(node, thd); + node->tables.clear(); + } + } + + purge_sys.batch_cleanup(head); + + MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1); + MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages); + + return n_pages; } diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc index cf6c050e623..b933e5cd3a8 100644 --- a/storage/innobase/trx/trx0rec.cc +++ b/storage/innobase/trx/trx0rec.cc @@ -2045,170 +2045,128 @@ err_exit: /*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/ -/** Copy an undo record to heap. 
-@param[in] roll_ptr roll pointer to a record that exists -@param[in,out] heap memory heap where copied */ -static -trx_undo_rec_t* -trx_undo_get_undo_rec_low( - roll_ptr_t roll_ptr, - mem_heap_t* heap) +static dberr_t trx_undo_prev_version(const rec_t *rec, dict_index_t *index, + rec_offs *offsets, mem_heap_t *heap, + rec_t **old_vers, mem_heap_t *v_heap, + dtuple_t **vrow, ulint v_status, + const trx_undo_rec_t *undo_rec); + +inline const buf_block_t * +purge_sys_t::view_guard::get(const page_id_t id, mtr_t *mtr) { - ulint rseg_id; - uint32_t page_no; - uint16_t offset; - bool is_insert; - mtr_t mtr; - - trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no, &offset); - ut_ad(page_no > FSP_FIRST_INODE_PAGE_NO); - ut_ad(offset >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); - trx_rseg_t *rseg= &trx_sys.rseg_array[rseg_id]; - ut_ad(rseg->is_persistent()); - - mtr.start(); - - trx_undo_rec_t *undo_rec= nullptr; - if (buf_block_t* undo_page= - buf_page_get(page_id_t(rseg->space->id, page_no), 0, RW_S_LATCH, &mtr)) + buf_block_t *block; + ut_ad(mtr->is_active()); + if (!latch) { - buf_page_make_young_if_needed(&undo_page->page); - undo_rec= undo_page->page.frame + offset; - const size_t end= mach_read_from_2(undo_rec); - if (UNIV_UNLIKELY(end <= offset || - end >= srv_page_size - FIL_PAGE_DATA_END)) - undo_rec= nullptr; - else + decltype(purge_sys.pages)::const_iterator i= purge_sys.pages.find(id); + if (i != purge_sys.pages.end()) { - size_t len{end - offset}; - undo_rec= - static_cast(mem_heap_dup(heap, undo_rec, len)); - mach_write_to_2(undo_rec, len); + block= i->second; + ut_ad(block); + return block; } } - - mtr.commit(); - return undo_rec; -} - -/** Copy an undo record to heap, to check if a secondary index record -can be safely purged. 
-@param trx_id DB_TRX_ID corresponding to roll_ptr -@param name table name -@param roll_ptr DB_ROLL_PTR pointing to the undo log record -@param heap memory heap for allocation -@return copy of the record -@retval nullptr if the version is visible to purge_sys.view */ -static trx_undo_rec_t *trx_undo_get_rec_if_purgeable(trx_id_t trx_id, - const table_name_t &name, - roll_ptr_t roll_ptr, - mem_heap_t* heap) -{ + block= buf_pool.page_fix(id); + if (block) { - purge_sys_t::view_guard check; - if (!check.view().changes_visible(trx_id)) - return trx_undo_get_undo_rec_low(roll_ptr, heap); + mtr->memo_push(block, MTR_MEMO_BUF_FIX); + if (latch) + /* In MVCC operations (outside purge tasks), we will refresh the + buf_pool.LRU position. In purge, we expect the page to be freed + soon, at the end of the current batch. */ + buf_page_make_young_if_needed(&block->page); } - return nullptr; -} - -/** Copy an undo record to heap. -@param trx_id DB_TRX_ID corresponding to roll_ptr -@param name table name -@param roll_ptr DB_ROLL_PTR pointing to the undo log record -@param heap memory heap for allocation -@return copy of the record -@retval nullptr if the undo log is not available */ -static trx_undo_rec_t *trx_undo_get_undo_rec(trx_id_t trx_id, - const table_name_t &name, - roll_ptr_t roll_ptr, - mem_heap_t *heap) -{ - { - purge_sys_t::end_view_guard check; - if (!check.view().changes_visible(trx_id)) - return trx_undo_get_undo_rec_low(roll_ptr, heap); - } - return nullptr; + return block; } /** Build a previous version of a clustered index record. The caller must hold a latch on the index page of the clustered index record. 
-@param rec version of a clustered index record -@param index clustered index -@param offsets rec_get_offsets(rec, index) -@param heap memory heap from which the memory needed is - allocated -@param old_vers previous version or NULL if rec is the - first inserted version, or if history data - has been deleted (an error), or if the purge - could have removed the version - though it has not yet done so -@param v_heap memory heap used to create vrow - dtuple if it is not yet created. This heap - diffs from "heap" above in that it could be - prebuilt->old_vers_heap for selection -@param v_row virtual column info, if any -@param v_status status determine if it is going into this - function by purge thread or not. - And if we read "after image" of undo log -@param undo_block undo log block which was cached during - online dml apply or nullptr +@param rec version of a clustered index record +@param index clustered index +@param offsets rec_get_offsets(rec, index) +@param heap memory heap from which the memory needed is allocated +@param old_vers previous version, or NULL if rec is the first inserted + version, or if history data has been deleted (an error), + or if the purge could have removed the version though + it has not yet done so +@param mtr mini-transaction +@param v_status TRX_UNDO_PREV_IN_PURGE, ... +@param v_heap memory heap used to create vrow dtuple if it is not yet + created. 
This heap diffs from "heap" above in that it could be + prebuilt->old_vers_heap for selection +@param vrow virtual column info, if any @return error code @retval DB_SUCCESS if previous version was successfully built, or if it was an insert or the undo record refers to the table before rebuild @retval DB_MISSING_HISTORY if the history is missing */ TRANSACTIONAL_TARGET -dberr_t -trx_undo_prev_version_build( - const rec_t *rec, - dict_index_t *index, - rec_offs *offsets, - mem_heap_t *heap, - rec_t **old_vers, - mem_heap_t *v_heap, - dtuple_t **vrow, - ulint v_status) +dberr_t trx_undo_prev_version_build(const rec_t *rec, dict_index_t *index, + rec_offs *offsets, mem_heap_t *heap, + rec_t **old_vers, mtr_t *mtr, + ulint v_status, + mem_heap_t *v_heap, dtuple_t **vrow) { - dtuple_t* entry; - trx_id_t rec_trx_id; - undo_no_t undo_no; - table_id_t table_id; - trx_id_t trx_id; - roll_ptr_t roll_ptr; - upd_t* update; - byte type; - byte info_bits; - byte cmpl_info; - bool dummy_extern; - byte* buf; + ut_ad(!index->table->is_temporary()); + ut_ad(rec_offs_validate(rec, index, offsets)); - ut_ad(!index->table->is_temporary()); - ut_ad(rec_offs_validate(rec, index, offsets)); + const roll_ptr_t roll_ptr= row_get_rec_roll_ptr(rec, index, offsets); + *old_vers= nullptr; - roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); + if (trx_undo_roll_ptr_is_insert(roll_ptr)) + /* The record rec is the first inserted version */ + return DB_SUCCESS; - *old_vers = NULL; + ut_ad(roll_ptr < 1ULL << 55); + ut_ad(uint16_t(roll_ptr) >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE); + ut_ad(uint32_t(roll_ptr >> 16) >= FSP_FIRST_INODE_PAGE_NO); - if (trx_undo_roll_ptr_is_insert(roll_ptr)) { - /* The record rec is the first inserted version */ - return DB_SUCCESS; - } + const trx_id_t rec_trx_id= row_get_rec_trx_id(rec, index, offsets); - mariadb_increment_undo_records_read(); - rec_trx_id = row_get_rec_trx_id(rec, index, offsets); + ut_ad(!index->table->skip_alter_undo); - 
ut_ad(!index->table->skip_alter_undo); + mariadb_increment_undo_records_read(); + const auto savepoint= mtr->get_savepoint(); + dberr_t err= DB_MISSING_HISTORY; + purge_sys_t::view_guard check{v_status == TRX_UNDO_CHECK_PURGE_PAGES + ? purge_sys_t::view_guard::PURGE + : v_status == TRX_UNDO_CHECK_PURGEABILITY + ? purge_sys_t::view_guard::VIEW + : purge_sys_t::view_guard::END_VIEW}; + if (!check.view().changes_visible(rec_trx_id)) + { + trx_undo_rec_t *undo_rec= nullptr; + static_assert(ROLL_PTR_RSEG_ID_POS == 48, ""); + static_assert(ROLL_PTR_PAGE_POS == 16, ""); + if (const buf_block_t *undo_page= + check.get(page_id_t{trx_sys.rseg_array[(roll_ptr >> 48) & 0x7f]. + space->id, + uint32_t(roll_ptr >> 16)}, mtr)) + { + static_assert(ROLL_PTR_BYTE_POS == 0, ""); + const uint16_t offset{uint16_t(roll_ptr)}; + undo_rec= undo_page->page.frame + offset; + const size_t end= mach_read_from_2(undo_rec); + if (UNIV_UNLIKELY(end > offset && + end < srv_page_size - FIL_PAGE_DATA_END)) + err= trx_undo_prev_version(rec, index, offsets, heap, + old_vers, v_heap, vrow, v_status, undo_rec); + } + } - trx_undo_rec_t* undo_rec = v_status == TRX_UNDO_CHECK_PURGEABILITY - ? 
trx_undo_get_rec_if_purgeable(rec_trx_id, index->table->name, - roll_ptr, heap) - : trx_undo_get_undo_rec(rec_trx_id, index->table->name, - roll_ptr, heap); - if (!undo_rec) { - return DB_MISSING_HISTORY; - } + mtr->rollback_to_savepoint(savepoint); + return err; +} +static dberr_t trx_undo_prev_version(const rec_t *rec, dict_index_t *index, + rec_offs *offsets, mem_heap_t *heap, + rec_t **old_vers, mem_heap_t *v_heap, + dtuple_t **vrow, ulint v_status, + const trx_undo_rec_t *undo_rec) +{ + byte type, cmpl_info; + bool dummy_extern; + undo_no_t undo_no; + table_id_t table_id; const byte *ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info, &dummy_extern, &undo_no, &table_id); @@ -2220,6 +2178,10 @@ trx_undo_prev_version_build( return DB_SUCCESS; } + trx_id_t trx_id; + roll_ptr_t roll_ptr; + byte info_bits; + ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, &info_bits); @@ -2247,10 +2209,12 @@ trx_undo_prev_version_build( ptr = trx_undo_rec_skip_row_ref(ptr, index); + upd_t* update; ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id, roll_ptr, info_bits, heap, &update); ut_a(ptr); + byte* buf; if (row_upd_changes_field_size_or_external(index, offsets, update)) { /* We should confirm the existence of disowned external data, @@ -2276,9 +2240,10 @@ trx_undo_prev_version_build( those fields that update updates to become externally stored fields. Store the info: */ - entry = row_rec_to_index_entry(rec, index, offsets, heap); + dtuple_t* entry = row_rec_to_index_entry(rec, index, offsets, + heap); /* The page containing the clustered index record - corresponding to entry is latched in mtr. Thus the + corresponding to entry is latched. Thus the following call is safe. 
*/ if (!row_upd_index_replace_new_col_vals(entry, *index, update, heap)) { diff --git a/storage/innobase/unittest/innodb_sync-t.cc b/storage/innobase/unittest/innodb_sync-t.cc index d0289086b24..5ad726d8429 100644 --- a/storage/innobase/unittest/innodb_sync-t.cc +++ b/storage/innobase/unittest/innodb_sync-t.cc @@ -92,6 +92,25 @@ static void test_ssux_lock() ssux.wr_u_downgrade(); ssux.u_unlock(); } + + for (auto j= M_ROUNDS; j--; ) + { + ssux.rd_lock(); + assert(!critical); + if (ssux.rd_u_upgrade_try()) + { + assert(!critical); + ssux.rd_unlock(); + ssux.u_wr_upgrade(); + assert(!critical); + critical= true; + critical= false; + ssux.wr_u_downgrade(); + ssux.u_rd_downgrade(); + } + assert(!critical); + ssux.rd_unlock(); + } } } @@ -129,6 +148,14 @@ static void test_sux_lock() critical= false; sux.x_u_downgrade(); sux.u_unlock(); + sux.s_lock(); + std::ignore= sux.s_x_upgrade(); + assert(!critical); + sux.x_lock(); + critical= true; + sux.x_unlock(); + critical= false; + sux.x_unlock(); } } } diff --git a/storage/maria/ma_bitmap.c b/storage/maria/ma_bitmap.c index 91f93755b40..130bc7b8722 100644 --- a/storage/maria/ma_bitmap.c +++ b/storage/maria/ma_bitmap.c @@ -3075,21 +3075,25 @@ my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info, int _ma_bitmap_create_first(MARIA_SHARE *share) { uint block_size= share->bitmap.block_size; + size_t error; File file= share->bitmap.file.file; - uchar marker[CRC_SIZE]; + uchar *temp_buff; + + if (!(temp_buff= (uchar*) my_alloca(block_size))) + return 1; + bzero(temp_buff, block_size); /* Next write operation of the page will write correct CRC if it is needed */ - int4store(marker, MARIA_NO_CRC_BITMAP_PAGE); + int4store(temp_buff + block_size - CRC_SIZE, MARIA_NO_CRC_BITMAP_PAGE); - if (mysql_file_chsize(file, block_size - sizeof(marker), - 0, MYF(MY_WME)) || - my_pwrite(file, marker, sizeof(marker), - block_size - sizeof(marker), - MYF(MY_NABP | MY_WME))) + error= my_pwrite(file, temp_buff, block_size, 0, MYF(MY_NABP | MY_WME)); + 
my_afree(temp_buff); + if (error) return 1; + share->state.state.data_file_length= block_size; _ma_bitmap_delete_all(share); return 0; diff --git a/storage/maria/ma_check.c b/storage/maria/ma_check.c index fb4bc05a863..4eb85fb7a8e 100644 --- a/storage/maria/ma_check.c +++ b/storage/maria/ma_check.c @@ -420,6 +420,8 @@ int maria_chk_size(HA_CHECK *param, register MARIA_HA *info) /* We cannot check file sizes for S3 */ DBUG_RETURN(0); } + /* We should never come here with internal temporary tables */ + DBUG_ASSERT(!share->internal_table); if (!(param->testflag & T_SILENT)) puts("- check file-size"); @@ -715,6 +717,8 @@ static int chk_index_down(HA_CHECK *param, MARIA_HA *info, MARIA_PAGE ma_page; DBUG_ENTER("chk_index_down"); + DBUG_ASSERT(!share->internal_table); + /* Key blocks must lay within the key file length entirely. */ if (page + keyinfo->block_length > share->state.state.key_file_length) { @@ -2467,7 +2471,16 @@ static int initialize_variables_for_repair(HA_CHECK *param, return 1; /* calculate max_records */ - sort_info->filelength= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)); + if (!share->internal_table) + { + /* Get real file size */ + sort_info->filelength= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)); + } + else + { + /* For internal temporary files we are using the logical file length */ + sort_info->filelength= share->state.state.data_file_length; + } param->max_progress= sort_info->filelength; if ((param->testflag & T_CREATE_MISSING_KEYS) || @@ -2865,7 +2878,8 @@ int maria_repair(HA_CHECK *param, register MARIA_HA *info, { fputs(" \r",stdout); fflush(stdout); } - if (mysql_file_chsize(share->kfile.file, + if (!share->internal_table && + mysql_file_chsize(share->kfile.file, share->state.state.key_file_length, 0, MYF(0))) { _ma_check_print_warning(param, @@ -4176,7 +4190,8 @@ int maria_repair_by_sort(HA_CHECK *param, register MARIA_HA *info, if (param->testflag & T_CALC_CHECKSUM) share->state.state.checksum=param->glob_crc; - if 
(mysql_file_chsize(share->kfile.file, + if (!share->internal_table && + mysql_file_chsize(share->kfile.file, share->state.state.key_file_length, 0, MYF(0))) _ma_check_print_warning(param, "Can't change size of indexfile, error: %d", @@ -4724,7 +4739,8 @@ int maria_repair_parallel(HA_CHECK *param, register MARIA_HA *info, if (param->testflag & T_CALC_CHECKSUM) share->state.state.checksum=param->glob_crc; - if (mysql_file_chsize(share->kfile.file, + if (!share->internal_table && + mysql_file_chsize(share->kfile.file, share->state.state.key_file_length, 0, MYF(0))) _ma_check_print_warning(param, "Can't change size of indexfile, error: %d", @@ -6135,6 +6151,8 @@ int maria_test_if_almost_full(MARIA_HA *info) { MARIA_SHARE *share= info->s; + DBUG_ASSERT(!share->internal_table); + if (share->options & HA_OPTION_COMPRESS_RECORD) return 0; return mysql_file_seek(share->kfile.file, 0L, MY_SEEK_END, diff --git a/storage/maria/ma_delete_all.c b/storage/maria/ma_delete_all.c index f355d0da3e8..12038ac48c1 100644 --- a/storage/maria/ma_delete_all.c +++ b/storage/maria/ma_delete_all.c @@ -130,9 +130,17 @@ int maria_delete_all_rows(MARIA_HA *info) #endif if (_ma_flush_table_files(info, MARIA_FLUSH_DATA|MARIA_FLUSH_INDEX, - FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED) || - mysql_file_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) || - mysql_file_chsize(share->kfile.file, share->base.keystart, 0, MYF(MY_WME))) + FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED)) + goto err; + /* + Avoid truncate of internal temporary tables as this can have a big + performance overhead when called by mysql_handle_single_derived() + tables in MariaDB as part of split materialization. 
+ */ + if (!share->internal_table && + (mysql_file_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) || + mysql_file_chsize(share->kfile.file, share->base.keystart, 0, + MYF(MY_WME)))) goto err; if (_ma_initialize_data_file(share, info->dfile.file)) diff --git a/storage/spider/spd_db_conn.cc b/storage/spider/spd_db_conn.cc index 56ad9e9fc5c..7bb0db32271 100644 --- a/storage/spider/spd_db_conn.cc +++ b/storage/spider/spd_db_conn.cc @@ -1486,6 +1486,7 @@ int spider_db_append_key_hint( if (str->reserve( hint_str_len - 2 + SPIDER_SQL_INDEX_USE_LEN + SPIDER_SQL_OPEN_PAREN_LEN + SPIDER_SQL_CLOSE_PAREN_LEN)) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); hint_str += 2; str->q_append(SPIDER_SQL_INDEX_USE_STR, SPIDER_SQL_INDEX_USE_LEN); str->q_append(SPIDER_SQL_OPEN_PAREN_STR, SPIDER_SQL_OPEN_PAREN_LEN); @@ -1498,10 +1499,11 @@ int spider_db_append_key_hint( if (str->reserve( hint_str_len - 3 + SPIDER_SQL_INDEX_IGNORE_LEN + SPIDER_SQL_OPEN_PAREN_LEN + SPIDER_SQL_CLOSE_PAREN_LEN)) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); hint_str += 3; str->q_append(SPIDER_SQL_INDEX_IGNORE_STR, SPIDER_SQL_INDEX_IGNORE_LEN); str->q_append(SPIDER_SQL_OPEN_PAREN_STR, SPIDER_SQL_OPEN_PAREN_LEN); - str->q_append(hint_str, hint_str_len - 2); + str->q_append(hint_str, hint_str_len - 3); str->q_append(SPIDER_SQL_CLOSE_PAREN_STR, SPIDER_SQL_CLOSE_PAREN_LEN); } else if (str->reserve(hint_str_len + SPIDER_SQL_SPACE_LEN)) DBUG_RETURN(HA_ERR_OUT_OF_MEM); diff --git a/storage/spider/spd_db_mysql.cc b/storage/spider/spd_db_mysql.cc index e0a9c7ca882..5c9a2bb846e 100644 --- a/storage/spider/spd_db_mysql.cc +++ b/storage/spider/spd_db_mysql.cc @@ -7613,8 +7613,8 @@ int spider_mbase_share::convert_key_hint_str() roop_count < (int) table_share->keys; roop_count++, tmp_key_hint++) { tmp_key_hint->length(0); - if (tmp_key_hint->append(spider_share->key_hint->ptr(), - spider_share->key_hint->length(), system_charset_info)) + if (tmp_key_hint->append(spider_share->key_hint[roop_count].ptr(), + spider_share->key_hint[roop_count].length(), 
system_charset_info)) DBUG_RETURN(HA_ERR_OUT_OF_MEM); } } else {