From 3fae64b196cfb94ac4084d02c5745285589c6b48 Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Fri, 15 Sep 2017 10:04:49 +0000 Subject: [PATCH 01/55] Copy of commit 184a4a2d82f4f6f3cbcb1015bcdb32bebe73315c Author: Abhinav Sharma Date: Thu Sep 14 11:40:08 2017 -0700 Bump rocksdb submodule Summary: Bump rocksdb to include the fix for rocksdb.trx_info_rpl The bug was introduced in: https://github.com/facebook/rocksdb/pull/2850 Fixed in: https://github.com/facebook/rocksdb/pull/2881 update-submodule: rocksdb Reviewed By: lth Differential Revision: D5834658 fbshipit-source-id: d1551bf --- storage/rocksdb/CMakeLists.txt | 6 + storage/rocksdb/ha_rocksdb.cc | 819 +++++++++++-- storage/rocksdb/ha_rocksdb.h | 72 +- storage/rocksdb/ha_rocksdb_proto.h | 1 + .../include/prefix_index_only_query_check.inc | 21 + storage/rocksdb/mysql-test/rocksdb/my.cnf | 1 + .../rocksdb/r/2pc_group_commit.result | 14 +- .../rocksdb/r/add_index_inplace.result | 13 +- .../rocksdb/r/bulk_load_errors.result | 26 + .../rocksdb/r/bulk_load_unsorted.result | 101 ++ .../r/bulk_load_unsorted_errors.result | 4 + .../mysql-test/rocksdb/r/bytes_written.result | 10 + .../r/cons_snapshot_read_committed.result | 12 +- .../r/cons_snapshot_repeatable_read.result | 2 +- .../r/covered_unpack_info_format.result | 73 ++ .../rocksdb/r/ddl_high_priority.result | 1009 +++++++++++++++++ .../rocksdb/r/deadlock_tracking.result | 490 ++++++++ .../mysql-test/rocksdb/r/drop_table.result | 3 + .../mysql-test/rocksdb/r/drop_table2.result | 2 + .../mysql-test/rocksdb/r/drop_table3.result | 2 + .../rocksdb/r/dup_key_update.result | 4 + .../rocksdb/r/fast_prefix_index_fetch.result | 80 ++ .../mysql-test/rocksdb/r/i_s_ddl.result | 6 +- .../rocksdb/mysql-test/rocksdb/r/index.result | 27 + .../mysql-test/rocksdb/r/index_primary.result | 23 + .../rocksdb/r/index_type_btree.result | 27 + .../rocksdb/r/index_type_hash.result | 27 + .../r/issue243_transactionStatus.result | 18 +- .../rocksdb/r/multi_varchar_sk_lookup.result | 37 + .../mysql-test/rocksdb/r/rocksdb.result | 48 +- .../mysql-test/rocksdb/r/show_engine.result | 33 +- .../rocksdb/r/skip_validate_tmp_table.result | 4 + .../rocksdb/r/truncate_table3.result | 2 + .../mysql-test/rocksdb/r/ttl_primary.result | 4 +- .../mysql-test/rocksdb/r/ttl_secondary.result | 709 ++++++++++++ .../r/ttl_secondary_read_filtering.result | 494 ++++++++ ...ndary_read_filtering_multiple_index.result | 82 ++ .../r/ttl_secondary_with_partitions.result | 389 +++++++ .../mysql-test/rocksdb/r/write_sync.result | 25 +- .../rocksdb/t/2pc_group_commit.test | 6 +- .../rocksdb/t/add_index_inplace.test | 8 +- .../rocksdb/t/bulk_load_errors.test | 26 + .../rocksdb/t/bulk_load_unsorted.test | 136 +++ .../rocksdb/t/bulk_load_unsorted_errors.test | 8 + .../mysql-test/rocksdb/t/bytes_written.test | 22 + .../rocksdb/t/covered_unpack_info_format.test | 79 ++ .../rocksdb/t/ddl_high_priority.test | 18 + .../rocksdb/t/deadlock_tracking.test | 153 +++ .../rocksdb/t/drop_table-master.opt | 1 - .../mysql-test/rocksdb/t/drop_table.test | 5 +- .../mysql-test/rocksdb/t/drop_table2.test | 4 +- .../mysql-test/rocksdb/t/drop_table3.inc | 4 +- .../mysql-test/rocksdb/t/dup_key_update.test | 4 + .../rocksdb/t/fast_prefix_index_fetch.test | 120 ++ .../rocksdb/mysql-test/rocksdb/t/index.inc | 34 + .../mysql-test/rocksdb/t/index_primary.test | 32 + .../rocksdb/t/multi_varchar_sk_lookup.test | 49 + .../rocksdb/mysql-test/rocksdb/t/rocksdb.test | 2 + .../mysql-test/rocksdb/t/show_engine.test | 13 +- .../rocksdb/t/skip_validate_tmp_table.test | 39 + .../mysql-test/rocksdb/t/ttl_primary.test | 14 +- .../rocksdb/t/ttl_secondary-master.opt | 2 + .../mysql-test/rocksdb/t/ttl_secondary.test | 780 +++++++++++++ .../t/ttl_secondary_read_filtering-master.opt | 1 + .../t/ttl_secondary_read_filtering.test | 500 ++++++++ ...condary_read_filtering_multiple_index.test | 87 ++ .../ttl_secondary_with_partitions-master.opt | 1 + .../t/ttl_secondary_with_partitions.test | 300 +++++ .../mysql-test/rocksdb/t/write_sync.test | 27 +- .../rocksdb_rpl/r/multiclient_2pc.result | 9 +- .../r/rpl_ddl_high_priority.result | 39 + .../rocksdb_rpl/t/multiclient_2pc.test | 29 +- .../rocksdb_rpl/t/rpl_ddl_high_priority.test | 2 + .../mysql-test/rocksdb_sys_vars/my.cnf | 4 + ...ksdb_bulk_load_allow_unsorted_basic.result | 100 ++ .../r/rocksdb_concurrent_prepare_basic.result | 14 + ...rocksdb_deadlock_detect_depth_basic.result | 79 ++ .../rocksdb_debug_ttl_ignore_pk_basic.result | 64 ++ ..._force_compute_memtable_stats_basic.result | 4 +- ...pute_memtable_stats_cachetime_basic.result | 68 ++ .../r/rocksdb_large_prefix_basic.result | 64 ++ .../r/rocksdb_manual_wal_flush_basic.result | 14 + .../rocksdb_max_latest_deadlocks_basic.result | 53 + ...rge_tmp_file_removal_delay_ms_basic.result | 93 ++ .../r/rocksdb_sim_cache_size_basic.result | 7 + .../r/rocksdb_use_clock_cache_basic.result | 19 + ...ocksdb_bulk_load_allow_unsorted_basic.test | 18 + .../t/rocksdb_concurrent_prepare_basic.test | 16 + .../rocksdb_deadlock_detect_depth_basic.test | 17 + .../t/rocksdb_debug_ttl_ignore_pk_basic.test | 18 + ...db_force_compute_memtable_stats_basic.test | 4 +- ...ompute_memtable_stats_cachetime_basic.test | 18 + .../t/rocksdb_large_prefix_basic.test | 18 + .../t/rocksdb_manual_wal_flush_basic.test | 16 + .../t/rocksdb_max_latest_deadlocks_basic.test | 17 + ...merge_tmp_file_removal_delay_ms_basic.test | 49 + .../t/rocksdb_sim_cache_size_basic.test | 6 + .../t/rocksdb_use_clock_cache_basic.test | 21 + storage/rocksdb/rdb_buff.h | 16 + storage/rocksdb/rdb_compact_filter.h | 8 + storage/rocksdb/rdb_datadic.cc | 357 +++++- storage/rocksdb/rdb_datadic.h | 66 +- storage/rocksdb/rdb_i_s.cc | 15 +- storage/rocksdb/rdb_index_merge.cc | 186 +-- storage/rocksdb/rdb_index_merge.h | 70 +- storage/rocksdb/rdb_perf_context.cc | 22 + storage/rocksdb/rdb_perf_context.h | 16 +- 107 files changed, 8419 insertions(+), 412 deletions(-) create mode 100644 storage/rocksdb/mysql-test/rocksdb/include/prefix_index_only_query_check.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted_errors.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/bytes_written.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/covered_unpack_info_format.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/deadlock_tracking.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/fast_prefix_index_fetch.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/multi_varchar_sk_lookup.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/skip_validate_tmp_table.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/ttl_secondary.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/ttl_secondary_read_filtering.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/ttl_secondary_read_filtering_multiple_index.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/ttl_secondary_with_partitions.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bulk_load_unsorted.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bulk_load_unsorted_errors.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bytes_written.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/covered_unpack_info_format.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/ddl_high_priority.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/deadlock_tracking.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/fast_prefix_index_fetch.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/multi_varchar_sk_lookup.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/skip_validate_tmp_table.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/ttl_secondary-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/ttl_secondary.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/ttl_secondary_read_filtering-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/ttl_secondary_read_filtering.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/ttl_secondary_read_filtering_multiple_index.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/ttl_secondary_with_partitions-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/ttl_secondary_with_partitions.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/r/rpl_ddl_high_priority.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_ddl_high_priority.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/my.cnf create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_bulk_load_allow_unsorted_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_concurrent_prepare_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_deadlock_detect_depth_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_debug_ttl_ignore_pk_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_force_compute_memtable_stats_cachetime_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_large_prefix_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_manual_wal_flush_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_max_latest_deadlocks_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_merge_tmp_file_removal_delay_ms_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_sim_cache_size_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_use_clock_cache_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_bulk_load_allow_unsorted_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_concurrent_prepare_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_deadlock_detect_depth_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_debug_ttl_ignore_pk_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_force_compute_memtable_stats_cachetime_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_large_prefix_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_manual_wal_flush_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_max_latest_deadlocks_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_merge_tmp_file_removal_delay_ms_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_sim_cache_size_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_use_clock_cache_basic.test diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt index ec9705891e0..9e6c8160e32 100644 --- a/storage/rocksdb/CMakeLists.txt +++ b/storage/rocksdb/CMakeLists.txt @@ -106,6 +106,12 @@ IF (NOT "$ENV{WITH_ZSTD}" STREQUAL "") ADD_DEFINITIONS(-DZSTD) ENDIF() +IF (NOT "$ENV{WITH_TBB}" STREQUAL "") + SET(rocksdb_static_libs ${rocksdb_static_libs} + $ENV{WITH_TBB}/libtbb${PIC_EXT}.a) + ADD_DEFINITIONS(-DTBB) +ENDIF() + CHECK_INCLUDE_FILES(zlib.h HAVE_ZLIB_H) IF (HAVE_ZLIB_H) diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 0df153b4853..cc52b343dd1 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -36,6 +36,7 @@ #include "./debug_sync.h" #include "./my_bit.h" #include "./my_stacktrace.h" +#include "./my_sys.h" #include "./sql_audit.h" #include "./sql_table.h" #include @@ -57,6 +58,7 @@ #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/convenience.h" #include "rocksdb/utilities/memory_util.h" +#include "rocksdb/utilities/sim_cache.h" #include "util/stop_watch.h" /* MyRocks includes */ @@ -101,6 +103,7 @@ namespace myrocks { static st_global_stats global_stats; static st_export_stats export_stats; static st_memory_stats memory_stats; +static st_io_stall_stats io_stall_stats; const std::string DEFAULT_CF_NAME("default"); const std::string DEFAULT_SYSTEM_CF_NAME("__system__"); @@ -127,6 +130,14 @@ bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, const bool use_all_keys); +static rocksdb::CompactRangeOptions getCompactRangeOptions() { + rocksdb::CompactRangeOptions compact_range_options; + compact_range_options.bottommost_level_compaction = + rocksdb::BottommostLevelCompaction::kForce; + compact_range_options.exclusive_manual_compaction = false; + return compact_range_options; +} + /////////////////////////////////////////////////////////// // Parameters and settings /////////////////////////////////////////////////////////// @@ -149,7 +160,6 @@ static std::shared_ptr properties_collector_factory; Rdb_dict_manager dict_manager; Rdb_cf_manager cf_manager; Rdb_ddl_manager ddl_manager; -const char *m_mysql_gtid; Rdb_binlog_manager binlog_manager; Rdb_io_watchdog *io_watchdog = nullptr; @@ -165,7 +175,7 @@ static Rdb_background_thread rdb_bg_thread; // collation check requirement. Regex_list_handler *rdb_collation_exceptions; -static const char **rdb_get_error_messages(); +static const char *rdb_get_error_message(int nr); static void rocksdb_flush_all_memtables() { const Rdb_cf_manager &cf_manager = rdb_get_cf_manager(); @@ -192,7 +202,7 @@ static int rocksdb_compact_column_family(THD *const thd, if (cfh != nullptr && rdb != nullptr) { sql_print_information("RocksDB: Manual compaction of column family: %s\n", cf); - rdb->CompactRange(rocksdb::CompactRangeOptions(), cfh, nullptr, nullptr); + rdb->CompactRange(getCompactRangeOptions(), cfh, nullptr, nullptr); } } return HA_EXIT_SUCCESS; @@ -388,6 +398,10 @@ static void rocksdb_set_delayed_write_rate(THD *thd, struct st_mysql_sys_var *var, void *var_ptr, const void *save); +static void rocksdb_set_max_latest_deadlocks(THD *thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save); + static void rdb_set_collation_exception_list(const char *exception_list); static void rocksdb_set_collation_exception_list(THD *thd, struct st_mysql_sys_var *var, @@ -404,6 +418,10 @@ rocksdb_set_bulk_load(THD *thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), void *var_ptr, const void *save); +static void rocksdb_set_bulk_load_allow_unsorted( + THD *thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), + void *var_ptr, const void *save); + static void rocksdb_set_max_background_jobs(THD *thd, struct st_mysql_sys_var *const var, void *const var_ptr, @@ -412,12 +430,15 @@ static void rocksdb_set_max_background_jobs(THD *thd, // Options definitions ////////////////////////////////////////////////////////////////////////////// static long long rocksdb_block_cache_size; +static long long rocksdb_sim_cache_size; +static my_bool rocksdb_use_clock_cache; /* Use unsigned long long instead of uint64_t because of MySQL compatibility */ static unsigned long long // NOLINT(runtime/int) rocksdb_rate_limiter_bytes_per_sec; static unsigned long long // NOLINT(runtime/int) rocksdb_sst_mgr_rate_bytes_per_sec; static unsigned long long rocksdb_delayed_write_rate; +static uint32_t rocksdb_max_latest_deadlocks; static unsigned long // NOLINT(runtime/int) rocksdb_persistent_cache_size_mb; static uint64_t rocksdb_info_log_level; @@ -427,6 +448,7 @@ static uint64_t rocksdb_index_type; static uint32_t rocksdb_flush_log_at_trx_commit; static uint32_t rocksdb_debug_optimizer_n_rows; static my_bool rocksdb_force_compute_memtable_stats; +static uint32_t rocksdb_force_compute_memtable_stats_cachetime; static my_bool rocksdb_debug_optimizer_no_zero_cardinality; static uint32_t rocksdb_wal_recovery_mode; static uint32_t rocksdb_access_hint_on_compaction_start; @@ -444,6 +466,7 @@ static my_bool rocksdb_enable_ttl_read_filtering = 1; static int rocksdb_debug_ttl_rec_ts = 0; static int rocksdb_debug_ttl_snapshot_ts = 0; static int rocksdb_debug_ttl_read_filter_ts = 0; +static my_bool rocksdb_debug_ttl_ignore_pk = 0; static my_bool rocksdb_reset_stats = 0; static uint32_t rocksdb_io_write_timeout_secs = 0; static uint64_t rocksdb_number_stat_computes = 0; @@ -456,6 +479,7 @@ static char *rocksdb_datadir; static uint32_t rocksdb_table_stats_sampling_pct; static my_bool rocksdb_enable_bulk_load_api = 1; static my_bool rocksdb_print_snapshot_conflict_queries = 0; +static my_bool rocksdb_large_prefix = 0; std::atomic rocksdb_snapshot_conflict_errors(0); std::atomic rocksdb_wal_group_syncs(0); @@ -468,6 +492,8 @@ static std::unique_ptr rdb_init_rocksdb_db_options(void) { o->info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL; o->max_subcompactions = DEFAULT_SUBCOMPACTIONS; + o->concurrent_prepare = true; + o->manual_wal_flush = true; return o; } @@ -559,9 +585,12 @@ const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024; const size_t RDB_MIN_MERGE_BUF_SIZE = 100; const size_t RDB_DEFAULT_MERGE_COMBINE_READ_SIZE = 1024 * 1024 * 1024; const size_t RDB_MIN_MERGE_COMBINE_READ_SIZE = 100; +const size_t RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY = 0; +const size_t RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY = 0; const int64 RDB_DEFAULT_BLOCK_CACHE_SIZE = 512 * 1024 * 1024; const int64 RDB_MIN_BLOCK_CACHE_SIZE = 1024; const int RDB_MAX_CHECKSUMS_PCT = 100; +const ulong RDB_DEADLOCK_DETECT_DEPTH = 50; // TODO: 0 means don't wait at all, and we don't support it yet? static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG, @@ -572,6 +601,14 @@ static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG, static MYSQL_THDVAR_BOOL(deadlock_detect, PLUGIN_VAR_RQCMDARG, "Enables deadlock detection", nullptr, nullptr, FALSE); +static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG, + "Number of transactions deadlock detection will " + "traverse through before assuming deadlock", + nullptr, nullptr, + /*default*/ RDB_DEADLOCK_DETECT_DEPTH, + /*min*/ 2, + /*max*/ ULONG_MAX, 0); + static MYSQL_THDVAR_BOOL( trace_sst_api, PLUGIN_VAR_RQCMDARG, "Generate trace output in the log for each call to the SstFileWriter", @@ -583,6 +620,11 @@ static MYSQL_THDVAR_BOOL( "unique_checks and enables rocksdb_commit_in_the_middle.", nullptr, rocksdb_set_bulk_load, FALSE); +static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG, + "Allow unsorted input during bulk-load. " + "Can be changed only when bulk load is disabled.", + nullptr, rocksdb_set_bulk_load_allow_unsorted, FALSE); + static MYSQL_SYSVAR_BOOL(enable_bulk_load_api, rocksdb_enable_bulk_load_api, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Enables using SstFileWriter for bulk loading", @@ -662,6 +704,18 @@ static MYSQL_THDVAR_ULONGLONG( /* min (100B) */ RDB_MIN_MERGE_COMBINE_READ_SIZE, /* max */ SIZE_T_MAX, 1); +static MYSQL_THDVAR_ULONGLONG( + merge_tmp_file_removal_delay_ms, PLUGIN_VAR_RQCMDARG, + "Fast index creation creates a large tmp file on disk during index " + "creation. Removing this large file all at once when index creation is " + "complete can cause trim stalls on Flash. This variable specifies a " + "duration to sleep (in milliseconds) between calling chsize() to truncate " + "the file in chunks. The chunk size is the same as merge_buf_size.", + nullptr, nullptr, + /* default (0ms) */ RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY, + /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY, + /* max */ SIZE_T_MAX, 1); + static MYSQL_SYSVAR_BOOL( create_if_missing, *reinterpret_cast(&rocksdb_db_options->create_if_missing), @@ -669,6 +723,20 @@ static MYSQL_SYSVAR_BOOL( "DBOptions::create_if_missing for RocksDB", nullptr, nullptr, rocksdb_db_options->create_if_missing); +static MYSQL_SYSVAR_BOOL( + concurrent_prepare, + *reinterpret_cast(&rocksdb_db_options->concurrent_prepare), + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "DBOptions::concurrent_prepare for RocksDB", nullptr, nullptr, + rocksdb_db_options->concurrent_prepare); + +static MYSQL_SYSVAR_BOOL( + manual_wal_flush, + *reinterpret_cast(&rocksdb_db_options->manual_wal_flush), + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "DBOptions::manual_wal_flush for RocksDB", nullptr, nullptr, + rocksdb_db_options->manual_wal_flush); + static MYSQL_SYSVAR_BOOL( create_missing_column_families, *reinterpret_cast( @@ -712,6 +780,13 @@ static MYSQL_SYSVAR_ULONGLONG(delayed_write_rate, rocksdb_delayed_write_rate, rocksdb_db_options->delayed_write_rate, 0, UINT64_MAX, 0); +static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks, + PLUGIN_VAR_RQCMDARG, + "Maximum number of recent " + "deadlocks to store", + nullptr, rocksdb_set_max_latest_deadlocks, + rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0); + static MYSQL_SYSVAR_ENUM( info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG, "Filter level for info logs to be written mysqld error log. " @@ -988,6 +1063,22 @@ static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size, /* max */ LONGLONG_MAX, /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE); +static MYSQL_SYSVAR_LONGLONG(sim_cache_size, rocksdb_sim_cache_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Simulated cache size for RocksDB", nullptr, + nullptr, + /* default */ 0, + /* min */ 0, + /* max */ LONGLONG_MAX, + /* Block size */ 0); + +static MYSQL_SYSVAR_BOOL( + use_clock_cache, + rocksdb_use_clock_cache, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Use ClockCache instead of default LRUCache for RocksDB", + nullptr, nullptr, false); + static MYSQL_SYSVAR_BOOL( cache_index_and_filter_blocks, *reinterpret_cast( @@ -1076,12 +1167,21 @@ static MYSQL_SYSVAR_STR(update_cf_options, rocksdb_update_cf_options, "Option updates per column family for RocksDB", nullptr, rocksdb_set_update_cf_options, nullptr); +enum rocksdb_flush_log_at_trx_commit_type : unsigned int { + FLUSH_LOG_NEVER = 0, + FLUSH_LOG_SYNC, + FLUSH_LOG_BACKGROUND, + FLUSH_LOG_MAX /* must be last */ +}; + static MYSQL_SYSVAR_UINT(flush_log_at_trx_commit, rocksdb_flush_log_at_trx_commit, PLUGIN_VAR_RQCMDARG, "Sync on transaction commit. Similar to " "innodb_flush_log_at_trx_commit. 1: sync on commit, " "0,2: not sync on commit", - nullptr, nullptr, 1, 0, 2, 0); + nullptr, nullptr, /* default */ FLUSH_LOG_SYNC, + /* min */ FLUSH_LOG_NEVER, + /* max */ FLUSH_LOG_BACKGROUND, 0); static MYSQL_THDVAR_BOOL(write_disable_wal, PLUGIN_VAR_RQCMDARG, "WriteOptions::disableWAL for RocksDB", nullptr, @@ -1125,6 +1225,13 @@ static MYSQL_SYSVAR_BOOL(force_compute_memtable_stats, "Force to always compute memtable stats", nullptr, nullptr, TRUE); +static MYSQL_SYSVAR_UINT(force_compute_memtable_stats_cachetime, + rocksdb_force_compute_memtable_stats_cachetime, + PLUGIN_VAR_RQCMDARG, + "Time in usecs to cache memtable estimates", nullptr, + nullptr, /* default */ 60 * 1000 * 1000, + /* min */ 0, /* max */ INT_MAX, 0); + static MYSQL_SYSVAR_BOOL( debug_optimizer_no_zero_cardinality, rocksdb_debug_optimizer_no_zero_cardinality, PLUGIN_VAR_RQCMDARG, @@ -1191,6 +1298,12 @@ static MYSQL_SYSVAR_INT( "is not set. This variable is a no-op in non-debug builds.", nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0); +static MYSQL_SYSVAR_BOOL( + debug_ttl_ignore_pk, rocksdb_debug_ttl_ignore_pk, PLUGIN_VAR_RQCMDARG, + "For debugging purposes only. If true, compaction filtering will not occur " + "on PK TTL data. This variable is a no-op in non-debug builds.", + nullptr, nullptr, FALSE); + static MYSQL_SYSVAR_BOOL( reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG, "Reset the RocksDB internal statistics without restarting the DB.", nullptr, @@ -1346,15 +1459,23 @@ static MYSQL_SYSVAR_UINT( RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0, /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0); +static MYSQL_SYSVAR_BOOL( + large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG, + "Support large index prefix length of 3072 bytes. If off, the maximum " + "index prefix length is 767.", + nullptr, nullptr, FALSE); + static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100; static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(lock_wait_timeout), MYSQL_SYSVAR(deadlock_detect), + MYSQL_SYSVAR(deadlock_detect_depth), MYSQL_SYSVAR(max_row_locks), MYSQL_SYSVAR(write_batch_max_bytes), MYSQL_SYSVAR(lock_scanned_rows), MYSQL_SYSVAR(bulk_load), + MYSQL_SYSVAR(bulk_load_allow_unsorted), MYSQL_SYSVAR(skip_unique_check_tables), MYSQL_SYSVAR(trace_sst_api), MYSQL_SYSVAR(commit_in_the_middle), @@ -1365,15 +1486,19 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(enable_bulk_load_api), MYSQL_SYSVAR(tmpdir), MYSQL_SYSVAR(merge_combine_read_size), + MYSQL_SYSVAR(merge_tmp_file_removal_delay_ms), MYSQL_SYSVAR(skip_bloom_filter_on_read), MYSQL_SYSVAR(create_if_missing), + MYSQL_SYSVAR(concurrent_prepare), + MYSQL_SYSVAR(manual_wal_flush), MYSQL_SYSVAR(create_missing_column_families), MYSQL_SYSVAR(error_if_exists), MYSQL_SYSVAR(paranoid_checks), MYSQL_SYSVAR(rate_limiter_bytes_per_sec), MYSQL_SYSVAR(sst_mgr_rate_bytes_per_sec), MYSQL_SYSVAR(delayed_write_rate), + MYSQL_SYSVAR(max_latest_deadlocks), MYSQL_SYSVAR(info_log_level), MYSQL_SYSVAR(max_open_files), MYSQL_SYSVAR(max_total_wal_size), @@ -1413,6 +1538,8 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(enable_write_thread_adaptive_yield), MYSQL_SYSVAR(block_cache_size), + MYSQL_SYSVAR(sim_cache_size), + MYSQL_SYSVAR(use_clock_cache), MYSQL_SYSVAR(cache_index_and_filter_blocks), MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache), MYSQL_SYSVAR(index_type), @@ -1438,6 +1565,7 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(force_index_records_in_range), MYSQL_SYSVAR(debug_optimizer_n_rows), MYSQL_SYSVAR(force_compute_memtable_stats), + MYSQL_SYSVAR(force_compute_memtable_stats_cachetime), MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality), MYSQL_SYSVAR(compact_cf), @@ -1454,6 +1582,7 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(debug_ttl_rec_ts), MYSQL_SYSVAR(debug_ttl_snapshot_ts), MYSQL_SYSVAR(debug_ttl_read_filter_ts), + MYSQL_SYSVAR(debug_ttl_ignore_pk), MYSQL_SYSVAR(reset_stats), MYSQL_SYSVAR(io_write_timeout), MYSQL_SYSVAR(flush_memtable_on_analyze), @@ -1475,13 +1604,15 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(validate_tables), MYSQL_SYSVAR(table_stats_sampling_pct), + + MYSQL_SYSVAR(large_prefix), nullptr}; static rocksdb::WriteOptions rdb_get_rocksdb_write_options(my_core::THD *const thd) { rocksdb::WriteOptions opt; - opt.sync = (rocksdb_flush_log_at_trx_commit == 1); + opt.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC); opt.disableWAL = THDVAR(thd, write_disable_wal); opt.ignore_missing_column_families = THDVAR(thd, write_ignore_missing_column_families); @@ -1743,6 +1874,13 @@ public: } } + void update_bytes_written(ulonglong bytes_written) { + if (m_tbl_io_perf != nullptr) { + m_tbl_io_perf->update_bytes_written(rocksdb_perf_context_level(m_thd), + bytes_written); + } + } + void set_params(int timeout_sec_arg, int max_row_locks_arg) { m_timeout_sec = timeout_sec_arg; m_max_row_locks = max_row_locks_arg; @@ -2238,9 +2376,10 @@ public: tx_opts.set_snapshot = false; tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec); tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect); + tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth); tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes); - write_opts.sync = (rocksdb_flush_log_at_trx_commit == 1); + write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC); write_opts.disableWAL = THDVAR(m_thd, write_disable_wal); write_opts.ignore_missing_column_families = THDVAR(m_thd, write_ignore_missing_column_families); @@ -2459,7 +2598,7 @@ public: void start_tx() override { reset(); - write_opts.sync = (rocksdb_flush_log_at_trx_commit == 1); + write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC); write_opts.disableWAL = THDVAR(m_thd, write_disable_wal); write_opts.ignore_missing_column_families = THDVAR(m_thd, write_ignore_missing_column_families); @@ -2503,14 +2642,21 @@ namespace { class Rdb_perf_context_guard { Rdb_io_perf m_io_perf; - THD *m_thd; + Rdb_io_perf *m_io_perf_ptr; + Rdb_transaction *m_tx; + uint m_level; -public: + public: Rdb_perf_context_guard(const Rdb_perf_context_guard &) = delete; Rdb_perf_context_guard &operator=(const Rdb_perf_context_guard &) = delete; - explicit Rdb_perf_context_guard(THD *const thd) : m_thd(thd) { - Rdb_transaction *&tx = get_tx_from_thd(m_thd); + explicit Rdb_perf_context_guard(Rdb_io_perf *io_perf, uint level) + : m_io_perf_ptr(io_perf), m_tx(nullptr), m_level(level) { + m_io_perf_ptr->start(m_level); + } + + explicit Rdb_perf_context_guard(Rdb_transaction *tx, uint level) + : m_io_perf_ptr(nullptr), m_tx(tx), m_level(level) { /* if perf_context information is already being recorded, this becomes a no-op @@ -2521,9 +2667,10 @@ public: } ~Rdb_perf_context_guard() { - Rdb_transaction *&tx = get_tx_from_thd(m_thd); - if (tx != nullptr) { - tx->io_perf_end_and_record(); + if (m_tx != nullptr) { + m_tx->io_perf_end_and_record(); + } else if (m_io_perf_ptr != nullptr) { + m_io_perf_ptr->end_and_record(m_level); } } }; @@ -2611,8 +2758,17 @@ static std::string rdb_xid_to_string(const XID &src) { static bool rocksdb_flush_wal(handlerton *const hton MY_ATTRIBUTE((__unused__)), ulonglong target_lsn MY_ATTRIBUTE((__unused__))) { DBUG_ASSERT(rdb != nullptr); - rocksdb_wal_group_syncs++; - const rocksdb::Status s = rdb->SyncWAL(); + + rocksdb::Status s; + /* + target_lsn is set to 0 when MySQL wants to sync the wal files + */ + if (target_lsn == 0 || rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) { + rocksdb_wal_group_syncs++; + s = rdb->FlushWAL(target_lsn == 0 || + rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC); + } + if (!s.ok()) { rdb_log_status_error(s); return HA_EXIT_FAILURE; @@ -2651,7 +2807,7 @@ static int rocksdb_prepare(handlerton *const hton, THD *const thd, return HA_EXIT_FAILURE; } if (thd->durability_property == HA_IGNORE_DURABILITY && - (rocksdb_flush_log_at_trx_commit == 1)) { + (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER)) { /** we set the log sequence as '1' just to trigger hton->flush_logs */ @@ -2812,12 +2968,12 @@ static int rocksdb_commit(handlerton *const hton, THD *const thd, rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true); - /* this will trigger saving of perf_context information */ - Rdb_perf_context_guard guard(thd); - /* note: h->external_lock(F_UNLCK) is called after this function is called) */ Rdb_transaction *&tx = get_tx_from_thd(thd); + /* this will trigger saving of perf_context information */ + Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd)); + if (tx != nullptr) { if (commit_tx || (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { @@ -2854,8 +3010,8 @@ static int rocksdb_commit(handlerton *const hton, THD *const thd, static int rocksdb_rollback(handlerton *const hton, THD *const thd, bool rollback_tx) { - Rdb_perf_context_guard guard(thd); Rdb_transaction *&tx = get_tx_from_thd(thd); + Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd)); if (tx != nullptr) { if (rollback_tx) { @@ -2978,7 +3134,82 @@ private: "=========================================\n"; } -public: + static std::string get_dlock_txn_info(const rocksdb::DeadlockInfo &txn, + const GL_INDEX_ID &gl_index_id, + bool is_last_path = false) { + std::string txn_data; + + /* extract table name and index names using the index id */ + std::string table_name = ddl_manager.safe_get_table_name(gl_index_id); + if (table_name.empty()) { + table_name = + "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id); + } + auto kd = ddl_manager.safe_find(gl_index_id); + std::string idx_name = + (kd) ? kd->get_name() + : "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id); + + /* get the name of the column family */ + rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(txn.m_cf_id); + std::string cf_name = cfh->GetName(); + + txn_data += format_string( + "TRANSACTIONID: %u\n" + "COLUMN FAMILY NAME: %s\n" + "WAITING KEY: %s\n" + "LOCK TYPE: %s\n" + "INDEX NAME: %s\n" + "TABLE NAME: %s\n", + txn.m_txn_id, cf_name.c_str(), + rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length()) + .c_str(), + txn.m_exclusive ? "EXCLUSIVE" : "SHARED", idx_name.c_str(), + table_name.c_str()); + if (!is_last_path) { + txn_data += "---------------WAITING FOR---------------\n"; + } + return txn_data; + } + + static std::string + get_dlock_path_info(const rocksdb::DeadlockPath &path_entry) { + std::string path_data; + if (path_entry.limit_exceeded) { + path_data += "\n-------DEADLOCK EXCEEDED MAX DEPTH-------\n"; + } else { + path_data += "\n*** DEADLOCK PATH\n" + "=========================================\n"; + for (auto it = path_entry.path.begin(); it != path_entry.path.end(); + it++) { + auto txn = *it; + const GL_INDEX_ID gl_index_id = { + txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast( + txn.m_waiting_key.c_str()))}; + path_data += get_dlock_txn_info(txn, gl_index_id); + } + + DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty()); + /* print the first txn in the path to display the full deadlock cycle */ + if (!path_entry.path.empty() && !path_entry.limit_exceeded) { + auto txn = path_entry.path[0]; + const GL_INDEX_ID gl_index_id = { + txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast( + txn.m_waiting_key.c_str()))}; + path_data += get_dlock_txn_info(txn, gl_index_id, true); + + /* prints the txn id of the transaction that caused the deadlock */ + auto deadlocking_txn = *(path_entry.path.end() - 1); + path_data += + format_string("\n--------TRANSACTIONID: %u GOT DEADLOCK---------\n", + deadlocking_txn.m_txn_id); + } + } + + return path_data; + } + + public: Rdb_snapshot_status() : m_data(get_header()) {} std::string getResult() { return m_data + get_footer(); } @@ -3007,6 +3238,15 @@ public: tx->get_delete_count()); } } + + void populate_deadlock_buffer() { + auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); + m_data += "----------LATEST DETECTED DEADLOCKS----------\n"; + + for (auto path_entry : dlock_buffer) { + m_data += get_dlock_path_info(path_entry); + } + } }; /** @@ -3103,10 +3343,10 @@ static bool rocksdb_show_snapshot_status(handlerton *const hton, THD *const thd, Rdb_snapshot_status showStatus; Rdb_transaction::walk_tx_list(&showStatus); + showStatus.populate_deadlock_buffer(); /* Send the result data back to MySQL */ - return print_stats(thd, "SNAPSHOTS", "rocksdb", showStatus.getResult(), - stat_print); + return print_stats(thd, "rocksdb", "", showStatus.getResult(), stat_print); } /* @@ -3332,8 +3572,6 @@ static int rocksdb_start_tx_and_assign_read_view( char **gtid_executed, /* out: Gtids logged until last commit */ int *const gtid_executed_length) /*out: Length of gtid_executed string */ { - Rdb_perf_context_guard guard(thd); - ulong const tx_isolation = my_core::thd_tx_isolation(thd); if (tx_isolation != ISO_REPEATABLE_READ) { @@ -3349,6 +3587,8 @@ static int rocksdb_start_tx_and_assign_read_view( } Rdb_transaction *const tx = get_or_create_tx(thd); + Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd)); + DBUG_ASSERT(!tx->has_snapshot()); tx->set_tx_read_only(true); rocksdb_register_tx(hton, thd, tx); @@ -3395,6 +3635,7 @@ static void rocksdb_update_table_stats( int n_lock_wait, int n_lock_wait_timeout, int n_lock_deadlock, const char *engine)) { my_io_perf_t io_perf_read; + my_io_perf_t io_perf_write; my_io_perf_t io_perf; page_stats_t page_stats; comp_stats_t comp_stats; @@ -3409,6 +3650,7 @@ static void rocksdb_update_table_stats( memset(&io_perf, 0, sizeof(io_perf)); memset(&page_stats, 0, sizeof(page_stats)); memset(&comp_stats, 0, sizeof(comp_stats)); + memset(&io_perf_write, 0, sizeof(io_perf_write)); tablenames = rdb_open_tables.get_table_names(); @@ -3441,6 +3683,8 @@ static void rocksdb_update_table_stats( io_perf_read.bytes = table_handler->m_io_perf_read.bytes.load(); io_perf_read.requests = table_handler->m_io_perf_read.requests.load(); + io_perf_write.bytes = table_handler->m_io_perf_write.bytes.load(); + io_perf_write.requests = table_handler->m_io_perf_write.requests.load(); lock_wait_timeout_stats = table_handler->m_lock_wait_timeout_counter.load(); deadlock_stats = table_handler->m_deadlock_counter.load(); @@ -3468,9 +3712,10 @@ static void rocksdb_update_table_stats( sizeof(dbname_sys)); my_core::filename_to_tablename(tablename.c_str(), tablename_sys, sizeof(tablename_sys)); - (*cb)(dbname_sys, tablename_sys, is_partition, &io_perf_read, &io_perf, - &io_perf, &io_perf, &io_perf, &page_stats, &comp_stats, 0, - lock_wait_timeout_stats, deadlock_stats, rocksdb_hton_name); + (*cb)(dbname_sys, tablename_sys, is_partition, &io_perf_read, + &io_perf_write, &io_perf, &io_perf, &io_perf, &page_stats, + &comp_stats, 0, lock_wait_timeout_stats, deadlock_stats, + rocksdb_hton_name); } } @@ -3680,8 +3925,18 @@ static int rocksdb_init_func(void *const p) { (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type; if (!rocksdb_tbl_options->no_block_cache) { - rocksdb_tbl_options->block_cache = - rocksdb::NewLRUCache(rocksdb_block_cache_size); + std::shared_ptr block_cache = rocksdb_use_clock_cache + ? rocksdb::NewClockCache(rocksdb_block_cache_size) + : rocksdb::NewLRUCache(rocksdb_block_cache_size); + if (rocksdb_sim_cache_size > 0) { + // Simulated cache enabled + // Wrap block cache inside a simulated cache and pass it to RocksDB + rocksdb_tbl_options->block_cache = + rocksdb::NewSimCache(block_cache, rocksdb_sim_cache_size, 6); + } else { + // Pass block cache to RocksDB + rocksdb_tbl_options->block_cache = block_cache; + } } // Using newer BlockBasedTable format version for better compression // and better memory allocation. @@ -3870,7 +4125,7 @@ static int rocksdb_init_func(void *const p) { } #endif - err = my_error_register(rdb_get_error_messages, HA_ERR_ROCKSDB_FIRST, + err = my_error_register(rdb_get_error_message, HA_ERR_ROCKSDB_FIRST, HA_ERR_ROCKSDB_LAST); if (err != 0) { // NO_LINT_DEBUG @@ -4064,6 +4319,7 @@ Rdb_open_tables_map::get_table_handler(const char *const table_name) { thr_lock_init(&table_handler->m_thr_lock); table_handler->m_io_perf_read.init(); + table_handler->m_io_perf_write.init(); } DBUG_ASSERT(table_handler->m_ref_count >= 0); table_handler->m_ref_count++; @@ -4330,11 +4586,11 @@ bool ha_rocksdb::init_with_fields() { rows within a transaction, etc, because the compaction filter ignores snapshots when filtering keys. */ -bool ha_rocksdb::should_hide_ttl_rec(const rocksdb::Slice &ttl_rec_val, +bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd, + const rocksdb::Slice &ttl_rec_val, const int64_t curr_ts) { - DBUG_ASSERT(m_pk_descr != nullptr); - DBUG_ASSERT(m_pk_descr->has_ttl()); - DBUG_ASSERT(m_pk_descr->m_ttl_rec_offset != UINT_MAX); + DBUG_ASSERT(kd.has_ttl()); + DBUG_ASSERT(kd.m_ttl_rec_offset != UINT_MAX); /* Curr_ts can only be 0 if there are no snapshots open. @@ -4360,7 +4616,7 @@ bool ha_rocksdb::should_hide_ttl_rec(const rocksdb::Slice &ttl_rec_val, Find where the 8-byte ttl is for each record in this index. */ uint64 ts; - if (!reader.read(m_pk_descr->m_ttl_rec_offset) || reader.read_uint64(&ts)) { + if (!reader.read(kd.m_ttl_rec_offset) || reader.read_uint64(&ts)) { /* This condition should never be reached since all TTL records have an 8 byte ttl field in front. Don't filter the record out, and log an error. @@ -4368,7 +4624,7 @@ bool ha_rocksdb::should_hide_ttl_rec(const rocksdb::Slice &ttl_rec_val, std::string buf; buf = rdb_hexdump(ttl_rec_val.data(), ttl_rec_val.size(), RDB_MAX_HEXDUMP_LEN); - const GL_INDEX_ID gl_index_id = m_pk_descr->get_gl_index_id(); + const GL_INDEX_ID gl_index_id = kd.get_gl_index_id(); // NO_LINT_DEBUG sql_print_error("Decoding ttl from PK value failed, " "for index (%u,%u), val: %s", @@ -4382,10 +4638,23 @@ bool ha_rocksdb::should_hide_ttl_rec(const rocksdb::Slice &ttl_rec_val, #ifndef NDEBUG read_filter_ts += rdb_dbug_set_ttl_read_filter_ts(); #endif - return ts + m_pk_descr->m_ttl_duration + read_filter_ts <= + return ts + kd.m_ttl_duration + read_filter_ts <= static_cast(curr_ts); } +void ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd, + rocksdb::Iterator *const iter, + bool seek_backward) { + if (kd.has_ttl()) { + while (iter->Valid() && + should_hide_ttl_rec( + kd, iter->value(), + get_or_create_tx(table->in_use)->m_snapshot_timestamp)) { + rocksdb_smart_next(seek_backward, iter); + } + } +} + /** Convert record from table->record[0] form into a form that can be written into rocksdb. @@ -4411,6 +4680,7 @@ int ha_rocksdb::convert_record_to_storage_format( if (has_ttl) { /* If it's a TTL record, reserve space for 8 byte TTL value in front. */ m_storage_record.fill(ROCKSDB_SIZEOF_TTL_RECORD + m_null_bytes_in_rec, 0); + m_ttl_bytes_updated = false; /* If the TTL is contained within the key, we use the offset to find the @@ -4441,6 +4711,8 @@ int ha_rocksdb::convert_record_to_storage_format( rdb_netbuf_to_uint64(reinterpret_cast(data)) + rdb_dbug_set_ttl_rec_ts()); #endif + // Also store in m_ttl_bytes to propagate to update_sk + memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD); } else if (!has_ttl_column) { /* For implicitly generated TTL records we need to copy over the old @@ -4459,6 +4731,8 @@ int ha_rocksdb::convert_record_to_storage_format( #endif char *const data = const_cast(m_storage_record.ptr()); rdb_netbuf_store_uint64(reinterpret_cast(data), ts); + // Also store in m_ttl_bytes to propagate to update_sk + memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD); } } } else { @@ -4540,6 +4814,15 @@ int ha_rocksdb::convert_record_to_storage_format( ts += rdb_dbug_set_ttl_rec_ts(); #endif rdb_netbuf_store_uint64(reinterpret_cast(data), ts); + + // If this is an update and the timestamp has been updated, take note + // so we can avoid updating SKs unnecessarily. + if (!row_info.old_pk_slice.empty()) { + m_ttl_bytes_updated = + memcmp(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD); + } + // Store timestamp in m_ttl_bytes to propagate to update_sk + memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD); } } } @@ -4818,8 +5101,9 @@ int ha_rocksdb::convert_record_from_storage_format( } if (m_maybe_unpack_info) { - unpack_info = reader.read(RDB_UNPACK_HEADER_SIZE); - if (!unpack_info || unpack_info[0] != RDB_UNPACK_DATA_TAG) { + unpack_info = reader.get_current_ptr(); + if (!unpack_info || !Rdb_key_def::is_unpack_data_tag(unpack_info[0]) || + !reader.read(Rdb_key_def::get_unpack_header_size(unpack_info[0]))) { return HA_ERR_ROCKSDB_CORRUPT_DATA; } @@ -4827,7 +5111,8 @@ int ha_rocksdb::convert_record_from_storage_format( rdb_netbuf_to_uint16(reinterpret_cast(unpack_info + 1)); unpack_slice = rocksdb::Slice(unpack_info, unpack_info_len); - reader.read(unpack_info_len - RDB_UNPACK_HEADER_SIZE); + reader.read(unpack_info_len - + Rdb_key_def::get_unpack_header_size(unpack_info[0])); } int err = m_pk_descr->unpack_record(table, buf, &rowkey_slice, @@ -4983,8 +5268,7 @@ void ha_rocksdb::setup_field_converters() { If hidden pk exists, we skip this check since the field will never be part of the hidden pk. */ - if (!has_hidden_pk(table) && - field->part_of_key.is_set(table->s->primary_key)) { + if (!has_hidden_pk(table)) { KEY *const pk_info = &table->key_info[table->s->primary_key]; for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) { /* key_part->fieldnr is counted from 1 */ @@ -5185,7 +5469,10 @@ int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) { my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock, nullptr); m_io_perf.init(&m_table_handler->m_table_perf_context, - &m_table_handler->m_io_perf_read, &stats); + &m_table_handler->m_io_perf_read, + &m_table_handler->m_io_perf_write, &stats); + Rdb_perf_context_guard guard(&m_io_perf, + rocksdb_perf_context_level(ha_thd())); std::string fullname; err = rdb_normalize_tablename(name, &fullname); @@ -5282,6 +5569,13 @@ int ha_rocksdb::close(void) { m_table_handler = nullptr; } + // These are needed to suppress valgrind errors in rocksdb.partition + m_storage_record.free(); + m_last_rowkey.free(); + m_sk_tails.free(); + m_sk_tails_old.free(); + m_pk_unpack_info.free(); + DBUG_RETURN(HA_EXIT_SUCCESS); } @@ -5317,7 +5611,9 @@ static_assert((sizeof(rdb_error_messages) / sizeof(rdb_error_messages[0])) == ((HA_ERR_ROCKSDB_LAST - HA_ERR_ROCKSDB_FIRST) + 1), "Number of error messages doesn't match number of error codes"); -static const char **rdb_get_error_messages() { return rdb_error_messages; } +static const char *rdb_get_error_message(int nr) { + return rdb_error_messages[nr - HA_ERR_ROCKSDB_FIRST]; +} bool ha_rocksdb::get_error_message(const int error, String *const buf) { DBUG_ENTER_FUNC(); @@ -5865,9 +6161,8 @@ int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i, DBUG_RETURN(err); } - /* We don't currently support TTL on tables with secondary keys. */ - if (ttl_duration > 0 && - (table_arg->s->keys > 1 || is_hidden_pk(i, table_arg, tbl_def_arg))) { + /* We don't currently support TTL on tables with hidden primary keys. */ + if (ttl_duration > 0 && is_hidden_pk(i, table_arg, tbl_def_arg)) { my_error(ER_RDB_TTL_UNSUPPORTED, MYF(0)); DBUG_RETURN(HA_EXIT_FAILURE); } @@ -5906,6 +6201,12 @@ int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i, kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1; }); + DBUG_EXECUTE_IF("MYROCKS_NO_COVERED_BITMAP_FORMAT", { + if (index_type == Rdb_key_def::INDEX_TYPE_SECONDARY) { + kv_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_UPDATE2; + } + }); + uint32 index_flags = (ttl_duration > 0 ? Rdb_key_def::TTL_FLAG : 0); uint32 ttl_rec_offset = @@ -6263,7 +6564,7 @@ int ha_rocksdb::read_key_exact(const Rdb_key_def &kd, from the POV of the current transaction. If it has, try going to the next key. */ - if (kd.has_ttl() && should_hide_ttl_rec(iter->value(), ttl_filter_ts)) { + if (kd.has_ttl() && should_hide_ttl_rec(kd, iter->value(), ttl_filter_ts)) { rocksdb_smart_next(kd.m_is_reverse_cf, iter); continue; } @@ -6299,7 +6600,7 @@ int ha_rocksdb::read_before_key(const Rdb_key_def &kd, if ((full_key_match && kd.value_matches_prefix(m_scan_it->key(), key_slice)) || (kd.has_ttl() && - should_hide_ttl_rec(m_scan_it->value(), ttl_filter_ts))) { + should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts))) { rocksdb_smart_next(!kd.m_is_reverse_cf, m_scan_it); continue; } @@ -6329,7 +6630,7 @@ int ha_rocksdb::read_after_key(const Rdb_key_def &kd, key. */ while (m_scan_it->Valid() && kd.has_ttl() && - should_hide_ttl_rec(m_scan_it->value(), ttl_filter_ts)) { + should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts)) { rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it); } @@ -6487,17 +6788,21 @@ int ha_rocksdb::read_row_from_secondary_key(uchar *const buf, int rc = 0; uint pk_size; - if (m_keyread_only && m_lock_rows == RDB_LOCK_NONE && !has_hidden_pk(table)) { - /* Get the key columns and primary key value */ - const rocksdb::Slice &rkey = m_scan_it->key(); + /* Get the key columns and primary key value */ + const rocksdb::Slice &rkey = m_scan_it->key(); + const rocksdb::Slice &value = m_scan_it->value(); + + bool covered_lookup = + m_keyread_only || kd.covers_lookup(table, &value, &m_lookup_bitmap); + if (covered_lookup && m_lock_rows == RDB_LOCK_NONE && !has_hidden_pk(table)) { pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple); - const rocksdb::Slice &value = m_scan_it->value(); if (pk_size == RDB_INVALID_KEY_LEN) { rc = HA_ERR_ROCKSDB_CORRUPT_DATA; } else { rc = kd.unpack_record(table, buf, &rkey, &value, m_verify_row_debug_checksums); + global_stats.covered_secondary_key_lookups.inc(); } } else { if (kd.m_is_reverse_cf) @@ -6607,11 +6912,15 @@ int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) { m_last_rowkey.copy((const char *)m_pk_packed_tuple, size, &my_charset_bin); - if (m_keyread_only && m_lock_rows == RDB_LOCK_NONE && + rocksdb::Slice value = m_scan_it->value(); + bool covered_lookup = + m_keyread_only || m_key_descr_arr[keyno]->covers_lookup( + table, &value, &m_lookup_bitmap); + if (covered_lookup && m_lock_rows == RDB_LOCK_NONE && !has_hidden_pk(table)) { - rocksdb::Slice value = m_scan_it->value(); rc = m_key_descr_arr[keyno]->unpack_record( table, buf, &key, &value, m_verify_row_debug_checksums); + global_stats.covered_secondary_key_lookups.inc(); } else { rc = get_row_by_rowid(buf, m_pk_packed_tuple, size); } @@ -6898,6 +7207,8 @@ int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward, const Rdb_key_def &kd = *m_key_descr_arr[active_index]; while (1) { + rocksdb_skip_expired_records(kd, m_scan_it, !move_forward); + if (!m_scan_it->Valid()) { table->status = STATUS_NOT_FOUND; return HA_ERR_END_OF_FILE; @@ -6927,10 +7238,7 @@ int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward, const enum icp_result icp_status = check_index_cond(); if (icp_status == ICP_NO_MATCH) { - if (move_forward) - m_scan_it->Next(); - else - m_scan_it->Prev(); + rocksdb_smart_next(!move_forward, m_scan_it); continue; /* Get the next (or prev) index tuple */ } else if (icp_status == ICP_OUT_OF_RANGE) { /* We have walked out of range we are scanning */ @@ -7263,7 +7571,8 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid, if (found) { /* If we found the record, but it's expired, pretend we didn't find it. */ if (!skip_ttl_check && m_pk_descr->has_ttl() && - should_hide_ttl_rec(rocksdb::Slice(&m_retrieved_record.front(), + should_hide_ttl_rec(*m_pk_descr, + rocksdb::Slice(&m_retrieved_record.front(), m_retrieved_record.size()), tx->m_snapshot_timestamp)) { DBUG_RETURN(HA_ERR_KEY_NOT_FOUND); @@ -7344,6 +7653,8 @@ int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) { else m_scan_it->Prev(); } + rocksdb_skip_expired_records(*m_key_descr_arr[active_index], m_scan_it, + !move_forward); rc = find_icp_matching_index_rec(move_forward, buf); if (!rc) rc = secondary_index_read(active_index, buf); @@ -7696,6 +8007,12 @@ bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg, is_hidden_pk(index, table_arg, tbl_def_arg); } +uint ha_rocksdb::max_supported_key_part_length() const { + DBUG_ENTER_FUNC(); + DBUG_RETURN(rocksdb_large_prefix ? MAX_INDEX_COL_LEN_LARGE + : MAX_INDEX_COL_LEN_SMALL); +} + const char *ha_rocksdb::get_key_name(const uint index, const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { @@ -8075,7 +8392,8 @@ int ha_rocksdb::check_uniqueness_and_lock( m_retrieved_record by check_and_lock_unique_pk(). */ if (is_pk(key_id, table, m_tbl_def) && found && m_pk_descr->has_ttl() && - should_hide_ttl_rec(rocksdb::Slice(&m_retrieved_record.front(), + should_hide_ttl_rec(*m_pk_descr, + rocksdb::Slice(&m_retrieved_record.front(), m_retrieved_record.size()), (row_info.tx->m_snapshot_timestamp ? row_info.tx->m_snapshot_timestamp @@ -8123,48 +8441,112 @@ int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg, int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd, const rocksdb::Slice &key, - const rocksdb::Slice &value) { - rocksdb::ColumnFamilyHandle *const cf = kd.get_cf(); + const rocksdb::Slice &value, bool sort) { + DBUG_ENTER_FUNC(); + + rocksdb::ColumnFamilyHandle *cf = kd.get_cf(); DBUG_ASSERT(cf != nullptr); - if (m_sst_info == nullptr) { - m_sst_info = std::make_shared( - rdb, m_table_handler->m_table_name, kd.get_name(), cf, - *rocksdb_db_options, THDVAR(ha_thd(), trace_sst_api)); - tx->start_bulk_load(this); - m_bulk_load_tx = tx; + int res = HA_EXIT_SUCCESS; + + if (sort) { + GL_INDEX_ID kd_gl_id = kd.get_gl_index_id(); + auto it = m_key_merge.find(kd_gl_id); + if (it == m_key_merge.end()) { + m_key_merge.emplace( + std::piecewise_construct, std::make_tuple(kd_gl_id), + std::make_tuple( + thd_rocksdb_tmpdir(), THDVAR(ha_thd(), merge_buf_size), + THDVAR(ha_thd(), merge_combine_read_size), + THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms), cf)); + it = m_key_merge.find(kd_gl_id); + if ((res = it->second.init()) != 0) { + DBUG_RETURN(res); + } + + if (m_bulk_load_tx == nullptr) { + tx->start_bulk_load(this); + m_bulk_load_tx = tx; + } + } + res = it->second.add(key, value); + } else { + if (!m_sst_info) { + m_sst_info.reset(new Rdb_sst_info(rdb, m_table_handler->m_table_name, + kd.get_name(), cf, *rocksdb_db_options, + THDVAR(ha_thd(), trace_sst_api))); + tx->start_bulk_load(this); + m_bulk_load_tx = tx; + } + + DBUG_ASSERT(m_sst_info); + + res = m_sst_info->put(key, value); } - DBUG_ASSERT(m_sst_info != nullptr); - - return m_sst_info->put(key, value); + DBUG_RETURN(res); } int ha_rocksdb::finalize_bulk_load() { - int rc = 0; + DBUG_ENTER_FUNC(); + + DBUG_ASSERT_IMP(!m_key_merge.empty() || m_sst_info, + m_bulk_load_tx != nullptr); /* Skip if there are no possible ongoing bulk loads */ - if (m_sst_info == nullptr && m_bulk_load_tx == nullptr) { - return rc; + if (m_key_merge.empty() && !m_sst_info && m_bulk_load_tx == nullptr) { + DBUG_RETURN(HA_EXIT_SUCCESS); } + int res = HA_EXIT_SUCCESS; + RDB_MUTEX_LOCK_CHECK(m_bulk_load_mutex); - /* - We need this check because it's possible that m_sst_info has been - flushed and cleared by another thread by the time the mutex has been - acquired. - */ - if (m_sst_info != nullptr) { - rc = m_sst_info->commit(); - m_sst_info = nullptr; + if (m_sst_info) { + res = m_sst_info->commit(); + m_sst_info.reset(); + } + + if (!m_key_merge.empty()) { + rocksdb::Slice merge_key; + rocksdb::Slice merge_val; + for (auto it = m_key_merge.begin(); it != m_key_merge.end(); it++) { + const std::string &index_name = + ddl_manager.safe_find(it->first)->get_name(); + Rdb_index_merge &rdb_merge = it->second; + Rdb_sst_info sst_info(rdb, m_table_handler->m_table_name, index_name, + rdb_merge.get_cf(), *rocksdb_db_options, + THDVAR(ha_thd(), trace_sst_api)); + + while ((res = rdb_merge.next(&merge_key, &merge_val)) == 0) { + if ((res = sst_info.put(merge_key, merge_val)) != 0) { + break; + } + } + // res == -1 => finished ok; res > 0 => error + if (res <= 0) { + if ((res = sst_info.commit()) != 0) { + break; + } + } + } + m_key_merge.clear(); + + /* + Explicitly tell jemalloc to clean up any unused dirty pages at this point. + See https://reviews.facebook.net/D63723 for more details. + */ + purge_all_jemalloc_arenas(); + } + + if (m_bulk_load_tx != nullptr) { m_bulk_load_tx->end_bulk_load(this); m_bulk_load_tx = nullptr; } RDB_MUTEX_UNLOCK_CHECK(m_bulk_load_mutex); - return rc; + DBUG_RETURN(res); } int ha_rocksdb::update_pk(const Rdb_key_def &kd, @@ -8172,6 +8554,7 @@ int ha_rocksdb::update_pk(const Rdb_key_def &kd, const bool &pk_changed) { const uint key_id = kd.get_keyno(); const bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def); + ulonglong bytes_written = 0; /* If the PK has changed, or if this PK uses single deletes and this is an @@ -8186,6 +8569,8 @@ int ha_rocksdb::update_pk(const Rdb_key_def &kd, if (!s.ok()) { return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def, m_table_handler); + } else { + bytes_written = row_info.old_pk_slice.size(); } } @@ -8206,7 +8591,8 @@ int ha_rocksdb::update_pk(const Rdb_key_def &kd, /* Write the primary key directly to an SST file using an SstFileWriter */ - rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice); + rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice, + THDVAR(table->in_use, bulk_load_allow_unsorted)); } else if (row_info.skip_unique_check || row_info.tx->m_ddl_transaction) { /* It is responsibility of the user to make sure that the data being @@ -8228,6 +8614,10 @@ int ha_rocksdb::update_pk(const Rdb_key_def &kd, } } + if (rc == HA_EXIT_SUCCESS) { + row_info.tx->update_bytes_written( + bytes_written + row_info.new_pk_slice.size() + value_slice.size()); + } return rc; } @@ -8241,24 +8631,31 @@ int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, rocksdb::Slice old_key_slice; const uint key_id = kd.get_keyno(); + + ulonglong bytes_written = 0; + /* - Can skip updating this key if none of the key fields have changed. + Can skip updating this key if none of the key fields have changed and, if + this table has TTL, the TTL timestamp has not changed. */ - if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) { + if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id) && + (!kd.has_ttl() || !m_ttl_bytes_updated)) { return HA_EXIT_SUCCESS; } const bool store_row_debug_checksums = should_store_row_debug_checksums(); - new_packed_size = kd.pack_record( - table_arg, m_pack_buffer, row_info.new_data, m_sk_packed_tuple, - &m_sk_tails, store_row_debug_checksums, row_info.hidden_pk_id); + new_packed_size = + kd.pack_record(table_arg, m_pack_buffer, row_info.new_data, + m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums, + row_info.hidden_pk_id, 0, nullptr, nullptr, m_ttl_bytes); if (row_info.old_data != nullptr) { // The old value old_packed_size = kd.pack_record( table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old, - &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id); + &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0, + nullptr, nullptr, m_ttl_bytes); /* Check if we are going to write the same value. This can happen when @@ -8276,6 +8673,7 @@ int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, */ if (old_packed_size == new_packed_size && m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() && + !(kd.has_ttl() && m_ttl_bytes_updated) && memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple, old_packed_size) == 0 && memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(), @@ -8294,6 +8692,8 @@ int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(), old_key_slice); + + bytes_written = old_key_slice.size(); } new_key_slice = rocksdb::Slice( @@ -8305,6 +8705,9 @@ int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice, new_value_slice); + row_info.tx->update_bytes_written(bytes_written + new_key_slice.size() + + new_value_slice.size()); + return HA_EXIT_SUCCESS; } @@ -8312,14 +8715,19 @@ int ha_rocksdb::update_indexes(const struct update_row_info &row_info, const bool &pk_changed) { int rc; + // The PK must be updated first to pull out the TTL value. + rc = update_pk(*m_pk_descr, row_info, pk_changed); + if (rc != HA_EXIT_SUCCESS) { + return rc; + } + + // Update the remaining indexes. for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) { - const Rdb_key_def &kd = *m_key_descr_arr[key_id]; if (is_pk(key_id, table, m_tbl_def)) { - rc = update_pk(kd, row_info, pk_changed); - } else { - rc = update_sk(table, kd, row_info); + continue; } + rc = update_sk(table, *m_key_descr_arr[key_id], row_info); if (rc != HA_EXIT_SUCCESS) { return rc; } @@ -8597,7 +9005,8 @@ int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) { DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete"); if (m_pk_descr->has_ttl() && - should_hide_ttl_rec(m_scan_it->value(), tx->m_snapshot_timestamp)) { + should_hide_ttl_rec(*m_pk_descr, m_scan_it->value(), + tx->m_snapshot_timestamp)) { continue; } @@ -8628,7 +9037,8 @@ int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) { if (m_pk_descr->has_ttl() && should_hide_ttl_rec( - value, get_or_create_tx(table->in_use)->m_snapshot_timestamp)) { + *m_pk_descr, value, + get_or_create_tx(table->in_use)->m_snapshot_timestamp)) { continue; } @@ -8670,6 +9080,10 @@ int ha_rocksdb::index_init(uint idx, bool sorted) { setup_read_decoders(); + if (!m_keyread_only) { + m_key_descr_arr[idx]->get_lookup_bitmap(table, &m_lookup_bitmap); + } + // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update // when accessing the index, so don't acquire the snapshot right away. // Otherwise acquire the snapshot immediately. @@ -8689,6 +9103,8 @@ int ha_rocksdb::index_end() { release_scan_iterator(); + bitmap_free(&m_lookup_bitmap); + active_index = MAX_KEY; in_range_check_pushed_down = FALSE; @@ -8733,6 +9149,7 @@ int ha_rocksdb::delete_row(const uchar *const buf) { rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length()); Rdb_transaction *const tx = get_or_create_tx(table->in_use); + ulonglong bytes_written = 0; const uint index = pk_index(table, m_tbl_def); rocksdb::Status s = @@ -8740,6 +9157,8 @@ int ha_rocksdb::delete_row(const uchar *const buf) { if (!s.ok()) { DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def, m_table_handler)); + } else { + bytes_written = key_slice.size(); } longlong hidden_pk_id = 0; @@ -8762,6 +9181,7 @@ int ha_rocksdb::delete_row(const uchar *const buf) { /* Deleting on secondary key doesn't need any locks: */ tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(), secondary_key_slice); + bytes_written += secondary_key_slice.size(); } } @@ -8772,6 +9192,7 @@ int ha_rocksdb::delete_row(const uchar *const buf) { } stats.rows_deleted++; update_row_stats(ROWS_DELETED); + tx->update_bytes_written(bytes_written); DBUG_RETURN(HA_EXIT_SUCCESS); } @@ -8860,13 +9281,37 @@ int ha_rocksdb::info(uint flag) { stats.data_file_length+= sz; } - // Second, compute memtable stats - uint64_t memtableCount; - uint64_t memtableSize; - rdb->GetApproximateMemTableStats(m_pk_descr->get_cf(), r, - &memtableCount, &memtableSize); - stats.records += memtableCount; - stats.data_file_length += memtableSize; + // Second, compute memtable stats. This call is expensive, so cache + // values computed for some time. + uint64_t cachetime = rocksdb_force_compute_memtable_stats_cachetime; + uint64_t time = (cachetime == 0) ? 0 : my_micro_time(); + if (cachetime == 0 || + time > m_table_handler->m_mtcache_last_update + cachetime) { + uint64_t memtableCount; + uint64_t memtableSize; + + rdb->GetApproximateMemTableStats(m_pk_descr->get_cf(), r, + &memtableCount, &memtableSize); + + // Atomically update all of these fields at the same time + if (cachetime > 0) { + if (m_table_handler->m_mtcache_lock.fetch_add( + 1, std::memory_order_acquire) == 0) { + m_table_handler->m_mtcache_count = memtableCount; + m_table_handler->m_mtcache_size = memtableSize; + m_table_handler->m_mtcache_last_update = time; + } + m_table_handler->m_mtcache_lock.fetch_sub(1, + std::memory_order_release); + } + + stats.records += memtableCount; + stats.data_file_length += memtableSize; + } else { + // Cached data is still valid, so use it instead + stats.records += m_table_handler->m_mtcache_count; + stats.data_file_length += m_table_handler->m_mtcache_size; + } if (rocksdb_debug_optimizer_n_rows > 0) stats.records = rocksdb_debug_optimizer_n_rows; @@ -9386,10 +9831,6 @@ void Rdb_drop_index_thread::run() { uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0, is_reverse_cf ? 0 : 1); - rocksdb::CompactRangeOptions compact_range_options; - compact_range_options.bottommost_level_compaction = - rocksdb::BottommostLevelCompaction::kForce; - compact_range_options.exclusive_manual_compaction = false; rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh, &range.start, &range.limit); if (!status.ok()) { @@ -9398,7 +9839,7 @@ void Rdb_drop_index_thread::run() { } rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD); } - status = rdb->CompactRange(compact_range_options, cfh, &range.start, + status = rdb->CompactRange(getCompactRangeOptions(), cfh, &range.start, &range.limit); if (!status.ok()) { if (status.IsShutdownInProgress()) { @@ -9485,6 +9926,7 @@ int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) { char key_buf[MAX_KEY_LENGTH]; uint key_len; + ulonglong bytes_written = 0; /* Remove all records in each index. (This is is not crash-safe, but it doesn't matter, because bulk row @@ -9516,11 +9958,13 @@ int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) { return tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def, m_table_handler); } - + bytes_written += key.size(); it->Next(); } } + tx->update_bytes_written(bytes_written); + return HA_EXIT_SUCCESS; } @@ -9569,6 +10013,8 @@ int ha_rocksdb::rename_table(const char *const from, const char *const to) { DBUG_RETURN(-1); } + DBUG_EXECUTE_IF("gen_sql_table_name", to_str = to_str + "#sql-test";); + const std::unique_ptr wb = dict_manager.begin(); rocksdb::WriteBatch *const batch = wb.get(); dict_manager.lock(); @@ -9773,7 +10219,7 @@ int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) { for (uint i = 0; i < table->s->keys; i++) { uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; auto range = get_range(i, buf); - const rocksdb::Status s = rdb->CompactRange(rocksdb::CompactRangeOptions(), + const rocksdb::Status s = rdb->CompactRange(getCompactRangeOptions(), m_key_descr_arr[i]->get_cf(), &range.start, &range.limit); if (!s.ok()) { @@ -10311,7 +10757,6 @@ int ha_rocksdb::inplace_populate_sk( const std::unordered_set> &indexes) { DBUG_ENTER_FUNC(); int res = HA_EXIT_SUCCESS; - const std::unique_ptr wb = dict_manager.begin(); rocksdb::WriteBatch *const batch = wb.get(); @@ -10367,7 +10812,7 @@ int ha_rocksdb::inplace_populate_sk( open handlers have been closed at this point, and the one we're on is the only one left. */ - if (m_sst_info != nullptr) { + if (m_sst_info) { if ((res = finalize_bulk_load())) { DBUG_RETURN(res); } @@ -10377,14 +10822,16 @@ int ha_rocksdb::inplace_populate_sk( const ulonglong rdb_merge_buf_size = THDVAR(ha_thd(), merge_buf_size); const ulonglong rdb_merge_combine_read_size = THDVAR(ha_thd(), merge_combine_read_size); + const ulonglong rdb_merge_tmp_file_removal_delay = + THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms); for (const auto &index : indexes) { - const rocksdb::Comparator *index_comp = index->get_cf()->GetComparator(); bool is_unique_index = new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME; - Rdb_index_merge rdb_merge(thd_rocksdb_tmpdir(), rdb_merge_buf_size, - rdb_merge_combine_read_size, index_comp); + Rdb_index_merge rdb_merge( + thd_rocksdb_tmpdir(), rdb_merge_buf_size, rdb_merge_combine_read_size, + rdb_merge_tmp_file_removal_delay, index->get_cf()); if ((res = rdb_merge.init())) { DBUG_RETURN(res); @@ -10413,7 +10860,8 @@ int ha_rocksdb::inplace_populate_sk( /* Create new secondary index entry */ const int new_packed_size = index->pack_record( new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple, - &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id); + &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0, + nullptr, nullptr, m_ttl_bytes); const rocksdb::Slice key = rocksdb::Slice( reinterpret_cast(m_sk_packed_tuple), new_packed_size); @@ -10478,7 +10926,7 @@ int ha_rocksdb::inplace_populate_sk( /* Insert key and slice to SST via SSTFileWriter API. */ - if ((res = bulk_load_key(tx, *index, merge_key, merge_val))) { + if ((res = bulk_load_key(tx, *index, merge_key, merge_val, false))) { break; } } @@ -10831,6 +11279,9 @@ static void myrocks_update_status() { export_stats.queries_point = global_stats.queries[QUERIES_POINT]; export_stats.queries_range = global_stats.queries[QUERIES_RANGE]; + + export_stats.covered_secondary_key_lookups = + global_stats.covered_secondary_key_lookups; } static void myrocks_update_memory_status() { @@ -10874,6 +11325,9 @@ static SHOW_VAR myrocks_status_variables[] = { SHOW_LONGLONG), DEF_STATUS_VAR_FUNC("queries_range", &export_stats.queries_range, SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("covered_secondary_key_lookups", + &export_stats.covered_secondary_key_lookups, + SHOW_LONGLONG), {NullS, NullS, SHOW_LONG}}; @@ -10884,6 +11338,91 @@ static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) { var->value = reinterpret_cast(&myrocks_status_variables); } +static ulonglong +io_stall_prop_value(const std::map &props, + const std::string &key) { + std::map::const_iterator iter = + props.find("io_stalls." + key); + if (iter != props.end()) { + return std::stoull(iter->second); + } else { + DBUG_PRINT("warning", + ("RocksDB GetMapPropery hasn't returned key=%s", key.c_str())); + DBUG_ASSERT(0); + return 0; + } +} + +static void update_rocksdb_stall_status() { + st_io_stall_stats local_io_stall_stats; + for (const auto &cf_name : cf_manager.get_cf_names()) { + rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name); + if (cfh == nullptr) { + continue; + } + + std::map props; + if (!rdb->GetMapProperty(cfh, "rocksdb.cfstats", &props)) { + continue; + } + + local_io_stall_stats.level0_slowdown += + io_stall_prop_value(props, "level0_slowdown"); + local_io_stall_stats.level0_slowdown_with_compaction += + io_stall_prop_value(props, "level0_slowdown_with_compaction"); + local_io_stall_stats.level0_numfiles += + io_stall_prop_value(props, "level0_numfiles"); + local_io_stall_stats.level0_numfiles_with_compaction += + io_stall_prop_value(props, "level0_numfiles_with_compaction"); + local_io_stall_stats.stop_for_pending_compaction_bytes += + io_stall_prop_value(props, "stop_for_pending_compaction_bytes"); + local_io_stall_stats.slowdown_for_pending_compaction_bytes += + io_stall_prop_value(props, "slowdown_for_pending_compaction_bytes"); + local_io_stall_stats.memtable_compaction += + io_stall_prop_value(props, "memtable_compaction"); + local_io_stall_stats.memtable_slowdown += + io_stall_prop_value(props, "memtable_slowdown"); + local_io_stall_stats.total_stop += io_stall_prop_value(props, "total_stop"); + local_io_stall_stats.total_slowdown += + io_stall_prop_value(props, "total_slowdown"); + } + io_stall_stats = local_io_stall_stats; +} + +static SHOW_VAR rocksdb_stall_status_variables[] = { + DEF_STATUS_VAR_FUNC("l0_file_count_limit_slowdowns", + &io_stall_stats.level0_slowdown, SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_slowdowns", + &io_stall_stats.level0_slowdown_with_compaction, + SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("l0_file_count_limit_stops", + &io_stall_stats.level0_numfiles, SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_stops", + &io_stall_stats.level0_numfiles_with_compaction, + SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("pending_compaction_limit_stops", + &io_stall_stats.stop_for_pending_compaction_bytes, + SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("pending_compaction_limit_slowdowns", + &io_stall_stats.slowdown_for_pending_compaction_bytes, + SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("memtable_limit_stops", + &io_stall_stats.memtable_compaction, SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("memtable_limit_slowdowns", + &io_stall_stats.memtable_slowdown, SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("total_stops", &io_stall_stats.total_stop, + SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("total_slowdowns", &io_stall_stats.total_slowdown, + SHOW_LONGLONG), + // end of the array marker + {NullS, NullS, SHOW_LONG}}; + +static void show_rocksdb_stall_vars(THD *thd, SHOW_VAR *var, char *buff) { + update_rocksdb_stall_status(); + var->type = SHOW_ARRAY; + var->value = reinterpret_cast(&rocksdb_stall_status_variables); +} + static SHOW_VAR rocksdb_status_vars[] = { DEF_STATUS_VAR(block_cache_miss), DEF_STATUS_VAR(block_cache_hit), @@ -10951,7 +11490,12 @@ static SHOW_VAR rocksdb_status_vars[] = { SHOW_LONGLONG), DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other, SHOW_LONGLONG), + // the variables generated by SHOW_FUNC are sorted only by prefix (first + // arg in the tuple below), so make sure it is unique to make sorting + // deterministic as quick sort is not stable {"rocksdb", reinterpret_cast(&show_myrocks_vars), SHOW_FUNC}, + {"rocksdb_stall", reinterpret_cast(&show_rocksdb_stall_vars), + SHOW_FUNC}, {NullS, NullS, SHOW_LONG}}; /* @@ -10998,10 +11542,13 @@ void Rdb_background_thread::run() { timespec ts; clock_gettime(CLOCK_REALTIME, &ts); - // Flush the WAL. - if (rdb && (rocksdb_flush_log_at_trx_commit == 2)) { + // Flush the WAL. Sync it for both background and never modes to copy + // InnoDB's behavior. For mode never, the wal file isn't even written, + // whereas background writes to the wal file, but issues the syncs in a + // background thread. + if (rdb && (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_SYNC)) { DBUG_ASSERT(!rocksdb_db_options->allow_mmap_writes); - const rocksdb::Status s = rdb->SyncWAL(); + const rocksdb::Status s = rdb->FlushWAL(true); if (!s.ok()) { rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD); } @@ -11101,6 +11648,7 @@ int rdb_dbug_set_ttl_snapshot_ts() { return rocksdb_debug_ttl_snapshot_ts; } int rdb_dbug_set_ttl_read_filter_ts() { return rocksdb_debug_ttl_read_filter_ts; } +bool rdb_dbug_set_ttl_ignore_pk() { return rocksdb_debug_ttl_ignore_pk; } #endif void rdb_update_global_stats(const operation_type &type, uint count, @@ -11322,6 +11870,17 @@ void rocksdb_set_delayed_write_rate(THD *thd, struct st_mysql_sys_var *var, RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); } +void rocksdb_set_max_latest_deadlocks(THD *thd, struct st_mysql_sys_var *var, + void *var_ptr, const void *save) { + RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex); + const uint32_t new_val = *static_cast(save); + if (rocksdb_max_latest_deadlocks != new_val) { + rocksdb_max_latest_deadlocks = new_val; + rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks); + } + RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); +} + void rdb_set_collation_exception_list(const char *const exception_list) { DBUG_ASSERT(rdb_collation_exceptions != nullptr); @@ -11361,6 +11920,18 @@ void rocksdb_set_bulk_load(THD *const thd, struct st_mysql_sys_var *const var *static_cast(var_ptr) = *static_cast(save); } +void rocksdb_set_bulk_load_allow_unsorted( + THD *const thd, + struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), + void *const var_ptr, const void *const save) { + if (THDVAR(thd, bulk_load)) { + my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SET", + "Cannot change this setting while bulk load is enabled"); + } else { + *static_cast(var_ptr) = *static_cast(save); + } +} + static void rocksdb_set_max_background_jobs(THD *thd, struct st_mysql_sys_var *const var, void *const var_ptr, diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index e6d4c2e6485..3b72173f94c 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -22,6 +22,7 @@ /* C++ standard header files */ #include #include +#include #include #include @@ -235,6 +236,12 @@ const char *const RDB_TTL_COL_QUALIFIER = "ttl_col"; */ #define ROCKSDB_SIZEOF_TTL_RECORD sizeof(longlong) +/* + Maximum index prefix length in bytes. +*/ +#define MAX_INDEX_COL_LEN_LARGE 3072 +#define MAX_INDEX_COL_LEN_SMALL 767 + /* MyRocks specific error codes. NB! Please make sure that you will update HA_ERR_ROCKSDB_LAST when adding new ones. Also update the strings in @@ -293,7 +300,14 @@ struct Rdb_table_handler { /* Stores cumulative table statistics */ my_io_perf_atomic_t m_io_perf_read; + my_io_perf_atomic_t m_io_perf_write; Rdb_atomic_perf_counters m_table_perf_context; + + /* Stores cached memtable estimate statistics */ + std::atomic_uint m_mtcache_lock; + uint64_t m_mtcache_count; + uint64_t m_mtcache_size; + uint64_t m_mtcache_last_update; }; class Rdb_key_def; @@ -360,6 +374,8 @@ struct st_global_stats { ib_counter_t system_rows[ROWS_MAX]; ib_counter_t queries[QUERIES_MAX]; + + ib_counter_t covered_secondary_key_lookups; }; /* Struct used for exporting status to MySQL */ @@ -379,6 +395,8 @@ struct st_export_stats { ulonglong queries_point; ulonglong queries_range; + + ulonglong covered_secondary_key_lookups; }; /* Struct used for exporting RocksDB memory status */ @@ -387,6 +405,27 @@ struct st_memory_stats { ulonglong memtable_unflushed; }; +/* Struct used for exporting RocksDB IO stalls stats */ +struct st_io_stall_stats { + ulonglong level0_slowdown; + ulonglong level0_slowdown_with_compaction; + ulonglong level0_numfiles; + ulonglong level0_numfiles_with_compaction; + ulonglong stop_for_pending_compaction_bytes; + ulonglong slowdown_for_pending_compaction_bytes; + ulonglong memtable_compaction; + ulonglong memtable_slowdown; + ulonglong total_stop; + ulonglong total_slowdown; + + st_io_stall_stats() + : level0_slowdown(0), level0_slowdown_with_compaction(0), + level0_numfiles(0), level0_numfiles_with_compaction(0), + stop_for_pending_compaction_bytes(0), + slowdown_for_pending_compaction_bytes(0), memtable_compaction(0), + memtable_slowdown(0), total_stop(0), total_slowdown(0) {} +}; + } // namespace myrocks #include "./rdb_buff.h" @@ -495,6 +534,12 @@ class ha_rocksdb : public my_core::handler { Pointer to the original TTL timestamp value (8 bytes) during UPDATE. */ char m_ttl_bytes[ROCKSDB_SIZEOF_TTL_RECORD]; + /* + The TTL timestamp value can change if the explicit TTL column is + updated. If we detect this when updating the PK, we indicate it here so + we know we must always update any SK's. + */ + bool m_ttl_bytes_updated; /* rowkey of the last record we've read, in StorageFormat. */ String m_last_rowkey; @@ -541,7 +586,9 @@ class ha_rocksdb : public my_core::handler { bool m_update_scope_is_valid; /* SST information used for bulk loading the primary key */ - std::shared_ptr m_sst_info; + std::unique_ptr m_sst_info; + /* External merge sorts for bulk load: key ID -> merge sort instance */ + std::unordered_map m_key_merge; Rdb_transaction *m_bulk_load_tx; /* Mutex to protect finalizing bulk load */ mysql_mutex_t m_bulk_load_mutex; @@ -636,6 +683,13 @@ class ha_rocksdb : public my_core::handler { /* Setup field_decoders based on type of scan and table->read_set */ void setup_read_decoders(); + /* + For the active index, indicates which columns must be covered for the + current lookup to be covered. If the bitmap field is null, that means this + index does not cover the current lookup for any record. + */ + MY_BITMAP m_lookup_bitmap = {nullptr, 0, 0, nullptr, nullptr}; + /* Number of bytes in on-disk (storage) record format that are used for storing SQL NULL flags. @@ -854,11 +908,7 @@ public: DBUG_RETURN(MAX_REF_PARTS); } - uint max_supported_key_part_length() const override { - DBUG_ENTER_FUNC(); - - DBUG_RETURN(2048); - } + uint max_supported_key_part_length() const override; /** @brief unireg.cc will call this to make sure that the storage engine can handle @@ -1040,9 +1090,13 @@ private: rocksdb::Slice *const packed_rec) MY_ATTRIBUTE((__nonnull__)); - bool should_hide_ttl_rec(const rocksdb::Slice &ttl_rec_val, + bool should_hide_ttl_rec(const Rdb_key_def &kd, + const rocksdb::Slice &ttl_rec_val, const int64_t curr_ts) MY_ATTRIBUTE((__warn_unused_result__)); + void rocksdb_skip_expired_records(const Rdb_key_def &kd, + rocksdb::Iterator *const iter, + bool seek_backward); int index_first_intern(uchar *buf) MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); @@ -1076,8 +1130,10 @@ private: struct unique_sk_buf_info *sk_info) MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); int bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd, - const rocksdb::Slice &key, const rocksdb::Slice &value) + const rocksdb::Slice &key, const rocksdb::Slice &value, + bool sort) MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + void update_bytes_written(ulonglong bytes_written); int update_pk(const Rdb_key_def &kd, const struct update_row_info &row_info, const bool &pk_changed) MY_ATTRIBUTE((__warn_unused_result__)); int update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, diff --git a/storage/rocksdb/ha_rocksdb_proto.h b/storage/rocksdb/ha_rocksdb_proto.h index 1beb6b80622..71ec6957456 100644 --- a/storage/rocksdb/ha_rocksdb_proto.h +++ b/storage/rocksdb/ha_rocksdb_proto.h @@ -76,6 +76,7 @@ bool rdb_is_ttl_read_filtering_enabled(); int rdb_dbug_set_ttl_rec_ts(); int rdb_dbug_set_ttl_snapshot_ts(); int rdb_dbug_set_ttl_read_filter_ts(); +bool rdb_dbug_set_ttl_ignore_pk(); #endif enum operation_type : int; diff --git a/storage/rocksdb/mysql-test/rocksdb/include/prefix_index_only_query_check.inc b/storage/rocksdb/mysql-test/rocksdb/include/prefix_index_only_query_check.inc new file mode 100644 index 00000000000..e96eb573c1f --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/include/prefix_index_only_query_check.inc @@ -0,0 +1,21 @@ +# +# A helper include file for prefix index index-only query tests +# +# Parameters: +# $prefix_index_check_title - title of the test +# $prefix_index_check_query - test query +# $prefix_index_check_read_avoided_delta - expected change of +# 'rocksdb_covered_secondary_key_lookups' status variable +# value after running the query + +--let $show_count_statement = show status like 'rocksdb_covered_secondary_key_lookups' + +--echo # $prefix_index_check_title +--let $base_count = query_get_value($show_count_statement, Value, 1) + +--eval $prefix_index_check_query + +--let $count = query_get_value($show_count_statement, Value, 1) +--let $assert_text= $prefix_index_check_title: $prefix_index_check_read_avoided_delta rocksdb_covered_secondary_key_lookups +--let $assert_cond= $count - $base_count = $prefix_index_check_read_avoided_delta +--source include/assert.inc diff --git a/storage/rocksdb/mysql-test/rocksdb/my.cnf b/storage/rocksdb/mysql-test/rocksdb/my.cnf index 2ed68088259..faeae84da5f 100644 --- a/storage/rocksdb/mysql-test/rocksdb/my.cnf +++ b/storage/rocksdb/mysql-test/rocksdb/my.cnf @@ -5,3 +5,4 @@ sql-mode=NO_ENGINE_SUBSTITUTION explicit-defaults-for-timestamp=1 rocksdb_lock_wait_timeout=1 rocksdb_strict_collation_check=0 +rocksdb_force_compute_memtable_stats_cachetime=0 diff --git a/storage/rocksdb/mysql-test/rocksdb/r/2pc_group_commit.result b/storage/rocksdb/mysql-test/rocksdb/r/2pc_group_commit.result index 06452a5437f..7b15ed47d44 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/2pc_group_commit.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/2pc_group_commit.result @@ -4,20 +4,20 @@ CREATE DATABASE mysqlslap; USE mysqlslap; CREATE TABLE t1(id BIGINT AUTO_INCREMENT, value BIGINT, PRIMARY KEY(id)) ENGINE=rocksdb; # 2PC enabled, MyRocks durability enabled -SET GLOBAL rocksdb_enable_2pc=0; +SET GLOBAL rocksdb_enable_2pc=1; SET GLOBAL rocksdb_flush_log_at_trx_commit=1; ## 2PC + durability + single thread select variable_value into @c from information_schema.global_status where variable_name='rocksdb_wal_group_syncs'; select case when variable_value-@c = 1000 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_wal_group_syncs'; case when variable_value-@c = 1000 then 'true' else 'false' end -false +true ## 2PC + durability + group commit select variable_value into @c from information_schema.global_status where variable_name='rocksdb_wal_group_syncs'; select case when variable_value-@c > 0 and variable_value-@c < 10000 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_wal_group_syncs'; case when variable_value-@c > 0 and variable_value-@c < 10000 then 'true' else 'false' end -false +true # 2PC enabled, MyRocks durability disabled -SET GLOBAL rocksdb_enable_2pc=0; +SET GLOBAL rocksdb_enable_2pc=1; SET GLOBAL rocksdb_flush_log_at_trx_commit=0; select variable_value into @c from information_schema.global_status where variable_name='rocksdb_wal_group_syncs'; select case when variable_value-@c = 0 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_wal_group_syncs'; @@ -28,16 +28,16 @@ select case when variable_value-@c = 0 then 'true' else 'false' end from informa case when variable_value-@c = 0 then 'true' else 'false' end true # 2PC disabled, MyRocks durability enabled -SET GLOBAL rocksdb_enable_2pc=1; +SET GLOBAL rocksdb_enable_2pc=0; SET GLOBAL rocksdb_flush_log_at_trx_commit=1; select variable_value into @c from information_schema.global_status where variable_name='rocksdb_wal_group_syncs'; select case when variable_value-@c = 0 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_wal_group_syncs'; case when variable_value-@c = 0 then 'true' else 'false' end -false +true select variable_value into @c from information_schema.global_status where variable_name='rocksdb_wal_group_syncs'; select case when variable_value-@c = 0 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_wal_group_syncs'; case when variable_value-@c = 0 then 'true' else 'false' end -false +true SET GLOBAL rocksdb_enable_2pc=1; SET GLOBAL rocksdb_flush_log_at_trx_commit=1; DROP TABLE t1; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result index f5a2b28ee42..b7b60f26816 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result @@ -284,12 +284,16 @@ DROP TABLE t1; set global rocksdb_bulk_load=1; # Establish connection con1 (user=root) # Switch to connection con1 -show global variables like 'rocksdb_bulk_load'; +show global variables like 'rocksdb_bulk_load%'; Variable_name Value rocksdb_bulk_load ON -show session variables like 'rocksdb_bulk_load'; +rocksdb_bulk_load_allow_unsorted OFF +rocksdb_bulk_load_size 1000 +show session variables like 'rocksdb_bulk_load%'; Variable_name Value rocksdb_bulk_load ON +rocksdb_bulk_load_allow_unsorted OFF +rocksdb_bulk_load_size 1000 CREATE TABLE t1 (i INT, j INT, PRIMARY KEY (i)) ENGINE = ROCKSDB; INSERT INTO t1 VALUES (1,1); # Disconnecting on con1 @@ -327,10 +331,11 @@ SET @prior_rocksdb_strict_collation_check= @@rocksdb_strict_collation_check; SET @prior_rocksdb_merge_buf_size = @@rocksdb_merge_buf_size; SET global rocksdb_strict_collation_check = off; SET session rocksdb_merge_combine_read_size = 566; -SET session rocksdb_merge_buf_size = 336; -show variables like '%rocksdb_bulk_load%'; +SET session rocksdb_merge_buf_size = 340; +show variables like 'rocksdb_bulk_load%'; Variable_name Value rocksdb_bulk_load OFF +rocksdb_bulk_load_allow_unsorted OFF rocksdb_bulk_load_size 1000 CREATE TABLE t1 (a VARCHAR(80)) ENGINE=RocksDB; INSERT INTO t1 (a) VALUES (REPEAT("a", 80)); diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_errors.result b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_errors.result index 31562d1da10..be198d02aaf 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_errors.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_errors.result @@ -16,4 +16,30 @@ INSERT INTO t1 VALUES(20); INSERT INTO t1 VALUES(21); SET rocksdb_bulk_load=0; ERROR HY000: Lost connection to MySQL server during query +TRUNCATE TABLE t1; +SET rocksdb_bulk_load_allow_unsorted=1; +SET rocksdb_bulk_load=1; +INSERT INTO t1 VALUES(100); +INSERT INTO t1 VALUES(101); +INSERT INTO t1 VALUES(99); +SET rocksdb_bulk_load=0; +SELECT * FROM t1; +pk +99 +100 +101 +TRUNCATE TABLE t1; +SET rocksdb_bulk_load=1; +INSERT INTO t1 VALUES(201); +INSERT INTO t1 VALUES(200); +INSERT INTO t1 VALUES(202); +INSERT INTO t1 VALUES(201); +ERROR 23000: Failed to insert the record: the key already exists +SET rocksdb_bulk_load=0; +SELECT * FROM t1; +pk +200 +201 +202 +SET rocksdb_bulk_load_allow_unsorted=DEFAULT; DROP TABLE t1; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted.result b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted.result new file mode 100644 index 00000000000..31509227279 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted.result @@ -0,0 +1,101 @@ +DROP TABLE IF EXISTS t1; +SET rocksdb_bulk_load_size=3; +SET rocksdb_bulk_load_allow_unsorted=1; +CREATE TABLE t1(a INT, b INT, PRIMARY KEY(a) COMMENT "cf1"); +SET rocksdb_bulk_load=1; +SELECT * FROM t1; +a b +SET rocksdb_bulk_load=0; +SELECT * FROM t1; +a b +-3 5 +-1 3 +2 0 +4 -2 +6 -4 +DROP TABLE t1; +CREATE TABLE t1(a INT, b INT, PRIMARY KEY(a) COMMENT "cf1", KEY(b)); +SET rocksdb_bulk_load=1; +SELECT * FROM t1; +a b +6 -4 +4 -2 +2 0 +-1 3 +-3 5 +SET rocksdb_bulk_load=0; +DROP TABLE t1; +CREATE TABLE t1(a INT, b INT, PRIMARY KEY(a) COMMENT "cf1"); +CREATE TABLE t2(a INT, b INT, PRIMARY KEY(a) COMMENT "cf1"); +SET rocksdb_bulk_load=1; +INSERT INTO t1 VALUES (1,1); +INSERT INTO t2 VALUES (1,1); +SELECT * FROM t1; +a b +1 1 +INSERT INTO t1 VALUES (2,2); +SELECT * FROM t2; +a b +1 1 +SELECT * FROM t1; +a b +1 1 +SET rocksdb_bulk_load=0; +SELECT * FROM t1; +a b +1 1 +2 2 +DROP TABLE t1, t2; +CREATE TABLE t1(a INT, b INT, PRIMARY KEY(a) COMMENT "cf1"); +CREATE TABLE t2(a INT, b INT, PRIMARY KEY(b) COMMENT "cf1"); +CREATE TABLE t3(a INT, b INT, PRIMARY KEY(a) COMMENT "cf1") +PARTITION BY KEY() PARTITIONS 4; +set session transaction isolation level repeatable read; +select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; +STAT_TYPE VALUE +DB_NUM_SNAPSHOTS 0 +start transaction with consistent snapshot; +select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; +STAT_TYPE VALUE +DB_NUM_SNAPSHOTS 1 +set rocksdb_bulk_load=1; +set rocksdb_bulk_load_size=100000; +LOAD DATA INFILE INTO TABLE t1; +LOAD DATA INFILE INTO TABLE t2; +LOAD DATA INFILE INTO TABLE t3; +set rocksdb_bulk_load=0; +SHOW TABLE STATUS WHERE name LIKE 't%'; +Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment +t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL +t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL +t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL partitioned +ANALYZE TABLE t1, t2, t3; +Table Op Msg_type Msg_text +test.t1 analyze status OK +test.t2 analyze status OK +test.t3 analyze status OK +SHOW TABLE STATUS WHERE name LIKE 't%'; +Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment +t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL +t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL +t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL partitioned +select count(a) from t1; +count(a) +5000000 +select count(b) from t1; +count(b) +5000000 +select count(a) from t2; +count(a) +5000000 +select count(b) from t2; +count(b) +5000000 +select count(a) from t3; +count(a) +5000000 +select count(b) from t3; +count(b) +5000000 +DROP TABLE t1, t2, t3; +SET rocksdb_bulk_load_allow_unsorted=0; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted_errors.result b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted_errors.result new file mode 100644 index 00000000000..d8e5b92e897 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_unsorted_errors.result @@ -0,0 +1,4 @@ +SET rocksdb_bulk_load=1; +SET rocksdb_bulk_load_allow_unsorted=1; +ERROR HY000: Error when executing command SET: Cannot change this setting while bulk load is enabled +SET rocksdb_bulk_load=0; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bytes_written.result b/storage/rocksdb/mysql-test/rocksdb/r/bytes_written.result new file mode 100644 index 00000000000..d9d29e6ac69 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/bytes_written.result @@ -0,0 +1,10 @@ +DROP TABLE IF EXISTS stats_test_table; +CREATE TABLE stats_test_table (a INT, b INT, PRIMARY KEY (a)) ENGINE=ROCKSDB; +SET GLOBAL rocksdb_perf_context_level=3; +INSERT INTO stats_test_table VALUES (7,1); +INSERT INTO stats_test_table VALUES (2,2); +SELECT io_write_bytes > 0 FROM INFORMATION_SCHEMA.TABLE_STATISTICS WHERE TABLE_NAME = "stats_test_table"; +io_write_bytes > 0 +1 +DROP TABLE stats_test_table; +SET GLOBAL rocksdb_perf_context_level=DEFAULT; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_read_committed.result b/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_read_committed.result index 521edec0c83..7f1e3d8e53f 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_read_committed.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_read_committed.result @@ -5,7 +5,7 @@ connection con1; CREATE TABLE t1 (a INT, pk INT AUTO_INCREMENT PRIMARY KEY) ENGINE=ROCKSDB; SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; START TRANSACTION WITH CONSISTENT SNAPSHOT; -ERROR: 1938 +ERROR: 12048 connection con2; select * from information_schema.rocksdb_dbstats where stat_type='DB_NUM_SNAPSHOTS'; STAT_TYPE VALUE @@ -18,7 +18,7 @@ STAT_TYPE VALUE DB_NUM_SNAPSHOTS 0 connection con1; START TRANSACTION WITH CONSISTENT SNAPSHOT; -ERROR: 1938 +ERROR: 12048 connection con2; INSERT INTO t1 (a) VALUES (1); connection con1; @@ -69,7 +69,7 @@ id value value2 5 5 5 6 6 6 START TRANSACTION WITH CONSISTENT SNAPSHOT; -ERROR: 1938 +ERROR: 12048 connection con2; INSERT INTO r1 values (7,7,7); connection con1; @@ -107,12 +107,12 @@ id value value2 7 7 7 8 8 8 START TRANSACTION WITH CONSISTENT SNAPSHOT; -ERROR: 1938 +ERROR: 12048 connection con2; INSERT INTO r1 values (9,9,9); connection con1; START TRANSACTION WITH CONSISTENT SNAPSHOT; -ERROR: 1938 +ERROR: 12048 connection con2; INSERT INTO r1 values (10,10,10); connection con1; @@ -129,7 +129,7 @@ id value value2 9 9 9 10 10 10 START TRANSACTION WITH CONSISTENT SNAPSHOT; -ERROR: 1938 +ERROR: 12048 INSERT INTO r1 values (11,11,11); ERROR: 0 SELECT * FROM r1; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_repeatable_read.result b/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_repeatable_read.result index 805a0aaa0fd..90723ff762c 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_repeatable_read.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/cons_snapshot_repeatable_read.result @@ -125,7 +125,7 @@ id value value2 START TRANSACTION WITH CONSISTENT SNAPSHOT; ERROR: 0 INSERT INTO r1 values (11,11,11); -ERROR: 1935 +ERROR: 12045 SELECT * FROM r1; id value value2 1 1 1 diff --git a/storage/rocksdb/mysql-test/rocksdb/r/covered_unpack_info_format.result b/storage/rocksdb/mysql-test/rocksdb/r/covered_unpack_info_format.result new file mode 100644 index 00000000000..195215331b8 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/covered_unpack_info_format.result @@ -0,0 +1,73 @@ +DROP TABLE IF EXISTS t1; +CREATE TABLE t1 ( +id INT, +fake_id INT, +bigfield VARCHAR(4096), +PRIMARY KEY (id), +KEY bf (bigfield(32)), +KEY fid (fake_id, bigfield(32)) +) ENGINE=rocksdb; +INSERT INTO t1 VALUES (1, 1001, REPEAT('a', 1)), +(8, 1008, REPEAT('b', 8)), +(24, 1024, REPEAT('c', 24)), +(31, 1031, REPEAT('d', 31)), +(32, 1032, REPEAT('x', 32)), +(33, 1033, REPEAT('y', 33)), +(128, 1128, REPEAT('z', 128)); +SELECT * FROM t1; +id fake_id bigfield +1 1001 a +8 1008 bbbbbbbb +24 1024 cccccccccccccccccccccccc +31 1031 ddddddddddddddddddddddddddddddd +32 1032 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +33 1033 yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy +128 1128 zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz +# Eligible for optimization, access via fake_id only +SELECT id, bigfield FROM t1 FORCE INDEX(fid) WHERE fake_id = 1031; +id bigfield +31 ddddddddddddddddddddddddddddddd +include/assert.inc [Eligible for optimization, access via fake_id only: 2 rocksdb_covered_secondary_key_lookups] +# Not eligible for optimization, access via fake_id of big row. +SELECT id, bigfield FROM t1 FORCE INDEX(fid) WHERE fake_id = 1033; +id bigfield +33 yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy +include/assert.inc [Not eligible for optimization, access via fake_id of big row.: 0 rocksdb_covered_secondary_key_lookups] +DROP TABLE t1; +set session debug= '+d,MYROCKS_NO_COVERED_BITMAP_FORMAT'; +CREATE TABLE t1 ( +id INT, +fake_id INT, +bigfield VARCHAR(4096), +PRIMARY KEY (id), +KEY bf (bigfield(32)), +KEY fid (fake_id, bigfield(32)) +) ENGINE=rocksdb; +set session debug= '-d,MYROCKS_NO_COVERED_BITMAP_FORMAT'; +INSERT INTO t1 VALUES (1, 1001, REPEAT('a', 1)), +(8, 1008, REPEAT('b', 8)), +(24, 1024, REPEAT('c', 24)), +(31, 1031, REPEAT('d', 31)), +(32, 1032, REPEAT('x', 32)), +(33, 1033, REPEAT('y', 33)), +(128, 1128, REPEAT('z', 128)); +SELECT * FROM t1; +id fake_id bigfield +1 1001 a +8 1008 bbbbbbbb +24 1024 cccccccccccccccccccccccc +31 1031 ddddddddddddddddddddddddddddddd +32 1032 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +33 1033 yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy +128 1128 zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz +# No longer eligible for optimization since no covered bitmap was stored. +SELECT id, bigfield FROM t1 FORCE INDEX(fid) WHERE fake_id = 1031; +id bigfield +31 ddddddddddddddddddddddddddddddd +include/assert.inc [No longer eligible for optimization since no covered bitmap was stored.: 0 rocksdb_covered_secondary_key_lookups] +# Not eligible for optimization. +SELECT id, bigfield FROM t1 FORCE INDEX(fid) WHERE fake_id = 1033; +id bigfield +33 yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy +include/assert.inc [Not eligible for optimization.: 0 rocksdb_covered_secondary_key_lookups] +DROP TABLE t1; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result b/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result new file mode 100644 index 00000000000..39130475349 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result @@ -0,0 +1,1009 @@ +## +## Using the system variable high_priority_ddl" +## +create user test_user1@localhost; +grant all on test to test_user1@localhost; +create user test_user2@localhost; +grant all on test to test_user2@localhost; + +## Test parameters: +## use_sys_var = 1; +## con_block = con1 +## con_kill = default +## cmd = alter table t1 modify i bigint; +## high_priority_cmd = alter high_priority table t1 modify i bigint; +## should_kill = 1 +## recreate_table = 1 +## throw_error = 1 + +drop table if exists t1; +create table t1 (i int); +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `i` int(11) DEFAULT NULL +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 +insert into t1 values (1), (2), (3); +connection: con1 +lock tables t1 read;; +connection: default +set lock_wait_timeout = 0.02; +set high_priority_lock_wait_timeout = 0.02; +describe t1; +Field Type Null Key Default Extra +i int(11) YES NULL +connection: default (for show processlist) +# both con1 and default exist +show processlist; +Id User Host db Command Time State Info Rows examined Rows sent Tid Srv_Id + root test