From 7dc51833172d4d0a2e7f218a1fa6380d5bee35fe Mon Sep 17 00:00:00 2001 From: Daniel Black Date: Tue, 2 Jan 2018 13:40:05 +1100 Subject: [PATCH 01/35] wsrep_sst_rsync: read correct configuration sections 4bb49d84a9df had a single option, --mysqld to obtain the required innodb_log_group_home_dir. 83664e21e4fb675 enumerated the groups when --mysqld was sufficient. --- scripts/wsrep_sst_rsync.sh | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/scripts/wsrep_sst_rsync.sh b/scripts/wsrep_sst_rsync.sh index c6e8a7ef4e1..de881459975 100644 --- a/scripts/wsrep_sst_rsync.sh +++ b/scripts/wsrep_sst_rsync.sh @@ -102,19 +102,7 @@ fi WSREP_LOG_DIR=${WSREP_LOG_DIR:-""} # if WSREP_LOG_DIR env. variable is not set, try to get it from my.cnf if [ -z "$WSREP_LOG_DIR" ]; then - WSREP_LOG_DIR=$(parse_cnf mariadb-10.0 innodb_log_group_home_dir "") -fi -if [ -z "$WSREP_LOG_DIR" ]; then - WSREP_LOG_DIR=$(parse_cnf mysqld innodb_log_group_home_dir "") -fi -if [ -z "$WSREP_LOG_DIR" ]; then - WSREP_LOG_DIR=$(parse_cnf server innodb_log_group_home_dir "") -fi -if [ -z "$WSREP_LOG_DIR" ]; then - WSREP_LOG_DIR=$(parse_cnf mariadb innodb_log_group_home_dir "") -fi -if [ -z "$WSREP_LOG_DIR" ]; then - WSREP_LOG_DIR=$(parse_cnf mysqld-10.0 innodb_log_group_home_dir "") + WSREP_LOG_DIR=$(parse_cnf --mysqld innodb_log_group_home_dir "") fi if [ -n "$WSREP_LOG_DIR" ]; then From 67f3fb1bc54682a29ee3b407f60b798780db698f Mon Sep 17 00:00:00 2001 From: Daniel Black Date: Tue, 2 Jan 2018 14:26:48 +1100 Subject: [PATCH 02/35] galera_recovery: misses reading default configuration groups --- scripts/galera_recovery.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/galera_recovery.sh b/scripts/galera_recovery.sh index de2e653c497..f6a06a7e8e0 100644 --- a/scripts/galera_recovery.sh +++ b/scripts/galera_recovery.sh @@ -107,8 +107,7 @@ else log "WSREP: mktemp failed" fi -parse_arguments `$print_defaults $cmdline_args --loose-verbose \ - mariadb 
mariadb_safe mysqld mysqld_safe safe_mysqld galera` +parse_arguments `$print_defaults $cmdline_args --loose-verbose --mysqld` # Perform wsrep position recovery if wsrep_on=1, skip otherwise. if [ "$wsrep_on" -eq 1 ]; then From 46d5e1f2fddbb4589a8763508cea8589941a2b2f Mon Sep 17 00:00:00 2001 From: Julius Goryavsky Date: Thu, 26 Jul 2018 15:42:06 +0200 Subject: [PATCH 03/35] MDEV-10754 wsrep_sst_rsync does not support innodb_data_home_dir If innodb_data_home_dir path specified in the configuration file then rsync IST/SST will fail because the wsrep_sst_rsync.sh script does not read this parameter from the configuration file and then tries to find the data files in the default directory. To fix this error, we need to add reading of the innodb_data_home_dir configuration parameter to the rsync-related SST script. --- .../galera/r/galera_sst_rsync_data_dir.result | 288 ++++++++++++++++++ .../galera/t/galera_sst_rsync_data_dir.cnf | 11 + .../galera/t/galera_sst_rsync_data_dir.test | 17 ++ scripts/wsrep_sst_rsync.sh | 9 +- 4 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 mysql-test/suite/galera/r/galera_sst_rsync_data_dir.result create mode 100644 mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf create mode 100644 mysql-test/suite/galera/t/galera_sst_rsync_data_dir.test diff --git a/mysql-test/suite/galera/r/galera_sst_rsync_data_dir.result b/mysql-test/suite/galera/r/galera_sst_rsync_data_dir.result new file mode 100644 index 00000000000..ff85a7d6c0f --- /dev/null +++ b/mysql-test/suite/galera/r/galera_sst_rsync_data_dir.result @@ -0,0 +1,288 @@ +connection node_1; +connection node_2; +Performing State Transfer on a server that has been shut down cleanly and restarted +connection node_1; +CREATE TABLE t1 (f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_committed_before'); +INSERT INTO t1 VALUES ('node1_committed_before'); +INSERT INTO t1 VALUES ('node1_committed_before'); +INSERT INTO t1 
VALUES ('node1_committed_before'); +INSERT INTO t1 VALUES ('node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node2_committed_before'); +INSERT INTO t1 VALUES ('node2_committed_before'); +INSERT INTO t1 VALUES ('node2_committed_before'); +INSERT INTO t1 VALUES ('node2_committed_before'); +INSERT INTO t1 VALUES ('node2_committed_before'); +COMMIT; +Shutting down server ... +connection node_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_committed_during'); +INSERT INTO t1 VALUES ('node1_committed_during'); +INSERT INTO t1 VALUES ('node1_committed_during'); +INSERT INTO t1 VALUES ('node1_committed_during'); +INSERT INTO t1 VALUES ('node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +connect node_1a_galera_st_shutdown_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +connection node_2; +Starting server ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node2_committed_after'); +INSERT INTO t1 VALUES ('node2_committed_after'); +INSERT INTO t1 VALUES ('node2_committed_after'); +INSERT INTO t1 VALUES ('node2_committed_after'); +INSERT INTO t1 VALUES ('node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_committed_after'); +INSERT INTO t1 VALUES ('node1_committed_after'); +INSERT INTO t1 VALUES ('node1_committed_after'); +INSERT INTO t1 VALUES ('node1_committed_after'); +INSERT INTO t1 VALUES ('node1_committed_after'); +COMMIT; +connection node_1a_galera_st_shutdown_slave; +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +ROLLBACK; +SELECT COUNT(*) = 35 FROM t1; +COUNT(*) = 35 +1 +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +SET AUTOCOMMIT=ON; +connection node_1; +SELECT COUNT(*) = 35 FROM t1; +COUNT(*) = 35 +1 +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +SET AUTOCOMMIT=ON; +Performing State Transfer on a server that starts from a clean var directory +This is accomplished by shutting down node #2 and removing its var directory before restarting it +connection node_1; +CREATE TABLE t1 (f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES 
('node1_committed_before'); +INSERT INTO t1 VALUES ('node1_committed_before'); +INSERT INTO t1 VALUES ('node1_committed_before'); +INSERT INTO t1 VALUES ('node1_committed_before'); +INSERT INTO t1 VALUES ('node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node2_committed_before'); +INSERT INTO t1 VALUES ('node2_committed_before'); +INSERT INTO t1 VALUES ('node2_committed_before'); +INSERT INTO t1 VALUES ('node2_committed_before'); +INSERT INTO t1 VALUES ('node2_committed_before'); +COMMIT; +Shutting down server ... +connection node_1; +Cleaning var directory ... +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_committed_during'); +INSERT INTO t1 VALUES ('node1_committed_during'); +INSERT INTO t1 VALUES ('node1_committed_during'); +INSERT INTO t1 VALUES ('node1_committed_during'); +INSERT INTO t1 VALUES ('node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +connect node_1a_galera_st_clean_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +connection node_2; +Starting server ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node2_committed_after'); +INSERT INTO t1 VALUES ('node2_committed_after'); +INSERT INTO t1 VALUES ('node2_committed_after'); +INSERT INTO t1 VALUES ('node2_committed_after'); +INSERT INTO t1 VALUES ('node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_committed_after'); +INSERT INTO t1 VALUES ('node1_committed_after'); +INSERT INTO t1 VALUES ('node1_committed_after'); +INSERT INTO t1 VALUES ('node1_committed_after'); +INSERT INTO t1 VALUES ('node1_committed_after'); +COMMIT; +connection node_1a_galera_st_clean_slave; +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +ROLLBACK; +SELECT COUNT(*) = 35 FROM t1; +COUNT(*) = 35 +1 +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +SET AUTOCOMMIT=ON; +connection node_1; +SELECT COUNT(*) = 35 FROM t1; +COUNT(*) = 35 +1 +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +SET AUTOCOMMIT=ON; +Performing State Transfer on a server that has been killed and restarted +connection node_1; +CREATE TABLE t1 (f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_committed_before'); +INSERT INTO t1 VALUES ('node1_committed_before'); +INSERT INTO t1 VALUES 
('node1_committed_before'); +INSERT INTO t1 VALUES ('node1_committed_before'); +INSERT INTO t1 VALUES ('node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node2_committed_before'); +INSERT INTO t1 VALUES ('node2_committed_before'); +INSERT INTO t1 VALUES ('node2_committed_before'); +INSERT INTO t1 VALUES ('node2_committed_before'); +INSERT INTO t1 VALUES ('node2_committed_before'); +COMMIT; +Killing server ... +connection node_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_committed_during'); +INSERT INTO t1 VALUES ('node1_committed_during'); +INSERT INTO t1 VALUES ('node1_committed_during'); +INSERT INTO t1 VALUES ('node1_committed_during'); +INSERT INTO t1 VALUES ('node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +connect node_1a_galera_st_kill_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +connection node_2; +Performing --wsrep-recover ... +Starting server ... +Using --wsrep-start-position when starting mysqld ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node2_committed_after'); +INSERT INTO t1 VALUES ('node2_committed_after'); +INSERT INTO t1 VALUES ('node2_committed_after'); +INSERT INTO t1 VALUES ('node2_committed_after'); +INSERT INTO t1 VALUES ('node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +INSERT INTO t1 VALUES ('node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES ('node1_committed_after'); +INSERT INTO t1 VALUES ('node1_committed_after'); +INSERT INTO t1 VALUES ('node1_committed_after'); +INSERT INTO t1 VALUES ('node1_committed_after'); +INSERT INTO t1 VALUES ('node1_committed_after'); +COMMIT; +connection node_1a_galera_st_kill_slave; +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); +ROLLBACK; +SELECT COUNT(*) = 35 FROM t1; +COUNT(*) = 35 +1 +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +SET AUTOCOMMIT=ON; +connection node_1; +SELECT COUNT(*) = 35 FROM t1; +COUNT(*) = 35 +1 +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +SET AUTOCOMMIT=ON; diff --git a/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf b/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf new file mode 100644 index 00000000000..d084740c8a6 --- /dev/null +++ b/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf @@ -0,0 +1,11 @@ +!include ../galera_2nodes.cnf + +[mysqld] 
+wsrep_sst_method=rsync +innodb_data_home_dir=@ENV.MYSQL_TMP_DIR/rsync_test + +[mysqld.1] +wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' + +[mysqld.2] +wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' diff --git a/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.test b/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.test new file mode 100644 index 00000000000..6220836c11d --- /dev/null +++ b/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.test @@ -0,0 +1,17 @@ +--source include/big_test.inc +--source include/galera_cluster.inc + +--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + +--source suite/galera/include/galera_st_shutdown_slave.inc +--source suite/galera/include/galera_st_clean_slave.inc + +--source suite/galera/include/galera_st_kill_slave.inc +--source suite/galera/include/galera_st_kill_slave_ddl.inc +--source include/auto_increment_offset_restore.inc + +# cleanup temporary database files: +--remove_files_wildcard $MYSQL_TMP_DIR/rsync_test * +--rmdir $MYSQL_TMP_DIR/rsync_test diff --git a/scripts/wsrep_sst_rsync.sh b/scripts/wsrep_sst_rsync.sh index 81120b66d5a..3659a60832c 100644 --- a/scripts/wsrep_sst_rsync.sh +++ b/scripts/wsrep_sst_rsync.sh @@ -113,6 +113,12 @@ else WSREP_LOG_DIR=$(cd $WSREP_SST_OPT_DATA; pwd -P) fi +INNODB_DATA_HOME_DIR=${INNODB_DATA_HOME_DIR:-""} +# if INNODB_DATA_HOME_DIR env. 
variable is not set, try to get it from my.cnf +if [ -z "INNODB_DATA_HOME_DIR" ]; then + INNODB_DATA_HOME_DIR=$(parse_cnf --mysqld innodb-data-home-dir '') +fi + # Old filter - include everything except selected # FILTER=(--exclude '*.err' --exclude '*.pid' --exclude '*.sock' \ # --exclude '*.conf' --exclude core --exclude 'galera.*' \ @@ -121,7 +127,8 @@ fi # New filter - exclude everything except dirs (schemas) and innodb files FILTER=(-f '- /lost+found' -f '- /.fseventsd' -f '- /.Trashes' - -f '+ /wsrep_sst_binlog.tar' -f '+ /ib_lru_dump' -f '+ /ibdata*' -f '+ /*/' -f '- /*') + -f '+ /wsrep_sst_binlog.tar' -f '+ $INNODB_DATA_HOME_DIR/ib_lru_dump' + -f '+ $INNODB_DATA_HOME_DIR/ibdata*' -f '+ /*/' -f '- /*') if [ "$WSREP_SST_OPT_ROLE" = "donor" ] then From 7e8ed15b95b64af2068a3515fe97f2c245311323 Mon Sep 17 00:00:00 2001 From: Julius Goryavsky Date: Thu, 9 Aug 2018 02:24:12 +0000 Subject: [PATCH 04/35] Fixes after review and correction of the problems caused by the fact that during the SST innodb plugin is not yet initialized, as well as problems with running tests from the root user (not directly related to the MDEV-10754). --- .../galera/include/galera_wsrep_recover.inc | 9 ++++ .../galera/t/galera_sst_rsync_data_dir.cnf | 2 +- .../galera/t/galera_sst_rsync_data_dir.test | 3 +- scripts/wsrep_sst_common.sh | 2 + scripts/wsrep_sst_rsync.sh | 33 ++++++++++-- sql/wsrep_mysqld.h | 1 + sql/wsrep_sst.cc | 50 +++++++++++++++++++ storage/innobase/handler/ha_innodb.cc | 6 +++ 8 files changed, 99 insertions(+), 7 deletions(-) diff --git a/mysql-test/suite/galera/include/galera_wsrep_recover.inc b/mysql-test/suite/galera/include/galera_wsrep_recover.inc index 090ffe5f5df..7f1ed1a875d 100644 --- a/mysql-test/suite/galera/include/galera_wsrep_recover.inc +++ b/mysql-test/suite/galera/include/galera_wsrep_recover.inc @@ -1,5 +1,14 @@ --echo Performing --wsrep-recover ... 
+# +# When mysqld is run by a root user it will fail to start unless +# we specify what user is ran it (using "--user root" option): +# +if ($MYSQL_TEST_ROOT == 1) { +--exec $MYSQLD --defaults-group-suffix=.$galera_wsrep_recover_server_id --defaults-file=$MYSQLTEST_VARDIR/my.cnf --innodb --wsrep-recover --user root > $MYSQL_TMP_DIR/galera_wsrep_recover.log 2>&1 +} +if ($MYSQL_TEST_ROOT != 1) { --exec $MYSQLD --defaults-group-suffix=.$galera_wsrep_recover_server_id --defaults-file=$MYSQLTEST_VARDIR/my.cnf --innodb --wsrep-recover > $MYSQL_TMP_DIR/galera_wsrep_recover.log 2>&1 +} --perl use strict; diff --git a/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf b/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf index d084740c8a6..afe9796a11a 100644 --- a/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf +++ b/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf @@ -2,10 +2,10 @@ [mysqld] wsrep_sst_method=rsync -innodb_data_home_dir=@ENV.MYSQL_TMP_DIR/rsync_test [mysqld.1] wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' [mysqld.2] +innodb_data_home_dir=@ENV.MYSQL_TMP_DIR/rsync_test_2 wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' diff --git a/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.test b/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.test index 6220836c11d..68aa1068f75 100644 --- a/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.test +++ b/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.test @@ -13,5 +13,4 @@ --source include/auto_increment_offset_restore.inc # cleanup temporary database files: ---remove_files_wildcard $MYSQL_TMP_DIR/rsync_test * ---rmdir $MYSQL_TMP_DIR/rsync_test +--remove_files_wildcard $MYSQL_TMP_DIR/rsync_test_2 * diff --git a/scripts/wsrep_sst_common.sh b/scripts/wsrep_sst_common.sh index e160bf5c776..7f1f44d0cdb 100644 --- a/scripts/wsrep_sst_common.sh +++ b/scripts/wsrep_sst_common.sh @@ -27,6 +27,7 @@ 
WSREP_SST_OPT_PSWD=${WSREP_SST_OPT_PSWD:-} WSREP_SST_OPT_DEFAULT="" WSREP_SST_OPT_EXTRA_DEFAULT="" WSREP_SST_OPT_SUFFIX_DEFAULT="" +WSREP_SST_OPT_SUFFIX_VALUE="" while [ $# -gt 0 ]; do case "$1" in @@ -76,6 +77,7 @@ case "$1" in ;; '--defaults-group-suffix') readonly WSREP_SST_OPT_SUFFIX_DEFAULT="$1=$2" + readonly WSREP_SST_OPT_SUFFIX_VALUE="$2" shift ;; '--host') diff --git a/scripts/wsrep_sst_rsync.sh b/scripts/wsrep_sst_rsync.sh index 6d8d5d1168a..0962e59f437 100644 --- a/scripts/wsrep_sst_rsync.sh +++ b/scripts/wsrep_sst_rsync.sh @@ -123,8 +123,16 @@ fi INNODB_DATA_HOME_DIR=${INNODB_DATA_HOME_DIR:-""} # if INNODB_DATA_HOME_DIR env. variable is not set, try to get it from my.cnf -if [ -z "INNODB_DATA_HOME_DIR" ]; then - INNODB_DATA_HOME_DIR=$(parse_cnf --mysqld innodb-data-home-dir '') +if [ -z "$INNODB_DATA_HOME_DIR" ]; then + INNODB_DATA_HOME_DIR=$(parse_cnf mysqld$WSREP_SST_OPT_SUFFIX_VALUE innodb-data-home-dir '') +fi + +if [ -n "$INNODB_DATA_HOME_DIR" ]; then + # handle both relative and absolute paths + INNODB_DATA_HOME_DIR=$(cd $WSREP_SST_OPT_DATA; mkdir -p "$INNODB_DATA_HOME_DIR"; cd $INNODB_DATA_HOME_DIR; pwd -P) +else + # default to datadir + INNODB_DATA_HOME_DIR=$(cd $WSREP_SST_OPT_DATA; pwd -P) fi # Old filter - include everything except selected @@ -135,8 +143,8 @@ fi # New filter - exclude everything except dirs (schemas) and innodb files FILTER=(-f '- /lost+found' -f '- /.fseventsd' -f '- /.Trashes' - -f '+ /wsrep_sst_binlog.tar' -f '+ $INNODB_DATA_HOME_DIR/ib_lru_dump' - -f '+ $INNODB_DATA_HOME_DIR/ibdata*' -f '+ /*/' -f '- /*') + -f '+ /wsrep_sst_binlog.tar' -f '- $INNODB_DATA_HOME_DIR/ib_lru_dump' + -f '- $INNODB_DATA_HOME_DIR/ibdata*' -f '+ /*/' -f '- /*'" SSTKEY=$(parse_cnf sst tkey "") SSTCERT=$(parse_cnf sst tcert "") @@ -242,6 +250,19 @@ EOF exit $RC fi + # Transfer InnoDB data files + rsync ${STUNNEL:+--rsh="$STUNNEL"} \ + --owner --group --perms --links --specials \ + --ignore-times --inplace --dirs --delete --quiet \ + $WHOLE_FILE_OPT 
-f '+ /ibdata*' -f '+ /ib_lru_dump' \ + -f '- **' "$INNODB_DATA_HOME_DIR/" \ + rsync://$WSREP_SST_OPT_ADDR-data_dir >&2 || RC=$? + + if [ $RC -ne 0 ]; then + wsrep_log_error "rsync innodb_data_home_dir returned code $RC:" + exit 255 # unknown error + fi + # second, we transfer InnoDB log files rsync ${STUNNEL:+--rsh="$STUNNEL"} \ --owner --group --perms --links --specials \ @@ -340,8 +361,12 @@ $SILENT path = $WSREP_SST_OPT_DATA [$MODULE-log_dir] path = $WSREP_LOG_DIR +[$MODULE-data_dir] + path = $INNODB_DATA_HOME_DIR EOF +[ "$(whoami)" != root ] || sed -i '/read only = no/s/.*/&\nuid = root\ngid = root\n/' "$RSYNC_CONF" + # rm -rf "$DATA"/ib_logfile* # we don't want old logs around # listen at all interfaces (for firewalled setups) diff --git a/sql/wsrep_mysqld.h b/sql/wsrep_mysqld.h index 272e4ac4984..9fc8aa02df4 100644 --- a/sql/wsrep_mysqld.h +++ b/sql/wsrep_mysqld.h @@ -160,6 +160,7 @@ extern "C" time_t wsrep_thd_query_start(THD *thd); extern "C" query_id_t wsrep_thd_query_id(THD *thd); extern "C" query_id_t wsrep_thd_wsrep_last_query_id(THD *thd); extern "C" void wsrep_thd_set_wsrep_last_query_id(THD *thd, query_id_t id); +extern "C" void wsrep_set_data_home_dir(const char *data_dir); extern void wsrep_close_client_connections(my_bool wait_to_end); extern int wsrep_wait_committing_connections_close(int wait_time); diff --git a/sql/wsrep_sst.cc b/sql/wsrep_sst.cc index 155051bfe84..0274a364bc5 100644 --- a/sql/wsrep_sst.cc +++ b/sql/wsrep_sst.cc @@ -67,6 +67,13 @@ bool wsrep_sst_method_update (sys_var *self, THD* thd, enum_var_type type) return 0; } +static const char* data_home_dir = NULL; + +extern "C" +void wsrep_set_data_home_dir(const char *data_dir) +{ + data_home_dir= (data_dir && *data_dir) ? 
data_dir : NULL; +} static void make_wsrep_defaults_file() { @@ -549,6 +556,29 @@ static int sst_append_auth_env(wsp::env& env, const char* sst_auth) return -env.error(); } +#define DATA_HOME_DIR_ENV "INNODB_DATA_HOME_DIR" + +static int sst_append_data_dir(wsp::env& env, const char* data_dir) +{ + int const data_dir_size= strlen(DATA_HOME_DIR_ENV) + 1 /* = */ + + (data_dir ? strlen(data_dir) : 0) + 1 /* \0 */; + + wsp::string data_dir_str(data_dir_size); // for automatic cleanup on return + if (!data_dir_str()) return -ENOMEM; + + int ret= snprintf(data_dir_str(), data_dir_size, "%s=%s", + DATA_HOME_DIR_ENV, data_dir ? data_dir : ""); + + if (ret < 0 || ret >= data_dir_size) + { + WSREP_ERROR("sst_append_data_dir(): snprintf() failed: %d", ret); + return (ret < 0 ? ret : -EMSGSIZE); + } + + env.append(data_dir_str()); + return -env.error(); +} + static ssize_t sst_prepare_other (const char* method, const char* sst_auth, const char* addr_in, @@ -610,6 +640,16 @@ static ssize_t sst_prepare_other (const char* method, return ret; } + if (data_home_dir) + { + if ((ret= sst_append_data_dir(env, data_home_dir))) + { + WSREP_ERROR("sst_prepare_other(): appending data " + "directory failed: %d", ret); + return ret; + } + } + pthread_t tmp; sst_thread_arg arg(cmd_str(), env()); mysql_mutex_lock (&arg.lock); @@ -1301,6 +1341,16 @@ wsrep_cb_status_t wsrep_sst_donate_cb (void* app_ctx, void* recv_ctx, return WSREP_CB_FAILURE; } + if (data_home_dir) + { + if ((ret= sst_append_data_dir(env, data_home_dir))) + { + WSREP_ERROR("wsrep_sst_donate_cb(): appending data " + "directory failed: %d", ret); + return WSREP_CB_FAILURE; + } + } + if (!strcmp (WSREP_SST_MYSQLDUMP, method)) { ret = sst_donate_mysqldump(data, ¤t_gtid->uuid, uuid_str, diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 5667efea5df..d2701ecfc0b 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -3623,6 +3623,12 @@ 
innobase_init( srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir : default_path); +#ifdef WITH_WSREP + /* If we use the wsrep API, then we need to tell the server + the path to the data files (for passing it to the SST scripts): */ + wsrep_set_data_home_dir(innobase_data_home_dir); +#endif /* WITH_WSREP */ + /* Set default InnoDB data file size to 12 MB and let it be auto-extending. Thus users can use InnoDB in >= 4.0 without having to specify any startup options. */ From 36832711c15cc3208f4d76ff6677488ea4bb11a4 Mon Sep 17 00:00:00 2001 From: Julius Goryavsky Date: Mon, 13 Aug 2018 10:40:47 +0200 Subject: [PATCH 05/35] Reverting changes made to support the mtr under the root --- mysql-test/suite/galera/include/galera_wsrep_recover.inc | 9 --------- scripts/wsrep_sst_rsync.sh | 2 -- 2 files changed, 11 deletions(-) diff --git a/mysql-test/suite/galera/include/galera_wsrep_recover.inc b/mysql-test/suite/galera/include/galera_wsrep_recover.inc index 7f1ed1a875d..090ffe5f5df 100644 --- a/mysql-test/suite/galera/include/galera_wsrep_recover.inc +++ b/mysql-test/suite/galera/include/galera_wsrep_recover.inc @@ -1,14 +1,5 @@ --echo Performing --wsrep-recover ... 
-# -# When mysqld is run by a root user it will fail to start unless -# we specify what user is ran it (using "--user root" option): -# -if ($MYSQL_TEST_ROOT == 1) { ---exec $MYSQLD --defaults-group-suffix=.$galera_wsrep_recover_server_id --defaults-file=$MYSQLTEST_VARDIR/my.cnf --innodb --wsrep-recover --user root > $MYSQL_TMP_DIR/galera_wsrep_recover.log 2>&1 -} -if ($MYSQL_TEST_ROOT != 1) { --exec $MYSQLD --defaults-group-suffix=.$galera_wsrep_recover_server_id --defaults-file=$MYSQLTEST_VARDIR/my.cnf --innodb --wsrep-recover > $MYSQL_TMP_DIR/galera_wsrep_recover.log 2>&1 -} --perl use strict; diff --git a/scripts/wsrep_sst_rsync.sh b/scripts/wsrep_sst_rsync.sh index 0962e59f437..f787262085b 100644 --- a/scripts/wsrep_sst_rsync.sh +++ b/scripts/wsrep_sst_rsync.sh @@ -365,8 +365,6 @@ $SILENT path = $INNODB_DATA_HOME_DIR EOF -[ "$(whoami)" != root ] || sed -i '/read only = no/s/.*/&\nuid = root\ngid = root\n/' "$RSYNC_CONF" - # rm -rf "$DATA"/ib_logfile* # we don't want old logs around # listen at all interfaces (for firewalled setups) From 4ff7f14feffc8cbb685910ecd595095629f7a764 Mon Sep 17 00:00:00 2001 From: Julius Goryavsky Date: Fri, 17 Aug 2018 15:54:55 +0200 Subject: [PATCH 06/35] Fixes of the base patch for compatibility with the 10.1 branch --- .../galera/r/galera_sst_rsync_data_dir.result | 26 ------------------- scripts/wsrep_sst_rsync.sh | 2 +- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/mysql-test/suite/galera/r/galera_sst_rsync_data_dir.result b/mysql-test/suite/galera/r/galera_sst_rsync_data_dir.result index ff85a7d6c0f..cec0f21ee22 100644 --- a/mysql-test/suite/galera/r/galera_sst_rsync_data_dir.result +++ b/mysql-test/suite/galera/r/galera_sst_rsync_data_dir.result @@ -1,7 +1,4 @@ -connection node_1; -connection node_2; Performing State Transfer on a server that has been shut down cleanly and restarted -connection node_1; CREATE TABLE t1 (f1 CHAR(255)) ENGINE=InnoDB; SET AUTOCOMMIT=OFF; START TRANSACTION; @@ -11,7 +8,6 @@ INSERT 
INTO t1 VALUES ('node1_committed_before'); INSERT INTO t1 VALUES ('node1_committed_before'); INSERT INTO t1 VALUES ('node1_committed_before'); COMMIT; -connection node_2; SET AUTOCOMMIT=OFF; START TRANSACTION; INSERT INTO t1 VALUES ('node2_committed_before'); @@ -21,7 +17,6 @@ INSERT INTO t1 VALUES ('node2_committed_before'); INSERT INTO t1 VALUES ('node2_committed_before'); COMMIT; Shutting down server ... -connection node_1; SET AUTOCOMMIT=OFF; START TRANSACTION; INSERT INTO t1 VALUES ('node1_committed_during'); @@ -36,7 +31,6 @@ INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); -connect node_1a_galera_st_shutdown_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; SET AUTOCOMMIT=OFF; START TRANSACTION; INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); @@ -44,7 +38,6 @@ INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); -connection node_2; Starting server ... 
SET AUTOCOMMIT=OFF; START TRANSACTION; @@ -54,7 +47,6 @@ INSERT INTO t1 VALUES ('node2_committed_after'); INSERT INTO t1 VALUES ('node2_committed_after'); INSERT INTO t1 VALUES ('node2_committed_after'); COMMIT; -connection node_1; INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); @@ -69,7 +61,6 @@ INSERT INTO t1 VALUES ('node1_committed_after'); INSERT INTO t1 VALUES ('node1_committed_after'); INSERT INTO t1 VALUES ('node1_committed_after'); COMMIT; -connection node_1a_galera_st_shutdown_slave; INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); @@ -84,7 +75,6 @@ COUNT(*) = 0 1 COMMIT; SET AUTOCOMMIT=ON; -connection node_1; SELECT COUNT(*) = 35 FROM t1; COUNT(*) = 35 1 @@ -96,7 +86,6 @@ COMMIT; SET AUTOCOMMIT=ON; Performing State Transfer on a server that starts from a clean var directory This is accomplished by shutting down node #2 and removing its var directory before restarting it -connection node_1; CREATE TABLE t1 (f1 CHAR(255)) ENGINE=InnoDB; SET AUTOCOMMIT=OFF; START TRANSACTION; @@ -106,7 +95,6 @@ INSERT INTO t1 VALUES ('node1_committed_before'); INSERT INTO t1 VALUES ('node1_committed_before'); INSERT INTO t1 VALUES ('node1_committed_before'); COMMIT; -connection node_2; SET AUTOCOMMIT=OFF; START TRANSACTION; INSERT INTO t1 VALUES ('node2_committed_before'); @@ -116,7 +104,6 @@ INSERT INTO t1 VALUES ('node2_committed_before'); INSERT INTO t1 VALUES ('node2_committed_before'); COMMIT; Shutting down server ... -connection node_1; Cleaning var directory ... 
SET AUTOCOMMIT=OFF; START TRANSACTION; @@ -132,7 +119,6 @@ INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); -connect node_1a_galera_st_clean_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; SET AUTOCOMMIT=OFF; START TRANSACTION; INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); @@ -140,7 +126,6 @@ INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); -connection node_2; Starting server ... SET AUTOCOMMIT=OFF; START TRANSACTION; @@ -150,7 +135,6 @@ INSERT INTO t1 VALUES ('node2_committed_after'); INSERT INTO t1 VALUES ('node2_committed_after'); INSERT INTO t1 VALUES ('node2_committed_after'); COMMIT; -connection node_1; INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); @@ -165,7 +149,6 @@ INSERT INTO t1 VALUES ('node1_committed_after'); INSERT INTO t1 VALUES ('node1_committed_after'); INSERT INTO t1 VALUES ('node1_committed_after'); COMMIT; -connection node_1a_galera_st_clean_slave; INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); @@ -180,7 +163,6 @@ COUNT(*) = 0 1 COMMIT; SET AUTOCOMMIT=ON; -connection node_1; SELECT COUNT(*) = 35 FROM t1; COUNT(*) = 35 1 @@ -191,7 +173,6 @@ DROP TABLE t1; COMMIT; SET AUTOCOMMIT=ON; Performing State Transfer on a server that has been killed and restarted -connection node_1; CREATE TABLE t1 (f1 CHAR(255)) ENGINE=InnoDB; SET AUTOCOMMIT=OFF; START TRANSACTION; @@ -201,7 +182,6 @@ INSERT INTO t1 VALUES ('node1_committed_before'); INSERT INTO t1 VALUES 
('node1_committed_before'); INSERT INTO t1 VALUES ('node1_committed_before'); COMMIT; -connection node_2; SET AUTOCOMMIT=OFF; START TRANSACTION; INSERT INTO t1 VALUES ('node2_committed_before'); @@ -211,7 +191,6 @@ INSERT INTO t1 VALUES ('node2_committed_before'); INSERT INTO t1 VALUES ('node2_committed_before'); COMMIT; Killing server ... -connection node_1; SET AUTOCOMMIT=OFF; START TRANSACTION; INSERT INTO t1 VALUES ('node1_committed_during'); @@ -226,7 +205,6 @@ INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); -connect node_1a_galera_st_kill_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; SET AUTOCOMMIT=OFF; START TRANSACTION; INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); @@ -234,7 +212,6 @@ INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); -connection node_2; Performing --wsrep-recover ... Starting server ... Using --wsrep-start-position when starting mysqld ... 
@@ -246,7 +223,6 @@ INSERT INTO t1 VALUES ('node2_committed_after'); INSERT INTO t1 VALUES ('node2_committed_after'); INSERT INTO t1 VALUES ('node2_committed_after'); COMMIT; -connection node_1; INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); INSERT INTO t1 VALUES ('node1_to_be_committed_after'); @@ -261,7 +237,6 @@ INSERT INTO t1 VALUES ('node1_committed_after'); INSERT INTO t1 VALUES ('node1_committed_after'); INSERT INTO t1 VALUES ('node1_committed_after'); COMMIT; -connection node_1a_galera_st_kill_slave; INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); INSERT INTO t1 VALUES ('node1_to_be_rollbacked_after'); @@ -276,7 +251,6 @@ COUNT(*) = 0 1 COMMIT; SET AUTOCOMMIT=ON; -connection node_1; SELECT COUNT(*) = 35 FROM t1; COUNT(*) = 35 1 diff --git a/scripts/wsrep_sst_rsync.sh b/scripts/wsrep_sst_rsync.sh index f787262085b..77a25ffc530 100644 --- a/scripts/wsrep_sst_rsync.sh +++ b/scripts/wsrep_sst_rsync.sh @@ -144,7 +144,7 @@ fi # New filter - exclude everything except dirs (schemas) and innodb files FILTER=(-f '- /lost+found' -f '- /.fseventsd' -f '- /.Trashes' -f '+ /wsrep_sst_binlog.tar' -f '- $INNODB_DATA_HOME_DIR/ib_lru_dump' - -f '- $INNODB_DATA_HOME_DIR/ibdata*' -f '+ /*/' -f '- /*'" + -f '- $INNODB_DATA_HOME_DIR/ibdata*' -f '+ /*/' -f '- /*') SSTKEY=$(parse_cnf sst tkey "") SSTCERT=$(parse_cnf sst tcert "") From cded083a370f4b23c6d895d44f2948c267ed5e77 Mon Sep 17 00:00:00 2001 From: Monty Date: Mon, 27 Aug 2018 22:00:14 +0300 Subject: [PATCH 07/35] MDEV-15797 Assertion `thd->killed != 0' failed in ha_maria::enable_indexes Problem was that a parallel open of a table, overwrote info->state that was in used by repair. Fixed by changing _ma_tmp_disable_logging_for_table() to use a new state buffer state.no_logging to store the temporary state. 
Other things: - Use original number of rows when retrying repair to get rid of a potential warning "Number of rows changed from X to Y" - Changed maria_commit() to make it easier to merge with 10.4 - If table is not locked (like with show commands), use the global number of rows as the local number may not be up to date. (Minor not critical fix) - Added some missing DBUG_RETURN --- mysql-test/suite/maria/concurrent.result | 28 ++++++++++++++++++++++++ mysql-test/suite/maria/concurrent.test | 28 ++++++++++++++++++++++++ storage/maria/ha_maria.cc | 3 +++ storage/maria/ma_commit.c | 2 +- storage/maria/ma_info.c | 6 ++++- storage/maria/ma_recovery.c | 8 +++++-- storage/maria/maria_chk.c | 4 ++-- storage/maria/maria_def.h | 2 ++ 8 files changed, 75 insertions(+), 6 deletions(-) create mode 100644 mysql-test/suite/maria/concurrent.result create mode 100644 mysql-test/suite/maria/concurrent.test diff --git a/mysql-test/suite/maria/concurrent.result b/mysql-test/suite/maria/concurrent.result new file mode 100644 index 00000000000..5498a6410a2 --- /dev/null +++ b/mysql-test/suite/maria/concurrent.result @@ -0,0 +1,28 @@ +CREATE TABLE t1 (a INT, b CHAR(12), c INT, FULLTEXT KEY(b), KEY (c)) ENGINE=Aria; +CREATE TABLE t2 (a INT, b CHAR(12), c INT) ENGINE=Aria; +INSERT INTO t2 VALUES (1,'foo',8), (2,'bar',9); +INSERT INTO t1 SELECT * FROM t2; +select 1; +1 +1 +select 1; +1 +1 +select 1; +1 +1 +select 1; +1 +1 +select 1; +1 +1 +select 1; +1 +1 +select 1; +1 +1 +SELECT * FROM t1 WHERE a = ( SELECT 1 FROM non_existing_table2 ); +ERROR 42S02: Table 'test.non_existing_table2' doesn't exist +DROP TABLE t1, t2; diff --git a/mysql-test/suite/maria/concurrent.test b/mysql-test/suite/maria/concurrent.test new file mode 100644 index 00000000000..42adb082d40 --- /dev/null +++ b/mysql-test/suite/maria/concurrent.test @@ -0,0 +1,28 @@ +# +# MDEV-15797 Assertion `thd->killed != 0' failed in ha_maria::enable_indexes +# + +CREATE TABLE t1 (a INT, b CHAR(12), c INT, FULLTEXT KEY(b), KEY (c)) 
ENGINE=Aria; +CREATE TABLE t2 (a INT, b CHAR(12), c INT) ENGINE=Aria; +INSERT INTO t2 VALUES (1,'foo',8), (2,'bar',9); + +--connect (con1,localhost,root,,test) +--send + INSERT INTO t1 SELECT * FROM t2; +--connection default +select 1; +select 1; +select 1; +select 1; +select 1; +select 1; +select 1; +--error ER_NO_SUCH_TABLE +SELECT * FROM t1 WHERE a = ( SELECT 1 FROM non_existing_table2 ); +--connection con1 +--reap + +# Cleanup +--disconnect con1 +--connection default +DROP TABLE t1, t2; diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc index eee6b502c47..d438aa90378 100644 --- a/storage/maria/ha_maria.cc +++ b/storage/maria/ha_maria.cc @@ -1460,6 +1460,7 @@ int ha_maria::repair(THD * thd, HA_CHECK_OPT *check_opt) while ((error= repair(thd, param, 0)) && param->retry_repair) { param->retry_repair= 0; + file->state->records= start_records; if (test_all_bits(param->testflag, (uint) (T_RETRY_WITHOUT_QUICK | T_QUICK))) { @@ -1961,6 +1962,7 @@ int ha_maria::disable_indexes(uint mode) int ha_maria::enable_indexes(uint mode) { int error; + ha_rows start_rows= file->state->records; DBUG_PRINT("info", ("ha_maria::enable_indexes mode: %d", mode)); if (maria_is_all_keys_active(file->s->state.key_map, file->s->base.keys)) { @@ -2023,6 +2025,7 @@ int ha_maria::enable_indexes(uint mode) DBUG_ASSERT(thd->killed != 0); /* Repairing by sort failed. Now try standard repair method. 
*/ param->testflag &= ~T_REP_BY_SORT; + file->state->records= start_rows; error= (repair(thd, param, 0) != HA_ADMIN_OK); /* If the standard repair succeeded, clear all error messages which diff --git a/storage/maria/ma_commit.c b/storage/maria/ma_commit.c index 0ae3868dbf6..3850af8246c 100644 --- a/storage/maria/ma_commit.c +++ b/storage/maria/ma_commit.c @@ -98,7 +98,7 @@ int maria_commit(MARIA_HA *info) if (!info->s->now_transactional) return 0; trn= info->trn; - info->trn= 0; /* checked in maria_close() */ + _ma_reset_trn_for_table(info); return ma_commit(trn); } diff --git a/storage/maria/ma_info.c b/storage/maria/ma_info.c index 1189594fd2b..da44da123d2 100644 --- a/storage/maria/ma_info.c +++ b/storage/maria/ma_info.c @@ -56,7 +56,11 @@ int maria_status(MARIA_HA *info, register MARIA_INFO *x, uint flag) } if (flag & HA_STATUS_VARIABLE) { - x->records = info->state->records; + /* If table is locked, give versioned number otherwise last commited */ + if (info->lock_type == F_UNLCK) + x->records = share->state.state.records; + else + x->records = info->state->records; x->deleted = share->state.state.del; x->delete_length = share->state.state.empty; x->data_file_length = share->state.state.data_file_length; diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c index c06e46c104f..f023dde7817 100644 --- a/storage/maria/ma_recovery.c +++ b/storage/maria/ma_recovery.c @@ -3551,8 +3551,8 @@ void _ma_tmp_disable_logging_for_table(MARIA_HA *info, info->state may point to a state that was deleted by _ma_trnman_end_trans_hook() */ - share->state.common= *info->state; - info->state= &share->state.common; + share->state.no_logging= *info->state; + info->state= &share->state.no_logging; info->switched_transactional= TRUE; /* @@ -3608,6 +3608,10 @@ my_bool _ma_reenable_logging_for_table(MARIA_HA *info, my_bool flush_pages) _ma_copy_nontrans_state_information(info); _ma_reset_history(info->s); + /* Reset state to point to state.common, as on open() */ + 
info->state= &share->state.common; + *info->state= share->state.state; + if (flush_pages) { /* Ensure that recover is not executing any redo before this */ diff --git a/storage/maria/maria_chk.c b/storage/maria/maria_chk.c index b47f1b8c824..d03c50891df 100644 --- a/storage/maria/maria_chk.c +++ b/storage/maria/maria_chk.c @@ -1128,7 +1128,7 @@ static int maria_chk(HA_CHECK *param, char *filename) { fprintf(stderr, "Aria table '%s' is not fixed because of errors\n", filename); - return(-1); + DBUG_RETURN(-1); } recreate=1; if (!(param->testflag & T_REP_ANY)) @@ -1150,7 +1150,7 @@ static int maria_chk(HA_CHECK *param, char *filename) param->total_deleted+=info->state->del; descript(param, info, filename); maria_close(info); /* Should always succeed */ - return(0); + DBUG_RETURN(0); } if (!stopwords_inited++) diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h index 8262e1b5e2d..128e78da32b 100644 --- a/storage/maria/maria_def.h +++ b/storage/maria/maria_def.h @@ -149,6 +149,8 @@ typedef struct st_maria_state_info MARIA_STATUS_INFO state; /* maria_ha->state points here for crash-safe but not versioned tables */ MARIA_STATUS_INFO common; + /* State for a versioned table that is temporary non versioned */ + MARIA_STATUS_INFO no_logging; ha_rows split; /* number of split blocks */ my_off_t dellink; /* Link to next removed block */ pgcache_page_no_t first_bitmap_with_space; From faa4d8f8c6c12c09541c4733773ac700c5e643cf Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Tue, 28 Aug 2018 08:23:44 +0000 Subject: [PATCH 08/35] Copy of commit de1e8c7bfe7c875ea284b55040e8f3cd3a56fcc2 Author: Abhinav Sharma Date: Thu Aug 23 14:34:39 2018 -0700 Log updates to semi-sync whitelist in the error log Summary: Plugin variable changes are not logged in the error log even when log_global_var_changes is enabled. Logging updates to whitelist will help in debugging. 
Reviewed By: guokeno0 Differential Revision: D9483807 fbshipit-source-id: e111cda773d --- storage/rocksdb/CMakeLists.txt | 23 +- storage/rocksdb/ha_rocksdb.cc | 1329 +++++++++++++---- storage/rocksdb/ha_rocksdb.h | 47 +- storage/rocksdb/ha_rocksdb_proto.h | 7 +- .../rocksdb/mysql-test/rocksdb/combinations | 6 + .../rocksdb/include/have_write_committed.inc | 3 + .../rocksdb/r/add_index_inplace.result | 24 + .../r/add_index_inplace_sstfilewriter.result | 2 +- .../rocksdb/r/allow_no_primary_key.result | 25 + .../mysql-test/rocksdb/r/autoinc_debug.result | 40 +- .../mysql-test/rocksdb/r/autoinc_vars.result | 18 + .../mysql-test/rocksdb/r/bloomfilter5.result | 62 + .../rocksdb/r/bloomfilter_bulk_load.result | 15 + .../mysql-test/rocksdb/r/bulk_load_sk.result | 229 +++ .../mysql-test/rocksdb/r/cardinality.result | 17 +- .../mysql-test/rocksdb/r/collation.result | 20 +- .../mysql-test/rocksdb/r/com_rpc_tx.result | 21 + .../r/create_no_primary_key_table.result | 38 + .../rocksdb/r/ddl_high_priority.result | 71 +- .../rocksdb/r/deadlock_tracking.result | 57 + .../mysql-test/rocksdb/r/drop_table.result | 11 + .../rocksdb/r/explicit_snapshot.result | 265 ++++ .../mysql-test/rocksdb/r/i_s_deadlock.result | 55 +- .../rocksdb/r/index_file_map.result | 3 + .../rocksdb/r/index_merge_rocksdb2.result | 78 +- .../rocksdb/r/information_schema.result | 1 + .../mysql-test/rocksdb/r/issue255.result | 2 +- .../rocksdb/mysql-test/rocksdb/r/lock.result | 15 + .../mysql-test/rocksdb/r/mysqldump.result | 5 +- .../percona_nonflushing_analyze_debug.result | 19 + .../r/prefix_extractor_override.result | 7 +- .../mysql-test/rocksdb/r/read_only_tx.result | 2 +- .../mysql-test/rocksdb/r/rocksdb.result | 24 +- .../rocksdb/r/rocksdb_checksums.result | 6 +- .../r/rocksdb_deadlock_detect_rc.result | 16 +- .../r/rocksdb_deadlock_detect_rr.result | 16 +- .../r/secondary_key_update_lock.result | 11 + .../mysql-test/rocksdb/r/show_engine.result | 53 + .../mysql-test/rocksdb/r/transaction.result | 41 + 
.../mysql-test/rocksdb/r/trx_info_rpl.result | 2 +- .../mysql-test/rocksdb/r/update.result | 8 + .../rocksdb/r/use_direct_reads.result | 18 + .../rocksdb/r/validate_datadic.result | 3 - .../rocksdb/t/add_index_inplace.test | 18 + .../rocksdb/t/add_index_inplace_crash.test | 1 + .../t/add_index_inplace_sstfilewriter.test | 2 +- .../rocksdb/t/allow_no_primary_key.test | 28 + .../mysql-test/rocksdb/t/autoinc_debug.test | 17 +- .../mysql-test/rocksdb/t/autoinc_vars.test | 23 + .../rocksdb/t/bloomfilter5-master.opt | 3 + .../mysql-test/rocksdb/t/bloomfilter5.test | 61 + .../t/bloomfilter_bulk_load-master.opt | 2 + .../rocksdb/t/bloomfilter_bulk_load.test | 35 + .../mysql-test/rocksdb/t/bulk_load_sk.test | 119 ++ .../mysql-test/rocksdb/t/cardinality.test | 21 +- .../t/check_ignore_unknown_options.test | 4 +- .../mysql-test/rocksdb/t/collation.test | 42 +- .../mysql-test/rocksdb/t/com_rpc_tx.cnf | 4 + .../mysql-test/rocksdb/t/com_rpc_tx.test | 87 ++ .../t/create_no_primary_key_table-master.opt | 1 + .../t/create_no_primary_key_table.test | 44 + .../rocksdb/t/deadlock_tracking.test | 76 +- .../mysql-test/rocksdb/t/drop_table.test | 27 + .../rocksdb/t/explicit_snapshot-master.opt | 1 + .../rocksdb/t/explicit_snapshot.test | 263 ++++ .../mysql-test/rocksdb/t/i_s_deadlock.test | 14 +- .../mysql-test/rocksdb/t/index_file_map.test | 3 + .../rocksdb/t/index_merge_rocksdb2-master.opt | 2 +- .../rocksdb/t/information_schema.test | 1 + .../t/insert_optimized_config-master.opt | 8 - .../rocksdb/t/insert_optimized_config.test | 10 + .../mysql-test/rocksdb/t/issue255.test | 2 +- .../rocksdb/mysql-test/rocksdb/t/lock.test | 22 + .../t/percona_nonflushing_analyze_debug.test | 11 + .../rocksdb/t/prefix_extractor_override.test | 12 +- .../rocksdb/mysql-test/rocksdb/t/rocksdb.test | 3 +- .../rocksdb/t/rocksdb_cf_options.test | 1 - .../rocksdb/t/rocksdb_checksums.test | 24 +- .../rocksdb/t/rocksdb_deadlock_detect.inc | 35 +- .../rocksdb/t/secondary_key_update_lock.test | 26 + 
.../mysql-test/rocksdb/t/set_checkpoint.inc | 2 +- .../mysql-test/rocksdb/t/show_engine.test | 13 + .../mysql-test/rocksdb/t/transaction.test | 53 + .../mysql-test/rocksdb/t/trx_info_rpl.test | 4 +- .../rocksdb/mysql-test/rocksdb/t/update.test | 10 + .../rocksdb/t/use_direct_reads.test | 37 + .../rocksdb/t/validate_datadic.test | 27 +- .../mysql-test/rocksdb_rpl/combinations | 8 +- .../r/rpl_missing_columns_sk_update.result | 62 + ...mts_dependency_unique_key_conflicts.result | 44 + .../rocksdb_rpl/r/rpl_rocksdb_snapshot.result | 4 +- .../t/rpl_gtid_crash_safe_wal_corrupt.inc | 2 +- .../t/rpl_missing_columns_sk_update.cnf | 13 + .../t/rpl_missing_columns_sk_update.test | 69 + ...l_mts_dependency_unique_key_conflicts.test | 64 + .../t/rpl_rocksdb_2pc_crash_recover.test | 1 + .../mysql-test/rocksdb_stress/combinations | 5 + .../r/rocksdb_block_cache_size_basic.result | 84 +- .../r/rocksdb_bulk_load_allow_sk_basic.result | 100 ++ ...ommit_time_batch_for_recovery_basic.result | 121 ++ .../r/rocksdb_create_checkpoint_basic.result | 2 +- ...debug_manual_compaction_delay_basic.result | 46 + ...error_on_suboptimal_collation_basic.result | 7 + ...sdb_manual_compaction_threads_basic.result | 93 ++ ...ocksdb_max_manifest_file_size_basic.result | 2 +- ...ocksdb_max_manual_compactions_basic.result | 57 + .../r/rocksdb_max_row_locks_basic.result | 21 + .../r/rocksdb_stats_recalc_rate_basic.result | 53 + ...rocksdb_write_batch_max_bytes_basic.result | 4 +- .../r/rocksdb_write_policy_basic.result | 15 + .../t/rocksdb_block_cache_size_basic.test | 16 +- .../t/rocksdb_bulk_load_allow_sk_basic.test | 18 + ..._commit_time_batch_for_recovery_basic.test | 20 + .../t/rocksdb_create_checkpoint_basic.test | 2 +- ...b_debug_manual_compaction_delay_basic.test | 16 + ...b_error_on_suboptimal_collation_basic.test | 6 + ...cksdb_manual_compaction_threads_basic.test | 17 + .../rocksdb_max_manual_compactions_basic.test | 17 + .../t/rocksdb_max_row_locks_basic.test | 1 + 
.../t/rocksdb_stats_recalc_rate_basic.test | 17 + .../rocksdb_write_batch_max_bytes_basic.test | 4 +- .../t/rocksdb_write_policy_basic.test | 17 + storage/rocksdb/properties_collector.cc | 61 +- storage/rocksdb/properties_collector.h | 3 +- storage/rocksdb/rdb_cf_options.cc | 2 - storage/rocksdb/rdb_comparator.h | 46 +- storage/rocksdb/rdb_datadic.cc | 9 +- storage/rocksdb/rdb_datadic.h | 31 +- storage/rocksdb/rdb_i_s.cc | 167 ++- storage/rocksdb/rdb_i_s.h | 1 + storage/rocksdb/rdb_index_merge.cc | 5 + storage/rocksdb/rdb_perf_context.cc | 2 - storage/rocksdb/rdb_psi.cc | 17 +- storage/rocksdb/rdb_psi.h | 10 +- storage/rocksdb/rdb_sst_info.cc | 4 +- storage/rocksdb/rdb_threads.h | 31 +- storage/rocksdb/rdb_utils.cc | 23 - 137 files changed, 4671 insertions(+), 683 deletions(-) create mode 100644 storage/rocksdb/mysql-test/rocksdb/combinations create mode 100644 storage/rocksdb/mysql-test/rocksdb/include/have_write_committed.inc create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/bloomfilter_bulk_load.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/bulk_load_sk.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/com_rpc_tx.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/explicit_snapshot.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/percona_nonflushing_analyze_debug.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/secondary_key_update_lock.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/r/use_direct_reads.result create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bloomfilter5-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bloomfilter5.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bloomfilter_bulk_load-master.opt create mode 100644 
storage/rocksdb/mysql-test/rocksdb/t/bloomfilter_bulk_load.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/bulk_load_sk.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/com_rpc_tx.cnf create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/com_rpc_tx.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/create_no_primary_key_table-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/create_no_primary_key_table.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/explicit_snapshot-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/explicit_snapshot.test delete mode 100644 storage/rocksdb/mysql-test/rocksdb/t/insert_optimized_config-master.opt create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/percona_nonflushing_analyze_debug.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/secondary_key_update_lock.test create mode 100644 storage/rocksdb/mysql-test/rocksdb/t/use_direct_reads.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/r/rpl_missing_columns_sk_update.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/r/rpl_mts_dependency_unique_key_conflicts.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_missing_columns_sk_update.cnf create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_missing_columns_sk_update.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_rpl/t/rpl_mts_dependency_unique_key_conflicts.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_stress/combinations create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_bulk_load_allow_sk_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_commit_time_batch_for_recovery_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_debug_manual_compaction_delay_basic.result create mode 100644 
storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_error_on_suboptimal_collation_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_manual_compaction_threads_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_max_manual_compactions_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_recalc_rate_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_write_policy_basic.result create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_bulk_load_allow_sk_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_commit_time_batch_for_recovery_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_debug_manual_compaction_delay_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_error_on_suboptimal_collation_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_manual_compaction_threads_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_max_manual_compactions_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_stats_recalc_rate_basic.test create mode 100644 storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_write_policy_basic.test diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt index a5ef5651e1f..54b0b1fab6a 100644 --- a/storage/rocksdb/CMakeLists.txt +++ b/storage/rocksdb/CMakeLists.txt @@ -82,32 +82,43 @@ ENDIF() SET(rocksdb_static_libs ) IF (WITH_SNAPPY) + FIND_LIBRARY(SNAPPY_LIBRARY + NAMES libsnappy${PIC_EXT}.a snappy + HINTS ${WITH_SNAPPY}/lib) SET(rocksdb_static_libs ${rocksdb_static_libs} - ${WITH_SNAPPY}/lib/libsnappy${PIC_EXT}.a) + ${SNAPPY_LIBRARY}) ADD_DEFINITIONS(-DSNAPPY) ENDIF() IF (WITH_LZ4) + FIND_LIBRARY(LZ4_LIBRARY + NAMES liblz4${PIC_EXT}.a lz4 + HINTS ${WITH_LZ4}/lib) 
SET(rocksdb_static_libs ${rocksdb_static_libs} - ${WITH_LZ4}/lib/liblz4${PIC_EXT}.a) + ${LZ4_LIBRARY}) ADD_DEFINITIONS(-DLZ4) ENDIF() IF (WITH_BZ2) + FIND_LIBRARY(BZ2_LIBRARY + NAMES libbz2${PIC_EXT}.a bz2 + HINTS ${WITH_BZ2}/lib) SET(rocksdb_static_libs ${rocksdb_static_libs} - ${WITH_BZ2}/lib/libbz2${PIC_EXT}.a) + ${BZ2_LIBRARY}) ADD_DEFINITIONS(-DBZIP2) ENDIF() IF (WITH_ZSTD) - SET(rocksdb_static_libs ${rocksdb_static_libs} - ${WITH_ZSTD}/lib/libzstd${PIC_EXT}.a) + SET(rocksdb_static_libs ${rocksdb_static_libs} ${ZSTD_LIBRARY}) ADD_DEFINITIONS(-DZSTD) ENDIF() IF (WITH_TBB) + FIND_LIBRARY(TBB_LIBRARY + NAMES libtbb${PIC_EXT}.a tbb + HINTS ${WITH_TBB}/lib) SET(rocksdb_static_libs ${rocksdb_static_libs} - ${WITH_TBB}/lib/libtbb${PIC_EXT}.a) + ${TBB_LIBRARY}) ADD_DEFINITIONS(-DTBB) ENDIF() diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 961ca5d1d17..c7d7cc0bdc4 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -25,6 +25,7 @@ /* C++ standard header files */ #include +#include #include #include #include @@ -109,6 +110,58 @@ const std::string DEFAULT_CF_NAME("default"); const std::string DEFAULT_SYSTEM_CF_NAME("__system__"); const std::string PER_INDEX_CF_NAME("$per_index_cf"); +class Rdb_explicit_snapshot; + +std::mutex explicit_snapshot_mutex; +ulonglong explicit_snapshot_counter = 0; +std::unordered_map> + explicit_snapshots; +static std::vector rdb_indexes_to_recalc; + +class Rdb_explicit_snapshot : public explicit_snapshot { + std::unique_ptr snapshot; + + public: + static std::shared_ptr + create(snapshot_info_st *ss_info, rocksdb::DB *db, + const rocksdb::Snapshot *snapshot) { + std::lock_guard lock(explicit_snapshot_mutex); + auto s = std::unique_ptr( + new rocksdb::ManagedSnapshot(db, snapshot)); + if (!s) { + return nullptr; + } + ss_info->snapshot_id = ++explicit_snapshot_counter; + auto ret = std::make_shared(*ss_info, std::move(s)); + if (!ret) { + return nullptr; + } + 
explicit_snapshots[ss_info->snapshot_id] = ret; + return ret; + } + + static std::shared_ptr + get(const ulonglong snapshot_id) { + std::lock_guard lock(explicit_snapshot_mutex); + auto elem = explicit_snapshots.find(snapshot_id); + if (elem == explicit_snapshots.end()) { + return nullptr; + } + return elem->second.lock(); + } + + rocksdb::ManagedSnapshot *get_snapshot() { return snapshot.get(); } + + Rdb_explicit_snapshot(snapshot_info_st ss_info, + std::unique_ptr snapshot) + : explicit_snapshot(ss_info), snapshot(std::move(snapshot)) {} + + virtual ~Rdb_explicit_snapshot() { + std::lock_guard lock(explicit_snapshot_mutex); + explicit_snapshots.erase(ss_info.snapshot_id); + } +}; + /** Updates row counters based on the table type and operation type. */ @@ -126,11 +179,15 @@ static handler *rocksdb_create_handler(my_core::handlerton *hton, my_core::TABLE_SHARE *table_arg, my_core::MEM_ROOT *mem_root); -static rocksdb::CompactRangeOptions getCompactRangeOptions() { +static rocksdb::CompactRangeOptions +getCompactRangeOptions(int concurrency = 0) { rocksdb::CompactRangeOptions compact_range_options; compact_range_options.bottommost_level_compaction = rocksdb::BottommostLevelCompaction::kForce; compact_range_options.exclusive_manual_compaction = false; + if (concurrency > 0) { + compact_range_options.max_subcompactions = concurrency; + } return compact_range_options; } @@ -167,6 +224,8 @@ Rdb_io_watchdog *io_watchdog = nullptr; static Rdb_background_thread rdb_bg_thread; +static Rdb_manual_compaction_thread rdb_mc_thread; + // List of table names (using regex) that are exceptions to the strict // collation check requirement. 
Regex_list_handler *rdb_collation_exceptions; @@ -180,30 +239,6 @@ static void rocksdb_flush_all_memtables() { } } -static void rocksdb_compact_column_family_stub( - THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr, - const void *const save) {} - -static int rocksdb_compact_column_family(THD *const thd, - struct st_mysql_sys_var *const var, - void *const var_ptr, - struct st_mysql_value *const value) { - char buff[STRING_BUFFER_USUAL_SIZE]; - int len = sizeof(buff); - - DBUG_ASSERT(value != nullptr); - - if (const char *const cf = value->val_str(value, buff, &len)) { - auto cfh = cf_manager.get_cf(cf); - if (cfh != nullptr && rdb != nullptr) { - sql_print_information("RocksDB: Manual compaction of column family: %s\n", - cf); - rdb->CompactRange(getCompactRangeOptions(), cfh, nullptr, nullptr); - } - } - return HA_EXIT_SUCCESS; -} - /////////////////////////////////////////////////////////// // Hash map: table name => open table handler /////////////////////////////////////////////////////////// @@ -353,6 +388,7 @@ static void rocksdb_drop_index_wakeup_thread( static my_bool rocksdb_pause_background_work = 0; static mysql_mutex_t rdb_sysvars_mutex; +static mysql_mutex_t rdb_block_cache_resize_mutex; static void rocksdb_set_pause_background_work( my_core::THD *const thd MY_ATTRIBUTE((__unused__)), @@ -435,6 +471,9 @@ static void rocksdb_set_wal_bytes_per_sync(THD *thd, struct st_mysql_sys_var *const var, void *const var_ptr, const void *const save); +static int rocksdb_validate_set_block_cache_size( + THD *thd, struct st_mysql_sys_var *const var, void *var_ptr, + struct st_mysql_value *value); ////////////////////////////////////////////////////////////////////////////// // Options definitions ////////////////////////////////////////////////////////////////////////////// @@ -490,11 +529,19 @@ static my_bool rocksdb_enable_bulk_load_api = 1; static my_bool rocksdb_print_snapshot_conflict_queries = 0; static my_bool rocksdb_large_prefix = 0; 
static my_bool rocksdb_allow_to_start_after_corruption = 0; +static uint64_t rocksdb_write_policy = + rocksdb::TxnDBWritePolicy::WRITE_COMMITTED; +static my_bool rocksdb_error_on_suboptimal_collation = 1; +static uint32_t rocksdb_stats_recalc_rate = 0; +static uint32_t rocksdb_debug_manual_compaction_delay = 0; +static uint32_t rocksdb_max_manual_compactions = 0; std::atomic rocksdb_row_lock_deadlocks(0); std::atomic rocksdb_row_lock_wait_timeouts(0); std::atomic rocksdb_snapshot_conflict_errors(0); std::atomic rocksdb_wal_group_syncs(0); +std::atomic rocksdb_manual_compactions_processed(0); +std::atomic rocksdb_manual_compactions_running(0); static std::unique_ptr rdb_init_rocksdb_db_options(void) { auto o = std::unique_ptr(new rocksdb::DBOptions()); @@ -519,6 +566,14 @@ static std::unique_ptr rocksdb_db_options = static std::shared_ptr rocksdb_rate_limiter; +/* This enum needs to be kept up to date with rocksdb::TxnDBWritePolicy */ +static const char *write_policy_names[] = {"write_committed", "write_prepared", + "write_unprepared", NullS}; + +static TYPELIB write_policy_typelib = {array_elements(write_policy_names) - 1, + "write_policy_typelib", + write_policy_names, nullptr}; + /* This enum needs to be kept up to date with rocksdb::InfoLogLevel */ static const char *info_log_level_names[] = {"debug_level", "info_level", "warn_level", "error_level", @@ -610,6 +665,14 @@ static int rocksdb_validate_flush_log_at_trx_commit( *static_cast(var_ptr) = static_cast(new_value); return HA_EXIT_SUCCESS; } +static void rocksdb_compact_column_family_stub( + THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr, + const void *const save) {} + +static int rocksdb_compact_column_family(THD *const thd, + struct st_mysql_sys_var *const var, + void *const var_ptr, + struct st_mysql_value *const value); static const char *index_type_names[] = {"kBinarySearch", "kHashSearch", NullS}; @@ -618,7 +681,8 @@ static TYPELIB index_type_typelib = 
{array_elements(index_type_names) - 1, nullptr}; const ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024; -const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024; +const ulong RDB_DEFAULT_MAX_ROW_LOCKS = 1024 * 1024; +const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024 * 1024; const ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000; const ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024; const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024; @@ -649,6 +713,11 @@ static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG, /*min*/ 2, /*max*/ ULONG_MAX, 0); +static MYSQL_THDVAR_BOOL( + commit_time_batch_for_recovery, PLUGIN_VAR_RQCMDARG, + "TransactionOptions::commit_time_batch_for_recovery for RocksDB", nullptr, + nullptr, FALSE); + static MYSQL_THDVAR_BOOL( trace_sst_api, PLUGIN_VAR_RQCMDARG, "Generate trace output in the log for each call to the SstFileWriter", @@ -660,6 +729,13 @@ static MYSQL_THDVAR_BOOL( "unique_checks and enables rocksdb_commit_in_the_middle.", rocksdb_check_bulk_load, nullptr, FALSE); +static MYSQL_THDVAR_BOOL(bulk_load_allow_sk, PLUGIN_VAR_RQCMDARG, + "Allow bulk loading of sk keys during bulk-load. " + "Can be changed only when bulk load is disabled.", + /* Intentionally reuse unsorted's check function */ + rocksdb_check_bulk_load_allow_unsorted, nullptr, + FALSE); + static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG, "Allow unsorted input during bulk-load. 
" "Can be changed only when bulk load is disabled.", @@ -705,7 +781,7 @@ static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG, static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG, "Maximum number of locks a transaction can have", nullptr, nullptr, - /*default*/ RDB_MAX_ROW_LOCKS, + /*default*/ RDB_DEFAULT_MAX_ROW_LOCKS, /*min*/ 1, /*max*/ RDB_MAX_ROW_LOCKS, 0); @@ -757,6 +833,12 @@ static MYSQL_THDVAR_ULONGLONG( /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY, /* max */ SIZE_T_MAX, 1); +static MYSQL_THDVAR_INT( + manual_compaction_threads, PLUGIN_VAR_RQCMDARG, + "How many rocksdb threads to run for manual compactions", nullptr, nullptr, + /* default rocksdb.dboption max_subcompactions */ 0, + /* min */ 0, /* max */ 128, 0); + static MYSQL_SYSVAR_BOOL( create_if_missing, *reinterpret_cast(&rocksdb_db_options->create_if_missing), @@ -778,6 +860,12 @@ static MYSQL_SYSVAR_BOOL( "DBOptions::manual_wal_flush for RocksDB", nullptr, nullptr, rocksdb_db_options->manual_wal_flush); +static MYSQL_SYSVAR_ENUM(write_policy, rocksdb_write_policy, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "DBOptions::write_policy for RocksDB", nullptr, + nullptr, rocksdb::TxnDBWritePolicy::WRITE_COMMITTED, + &write_policy_typelib); + static MYSQL_SYSVAR_BOOL( create_missing_column_families, *reinterpret_cast( @@ -988,7 +1076,9 @@ static MYSQL_SYSVAR_INT(table_cache_numshardbits, "DBOptions::table_cache_numshardbits for RocksDB", nullptr, nullptr, rocksdb_db_options->table_cache_numshardbits, - /* min */ 0, /* max */ INT_MAX, 0); + // LRUCache limits this to 19 bits, anything greater + // fails to create a cache and returns a nullptr + /* min */ 0, /* max */ 19, 0); static MYSQL_SYSVAR_ULONG(wal_ttl_seconds, rocksdb_db_options->WAL_ttl_seconds, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -1099,8 +1189,9 @@ static MYSQL_SYSVAR_BOOL( "DBOptions::enable_thread_tracking for RocksDB", nullptr, nullptr, true); static MYSQL_SYSVAR_LONGLONG(block_cache_size, 
rocksdb_block_cache_size, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "block_cache size for RocksDB", nullptr, nullptr, + PLUGIN_VAR_RQCMDARG, + "block_cache size for RocksDB", + rocksdb_validate_set_block_cache_size, nullptr, /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE, /* min */ RDB_MIN_BLOCK_CACHE_SIZE, /* max */ LONGLONG_MAX, @@ -1342,6 +1433,18 @@ static MYSQL_SYSVAR_BOOL( "on PK TTL data. This variable is a no-op in non-debug builds.", nullptr, nullptr, FALSE); +static MYSQL_SYSVAR_UINT( + max_manual_compactions, rocksdb_max_manual_compactions, PLUGIN_VAR_RQCMDARG, + "Maximum number of pending + ongoing number of manual compactions.", + nullptr, nullptr, /* default */ 10, /* min */ 0, /* max */ UINT_MAX, 0); + +static MYSQL_SYSVAR_UINT( + debug_manual_compaction_delay, rocksdb_debug_manual_compaction_delay, + PLUGIN_VAR_RQCMDARG, + "For debugging purposes only. Sleeping specified seconds " + "for simulating long running compactions.", + nullptr, nullptr, 0, /* min */ 0, /* max */ UINT_MAX, 0); + static MYSQL_SYSVAR_BOOL( reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG, "Reset the RocksDB internal statistics without restarting the DB.", nullptr, @@ -1497,6 +1600,13 @@ static MYSQL_SYSVAR_UINT( RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0, /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0); +static MYSQL_SYSVAR_UINT( + stats_recalc_rate, rocksdb_stats_recalc_rate, PLUGIN_VAR_RQCMDARG, + "The number of indexes per second to recalculate statistics for. 0 to " + "disable background recalculation.", + nullptr, nullptr, 0 /* default value */, 0 /* min value */, + UINT_MAX /* max value */, 0); + static MYSQL_SYSVAR_BOOL( large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG, "Support large index prefix length of 3072 bytes. 
If off, the maximum " @@ -1510,16 +1620,25 @@ static MYSQL_SYSVAR_BOOL( "detected.", nullptr, nullptr, FALSE); +static MYSQL_SYSVAR_BOOL(error_on_suboptimal_collation, + rocksdb_error_on_suboptimal_collation, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Raise an error instead of warning if a sub-optimal " + "collation is used", + nullptr, nullptr, TRUE); + static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100; static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(lock_wait_timeout), MYSQL_SYSVAR(deadlock_detect), MYSQL_SYSVAR(deadlock_detect_depth), + MYSQL_SYSVAR(commit_time_batch_for_recovery), MYSQL_SYSVAR(max_row_locks), MYSQL_SYSVAR(write_batch_max_bytes), MYSQL_SYSVAR(lock_scanned_rows), MYSQL_SYSVAR(bulk_load), + MYSQL_SYSVAR(bulk_load_allow_sk), MYSQL_SYSVAR(bulk_load_allow_unsorted), MYSQL_SYSVAR(skip_unique_check_tables), MYSQL_SYSVAR(trace_sst_api), @@ -1537,6 +1656,7 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(create_if_missing), MYSQL_SYSVAR(two_write_queues), MYSQL_SYSVAR(manual_wal_flush), + MYSQL_SYSVAR(write_policy), MYSQL_SYSVAR(create_missing_column_families), MYSQL_SYSVAR(error_if_exists), MYSQL_SYSVAR(paranoid_checks), @@ -1652,6 +1772,11 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(large_prefix), MYSQL_SYSVAR(allow_to_start_after_corruption), + MYSQL_SYSVAR(error_on_suboptimal_collation), + MYSQL_SYSVAR(stats_recalc_rate), + MYSQL_SYSVAR(debug_manual_compaction_delay), + MYSQL_SYSVAR(max_manual_compactions), + MYSQL_SYSVAR(manual_compaction_threads), nullptr}; static rocksdb::WriteOptions @@ -1666,6 +1791,50 @@ rdb_get_rocksdb_write_options(my_core::THD *const thd) { return opt; } +static int rocksdb_compact_column_family(THD *const thd, + struct st_mysql_sys_var *const var, + void *const var_ptr, + struct st_mysql_value *const value) { + char buff[STRING_BUFFER_USUAL_SIZE]; + int len = sizeof(buff); + + DBUG_ASSERT(value != nullptr); + + if 
(const char *const cf = value->val_str(value, buff, &len)) { + auto cfh = cf_manager.get_cf(cf); + if (cfh != nullptr && rdb != nullptr) { + int mc_id = rdb_mc_thread.request_manual_compaction( + cfh, nullptr, nullptr, THDVAR(thd, manual_compaction_threads)); + if (mc_id == -1) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "Can't schedule more manual compactions. " + "Increase rocksdb_max_manual_compactions or stop issuing " + "more manual compactions."); + return HA_EXIT_FAILURE; + } else if (mc_id < 0) { + return HA_EXIT_FAILURE; + } + // NO_LINT_DEBUG + sql_print_information("RocksDB: Manual compaction of column family: %s\n", + cf); + // Checking thd state every short cycle (100ms). This is for allowing to + // exiting this function without waiting for CompactRange to finish. + do { + my_sleep(100000); + } while (!thd->killed && + !rdb_mc_thread.is_manual_compaction_finished(mc_id)); + + if (thd->killed) { + // This cancels if requested compaction state is INITED. + // TODO(yoshinorim): Cancel running compaction as well once + // it is supported in RocksDB. + rdb_mc_thread.clear_manual_compaction_request(mc_id, true); + } + } + } + return HA_EXIT_SUCCESS; +} + /////////////////////////////////////////////////////////////////////////////////////////// /** @@ -1763,9 +1932,16 @@ protected: bool m_is_delayed_snapshot = false; bool m_is_two_phase = false; - THD *m_thd = nullptr; +private: + /* + Number of write operations this transaction had when we took the last + savepoint (the idea is not to take another savepoint if we haven't made + any changes) + */ + ulonglong m_writes_at_last_savepoint; - rocksdb::ReadOptions m_read_opts; +protected: + THD *m_thd = nullptr; static std::multiset s_tx_list; static mysql_mutex_t s_tx_list_mutex; @@ -1814,7 +1990,16 @@ protected: return s; } +protected: + /* + The following two are helper functions to be overloaded by child classes. + They should provide RocksDB's savepoint semantics. 
+ */ + virtual void do_set_savepoint() = 0; + virtual void do_rollback_to_savepoint() = 0; + public: + rocksdb::ReadOptions m_read_opts; const char *m_mysql_log_file_name; my_off_t m_mysql_log_offset; const char *m_mysql_gtid; @@ -1822,6 +2007,7 @@ protected: String m_detailed_error; int64_t m_snapshot_timestamp = 0; bool m_ddl_transaction; + std::shared_ptr m_explicit_snapshot; /* Tracks the number of tables in use through external_lock. @@ -1881,7 +2067,7 @@ protected: if (s.IsDeadlock()) { my_core::thd_mark_transaction_to_rollback(thd, - false /* just statement */); + true /* whole transaction */); m_detailed_error = String(); table_handler->m_deadlock_counter.inc(); rocksdb_row_lock_deadlocks++; @@ -1898,7 +2084,7 @@ protected: } m_detailed_error = String(" (snapshot conflict)", system_charset_info); table_handler->m_deadlock_counter.inc(); - return HA_ERR_LOCK_DEADLOCK; + return HA_ERR_ROCKSDB_STATUS_BUSY; } if (s.IsIOError() || s.IsCorruption()) { @@ -2298,6 +2484,52 @@ public: virtual bool is_tx_started() const = 0; virtual void start_tx() = 0; virtual void start_stmt() = 0; + + void set_initial_savepoint() { + /* + Set the initial savepoint. If the first statement in the transaction + fails, we need something to roll back to, without rolling back the + entire transaction. + */ + do_set_savepoint(); + m_writes_at_last_savepoint= m_write_count; + } + + /* + Called when a "top-level" statement inside a transaction completes + successfully and its changes become part of the transaction's changes. + */ + void make_stmt_savepoint_permanent() { + + // Take another RocksDB savepoint only if we had changes since the last + // one. This is very important for long transactions doing lots of + // SELECTs. 
+ if (m_writes_at_last_savepoint != m_write_count) + { + do_set_savepoint(); + m_writes_at_last_savepoint= m_write_count; + } + } + + + /* + Rollback to the savepoint we've set before the last statement + */ + void rollback_to_stmt_savepoint() { + if (m_writes_at_last_savepoint != m_write_count) { + do_rollback_to_savepoint(); + /* + RollbackToSavePoint "removes the most recent SetSavePoint()", so + we need to set it again so that next statement can roll back to this + stage. + It's ok to do it here at statement end (instead of doing it at next + statement start) because setting a savepoint is cheap. + */ + do_set_savepoint(); + m_writes_at_last_savepoint= m_write_count; + } + } + virtual void rollback_stmt() = 0; void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; } @@ -2465,7 +2697,15 @@ public: void acquire_snapshot(bool acquire_now) override { if (m_read_opts.snapshot == nullptr) { - if (is_tx_read_only()) { + const auto thd_ss = std::static_pointer_cast( + m_thd->get_explicit_snapshot()); + if (thd_ss) { + m_explicit_snapshot = thd_ss; + } + if (m_explicit_snapshot) { + auto snapshot = m_explicit_snapshot->get_snapshot()->snapshot(); + snapshot_created(snapshot); + } else if (is_tx_read_only()) { snapshot_created(rdb->GetSnapshot()); } else if (acquire_now) { m_rocksdb_tx->SetSnapshot(); @@ -2482,7 +2722,10 @@ public: if (m_read_opts.snapshot != nullptr) { m_snapshot_timestamp = 0; - if (is_tx_read_only()) { + if (m_explicit_snapshot) { + m_explicit_snapshot.reset(); + need_clear = false; + } else if (is_tx_read_only()) { rdb->ReleaseSnapshot(m_read_opts.snapshot); need_clear = false; } else { @@ -2591,6 +2834,10 @@ public: tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec); tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect); tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth); + // If this variable is set, this will write commit time write batch + // information on recovery or memtable flush. 
+ tx_opts.use_only_the_last_commit_time_batch_for_recovery = + THDVAR(m_thd, commit_time_batch_for_recovery); tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes); write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC); @@ -2609,9 +2856,20 @@ public: m_read_opts = rocksdb::ReadOptions(); + set_initial_savepoint(); + m_ddl_transaction = false; } + /* Implementations of do_*savepoint based on rocksdB::Transaction savepoints */ + void do_set_savepoint() override { + m_rocksdb_tx->SetSavePoint(); + } + + void do_rollback_to_savepoint() override { + m_rocksdb_tx->RollbackToSavePoint(); + } + /* Start a statement inside a multi-statement transaction. @@ -2624,7 +2882,6 @@ public: void start_stmt() override { // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation) acquire_snapshot(false); - m_rocksdb_tx->SetSavePoint(); } /* @@ -2635,7 +2892,7 @@ public: /* TODO: here we must release the locks taken since the start_stmt() call */ if (m_rocksdb_tx) { const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot(); - m_rocksdb_tx->RollbackToSavePoint(); + rollback_to_stmt_savepoint(); const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot(); if (org_snapshot != cur_snapshot) { @@ -2673,7 +2930,7 @@ public: /* This is a rocksdb write batch. This class doesn't hold or wait on any transaction locks (skips rocksdb transaction API) thus giving better - performance. The commit is done through rdb->GetBaseDB()->Commit(). + performance. Currently this is only used for replication threads which are guaranteed to be non-conflicting. 
Any further usage of this class should completely @@ -2695,6 +2952,8 @@ private: bool commit_no_binlog() override { bool res = false; rocksdb::Status s; + rocksdb::TransactionDBWriteOptimizations optimize; + optimize.skip_concurrency_control = true; s = merge_auto_incr_map(m_batch->GetWriteBatch()); if (!s.ok()) { @@ -2705,7 +2964,7 @@ private: release_snapshot(); - s = rdb->GetBaseDB()->Write(write_opts, m_batch->GetWriteBatch()); + s = rdb->Write(write_opts, optimize, m_batch->GetWriteBatch()); if (!s.ok()) { rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT); res = true; @@ -2723,6 +2982,15 @@ error: return res; } + /* Implementations of do_*savepoint based on rocksdB::WriteBatch savepoints */ + void do_set_savepoint() override { + m_batch->SetSavePoint(); + } + + void do_rollback_to_savepoint() override { + m_batch->RollbackToSavePoint(); + } + public: bool is_writebatch_trx() const override { return true; } @@ -2810,6 +3078,13 @@ public: get_for_update(rocksdb::ColumnFamilyHandle *const column_family, const rocksdb::Slice &key, rocksdb::PinnableSlice *const value, bool exclusive) override { + if (value == nullptr) { + rocksdb::PinnableSlice pin_val; + rocksdb::Status s = get(column_family, key, &pin_val); + pin_val.Reset(); + return s; + } + return get(column_family, key, value); } @@ -2828,13 +3103,15 @@ public: write_opts.disableWAL = THDVAR(m_thd, write_disable_wal); write_opts.ignore_missing_column_families = THDVAR(m_thd, write_ignore_missing_column_families); + + set_initial_savepoint(); } - void start_stmt() override { m_batch->SetSavePoint(); } + void start_stmt() override {} void rollback_stmt() override { if (m_batch) - m_batch->RollbackToSavePoint(); + rollback_to_stmt_savepoint(); } explicit Rdb_writebatch_impl(THD *const thd) @@ -3043,6 +3320,8 @@ static int rocksdb_prepare(handlerton *const hton, THD *const thd, DEBUG_SYNC(thd, "rocksdb.prepared"); } + else + tx->make_stmt_savepoint_permanent(); return HA_EXIT_SUCCESS; } @@ -3214,11 +3493,9 @@ static 
int rocksdb_commit(handlerton *const hton, THD *const thd, } else { /* We get here when committing a statement within a transaction. - - We don't need to do anything here. tx->start_stmt() will notify - Rdb_transaction_impl that another statement has started. */ tx->set_tx_failed(false); + tx->make_stmt_savepoint_permanent(); } if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) { @@ -3406,6 +3683,7 @@ private: if (!path_entry.path.empty() && !path_entry.limit_exceeded) { auto deadlocking_txn = *(path_entry.path.end() - 1); deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id; + deadlock_info.deadlock_time = path_entry.deadlock_time; } return deadlock_info; } @@ -3452,16 +3730,18 @@ private: path_data += "\n*** DEADLOCK PATH\n" "=========================================\n"; const auto dl_info = get_dl_path_trx_info(path_entry); + const auto deadlock_time = dl_info.deadlock_time; for (auto it = dl_info.path.begin(); it != dl_info.path.end(); it++) { const auto trx_info = *it; path_data += format_string( + "TIMESTAMP: %" PRId64 "\n" "TRANSACTION ID: %u\n" "COLUMN FAMILY NAME: %s\n" "WAITING KEY: %s\n" "LOCK TYPE: %s\n" "INDEX NAME: %s\n" "TABLE NAME: %s\n", - trx_info.trx_id, trx_info.cf_name.c_str(), + deadlock_time, trx_info.trx_id, trx_info.cf_name.c_str(), trx_info.waiting_key.c_str(), trx_info.exclusive_lock ? 
"EXCLUSIVE" : "SHARED", trx_info.index_name.c_str(), trx_info.table_name.c_str()); @@ -3772,6 +4052,24 @@ static bool rocksdb_show_status(handlerton *const hton, THD *const thd, str, stat_print); } } + + /* Explicit snapshot information */ + str.clear(); + { + std::lock_guard lock(explicit_snapshot_mutex); + for (const auto &elem : explicit_snapshots) { + const auto &ss = elem.second.lock(); + DBUG_ASSERT(ss != nullptr); + const auto &info = ss->ss_info; + str += "\nSnapshot ID: " + std::to_string(info.snapshot_id) + + "\nBinlog File: " + info.binlog_file + + "\nBinlog Pos: " + std::to_string(info.binlog_pos) + + "\nGtid Executed: " + info.gtid_executed + "\n"; + } + } + if (!str.empty()) { + res |= print_stats(thd, "EXPLICIT_SNAPSHOTS", "rocksdb", str, stat_print); + } } else if (stat_type == HA_ENGINE_TRX) { /* Handle the SHOW ENGINE ROCKSDB TRANSACTION STATUS command */ res |= rocksdb_show_snapshot_status(hton, thd, stat_print); @@ -3791,6 +4089,48 @@ static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd, } } +static bool rocksdb_explicit_snapshot( + handlerton *const /* hton */, /*!< in: RocksDB handlerton */ + THD *const thd, /*!< in: MySQL thread handle */ + snapshot_info_st *ss_info) /*!< out: Snapshot information */ +{ + switch (ss_info->op) { + case snapshot_operation::SNAPSHOT_CREATE: { + if (mysql_bin_log_is_open()) { + mysql_bin_log_lock_commits(ss_info); + } + auto s = Rdb_explicit_snapshot::create(ss_info, rdb, rdb->GetSnapshot()); + if (mysql_bin_log_is_open()) { + mysql_bin_log_unlock_commits(ss_info); + } + + thd->set_explicit_snapshot(s); + return s == nullptr; + } + case snapshot_operation::SNAPSHOT_ATTACH: { + auto s = Rdb_explicit_snapshot::get(ss_info->snapshot_id); + if (!s) { + return true; + } + *ss_info = s->ss_info; + thd->set_explicit_snapshot(s); + return false; + } + case snapshot_operation::SNAPSHOT_RELEASE: { + if (!thd->get_explicit_snapshot()) { + return true; + } + *ss_info = 
thd->get_explicit_snapshot()->ss_info; + thd->set_explicit_snapshot(nullptr); + return false; + } + default: + DBUG_ASSERT(false); + return true; + } + return true; +} + /* Supporting START TRANSACTION WITH CONSISTENT [ROCKSDB] SNAPSHOT @@ -3813,14 +4153,12 @@ static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd, InnoDB and RocksDB transactions. */ static int rocksdb_start_tx_and_assign_read_view( - handlerton *const hton, /*!< in: RocksDB handlerton */ - THD *const thd, /*!< in: MySQL thread handle of the - user for whom the transaction should - be committed */ - char *const binlog_file, /* out: binlog file for last commit */ - ulonglong *const binlog_pos, /* out: binlog pos for last commit */ - char **gtid_executed, /* out: Gtids logged until last commit */ - int *const gtid_executed_length) /*out: Length of gtid_executed string */ + handlerton *const hton, /*!< in: RocksDB handlerton */ + THD *const thd, /*!< in: MySQL thread handle of the + user for whom the transaction should + be committed */ + snapshot_info_st *ss_info) /*!< in/out: Snapshot info like binlog file, pos, + gtid executed and snapshot ID */ { ulong const tx_isolation = my_core::thd_tx_isolation(thd); @@ -3829,11 +4167,12 @@ static int rocksdb_start_tx_and_assign_read_view( return HA_EXIT_FAILURE; } - if (binlog_file) { - if (binlog_pos && mysql_bin_log_is_open()) - mysql_bin_log_lock_commits(); - else + if (ss_info) { + if (mysql_bin_log_is_open()) { + mysql_bin_log_lock_commits(ss_info); + } else { return HA_EXIT_FAILURE; + } } Rdb_transaction *const tx = get_or_create_tx(thd); @@ -3844,13 +4183,94 @@ static int rocksdb_start_tx_and_assign_read_view( rocksdb_register_tx(hton, thd, tx); tx->acquire_snapshot(true); - if (binlog_file) - mysql_bin_log_unlock_commits(binlog_file, binlog_pos, gtid_executed, - gtid_executed_length); + if (ss_info) { + mysql_bin_log_unlock_commits(ss_info); + } return HA_EXIT_SUCCESS; } +static int rocksdb_start_tx_with_shared_read_view( + 
handlerton *const hton, /*!< in: RocksDB handlerton */ + THD *const thd, /*!< in: MySQL thread handle of the + user for whom the transaction should + be committed */ + snapshot_info_st *ss_info) /*!< out: Snapshot info like binlog file, pos, + gtid executed and snapshot ID */ +{ + DBUG_ASSERT(thd != nullptr); + DBUG_ASSERT(ss_info != nullptr); + + int error = HA_EXIT_SUCCESS; + + ulong const tx_isolation = my_core::thd_tx_isolation(thd); + if (tx_isolation != ISO_REPEATABLE_READ) { + my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0)); + return HA_EXIT_FAILURE; + } + + std::shared_ptr explicit_snapshot; + const auto op = ss_info->op; + Rdb_transaction *tx = nullptr; + + DBUG_ASSERT(op == snapshot_operation::SNAPSHOT_CREATE || + op == snapshot_operation::SNAPSHOT_ATTACH); + + // case: if binlogs are available get binlog file/pos and gtid info + if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) { + mysql_bin_log_lock_commits(ss_info); + } + + if (op == snapshot_operation::SNAPSHOT_ATTACH) { + explicit_snapshot = Rdb_explicit_snapshot::get(ss_info->snapshot_id); + if (!explicit_snapshot) { + my_printf_error(ER_UNKNOWN_ERROR, "Snapshot %llu does not exist", MYF(0), + ss_info->snapshot_id); + error = HA_EXIT_FAILURE; + } + } + + // case: all good till now + if (error == HA_EXIT_SUCCESS) { + tx = get_or_create_tx(thd); + Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd)); + + if (explicit_snapshot) { + tx->m_explicit_snapshot = explicit_snapshot; + } + + DBUG_ASSERT(!tx->has_snapshot()); + tx->set_tx_read_only(true); + rocksdb_register_tx(hton, thd, tx); + tx->acquire_snapshot(true); + + // case: an explicit snapshot was not assigned to this transaction + if (!tx->m_explicit_snapshot) { + tx->m_explicit_snapshot = + Rdb_explicit_snapshot::create(ss_info, rdb, tx->m_read_opts.snapshot); + if (!tx->m_explicit_snapshot) { + my_printf_error(ER_UNKNOWN_ERROR, "Could not create snapshot", MYF(0)); + error = HA_EXIT_FAILURE; + } 
+ } + } + + // case: unlock the binlog + if (op == snapshot_operation::SNAPSHOT_CREATE && mysql_bin_log_is_open()) { + mysql_bin_log_unlock_commits(ss_info); + } + + DBUG_ASSERT(error == HA_EXIT_FAILURE || tx->m_explicit_snapshot); + + // copy over the snapshot details to pass to the upper layers + if (tx->m_explicit_snapshot) { + *ss_info = tx->m_explicit_snapshot->ss_info; + ss_info->op = op; + } + + return error; +} + /* Dummy SAVEPOINT support. This is needed for long running transactions * like mysqldump (https://bugs.mysql.com/bug.php?id=71017). * Current SAVEPOINT does not correctly handle ROLLBACK and does not return @@ -4062,9 +4482,11 @@ static int rocksdb_init_func(void *const p) { rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key); rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key, rdb_signal_drop_idx_psi_cond_key); + rdb_mc_thread.init(rdb_signal_mc_psi_mutex_key, rdb_signal_mc_psi_cond_key); #else rdb_bg_thread.init(); rdb_drop_idx_thread.init(); + rdb_mc_thread.init(); #endif mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex, MY_MUTEX_INIT_FAST); @@ -4080,6 +4502,8 @@ static int rocksdb_init_func(void *const p) { mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex, MY_MUTEX_INIT_FAST); + mysql_mutex_init(rdb_block_cache_resize_mutex_key, + &rdb_block_cache_resize_mutex, MY_MUTEX_INIT_FAST); rdb_open_tables.init_hash(); Rdb_transaction::init_mutex(); @@ -4094,8 +4518,10 @@ static int rocksdb_init_func(void *const p) { rocksdb_hton->rollback = rocksdb_rollback; rocksdb_hton->db_type = DB_TYPE_ROCKSDB; rocksdb_hton->show_status = rocksdb_show_status; + rocksdb_hton->explicit_snapshot = rocksdb_explicit_snapshot; rocksdb_hton->start_consistent_snapshot = rocksdb_start_tx_and_assign_read_view; + rocksdb_hton->start_shared_snapshot = rocksdb_start_tx_with_shared_read_view; rocksdb_hton->savepoint_set = rocksdb_savepoint; rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint; 
rocksdb_hton->savepoint_rollback_can_release_mdl = @@ -4158,6 +4584,35 @@ static int rocksdb_init_func(void *const p) { DBUG_RETURN(HA_EXIT_FAILURE); } + // Check whether the filesystem backing rocksdb_datadir allows O_DIRECT + if (rocksdb_db_options->use_direct_reads) { + rocksdb::EnvOptions soptions; + rocksdb::Status check_status; + rocksdb::Env *const env = rocksdb_db_options->env; + + std::string fname = format_string("%s/DIRECT_CHECK", rocksdb_datadir); + if (env->FileExists(fname).ok()) { + std::unique_ptr file; + soptions.use_direct_reads = true; + check_status = env->NewSequentialFile(fname, &file, soptions); + } else { + std::unique_ptr file; + soptions.use_direct_writes = true; + check_status = env->ReopenWritableFile(fname, &file, soptions); + if (file != nullptr) { + file->Close(); + } + env->DeleteFile(fname); + } + + if (!check_status.ok()) { + sql_print_error("RocksDB: Unable to use direct io in rocksdb-datadir:" + "(%s)", check_status.getState()); + rdb_open_tables.free_hash(); + DBUG_RETURN(HA_EXIT_FAILURE); + } + } + if (rocksdb_db_options->allow_mmap_writes && rocksdb_db_options->use_direct_io_for_flush_and_compaction) { // See above comment for allow_mmap_reads. 
(NO_LINT_DEBUG) @@ -4309,8 +4764,10 @@ static int rocksdb_init_func(void *const p) { cf_options_map->get_defaults()); rocksdb::TransactionDBOptions tx_db_options; - tx_db_options.transaction_lock_timeout = 2; // 2 seconds + tx_db_options.transaction_lock_timeout = 2000; // 2 seconds tx_db_options.custom_mutex_factory = std::make_shared(); + tx_db_options.write_policy = + static_cast(rocksdb_write_policy); status = check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr); @@ -4334,7 +4791,7 @@ static int rocksdb_init_func(void *const p) { } cf_manager.init(std::move(cf_options_map), &cf_handles); - if (dict_manager.init(rdb->GetBaseDB(), &cf_manager)) { + if (dict_manager.init(rdb, &cf_manager)) { // NO_LINT_DEBUG sql_print_error("RocksDB: Failed to initialize data dictionary."); rdb_open_tables.free_hash(); @@ -4401,6 +4858,21 @@ static int rocksdb_init_func(void *const p) { DBUG_RETURN(HA_EXIT_FAILURE); } + err = rdb_mc_thread.create_thread(MANUAL_COMPACTION_THREAD_NAME +#ifdef HAVE_PSI_INTERFACE + , + rdb_mc_psi_thread_key +#endif + ); + if (err != 0) { + // NO_LINT_DEBUG + sql_print_error( + "RocksDB: Couldn't start the manual compaction thread: (errno=%d)", + err); + rdb_open_tables.free_hash(); + DBUG_RETURN(HA_EXIT_FAILURE); + } + rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions); if (rocksdb_pause_background_work) { @@ -4496,6 +4968,16 @@ static int rocksdb_done_func(void *const p) { sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)", err); } + // signal the manual compaction thread to stop + rdb_mc_thread.signal(true); + // Wait for the manual compaction thread to finish. + err = rdb_mc_thread.join(); + if (err != 0) { + // NO_LINT_DEBUG + sql_print_error( + "RocksDB: Couldn't stop the manual compaction thread: (errno=%d)", err); + } + if (rdb_open_tables.m_hash.records) { // Looks like we are getting unloaded and yet we have some open tables // left behind. 
@@ -4505,6 +4987,7 @@ static int rocksdb_done_func(void *const p) { rdb_open_tables.free_hash(); mysql_mutex_destroy(&rdb_open_tables.m_mutex); mysql_mutex_destroy(&rdb_sysvars_mutex); + mysql_mutex_destroy(&rdb_block_cache_resize_mutex); delete rdb_collation_exceptions; mysql_mutex_destroy(&rdb_collation_data_mutex); @@ -4709,6 +5192,12 @@ static ulonglong rdb_get_int_col_max_value(const Field *field) { case HA_KEYTYPE_LONGLONG: max_value = 0x7FFFFFFFFFFFFFFFULL; break; + case HA_KEYTYPE_FLOAT: + max_value = 0x1000000ULL; + break; + case HA_KEYTYPE_DOUBLE: + max_value = 0x20000000000000ULL; + break; default: abort(); } @@ -4896,7 +5385,6 @@ longlong ha_rocksdb::update_hidden_pk_val() { /* Get the id of the hidden pk id from m_last_rowkey */ int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) { - DBUG_ASSERT(hidden_pk_id != nullptr); DBUG_ASSERT(table != nullptr); DBUG_ASSERT(has_hidden_pk(table)); @@ -4953,14 +5441,14 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton, my_core::TABLE_SHARE *const table_arg) : handler(hton, table_arg), m_table_handler(nullptr), m_scan_it(nullptr), m_scan_it_skips_bloom(false), m_scan_it_snapshot(nullptr), + m_scan_it_lower_bound(nullptr), m_scan_it_upper_bound(nullptr), m_tbl_def(nullptr), m_pk_descr(nullptr), m_key_descr_arr(nullptr), m_pk_can_be_decoded(false), m_maybe_unpack_info(false), m_pk_tuple(nullptr), m_pk_packed_tuple(nullptr), m_sk_packed_tuple(nullptr), m_end_key_packed_tuple(nullptr), m_sk_match_prefix(nullptr), m_sk_match_prefix_buf(nullptr), m_sk_packed_tuple_old(nullptr), m_dup_sk_packed_tuple(nullptr), - m_dup_sk_packed_tuple_old(nullptr), m_eq_cond_lower_bound(nullptr), - m_eq_cond_upper_bound(nullptr), m_pack_buffer(nullptr), + m_dup_sk_packed_tuple_old(nullptr), m_pack_buffer(nullptr), m_lock_rows(RDB_LOCK_NONE), m_keyread_only(FALSE), m_encoder_arr(nullptr), m_row_checksums_checked(0), m_in_rpl_delete_rows(false), m_in_rpl_update_rows(false), m_force_skip_unique_check(false) {} 
@@ -5507,9 +5995,6 @@ int ha_rocksdb::convert_field_from_storage_format( int ha_rocksdb::convert_record_from_storage_format( const rocksdb::Slice *const key, const rocksdb::Slice *const value, uchar *const buf) { - DBUG_ASSERT(key != nullptr); - DBUG_ASSERT(buf != nullptr); - Rdb_string_reader reader(value); /* @@ -5753,7 +6238,6 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, DBUG_ENTER_FUNC(); DBUG_ASSERT(m_pk_tuple == nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); std::shared_ptr *const kd_arr = tbl_def_arg->m_key_descr_arr; @@ -5805,9 +6289,9 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, m_pack_buffer = reinterpret_cast(my_malloc(max_packed_sk_len, MYF(0))); - m_eq_cond_upper_bound = + m_scan_it_lower_bound = reinterpret_cast(my_malloc(max_packed_sk_len, MYF(0))); - m_eq_cond_lower_bound = + m_scan_it_upper_bound = reinterpret_cast(my_malloc(max_packed_sk_len, MYF(0))); /* @@ -5824,7 +6308,7 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, if (m_pk_tuple == nullptr || m_pk_packed_tuple == nullptr || m_sk_packed_tuple == nullptr || m_sk_packed_tuple_old == nullptr || m_end_key_packed_tuple == nullptr || m_pack_buffer == nullptr || - m_eq_cond_upper_bound == nullptr || m_eq_cond_lower_bound == nullptr || + m_scan_it_upper_bound == nullptr || m_scan_it_lower_bound == nullptr || (alloc_alter_buffers && (m_dup_sk_packed_tuple == nullptr || m_dup_sk_packed_tuple_old == nullptr))) { // One or more of the above allocations failed. 
Clean up and exit @@ -5864,11 +6348,11 @@ void ha_rocksdb::free_key_buffers() { my_free(m_dup_sk_packed_tuple_old); m_dup_sk_packed_tuple_old = nullptr; - my_free(m_eq_cond_upper_bound); - m_eq_cond_upper_bound = nullptr; + my_free(m_scan_it_lower_bound); + m_scan_it_lower_bound = nullptr; - my_free(m_eq_cond_lower_bound); - m_eq_cond_lower_bound = nullptr; + my_free(m_scan_it_upper_bound); + m_scan_it_upper_bound = nullptr; } void ha_rocksdb::set_use_read_free_rpl(const char *const whitelist) { @@ -6081,9 +6565,8 @@ bool ha_rocksdb::get_error_message(const int error, String *const buf) { static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST, "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST"); - DBUG_ASSERT(buf != nullptr); - - if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK) { + if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK || + error == HA_ERR_ROCKSDB_STATUS_BUSY) { Rdb_transaction *const tx = get_tx_from_thd(ha_thd()); DBUG_ASSERT(tx != nullptr); buf->append(tx->m_detailed_error); @@ -6165,10 +6648,10 @@ int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s, } if (opt_msg) { - my_error(ER_RDB_STATUS_MSG, MYF(0), opt_msg, s.code(), - s.ToString().c_str()); + std::string concatenated_error = s.ToString() + " (" + std::string(opt_msg) + ")"; + my_error(ER_GET_ERRMSG, MYF(0), s.code(), concatenated_error.c_str(), rocksdb_hton_name); } else { - my_error(ER_RDB_STATUS_GENERAL, MYF(0), s.code(), s.ToString().c_str()); + my_error(ER_GET_ERRMSG, MYF(0), s.code(), s.ToString().c_str(), rocksdb_hton_name); } return err; @@ -6213,7 +6696,6 @@ int ha_rocksdb::create_key_defs( /* = nullptr */) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); uint i; @@ -6284,9 +6766,7 @@ int ha_rocksdb::create_cfs( std::array *const cfs) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); char tablename_sys[NAME_LEN + 
1]; @@ -6315,11 +6795,22 @@ int ha_rocksdb::create_cfs( } collation_err += coll->name; } - my_error(ER_UNSUPPORTED_COLLATION, MYF(0), - tbl_def_arg->full_tablename().c_str(), - table_arg->key_info[i].key_part[part].field->field_name, - collation_err.c_str()); - DBUG_RETURN(HA_EXIT_FAILURE); + + if (rocksdb_error_on_suboptimal_collation) { + my_error(ER_UNSUPPORTED_COLLATION, MYF(0), + tbl_def_arg->full_tablename().c_str(), + table_arg->key_info[i].key_part[part].field->field_name, + collation_err.c_str()); + DBUG_RETURN(HA_EXIT_FAILURE); + } else { + push_warning_printf( + ha_thd(), Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS, + "Unsupported collation on string indexed column %s.%s Use " + "binary collation (%s).", + tbl_def_arg->full_tablename().c_str(), + table_arg->key_info[i].key_part[part].field->field_name, + collation_err.c_str()); + } } } } @@ -6380,10 +6871,6 @@ int ha_rocksdb::create_inplace_key_defs( const std::array &cfs) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(table_arg != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); - DBUG_ASSERT(old_tbl_def_arg != nullptr); - std::shared_ptr *const old_key_descr = old_tbl_def_arg->m_key_descr_arr; std::shared_ptr *const new_key_descr = @@ -6451,11 +6938,6 @@ std::unordered_map ha_rocksdb::get_old_key_positions( const Rdb_tbl_def *const old_tbl_def_arg) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(table_arg != nullptr); - DBUG_ASSERT(old_table_arg != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); - DBUG_ASSERT(old_tbl_def_arg != nullptr); - std::shared_ptr *const old_key_descr = old_tbl_def_arg->m_key_descr_arr; std::unordered_map old_key_pos; @@ -6521,9 +7003,6 @@ int ha_rocksdb::compare_keys(const KEY *const old_key, const KEY *const new_key) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(old_key != nullptr); - DBUG_ASSERT(new_key != nullptr); - /* Check index name. 
*/ if (strcmp(old_key->name, new_key->name) != 0) { DBUG_RETURN(HA_EXIT_FAILURE); @@ -6554,9 +7033,6 @@ int ha_rocksdb::compare_key_parts(const KEY *const old_key, const KEY *const new_key) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(old_key != nullptr); - DBUG_ASSERT(new_key != nullptr); - /* Skip if key parts do not match, as it is a different key */ if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) { DBUG_RETURN(HA_EXIT_FAILURE); @@ -6601,7 +7077,6 @@ int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i, const struct key_def_cf_info &cf_info) const { DBUG_ENTER_FUNC(); - DBUG_ASSERT(new_key_def != nullptr); DBUG_ASSERT(*new_key_def == nullptr); uint64 ttl_duration = 0; @@ -6688,8 +7163,6 @@ int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i, int rdb_normalize_tablename(const std::string &tablename, std::string *const strbuf) { - DBUG_ASSERT(strbuf != nullptr); - if (tablename.size() < 2 || tablename[0] != '.' || tablename[1] != '/') { DBUG_ASSERT(0); // We were not passed table name? return HA_ERR_ROCKSDB_INVALID_TABLE; @@ -7004,8 +7477,6 @@ int ha_rocksdb::read_key_exact(const Rdb_key_def &kd, const bool &full_key_match, const rocksdb::Slice &key_slice, const int64_t ttl_filter_ts) { - DBUG_ASSERT(iter != nullptr); - /* We are looking for the first record such that index_tuple= lookup_tuple. 
@@ -7213,8 +7684,6 @@ int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd, } int ha_rocksdb::read_row_from_primary_key(uchar *const buf) { - DBUG_ASSERT(buf != nullptr); - int rc; const rocksdb::Slice &rkey = m_scan_it->key(); const uint pk_size = rkey.size(); @@ -7238,8 +7707,6 @@ int ha_rocksdb::read_row_from_primary_key(uchar *const buf) { int ha_rocksdb::read_row_from_secondary_key(uchar *const buf, const Rdb_key_def &kd, bool move_forward) { - DBUG_ASSERT(buf != nullptr); - int rc = 0; uint pk_size; @@ -7342,7 +7809,6 @@ ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const { pair for. */ int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) { - DBUG_ASSERT(buf != nullptr); DBUG_ASSERT(table != nullptr); stats.rows_requested++; @@ -7612,7 +8078,7 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, else rc = read_row_from_secondary_key(buf, kd, move_forward); - if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot) + if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) break; /* Exit the loop */ // release the snapshot and iterator so they will be regenerated @@ -7655,8 +8121,6 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward, uchar *const buf) { - DBUG_ASSERT(buf != nullptr); - if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) { const Rdb_key_def &kd = *m_key_descr_arr[active_index]; @@ -7974,8 +8438,6 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid, const bool skip_ttl_check) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(buf != nullptr); - DBUG_ASSERT(rowid != nullptr); DBUG_ASSERT(table != nullptr); int rc; @@ -8190,8 +8652,6 @@ int ha_rocksdb::index_last(uchar *const buf) { int ha_rocksdb::index_first_intern(uchar *const buf) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(buf != nullptr); - uchar *key; uint key_size; int rc; @@ -8205,7 +8665,7 @@ int 
ha_rocksdb::index_first_intern(uchar *const buf) { DBUG_ASSERT(key != nullptr); const Rdb_key_def &kd = *m_key_descr_arr[active_index]; - kd.get_first_key(key, &key_size); + int key_start_matching_bytes = kd.get_first_key(key, &key_size); rocksdb::Slice index_key((const char *)key, key_size); @@ -8216,12 +8676,12 @@ int ha_rocksdb::index_first_intern(uchar *const buf) { // Loop as long as we get a deadlock error AND we end up creating the // snapshot here (i.e. it did not exist prior to this) for (;;) { - setup_scan_iterator(kd, &index_key, false, Rdb_key_def::INDEX_NUMBER_SIZE); + setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes); m_scan_it->Seek(index_key); m_skip_scan_it_next_call = true; rc = index_next_with_direction(buf, true); - if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot) + if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) break; // exit the loop // release the snapshot and iterator so they will be regenerated @@ -8281,8 +8741,6 @@ int ha_rocksdb::index_first_intern(uchar *const buf) { int ha_rocksdb::index_last_intern(uchar *const buf) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(buf != nullptr); - uchar *key; uint key_size; int rc; @@ -8324,7 +8782,7 @@ int ha_rocksdb::index_last_intern(uchar *const buf) { rc = secondary_index_read(active_index, buf); } - if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot) + if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !is_new_snapshot) break; /* exit the loop */ // release the snapshot and iterator so they will be regenerated @@ -8405,7 +8863,6 @@ bool ha_rocksdb::commit_in_the_middle() { @retval false if bulk commit was skipped or succeeded */ bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) { - DBUG_ASSERT(tx != nullptr); return commit_in_the_middle() && tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) && tx->flush_batch(); @@ -8418,7 +8875,6 @@ bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) { 'auto-incremented' pk.) 
*/ bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const { - DBUG_ASSERT(table != nullptr); return Rdb_key_def::table_has_hidden_pk(table); } @@ -8428,9 +8884,7 @@ bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const { */ bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); return (table_arg->s->primary_key == MAX_INDEXES && index == tbl_def_arg->m_key_count - 1); @@ -8439,9 +8893,7 @@ bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg, /* Returns index of primary key */ uint ha_rocksdb::pk_index(const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1 : table_arg->s->primary_key; @@ -8450,9 +8902,7 @@ uint ha_rocksdb::pk_index(const TABLE *const table_arg, /* Returns true if given index number is a primary key */ bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); return index == table_arg->s->primary_key || is_hidden_pk(index, table_arg, tbl_def_arg); @@ -8467,9 +8917,6 @@ uint ha_rocksdb::max_supported_key_part_length() const { const char *ha_rocksdb::get_key_name(const uint index, const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); - if (is_hidden_pk(index, table_arg, tbl_def_arg)) { return HIDDEN_PK_NAME; } @@ -8483,9 +8930,6 @@ const char *ha_rocksdb::get_key_name(const uint index, const char *ha_rocksdb::get_key_comment(const uint index, const TABLE *const table_arg, const 
Rdb_tbl_def *const tbl_def_arg) { - DBUG_ASSERT(table_arg != nullptr); - DBUG_ASSERT(tbl_def_arg != nullptr); - if (is_hidden_pk(index, table_arg, tbl_def_arg)) { return nullptr; } @@ -8537,7 +8981,6 @@ const std::string ha_rocksdb::generate_cf_name(const uint index, } const std::string ha_rocksdb::get_table_comment(const TABLE *const table_arg) { - DBUG_ASSERT(table_arg != nullptr); DBUG_ASSERT(table_arg->s != nullptr); return table_arg->s->comment.str; @@ -8651,8 +9094,7 @@ int ha_rocksdb::check_and_lock_unique_pk(const uint &key_id, /* If the keys are the same, then no lock is needed */ - if (!Rdb_pk_comparator::bytewise_compare(row_info.new_pk_slice, - row_info.old_pk_slice)) { + if (!row_info.new_pk_slice.compare(row_info.old_pk_slice)) { *found = false; return HA_EXIT_SUCCESS; } @@ -8741,8 +9183,7 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, rocksdb::Slice((const char *)m_sk_packed_tuple, size); /* - For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs - always require locking. + Acquire lock on the old key in case of UPDATE */ if (row_info.old_data != nullptr) { size = kd.pack_record(table, m_pack_buffer, row_info.old_data, @@ -8751,14 +9192,18 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, const rocksdb::Slice old_slice = rocksdb::Slice((const char *)m_sk_packed_tuple_old, size); - /* - For updates, if the keys are the same, then no lock is needed + const rocksdb::Status s = + get_for_update(row_info.tx, kd.get_cf(), old_slice, nullptr); + if (!s.ok()) { + return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def, + m_table_handler); + } - Also check to see if the key has any fields set to NULL. If it does, then - this key is unique since NULL is not equal to each other, so no lock is - needed. 
+ /* + If the old and new keys are the same we're done since we've already taken + the lock on the old key */ - if (!Rdb_pk_comparator::bytewise_compare(new_slice, old_slice)) { + if (!new_slice.compare(old_slice)) { return HA_EXIT_SUCCESS; } } @@ -8784,8 +9229,14 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, The bloom filter may need to be disabled for this lookup. */ + uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + rocksdb::Slice lower_bound_slice; + rocksdb::Slice upper_bound_slice; + const bool total_order_seek = !check_bloom_and_set_bounds( - ha_thd(), kd, new_slice, all_parts_used); + ha_thd(), kd, new_slice, all_parts_used, Rdb_key_def::INDEX_NUMBER_SIZE, + lower_bound_buf, upper_bound_buf, &lower_bound_slice, &upper_bound_slice); const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache); const rocksdb::Status s = @@ -8796,9 +9247,8 @@ int ha_rocksdb::check_and_lock_sk(const uint &key_id, } rocksdb::Iterator *const iter = row_info.tx->get_iterator( - kd.get_cf(), total_order_seek, fill_cache, - m_eq_cond_lower_bound_slice, m_eq_cond_upper_bound_slice, - true /* read current data */, + kd.get_cf(), total_order_seek, fill_cache, lower_bound_slice, + upper_bound_slice, true /* read current data */, false /* acquire snapshot */); /* Need to scan the transaction to see if there is a duplicate key. 
@@ -8960,7 +9410,7 @@ int ha_rocksdb::update_pk(const Rdb_key_def &kd, } } - if (table->next_number_field) { + if (table->found_next_number_field) { update_auto_incr_val_from_field(); } @@ -9008,9 +9458,11 @@ int ha_rocksdb::update_pk(const Rdb_key_def &kd, } int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, - const struct update_row_info &row_info) { + const struct update_row_info &row_info, + const bool bulk_load_sk) { int new_packed_size; int old_packed_size; + int rc = HA_EXIT_SUCCESS; rocksdb::Slice new_key_slice; rocksdb::Slice new_value_slice; @@ -9088,18 +9540,23 @@ int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, rocksdb::Slice(reinterpret_cast(m_sk_tails.ptr()), m_sk_tails.get_current_pos()); - row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice, - new_value_slice); + if (bulk_load_sk && row_info.old_data == nullptr) { + rc = bulk_load_key(row_info.tx, kd, new_key_slice, new_value_slice, true); + } else { + row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice, + new_value_slice); + } row_info.tx->update_bytes_written(bytes_written + new_key_slice.size() + new_value_slice.size()); - return HA_EXIT_SUCCESS; + return rc; } int ha_rocksdb::update_indexes(const struct update_row_info &row_info, const bool &pk_changed) { int rc; + bool bulk_load_sk; // The PK must be updated first to pull out the TTL value. rc = update_pk(*m_pk_descr, row_info, pk_changed); @@ -9107,13 +9564,17 @@ int ha_rocksdb::update_indexes(const struct update_row_info &row_info, return rc; } - // Update the remaining indexes. + // Update the remaining indexes. 
Allow bulk loading only if + // allow_sk is enabled + bulk_load_sk = rocksdb_enable_bulk_load_api && + THDVAR(table->in_use, bulk_load) && + THDVAR(table->in_use, bulk_load_allow_sk); for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) { if (is_pk(key_id, table, m_tbl_def)) { continue; } - rc = update_sk(table, *m_key_descr_arr[key_id], row_info); + rc = update_sk(table, *m_key_descr_arr[key_id], row_info, bulk_load_sk); if (rc != HA_EXIT_SUCCESS) { return rc; } @@ -9206,25 +9667,22 @@ int ha_rocksdb::update_write_row(const uchar *const old_data, 0x0000b3eb003f65c5e78857, and lower bound would be 0x0000b3eb003f65c5e78859. These cover given eq condition range. */ -void ha_rocksdb::setup_iterator_bounds(const Rdb_key_def &kd, - const rocksdb::Slice &eq_cond) { - uint eq_cond_len = eq_cond.size(); - DBUG_ASSERT(eq_cond_len >= Rdb_key_def::INDEX_NUMBER_SIZE); - memcpy(m_eq_cond_upper_bound, eq_cond.data(), eq_cond_len); - kd.successor(m_eq_cond_upper_bound, eq_cond_len); - memcpy(m_eq_cond_lower_bound, eq_cond.data(), eq_cond_len); - kd.predecessor(m_eq_cond_lower_bound, eq_cond_len); +void ha_rocksdb::setup_iterator_bounds( + const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, size_t bound_len, + uchar *const lower_bound, uchar *const upper_bound, + rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice) { + uint min_len = std::min(eq_cond.size(), bound_len); + memcpy(upper_bound, eq_cond.data(), min_len); + kd.successor(upper_bound, min_len); + memcpy(lower_bound, eq_cond.data(), min_len); + kd.predecessor(lower_bound, min_len); if (kd.m_is_reverse_cf) { - m_eq_cond_upper_bound_slice = - rocksdb::Slice((const char *)m_eq_cond_lower_bound, eq_cond_len); - m_eq_cond_lower_bound_slice = - rocksdb::Slice((const char *)m_eq_cond_upper_bound, eq_cond_len); + *upper_bound_slice = rocksdb::Slice((const char *)lower_bound, min_len); + *lower_bound_slice = rocksdb::Slice((const char *)upper_bound, min_len); } else { - m_eq_cond_upper_bound_slice = 
- rocksdb::Slice((const char *)m_eq_cond_upper_bound, eq_cond_len); - m_eq_cond_lower_bound_slice = - rocksdb::Slice((const char *)m_eq_cond_lower_bound, eq_cond_len); + *upper_bound_slice = rocksdb::Slice((const char *)upper_bound, min_len); + *lower_bound_slice = rocksdb::Slice((const char *)lower_bound, min_len); } } @@ -9236,7 +9694,6 @@ void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *const slice, const bool use_all_keys, const uint eq_cond_len) { - DBUG_ASSERT(slice != nullptr); DBUG_ASSERT(slice->size() >= eq_cond_len); Rdb_transaction *const tx = get_or_create_tx(table->in_use); @@ -9244,7 +9701,10 @@ void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd, bool skip_bloom = true; const rocksdb::Slice eq_cond(slice->data(), eq_cond_len); - if (check_bloom_and_set_bounds(ha_thd(), kd, eq_cond, use_all_keys)) { + if (check_bloom_and_set_bounds( + ha_thd(), kd, eq_cond, use_all_keys, eq_cond_len, + m_scan_it_lower_bound, m_scan_it_upper_bound, + &m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) { skip_bloom = false; } @@ -9285,8 +9745,8 @@ void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd, m_scan_it = rdb->NewIterator(read_opts, kd.get_cf()); } else { m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache, - m_eq_cond_lower_bound_slice, - m_eq_cond_upper_bound_slice); + m_scan_it_lower_bound_slice, + m_scan_it_upper_bound_slice); } m_scan_it_skips_bloom = skip_bloom; } @@ -9305,12 +9765,12 @@ void ha_rocksdb::release_scan_iterator() { void ha_rocksdb::setup_iterator_for_rnd_scan() { uint key_size; - m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size); + int key_start_matching_bytes = m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size); rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size); setup_scan_iterator(*m_pk_descr, &table_key, false, - Rdb_key_def::INDEX_NUMBER_SIZE); + key_start_matching_bytes); m_scan_it->Seek(table_key); m_skip_scan_it_next_call = true; } @@ -9353,7 
+9813,7 @@ int ha_rocksdb::rnd_next(uchar *const buf) { ha_statistic_increment(&SSV::ha_read_rnd_next_count); for (;;) { rc = rnd_next_with_direction(buf, true); - if (rc != HA_ERR_LOCK_DEADLOCK || !m_rnd_scan_is_new_snapshot) + if (rc != HA_ERR_ROCKSDB_STATUS_BUSY || !m_rnd_scan_is_new_snapshot) break; /* exit the loop */ // release the snapshot and iterator and then regenerate them @@ -9676,7 +10136,7 @@ int ha_rocksdb::info(uint flag) { if (static_cast(stats.data_file_length) < 0 || static_cast(stats.index_file_length) < 0 || static_cast(stats.records) < 0) { - if (analyze(nullptr, nullptr)) { + if (calculate_stats_for_table()) { DBUG_RETURN(HA_EXIT_FAILURE); } @@ -9756,10 +10216,21 @@ int ha_rocksdb::info(uint flag) { KEY *const k = &table->key_info[i]; for (uint j = 0; j < k->actual_key_parts; j++) { const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats; - uint x = k_stats.m_distinct_keys_per_prefix.size() > j && - k_stats.m_distinct_keys_per_prefix[j] > 0 - ? k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j] - : 0; + uint x; + + if (k_stats.m_distinct_keys_per_prefix.size() > j && + k_stats.m_distinct_keys_per_prefix[j] > 0) { + x = k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j]; + /* + If the number of rows is less than the number of prefixes (due to + sampling), the average number of rows with the same prefix is 1. 
+ */ + if (x == 0) { + x = 1; + } + } else { + x = 0; + } if (x > stats.records) x = stats.records; if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) || @@ -9932,7 +10403,7 @@ THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to, m_lock_rows = RDB_LOCK_WRITE; } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) { m_lock_rows = RDB_LOCK_READ; - } else { + } else if (lock_type != TL_IGNORE) { m_lock_rows = RDB_LOCK_NONE; if (THDVAR(thd, lock_scanned_rows)) { /* @@ -10023,22 +10494,24 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) { } if (lock_type == F_UNLCK) { - Rdb_transaction *const tx = get_or_create_tx(thd); + Rdb_transaction *const tx = get_tx_from_thd(thd); - tx->io_perf_end_and_record(&m_io_perf); - tx->m_n_mysql_tables_in_use--; - if (tx->m_n_mysql_tables_in_use == 0 && - !my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { - /* - Do like InnoDB: when we get here, it's time to commit a - single-statement transaction. + if (tx) { + tx->io_perf_end_and_record(&m_io_perf); + tx->m_n_mysql_tables_in_use--; + if (tx->m_n_mysql_tables_in_use == 0 && + !my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + /* + Do like InnoDB: when we get here, it's time to commit a + single-statement transaction. - If the statement involved multiple tables, this code will be executed - for each of them, but that's ok because non-first tx->commit() calls - will be no-ops. - */ - if (tx->commit_or_rollback()) { - res = HA_ERR_INTERNAL_ERROR; + If the statement involved multiple tables, this code will be executed + for each of them, but that's ok because non-first tx->commit() calls + will be no-ops. 
+ */ + if (tx->commit_or_rollback()) { + res = HA_ERR_INTERNAL_ERROR; + } } } } else { @@ -10075,6 +10548,11 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) { DBUG_RETURN(HA_ERR_UNSUPPORTED); } + if (thd->get_explicit_snapshot()) { + my_error(ER_UPDATES_WITH_EXPLICIT_SNAPSHOT, MYF(0)); + DBUG_RETURN(HA_ERR_UNSUPPORTED); + } + /* SQL layer signals us to take a write lock. It does so when starting DML statement. We should put locks on the rows we're reading. @@ -10282,8 +10760,6 @@ void Rdb_drop_index_thread::run() { } Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) { - DBUG_ASSERT(tablename != nullptr); - std::string str; if (rdb_normalize_tablename(tablename, &str) != HA_EXIT_SUCCESS) { // We were not passed table name? @@ -10346,6 +10822,12 @@ int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) { char key_buf[MAX_KEY_LENGTH]; uint key_len; ulonglong bytes_written = 0; + + uchar lower_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + uchar upper_bound_buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + rocksdb::Slice lower_bound_slice; + rocksdb::Slice upper_bound_slice; + /* Remove all records in each index. 
(This is is not crash-safe, but it doesn't matter, because bulk row @@ -10356,9 +10838,12 @@ int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) { kd.get_infimum_key(reinterpret_cast(key_buf), &key_len); rocksdb::ColumnFamilyHandle *cf = kd.get_cf(); const rocksdb::Slice table_key(key_buf, key_len); - setup_iterator_bounds(kd, table_key); - opts.iterate_lower_bound = &m_eq_cond_lower_bound_slice; - opts.iterate_upper_bound = &m_eq_cond_upper_bound_slice; + setup_iterator_bounds(kd, table_key, Rdb_key_def::INDEX_NUMBER_SIZE, + lower_bound_buf, upper_bound_buf, &lower_bound_slice, + &upper_bound_slice); + DBUG_ASSERT(key_len == Rdb_key_def::INDEX_NUMBER_SIZE); + opts.iterate_lower_bound = &lower_bound_slice; + opts.iterate_upper_bound = &upper_bound_slice; std::unique_ptr it(rdb->NewIterator(opts, cf)); it->Seek(table_key); @@ -10397,9 +10882,6 @@ int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) { int ha_rocksdb::rename_table(const char *const from, const char *const to) { DBUG_ENTER_FUNC(); - DBUG_ASSERT(from != nullptr); - DBUG_ASSERT(to != nullptr); - std::string from_str; std::string to_str; std::string from_db; @@ -10652,29 +11134,28 @@ int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) { DBUG_RETURN(HA_EXIT_SUCCESS); } -int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, - HA_CHECK_OPT *const check_opt) { +static int calculate_stats( + const std::unordered_map> + &to_recalc, + bool include_memtables) { DBUG_ENTER_FUNC(); // find per column family key ranges which need to be queried std::unordered_map> ranges; - std::unordered_set ids_to_check; - std::vector buf(table_arg->s->keys * 2 * - Rdb_key_def::INDEX_NUMBER_SIZE); std::unordered_map stats; - for (uint i = 0; i < table_arg->s->keys; i++) { - const auto bufp = &buf[i * 2 * Rdb_key_def::INDEX_NUMBER_SIZE]; - const Rdb_key_def &kd = *m_key_descr_arr[i]; - const GL_INDEX_ID index_id = kd.get_gl_index_id(); - ranges[kd.get_cf()].push_back(get_range(i, bufp)); + 
std::vector buf(to_recalc.size() * 2 * Rdb_key_def::INDEX_NUMBER_SIZE); + + uchar *bufp = buf.data(); + for (const auto &it : to_recalc) { + const GL_INDEX_ID index_id = it.first; + auto &kd = it.second; + ranges[kd->get_cf()].push_back(myrocks::get_range(*kd, bufp)); + bufp += 2 * Rdb_key_def::INDEX_NUMBER_SIZE; - ids_to_check.insert(index_id); - // Initialize the stats to 0. If there are no files that contain - // this gl_index_id, then 0 should be stored for the cached stats. stats[index_id] = Rdb_index_stats(index_id); - DBUG_ASSERT(kd.get_key_parts() > 0); - stats[index_id].m_distinct_keys_per_prefix.resize(kd.get_key_parts()); + DBUG_ASSERT(kd->get_key_parts() > 0); + stats[index_id].m_distinct_keys_per_prefix.resize(kd->get_key_parts()); } // get RocksDB table properties for these ranges @@ -10685,8 +11166,8 @@ int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, it.first, &it.second[0], it.second.size(), &props); DBUG_ASSERT(props.size() >= old_size); if (!status.ok()) { - DBUG_RETURN( - rdb_error_to_mysql(status, "Could not access RocksDB properties")); + DBUG_RETURN(ha_rocksdb::rdb_error_to_mysql( + status, "Could not access RocksDB properties")); } } @@ -10707,61 +11188,62 @@ int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, other SQL tables, it can be that we're only seeing a small fraction of table's entries (and so we can't update statistics based on that). 
*/ - if (ids_to_check.find(it1.m_gl_index_id) == ids_to_check.end()) + if (stats.find(it1.m_gl_index_id) == stats.end()) { continue; + } - auto kd = ddl_manager.safe_find(it1.m_gl_index_id); - DBUG_ASSERT(kd != nullptr); - stats[it1.m_gl_index_id].merge(it1, true, kd->max_storage_fmt_length()); + auto it_index = to_recalc.find(it1.m_gl_index_id); + DBUG_ASSERT(it_index != to_recalc.end()); + if (it_index == to_recalc.end()) { + continue; + } + stats[it1.m_gl_index_id].merge( + it1, true, it_index->second->max_storage_fmt_length()); } num_sst++; } - // calculate memtable cardinality - Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct); - auto read_opts = rocksdb::ReadOptions(); - read_opts.read_tier = rocksdb::ReadTier::kMemtableTier; - for (uint i = 0; i < table_arg->s->keys; i++) { - const Rdb_key_def &kd = *m_key_descr_arr[i]; - Rdb_index_stats &stat = stats[kd.get_gl_index_id()]; + if (include_memtables) { + // calculate memtable cardinality + Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct); + auto read_opts = rocksdb::ReadOptions(); + read_opts.read_tier = rocksdb::ReadTier::kMemtableTier; + for (const auto &it_kd : to_recalc) { + const std::shared_ptr &kd = it_kd.second; + Rdb_index_stats &stat = stats[kd->get_gl_index_id()]; - uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; - auto r = get_range(i, r_buf); - uint64_t memtableCount; - uint64_t memtableSize; - rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memtableCount, - &memtableSize); - if (memtableCount < (uint64_t)stat.m_rows / 10) { - // skip tables that already have enough stats from SST files to reduce - // overhead and avoid degradation of big tables stats by sampling from - // relatively tiny (less than 10% of full data set) memtable dataset - continue; - } - - std::unique_ptr it = std::unique_ptr( - rdb->NewIterator(read_opts, kd.get_cf())); - - uchar *first_key; - uint key_size; - if (is_pk(i, table, m_tbl_def)) { - first_key = m_pk_packed_tuple; 
- } else { - first_key = m_sk_packed_tuple; - } - kd.get_first_key(first_key, &key_size); - rocksdb::Slice first_index_key((const char *)first_key, key_size); - - cardinality_collector.Reset(); - for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) { - const rocksdb::Slice key = it->key(); - if (!kd.covers_key(key)) { - break; // end of this index + uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; + auto r = myrocks::get_range(*kd, r_buf); + uint64_t memtableCount; + uint64_t memtableSize; + rdb->GetApproximateMemTableStats(kd->get_cf(), r, &memtableCount, + &memtableSize); + if (memtableCount < (uint64_t)stat.m_rows / 10) { + // skip tables that already have enough stats from SST files to reduce + // overhead and avoid degradation of big tables stats by sampling from + // relatively tiny (less than 10% of full data set) memtable dataset + continue; } - stat.m_rows++; - cardinality_collector.ProcessKey(key, &kd, &stat); + std::unique_ptr it = + std::unique_ptr( + rdb->NewIterator(read_opts, kd->get_cf())); + + rocksdb::Slice first_index_key((const char *)r_buf, + Rdb_key_def::INDEX_NUMBER_SIZE); + + cardinality_collector.Reset(); + for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) { + const rocksdb::Slice key = it->key(); + if (!kd->covers_key(key)) { + break; // end of this index + } + stat.m_rows++; + + cardinality_collector.ProcessKey(key, kd.get(), &stat); + } + cardinality_collector.AdjustStats(&stat); } - cardinality_collector.AdjustStats(&stat); } // set and persist new stats @@ -10771,6 +11253,19 @@ int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, DBUG_RETURN(HA_EXIT_SUCCESS); } +int ha_rocksdb::calculate_stats_for_table() { + DBUG_ENTER_FUNC(); + + std::unordered_map> + ids_to_check; + for (uint i = 0; i < table->s->keys; i++) { + ids_to_check.insert(std::make_pair(m_key_descr_arr[i]->get_gl_index_id(), + m_key_descr_arr[i])); + } + + DBUG_RETURN(calculate_stats(ids_to_check, true)); +} + /* @return 
HA_ADMIN_OK OK @@ -10779,7 +11274,16 @@ int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd, int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) { DBUG_ENTER_FUNC(); - if (table && calculate_stats(table, thd, check_opt) != HA_EXIT_SUCCESS) { + if (table) { + if (calculate_stats_for_table() != HA_EXIT_SUCCESS) { + DBUG_RETURN(HA_ADMIN_FAILED); + } + } + + // A call to ::info is needed to repopulate some SQL level structs. This is + // necessary for online analyze because we cannot rely on another ::open + // call to call info for us. + if (info(HA_STATUS_CONST | HA_STATUS_VARIABLE) != HA_EXIT_SUCCESS) { DBUG_RETURN(HA_ADMIN_FAILED); } @@ -11184,7 +11688,9 @@ bool ha_rocksdb::prepare_inplace_alter_table( if (!new_tdef) { new_tdef = m_tbl_def; } - max_auto_incr = load_auto_incr_value_from_index(); + if (table->found_next_number_field) { + max_auto_incr = load_auto_incr_value_from_index(); + } } ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx( @@ -11636,18 +12142,6 @@ bool ha_rocksdb::commit_inplace_alter_table( dict_manager.finish_indexes_operation( create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING); - /* - We need to recalculate the index stats here manually. The reason is that - the secondary index does not exist inside - m_index_num_to_keydef until it is committed to the data dictionary, which - prevents us from updating the stats normally as the ddl_manager cannot - find the proper gl_index_ids yet during adjust_stats calls. 
- */ - if (calculate_stats(altered_table, nullptr, nullptr)) { - /* Failed to update index statistics, should never happen */ - DBUG_ASSERT(0); - } - rdb_drop_idx_thread.signal(); } @@ -11719,6 +12213,8 @@ struct rocksdb_status_counters_t { uint64_t block_cache_data_hit; uint64_t block_cache_data_add; uint64_t bloom_filter_useful; + uint64_t bloom_filter_full_positive; + uint64_t bloom_filter_full_true_positive; uint64_t memtable_hit; uint64_t memtable_miss; uint64_t get_hit_l0; @@ -11793,6 +12289,8 @@ DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS) DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT) DEF_SHOW_FUNC(block_cache_data_add, BLOCK_CACHE_DATA_ADD) DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL) +DEF_SHOW_FUNC(bloom_filter_full_positive, BLOOM_FILTER_FULL_POSITIVE) +DEF_SHOW_FUNC(bloom_filter_full_true_positive, BLOOM_FILTER_FULL_TRUE_POSITIVE) DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT) DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS) DEF_SHOW_FUNC(get_hit_l0, GET_HIT_L0) @@ -12027,6 +12525,8 @@ static SHOW_VAR rocksdb_status_vars[] = { DEF_STATUS_VAR(block_cache_data_hit), DEF_STATUS_VAR(block_cache_data_add), DEF_STATUS_VAR(bloom_filter_useful), + DEF_STATUS_VAR(bloom_filter_full_positive), + DEF_STATUS_VAR(bloom_filter_full_true_positive), DEF_STATUS_VAR(memtable_hit), DEF_STATUS_VAR(memtable_miss), DEF_STATUS_VAR(get_hit_l0), @@ -12084,6 +12584,10 @@ static SHOW_VAR rocksdb_status_vars[] = { &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG), DEF_STATUS_VAR_PTR("wal_group_syncs", &rocksdb_wal_group_syncs, SHOW_LONGLONG), + DEF_STATUS_VAR_PTR("manual_compactions_processed", + &rocksdb_manual_compactions_processed, SHOW_LONGLONG), + DEF_STATUS_VAR_PTR("manual_compactions_running", + &rocksdb_manual_compactions_running, SHOW_LONGLONG), DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put, SHOW_LONGLONG), DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete, @@ -12158,6 +12662,42 @@ void 
Rdb_background_thread::run() { } } + // Recalculate statistics for indexes. + if (rocksdb_stats_recalc_rate) { + std::unordered_map> + to_recalc; + + if (rdb_indexes_to_recalc.empty()) { + struct Rdb_index_collector : public Rdb_tables_scanner { + int add_table(Rdb_tbl_def *tdef) override { + for (uint i = 0; i < tdef->m_key_count; i++) { + rdb_indexes_to_recalc.push_back( + tdef->m_key_descr_arr[i]->get_gl_index_id()); + } + return HA_EXIT_SUCCESS; + } + } collector; + ddl_manager.scan_for_tables(&collector); + } + + while (to_recalc.size() < rocksdb_stats_recalc_rate && + !rdb_indexes_to_recalc.empty()) { + const auto index_id = rdb_indexes_to_recalc.back(); + rdb_indexes_to_recalc.pop_back(); + + std::shared_ptr keydef = + ddl_manager.safe_find(index_id); + + if (keydef) { + to_recalc.insert(std::make_pair(keydef->get_gl_index_id(), keydef)); + } + } + + if (!to_recalc.empty()) { + calculate_stats(to_recalc, false); + } + } + // Set the next timestamp for mysql_cond_timedwait() (which ends up calling // pthread_cond_timedwait()) to wait on. ts_next_sync.tv_sec = ts.tv_sec + WAKE_UP_INTERVAL; @@ -12167,12 +12707,152 @@ void Rdb_background_thread::run() { ddl_manager.persist_stats(); } -bool ha_rocksdb::check_bloom_and_set_bounds(THD *thd, const Rdb_key_def &kd, - const rocksdb::Slice &eq_cond, - const bool use_all_keys) { +/* + A background thread to handle manual compactions, + except for dropping indexes/tables. Every second, it checks + pending manual compactions, and it calls CompactRange if there is. 
+*/ +void Rdb_manual_compaction_thread::run() { + mysql_mutex_init(0, &m_mc_mutex, MY_MUTEX_INIT_FAST); + RDB_MUTEX_LOCK_CHECK(m_signal_mutex); + for (;;) { + if (m_stop) { + break; + } + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += 1; + + const auto ret MY_ATTRIBUTE((__unused__)) = + mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts); + if (m_stop) { + break; + } + // make sure, no program error is returned + DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT); + RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex); + + RDB_MUTEX_LOCK_CHECK(m_mc_mutex); + // Grab the first item and proceed, if not empty. + if (m_requests.empty()) { + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + RDB_MUTEX_LOCK_CHECK(m_signal_mutex); + continue; + } + Manual_compaction_request &mcr = m_requests.begin()->second; + DBUG_ASSERT(mcr.cf != nullptr); + DBUG_ASSERT(mcr.state == Manual_compaction_request::INITED); + mcr.state = Manual_compaction_request::RUNNING; + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + + DBUG_ASSERT(mcr.state == Manual_compaction_request::RUNNING); + // NO_LINT_DEBUG + sql_print_information("Manual Compaction id %d cf %s started.", mcr.mc_id, + mcr.cf->GetName().c_str()); + rocksdb_manual_compactions_running++; + if (rocksdb_debug_manual_compaction_delay > 0) { + my_sleep(rocksdb_debug_manual_compaction_delay * 1000000); + } + // CompactRange may take a very long time. On clean shutdown, + // it is cancelled by CancelAllBackgroundWork, then status is + // set to shutdownInProgress. + const rocksdb::Status s = rdb->CompactRange( + getCompactRangeOptions(mcr.concurrency), mcr.cf, mcr.start, mcr.limit); + rocksdb_manual_compactions_running--; + if (s.ok()) { + // NO_LINT_DEBUG + sql_print_information("Manual Compaction id %d cf %s ended.", mcr.mc_id, + mcr.cf->GetName().c_str()); + } else { + // NO_LINT_DEBUG + sql_print_information("Manual Compaction id %d cf %s aborted. 
%s", + mcr.mc_id, mcr.cf->GetName().c_str(), s.getState()); + if (!s.IsShutdownInProgress()) { + rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD); + } else { + DBUG_ASSERT(m_requests.size() == 1); + } + } + rocksdb_manual_compactions_processed++; + clear_manual_compaction_request(mcr.mc_id, false); + RDB_MUTEX_LOCK_CHECK(m_signal_mutex); + } + clear_all_manual_compaction_requests(); + DBUG_ASSERT(m_requests.empty()); + RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex); + mysql_mutex_destroy(&m_mc_mutex); +} + +void Rdb_manual_compaction_thread::clear_all_manual_compaction_requests() { + RDB_MUTEX_LOCK_CHECK(m_mc_mutex); + m_requests.clear(); + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); +} + +void Rdb_manual_compaction_thread::clear_manual_compaction_request( + int mc_id, bool init_only) { + bool erase = true; + RDB_MUTEX_LOCK_CHECK(m_mc_mutex); + auto it = m_requests.find(mc_id); + if (it != m_requests.end()) { + if (init_only) { + Manual_compaction_request mcr = it->second; + if (mcr.state != Manual_compaction_request::INITED) { + erase = false; + } + } + if (erase) { + m_requests.erase(it); + } + } else { + // Current code path guarantees that erasing by the same mc_id happens + // at most once. INITED state may be erased by a thread that requested + // the compaction. RUNNING state is erased by mc thread only. 
+ DBUG_ASSERT(0); + } + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); +} + +int Rdb_manual_compaction_thread::request_manual_compaction( + rocksdb::ColumnFamilyHandle *cf, rocksdb::Slice *start, + rocksdb::Slice *limit, int concurrency) { + int mc_id = -1; + RDB_MUTEX_LOCK_CHECK(m_mc_mutex); + if (m_requests.size() >= rocksdb_max_manual_compactions) { + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + return mc_id; + } + Manual_compaction_request mcr; + mc_id = mcr.mc_id = ++m_latest_mc_id; + mcr.state = Manual_compaction_request::INITED; + mcr.cf = cf; + mcr.start = start; + mcr.limit = limit; + mcr.concurrency = concurrency; + m_requests.insert(std::make_pair(mcr.mc_id, mcr)); + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + return mc_id; +} + +bool Rdb_manual_compaction_thread::is_manual_compaction_finished(int mc_id) { + bool finished = false; + RDB_MUTEX_LOCK_CHECK(m_mc_mutex); + if (m_requests.count(mc_id) == 0) { + finished = true; + } + RDB_MUTEX_UNLOCK_CHECK(m_mc_mutex); + return finished; +} + +bool ha_rocksdb::check_bloom_and_set_bounds( + THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, + const bool use_all_keys, size_t bound_len, uchar *const lower_bound, + uchar *const upper_bound, rocksdb::Slice *lower_bound_slice, + rocksdb::Slice *upper_bound_slice) { bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys); if (!can_use_bloom) { - setup_iterator_bounds(kd, eq_cond); + setup_iterator_bounds(kd, eq_cond, bound_len, lower_bound, upper_bound, + lower_bound_slice, upper_bound_slice); } return can_use_bloom; } @@ -12282,7 +12962,6 @@ void rdb_update_global_stats(const operation_type &type, uint count, int rdb_get_table_perf_counters(const char *const tablename, Rdb_perf_counters *const counters) { - DBUG_ASSERT(counters != nullptr); DBUG_ASSERT(tablename != nullptr); Rdb_table_handler *table_handler; @@ -12321,9 +13000,6 @@ const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) { // In case of core dump generation we want this function 
NOT to be optimized // so that we can capture as much data as possible to debug the root cause // more efficiently. -#pragma GCC push_options -#pragma GCC optimize("O0") - void rdb_handle_io_error(const rocksdb::Status status, const RDB_IO_ERROR_TYPE err_type) { if (status.IsIOError()) { @@ -12338,6 +13014,9 @@ void rdb_handle_io_error(const rocksdb::Status status, } case RDB_IO_ERROR_BG_THREAD: { rdb_log_status_error(status, "BG thread failed to write to RocksDB"); + /* NO_LINT_DEBUG */ + sql_print_error("MyRocks: aborting on BG write error."); + abort(); break; } case RDB_IO_ERROR_GENERAL: { @@ -12373,8 +13052,6 @@ void rdb_handle_io_error(const rocksdb::Status status, } } -#pragma GCC pop_options - Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; } Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; } @@ -12672,6 +13349,42 @@ static void rocksdb_set_wal_bytes_per_sync( RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); } +/* + Validating and updating block cache size via sys_var::check path. + SetCapacity may take seconds when reducing block cache, and + sys_var::update holds LOCK_global_system_variables mutex, so + updating block cache size is done at check path instead. 
+*/ +static int rocksdb_validate_set_block_cache_size( + THD *thd MY_ATTRIBUTE((__unused__)), + struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), + void *var_ptr, struct st_mysql_value *value) { + DBUG_ASSERT(value != nullptr); + + long long new_value; + + /* value is NULL */ + if (value->val_int(value, &new_value)) { + return HA_EXIT_FAILURE; + } + + if (new_value < RDB_MIN_BLOCK_CACHE_SIZE || + (uint64_t)new_value > (uint64_t)LONGLONG_MAX) { + return HA_EXIT_FAILURE; + } + + RDB_MUTEX_LOCK_CHECK(rdb_block_cache_resize_mutex); + const rocksdb::BlockBasedTableOptions &table_options = + rdb_get_table_options(); + + if (rocksdb_block_cache_size != new_value && table_options.block_cache) { + table_options.block_cache->SetCapacity(new_value); + } + *static_cast(var_ptr) = static_cast(new_value); + RDB_MUTEX_UNLOCK_CHECK(rdb_block_cache_resize_mutex); + return HA_EXIT_SUCCESS; +} + static int rocksdb_validate_update_cf_options(THD * /* unused */, struct st_mysql_sys_var * /*unused*/, @@ -12849,6 +13562,13 @@ double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) { DBUG_RETURN((rows / 20.0) + 1); } +void ha_rocksdb::print_error(int error, myf errflag) { + if (error == HA_ERR_ROCKSDB_STATUS_BUSY) { + error = HA_ERR_LOCK_DEADLOCK; + } + handler::print_error(error, errflag); +} + std::string rdb_corruption_marker_file_name() { std::string ret(rocksdb_datadir); ret.append("/ROCKSDB_CORRUPTED"); @@ -12885,5 +13605,6 @@ mysql_declare_plugin(rocksdb_se){ myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global, myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats, myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl, - myrocks::rdb_i_s_index_file_map, myrocks::rdb_i_s_lock_info, - myrocks::rdb_i_s_trx_info, myrocks::rdb_i_s_deadlock_info mysql_declare_plugin_end; + myrocks::rdb_i_s_sst_props, myrocks::rdb_i_s_index_file_map, + myrocks::rdb_i_s_lock_info, myrocks::rdb_i_s_trx_info, + myrocks::rdb_i_s_deadlock_info 
mysql_declare_plugin_end; diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index 0b3b473d841..ca141657413 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -104,6 +104,7 @@ struct Rdb_deadlock_info { std::string table_name; }; std::vector path; + int64_t deadlock_time; ulonglong victim_trx_id; }; @@ -144,6 +145,11 @@ const char *const BG_THREAD_NAME = "myrocks-bg"; */ const char *const INDEX_THREAD_NAME = "myrocks-index"; +/* + Name for the manual compaction thread. +*/ +const char *const MANUAL_COMPACTION_THREAD_NAME = "myrocks-mc"; + /* Separator between partition name and the qualifier. Sample usage: @@ -485,6 +491,12 @@ class ha_rocksdb : public my_core::handler { const rocksdb::Snapshot *m_scan_it_snapshot; + /* Buffers used for upper/lower bounds for m_scan_it. */ + uchar *m_scan_it_lower_bound; + uchar *m_scan_it_upper_bound; + rocksdb::Slice m_scan_it_lower_bound_slice; + rocksdb::Slice m_scan_it_upper_bound_slice; + Rdb_tbl_def *m_tbl_def; /* Primary Key encoder from KeyTupleFormat to StorageFormat */ @@ -548,12 +560,6 @@ class ha_rocksdb : public my_core::handler { uchar *m_dup_sk_packed_tuple; uchar *m_dup_sk_packed_tuple_old; - /* Buffers used for passing upper/bound eq conditions. */ - uchar *m_eq_cond_lower_bound; - uchar *m_eq_cond_upper_bound; - rocksdb::Slice m_eq_cond_lower_bound_slice; - rocksdb::Slice m_eq_cond_upper_bound_slice; - /* Temporary space for packing VARCHARs (we provide it to pack_record()/pack_index_tuple() calls). 
@@ -635,13 +641,20 @@ class ha_rocksdb : public my_core::handler { enum ha_rkey_function find_flag) const MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); void setup_iterator_bounds(const Rdb_key_def &kd, - const rocksdb::Slice &eq_cond); + const rocksdb::Slice &eq_cond, size_t bound_len, + uchar *const lower_bound, uchar *const upper_bound, + rocksdb::Slice *lower_bound_slice, + rocksdb::Slice *upper_bound_slice); bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, const bool use_all_keys); bool check_bloom_and_set_bounds(THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, - const bool use_all_keys); + const bool use_all_keys, size_t bound_len, + uchar *const lower_bound, + uchar *const upper_bound, + rocksdb::Slice *lower_bound_slice, + rocksdb::Slice *upper_bound_slice); void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *slice, const bool use_all_keys, const uint eq_cond_len) MY_ATTRIBUTE((__nonnull__)); @@ -834,7 +847,7 @@ public: HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS | (m_pk_can_be_decoded ? 
HA_PRIMARY_KEY_IN_READ_INDEX : 0) | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY | - HA_PARTIAL_COLUMN_READ); + HA_PARTIAL_COLUMN_READ | HA_ONLINE_ANALYZE); } bool init_with_fields() override; @@ -1009,6 +1022,7 @@ public: } virtual double read_time(uint, uint, ha_rows rows) override; + virtual void print_error(int error, myf errflag) override; int open(const char *const name, int mode, uint test_if_locked) override MY_ATTRIBUTE((__warn_unused_result__)); @@ -1123,8 +1137,8 @@ private: MY_ATTRIBUTE((__nonnull__)); int compare_key_parts(const KEY *const old_key, - const KEY *const new_key) const; - MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); + const KEY *const new_key) const + MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); int compare_keys(const KEY *const old_key, const KEY *const new_key) const MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); @@ -1180,7 +1194,7 @@ private: int update_pk(const Rdb_key_def &kd, const struct update_row_info &row_info, const bool &pk_changed) MY_ATTRIBUTE((__warn_unused_result__)); int update_sk(const TABLE *const table_arg, const Rdb_key_def &kd, - const struct update_row_info &row_info) + const struct update_row_info &row_info, const bool bulk_load_sk) MY_ATTRIBUTE((__warn_unused_result__)); int update_indexes(const struct update_row_info &row_info, const bool &pk_changed) @@ -1234,7 +1248,9 @@ private: int finalize_bulk_load(bool print_client_error = true) MY_ATTRIBUTE((__warn_unused_result__)); -public: + int calculate_stats_for_table() MY_ATTRIBUTE((__warn_unused_result__)); + + public: int index_init(uint idx, bool sorted) override MY_ATTRIBUTE((__warn_unused_result__)); int index_end() override MY_ATTRIBUTE((__warn_unused_result__)); @@ -1327,9 +1343,6 @@ public: MY_ATTRIBUTE((__warn_unused_result__)); int analyze(THD *const thd, HA_CHECK_OPT *const check_opt) override MY_ATTRIBUTE((__warn_unused_result__)); - int calculate_stats(const TABLE *const table_arg, THD *const thd, - HA_CHECK_OPT *const 
check_opt) - MY_ATTRIBUTE((__warn_unused_result__)); enum_alter_inplace_result check_if_supported_inplace_alter( TABLE *altered_table, @@ -1356,7 +1369,7 @@ public: virtual void rpl_after_delete_rows() override; virtual void rpl_before_update_rows() override; virtual void rpl_after_update_rows() override; - virtual bool use_read_free_rpl(); + virtual bool use_read_free_rpl() override; private: /* Flags tracking if we are inside different replication operation */ diff --git a/storage/rocksdb/ha_rocksdb_proto.h b/storage/rocksdb/ha_rocksdb_proto.h index 71ec6957456..a2d014f2f01 100644 --- a/storage/rocksdb/ha_rocksdb_proto.h +++ b/storage/rocksdb/ha_rocksdb_proto.h @@ -39,7 +39,12 @@ enum RDB_IO_ERROR_TYPE { const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type); void rdb_handle_io_error(const rocksdb::Status status, - const RDB_IO_ERROR_TYPE err_type); + const RDB_IO_ERROR_TYPE err_type) +#if defined(__clang__) + MY_ATTRIBUTE((optnone)); +#else + MY_ATTRIBUTE((optimize("O0"))); +#endif int rdb_normalize_tablename(const std::string &tablename, std::string *str) MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)); diff --git a/storage/rocksdb/mysql-test/rocksdb/combinations b/storage/rocksdb/mysql-test/rocksdb/combinations new file mode 100644 index 00000000000..c3f6b9d0396 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/combinations @@ -0,0 +1,6 @@ +[write_committed] +rocksdb_write_policy=write_committed + +[write_prepared] +rocksdb_write_policy=write_prepared +rocksdb_commit_time_batch_for_recovery=on diff --git a/storage/rocksdb/mysql-test/rocksdb/include/have_write_committed.inc b/storage/rocksdb/mysql-test/rocksdb/include/have_write_committed.inc new file mode 100644 index 00000000000..681b966f680 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/include/have_write_committed.inc @@ -0,0 +1,3 @@ +if (`select count(*) = 0 from information_schema.session_variables where variable_name = 'rocksdb_write_policy' and variable_value = 
'write_committed';`) { + --skip Test requires write_committed policy +} diff --git a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result index b7b60f26816..b6b942c3d70 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace.result @@ -287,11 +287,13 @@ set global rocksdb_bulk_load=1; show global variables like 'rocksdb_bulk_load%'; Variable_name Value rocksdb_bulk_load ON +rocksdb_bulk_load_allow_sk OFF rocksdb_bulk_load_allow_unsorted OFF rocksdb_bulk_load_size 1000 show session variables like 'rocksdb_bulk_load%'; Variable_name Value rocksdb_bulk_load ON +rocksdb_bulk_load_allow_sk OFF rocksdb_bulk_load_allow_unsorted OFF rocksdb_bulk_load_size 1000 CREATE TABLE t1 (i INT, j INT, PRIMARY KEY (i)) ENGINE = ROCKSDB; @@ -335,6 +337,7 @@ SET session rocksdb_merge_buf_size = 340; show variables like 'rocksdb_bulk_load%'; Variable_name Value rocksdb_bulk_load OFF +rocksdb_bulk_load_allow_sk OFF rocksdb_bulk_load_allow_unsorted OFF rocksdb_bulk_load_size 1000 CREATE TABLE t1 (a VARCHAR(80)) ENGINE=RocksDB; @@ -442,3 +445,24 @@ t1 CREATE TABLE `t1` ( KEY `kb` (`b`(8)) ) ENGINE=ROCKSDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin DROP TABLE t1; +SET @prior_rocksdb_table_stats_sampling_pct = @@rocksdb_table_stats_sampling_pct; +set global rocksdb_table_stats_sampling_pct = 100; +CREATE TABLE t1 (a INT, b INT, PRIMARY KEY ka(a)) ENGINE=RocksDB; +INSERT INTO t1 (a, b) VALUES (1, 10); +INSERT INTO t1 (a, b) VALUES (2, 10); +INSERT INTO t1 (a, b) VALUES (3, 20); +INSERT INTO t1 (a, b) VALUES (4, 20); +set global rocksdb_force_flush_memtable_now=1; +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status OK +SHOW INDEX in t1; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment +t1 0 PRIMARY 1 a A 4 NULL NULL LSMTREE +ALTER TABLE t1 ADD INDEX 
kb(b), ALGORITHM=INPLACE; +SHOW INDEX in t1; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment +t1 0 PRIMARY 1 a A 4 NULL NULL LSMTREE +t1 1 kb 1 b A 2 NULL NULL YES LSMTREE +DROP TABLE t1; +SET global rocksdb_table_stats_sampling_pct = @prior_rocksdb_table_stats_sampling_pct; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result index ec9c940f250..97310071ef2 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/add_index_inplace_sstfilewriter.result @@ -15,7 +15,7 @@ count(b) 3000000 ALTER TABLE t1 ADD INDEX kb(b), ALGORITHM=INPLACE; ALTER TABLE t1 ADD INDEX kb_copy(b), ALGORITHM=COPY; -ERROR HY000: Status error 10 received from RocksDB: Operation aborted: Failed to acquire lock due to max_num_locks limit +ERROR HY000: Got error 10 'Operation aborted: Failed to acquire lock due to max_num_locks limit' from ROCKSDB set session rocksdb_bulk_load=1; ALTER TABLE t1 ADD INDEX kb_copy(b), ALGORITHM=COPY; set session rocksdb_bulk_load=0; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/allow_no_primary_key.result b/storage/rocksdb/mysql-test/rocksdb/r/allow_no_primary_key.result index d86792a6469..6f811e177ba 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/allow_no_primary_key.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/allow_no_primary_key.result @@ -262,3 +262,28 @@ SELECT * FROM t1; a b 36 foo DROP TABLE t1; +# +# Issue #834/MDEV-15304 ALTER TABLE table_with_hidden_pk causes Can't +# write; duplicate key in table error and/or crash +# +CREATE TABLE t1 (a INT, KEY(a)) ENGINE=RocksDB; +INSERT INTO t1 VALUES (1),(1+1); +create table t2 (a int); +insert into t2 values (10),(20),(30); +BEGIN; +select * from t2; +a +10 +20 +30 +alter table t1 force; +select * from t1; +a +insert into t1 values (100); 
+select * from t1; +a +1 +2 +100 +rollback; +drop table t1,t2; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_debug.result b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_debug.result index 79030e35225..8b51c126e2c 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_debug.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_debug.result @@ -59,12 +59,10 @@ insert into t values (); set debug="+d,crash_commit_before"; commit; ERROR HY000: Lost connection to MySQL server during query -select table_schema, table_name, auto_increment from information_schema.tables where table_name = 't'; -table_schema table_name auto_increment -test t 4 -select max(i) from t; -max(i) -3 +select max(i) into @row_max from t; +select table_schema, table_name, auto_increment > @row_max from information_schema.tables where table_name = 't'; +table_schema table_name auto_increment > @row_max +test t 1 # After engine prepare begin; insert into t values (); @@ -72,12 +70,10 @@ insert into t values (); set debug="+d,crash_commit_after_prepare"; commit; ERROR HY000: Lost connection to MySQL server during query -select table_schema, table_name, auto_increment from information_schema.tables where table_name = 't'; -table_schema table_name auto_increment -test t 4 -select max(i) from t; -max(i) -3 +select max(i) into @row_max from t; +select table_schema, table_name, auto_increment > @row_max from information_schema.tables where table_name = 't'; +table_schema table_name auto_increment > @row_max +test t 1 # After binlog begin; insert into t values (); @@ -85,12 +81,10 @@ insert into t values (); set debug="+d,crash_commit_after_log"; commit; ERROR HY000: Lost connection to MySQL server during query -select table_schema, table_name, auto_increment from information_schema.tables where table_name = 't'; -table_schema table_name auto_increment -test t 6 -select max(i) from t; -max(i) -5 +select max(i) into @row_max from t; +select table_schema, table_name, auto_increment > @row_max 
from information_schema.tables where table_name = 't'; +table_schema table_name auto_increment > @row_max +test t 1 # After everything begin; insert into t values (); @@ -98,10 +92,8 @@ insert into t values (); set debug="+d,crash_commit_after"; commit; ERROR HY000: Lost connection to MySQL server during query -select table_schema, table_name, auto_increment from information_schema.tables where table_name = 't'; -table_schema table_name auto_increment -test t 8 -select max(i) from t; -max(i) -7 +select max(i) into @row_max from t; +select table_schema, table_name, auto_increment > @row_max from information_schema.tables where table_name = 't'; +table_schema table_name auto_increment > @row_max +test t 1 drop table t; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result index 848d362f9e1..17e6bedb882 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/autoinc_vars.result @@ -141,3 +141,21 @@ SELECT * FROM t1; a b 18446744073709551613 a DROP TABLE t1; +#---------------------------------- +# Issue #792 Crash in autoincrement +#---------------------------------- +CREATE TABLE t1(C1 DOUBLE AUTO_INCREMENT KEY,C2 CHAR) ENGINE=ROCKSDB; +INSERT INTO t1 VALUES(2177,0); +DROP TABLE t1; +CREATE TABLE t0(c0 BLOB) ENGINE=ROCKSDB; +INSERT INTO t0 VALUES(0); +ALTER TABLE t0 AUTO_INCREMENT=0; +DROP TABLE t0; +#---------------------------------- +# Issue #869 Crash in autoincrement +#---------------------------------- +CREATE TABLE t1 (pk INT AUTO_INCREMENT, a INT, PRIMARY KEY(pk)) ENGINE=RocksDB; +INSERT INTO t1 (a) VALUES (1); +UPDATE t1 SET pk = 3; +ALTER TABLE t1 AUTO_INCREMENT 2; +DROP TABLE t1; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result new file mode 100644 index 00000000000..058d3608c75 --- /dev/null +++ 
b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result @@ -0,0 +1,62 @@ +# +# Issue #809: Wrong query result with bloom filters +# +create table t1 ( +id1 bigint not null, +id2 bigint not null, +id3 varchar(100) not null, +id4 int not null, +id5 int not null, +value bigint, +value2 varchar(100), +primary key (id1, id2, id3, id4) COMMENT 'rev:bf5_1' +) engine=ROCKSDB; +create table t2(a int); +insert into t2 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t3(seq int); +insert into t3 +select +1+ A.a + B.a* 10 + C.a * 100 + D.a * 1000 +from t2 A, t2 B, t2 C, t2 D; +insert t1 +select +(seq+9) div 10, (seq+4) div 5, (seq+4) div 5, seq, seq, 1000, "aaabbbccc" +from t3; +set global rocksdb_force_flush_memtable_now=1; +# Full table scan +explain +select * from t1 limit 10; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 NULL +select * from t1 limit 10; +id1 id2 id3 id4 id5 value value2 +1000 2000 2000 10000 10000 1000 aaabbbccc +1000 2000 2000 9999 9999 1000 aaabbbccc +1000 2000 2000 9998 9998 1000 aaabbbccc +1000 2000 2000 9997 9997 1000 aaabbbccc +1000 2000 2000 9996 9996 1000 aaabbbccc +1000 1999 1999 9995 9995 1000 aaabbbccc +1000 1999 1999 9994 9994 1000 aaabbbccc +1000 1999 1999 9993 9993 1000 aaabbbccc +1000 1999 1999 9992 9992 1000 aaabbbccc +1000 1999 1999 9991 9991 1000 aaabbbccc +# An index scan starting from the end of the table: +explain +select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 index NULL PRIMARY 122 NULL 1 NULL +select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1; +id1 id2 id3 id4 id5 value value2 +1000 2000 2000 10000 10000 1000 aaabbbccc +create table t4 ( +pk int unsigned not null primary key, +kp1 int unsigned not null, +kp2 int unsigned not null, +col1 int unsigned, +key(kp1, kp2) comment 'rev:bf5_2' +) engine=rocksdb; +insert into t4 values 
(1, 0xFFFF, 0xFFF, 12345); +# This must not fail an assert: +select * from t4 force index(kp1) where kp1=0xFFFFFFFF and kp2<=0xFFFFFFFF order by kp2 desc; +pk kp1 kp2 col1 +drop table t1,t2,t3,t4; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter_bulk_load.result b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter_bulk_load.result new file mode 100644 index 00000000000..4b02d1103cf --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter_bulk_load.result @@ -0,0 +1,15 @@ +create table r1 (id bigint primary key, value bigint) engine=rocksdb; +create table r2 (id bigint, value bigint, primary key (id) comment 'cf2') engine=rocksdb; +set session rocksdb_bulk_load=1; +set session rocksdb_bulk_load=0; +select variable_value into @h from information_schema.global_status where variable_name='rocksdb_block_cache_filter_hit'; +insert into r1 values (100, 100); +select variable_value-@h from information_schema.global_status where variable_name='rocksdb_block_cache_filter_hit'; +variable_value-@h +1 +select variable_value into @h from information_schema.global_status where variable_name='rocksdb_block_cache_filter_hit'; +insert into r2 values (100, 100); +select variable_value-@h from information_schema.global_status where variable_name='rocksdb_block_cache_filter_hit'; +variable_value-@h +0 +DROP TABLE r1, r2; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_sk.result b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_sk.result new file mode 100644 index 00000000000..42f820a2a42 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/bulk_load_sk.result @@ -0,0 +1,229 @@ +SET rocksdb_bulk_load_size=15; +CREATE TABLE t4 (a INT, b INT, c INT, +PRIMARY KEY (a), +KEY (b), +KEY (c) COMMENT "rev:cf") ENGINE=ROCKSDB; +CREATE TABLE t3 (a INT, b INT, c INT, +PRIMARY KEY (a), +KEY (b), +KEY (c) COMMENT "rev:cf") ENGINE=ROCKSDB; +CREATE TABLE t2 (a INT, b INT, c INT, +PRIMARY KEY (a), +KEY (b), +KEY (c) COMMENT "rev:cf") ENGINE=ROCKSDB; +CREATE TABLE t1 
(a INT, b INT, c INT, +PRIMARY KEY (a), +KEY (b), +KEY (c) COMMENT "rev:cf") ENGINE=ROCKSDB; +SET rocksdb_bulk_load=1; +INSERT INTO t1 SELECT * FROM t3 FORCE INDEX (PRIMARY) ORDER BY a; +SELECT count(*) FROM t1 FORCE INDEX (PRIMARY); +count(*) +0 +SELECT count(*) FROM t1 FORCE INDEX (b); +count(*) +10 +SELECT count(*) FROM t1 FORCE INDEX (c); +count(*) +10 +SET rocksdb_bulk_load=0; +SELECT * FROM t1 FORCE INDEX (PRIMARY); +a b c +-9 11 11 +-7 9 9 +-5 7 7 +-3 5 5 +-1 3 3 +2 0 0 +4 -2 -2 +6 -4 -4 +8 -6 -6 +10 -8 -8 +SELECT b FROM t1 FORCE INDEX (b); +b +-8 +-6 +-4 +-2 +0 +3 +5 +7 +9 +11 +SELECT c FROM t1 FORCE INDEX (c); +c +-8 +-6 +-4 +-2 +0 +3 +5 +7 +9 +11 +Checksums should match +CHECKSUM TABLE t3; +Table Checksum +test.t3 3862424802 +CHECKSUM TABLE t1; +Table Checksum +test.t1 3862424802 +SET rocksdb_bulk_load_allow_sk=1; +SET rocksdb_bulk_load=1; +INSERT INTO t4 SELECT * FROM t3 FORCE INDEX (PRIMARY) ORDER BY a; +SELECT count(*) FROM t4 FORCE INDEX (PRIMARY); +count(*) +0 +SELECT count(*) FROM t4 FORCE INDEX (b); +count(*) +0 +SELECT count(*) FROM t4 FORCE INDEX (c); +count(*) +0 +SET rocksdb_bulk_load=0; +SELECT * FROM t4 FORCE INDEX (PRIMARY); +a b c +-9 11 11 +-7 9 9 +-5 7 7 +-3 5 5 +-1 3 3 +2 0 0 +4 -2 -2 +6 -4 -4 +8 -6 -6 +10 -8 -8 +SELECT b FROM t4 FORCE INDEX (b); +b +-8 +-6 +-4 +-2 +0 +3 +5 +7 +9 +11 +SELECT c FROM t4 FORCE INDEX (c); +c +-8 +-6 +-4 +-2 +0 +3 +5 +7 +9 +11 +Checksums should match +CHECKSUM TABLE t3; +Table Checksum +test.t3 3862424802 +CHECKSUM TABLE t4; +Table Checksum +test.t4 3862424802 +SET rocksdb_bulk_load_allow_unsorted=1; +SET rocksdb_bulk_load_allow_sk=1; +SET rocksdb_bulk_load=1; +INSERT INTO t2 SELECT * FROM t3 WHERE b >= 0 ORDER BY b; +INSERT INTO t2 SELECT * FROM t3 WHERE b < 0 ORDER BY b; +SELECT count(*) FROM t2 FORCE INDEX (PRIMARY); +count(*) +0 +SELECT count(*) FROM t2 FORCE INDEX (b); +count(*) +0 +SELECT count(*) FROM t2 FORCE INDEX (c); +count(*) +0 +SELECT count(*) FROM t2 FORCE INDEX (PRIMARY); +count(*) +0 +SELECT 
count(*) FROM t2 FORCE INDEX (b); +count(*) +0 +SELECT count(*) FROM t2 FORCE INDEX (c); +count(*) +0 +SET rocksdb_bulk_load=0; +SELECT * FROM t2 FORCE INDEX (PRIMARY); +a b c +-19 21 21 +-17 19 19 +-15 17 17 +-13 15 15 +-11 13 13 +-9 11 11 +-7 9 9 +-5 7 7 +-3 5 5 +-1 3 3 +2 0 0 +4 -2 -2 +6 -4 -4 +8 -6 -6 +10 -8 -8 +12 -10 -10 +14 -12 -12 +16 -14 -14 +18 -16 -16 +20 -18 -18 +SELECT b FROM t2 FORCE INDEX (b); +b +-18 +-16 +-14 +-12 +-10 +-8 +-6 +-4 +-2 +0 +3 +5 +7 +9 +11 +13 +15 +17 +19 +21 +SELECT c FROM t2 FORCE INDEX (c); +c +-18 +-16 +-14 +-12 +-10 +-8 +-6 +-4 +-2 +0 +3 +5 +7 +9 +11 +13 +15 +17 +19 +21 +Checksums should match +CHECKSUM TABLE t3; +Table Checksum +test.t3 1495594118 +CHECKSUM TABLE t2; +Table Checksum +test.t2 1495594118 +DROP TABLE t1; +DROP TABLE t2; +DROP TABLE t3; +DROP TABLE t4; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/cardinality.result b/storage/rocksdb/mysql-test/rocksdb/r/cardinality.result index 94585af6cfc..029c68ae6b1 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/cardinality.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/cardinality.result @@ -82,4 +82,19 @@ t1 1 t1_5 2 c1 A 100000 NULL NULL YES LSMTREE SELECT table_name, table_rows FROM information_schema.tables WHERE table_schema = DATABASE(); table_name table_rows t1 100000 -drop table t1; +CREATE TABLE t2 (a INT, b INT, c INT, d INT, e INT, f INT, g INT, +PRIMARY KEY (a), KEY (c, b, a, d, e, f, g)) +ENGINE=ROCKSDB; +SET GLOBAL rocksdb_force_flush_memtable_now = 1; +ANALYZE TABLE t2; +Table Op Msg_type Msg_text +test.t2 analyze status OK +cardinality of the columns after 'a' must be equal to the cardinality of column 'a' +SELECT CARDINALITY INTO @c FROM information_schema.statistics WHERE TABLE_NAME='t2' AND INDEX_NAME='c' AND COLUMN_NAME='a'; +SELECT COLUMN_NAME, CARDINALITY = @c FROM information_schema.statistics WHERE TABLE_NAME='t2' AND INDEX_NAME='c' AND SEQ_IN_INDEX > 3; +COLUMN_NAME CARDINALITY = @c +d 1 +e 1 +f 1 +g 1 +drop table t1, t2; diff --git 
a/storage/rocksdb/mysql-test/rocksdb/r/collation.result b/storage/rocksdb/mysql-test/rocksdb/r/collation.result index edd3fd1f187..b4354ad353f 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/collation.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/collation.result @@ -1,6 +1,7 @@ -SET @start_global_value = @@global.ROCKSDB_STRICT_COLLATION_EXCEPTIONS; -DROP TABLE IF EXISTS t1; +call mtr.add_suppression("Invalid pattern"); CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text) engine=rocksdb charset utf8; +ALTER TABLE t1 ADD INDEX (value); +ERROR HY000: Unsupported collation on string indexed column test.t1.value Use binary collation (binary, latin1_bin, utf8_bin). DROP TABLE t1; CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text, index(value)) engine=rocksdb charset utf8; ERROR HY000: Unsupported collation on string indexed column test.t1.value Use binary collation (binary, latin1_bin, utf8_bin). @@ -13,6 +14,7 @@ SET GLOBAL rocksdb_strict_collation_check=1; CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text, index(value2)) engine=rocksdb charset utf8; DROP TABLE t1; CREATE TABLE t1 (id varchar(20), value varchar(50), value2 varchar(50), value3 text, primary key (id), index(value, value2)) engine=rocksdb charset latin1 collate latin1_bin; +ALTER TABLE t1 collate=latin1_general_ci; DROP TABLE t1; CREATE TABLE t1 (id varchar(20), value varchar(50), value2 varchar(50), value3 text, primary key (id), index(value, value2)) engine=rocksdb charset utf8 collate utf8_bin; DROP TABLE t1; @@ -126,4 +128,16 @@ CREATE TABLE abcd (id INT PRIMARY KEY, value varchar(50), index(value)) engine=r ERROR HY000: Unsupported collation on string indexed column test.abcd.value Use binary collation (binary, latin1_bin, utf8_bin). 
DROP TABLE abc; SET GLOBAL rocksdb_strict_collation_exceptions=null; -SET GLOBAL rocksdb_strict_collation_exceptions=@start_global_value; +SET GLOBAL rocksdb_strict_collation_check=1; +CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text, index(value)) engine=rocksdb charset utf8; +Warnings: +Warning 1210 Unsupported collation on string indexed column test.t1.value Use binary collation (binary, latin1_bin, utf8_bin). +DROP TABLE t1; +CREATE TABLE t1 (id INT primary key, value varchar(50), value2 varbinary(50), value3 text) engine=rocksdb charset utf8; +ALTER TABLE t1 ADD INDEX (value); +Warnings: +Warning 1210 Unsupported collation on string indexed column test.t1.value Use binary collation (binary, latin1_bin, utf8_bin). +DROP TABLE t1; +CREATE TABLE t1 (id varchar(20), value varchar(50), value2 varchar(50), value3 text, primary key (id), index(value, value2)) engine=rocksdb charset latin1 collate latin1_bin; +ALTER TABLE t1 collate=latin1_general_ci; +DROP TABLE t1; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/com_rpc_tx.result b/storage/rocksdb/mysql-test/rocksdb/r/com_rpc_tx.result new file mode 100644 index 00000000000..789ce12e900 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/com_rpc_tx.result @@ -0,0 +1,21 @@ +CREATE DATABASE db_rpc; +USE db_rpc; +CREATE TABLE t1(pk INT PRIMARY KEY) ENGINE=rocksdb; +SET GLOBAL rocksdb_enable_2pc=1; +SET autocommit = 0; +SET autocommit = 0; +BEGIN; +BEGIN; +SELECT * from t1; +pk +SELECT * from t1; +pk +INSERT INTO t1 VALUES(1); +INSERT INTO t1 VALUES(2); +COMMIT; +COMMIT; +SELECT * from db_rpc.t1; +pk +1 +2 +DROP DATABASE db_rpc; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result b/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result new file mode 100644 index 00000000000..e5aeb57ebdf --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb/r/create_no_primary_key_table.result @@ -0,0 +1,38 @@ +USE mysql; +CREATE TABLE 
mysql_table (a INT) ENGINE=ROCKSDB; +CREATE TABLE test.mysql_table (a INT) ENGINE=ROCKSDB; +ERROR HY000: Table without primary key cannot be created outside mysql schema. +USE test; +CREATE TABLE mysql_table (a INT) ENGINE=ROCKSDB; +ERROR HY000: Table without primary key cannot be created outside mysql schema. +CREATE TABLE IF NOT EXISTS mysql_table_2 (a INT) ENGINE=ROCKSDB; +ERROR HY000: Table without primary key cannot be created outside mysql schema. +CREATE TABLE mysql_table_no_cols ENGINE=ROCKSDB; +ERROR HY000: Table without primary key cannot be created outside mysql schema. +CREATE TABLE mysql.mysql_table_2 (a INT) ENGINE=ROCKSDB; +CREATE TABLE mysql_primkey (a INT PRIMARY KEY, b INT, c INT, d INT, INDEX (c)) ENGINE=ROCKSDB; +ALTER TABLE mysql_primkey DROP b, DROP a, ADD (f INT PRIMARY KEY); +ALTER TABLE mysql_primkey DROP PRIMARY KEY; +ERROR HY000: Table without primary key cannot be created outside mysql schema. +CREATE TABLE mysql_primkey2 (a INT PRIMARY KEY, b INT, c INT) ENGINE=ROCKSDB; +ALTER TABLE mysql_primkey2 DROP b; +ALTER TABLE mysql_primkey2 ADD (b INT); +ALTER TABLE mysql_primkey2 DROP c, DROP A; +ERROR HY000: Table without primary key cannot be created outside mysql schema. +CREATE TABLE mysql_primkey3 (a INT PRIMARY KEY, b INT, c INT, INDEX indexonb (b), INDEX indexonc (c)) ENGINE=ROCKSDB; +ALTER TABLE mysql_primkey3 DROP INDEX indexonb; +ALTER TABLE mysql_primkey3 DROP c; +ALTER TABLE mysql_primkey3 DROP PRIMARY KEY, ADD PRIMARY KEY(b); +CREATE TABLE mysql_primkey4(a INT, b INT, PRIMARY KEY(a), INDEX si (a, b)) ENGINE=ROCKSDB; +DROP INDEX si ON mysql_primkey4; +DROP INDEX `PRIMARY` ON mysql_primkey4; +ERROR HY000: Table without primary key cannot be created outside mysql schema. 
+ALTER TABLE mysql.mysql_table ADD PRIMARY KEY (a); +ALTER TABLE mysql.mysql_table DROP PRIMARY KEY; +DROP TABLE mysql_primkey; +DROP TABLE mysql_primkey2; +DROP TABLE mysql_primkey3; +DROP TABLE mysql_primkey4; +USE mysql; +DROP TABLE mysql_table; +DROP TABLE mysql_table_2; diff --git a/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result b/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result index 39130475349..50733f81598 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/ddl_high_priority.result @@ -45,7 +45,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 alter table t1 modify i bigint;; set high_priority_ddl = 0; @@ -98,7 +98,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 alter table t1 rename t1_new;; set high_priority_ddl = 0; @@ -152,7 +152,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 drop table t1;; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 @@ -202,7 +202,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 drop table t1;; set high_priority_ddl = 0; @@ -251,7 +251,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to 
t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 alter table t1 modify i bigint;; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 @@ -302,7 +302,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 create index idx1 on t1 (i);; set high_priority_ddl = 0; @@ -342,7 +342,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 drop index idx1 on t1;; set high_priority_ddl = 0; @@ -390,7 +390,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 truncate t1;; set high_priority_ddl = 0; @@ -438,7 +438,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 create trigger ins_sum before insert on t1 for each row set @sum = @sum + new.i;; set high_priority_ddl = 0; @@ -478,7 +478,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on table metadata: test.t1 drop trigger ins_sum;; set high_priority_ddl = 0; @@ -528,7 +528,7 @@ set high_priority_ddl = 1; select @@high_priority_ddl; @@high_priority_ddl 1 -lock tables t1 write; +rename table t1 to t2; ERROR HY000: Lock wait timeout exceeded; try restarting 
transaction: Timeout on table metadata: test.t1 optimize table t1;; Table Op Msg_type Msg_text @@ -538,6 +538,55 @@ connection: default (for show processlist) show processlist; Id User Host db Command Time State Info Rows examined Rows sent Tid Srv_Id root test