From d039346a7acac7c72f264377a8cd6b0273c548df Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Fri, 8 Sep 2023 13:12:49 +0200 Subject: [PATCH] MDEV-4991: GTID binlog indexing Improve the performance of slave connect using B+-Tree indexes on each binlog file. The index allows fast lookup of a GTID position to the corresponding offset in the binlog file, as well as lookup of a position to find the corresponding GTID position. This eliminates a costly sequential scan of the starting binlog file to find the GTID starting position when a slave connects. This is especially costly if the binlog file is not cached in memory (IO cost), or if it is encrypted or a lot of slaves connect simultaneously (CPU cost). The size of the index files is generally less than 1% of the binlog data, so not expected to be an issue. Most of the work writing the index is done as a background task, in the binlog background thread. This minimises the performance impact on transaction commit. A simple global mutex is used to protect index reads and (background) index writes; this is fine as slave connect is a relatively infrequent operation. Here are the user-visible options and status variables. The feature is on by default and is expected to need no tuning or configuration for most users. binlog_gtid_index On by default. Can be used to disable the indexes for testing purposes. binlog_gtid_index_page_size (default 4096) Page size to use for the binlog GTID index. This is the size of the nodes in the B+-tree used internally in the index. A very small page-size (64 is the minimum) will be less efficient, but can be used to stress the BTree-code during testing. binlog_gtid_index_span_min (default 65536) Control sparseness of the binlog GTID index. If set to N, at most one index record will be added for every N bytes of binlog file written. 
This can be used to reduce the number of records in the index, at the cost only of having to scan a few more events in the binlog file before finding the target position. Two status variables are available to monitor the use of the GTID indexes: Binlog_gtid_index_hit Binlog_gtid_index_miss The "hit" status increments for each successful lookup in a GTID index. The "miss" increments when a lookup is not possible. This indicates that the index file is missing (e.g. binlog written by old server version without GTID index support), or corrupt. Signed-off-by: Kristian Nielsen --- libmysqld/CMakeLists.txt | 2 +- mysql-test/main/mysqld--help.result | 14 + .../suite/binlog/r/binlog_gtid_index.result | 135 ++ .../binlog/r/binlog_gtid_index_crash.result | 28 + .../suite/binlog/t/binlog_gtid_index.test | 229 +++ .../binlog/t/binlog_gtid_index_crash.opt | 1 + .../binlog/t/binlog_gtid_index_crash.test | 66 + ...d_master_switch_to_unencrypted_gtid.result | 6 +- ...ted_master_switch_to_unencrypted_gtid.test | 10 +- .../galera/r/galera_gtid_server_id.result | 12 + .../perfschema/r/dml_setup_instruments.result | 2 +- mysql-test/suite/perfschema/r/relaylog.result | 2 + .../suite/rpl/include/rpl_gtid_index.inc | 187 +++ .../rpl/r/rpl_gtid_glle_no_terminate.result | 1 - mysql-test/suite/rpl/r/rpl_gtid_index.result | 366 +++++ .../rpl/t/rpl_gtid_glle_no_terminate.test | 1 + mysql-test/suite/rpl/t/rpl_gtid_index.test | 89 + .../sys_vars/r/sysvars_server_embedded.result | 30 + .../r/sysvars_server_notembedded.result | 30 + sql/CMakeLists.txt | 2 +- sql/gtid_index.cc | 1434 +++++++++++++++++ sql/gtid_index.h | 521 ++++++ sql/log.cc | 516 +++++- sql/log.h | 20 +- sql/mysqld.cc | 19 +- sql/mysqld.h | 9 +- sql/privilege.h | 9 + sql/rpl_gtid.cc | 441 +++-- sql/rpl_gtid.h | 39 +- sql/rpl_rli.cc | 2 +- sql/sql_repl.cc | 318 +++- sql/sys_vars.cc | 30 + 32 files changed, 4315 insertions(+), 256 deletions(-) create mode 100644 mysql-test/suite/binlog/r/binlog_gtid_index.result create mode 100644 
mysql-test/suite/binlog/r/binlog_gtid_index_crash.result create mode 100644 mysql-test/suite/binlog/t/binlog_gtid_index.test create mode 100644 mysql-test/suite/binlog/t/binlog_gtid_index_crash.opt create mode 100644 mysql-test/suite/binlog/t/binlog_gtid_index_crash.test create mode 100644 mysql-test/suite/rpl/include/rpl_gtid_index.inc create mode 100644 mysql-test/suite/rpl/r/rpl_gtid_index.result create mode 100644 mysql-test/suite/rpl/t/rpl_gtid_index.test create mode 100644 sql/gtid_index.cc create mode 100644 sql/gtid_index.h diff --git a/libmysqld/CMakeLists.txt b/libmysqld/CMakeLists.txt index b8f805be518..ee643e60709 100644 --- a/libmysqld/CMakeLists.txt +++ b/libmysqld/CMakeLists.txt @@ -126,7 +126,7 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc ../sql/sql_expression_cache.cc ../sql/my_apc.cc ../sql/my_apc.h ../sql/my_json_writer.cc ../sql/my_json_writer.h - ../sql/rpl_gtid.cc + ../sql/rpl_gtid.cc ../sql/gtid_index.cc ../sql/sql_explain.cc ../sql/sql_explain.h ../sql/sql_analyze_stmt.cc ../sql/sql_analyze_stmt.h ../sql/compat56.cc diff --git a/mysql-test/main/mysqld--help.result b/mysql-test/main/mysqld--help.result index 66565065714..96b251de7db 100644 --- a/mysql-test/main/mysqld--help.result +++ b/mysql-test/main/mysqld--help.result @@ -98,6 +98,17 @@ The following specify which files/extra groups are read (specified before remain involve user-defined functions (i.e. UDFs) or the UUID() function; for those, row-based binary logging is automatically used. + --binlog-gtid-index Enable the creation of a GTID index for every binlog + file, and the use of such index for speeding up GTID + lookup in the binlog. + (Defaults to on; use --skip-binlog-gtid-index to disable.) + --binlog-gtid-index-page-size=# + Page size to use for the binlog GTID index. + --binlog-gtid-index-span-min=# + Control sparseness of the binlog GTID index. 
If set to N, + at most one index record will be added for every N bytes + of binlog file written, to reduce the size of the index. + Normally does not need tuning. --binlog-ignore-db=name Tells the master that updates to the given database should not be logged to the binary log. @@ -1597,6 +1608,9 @@ binlog-direct-non-transactional-updates FALSE binlog-expire-logs-seconds 0 binlog-file-cache-size 16384 binlog-format MIXED +binlog-gtid-index TRUE +binlog-gtid-index-page-size 4096 +binlog-gtid-index-span-min 65536 binlog-legacy-event-pos FALSE binlog-optimize-thread-scheduling TRUE binlog-row-event-max-size 8192 diff --git a/mysql-test/suite/binlog/r/binlog_gtid_index.result b/mysql-test/suite/binlog/r/binlog_gtid_index.result new file mode 100644 index 00000000000..e53e1aac8e3 --- /dev/null +++ b/mysql-test/suite/binlog/r/binlog_gtid_index.result @@ -0,0 +1,135 @@ +SET GLOBAL binlog_gtid_index= 0; +SET GLOBAL binlog_gtid_index= 1; +SET @gtid1= @@gtid_binlog_pos; +CREATE TABLE t1 (a INT PRIMARY KEY); +SET @gtid2= @@gtid_binlog_pos; +INSERT INTO t1 VALUES (1); +SET @gtid3= @@gtid_binlog_pos; +INSERT INTO t1 VALUES (2); +INSERT INTO t1 VALUES (3); +INSERT INTO t1 VALUES (4); +SET @gtid4= @@gtid_binlog_pos; +INSERT INTO t1 VALUES (5); +SET @gtid5= @@gtid_binlog_pos; +SET @gtid6= @@gtid_binlog_pos; +INSERT INTO t1 VALUES (106); +INSERT INTO t1 VALUES (107); +Ok +1 +Ok +1 +Ok +1 +Ok +1 +Ok +1 +Ok +1 +FLUSH BINARY LOGS; +Ok +1 +Ok +1 +Ok +1 +Ok +1 +Ok +1 +Ok +1 +*** Test that purge deletes the gtid index files. *** +FLUSH BINARY LOGS; +INSERT INTO t1 VALUES (200); +FLUSH BINARY LOGS; +INSERT INTO t1 VALUES (201); +FLUSH BINARY LOGS; +INSERT INTO t1 VALUES (202); +PURGE BINARY LOGS TO 'FILE'; +*** Test missed index lookup due to missing or corrupt index file. 
+FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +INSERT INTO t1 VALUES (301); +INSERT INTO t1 VALUES (302); +INSERT INTO t1 VALUES (303); +SET @gtid_pos= @@GLOBAL.gtid_binlog_pos; +INSERT INTO t1 VALUES (304); +INSERT INTO t1 VALUES (305); +FLUSH NO_WRITE_TO_BINLOG STATUS; ++++ Initial status: +SHOW STATUS LIKE 'binlog_gtid_index_%'; +Variable_name Value +Binlog_gtid_index_hit 0 +Binlog_gtid_index_miss 0 ++++ GTID Lookup in good index. +Gtid_Lookup_Ok +1 +SHOW STATUS LIKE 'binlog_gtid_index_%'; +Variable_name Value +Binlog_gtid_index_hit 1 +Binlog_gtid_index_miss 0 ++++ GTID Lookup, index file is missing. +Gtid_Lookup_Ok +1 +SHOW STATUS LIKE 'binlog_gtid_index_%'; +Variable_name Value +Binlog_gtid_index_hit 1 +Binlog_gtid_index_miss 1 +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +INSERT INTO t1 VALUES (306); +SET @gtid_pos= @@GLOBAL.gtid_binlog_pos; +INSERT INTO t1 VALUES (307); +INSERT INTO t1 VALUES (308); +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; ++++ GTID Lookup, first page of index is corrupt. +Gtid_Lookup_Ok +1 +SHOW STATUS LIKE 'binlog_gtid_index_%'; +Variable_name Value +Binlog_gtid_index_hit 1 +Binlog_gtid_index_miss 2 +SET @old_page_size= @@GLOBAL.binlog_gtid_index_page_size; +SET @old_span_min= @@GLOBAL.binlog_gtid_index_span_min; +SET GLOBAL binlog_gtid_index_page_size= 64; +SET GLOBAL binlog_gtid_index_span_min= 1; +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +INSERT INTO t1 VALUES (310); +INSERT INTO t1 VALUES (311); +INSERT INTO t1 VALUES (312); +SET @gtid_pos= @@GLOBAL.gtid_binlog_pos; +INSERT INTO t1 VALUES (313); +INSERT INTO t1 VALUES (314); +INSERT INTO t1 VALUES (315); +INSERT INTO t1 VALUES (316); +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +SET GLOBAL binlog_gtid_index_page_size= @old_page_size; +SET GLOBAL binlog_gtid_index_span_min= @old_span_min; ++++ GTID Lookup, root page of index is corrupt. 
+Gtid_Lookup_Ok +1 +SHOW STATUS LIKE 'binlog_gtid_index_%'; +Variable_name Value +Binlog_gtid_index_hit 1 +Binlog_gtid_index_miss 3 +*** Test BINLOG_GTID_POS() with too-large offset. +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +INSERT INTO t1 VALUES (401); +INSERT INTO t1 VALUES (402); ++++ Test the hot index. +SELECT BINLOG_GTID_POS('FILE', 100000000); +BINLOG_GTID_POS('FILE', 100000000) +NULL +SHOW STATUS LIKE 'binlog_gtid_index_%'; +Variable_name Value +Binlog_gtid_index_hit 2 +Binlog_gtid_index_miss 3 +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; ++++ Test the cold index. +SELECT BINLOG_GTID_POS('FILE', 100000000); +BINLOG_GTID_POS('FILE', 100000000) +NULL +SHOW STATUS LIKE 'binlog_gtid_index_%'; +Variable_name Value +Binlog_gtid_index_hit 3 +Binlog_gtid_index_miss 3 +DROP TABLE t1; diff --git a/mysql-test/suite/binlog/r/binlog_gtid_index_crash.result b/mysql-test/suite/binlog/r/binlog_gtid_index_crash.result new file mode 100644 index 00000000000..91e5a6c9df1 --- /dev/null +++ b/mysql-test/suite/binlog/r/binlog_gtid_index_crash.result @@ -0,0 +1,28 @@ +*** Test that binlog GTID index is recovered after a crash. +CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB; +Ok +1 +Ok +1 +Ok +1 +SHOW STATUS LIKE 'binlog_gtid_index_%'; +Variable_name Value +Binlog_gtid_index_hit 3 +Binlog_gtid_index_miss 0 +*** Crash the server, check that GTID index can be used after restart. 
+SET debug_dbug="d,crash_shutdown"; +shutdown; +ERROR HY000: Lost connection to server during query +FLUSH NO_WRITE_TO_BINLOG STATUS; +Ok +1 +Ok +1 +Ok +1 +SHOW STATUS LIKE 'binlog_gtid_index_%'; +Variable_name Value +Binlog_gtid_index_hit 3 +Binlog_gtid_index_miss 0 +DROP TABLE t1; diff --git a/mysql-test/suite/binlog/t/binlog_gtid_index.test b/mysql-test/suite/binlog/t/binlog_gtid_index.test new file mode 100644 index 00000000000..458b77ec9a7 --- /dev/null +++ b/mysql-test/suite/binlog/t/binlog_gtid_index.test @@ -0,0 +1,229 @@ +--source include/have_binlog_format_mixed.inc + +SET GLOBAL binlog_gtid_index= 0; +SET GLOBAL binlog_gtid_index= 1; + +--let $file= query_get_value(SHOW MASTER STATUS, File, 1) +--let $pos1= query_get_value(SHOW MASTER STATUS, Position, 1) +SET @gtid1= @@gtid_binlog_pos; +CREATE TABLE t1 (a INT PRIMARY KEY); +--let $pos2= query_get_value(SHOW MASTER STATUS, Position, 1) +SET @gtid2= @@gtid_binlog_pos; +INSERT INTO t1 VALUES (1); +--let $pos3= query_get_value(SHOW MASTER STATUS, Position, 1) +SET @gtid3= @@gtid_binlog_pos; +INSERT INTO t1 VALUES (2); +INSERT INTO t1 VALUES (3); +INSERT INTO t1 VALUES (4); +--let $pos4= query_get_value(SHOW MASTER STATUS, Position, 1) +SET @gtid4= @@gtid_binlog_pos; +INSERT INTO t1 VALUES (5); +--let $pos5= query_get_value(SHOW MASTER STATUS, Position, 1) +SET @gtid5= @@gtid_binlog_pos; + +--disable_query_log +--let $i=0 +while ($i < 100) { + eval INSERT INTO t1 VALUES (6 + $i); + inc $i; +} +--enable_query_log +--let $pos6= query_get_value(SHOW MASTER STATUS, Position, 1) +SET @gtid6= @@gtid_binlog_pos; + +INSERT INTO t1 VALUES (106); +INSERT INTO t1 VALUES (107); + +# Test first the hot and then the cold index. 
+--let $i= 0 +while ($i < 2) { + --disable_query_log + eval SELECT BINLOG_GTID_POS('$file', $pos1) = @gtid1 AS Ok; + eval SELECT BINLOG_GTID_POS('$file', $pos2) = @gtid2 AS Ok; + eval SELECT BINLOG_GTID_POS('$file', $pos3) = @gtid3 AS Ok; + eval SELECT BINLOG_GTID_POS('$file', $pos4) = @gtid4 AS Ok; + eval SELECT BINLOG_GTID_POS('$file', $pos5) = @gtid5 AS Ok; + eval SELECT BINLOG_GTID_POS('$file', $pos6) = @gtid6 AS Ok; + --enable_query_log + + inc $i; + if ($i == 1) { + FLUSH BINARY LOGS; + } +} + +--echo *** Test that purge deletes the gtid index files. *** +FLUSH BINARY LOGS; +INSERT INTO t1 VALUES (200); +--let $file2= query_get_value(SHOW MASTER STATUS, File, 1) +FLUSH BINARY LOGS; +INSERT INTO t1 VALUES (201); +--let $file3= query_get_value(SHOW MASTER STATUS, File, 1) +FLUSH BINARY LOGS; +INSERT INTO t1 VALUES (202); +--let $file4= query_get_value(SHOW MASTER STATUS, File, 1) + +--replace_result $file3 FILE +eval PURGE BINARY LOGS TO '$file3'; + +--let $MYSQLD_DATADIR= `select @@datadir` +--error 1 +--file_exists $MYSQLD_DATADIR/$file.idx +--error 1 +--file_exists $MYSQLD_DATADIR/$file2.idx +--file_exists $MYSQLD_DATADIR/$file3.idx +--file_exists $MYSQLD_DATADIR/$file4.idx + +--echo *** Test missed index lookup due to missing or corrupt index file. +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +--let $file= query_get_value(SHOW MASTER STATUS, File, 1) +INSERT INTO t1 VALUES (301); +INSERT INTO t1 VALUES (302); +INSERT INTO t1 VALUES (303); +--let $pos= query_get_value(SHOW MASTER STATUS, Position, 1) +SET @gtid_pos= @@GLOBAL.gtid_binlog_pos; +INSERT INTO t1 VALUES (304); +INSERT INTO t1 VALUES (305); + +# BINLOG_GTID_POS() has a side effect: it increments binlog_gtid_index_hit +--disable_ps2_protocol +FLUSH NO_WRITE_TO_BINLOG STATUS; +--echo +++ Initial status: +SHOW STATUS LIKE 'binlog_gtid_index_%'; +--echo +++ GTID Lookup in good index. 
+--disable_query_log +eval SELECT BINLOG_GTID_POS('$file', $pos) = @gtid_pos AS Gtid_Lookup_Ok; +--enable_query_log +SHOW STATUS LIKE 'binlog_gtid_index_%'; +--remove_file $MYSQLD_DATADIR/$file.idx +--echo +++ GTID Lookup, index file is missing. +--disable_query_log +eval SELECT BINLOG_GTID_POS('$file', $pos) = @gtid_pos AS Gtid_Lookup_Ok; +--enable_query_log +SHOW STATUS LIKE 'binlog_gtid_index_%'; + +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +--let $file= query_get_value(SHOW MASTER STATUS, File, 1) +INSERT INTO t1 VALUES (306); +--let $pos= query_get_value(SHOW MASTER STATUS, Position, 1) +SET @gtid_pos= @@GLOBAL.gtid_binlog_pos; +INSERT INTO t1 VALUES (307); +INSERT INTO t1 VALUES (308); +# Rotate again so we hit an on-disk index file, not the "hot" index. +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; + +# Corrupt the flag byte of the first page with an unused bit. +--let FILE_TO_CORRUPT= $MYSQLD_DATADIR/$file.idx +--perl +use strict; +use warnings; +use Fcntl qw(:DEFAULT :seek); +sysopen F, $ENV{FILE_TO_CORRUPT}, O_RDWR + or die "Cannot open file $ENV{FILE_TO_CORRUPT}: $!\n"; +# Corrupt the flag byte with an unused flag. +sysseek(F, 16, SEEK_SET) + or die "Cannot seek file: $!\n"; +my $buf; +sysread(F, $buf, 1) + or die "Cannot read file: $!\n"; +$buf= chr(ord($buf) | 0x80); +sysseek(F, 16, SEEK_SET) + or die "Cannot seek file: $!\n"; +syswrite(F, $buf, 1) == 1 + or die "Cannot write file: $!\n"; +close F; +EOF + +--echo +++ GTID Lookup, first page of index is corrupt. +--disable_query_log +eval SELECT BINLOG_GTID_POS('$file', $pos) = @gtid_pos AS Gtid_Lookup_Ok; +--enable_query_log +SHOW STATUS LIKE 'binlog_gtid_index_%'; + +# Corrupt the last byte of the root page. +# Set a small page-size so we test corruption in something not the header page. 
+SET @old_page_size= @@GLOBAL.binlog_gtid_index_page_size; +SET @old_span_min= @@GLOBAL.binlog_gtid_index_span_min; +SET GLOBAL binlog_gtid_index_page_size= 64; +SET GLOBAL binlog_gtid_index_span_min= 1; +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +--let $file= query_get_value(SHOW MASTER STATUS, File, 1) +INSERT INTO t1 VALUES (310); +INSERT INTO t1 VALUES (311); +INSERT INTO t1 VALUES (312); +--let $pos= query_get_value(SHOW MASTER STATUS, Position, 1) +SET @gtid_pos= @@GLOBAL.gtid_binlog_pos; +INSERT INTO t1 VALUES (313); +INSERT INTO t1 VALUES (314); +INSERT INTO t1 VALUES (315); +INSERT INTO t1 VALUES (316); +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +SET GLOBAL binlog_gtid_index_page_size= @old_page_size; +SET GLOBAL binlog_gtid_index_span_min= @old_span_min; + +--let FILE_TO_CORRUPT= $MYSQLD_DATADIR/$file.idx +--perl +use strict; +use warnings; +use Fcntl qw(:DEFAULT :seek); +sysopen F, $ENV{FILE_TO_CORRUPT}, O_RDWR + or die "Cannot open file $ENV{FILE_TO_CORRUPT}: $!\n"; +# Tricky: The index is written asynchronously, so it may still be incomplete. +# So wait for the file to be written completely with a root node at the end. +my $count= 0; +for (;;) { + my $end= sysseek(F, 0, SEEK_END); + if ($end > 0 && ($end % 64) == 0) { + # The index file is non-empty with a full page at the end, test if the + # root page has been fully written. This is seen as bit 2 (PAGE_FLAG_LAST) + # and bit 3 (PAGE_FLAG_ROOT) being set (0xc). + my $flag; + if (sysseek(F, -64, SEEK_CUR) && + sysread(F, $flag, 1) && + (ord($flag) & 0xc) == 0xc) { + last; + } + } + die "Timeout waiting for GTID index to be non-empty\n" + if ++$count >= 500; + # Simple way to do sub-second sleep. + select(undef, undef, undef, 0.050); +} +# Corrupt the root page: flip one bit in the second-to-last byte of the file. 
+sysseek(F, -2, SEEK_END) + or die "Cannot seek file: $!\n"; +my $buf; +sysread(F, $buf, 1) + or die "Cannot read file: $!\n"; +$buf= chr(ord($buf) ^ 0x4); +sysseek(F, -2, SEEK_END) + or die "Cannot seek file: $!\n"; +syswrite(F, $buf, 1) == 1 + or die "Cannot write file: $!\n"; +close F; +EOF + +--echo +++ GTID Lookup, root page of index is corrupt. +--disable_query_log +eval SELECT BINLOG_GTID_POS('$file', $pos) = @gtid_pos AS Gtid_Lookup_Ok; +--enable_query_log +SHOW STATUS LIKE 'binlog_gtid_index_%'; + +--echo *** Test BINLOG_GTID_POS() with too-large offset. +# New binlog to skip the now corrupted one. +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +--let $file= query_get_value(SHOW MASTER STATUS, File, 1) +INSERT INTO t1 VALUES (401); +INSERT INTO t1 VALUES (402); +--echo +++ Test the hot index. +--replace_result $file FILE +eval SELECT BINLOG_GTID_POS('$file', 100000000); +SHOW STATUS LIKE 'binlog_gtid_index_%'; +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +--echo +++ Test the cold index. +--replace_result $file FILE +eval SELECT BINLOG_GTID_POS('$file', 100000000); +SHOW STATUS LIKE 'binlog_gtid_index_%'; +--enable_ps2_protocol + +DROP TABLE t1; diff --git a/mysql-test/suite/binlog/t/binlog_gtid_index_crash.opt b/mysql-test/suite/binlog/t/binlog_gtid_index_crash.opt new file mode 100644 index 00000000000..993e6613cc6 --- /dev/null +++ b/mysql-test/suite/binlog/t/binlog_gtid_index_crash.opt @@ -0,0 +1 @@ +--binlog-gtid-index-page-size=128 --binlog-gtid-index-span-min=1 diff --git a/mysql-test/suite/binlog/t/binlog_gtid_index_crash.test b/mysql-test/suite/binlog/t/binlog_gtid_index_crash.test new file mode 100644 index 00000000000..965e08a4015 --- /dev/null +++ b/mysql-test/suite/binlog/t/binlog_gtid_index_crash.test @@ -0,0 +1,66 @@ +--source include/have_innodb.inc +# Don't test this under valgrind, memory leaks will occur +--source include/not_valgrind.inc +# Avoid CrashReporter popup on Mac +--source include/not_crashrep.inc +# Binary must be compiled with debug for 
crash to occur +--source include/have_debug.inc +--source include/have_binlog_format_row.inc + +# We have an .opt file that sets a small page size and disables sparseness, +# so we get something non-trivial in the GTID index even with a small amount +# of binlogged events. + +--echo *** Test that binlog GTID index is recovered after a crash. +CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB; + +--disable_query_log +INSERT INTO t1 VALUES (0, 0); +INSERT INTO t1 VALUES (1, 0); +INSERT INTO t1 VALUES (2, 0); +--let $i= 10 +while ($i < 20) { + eval INSERT INTO t1 VALUES ($i, 0); + inc $i; +} +--let $file= query_get_value(SHOW MASTER STATUS, File, 1) +--let $pos1= query_get_value(SHOW MASTER STATUS, Position, 1) +--let $gtid1= `SELECT @@gtid_binlog_pos` +while ($i < 30) { + eval INSERT INTO t1 VALUES ($i, 0); + inc $i; +} +--let $pos2= query_get_value(SHOW MASTER STATUS, Position, 1) +--let $gtid2= `SELECT @@gtid_binlog_pos` +while ($i < 40) { + eval INSERT INTO t1 VALUES ($i, 0); + inc $i; +} +--let $pos3= query_get_value(SHOW MASTER STATUS, Position, 1) +--let $gtid3= `SELECT @@gtid_binlog_pos` +INSERT INTO t1 VALUES (50, 0); +INSERT INTO t1 VALUES (51, 0); + +--disable_ps2_protocol +FLUSH NO_WRITE_TO_BINLOG STATUS; +eval SELECT BINLOG_GTID_POS('$file', $pos1) = "$gtid1" AS Ok; +eval SELECT BINLOG_GTID_POS('$file', $pos2) = "$gtid2" AS Ok; +eval SELECT BINLOG_GTID_POS('$file', $pos3) = "$gtid3" AS Ok; +--enable_query_log +SHOW STATUS LIKE 'binlog_gtid_index_%'; +--enable_ps2_protocol + +--echo *** Crash the server, check that GTID index can be used after restart. 
+--source include/crash_mysqld.inc + +--disable_ps2_protocol +FLUSH NO_WRITE_TO_BINLOG STATUS; +--disable_query_log +eval SELECT BINLOG_GTID_POS('$file', $pos1) = "$gtid1" AS Ok; +eval SELECT BINLOG_GTID_POS('$file', $pos2) = "$gtid2" AS Ok; +eval SELECT BINLOG_GTID_POS('$file', $pos3) = "$gtid3" AS Ok; +--enable_query_log +SHOW STATUS LIKE 'binlog_gtid_index_%'; +--enable_ps2_protocol + +DROP TABLE t1; diff --git a/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.result b/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.result index c6835ff90f4..d05e3abd068 100644 --- a/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.result +++ b/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.result @@ -6,6 +6,7 @@ connection server_2; include/stop_slave.inc CHANGE MASTER TO MASTER_USE_GTID=SLAVE_POS; call mtr.add_suppression(" Got fatal error 1236 from master when reading data from binary log: 'Could not set up decryption for binlog.'"); +call mtr.add_suppression(" Got fatal error 1236 from master when reading data from binary log: 'Could not decrypt binlog: encryption key error"); ##################################################### # Part 1: unencrypted master ##################################################### @@ -58,10 +59,11 @@ INSERT INTO table3_no_encryption SELECT NULL,NOW(),b FROM table3_no_encryption; connection server_2; start slave; include/wait_for_slave_io_error.inc [errno=1236] -# Ensuring slave was unable to replicate any transactions.. +# Ensuring slave was unable to replicate any encrypted transactions.. 
# ..success SHOW TABLES; Tables_in_test +table1_no_encryption include/stop_slave_sql.inc reset slave; ########## @@ -80,5 +82,7 @@ COUNT(*) 4 DROP TABLE table1_no_encryption, table2_to_encrypt, table3_no_encryption; connection server_2; +RESET MASTER; +SET GLOBAL gtid_slave_pos= ''; include/start_slave.inc include/rpl_end.inc diff --git a/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.test b/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.test index 9991fb9b1b9..7f717190cbf 100644 --- a/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.test +++ b/mysql-test/suite/binlog_encryption/encrypted_master_switch_to_unencrypted_gtid.test @@ -36,6 +36,7 @@ CHANGE MASTER TO MASTER_USE_GTID=SLAVE_POS; --enable_connect_log call mtr.add_suppression(" Got fatal error 1236 from master when reading data from binary log: 'Could not set up decryption for binlog.'"); +call mtr.add_suppression(" Got fatal error 1236 from master when reading data from binary log: 'Could not decrypt binlog: encryption key error"); --echo ##################################################### --echo # Part 1: unencrypted master @@ -55,6 +56,7 @@ FLUSH BINARY LOGS; SET binlog_format=ROW; INSERT INTO table1_no_encryption SELECT NULL,NOW(),b FROM table1_no_encryption; INSERT INTO table1_no_encryption SELECT NULL,NOW(),b FROM table1_no_encryption; +--let $last_unencrypted_gtid= `SELECT @@gtid_binlog_pos` # Make sure that binary logs are not encrypted @@ -120,11 +122,11 @@ start slave; --let $slave_io_errno= 1236 --source include/wait_for_slave_io_error.inc ---echo # Ensuring slave was unable to replicate any transactions.. +--echo # Ensuring slave was unable to replicate any encrypted transactions.. 
--let $gsp= `SELECT @@global.gtid_slave_pos` -if (`SELECT strcmp("$gsp","")`) +if (`SELECT strcmp("$gsp","$last_unencrypted_gtid")`) { - die Slave without encryption configured should fail to read encrypted binlog; + die Slave without encryption configured should fail to read encrypted binlog (expected $last_unencrypted_gtid but got $gsp); } --echo # ..success @@ -151,5 +153,7 @@ DROP TABLE table1_no_encryption, table2_to_encrypt, table3_no_encryption; --connection server_2 --disable_connect_log +RESET MASTER; +SET GLOBAL gtid_slave_pos= ''; --source include/start_slave.inc --source include/rpl_end.inc diff --git a/mysql-test/suite/galera/r/galera_gtid_server_id.result b/mysql-test/suite/galera/r/galera_gtid_server_id.result index 8765fcc1636..2dbe67b84ae 100644 --- a/mysql-test/suite/galera/r/galera_gtid_server_id.result +++ b/mysql-test/suite/galera/r/galera_gtid_server_id.result @@ -14,6 +14,9 @@ CREATE TABLE t1(id int not null primary key) engine=innodb; INSERT INTO t1 values (1); show global variables like '%gtid%'; Variable_name Value +binlog_gtid_index ON +binlog_gtid_index_page_size 4096 +binlog_gtid_index_span_min 65536 gtid_binlog_pos 1-11-2 gtid_binlog_state 1-11-2 gtid_cleanup_batch_size 64 @@ -29,6 +32,9 @@ connection node_2; SET SESSION wsrep_sync_wait=15; show global variables like '%gtid%'; Variable_name Value +binlog_gtid_index ON +binlog_gtid_index_page_size 4096 +binlog_gtid_index_span_min 65536 gtid_binlog_pos 0-12-1,1-11-2 gtid_binlog_state 0-12-1,1-11-2 gtid_cleanup_batch_size 64 @@ -55,6 +61,9 @@ Error 1231 Variable 'server_id' can't be set to the value of '200' INSERT INTO t1 values(2); show global variables like '%gtid%'; Variable_name Value +binlog_gtid_index ON +binlog_gtid_index_page_size 4096 +binlog_gtid_index_span_min 65536 gtid_binlog_pos 0-12-1,1-11-3 gtid_binlog_state 0-12-1,1-11-3 gtid_cleanup_batch_size 64 @@ -69,6 +78,9 @@ wsrep_gtid_mode ON connection node_1; show global variables like '%gtid%'; Variable_name Value 
+binlog_gtid_index ON +binlog_gtid_index_page_size 4096 +binlog_gtid_index_span_min 65536 gtid_binlog_pos 1-11-3 gtid_binlog_state 1-11-3 gtid_cleanup_batch_size 64 diff --git a/mysql-test/suite/perfschema/r/dml_setup_instruments.result b/mysql-test/suite/perfschema/r/dml_setup_instruments.result index cdc52da54dc..ff000a09312 100644 --- a/mysql-test/suite/perfschema/r/dml_setup_instruments.result +++ b/mysql-test/suite/perfschema/r/dml_setup_instruments.result @@ -8,12 +8,12 @@ wait/synch/mutex/sql/Ack_receiver::mutex YES YES wait/synch/mutex/sql/Cversion_lock YES YES wait/synch/mutex/sql/Delayed_insert::mutex YES YES wait/synch/mutex/sql/Event_scheduler::LOCK_scheduler_state YES YES +wait/synch/mutex/sql/Gtid_index_writer::gtid_index_mutex YES YES wait/synch/mutex/sql/gtid_waiting::LOCK_gtid_waiting YES YES wait/synch/mutex/sql/hash_filo::lock YES YES wait/synch/mutex/sql/HA_DATA_PARTITION::LOCK_auto_inc YES YES wait/synch/mutex/sql/LOCK_active_mi YES YES wait/synch/mutex/sql/LOCK_after_binlog_sync YES YES -wait/synch/mutex/sql/LOCK_audit_mask YES YES select * from performance_schema.setup_instruments where name like 'Wait/Synch/Rwlock/sql/%' and name not in ( diff --git a/mysql-test/suite/perfschema/r/relaylog.result b/mysql-test/suite/perfschema/r/relaylog.result index ce3e9c04a5e..7cc87530770 100644 --- a/mysql-test/suite/perfschema/r/relaylog.result +++ b/mysql-test/suite/perfschema/r/relaylog.result @@ -23,6 +23,7 @@ from performance_schema.file_summary_by_instance where file_name like "%master-%" order by file_name; FILE_NAME EVENT_NAME COUNT_READ COUNT_WRITE SUM_NUMBER_OF_BYTES_READ SUM_NUMBER_OF_BYTES_WRITE master-bin.000001 wait/io/file/sql/binlog MANY MANY MANY MANY +master-bin.000001.idx wait/io/file/sql/gtid_index NONE MANY NONE MANY master-bin.index wait/io/file/sql/binlog_index MANY MANY MANY MANY select * from performance_schema.file_summary_by_instance where file_name like "%slave-%" order by file_name; @@ -112,6 +113,7 @@ where file_name like 
"%slave-%" order by file_name; FILE_NAME EVENT_NAME COUNT_READ COUNT_WRITE SUM_NUMBER_OF_BYTES_READ SUM_NUMBER_OF_BYTES_WRITE slave-bin.000001 wait/io/file/sql/binlog MANY MANY MANY MANY +slave-bin.000001.idx wait/io/file/sql/gtid_index NONE MANY NONE MANY slave-bin.index wait/io/file/sql/binlog_index MANY MANY MANY MANY slave-relay-bin.000001 wait/io/file/sql/relaylog MANY MANY MANY MANY slave-relay-bin.000002 wait/io/file/sql/relaylog MANY MANY MANY MANY diff --git a/mysql-test/suite/rpl/include/rpl_gtid_index.inc b/mysql-test/suite/rpl/include/rpl_gtid_index.inc new file mode 100644 index 00000000000..262d43d0109 --- /dev/null +++ b/mysql-test/suite/rpl/include/rpl_gtid_index.inc @@ -0,0 +1,187 @@ +# Include file for main test rpl.rpl_gtid_index. +# Test GTID indexes with given parameters. +# +# Parameters: +# $NUM_POS Number of GTIDs/binlog positions to create +# $NUM_DOMAIN Number of different domains to use +# $NUM_SERVER Number of different server_id to use +# $NUM_SLAVE_CONNECTS How many GTID slave connect positions to test +# $RND_SEED Random seed + + +--echo *** Testing $NUM_POS GTIDs with $NUM_SLAVE_CONNECTS test connects + +--connection master +DELETE FROM t1 WHERE a >= 1000; +# Rotate binlogs to make new GTID index settings take effect. +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; + +# Prepare some random values, but deterministic between test runs. +CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT) + ENGINE=InnoDB; +INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1); +eval +INSERT INTO rand_data(idx, domain_id, server_id) +SELECT seq, + @tmp:=floor($NUM_DOMAIN*POW(rand($RND_SEED),2)), + 100 + $NUM_SERVER*@tmp + floor($NUM_SERVER*rand($RND_SEED)) + FROM seq_1_to_$NUM_POS; +# Let's check that the test data is deterministic. +# If this changes due to some server changes, it's fine, the .result can just +# be updated. But we want it to be identical between test runs on same code, +# to facilitate debugging test failures. 
+SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data; + +# Create some data for the binlog (and GTID index), recording the correct +# binlog positions and GTIDs. +CREATE TABLE gtid_data( + idx INT PRIMARY KEY, + gtid VARCHAR(44), + gtid_pos VARCHAR(255), + file VARCHAR(100), + pos INT, + row_count INT, + KEY(file, pos)) ENGINE=InnoDB; +--let $gtid= `SELECT @@last_gtid` + +--source include/save_master_gtid.inc + +--connection slave +--source include/sync_with_master_gtid.inc +--source include/stop_slave.inc + +--connection master +SET @orig_domain_id= @@gtid_domain_id; +SET @orig_server_id= @@server_id; +--let $i= 0 +--let $rotate_point= `SELECT floor($NUM_POS/2)` +--let $base_count= `SELECT COUNT(*) FROM t1` +--disable_query_log +while ($i < $NUM_POS) { + --let $file= query_get_value(SHOW MASTER STATUS, File, 1) + --let $pos= query_get_value(SHOW MASTER STATUS, Position, 1) + --let $gtid_pos= `SELECT @@gtid_binlog_pos` + --let $row_count= `SELECT $base_count + $i` + eval SET gtid_domain_id= (SELECT domain_id FROM rand_data WHERE idx=$i+1); + eval SET server_id= (SELECT server_id FROM rand_data WHERE idx=$i+1); + BEGIN; + eval INSERT INTO gtid_data(idx, gtid, gtid_pos, file, pos, row_count) + VALUES ($i, '$gtid', '$gtid_pos', '$file', $pos, $row_count); + eval INSERT INTO t1 VALUES ($i + 1000, 0); + COMMIT; +--let $gtid= `SELECT @@last_gtid` + inc $i; + if ($i==$rotate_point) { + FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; + } +} +--enable_query_log +SET gtid_domain_id= @orig_domain_id; +SET server_id= @orig_server_id; + +SELECT COUNT(*) FROM gtid_data; + +# Test that BINLOG_GTID_POS returns correct positions for every GTID position. +--echo *** The result should be empty, otherwise some result is wrong: +SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos) + FROM gtid_data + WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos)) + ORDER BY idx; + +# Prepare to rewind the slave to this point to test again on same binlog. 
+--connection slave +SET @orig_pos= @@GLOBAL.gtid_slave_pos; +SET @orig_t1_limit= (SELECT MAX(a) FROM t1); + +--echo *** Now connect the slave to each position in turn, and test that +--echo *** the right amount of data is replicated at each point. +--let $old_silent= $keep_include_silent +--let $keep_include_silent= 1 +--let $i= 0 +--disable_query_log +while ($i < $NUM_POS) { + --connection master + --let $gtid_pos= `SELECT gtid_pos FROM gtid_data WHERE idx=$i` + --let $master_count= `SELECT row_count FROM gtid_data WHERE idx=$i` + --connection slave + --disable_result_log + eval START SLAVE UNTIL master_gtid_pos='$gtid_pos'; + --enable_result_log + --let $res= `SELECT MASTER_GTID_WAIT('$gtid_pos')` + if ($res != 0) { + --die "FAIL: MASTER_GTID_WAIT($gtid_pos) returned $res, should have been 0" + } + --source include/wait_for_slave_to_stop.inc + --let $slave_count = `SELECT COUNT(*) FROM t1` + if ($master_count != $slave_count) { + SELECT * FROM gtid_data ORDER BY file, pos; + SELECT * FROM t1 ORDER BY a; + --die "Not all rows replicated. $master_count on master but $slave_count on slave." + } + --let $i= `SELECT $i + ceil($NUM_POS / $NUM_SLAVE_CONNECTS)` +} +--enable_query_log + +--echo *** Test slave connecting to some GTID positions where the position in +--echo *** the master's binlog is different between the different domains. +--echo *** Revind the slave and test on the same binlog data from the master as before. +--connection slave +SET sql_log_bin= 0; +TRUNCATE gtid_data; +DELETE FROM t1 WHERE a > @orig_t1_limit; +SET sql_log_bin= 1; +SET GLOBAL gtid_slave_pos= @orig_pos; + +--let $i= 0 +--disable_query_log +while ($i <= $NUM_DOMAIN) { + # Build a GTID position from GTIDs that are picked at different locations + # in the gtid_data table for each domain. 
+ --connection master + let $until_pos=` + SELECT GROUP_CONCAT(gtid SEPARATOR ',') + FROM gtid_data + WHERE idx IN ( + SELECT MAX(gtid_data.idx) AS pick + FROM gtid_data + INNER JOIN rand_data ON (rand_data.idx = gtid_data.idx) + WHERE gtid_data.idx*$NUM_DOMAIN <= (domain_id + $i)*$NUM_POS + GROUP BY domain_id + )`; + --connection slave + --disable_result_log + eval START SLAVE UNTIL master_gtid_pos='$until_pos'; + --enable_result_log + --let $res= `SELECT MASTER_GTID_WAIT('$until_pos')` + if ($res != 0) { + --die "FAIL: MASTER_GTID_WAIT($until_pos) returned $res, should have been 0" + } + --source include/wait_for_slave_to_stop.inc + + inc $i; +} +--enable_query_log +--let $keep_include_silent= $old_silent + +# Check that everything was replicated (nothing skipped). +# We have one less row on the slave since the last UNTIL is the one before +# the master inserted the last row. +--connection master +--let $master_count= `SELECT COUNT(*)-1 FROM t1` +--connection slave +--let $slave_count= `SELECT COUNT(*) FROM t1` +if ($master_count != $slave_count) { + SELECT * FROM gtid_data ORDER BY file, pos; + SELECT * FROM t1 ORDER BY a; + --die "Not all rows replicated. $master_count on master but $slave_count on slave." 
+} + +--connection master +DROP TABLE gtid_data, rand_data; +--source include/save_master_gtid.inc + +--connection slave +--source include/start_slave.inc +--source include/sync_with_master_gtid.inc + +--connection master diff --git a/mysql-test/suite/rpl/r/rpl_gtid_glle_no_terminate.result b/mysql-test/suite/rpl/r/rpl_gtid_glle_no_terminate.result index f4d257c2668..98daf309e8c 100644 --- a/mysql-test/suite/rpl/r/rpl_gtid_glle_no_terminate.result +++ b/mysql-test/suite/rpl/r/rpl_gtid_glle_no_terminate.result @@ -28,7 +28,6 @@ include/show_events.inc Log_name Pos Event_type Server_id End_log_pos Info slave-relay-bin.000002 # Rotate # # master-bin.000001;pos=POS slave-relay-bin.000002 # Format_desc # # SERVER_VERSION, BINLOG_VERSION -slave-relay-bin.000002 # Gtid_list # # [] slave-relay-bin.000002 # Binlog_checkpoint # # master-bin.000001 slave-relay-bin.000002 # Gtid # # GTID #-#-# slave-relay-bin.000002 # Gtid_list # # [#-#-#] diff --git a/mysql-test/suite/rpl/r/rpl_gtid_index.result b/mysql-test/suite/rpl/r/rpl_gtid_index.result new file mode 100644 index 00000000000..a4cd2491953 --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_gtid_index.result @@ -0,0 +1,366 @@ +include/master-slave.inc +[connection master] +connection slave; +include/stop_slave.inc +CHANGE MASTER TO master_use_gtid= slave_pos; +include/start_slave.inc +connection master; +CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB; +INSERT INTO t1 VALUES (0, 0); +*** Test looking up a lot of different event positions and GTIDs. 
+CREATE FUNCTION gtid_eq(a VARCHAR(255), b VARCHAR(255)) RETURNS BOOLEAN DETERMINISTIC +BEGIN +DECLARE g VARCHAR(255); +IF a IS NULL OR b IS NULL OR LENGTH(a) != LENGTH(b) THEN +RETURN FALSE; +END IF; +SET a= CONCAT(a, ','); +SET b= CONCAT(',', b, ','); +WHILE LENGTH(a) > 0 DO +SET g= REGEXP_SUBSTR(a, '^[^,]+,'); +SET a= SUBSTRING(a, LENGTH(g)+1); +SET b= REPLACE(b, CONCAT(',', g), ','); +END WHILE; +RETURN b = ','; +END // +SET @old_page_size= @@GLOBAL.binlog_gtid_index_page_size; +SET @old_span_min= @@GLOBAL.binlog_gtid_index_span_min; +*** A fair amount of work with default GTID index settings. +*** Testing 200 GTIDs with 50 test connects +connection master; +DELETE FROM t1 WHERE a >= 1000; +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT) +ENGINE=InnoDB; +INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1); +INSERT INTO rand_data(idx, domain_id, server_id) +SELECT seq, +@tmp:=floor(5*POW(rand(42),2)), +100 + 5*@tmp + floor(5*rand(42)) +FROM seq_1_to_200; +SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data; +COUNT(*) SUM(domain_id) SUM(server_id) +201 285 21852 +CREATE TABLE gtid_data( +idx INT PRIMARY KEY, +gtid VARCHAR(44), +gtid_pos VARCHAR(255), +file VARCHAR(100), +pos INT, +row_count INT, +KEY(file, pos)) ENGINE=InnoDB; +include/save_master_gtid.inc +connection slave; +include/sync_with_master_gtid.inc +include/stop_slave.inc +connection master; +SET @orig_domain_id= @@gtid_domain_id; +SET @orig_server_id= @@server_id; +SET gtid_domain_id= @orig_domain_id; +SET server_id= @orig_server_id; +SELECT COUNT(*) FROM gtid_data; +COUNT(*) +200 +*** The result should be empty, otherwise some result is wrong: +SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos) +FROM gtid_data +WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos)) +ORDER BY idx; +idx gtid_pos BINLOG_GTID_POS(file, pos) +connection slave; +SET @orig_pos= @@GLOBAL.gtid_slave_pos; +SET 
@orig_t1_limit= (SELECT MAX(a) FROM t1); +*** Now connect the slave to each position in turn, and test that +*** the right amount of data is replicated at each point. +*** Test slave connecting to some GTID positions where the position in +*** the master's binlog is different between the different domains. +*** Revind the slave and test on the same binlog data from the master as before. +connection slave; +SET sql_log_bin= 0; +TRUNCATE gtid_data; +DELETE FROM t1 WHERE a > @orig_t1_limit; +SET sql_log_bin= 1; +SET GLOBAL gtid_slave_pos= @orig_pos; +connection master; +connection slave; +connection master; +DROP TABLE gtid_data, rand_data; +include/save_master_gtid.inc +connection slave; +include/start_slave.inc +include/sync_with_master_gtid.inc +connection master; +*** A lot of GTIDs with small btree pages to stress the Btree code. +SET GLOBAL binlog_gtid_index_page_size= 64; +SET GLOBAL binlog_gtid_index_span_min= 1; +*** Testing 1000 GTIDs with 50 test connects +connection master; +DELETE FROM t1 WHERE a >= 1000; +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT) +ENGINE=InnoDB; +INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1); +INSERT INTO rand_data(idx, domain_id, server_id) +SELECT seq, +@tmp:=floor(10*POW(rand(150),2)), +100 + 5*@tmp + floor(5*rand(150)) +FROM seq_1_to_1000; +SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data; +COUNT(*) SUM(domain_id) SUM(server_id) +1001 2881 116394 +CREATE TABLE gtid_data( +idx INT PRIMARY KEY, +gtid VARCHAR(44), +gtid_pos VARCHAR(255), +file VARCHAR(100), +pos INT, +row_count INT, +KEY(file, pos)) ENGINE=InnoDB; +include/save_master_gtid.inc +connection slave; +include/sync_with_master_gtid.inc +include/stop_slave.inc +connection master; +SET @orig_domain_id= @@gtid_domain_id; +SET @orig_server_id= @@server_id; +SET gtid_domain_id= @orig_domain_id; +SET server_id= @orig_server_id; +SELECT COUNT(*) FROM gtid_data; +COUNT(*) +1000 
+*** The result should be empty, otherwise some result is wrong: +SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos) +FROM gtid_data +WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos)) +ORDER BY idx; +idx gtid_pos BINLOG_GTID_POS(file, pos) +connection slave; +SET @orig_pos= @@GLOBAL.gtid_slave_pos; +SET @orig_t1_limit= (SELECT MAX(a) FROM t1); +*** Now connect the slave to each position in turn, and test that +*** the right amount of data is replicated at each point. +*** Test slave connecting to some GTID positions where the position in +*** the master's binlog is different between the different domains. +*** Revind the slave and test on the same binlog data from the master as before. +connection slave; +SET sql_log_bin= 0; +TRUNCATE gtid_data; +DELETE FROM t1 WHERE a > @orig_t1_limit; +SET sql_log_bin= 1; +SET GLOBAL gtid_slave_pos= @orig_pos; +connection master; +connection slave; +connection master; +DROP TABLE gtid_data, rand_data; +include/save_master_gtid.inc +connection slave; +include/start_slave.inc +include/sync_with_master_gtid.inc +connection master; +*** Small page size with sparse index. 
+SET GLOBAL binlog_gtid_index_page_size= 64; +SET GLOBAL binlog_gtid_index_span_min= 2048; +*** Testing 200 GTIDs with 50 test connects +connection master; +DELETE FROM t1 WHERE a >= 1000; +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT) +ENGINE=InnoDB; +INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1); +INSERT INTO rand_data(idx, domain_id, server_id) +SELECT seq, +@tmp:=floor(10*POW(rand(666),2)), +100 + 5*@tmp + floor(5*rand(666)) +FROM seq_1_to_200; +SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data; +COUNT(*) SUM(domain_id) SUM(server_id) +201 599 23410 +CREATE TABLE gtid_data( +idx INT PRIMARY KEY, +gtid VARCHAR(44), +gtid_pos VARCHAR(255), +file VARCHAR(100), +pos INT, +row_count INT, +KEY(file, pos)) ENGINE=InnoDB; +include/save_master_gtid.inc +connection slave; +include/sync_with_master_gtid.inc +include/stop_slave.inc +connection master; +SET @orig_domain_id= @@gtid_domain_id; +SET @orig_server_id= @@server_id; +SET gtid_domain_id= @orig_domain_id; +SET server_id= @orig_server_id; +SELECT COUNT(*) FROM gtid_data; +COUNT(*) +200 +*** The result should be empty, otherwise some result is wrong: +SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos) +FROM gtid_data +WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos)) +ORDER BY idx; +idx gtid_pos BINLOG_GTID_POS(file, pos) +connection slave; +SET @orig_pos= @@GLOBAL.gtid_slave_pos; +SET @orig_t1_limit= (SELECT MAX(a) FROM t1); +*** Now connect the slave to each position in turn, and test that +*** the right amount of data is replicated at each point. +*** Test slave connecting to some GTID positions where the position in +*** the master's binlog is different between the different domains. +*** Revind the slave and test on the same binlog data from the master as before. 
+connection slave; +SET sql_log_bin= 0; +TRUNCATE gtid_data; +DELETE FROM t1 WHERE a > @orig_t1_limit; +SET sql_log_bin= 1; +SET GLOBAL gtid_slave_pos= @orig_pos; +connection master; +connection slave; +connection master; +DROP TABLE gtid_data, rand_data; +include/save_master_gtid.inc +connection slave; +include/start_slave.inc +include/sync_with_master_gtid.inc +connection master; +*** Medium page size. +SET GLOBAL binlog_gtid_index_page_size= 512; +SET GLOBAL binlog_gtid_index_span_min= 512; +*** Testing 200 GTIDs with 50 test connects +connection master; +DELETE FROM t1 WHERE a >= 1000; +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT) +ENGINE=InnoDB; +INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1); +INSERT INTO rand_data(idx, domain_id, server_id) +SELECT seq, +@tmp:=floor(10*POW(rand(1024),2)), +100 + 5*@tmp + floor(5*rand(1024)) +FROM seq_1_to_200; +SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data; +COUNT(*) SUM(domain_id) SUM(server_id) +201 555 23160 +CREATE TABLE gtid_data( +idx INT PRIMARY KEY, +gtid VARCHAR(44), +gtid_pos VARCHAR(255), +file VARCHAR(100), +pos INT, +row_count INT, +KEY(file, pos)) ENGINE=InnoDB; +include/save_master_gtid.inc +connection slave; +include/sync_with_master_gtid.inc +include/stop_slave.inc +connection master; +SET @orig_domain_id= @@gtid_domain_id; +SET @orig_server_id= @@server_id; +SET gtid_domain_id= @orig_domain_id; +SET server_id= @orig_server_id; +SELECT COUNT(*) FROM gtid_data; +COUNT(*) +200 +*** The result should be empty, otherwise some result is wrong: +SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos) +FROM gtid_data +WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos)) +ORDER BY idx; +idx gtid_pos BINLOG_GTID_POS(file, pos) +connection slave; +SET @orig_pos= @@GLOBAL.gtid_slave_pos; +SET @orig_t1_limit= (SELECT MAX(a) FROM t1); +*** Now connect the slave to each position in turn, and test that 
+*** the right amount of data is replicated at each point. +*** Test slave connecting to some GTID positions where the position in +*** the master's binlog is different between the different domains. +*** Revind the slave and test on the same binlog data from the master as before. +connection slave; +SET sql_log_bin= 0; +TRUNCATE gtid_data; +DELETE FROM t1 WHERE a > @orig_t1_limit; +SET sql_log_bin= 1; +SET GLOBAL gtid_slave_pos= @orig_pos; +connection master; +connection slave; +connection master; +DROP TABLE gtid_data, rand_data; +include/save_master_gtid.inc +connection slave; +include/start_slave.inc +include/sync_with_master_gtid.inc +connection master; +*** Large page size. +SET GLOBAL binlog_gtid_index_page_size= 16384; +SET GLOBAL binlog_gtid_index_span_min= 1; +*** Testing 200 GTIDs with 50 test connects +connection master; +DELETE FROM t1 WHERE a >= 1000; +FLUSH NO_WRITE_TO_BINLOG BINARY LOGS; +CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT) +ENGINE=InnoDB; +INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1); +INSERT INTO rand_data(idx, domain_id, server_id) +SELECT seq, +@tmp:=floor(10*POW(rand(12345),2)), +100 + 5*@tmp + floor(5*rand(12345)) +FROM seq_1_to_200; +SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data; +COUNT(*) SUM(domain_id) SUM(server_id) +201 571 23252 +CREATE TABLE gtid_data( +idx INT PRIMARY KEY, +gtid VARCHAR(44), +gtid_pos VARCHAR(255), +file VARCHAR(100), +pos INT, +row_count INT, +KEY(file, pos)) ENGINE=InnoDB; +include/save_master_gtid.inc +connection slave; +include/sync_with_master_gtid.inc +include/stop_slave.inc +connection master; +SET @orig_domain_id= @@gtid_domain_id; +SET @orig_server_id= @@server_id; +SET gtid_domain_id= @orig_domain_id; +SET server_id= @orig_server_id; +SELECT COUNT(*) FROM gtid_data; +COUNT(*) +200 +*** The result should be empty, otherwise some result is wrong: +SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos) +FROM gtid_data +WHERE NOT 
gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos)) +ORDER BY idx; +idx gtid_pos BINLOG_GTID_POS(file, pos) +connection slave; +SET @orig_pos= @@GLOBAL.gtid_slave_pos; +SET @orig_t1_limit= (SELECT MAX(a) FROM t1); +*** Now connect the slave to each position in turn, and test that +*** the right amount of data is replicated at each point. +*** Test slave connecting to some GTID positions where the position in +*** the master's binlog is different between the different domains. +*** Revind the slave and test on the same binlog data from the master as before. +connection slave; +SET sql_log_bin= 0; +TRUNCATE gtid_data; +DELETE FROM t1 WHERE a > @orig_t1_limit; +SET sql_log_bin= 1; +SET GLOBAL gtid_slave_pos= @orig_pos; +connection master; +connection slave; +connection master; +DROP TABLE gtid_data, rand_data; +include/save_master_gtid.inc +connection slave; +include/start_slave.inc +include/sync_with_master_gtid.inc +connection master; +connection master; +SET GLOBAL binlog_gtid_index_page_size= @old_page_size; +SET GLOBAL binlog_gtid_index_span_min= @old_span_min; +DROP TABLE t1; +DROP FUNCTION gtid_eq; +include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_gtid_glle_no_terminate.test b/mysql-test/suite/rpl/t/rpl_gtid_glle_no_terminate.test index f0f38a31da6..8d8f22bb1e7 100644 --- a/mysql-test/suite/rpl/t/rpl_gtid_glle_no_terminate.test +++ b/mysql-test/suite/rpl/t/rpl_gtid_glle_no_terminate.test @@ -24,6 +24,7 @@ CHANGE MASTER TO MASTER_USE_GTID=slave_pos; --echo # --echo # Initialize test data --connection master +--source include/wait_for_binlog_checkpoint.inc create table t1 (a int); SET @@session.server_id= 3; create table t2 (a int); diff --git a/mysql-test/suite/rpl/t/rpl_gtid_index.test b/mysql-test/suite/rpl/t/rpl_gtid_index.test new file mode 100644 index 00000000000..6001cc6e600 --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_gtid_index.test @@ -0,0 +1,89 @@ +--source include/have_sequence.inc +--source include/have_innodb.inc +--source 
include/master-slave.inc +--source include/have_binlog_format_mixed.inc + +--connection slave +--source include/stop_slave.inc +CHANGE MASTER TO master_use_gtid= slave_pos; +--source include/start_slave.inc + +--connection master +CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB; +INSERT INTO t1 VALUES (0, 0); + + +--echo *** Test looking up a lot of different event positions and GTIDs. + +# A function for comparing GTID positions. +# Handles that the domain_id order is different in the two strings. +# Works by repeatedly removing one GTID from each string. If the strings have +# the same length and nothing is left at the end, then they are identical. +delimiter //; +CREATE FUNCTION gtid_eq(a VARCHAR(255), b VARCHAR(255)) RETURNS BOOLEAN DETERMINISTIC +BEGIN + DECLARE g VARCHAR(255); + IF a IS NULL OR b IS NULL OR LENGTH(a) != LENGTH(b) THEN + RETURN FALSE; + END IF; + SET a= CONCAT(a, ','); + SET b= CONCAT(',', b, ','); + WHILE LENGTH(a) > 0 DO + SET g= REGEXP_SUBSTR(a, '^[^,]+,'); + SET a= SUBSTRING(a, LENGTH(g)+1); + SET b= REPLACE(b, CONCAT(',', g), ','); + END WHILE; + RETURN b = ','; +END // +delimiter ;// + +SET @old_page_size= @@GLOBAL.binlog_gtid_index_page_size; +SET @old_span_min= @@GLOBAL.binlog_gtid_index_span_min; + +--echo *** A fair amount of work with default GTID index settings. +--let $NUM_POS= 200 +--let $NUM_DOMAIN= 5 +--let $NUM_SERVER= 5 +--let $NUM_SLAVE_CONNECTS= 50 +--let $RND_SEED= 42 +--source suite/rpl/include/rpl_gtid_index.inc + +--echo *** A lot of GTIDs with small btree pages to stress the Btree code. +--let $NUM_POS= 1000 +--let $NUM_DOMAIN= 10 +--let $RND_SEED= 150 +SET GLOBAL binlog_gtid_index_page_size= 64; +SET GLOBAL binlog_gtid_index_span_min= 1; +--source suite/rpl/include/rpl_gtid_index.inc + +--echo *** Small page size with sparse index. 
+--let $NUM_POS= 200 +--let $RND_SEED= 666 +SET GLOBAL binlog_gtid_index_page_size= 64; +SET GLOBAL binlog_gtid_index_span_min= 2048; +--source suite/rpl/include/rpl_gtid_index.inc + +--echo *** Medium page size. +--let $NUM_POS= 200 +--let $RND_SEED= 1024 +SET GLOBAL binlog_gtid_index_page_size= 512; +SET GLOBAL binlog_gtid_index_span_min= 512; +--source suite/rpl/include/rpl_gtid_index.inc + +--echo *** Large page size. +--let $NUM_POS= 200 +--let $RND_SEED= 12345 +SET GLOBAL binlog_gtid_index_page_size= 16384; +SET GLOBAL binlog_gtid_index_span_min= 1; +--source suite/rpl/include/rpl_gtid_index.inc + + +# Cleanup. +--connection master +SET GLOBAL binlog_gtid_index_page_size= @old_page_size; +SET GLOBAL binlog_gtid_index_span_min= @old_span_min; + +DROP TABLE t1; +DROP FUNCTION gtid_eq; + +--source include/rpl_end.inc diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result index 866e567f65f..e1a4244be07 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result @@ -432,6 +432,36 @@ NUMERIC_BLOCK_SIZE NULL ENUM_VALUE_LIST MIXED,STATEMENT,ROW READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME BINLOG_GTID_INDEX +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BOOLEAN +VARIABLE_COMMENT Enable the creation of a GTID index for every binlog file, and the use of such index for speeding up GTID lookup in the binlog. +NUMERIC_MIN_VALUE NULL +NUMERIC_MAX_VALUE NULL +NUMERIC_BLOCK_SIZE NULL +ENUM_VALUE_LIST OFF,ON +READ_ONLY NO +COMMAND_LINE_ARGUMENT OPTIONAL +VARIABLE_NAME BINLOG_GTID_INDEX_PAGE_SIZE +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE INT UNSIGNED +VARIABLE_COMMENT Page size to use for the binlog GTID index. 
+NUMERIC_MIN_VALUE 64 +NUMERIC_MAX_VALUE 16777216 +NUMERIC_BLOCK_SIZE 1 +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME BINLOG_GTID_INDEX_SPAN_MIN +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE INT UNSIGNED +VARIABLE_COMMENT Control sparseness of the binlog GTID index. If set to N, at most one index record will be added for every N bytes of binlog file written, to reduce the size of the index. Normally does not need tuning. +NUMERIC_MIN_VALUE 1 +NUMERIC_MAX_VALUE 1073741824 +NUMERIC_BLOCK_SIZE 1 +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME BINLOG_OPTIMIZE_THREAD_SCHEDULING VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BOOLEAN diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result index f668d078723..5ecd1f86e12 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result @@ -452,6 +452,36 @@ NUMERIC_BLOCK_SIZE NULL ENUM_VALUE_LIST MIXED,STATEMENT,ROW READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME BINLOG_GTID_INDEX +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BOOLEAN +VARIABLE_COMMENT Enable the creation of a GTID index for every binlog file, and the use of such index for speeding up GTID lookup in the binlog. +NUMERIC_MIN_VALUE NULL +NUMERIC_MAX_VALUE NULL +NUMERIC_BLOCK_SIZE NULL +ENUM_VALUE_LIST OFF,ON +READ_ONLY NO +COMMAND_LINE_ARGUMENT OPTIONAL +VARIABLE_NAME BINLOG_GTID_INDEX_PAGE_SIZE +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE INT UNSIGNED +VARIABLE_COMMENT Page size to use for the binlog GTID index. +NUMERIC_MIN_VALUE 64 +NUMERIC_MAX_VALUE 16777216 +NUMERIC_BLOCK_SIZE 1 +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME BINLOG_GTID_INDEX_SPAN_MIN +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE INT UNSIGNED +VARIABLE_COMMENT Control sparseness of the binlog GTID index. 
If set to N, at most one index record will be added for every N bytes of binlog file written, to reduce the size of the index. Normally does not need tuning. +NUMERIC_MIN_VALUE 1 +NUMERIC_MAX_VALUE 1073741824 +NUMERIC_BLOCK_SIZE 1 +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME BINLOG_IGNORE_DB VARIABLE_SCOPE GLOBAL VARIABLE_TYPE VARCHAR diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt index 466ec320b25..2c0082c59c9 100644 --- a/sql/CMakeLists.txt +++ b/sql/CMakeLists.txt @@ -165,7 +165,7 @@ SET (SQL_SOURCE gcalc_slicescan.cc gcalc_tools.cc my_apc.cc mf_iocache_encr.cc item_jsonfunc.cc my_json_writer.cc json_schema.cc json_schema_helper.cc - rpl_gtid.cc rpl_parallel.cc + rpl_gtid.cc gtid_index.cc rpl_parallel.cc semisync.cc semisync_master.cc semisync_slave.cc semisync_master_ack_receiver.cc sp_instr.cc diff --git a/sql/gtid_index.cc b/sql/gtid_index.cc new file mode 100644 index 00000000000..0467c7cf480 --- /dev/null +++ b/sql/gtid_index.cc @@ -0,0 +1,1434 @@ +/* + Copyright (c) 2023 Kristian Nielsen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +*/ + +#include "gtid_index.h" +#include "sql_const.h" +#include "log.h" + + +static const uchar GTID_INDEX_MAGIC[4]= { + 254, 254, 12, 1, +}; + +Gtid_index_writer *Gtid_index_writer::hot_index_list= nullptr; +/* gtid_index_mutex is inited in MYSQL_LOG::init_pthread_objects(). 
*/
+mysql_mutex_t Gtid_index_writer::gtid_index_mutex;
+
+
+/*
+  Create the writer for the GTID index belonging to binlog file FILENAME.
+
+  Creates the index file on disk (its name is built from FILENAME by
+  build_index_filename()), deleting a left-over file of the same name if one
+  exists. Then writes the initial index record for OFFSET, containing the
+  GTIDs currently in BINLOG_STATE, and links the writer into the hot index
+  list.
+
+  Errors are reported through give_error(). If the node list cannot be
+  allocated or the file cannot be created, the writer is not inserted into the
+  hot index list and index_file stays negative.
+*/
+Gtid_index_writer::Gtid_index_writer(const char *filename, uint32 offset,
+                                     rpl_binlog_state_base *binlog_state,
+                                     uint32 opt_page_size,
+                                     my_off_t opt_span_min)
+  : offset_min_threshold(opt_span_min),
+    nodes(nullptr), previous_offset(0),
+    max_level(0), index_file(-1),
+    error_state(false), file_header_written(false), in_hot_index_list(false)
+{
+  uint32 count;
+  rpl_gtid *gtid_list;
+  page_size= opt_page_size;
+  pending_state.init();
+
+  if (alloc_level_if_missing(0))
+  {
+    give_error("Out of memory allocating node list");
+    return;
+  }
+
+  /*
+    Lock the index mutex at this point just before we create the new index
+    file on disk. From this point on, and until the index is fully written,
+    the reader will find us in the "hot index" list and will be able to read
+    from the index while it's still being constructed.
+  */
+  lock_gtid_index();
+
+  build_index_filename(filename);
+  int create_flags= O_WRONLY|O_TRUNC|O_BINARY|O_EXCL;
+  index_file= mysql_file_create(key_file_gtid_index, index_file_name,
+                                CREATE_MODE, create_flags, MYF(0));
+  if (index_file < 0 && my_errno == EEXIST)
+  {
+    /*
+      It shouldn't happen that an old GTID index file remains, as we remove
+      them as part of RESET MASTER and PURGE BINARY LOGS. But if it happens
+      due to some external file copy of the user or something, delete any old
+      GTID index file first.
+    */
+    sql_print_information("Old GTID index file found '%s', deleting",
+                          index_file_name);
+    my_errno= 0;
+    mysql_file_delete(key_file_gtid_index, index_file_name, MYF(0));
+    index_file= mysql_file_create(key_file_gtid_index, index_file_name,
+                                  CREATE_MODE, create_flags, MYF(0));
+  }
+  if (index_file < 0)
+  {
+    give_error("Failed to open new index file for writing");
+    goto err;
+  }
+
+  /*
+    Write out an initial index record, i.e. corresponding to the GTID_LIST
+    event / binlog state at the start of the binlog file.
+  */
+  count= binlog_state->count_nolock();
+  gtid_list= gtid_list_buffer(count);
+  if (count > 0)
+  {
+    /* A NULL buffer is only an error when there is something to store. */
+    if (!gtid_list)
+      goto err;
+    binlog_state->get_gtid_list_nolock(gtid_list, count);
+  }
+  write_record(offset, gtid_list, count);
+
+  insert_in_hot_index();
+
+err:
+  unlock_gtid_index();
+}
+
+
+/*
+  Destroy the writer, releasing the index file descriptor and the per-level
+  nodes.
+
+  If the writer is still in the hot index list, close() has not been run yet
+  and is run from here.
+*/
+Gtid_index_writer::~Gtid_index_writer()
+{
+  /*
+    close() takes gtid_index_mutex itself, so it must be called without the
+    mutex held. Taking the mutex here first would make this thread block on a
+    lock it already owns.
+  */
+  if (in_hot_index_list)
+    close();
+
+  /*
+    Descriptors are valid from 0 upwards; the constructor and close() use a
+    negative value to mean "no file".
+  */
+  if (index_file >= 0)
+  {
+    /*
+      Should have been closed by call to Gtid_index_writer::close().
+      We can at least avoid leaking file descriptor.
+    */
+    mysql_file_close(index_file, MYF(0));
+  }
+
+  if (nodes)
+  {
+    for (uint32 i= 0; i <= max_level; ++i)
+      delete nodes[i];
+    my_free(nodes);
+  }
+
+  /*
+    state.free() is not needed here, will be called from rpl_binlog_state_base
+    destructor.
+  */
+}
+
+
+/* Initialise the global mutex protecting the hot index list. */
+void
+Gtid_index_writer::gtid_index_init()
+{
+  mysql_mutex_init(key_gtid_index_lock, &gtid_index_mutex, MY_MUTEX_INIT_SLOW);
+}
+
+/* Destroy the global mutex set up by gtid_index_init(). */
+void
+Gtid_index_writer::gtid_index_cleanup()
+{
+  mysql_mutex_destroy(&gtid_index_mutex);
+}
+
+
+/*
+  Look up the writer whose index file name equals FILE_NAME in the hot index
+  list. Returns the writer, or nullptr if there is none.
+
+  The caller must hold gtid_index_mutex.
+*/
+const Gtid_index_writer *
+Gtid_index_writer::find_hot_index(const char *file_name)
+{
+  mysql_mutex_assert_owner(&gtid_index_mutex);
+
+  for (const Gtid_index_writer *p= hot_index_list; p; p= p->next_hot_index)
+  {
+    if (0 == strcmp(file_name, p->index_file_name))
+      return p;
+  }
+  return nullptr;
+}
+
+/*
+  Link this writer at the head of the hot index list.
+  The caller must hold gtid_index_mutex.
+*/
+void
+Gtid_index_writer::insert_in_hot_index()
+{
+  mysql_mutex_assert_owner(&gtid_index_mutex);
+
+  next_hot_index= hot_index_list;
+  hot_index_list= this;
+  in_hot_index_list= true;
+}
+
+
+/*
+  Unlink this writer from the hot index list. Does nothing to the list if the
+  writer is not in it; the writer is marked as not listed in either case.
+  The caller must hold gtid_index_mutex.
+*/
+void
+Gtid_index_writer::remove_from_hot_index()
+{
+  mysql_mutex_assert_owner(&gtid_index_mutex);
+
+  /* Walk the link fields so that head and interior removal are the same. */
+  Gtid_index_writer **next_ptr_ptr= &hot_index_list;
+  for (;;)
+  {
+    Gtid_index_writer *p= *next_ptr_ptr;
+    if (!p)
+      break;
+    if (p == this)
+    {
+      *next_ptr_ptr= p->next_hot_index;
+      break;
+    }
+    next_ptr_ptr= &p->next_hot_index;
+  }
+  next_hot_index= nullptr;
+  in_hot_index_list= false;
+}
+
+void
+Gtid_index_writer::process_gtid(uint32 offset, const rpl_gtid *gtid)
+{
+  /*
+    Account for GTID, found at OFFSET in the binlog, and write an index record
+    when process_gtid_check_batch() hands back a GTID list. Errors are
+    reported through give_error() by the callee.
+  */
+  rpl_gtid *gtid_list;
+  uint32 gtid_count;
+
+  if (process_gtid_check_batch(offset, gtid, &gtid_list, &gtid_count))
+    return;  // Error
+
+  if (gtid_list)
+    async_update(offset, gtid_list, gtid_count);
+}
+
+
+/*
+  Add GTID to the pending state and decide whether an index record is due.
+
+  A record is due once OFFSET is at least offset_min_threshold bytes past the
+  offset of the previous record.
+
+  Returns 0 on success and 1 on error. On success:
+    - *OUT_GTID_LIST is nullptr and *OUT_GTID_COUNT is 0 if no record is due.
+    - Otherwise *OUT_GTID_LIST is a my_malloc()'ed array of *OUT_GTID_COUNT
+      GTIDs which the caller must release with my_free(), and the pending
+      state has been reset.
+  On error the output parameters are not assigned.
+
+  Must be called without gtid_index_mutex held.
+*/
+int
+Gtid_index_writer::process_gtid_check_batch(uint32 offset, const rpl_gtid *gtid,
+                                            rpl_gtid **out_gtid_list,
+                                            uint32 *out_gtid_count)
+{
+  uint32 count;
+  rpl_gtid *gtid_list;
+
+  mysql_mutex_assert_not_owner(&gtid_index_mutex);
+
+  if (unlikely(pending_state.update_nolock(gtid)))
+  {
+    give_error("Out of memory processing GTID for binlog GTID index");
+    return 1;
+  }
+  /*
+    Sparse index; we record only selected GTIDs, and scan the binlog forward
+    from there to find the exact spot.
+  */
+  if (offset - previous_offset < offset_min_threshold)
+  {
+    *out_gtid_list= nullptr;
+    *out_gtid_count= 0;
+    return 0;
+  }
+
+  count= pending_state.count_nolock();
+  DBUG_ASSERT(count > 0 /* Since we just updated with a GTID. */);
+  gtid_list= (rpl_gtid *)
+    my_malloc(key_memory_binlog_gtid_index, count*sizeof(*gtid_list), MYF(0));
+  if (unlikely(!gtid_list))
+  {
+    give_error("Out of memory allocating GTID list for binlog GTID index");
+    return 1;
+  }
+  if (unlikely(pending_state.get_gtid_list_nolock(gtid_list, count)))
+  {
+    /* Shouldn't happen as we allocated the list with the correct length. */
+    DBUG_ASSERT(false);
+    give_error("Internal error allocating GTID list for binlog GTID index");
+    my_free(gtid_list);
+    return 1;
+  }
+  pending_state.reset_nolock();
+  previous_offset= offset;
+  *out_gtid_list= gtid_list;
+  *out_gtid_count= count;
+  return 0;
+}
+
+
+/*
+  Write one index record for EVENT_OFFSET under gtid_index_mutex.
+
+  Takes ownership of GTID_LIST and frees it with my_free() whether or not the
+  write succeeds. Returns the result of write_record().
+*/
+int
+Gtid_index_writer::async_update(uint32 event_offset,
+                                rpl_gtid *gtid_list,
+                                uint32 gtid_count)
+{
+  lock_gtid_index();
+  int res= write_record(event_offset, gtid_list, gtid_count);
+  unlock_gtid_index();
+  my_free(gtid_list);
+  return res;
+}
+
+
+/*
+  Finish the index: flush the pending node of every level to the file, remove
+  the writer from the hot index list, then sync and close the file.
+
+  Takes gtid_index_mutex itself, so the caller must not hold it.
+*/
+void
+Gtid_index_writer::close()
+{
+  lock_gtid_index();
+  if (!error_state)
+  {
+    /*
+      Write out the remaining pending pages, and insert the final child pointer
+      in interior nodes.
+    */
+    for (uint32 level= 0; ; ++level)
+    {
+      uint32 node_ptr= write_current_node(level, level==max_level);
+      nodes[level]->reset();
+      /* A zero node_ptr means the write failed; stop at the root otherwise. */
+      if (!node_ptr || level >= max_level)
+        break;
+      add_child_ptr(level+1, node_ptr);
+    }
+  }
+  remove_from_hot_index();
+  unlock_gtid_index();
+
+  /*
+    index_file is still negative if the constructor could not create the file;
+    there is then nothing to sync or close.
+  */
+  if (index_file >= 0)
+  {
+    if (!error_state)
+    {
+      if (mysql_file_sync(index_file, MYF(0)))
+        give_error("Error syncing index file to disk");
+    }
+
+    mysql_file_close(index_file, MYF(0));
+    index_file= (File)-1;
+  }
+}
+
+
+/* Create a node with no pages. */
+Gtid_index_base::Index_node_base::Index_node_base()
+  : first_page(nullptr), current_page(nullptr), current_ptr(nullptr)
+{
+}
+
+
+/* Release all pages owned by the node. */
+Gtid_index_base::Index_node_base::~Index_node_base()
+{
+  free_pages();
+}
+
+
+/*
+  Free every page in the node's page list and leave the list empty.
+
+  The page pointers are cleared so that the function can safely be called
+  more than once on the same node (for example from reset() and later from a
+  destructor) without freeing the same pages twice.
+*/
+void
+Gtid_index_base::Index_node_base::free_pages()
+{
+  for (Node_page *p= first_page; p; )
+  {
+    /* Fetch the link before the page holding it is freed. */
+    Node_page *q= p->next;
+    my_free(p);
+    p= q;
+  }
+  first_page= current_page= nullptr;
+}
+
+
+/* Drop all pages so that the node can be filled again from scratch. */
+void
+Gtid_index_base::Index_node_base::reset()
+{
+  free_pages();
+  first_page= current_page= nullptr;
+}
+
+
+/* Start out with no GTID list buffer allocated. */
+Gtid_index_base::Gtid_index_base()
+  : gtid_buffer(nullptr), gtid_buffer_alloc(0)
+{
+}
+
+
+/* Release the GTID list buffer, if one was allocated. */
+Gtid_index_base::~Gtid_index_base()
+{
+  if (gtid_buffer_alloc > 0)
+    my_free(gtid_buffer);
+}
+
+
+/*
+  Build the name of the GTID index file for binlog file BASE_FILENAME into
+  OUT_NAME (a buffer of BUFSIZE bytes): the binlog file name followed by
+  ".idx", truncated as needed to fit the buffer.
+*/
+void
+Gtid_index_base::make_gtid_index_file_name(char *out_name, size_t bufsize,
+                                           const char *base_filename)
+{
+  char *p= strmake(out_name,
base_filename, bufsize-1); + size_t remain= bufsize - (p - out_name); + strmake(p, ".idx", remain-1); +} + + +void +Gtid_index_base::build_index_filename(const char *filename) +{ + make_gtid_index_file_name(index_file_name, sizeof(index_file_name), filename); +} + + +rpl_gtid * +Gtid_index_base::gtid_list_buffer(uint32 count) +{ + if (gtid_buffer_alloc >= count) + return gtid_buffer; + rpl_gtid *new_buffer= (rpl_gtid *) + my_malloc(key_memory_binlog_gtid_index, count*sizeof(*new_buffer), MYF(0)); + if (!new_buffer) + { + give_error("Out of memory allocating buffer for GTID list"); + return NULL; + } + my_free(gtid_buffer); + gtid_buffer= new_buffer; + gtid_buffer_alloc= count; + return new_buffer; +} + + +Gtid_index_writer::Index_node::Index_node(uint32 level_) + : num_records(0), level(level_), force_spill_page(false) +{ + state.init(); +} + + +Gtid_index_writer::Index_node::~Index_node() +{ + free_pages(); +} + + +uint32 +Gtid_index_writer::write_current_node(uint32 level, bool is_root) +{ + Index_node *n= nodes[level]; + + uint32 node_pos= (uint32)mysql_file_tell(index_file, MYF(0)); + + for (Node_page *p= n->first_page; p ; p= p->next) + { + if (unlikely(is_root)) + *(p->flag_ptr) |= PAGE_FLAG_ROOT; + if (likely(!p->next)) + *(p->flag_ptr) |= PAGE_FLAG_LAST; + int4store(p->page + page_size - CHECKSUM_LEN, + my_checksum(0, p->page, page_size - CHECKSUM_LEN)); + if (mysql_file_write(index_file, p->page, page_size, MYF(MY_NABP))) + { + give_error("Error writing index page"); + return 0; + } + } + + DBUG_ASSERT(node_pos % page_size == 0); + /* Page numbers are +1 just so that zero can denote invalid page pointer. */ + return 1 + (node_pos / (uint32)page_size); +} + + +void +Gtid_index_writer::Index_node::reset() +{ + Index_node_base::reset(); + state.reset_nolock(); + num_records= 0; + force_spill_page= false; +} + + +/* + Make sure there is requested space in the current page, by allocating a + new spill page if necessary. 
+*/
+int
+Gtid_index_writer::reserve_space(Index_node *n, size_t bytes)
+{
+  DBUG_ASSERT(bytes <= page_size);
+  Node_page *cur= n->current_page;
+  if (likely(cur != nullptr))
+  {
+    /* Bytes already consumed in the current page, including its header. */
+    size_t used= (size_t)(n->current_ptr - cur->page);
+    if (likely(used + bytes <= (page_size - CHECKSUM_LEN)))
+      return 0;
+  }
+
+  /* No current page, or it cannot hold BYTES more: chain a fresh page. */
+  Node_page *fresh= alloc_page();
+  n->force_spill_page= false;
+  if (!fresh)
+    return 1;
+
+  /* The page is a continuation page unless it is the first one of the node. */
+  const bool is_first_page= (cur == nullptr);
+  n->current_ptr= init_header(fresh, n->level==0, is_first_page);
+  if (is_first_page)
+    n->first_page= fresh;
+  else
+    cur->next= fresh;
+  n->current_page= fresh;
+  return 0;
+}
+
+
+/*
+  Append one index record (GTID count, binlog offset, then the GTIDs) to the
+  node on LEVEL, spilling to further pages as needed.
+  Returns 0 on success, 1 if space could not be reserved.
+*/
+int
+Gtid_index_writer::do_write_record(uint32 level,
+                                   uint32 event_offset,
+                                   const rpl_gtid *gtid_list,
+                                   uint32 gtid_count)
+{
+  DBUG_ASSERT(level <= max_level);
+  Index_node *node= nodes[level];
+
+  /* Fixed 8-byte record header; it is never split across pages. */
+  if (reserve_space(node, 8))
+    return 1;
+  /* The count is stored biased by one, a stored 0 means "no more records". */
+  int4store(node->current_ptr, gtid_count+1);
+  int4store(node->current_ptr+4, event_offset);
+  node->current_ptr+= 8;
+
+  /* Each GTID takes 16 bytes and is likewise kept within a single page. */
+  for (uint32 idx= 0; idx < gtid_count; ++idx)
+  {
+    const rpl_gtid &gtid= gtid_list[idx];
+    if (reserve_space(node, 16))
+      return 1;
+    int4store(node->current_ptr, gtid.domain_id);
+    int4store(node->current_ptr+4, gtid.server_id);
+    int8store(node->current_ptr+8, gtid.seq_no);
+    node->current_ptr+= 16;
+  }
+
+  ++node->num_records;
+  return 0;
+}
+
+
+/*
+  Add a child pointer to the current node on LEVEL.
+  The first page has node_ptr=1 just so that a zero node_ptr can be used as
+  a no/invalid value (effectively node_ptr points to the end of the target
+  page, in unit of pages).
+
+  Adding a child pointer shouldn't spill to a new page, code must make sure that
+  there is always room for the final child pointer in current non-leaf node.
+*/ +int +Gtid_index_writer::add_child_ptr(uint32 level, my_off_t node_offset) +{ + DBUG_ASSERT(level <= max_level); + DBUG_ASSERT(node_offset > 0); + Index_node *n= nodes[level]; + if (reserve_space(n, 4)) + return 1; + DBUG_ASSERT(n->current_page); + DBUG_ASSERT((size_t)(n->current_ptr - n->current_page->page + 4) <= + page_size - CHECKSUM_LEN); + + int4store(n->current_ptr, node_offset); + n->current_ptr+= 4; + return 0; +} + + +/* + Write one index record to the GTID index, flushing nodes and allocating + new nodes as necessary. +*/ +int +Gtid_index_writer::write_record(uint32 event_offset, + const rpl_gtid *gtid_list, + uint32 gtid_count) +{ + if (error_state) + return 1; /* Avoid continuing on a possibly corrupt state. */ + + uint32 level= 0; + /* + The most frequent case is when there is room in the current page for the + current position to be written, in which case we exit early in the first + iteration of the following loop. + + In the general case, we move up through the path to the root, writing + lower-level node page to disk and adding child pointers in higher-level + nodes, until we reach a node that has room. This final node may be a + freshly allocated new root node in the few times when the height of the + tree increases. + */ + for (;;) + { + Index_node *n= nodes[level]; + if (update_gtid_state(&n->state, gtid_list, gtid_count)) + return give_error("Out of memory updating the local GTID state"); + + if (check_room(level, gtid_count)) + { + /* There is room in the node, just add the index record. */ + return do_write_record(level, event_offset, gtid_list, gtid_count); + } + + /* + This node is full: + - First, write out this node to disk. + - Add a child pointer in the parent node (allocating one if needed). + - On level 0, allocate a new leaf node and add the index record there. + - On levels >0, skip the last index record when the node gets full + (B+-Tree has (k-1) keys for k child pointers). 
+ - Loop to the parent node to add an index record there. + */ + uint32 node_ptr= write_current_node(level, false); + if (!node_ptr) + return 1; + if (alloc_level_if_missing(level+1) || + add_child_ptr(level+1, node_ptr)) + return 1; + uint32 new_count= n->state.count_nolock(); + rpl_gtid *new_gtid_list= gtid_list_buffer(new_count); + if (new_count > 0 && !new_gtid_list) + return 1; + if (n->state.get_gtid_list_nolock(new_gtid_list, new_count)) + return give_error("Internal error processing GTID state"); + n->reset(); + if (level == 0) + { + if (do_write_record(level, event_offset, new_gtid_list, new_count)) + return 1; + } + else + { + /* + Allocate a page for the node. This is mostly to help the reader of hot + index to not see NULL pointers, and we will need the page later anyway + to put at least one child pointer to the level below. + */ + if (reserve_space(n, 4)) + return 1; + } + gtid_list= new_gtid_list; + gtid_count= new_count; + ++level; + } + // NotReached. +} + + +bool +Gtid_index_writer::check_room(uint32 level, uint32 gtid_count) +{ + Index_node *n= nodes[level]; + /* There's always room in an empty (to-be-allocated) page. */ + if (!n->current_page || n->num_records == 0) + return true; + /* + Make sure we use at least 1/2 a page of room after the initial record, + setting a flag to allocate a spill page later if needed. + */ + size_t avail= page_size - CHECKSUM_LEN - (n->current_ptr - n->current_page->page); + if (n->num_records==1 && avail < page_size/2) + { + n->force_spill_page= true; + return true; + } + if (n->force_spill_page) + return true; + size_t needed= 8 + 16*gtid_count; + /* Non-leaf pages need extra 4 bytes for a child pointer. 
*/ + if (level > 0) + needed+= 4; + return needed <= avail; +} + + +int +Gtid_index_writer::alloc_level_if_missing(uint32 level) +{ + if (likely(nodes)) + { + if (likely(max_level >= level)) + return 0; + DBUG_ASSERT(level == max_level+1); // Alloc one at a time + } + + Index_node *node= new Index_node(level); + if (!node) + return give_error("Out of memory allocating new node"); + Index_node **new_nodes= (Index_node **) + my_realloc(key_memory_binlog_gtid_index, nodes, (level+1)*sizeof(*nodes), + MYF(MY_ALLOW_ZERO_PTR|MY_ZEROFILL)); + if (!new_nodes) + { + delete node; + return give_error("Out of memory allocating larger node list"); + } + new_nodes[level]= node; + nodes= new_nodes; + max_level= level; + return 0; +} + + +/* + Initialize the start of a data page. + This is at the start of a page, except for the very first page where it + comes after the global file header. + Format: + 0 flags. + 1-3 unused padding/reserved. + + The argument FIRST denotes if this is the first page (if false it is a + continuation page). +*/ +uchar * +Gtid_index_writer::init_header(Node_page *page, bool is_leaf, bool is_first) +{ + uchar *p= page->page; + bool is_file_header= !file_header_written; + + if (unlikely(is_file_header)) + { + memcpy(p, GTID_INDEX_MAGIC, sizeof(GTID_INDEX_MAGIC)); + p+= sizeof(GTID_INDEX_MAGIC); + *p++= GTID_INDEX_VERSION_MAJOR; + *p++= GTID_INDEX_VERSION_MINOR; + /* Flags/padding currently unused. */ + *p++= 0; + *p++= 0; + int4store(p, page_size); + p+= 4; + DBUG_ASSERT(p == page->page + GTID_INDEX_FILE_HEADER_SIZE); + file_header_written= true; + } + + uchar flags= 0; + if (is_leaf) + flags|= PAGE_FLAG_IS_LEAF; + if (unlikely(!is_first)) + flags|= PAGE_FLAG_IS_CONT; + page->flag_ptr= p; + *p++= flags; + /* Padding/reserved. */ + p+= 3; + DBUG_ASSERT(p == page->page + + (is_file_header ? 
GTID_INDEX_FILE_HEADER_SIZE : 0) +
+              GTID_INDEX_PAGE_HEADER_SIZE);
+  DBUG_ASSERT((size_t)(p - page->page) < page_size - CHECKSUM_LEN);
+  return p;
+}
+
+
+/*
+  Apply GTID_COUNT GTIDs from GTID_LIST to STATE, in order.
+
+  Returns 0 on success, or 1 as soon as update_nolock() fails for one of the
+  GTIDs (the remaining GTIDs are then not applied).
+*/
+int
+Gtid_index_base::update_gtid_state(rpl_binlog_state_base *state,
+                                   const rpl_gtid *gtid_list, uint32 gtid_count)
+{
+  for (uint32 i= 0; i < gtid_count; ++i)
+    if (state->update_nolock(&gtid_list[i]))
+      return 1;
+  return 0;
+}
+
+
+/*
+  Allocate a zero-filled Node_page with page_size bytes of page data following
+  the struct. On allocation failure the error is reported through give_error()
+  and NULL is returned.
+*/
+Gtid_index_base::Node_page *Gtid_index_base::alloc_page()
+{
+  Node_page *new_node= (Node_page *)
+    my_malloc(key_memory_binlog_gtid_index,
+              sizeof(Node_page) + page_size,
+              MYF(MY_ZEROFILL));
+  if (!new_node)
+    give_error("Out of memory for allocating index page");
+  return new_node;
+}
+
+
+/*
+  Log MSG (only for the first error seen by this writer) and put the writer in
+  error state. Always returns 1, so it can be used as "return give_error(..)".
+*/
+int Gtid_index_writer::give_error(const char *msg)
+{
+  if (!error_state)
+  {
+    sql_print_information("Error during binlog GTID index creation, will "
+                          "fallback to slower sequential binlog scan. "
+                          "Error is: %s", msg);
+    error_state= true;
+  }
+  return 1;
+}
+
+
+Gtid_index_reader::Gtid_index_reader()
+  : n(nullptr), index_file(-1),
+    file_open(false), index_valid(false), has_root_node(false),
+    version_major(0), version_minor(0)
+{
+  current_state.init();
+  compare_state.init();
+}
+
+
+Gtid_index_reader::~Gtid_index_reader()
+{
+  if (file_open)
+    mysql_file_close(index_file, MYF(0));
+}
+
+
+/*
+  Search the index by binlog file offset. The result is whatever
+  do_index_search() returns for the offset comparison function.
+*/
+int
+Gtid_index_reader::search_offset(uint32 in_offset,
+                                 uint32 *out_offset, uint32 *out_gtid_count)
+{
+  in_search_offset= in_offset;
+  search_cmp_function= &Gtid_index_reader::search_cmp_offset;
+
+  return do_index_search(out_offset, out_gtid_count);
+}
+
+int
+Gtid_index_reader::search_gtid_pos(slave_connection_state *in_gtid_pos,
+                                   uint32 *out_offset, uint32 *out_gtid_count)
+{
+  in_search_gtid_pos= in_gtid_pos;
+  search_cmp_function= &Gtid_index_reader::search_cmp_gtid_pos;
+
+  int res= do_index_search(out_offset, out_gtid_count);
+  /* Let's not leave a dangling pointer to the caller's memory.
*/ + in_search_gtid_pos= nullptr; + + return res; +} + +rpl_gtid * +Gtid_index_reader::search_gtid_list() +{ + return gtid_buffer; +} + + +int +Gtid_index_reader::search_cmp_offset(uint32 offset, + rpl_binlog_state_base *state) +{ + if (offset <= in_search_offset) + return 0; + else + return -1; +} + + +int +Gtid_index_reader::search_cmp_gtid_pos(uint32 offset, + rpl_binlog_state_base *state) +{ + if (state->is_before_pos(in_search_gtid_pos)) + return 0; + else + return -1; +} + + +int +Gtid_index_reader::next_page() +{ + if (!read_page->next) + return 1; + read_page= read_page->next; + read_ptr= read_page->flag_ptr + 4; + return 0; +} + + +int +Gtid_index_reader::find_bytes(uint32 num_bytes) +{ + if ((my_ptrdiff_t)(read_ptr - read_page->page + num_bytes) <= + (my_ptrdiff_t)(page_size - CHECKSUM_LEN)) + return 0; + return next_page(); +} + + +int +Gtid_index_reader::get_child_ptr(uint32 *out_child_ptr) +{ + if (find_bytes(4)) + return give_error("Corrupt index, short index node"); + *out_child_ptr= (uint32)uint4korr(read_ptr); + read_ptr+= 4; + return 0; +} + + +/* + Read the start of an index record (count of GTIDs in the differential state + and offset). + Returns: + 0 ok + 1 EOF, no more data in this node +*/ +int +Gtid_index_reader::get_offset_count(uint32 *out_offset, uint32 *out_gtid_count) +{ + if (find_bytes(8)) + return 1; + uint32 gtid_count= uint4korr(read_ptr); + if (gtid_count == 0) + { + /* 0 means invalid/no record (we store N+1 for N GTIDs in record). 
*/
+    return 1;
+  }
+  *out_gtid_count= gtid_count - 1;
+  *out_offset= uint4korr(read_ptr + 4);
+  read_ptr+= 8;
+  return 0;
+}
+
+
+/*
+  Read COUNT GTIDs (16 bytes each) from the current read position into
+  OUT_GTID_LIST, moving on to the next page of the node when needed.
+  Returns 0 on success, 1 (after reporting the error) if the node ends early.
+*/
+int
+Gtid_index_reader::get_gtid_list(rpl_gtid *out_gtid_list, uint32 count)
+{
+  for (uint32 i= 0; i < count; ++i)
+  {
+    if (find_bytes(16))
+      return give_error("Corrupt index, short index node");
+    out_gtid_list[i].domain_id= uint4korr(read_ptr);
+    out_gtid_list[i].server_id= uint4korr(read_ptr + 4);
+    out_gtid_list[i].seq_no= uint8korr(read_ptr + 8);
+    read_ptr+= 16;
+  }
+  return 0;
+}
+
+
+/*
+  Open the index file belonging to BINLOG_FILENAME (closing any previously
+  open index file first) and read its header.
+  Returns 0 on success, 1 if the file cannot be opened or the header cannot
+  be read.
+*/
+int
+Gtid_index_reader::open_index_file(const char *binlog_filename)
+{
+  close_index_file();
+  build_index_filename(binlog_filename);
+  if ((index_file= mysql_file_open(key_file_gtid_index, index_file_name,
+                                   O_RDONLY|O_BINARY, MYF(0))) < 0)
+    return 1;    // No error for missing index (eg. upgrade)
+
+  file_open= true;
+  if (read_file_header())
+    return 1;
+
+  return 0;
+}
+
+/* Close the index file, if one is open, and mark the index as not valid. */
+void
+Gtid_index_reader::close_index_file()
+{
+  if (!file_open)
+    return;
+  mysql_file_close(index_file, MYF(0));
+  file_open= false;
+  index_valid= false;
+}
+
+
+int
+Gtid_index_reader::do_index_search(uint32 *out_offset, uint32 *out_gtid_count)
+{
+  /* In cold index, we require a complete index with a valid root node. */
+  if (!has_root_node)
+    return -1;
+
+  return do_index_search_root(out_offset, out_gtid_count);
+}
+
+
+/*
+  Descend from the root node to the leaf node that covers the search position,
+  then finish the search in that leaf with do_index_search_leaf().
+
+  Returns -1 on error (I/O error, corrupt index, out of memory), otherwise the
+  result of do_index_search_leaf().
+*/
+int
+Gtid_index_reader::do_index_search_root(uint32 *out_offset,
+                                        uint32 *out_gtid_count)
+{
+  current_state.reset_nolock();
+  compare_state.reset_nolock();
+  /*
+    These states will be initialized to the full state stored at the start of
+    the root node and then incrementally updated.
+  */
+  bool current_state_updated= false;
+
+  if (read_root_node())
+    return -1;
+  for (;;)
+  {
+    if (*n->first_page->flag_ptr & PAGE_FLAG_IS_LEAF)
+      break;
+
+    if (compare_state.load_nolock(&current_state))
+    {
+      give_error("Out of memory allocating GTID list");
+      return -1;
+    }
+    uint32 child_ptr;
+    if (get_child_ptr(&child_ptr))
+      return -1;
+
+    /* Scan over the keys in the node to find the child pointer to follow */
+    for (;;)
+    {
+      uint32 offset, gtid_count;
+      int res= get_offset_count(&offset, &gtid_count);
+      if (res == 1)  // EOF?
+      {
+        /* Follow the right-most child pointer. */
+        if (read_node(child_ptr))
+          return -1;
+        break;
+      }
+      rpl_gtid *gtid_list= gtid_list_buffer(gtid_count);
+      uint32 child2_ptr;
+      if ((gtid_count > 0 && !gtid_list) ||
+          get_gtid_list(gtid_list, gtid_count) ||
+          get_child_ptr(&child2_ptr))
+        return -1;
+      if (update_gtid_state(&compare_state, gtid_list, gtid_count))
+        return -1;
+      int cmp= (this->*search_cmp_function)(offset, &compare_state);
+      if (cmp < 0)
+      {
+        /* Follow the left child of this key. */
+        if (read_node(child_ptr))
+          return -1;
+        break;
+      }
+      /*
+        Continue to scan the next key. A failed update would leave
+        current_state incomplete, so treat it as an error like for
+        compare_state above instead of returning a wrong position.
+      */
+      if (update_gtid_state(&current_state, gtid_list, gtid_count))
+        return -1;
+      current_state_updated= true;
+      current_offset= offset;
+      child_ptr= child2_ptr;
+    }
+  }
+  return do_index_search_leaf(current_state_updated,
+                              out_offset, out_gtid_count);
+}
+
+/*
+  Search the leaf node that is currently loaded for the last key that is not
+  after the search position.
+
+  CURRENT_STATE_UPDATED tells whether current_state already includes keys seen
+  in interior nodes on the way down.
+
+  Returns:
+    -1  error
+     0  the search position is before the start of the index
+     1  found; *OUT_OFFSET and *OUT_GTID_COUNT are set, and the GTID list is
+        stored in the shared GTID list buffer.
+*/
+int Gtid_index_reader::do_index_search_leaf(bool current_state_updated,
+                                            uint32 *out_offset,
+                                            uint32 *out_gtid_count)
+{
+  uint32 offset, gtid_count;
+  int res= get_offset_count(&offset, &gtid_count);
+  if (res == 1)
+  {
+    DBUG_ASSERT(0);
+    give_error("Corrupt index; empty leaf node");
+    return -1;
+  }
+  rpl_gtid *gtid_list= gtid_list_buffer(gtid_count);
+  if ((gtid_count > 0 && !gtid_list) ||
+      get_gtid_list(gtid_list, gtid_count))
+    return -1;
+  /*
+    The first key is ignored (already included in the current state), unless
+    it is the very first state in the index.
+  */
+  if (!current_state_updated &&
+      update_gtid_state(&current_state, gtid_list, gtid_count))
+    return -1;
+  current_offset= offset;
+  if (compare_state.load_nolock(&current_state))
+  {
+    give_error("Out of memory allocating GTID state");
+    return -1;
+  }
+  int cmp= (this->*search_cmp_function)(offset, &compare_state);
+  if (cmp < 0)
+    return 0;  // Search position is before start of index.
+
+  /* Scan over the keys in the leaf node. */
+  for (;;)
+  {
+    res= get_offset_count(&offset, &gtid_count);
+    if (res == 1)  // EOF?
+    {
+      /* Reached end of leaf, last key is the one searched for. */
+      break;
+    }
+    gtid_list= gtid_list_buffer(gtid_count);
+    if ((gtid_count > 0 && !gtid_list) ||
+        get_gtid_list(gtid_list, gtid_count))
+      return -1;
+    if (update_gtid_state(&compare_state, gtid_list, gtid_count))
+      return -1;
+    cmp= (this->*search_cmp_function)(offset, &compare_state);
+    if (cmp < 0)
+    {
+      /* Next key is larger, so current state is the one searched for. */
+      break;
+    }
+    /* Do not return a state that is missing GTIDs if the update fails. */
+    if (update_gtid_state(&current_state, gtid_list, gtid_count))
+      return -1;
+    current_offset= offset;
+  }
+
+  *out_offset= current_offset;
+  *out_gtid_count= current_state.count_nolock();
+  /* Save the result in the shared gtid list buffer. */
+  if ((!(gtid_list= gtid_list_buffer(*out_gtid_count)) && *out_gtid_count > 0) ||
+      current_state.get_gtid_list_nolock(gtid_list, *out_gtid_count))
+    return -1;
+
+  return 1;
+}
+
+
+/*
+  Read the file header and check that it's valid and that the format is not
+  too new a version for us to be able to read it.
+*/
+/*
+  On success, sets page_size, version_major, version_minor, has_root_node and
+  index_valid, and returns 0. Returns 1 if no file is open or on any error.
+*/
+int
+Gtid_index_reader::read_file_header()
+{
+  if (!file_open)
+    return 1;
+
+  uchar buf[GTID_INDEX_FILE_HEADER_SIZE + GTID_INDEX_PAGE_HEADER_SIZE];
+
+  if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, 0, MY_SEEK_SET, MYF(0)) ||
+      mysql_file_read(index_file, buf,
+                      GTID_INDEX_FILE_HEADER_SIZE + GTID_INDEX_PAGE_HEADER_SIZE,
+                      MYF(MY_NABP)))
+    return give_error("Error reading page from index file");
+  if (memcmp(&buf[0], GTID_INDEX_MAGIC, sizeof(GTID_INDEX_MAGIC)))
+    return give_error("Corrupt index file, magic not found in header");
+  version_major= buf[4];
+  version_minor= buf[5];
+  /* We cannot safely read a major version we don't know about. */
+  if (version_major > GTID_INDEX_VERSION_MAJOR)
+    return give_error("Incompatible index file, version too high");
+  page_size= uint4korr(&buf[8]);
+  /*
+    The page size comes from the file and is not yet covered by a verified
+    checksum. It must at least hold the headers already read plus the
+    checksum, otherwise the size computations below would underflow.
+  */
+  if (page_size < sizeof(buf) + CHECKSUM_LEN)
+    return give_error("Corrupt index file, invalid page size");
+
+  /* Verify checksum integrity of page_size and major/minor version. */
+  uint32 crc= my_checksum(0, buf, sizeof(buf));
+  uchar *buf3= (uchar *)
+    my_malloc(key_memory_binlog_gtid_index, page_size - sizeof(buf), MYF(0));
+  if (!buf3)
+    return give_error("Error allocating memory for index page");
+  int res= 0;
+  if (mysql_file_read(index_file, buf3, page_size - sizeof(buf), MYF(MY_NABP)))
+    res= give_error("Error reading page from index file");
+  else
+  {
+    crc= my_checksum(crc, buf3, page_size - sizeof(buf) - CHECKSUM_LEN);
+    if (crc != uint4korr(buf3 + page_size - sizeof(buf) - CHECKSUM_LEN))
+      res= give_error("Corrupt page, invalid checksum");
+  }
+  my_free(buf3);
+  if (res)
+    return res;
+
+  /*
+    Check that there is a valid root node at the end of the file.
+    If there is not, the index may be a "hot index" that is currently being
+    constructed. Or it was only partially written before server crash and not
+    recovered for some reason.
+  */
+  /*
+    On the first page the page header (and thus the flags byte) comes after
+    the file header, so the flags are at offset GTID_INDEX_FILE_HEADER_SIZE.
+  */
+  uchar flags= buf[GTID_INDEX_FILE_HEADER_SIZE];
+  constexpr uchar needed_flags= PAGE_FLAG_ROOT|PAGE_FLAG_LAST;
+  if ((flags & needed_flags) == needed_flags)
+  {
+    /* Special case: the index is a single page, which is the root node. */
+    has_root_node= true;
+  }
+  else
+  {
+    my_off_t last_page_pos= mysql_file_seek(index_file, -(int32)page_size,
+                                            MY_SEEK_END, MYF(0));
+    if (MY_FILEPOS_ERROR == last_page_pos)
+      return give_error("Error reading root page from index file");
+    if (last_page_pos == 0)
+    {
+      /*
+        The last page is the first page, whose flags were checked above and
+        found not to be a root node. Its first bytes are the file magic, not
+        page flags, so do not interpret them as such.
+      */
+      has_root_node= false;
+    }
+    else
+    {
+      uchar buf2[GTID_INDEX_PAGE_HEADER_SIZE];
+      if (mysql_file_read(index_file, buf2, GTID_INDEX_PAGE_HEADER_SIZE,
+                          MYF(MY_NABP)))
+        return give_error("Error reading root page from index file");
+      flags= buf2[0];
+      has_root_node= ((flags & needed_flags) == needed_flags);
+      /* No need to verify checksum here, will be done by read_root_node(). */
+    }
+  }
+  index_valid= true;
+  return 0;
+}
+
+
+/*
+  Compare the checksum stored in the last CHECKSUM_LEN bytes of PAGE against
+  the checksum computed over the rest of the page.
+  Returns 0 if they match, 1 (after reporting the error) if not.
+*/
+int
+Gtid_index_reader::verify_checksum(Gtid_index_base::Node_page *page)
+{
+  uint32 calc_checksum= my_checksum(0, page->page, page_size - CHECKSUM_LEN);
+  uint32 read_checksum= uint4korr(page->page + page_size - CHECKSUM_LEN);
+  if (calc_checksum != read_checksum)
+    return give_error("Corrupt page, invalid checksum");
+  return 0;
+}
+
+
+/*
+  Allocate a page and fill it with the next page_size bytes from the current
+  position in the index file, verifying the page checksum.
+  Returns the page, or nullptr on error (the page is freed in that case).
+*/
+Gtid_index_base::Node_page *
+Gtid_index_reader::alloc_and_read_page()
+{
+  Node_page *page= alloc_page();
+  if (!page)
+  {
+    give_error("Error allocating memory for index page");
+    return nullptr;
+  }
+  if (mysql_file_read(index_file, page->page, page_size, MYF(MY_NABP)))
+  {
+    my_free(page);
+    give_error("Error reading page from index file");
+    return nullptr;
+  }
+  if (verify_checksum(page))
+  {
+    my_free(page);
+    return nullptr;
+  }
+  return page;
+}
+
+
+int
+Gtid_index_reader::read_root_node()
+{
+  if (!index_valid || !has_root_node)
+    return 1;
+
+  cold_node.reset();
+  n= &cold_node;
+  /*
+    Read pages one by one from the back of the file until we have a complete
+    root node.
+ */ + if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)page_size, + MY_SEEK_END, MYF(0))) + return give_error("Error seeking index file"); + + for (;;) + { + Node_page *page= alloc_and_read_page(); + if (!page) + return 1; + if (mysql_file_tell(index_file, MYF(0)) == page_size) + page->flag_ptr= &page->page[GTID_INDEX_FILE_HEADER_SIZE]; + else + page->flag_ptr= &page->page[0]; + page->next= n->first_page; + n->first_page= page; + uchar flags= *page->flag_ptr; + if (unlikely(!(flags & PAGE_FLAG_ROOT))) + return give_error("Corrupt or truncated index, no root node found"); + if (!(flags & PAGE_FLAG_IS_CONT)) + break; // Found start of root node + if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)(2*page_size), + MY_SEEK_CUR, MYF(0))) + return give_error("Error seeking index file for multi-page root node"); + } + + read_page= n->first_page; + read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE; + return 0; +} + + +int +Gtid_index_reader::read_node(uint32 page_ptr) +{ + DBUG_ASSERT(page_ptr != 0 /* No zero child pointers in on-disk pages. */); + if (!index_valid || !page_ptr) + return 1; + return read_node_cold(page_ptr); +} + + +int +Gtid_index_reader::read_node_cold(uint32 page_ptr) +{ + if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, (page_ptr-1)*page_size, + MY_SEEK_SET, MYF(0))) + return give_error("Error seeking index file"); + + bool file_header= (page_ptr == 1); + cold_node.reset(); + n= &cold_node; + Node_page **next_ptr_ptr= &n->first_page; + for (;;) + { + Node_page *page= alloc_and_read_page(); + if (!page) + return 1; + page->flag_ptr= &page->page[file_header ? GTID_INDEX_FILE_HEADER_SIZE : 0]; + file_header= false; + /* Insert the page at the end of the list. 
*/ + page->next= nullptr; + *next_ptr_ptr= page; + next_ptr_ptr= &page->next; + + uchar flags= *(page->flag_ptr); + if (flags & PAGE_FLAG_LAST) + break; + } + + read_page= n->first_page; + read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE; + return 0; +} + + +int Gtid_index_reader::give_error(const char *msg) +{ + sql_print_information("Error reading binlog GTID index, will " + "fallback to slower sequential binlog scan. " + "Error is: %s", msg); + return 1; +} + + +Gtid_index_reader_hot::Gtid_index_reader_hot() + : hot_writer(nullptr) +{ +} + + +int +Gtid_index_reader_hot::get_child_ptr(uint32 *out_child_ptr) +{ + if (find_bytes(4)) + { + /* + If reading hot index, EOF or zero child ptr means the child pointer has + not yet been written. A zero out_child_ptr makes read_node() read the + hot node for the child. + */ + if (hot_writer) + { + *out_child_ptr= 0; + return 0; + } + return give_error("Corrupt index, short index node"); + } + *out_child_ptr= (uint32)uint4korr(read_ptr); + read_ptr+= 4; + return 0; +} + + +int +Gtid_index_reader_hot::do_index_search(uint32 *out_offset, + uint32 *out_gtid_count) +{ + /* Check for a "hot" index. */ + Gtid_index_writer::lock_gtid_index(); + hot_writer= Gtid_index_writer::find_hot_index(index_file_name); + if (!hot_writer) + { + Gtid_index_writer::unlock_gtid_index(); + /* + Check the index file header (and index end) again, in case it was + hot when open_index_file() was called, but became cold in the meantime. 
+ */ + if (!has_root_node && Gtid_index_reader::read_file_header()) + return -1; + } + + int res= do_index_search_root(out_offset, out_gtid_count); + + if (hot_writer) + { + hot_writer= nullptr; + Gtid_index_writer::unlock_gtid_index(); + } + return res; +} + + +int +Gtid_index_reader_hot::read_file_header() +{ + if (!file_open) + return 1; + + Gtid_index_writer::lock_gtid_index(); + hot_writer= Gtid_index_writer::find_hot_index(index_file_name); + if (!hot_writer) + Gtid_index_writer::unlock_gtid_index(); + + int res; + if (hot_writer && hot_writer->max_level == 0) + { + /* + No pages from the hot index have been written to disk, there's just a + single incomplete node at level 0. + We have to read the file header from the in-memory page. + */ + uchar *p= hot_writer->nodes[0]->first_page->page; + page_size= uint4korr(p + 8); + has_root_node= false; + index_valid= true; + res= 0; + } + else + res= Gtid_index_reader::read_file_header(); + + if (hot_writer) + { + hot_writer= nullptr; + Gtid_index_writer::unlock_gtid_index(); + } + return res; +} + + +int +Gtid_index_reader_hot::read_root_node() +{ + if (!index_valid) + return 1; + + if (hot_writer) + { + hot_level= hot_writer->max_level; + return read_node_hot(); + } + if (has_root_node) + { + return Gtid_index_reader::read_root_node(); + } + return 1; +} + + +int +Gtid_index_reader_hot::read_node(uint32 page_ptr) +{ + if (!index_valid || (!page_ptr && !hot_writer)) + return 1; + + if (hot_writer) + { + if (!page_ptr) + { + /* + The "hot" index is only partially written. Not yet written child pages + are indicated by zero child pointers. Such child pages are found from + the list of active nodes in the writer. + */ + if (hot_level <= 0) + { + DBUG_ASSERT(0 /* Should be no child pointer to follow on leaf page. 
*/); + return give_error("Corrupt hot index (child pointer on leaf page"); + } + DBUG_ASSERT(n == hot_writer->nodes[hot_level]); + --hot_level; + return read_node_hot(); + } + + /* + We started searching the "hot" index, but now we've reached a "cold" + part of the index that's already fully written. So leave the "hot index" + mode and continue reading pages from the on-disk index from here. + */ + hot_writer= nullptr; + Gtid_index_writer::unlock_gtid_index(); + } + + return read_node_cold(page_ptr); +} + + +int +Gtid_index_reader_hot::read_node_hot() +{ + if (hot_writer->error_state) + return give_error("Cannot access hot index"); + n= hot_writer->nodes[hot_level]; + read_page= n->first_page; + /* The writer should allocate pages for all nodes. */ + DBUG_ASSERT(read_page != nullptr); + if (!read_page) + return give_error("Page not available in hot index"); + read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE; + return 0; +} diff --git a/sql/gtid_index.h b/sql/gtid_index.h new file mode 100644 index 00000000000..4674bf97c06 --- /dev/null +++ b/sql/gtid_index.h @@ -0,0 +1,521 @@ +/* + Copyright (c) 2023 Kristian Nielsen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +*/ + +#ifndef GTID_INDEX_H +#define GTID_INDEX_H + +#include "my_global.h" +#include "mysqld.h" +#include "mariadb.h" +#include "rpl_gtid.h" + +/* + This implements an on-disk index for each binlog file to speed up access to + the binlog at a specific offset or GTID position. This is primarily used when + a slave connects to the master, but also by user calling BINLOG_GTID_POS(). + + A connecting slave first scans the binlog files to find the last one with an + initial GTID_LIST event that lies before the starting GTID position. Then a + sequential scan of the binlog file is done until the requested GTID position + is found. + + The binlog index conceptually extends this using index records corresponding + to different offset within one binlog file. Each record functions as if it + was the initial GTID_LIST event of a new binlog file, allowing the + sequential scan to start from the corresponding position. By having + sufficiently many index records, the scan will be fast. + + The code that adds one record to the index is in two parts, a "sync" path + and an "async" path. The "sync" path in process_gtid_check_batch() does the + minimum amount of work which needs to run as part of transaction commit. The + actual writing of the index, in async_update(), can then be done as a + background task, minimizing the performance impact on transaction processing. + The "sync" and "async" paths each run single threaded, but can execute in + parallel with each other. + + The index file is written incrementally together with the binlog file. + However there is no fsync()'s of the index file needed while writing. A + partially written index left by a crashing server will be re-written during + binlog recovery. 
A reader is allowed to use the index as it is being written + (for the "hot" binlog file); such access is protected by mutex. + + In case of lost or corrupt index, fallback to full sequential scan is done + (so performance will be affected but not correct functionality). + + The index file is structured like a B+-tree. The index is append-only, so + also resembles a log-structured merge-tree, but with no merging of levels + needed as it covers a single fixed-size binlog file. This makes the building + of the tree relatively simple. + + Keys in the tree consist of a GTID state (corresponding to a GTID_LIST + event) and the associated binlog file offset. All keys (except the first key + in each level of the tree) are delta-compressed to save space, holding only + the (domain_id, server_id) pairs that differ from the previous record. + + The file is page-based. The first page contains the leftmost leaf node, and + the root node is at the end of the file. An incompletely written index file + can be detected by the last page in the file not being a root node page. + Nodes in the B+-tree usually fit in one page, but a node can be split across + multiple pages if GTID states are very large. + + Page format: + + The first page contains an extra file header: + + Offset Size Description + 0 4 MAGIC header identifying the file as a binlog index + 4 1 Major version number. A new major version of the file format + is not readable by older server versions. + 5 1 Minor version number. Formats differing only in minor version + are backwards compatible and can be read by older servers. + 6 2 Padding/unused. + 8 4 Page size. + + Each page additionally contains this header: + + Offset Size Description + 0 1 Flags + 1 3 Padding/unused + + The last 4 bytes of each page is a 32-bit CRC. + + An interior node is a sequence of + <child ptr> <key> <child ptr> <key> ... <key> <child ptr> + while a leaf node has only keys. + + A child pointer is stored as 4 byte integer. The first page is 1, so that + 0 can be used to denote "not present". 
+ + Format of a key: + + Offset Size Description + 0 4 Number of GTIDs in the key, plus 1. Or 0 for EOF. + 4 4 Binlog file offset + 8 4 Domain_id of first GTID + 12 4 Server_id of first GTID + 16 8 Seq_no of first GTID + ... and so on for each GTID in the key. + + A node typically fits in one page. But if the GTID state is very big (or + the page size very small), multiple pages may be used. When a node is split, + it can be split after a child pointer or before or after a GTID, but not + elsewhere. + +Here is an example GTID index with page_size=64 containing 3 records: + Offset GTID state + 0x11d [empty] + 0x20e [0-1-1] + 0x2ad [0-1-2] +The example contains 3 nodes, each stored in a single page. Two leaf nodes and +one interior root node. + +Page 1 (leaf node page with file header): + fe fe 0c 01 "magic" identifying the file as a binlog GTID index + 01 00 Major version 1, minor version 0 + 00 00 Padding / currently unused + 40 00 00 00 Page size (64 bytes in this example) + 05 Flag PAGE_FLAG_IS_LEAF | PAGE_FLAG_LAST (single-page leaf node) + 00 00 00 Padding / currently unused +Key 1: + 01 00 00 00 GTID_count+1 = 1 (entry has zero GTIDs in it) + 1d 01 00 00 Binlog file offset = 0x11d + [Empty GTID state at the very start of the binlog] +Key 2: + 02 00 00 00 GTID_count = 1 + 0e 02 00 00 Binlog file offset = 0x20e + 00 00 00 00 01 00 00 00 + 01 00 00 00 00 00 00 00 GTID 0-1-1 + + 00 00 00 00 00 Zero denotes end-of-node + 00 00 00 00 00 00 00 (Unused space in the page) + 0e 4f ac 43 Checksum / CRC + +Page 2 (leaf node): + 05 Flag PAGE_FLAG_IS_LEAF | PAGE_FLAG_LAST (single-page leaf node) + 00 00 00 Unused +Key 1: + 02 00 00 00 GTID_count = 1 + ad 02 00 00 Binlog file offset = 0x2ad + 00 00 00 00 01 00 00 00 + 02 00 00 00 00 00 00 00 GTID 0-1-2 + + 00 00 00 00 End-of-node + 00 00 00 00 00 00 00 00 + 00 00 00 00 00 00 00 00 + 00 00 00 00 00 00 00 00 + 00 00 00 00 (Unused space in the page) + 0c 4e c2 b9 CRC + +Page 3 (root node): + 0c PAGE_FLAG_ROOT | PAGE_FLAG_LAST (interior root 
node) + 00 00 00 Unused +Child pointer: + 01 00 00 00 Pointer to page 1 +Key for next child page: + 02 00 00 00 GTID_count = 1 + ad 02 00 00 Binlog offset = 0x2ad + 00 00 00 00 01 00 00 00 + 02 00 00 00 00 00 00 00 GTID 0-1-2 +Child pointer: + 02 00 00 00 Pointer to page 2 + + 00 00 00 00 Zero denotes end-of-node + 00 00 00 00 00 00 00 00 + 00 00 00 00 00 00 00 00 + 00 00 00 00 (Unused) + 81 55 a3 c7 CRC + + Below is an example of the logical B-Tree structure of a larger GTID index + with a total of 12 keys. + + We use S0, S1, ..., S11 to denote a key, which consists of a GTID state (as + seen in @@binlog_gtid_state and GTID_LIST_EVENT) and the associated binlog + file offset. D1, D2, ..., D11 denote the same keys, but delta-compressed, so + that D1 stores only those GTIDs that are not the same as in S0. + + Pages are denoted by P1, P2, ..., P8. In the example, P1, P2, P3, P5, and P6 + are leaf pages, the rest are interior node pages. P8 is the root node (the + root is always the last page in the index). + + The contents of each page is listed in square brackets [...]. So P1[S0 D1 D2] + is a leaf page with 3 keys, and P7[P5 <D10+D11> P6] is an interior node page + with one key and two child-page pointers to P5 and P6. The <D10+D11> + notation denotes the delta-compression of key S11 relative to S9; + all GTIDs in S11 that are not present in S9. In the code, this is computed + by combining D10 and D11, hence the use of the notation "D10+D11" instead of + the equivalent "S11-S9". + + Here is the example B-Tree. It has 3 levels, with the leaf nodes at the top: + + P1[S0 D1 D2] P2[D3 D4 D5] P3[D6 D7 D8] P5[D9 D10] P6[D11] + P4[P1 <S3> P2 <D4+D5+D6> P3] P7[P5 <D10+D11> P6] + P8[P4 <S9> P7] + + To find eg. S4, we start from the root P8. S4<S9, so we follow the left child + pointer to P4. S4>S3 and S4<S6 (the key D4+D5+D6 is S6 delta-compressed + relative to S3), so we follow the child pointer to the leaf page P2. + + Here are the operations that occur while writing the example index file. + The left column is the GTID state added (<EOF> is when the index + is closed at binlog rotation).
The right column are the operations performed, + as follows: + alloc(p) Allocate page p + add_key(p,k) Insert the key k into the page p + add_ptr(p,q) Insert a pointer to child page q in parent page p + write(p) Write out page p to disk at the end of the index file. + + GTID STATE OPERATIONS + S0 alloc(P1) add_key(P1,S0) + D1 add_key(P1,D1) + D2 add_key(P1,D2) + D3 write(P1) alloc(P4) add_ptr(P4,P1) + alloc(P2) add_key(P2,D3) add_key(P4,S3) + D4 add_key(P2,D4) + D5 add_key(P2,D5) + D6 write(P2) add_ptr(P4,P2) + alloc(P3) add_key(P3,D6) add_key(P4,D4+D5+D6) + D7 add_key(P3,D7) + D8 add_key(P3,D8) + D9 write(P3) add_ptr(P4,P3) + alloc(P5) add_key(P5,D9) + write(P4) alloc(P8) add_ptr(P8,P4) alloc(P7) add_key(P8,S9) + D10 add_key(P5,D10) + D11 write(P5) add_ptr(P7,P5) + alloc(P6) add_key(P6,D11) add_key(P7,D10+D11) + write(P6) add_ptr(P7,P6) + write(P7) add_ptr(P8,P7) + write(P8) + + After adding each record to the index, there is exactly one partial page + allocated in-memory for each level present in the B-Tree; new pages being + allocated as old pages fill up and are written to disk. +*/ + + +class Gtid_index_base +{ +public: + /* +4 for ".idx" prefix. */ + static constexpr size_t GTID_INDEX_FILENAME_MAX_SIZE= FN_REFLEN+4; + +protected: + enum enum_page_flags { + /* Set for a leaf node page, cleared for an interior node page. */ + PAGE_FLAG_IS_LEAF= 1, + /* This is a continuation page. */ + PAGE_FLAG_IS_CONT= 2, + /* No continuation page follows (the last page in a group). */ + PAGE_FLAG_LAST= 4, + /* + Flag set to mark the root node. (The root node is normally the last page + in the index file, but having an explicit flag allows us to detect a + partially written index file with the root node missing. + */ + PAGE_FLAG_ROOT= 8, + }; + + /* + Minor version increment represents a backwards-compatible format (can be + read by any server version that knows the format of the major version). 
+ Major version increment means a server should not attempt to read from the + index. + */ + static constexpr uchar GTID_INDEX_VERSION_MAJOR= 1; + static constexpr uchar GTID_INDEX_VERSION_MINOR= 0; + static constexpr size_t GTID_INDEX_FILE_HEADER_SIZE= 12; + static constexpr size_t GTID_INDEX_PAGE_HEADER_SIZE= 4; + static constexpr size_t CHECKSUM_LEN= 4; + +#ifdef _MSC_VER +/* + Flexible array member is part of C99, but it is not standard in C++. + All the compilers and platforms we support do support it, though. + Just we need to disable on Windows a warning about using a non-standard + C++ extension. +*/ +#pragma warning(disable : 4200) +#endif + struct Node_page + { + Node_page *next; + /* Pointer to allow to update the "flags" byte at page writeout. */ + uchar *flag_ptr; + /* Flexible array member; will be allocated to opt_gtid_index_page_size. */ + uchar page[]; + }; + + struct Index_node_base + { + Node_page *first_page; + Node_page *current_page; + /* The current_ptr is only valid if current_page != 0. */ + uchar *current_ptr; + + Index_node_base(); + ~Index_node_base(); + void free_pages(); + void reset(); + }; + +public: + static void make_gtid_index_file_name(char *out_name, size_t bufsize, + const char *base_filename); + +protected: + int update_gtid_state(rpl_binlog_state_base *state, + const rpl_gtid *gtid_list, uint32 gtid_count); + Node_page *alloc_page(); + rpl_gtid *gtid_list_buffer(uint32 count); + void build_index_filename(const char *filename); + virtual int give_error(const char *msg) = 0; + + /* + A buffer to hold a gtid_list temporarily. + Increased as needed to hold largest needed list. 
+ */ + rpl_gtid *gtid_buffer; + uint32 gtid_buffer_alloc; + size_t page_size; +public: + char index_file_name[GTID_INDEX_FILENAME_MAX_SIZE]; + +protected: + Gtid_index_base(); + virtual ~Gtid_index_base(); +}; + + +class Gtid_index_writer : public Gtid_index_base +{ +private: + const my_off_t offset_min_threshold; + + struct Index_node : public Index_node_base + { + rpl_binlog_state_base state; + uint32 num_records; + uint32 level; + bool force_spill_page; + + Index_node(uint32 level_); + ~Index_node(); + void reset(); + }; + +public: + static void gtid_index_init(); + static void gtid_index_cleanup(); +protected: + friend class Gtid_index_reader_hot; + static void lock_gtid_index() { mysql_mutex_lock(>id_index_mutex); } + static void unlock_gtid_index() { mysql_mutex_unlock(>id_index_mutex); } + static const Gtid_index_writer *find_hot_index(const char *file_name); + +public: + Gtid_index_writer(const char *filename, uint32 offset, + rpl_binlog_state_base *binlog_state, + uint32 opt_page_size, my_off_t opt_span_min); + virtual ~Gtid_index_writer(); + void process_gtid(uint32 offset, const rpl_gtid *gtid); + int process_gtid_check_batch(uint32 offset, const rpl_gtid *gtid, + rpl_gtid **out_gtid_list, + uint32 *out_gtid_count); + int async_update(uint32 event_offset, rpl_gtid *gtid_list, uint32 gtid_count); + void close(); + +private: + void insert_in_hot_index(); + void remove_from_hot_index(); + uint32 write_current_node(uint32 level, bool is_root); + int reserve_space(Index_node *n, size_t bytes); + int do_write_record(uint32 level, uint32 event_offset, + const rpl_gtid *gtid_list, uint32 gtid_count); + int add_child_ptr(uint32 level, my_off_t node_offset); + int write_record(uint32 event_offset, const rpl_gtid *gtid_list, + uint32 gtid_count); + bool check_room(uint32 level, uint32 gtid_count); + int alloc_level_if_missing(uint32 level); + uchar *init_header(Node_page *page, bool is_leaf, bool is_first); + int give_error(const char *msg) override; + + static 
mysql_mutex_t gtid_index_mutex; + static Gtid_index_writer *hot_index_list; + + rpl_binlog_state_base pending_state; + /* Next pointer for the hot_index_list linked list. */ + Gtid_index_writer *next_hot_index; + /* The currently being built index nodes, from leaf[0] to root[max_level]. */ + Index_node **nodes; + my_off_t previous_offset; + uint32 max_level; + + File index_file; + + /* + This is set if we encounter an error (such as out-of-memory or I/O error). + Then we will no longer do any updates to the index, to prevent leaving a + corrupt index. This is not fatal; the partial index will work up to where + it got the error, and the code can fall-back to sequential scan of the + binlog. + */ + bool error_state; + /* Flag to help put the file header at the start of the very first page. */ + bool file_header_written; + /* Flag set while this object is visible in the "hot index" list. */ + bool in_hot_index_list; +}; + + +class Gtid_index_reader : public Gtid_index_base +{ +public: + Gtid_index_reader(); + virtual ~Gtid_index_reader(); + + int open_index_file(const char *binlog_filename); + void close_index_file(); + /* + The search functions take either a binlog offset or GTID position to search + for. They return: + 0 for "not found" (searched position is earlier than start of index). + 1 for "found" + -1 for error. + When found, the returned position is the last position in the index that + lies at or before the searched position. The offset of the returned + position is written to *out_offset. The number of GTIDs in the returned + GTID state is written to *out_gtid_count; the list of found GTIDs can be + accessed with search_gtid_list() and is valid only until next search or + freeing of the Gtid_index_reader object. 
+ */ + int search_offset(uint32 in_offset, uint32 *out_offset, + uint32 *out_gtid_count); + int search_gtid_pos(slave_connection_state *in_gtid_pos, uint32 *out_offset, + uint32 *out_gtid_count); + rpl_gtid *search_gtid_list(); + +protected: + int search_cmp_offset(uint32 offset, rpl_binlog_state_base *state); + int search_cmp_gtid_pos(uint32 offset, rpl_binlog_state_base *state); + virtual int do_index_search(uint32 *out_offset, uint32 *out_gtid_count); + int do_index_search_root(uint32 *out_offset, uint32 *out_gtid_count); + int do_index_search_leaf(bool current_state_updated, + uint32 *out_offset, uint32 *out_gtid_count); + int next_page(); + int find_bytes(uint32 num_bytes); + virtual int get_child_ptr(uint32 *out_child_ptr); + int get_offset_count(uint32 *out_offset, uint32 *out_gtid_count); + int get_gtid_list(rpl_gtid *out_gtid_list, uint32 count); + virtual int read_file_header(); + int verify_checksum(Node_page *page); + Node_page *alloc_and_read_page(); + virtual int read_root_node(); + virtual int read_node(uint32 page_ptr); + int read_node_cold(uint32 page_ptr); + int give_error(const char *msg) override; + + rpl_binlog_state_base current_state; + rpl_binlog_state_base compare_state; + Index_node_base cold_node; + /* n points to either cold node or hot node in writer. */ + Index_node_base *n; + int (Gtid_index_reader::* search_cmp_function)(uint32, rpl_binlog_state_base *); + slave_connection_state *in_search_gtid_pos; + Node_page *read_page; + uchar *read_ptr; + File index_file; + uint32 current_offset; + uint32 in_search_offset; + bool file_open; + bool index_valid; + bool has_root_node; + uchar version_major; + uchar version_minor; +}; + + +/* + Sub-class of Gtid_index_reader that can additionally access in-memory "hot" + pages of the index, which are partially filled pages of the current binlog + file, not yet written to disk. 
+*/ +class Gtid_index_reader_hot : public Gtid_index_reader +{ +public: + Gtid_index_reader_hot(); + virtual ~Gtid_index_reader_hot() { } + +private: + int do_index_search(uint32 *out_offset, uint32 *out_gtid_count) override; + int get_child_ptr(uint32 *out_child_ptr) override; + int read_file_header() override; + int read_root_node() override; + int read_node(uint32 page_ptr) override; + int read_node_hot(); + + /* Pointer to the writer object, if we're reading a hot index. */ + const Gtid_index_writer *hot_writer; + /* The level we are currently reading in the hot writer .*/ + uint32 hot_level; +}; + +#endif /* GTID_INDEX_H */ diff --git a/sql/log.cc b/sql/log.cc index 752267a9c73..adf63e448c8 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -40,6 +40,7 @@ #include "sql_audit.h" #include "mysqld.h" #include "ddl_log.h" +#include "gtid_index.h" #include #include // For test_if_number @@ -164,12 +165,44 @@ static SHOW_VAR binlog_status_vars_detail[]= Variables for the binlog background thread. Protected by the MYSQL_BIN_LOG::LOCK_binlog_background_thread mutex. 
*/ +struct Binlog_background_job +{ + union + { + MYSQL_BIN_LOG::xid_count_per_binlog *notify_entry; + struct { + Gtid_index_writer *gi; + rpl_gtid *gtid_list; + uint32 gtid_count; + uint32 offset; + } gtid_index_data; + }; + Binlog_background_job *next; + enum enum_job_type { + CHECKPOINT_NOTIFY, + GTID_INDEX_UPDATE, + GTID_INDEX_CLOSE, + SENTINEL + } job_type; +}; static bool binlog_background_thread_started= false; static bool binlog_background_thread_stop= false; -static MYSQL_BIN_LOG::xid_count_per_binlog * - binlog_background_thread_queue= NULL; +static bool binlog_background_thread_sentinel= false; +static Binlog_background_job *binlog_background_thread_queue= NULL; +static Binlog_background_job **binlog_background_thread_endptr= + &binlog_background_thread_queue; +static Binlog_background_job *binlog_background_freelist= NULL; static bool start_binlog_background_thread(); +static int queue_binlog_background_checkpoint_notify( + MYSQL_BIN_LOG::xid_count_per_binlog *entry); +static int queue_binlog_background_gtid_index_update(Gtid_index_writer *gi, + uint32 offset, + rpl_gtid *gtid_list, + uint32 count); +static int queue_binlog_background_gtid_index_close(Gtid_index_writer *gi); +static int queue_binlog_background_sentinel(); +static void binlog_background_wait_for_sentinel(); static rpl_binlog_state rpl_global_gtid_binlog_state; @@ -3418,7 +3451,7 @@ MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period) group_commit_queue(0), group_commit_queue_busy(FALSE), num_commits(0), num_group_commits(0), group_commit_trigger_count(0), group_commit_trigger_timeout(0), - group_commit_trigger_lock_wait(0), + group_commit_trigger_lock_wait(0), gtid_index(nullptr), sync_period_ptr(sync_period), sync_counter(0), state_file_deleted(false), binlog_state_recover_done(false), is_relay_log(0), relay_signal_cnt(0), @@ -3861,6 +3894,26 @@ bool MYSQL_BIN_LOG::open(const char *log_name, if (write_event(&gl_ev)) goto err; + /* Open an index file for this binlog file. 
*/ + DBUG_ASSERT(!gtid_index); /* Binlog close should clear it. */ + if (gtid_index) + delete gtid_index; + if (opt_binlog_gtid_index) + { + my_off_t offset= my_b_tell(&log_file); + gtid_index= + new Gtid_index_writer(log_file_name, (uint32)offset, + &rpl_global_gtid_binlog_state, + (uint32)opt_binlog_gtid_index_page_size, + (my_off_t)opt_binlog_gtid_index_span_min); + if (!gtid_index) /* Not fatal; name the binlog file the '%s' refers to. */ + sql_print_information("Could not create GTID index for binlog " + "file '%s'. Accesses to this binlog file will " + "fallback to slower sequential scan.", log_file_name); + } + else + gtid_index= nullptr; + /* Output a binlog checkpoint event at the start of the binlog file. */ /* @@ -4410,12 +4463,31 @@ bool MYSQL_BIN_LOG::reset_logs(THD *thd, bool create_new_log, no new ones will be written. So we can proceed to delete the logs. */ mysql_mutex_unlock(&LOCK_xid_list); + + /* + Push a sentinel through the binlog background thread and wait for it to + return. When it does, we know that no more GTID index operations are + pending as we are holding LOCK_log. + (This is normally already the case as we pushed a binlog checkpoint + request through. But if no XID-capable engines are enabled (eg. running + without InnoDB), then that is a no-op). + */ + queue_binlog_background_sentinel(); + binlog_background_wait_for_sentinel(); } /* Save variables so that we can reopen the log */ save_name=name; name=0; // Protect against free - close(LOG_CLOSE_TO_BE_OPENED); + + /* + Close the active log. + Close the active GTID index synchronously. We don't want the close + running in the background while we delete the gtid index file. And we just + pushed a sentinel through the binlog background thread while holding + LOCK_log, so no other GTID index operations can be pending. + */ + close(LOG_CLOSE_TO_BE_OPENED|LOG_CLOSE_SYNC_GTID_INDEX); last_used_log_number= 0; // Reset log number cache @@ -4440,6 +4512,28 @@ bool MYSQL_BIN_LOG::reset_logs(THD *thd, bool create_new_log, for (;;) { + /* Delete any GTID index file.
*/ + char buf[Gtid_index_base::GTID_INDEX_FILENAME_MAX_SIZE]; + Gtid_index_base::make_gtid_index_file_name(buf, sizeof(buf), + linfo.log_file_name); + if (my_delete(buf, MYF(0))) + { + /* If ENOENT, the GTID index file is already deleted or never existed. */ + if (my_errno != ENOENT) + { + if (thd) + { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_DELETE_FILE, ER_THD(thd, ER_CANT_DELETE_FILE), + buf, my_errno); + } + sql_print_information("Failed to delete file '%s' (errno=%d)", + buf, my_errno); + } + my_errno= 0; + } + + /* Delete the binlog file. */ if (unlikely((error= my_delete(linfo.log_file_name, MYF(0))))) { if (my_errno == ENOENT) @@ -4950,6 +5044,7 @@ int MYSQL_BIN_LOG::purge_index_entry(THD *thd, ulonglong *reclaimed_space, int error= 0; LOG_INFO log_info; LOG_INFO check_log_info; + char buf[Gtid_index_base::GTID_INDEX_FILENAME_MAX_SIZE]; DBUG_ASSERT(my_b_inited(&purge_index_file)); @@ -4983,6 +5078,24 @@ int MYSQL_BIN_LOG::purge_index_entry(THD *thd, ulonglong *reclaimed_space, /* Get rid of the trailing '\n' */ log_info.log_file_name[length-1]= 0; + Gtid_index_base::make_gtid_index_file_name(buf, sizeof(buf), + log_info.log_file_name); + if (my_delete(buf, MYF(0))) + { + /* If ENOENT, the GTID index file is already deleted or never existed. 
*/ + if (my_errno != ENOENT) + { + if (thd) + { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_DELETE_FILE, ER_THD(thd, ER_CANT_DELETE_FILE), + buf, my_errno); + } + sql_print_information("Failed to delete file '%s'", buf); + } + my_errno= 0; + } + if (unlikely(!mysql_file_stat(m_key_file_log, log_info.log_file_name, &s, MYF(0)))) { @@ -6962,6 +7075,8 @@ err: { bool synced; + update_gtid_index((uint32)offset, thd->get_last_commit_gtid()); + if ((error= flush_and_sync(&synced))) { } @@ -7039,6 +7154,30 @@ err: } +void +MYSQL_BIN_LOG::update_gtid_index(uint32 offset, rpl_gtid gtid) +{ + if (!unlikely(gtid_index)) + return; + + rpl_gtid *gtid_list; + uint32 gtid_count; + int err= gtid_index->process_gtid_check_batch(offset, >id, + >id_list, >id_count); + if (err) + return; + if (gtid_list) + { + /* + Perform the GTID index update in the binlog background thread, + as we are running under the critical LOCK_log mutex. + */ + if (queue_binlog_background_gtid_index_update(gtid_index, offset, + gtid_list, gtid_count)) + my_free(gtid_list); + } +} + int error_log_print(enum loglevel level, const char *format, va_list args) { @@ -8477,6 +8616,8 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) strmake_buf(cache_mngr->last_commit_pos_file, log_file_name); commit_offset= my_b_write_tell(&log_file); + update_gtid_index((uint32)commit_offset, + current->thd->get_last_commit_gtid()); cache_mngr->last_commit_pos_offset= commit_offset; if ((cache_mngr->using_xa && cache_mngr->xa_xid) || current->need_unlog) { @@ -9063,6 +9204,33 @@ void MYSQL_BIN_LOG::close(uint exiting) } #endif /* HAVE_REPLICATION */ + if (!is_relay_log && likely(gtid_index)) + { + if (exiting & (LOG_CLOSE_STOP_EVENT|LOG_CLOSE_SYNC_GTID_INDEX)) + { + /* + The binlog background thread is already stopped just close the final + GTID index synchronously. Or caller explicitly requested synchronous + close of the GTID index. 
+ */ + gtid_index->close(); + delete gtid_index; + } + else + { + /* + Queue a close on the current GTID index. + Important that this is queued _before_ the checkpoint request is sent + (and thus before chechpoint notifications can be queued); this way, if + we crash before the GTID index is synced to disk, the checkpoint will + still be pending and the binlog file will be scanned during crash + recovery and the GTID index recovered. + */ + queue_binlog_background_gtid_index_close(gtid_index); + } + gtid_index= nullptr; + } + /* don't pwrite in a file opened with O_APPEND - it doesn't work */ if (log_file.type == WRITE_CACHE && !(exiting & LOG_CLOSE_DELAYED_CLOSE)) { @@ -10656,22 +10824,7 @@ void TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie) { xid_count_per_binlog *entry= static_cast(cookie); - bool found_entry= false; - mysql_mutex_lock(&LOCK_binlog_background_thread); - /* count the same notification kind from different engines */ - for (xid_count_per_binlog *link= binlog_background_thread_queue; - link && !found_entry; link= link->next_in_queue) - { - if ((found_entry= (entry == link))) - entry->notify_count++; - } - if (!found_entry) - { - entry->next_in_queue= binlog_background_thread_queue; - binlog_background_thread_queue= entry; - } - mysql_cond_signal(&COND_binlog_background_thread); - mysql_mutex_unlock(&LOCK_binlog_background_thread); + queue_binlog_background_checkpoint_notify(entry); } /* @@ -10690,7 +10843,9 @@ pthread_handler_t binlog_background_thread(void *arg __attribute__((unused))) { bool stop; - MYSQL_BIN_LOG::xid_count_per_binlog *queue, *next; + Binlog_background_job *queue, *next; + Binlog_background_job *freelist= nullptr; + Binlog_background_job **freelist_endptr= &freelist; THD *thd; my_thread_init(); DBUG_ENTER("binlog_background_thread"); @@ -10734,6 +10889,18 @@ binlog_background_thread(void *arg __attribute__((unused))) */ THD_STAGE_INFO(thd, stage_binlog_waiting_background_tasks); 
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread); + + /* + Put back our job objects in the freelist, now that we own the mutex again. + */ + if (freelist) + { + *freelist_endptr= binlog_background_freelist; + binlog_background_freelist= freelist; + freelist= nullptr; + freelist_endptr= &freelist; + } + for (;;) { stop= binlog_background_thread_stop; @@ -10752,6 +10919,7 @@ binlog_background_thread(void *arg __attribute__((unused))) } /* Grab the queue, if any. */ binlog_background_thread_queue= NULL; + binlog_background_thread_endptr= &binlog_background_thread_queue; mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread); /* Process any incoming commit_checkpoint_notify() calls. */ @@ -10767,17 +10935,40 @@ binlog_background_thread(void *arg __attribute__((unused))) #endif while (queue) { - long count= queue->notify_count; - THD_STAGE_INFO(thd, stage_binlog_processing_checkpoint_notify); - DEBUG_SYNC(thd, "binlog_background_thread_before_mark_xid_done"); - /* Set the thread start time */ - thd->set_time(); - /* Grab next pointer first, as mark_xid_done() may free the element. 
*/ - next= queue->next_in_queue; - queue->notify_count= 0; - for (long i= 0; i <= count; i++) - mysql_bin_log.mark_xid_done(queue->binlog_id, true); - queue= next; + switch (queue->job_type) + { + case Binlog_background_job::CHECKPOINT_NOTIFY: + THD_STAGE_INFO(thd, stage_binlog_processing_checkpoint_notify); + DEBUG_SYNC(thd, "binlog_background_thread_before_mark_xid_done"); + /* Set the thread start time */ + thd->set_time(); + mysql_bin_log.mark_xid_done(queue->notify_entry->binlog_id, true); + break; + + case Binlog_background_job::GTID_INDEX_UPDATE: + queue->gtid_index_data.gi-> + async_update(queue->gtid_index_data.offset, + queue->gtid_index_data.gtid_list, + queue->gtid_index_data.gtid_count); + break; + + case Binlog_background_job::GTID_INDEX_CLOSE: + queue->gtid_index_data.gi->close(); + delete queue->gtid_index_data.gi; + break; + + case Binlog_background_job::SENTINEL: + /* + The sentinel is a way to signal to reset_logs() that all pending + background jobs prior to the sentinel have been processed. 
+ */ + mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread); + DBUG_ASSERT(binlog_background_thread_sentinel); + binlog_background_thread_sentinel= false; + mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end); + mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread); + break; + } #ifdef ENABLED_DEBUG_SYNC DBUG_EXECUTE_IF("binlog_background_checkpoint_processed", @@ -10786,6 +10977,12 @@ binlog_background_thread(void *arg __attribute__((unused))) STRING_WITH_LEN("now SIGNAL binlog_background_checkpoint_processed"))); ); #endif + + next= queue->next; + queue->next= nullptr; + *freelist_endptr= queue; + freelist_endptr= &queue->next; + queue= next; } if (stop) @@ -10794,6 +10991,13 @@ binlog_background_thread(void *arg __attribute__((unused))) THD_STAGE_INFO(thd, stage_binlog_stopping_background_thread); + while (freelist) + { + next= freelist->next; + my_free(freelist); + freelist= next; + } + /* No need to use mutex as thd is not linked into other threads */ THD_count::count++; delete thd; @@ -10802,6 +11006,12 @@ binlog_background_thread(void *arg __attribute__((unused))) /* Signal that we are (almost) stopped. 
*/ mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread); + while (binlog_background_freelist) + { + next= binlog_background_freelist->next; + my_free(binlog_background_freelist); + binlog_background_freelist= next; + } binlog_background_thread_stop= false; mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end); mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread); @@ -10845,6 +11055,139 @@ start_binlog_background_thread() return 0; } + + +static Binlog_background_job * +get_binlog_background_job() +{ + Binlog_background_job *job; + mysql_mutex_assert_owner(&mysql_bin_log.LOCK_binlog_background_thread); + + if ((job= binlog_background_freelist) != nullptr) + binlog_background_freelist= job->next; + else + job= (Binlog_background_job *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*job), + MYF(MY_WME)); + + return job; +} + + +static void +queue_binlog_background_job(Binlog_background_job *job) +{ + mysql_mutex_assert_owner(&mysql_bin_log.LOCK_binlog_background_thread); + + job->next= nullptr; + *binlog_background_thread_endptr= job; + binlog_background_thread_endptr= &job->next; + mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread); +} + + +static int +queue_binlog_background_checkpoint_notify( + MYSQL_BIN_LOG::xid_count_per_binlog *entry) +{ + int res; + + mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread); + Binlog_background_job *job= get_binlog_background_job(); + if (!job) + res= 1; + else + { + job->job_type= Binlog_background_job::CHECKPOINT_NOTIFY; + job->notify_entry= entry; + queue_binlog_background_job(job); + res= 0; + } + mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread); + return res; +} + + +static int +queue_binlog_background_gtid_index_update(Gtid_index_writer *gi, uint32 offset, + rpl_gtid *gtid_list, uint32 count) +{ + int res; + + mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread); + Binlog_background_job *job= get_binlog_background_job(); + if (!unlikely(job)) + res= 1; 
+ else + { + job->job_type= Binlog_background_job::GTID_INDEX_UPDATE; + job->gtid_index_data.gi= gi; + job->gtid_index_data.gtid_list= gtid_list; + job->gtid_index_data.gtid_count= count; + job->gtid_index_data.offset= offset; + queue_binlog_background_job(job); + res= 0; + } + mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread); + + return res; +} + + +static int +queue_binlog_background_gtid_index_close(Gtid_index_writer *gi) +{ + int res; + + mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread); + Binlog_background_job *job= get_binlog_background_job(); + if (!job) + res= 1; /* Allocation failed; fall through so the mutex is released. */ + else + { + job->job_type= Binlog_background_job::GTID_INDEX_CLOSE; + job->gtid_index_data.gi= gi; + queue_binlog_background_job(job); + res= 0; + } + mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread); + + return res; +} + + +static int +queue_binlog_background_sentinel() +{ + int res; + + mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread); + DBUG_ASSERT(!binlog_background_thread_sentinel); + Binlog_background_job *job= get_binlog_background_job(); + if (!job) + res= 1; /* Flag stays clear, so a later wait-for-sentinel returns at once. */ + else + { + binlog_background_thread_sentinel= true; + job->job_type= Binlog_background_job::SENTINEL; + queue_binlog_background_job(job); + res= 0; + } + mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread); + + return res; +} + +static void +binlog_background_wait_for_sentinel() +{ + mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread); + while(binlog_background_thread_sentinel) + mysql_cond_wait(&mysql_bin_log.COND_binlog_background_thread_end, + &mysql_bin_log.LOCK_binlog_background_thread); + mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread); +} + + #ifdef HAVE_REPLICATION class Recovery_context { @@ -11111,7 +11454,7 @@ bool Recovery_context::reset_truncate_coord(my_off_t pos) for (uint i= 0; i < gtid_maybe_to_truncate->elements(); i++) { rpl_gtid gtid= gtid_maybe_to_truncate->at(i); - if
(rpl_global_gtid_binlog_state.update_nolock(>id, false)) + if (rpl_global_gtid_binlog_state.update_nolock(>id)) return true; } gtid_maybe_to_truncate->clear(); @@ -11376,6 +11719,7 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, Format_description_log_event *fdle, bool do_xa) { Log_event *ev= NULL; + Gtid_index_writer *gtid_index_recover= NULL; HASH xids, ddl_log_ids; MEM_ROOT mem_root; char binlog_checkpoint_name[FN_REFLEN]; @@ -11512,6 +11856,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, /* Initialise the binlog state from the Gtid_list event. */ if (rpl_global_gtid_binlog_state.load(glev->list, glev->count)) goto err2; + if (opt_binlog_gtid_index) + gtid_index_recover= recover_gtid_index_start(last_log_name, end_pos); } break; @@ -11551,7 +11897,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, (((Query_log_event *)ev)->is_commit() || ((Query_log_event *)ev)->is_rollback())))); - if (rpl_global_gtid_binlog_state.update_nolock(&ctx.last_gtid, false)) + recover_gtid_index_process(gtid_index_recover, end_pos, &ctx.last_gtid); + if (rpl_global_gtid_binlog_state.update_nolock(&ctx.last_gtid)) goto err2; ctx.last_gtid_valid= false; } @@ -11560,6 +11907,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, delete ev; ev= NULL; } // end of while + recover_gtid_index_end(gtid_index_recover); + gtid_index_recover= NULL; cur_log= &log; /* @@ -11645,6 +11994,7 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, err2: delete ev; + recover_gtid_index_abort(gtid_index_recover); if (file >= 0) { end_io_cache(&log); @@ -11663,6 +12013,104 @@ err1: } +/* + Start recovery of the GTID index for a binlog file. + The old index is deleted and a new index is rebuilt while scanning the + binlog file during binlog recovery. + Errors are not fatal, as the code can fallback to slower full binlog file + scan when no GTID index is available. 
+ + @param base_name File name of the binlog file. + @param offset End log pos of the GTID_LIST log event of the binlog file. + + @return Gtid_index_writer object or NULL. +*/ +Gtid_index_writer * +MYSQL_BIN_LOG::recover_gtid_index_start(const char *base_name, my_off_t offset) +{ + char buf[Gtid_index_base::GTID_INDEX_FILENAME_MAX_SIZE]; + + Gtid_index_base::make_gtid_index_file_name(buf, sizeof(buf), base_name); + if (my_delete(buf, MYF(0))) + { + /* If ENOENT, the GTID index file is already deleted or never existed. */ + if (my_errno != ENOENT) + { + sql_print_information("Failed to delete file '%s' (errno=%d)", buf, my_errno); + } + my_errno= 0; + } + Gtid_index_writer *gi= + new Gtid_index_writer(base_name, (uint32)offset, + &rpl_global_gtid_binlog_state, + (uint32)opt_binlog_gtid_index_page_size, + (my_off_t)opt_binlog_gtid_index_span_min); + return gi; +} + + +/* + Process one GTID during GTID index recovery. + + @param gi Gtid_index_writer object or NULL. + @param offset End log pos of the GTID event. + @param gev GTID log event to process. + + @return nothing +*/ +void +MYSQL_BIN_LOG::recover_gtid_index_process(Gtid_index_writer *gi, + my_off_t offset, + const rpl_gtid *gtid) +{ + if (gi) + { + gi->process_gtid((uint32)offset, gtid); + } +} + + +/* + Complete the recovery of one GTID index, syncing and closing it. + + @param gi Gtid_index_writer object or NULL. + + @return nothing +*/ +void +MYSQL_BIN_LOG::recover_gtid_index_end(Gtid_index_writer *gi) +{ + if (gi) + { + gi->close(); + delete gi; + } +} + + +/* + Abort the recovery of one GTID index, deleting any partially recovered index. + + @param gi Gtid_index_writer object or NULL. 
+ + @return nothing +*/ +void +MYSQL_BIN_LOG::recover_gtid_index_abort(Gtid_index_writer *gi) +{ + if (gi) + { + char buf[Gtid_index_base::GTID_INDEX_FILENAME_MAX_SIZE]; + strmake(buf, gi->index_file_name, sizeof(buf)-1); + /* + Delete first the Gtid_index_writer object and then the partial index + (the writer still has the index file open and active until destructed). + */ + delete(gi); + my_delete(buf, MYF(0)); + } +} + int MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery) diff --git a/sql/log.h b/sql/log.h index fa0324c73de..30435ba558a 100644 --- a/sql/log.h +++ b/sql/log.h @@ -21,8 +21,10 @@ #include "rpl_constants.h" class Relay_log_info; +class Gtid_index_writer; class Format_description_log_event; +class Gtid_log_event; bool reopen_fstreams(const char *filename, FILE *outstream, FILE *errstream); void setup_log_handling(); @@ -240,6 +242,7 @@ extern TC_LOG_DUMMY tc_log_dummy; #define LOG_CLOSE_TO_BE_OPENED 2 #define LOG_CLOSE_STOP_EVENT 4 #define LOG_CLOSE_DELAYED_CLOSE 8 +#define LOG_CLOSE_SYNC_GTID_INDEX 16 /* Maximum unique log filename extension. @@ -711,6 +714,9 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log ulonglong group_commit_trigger_count, group_commit_trigger_timeout; ulonglong group_commit_trigger_lock_wait; + /* Binlog GTID index. 
*/ + Gtid_index_writer *gtid_index; + /* pointer to the sync period variable, for binlog this will be sync_binlog_period, for relay log this will be sync_relay_log_period @@ -720,6 +726,13 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log bool state_file_deleted; bool binlog_state_recover_done; + Gtid_index_writer *recover_gtid_index_start(const char *base_name, + my_off_t offset); + void recover_gtid_index_process(Gtid_index_writer *gi, my_off_t offset, + const rpl_gtid *gtid); + void recover_gtid_index_end(Gtid_index_writer *gi); + void recover_gtid_index_abort(Gtid_index_writer *gi); + inline uint get_sync_period() { return *sync_period_ptr; @@ -739,6 +752,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log bool write_transaction_to_binlog_events(group_commit_entry *entry); void trx_group_commit_leader(group_commit_entry *leader); bool is_xidlist_idle_nolock(); + void update_gtid_index(uint32 offset, rpl_gtid gtid); + public: int new_file_without_locking(); /* @@ -759,11 +774,8 @@ public: ulong binlog_id; /* Total prepared XIDs and pending checkpoint requests in this binlog. */ long xid_count; - long notify_count; - /* For linking in requests to the binlog background thread. 
*/ - xid_count_per_binlog *next_in_queue; xid_count_per_binlog(char *log_file_name, uint log_file_name_len) - :binlog_id(0), xid_count(0), notify_count(0) + :binlog_id(0), xid_count(0) { binlog_name_len= log_file_name_len; binlog_name= (char *) my_malloc(PSI_INSTRUMENT_ME, binlog_name_len, MYF(MY_ZEROFILL)); diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 9526773ad78..d0413d9e3de 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -83,6 +83,7 @@ #include "wsrep_server_state.h" #endif /* WITH_WSREP */ #include "proxy_protocol.h" +#include "gtid_index.h" #include "sql_callback.h" #include "threadpool.h" @@ -443,6 +444,9 @@ my_bool sp_automatic_privileges= 1; ulong opt_binlog_rows_event_max_size; ulong binlog_row_metadata; +my_bool opt_binlog_gtid_index= TRUE; +uint opt_binlog_gtid_index_page_size= 4096; +uint opt_binlog_gtid_index_span_min= 65536; my_bool opt_master_verify_checksum= 0; my_bool opt_slave_sql_verify_checksum= 1; const char *binlog_format_names[]= {"MIXED", "STATEMENT", "ROW", NullS}; @@ -491,6 +495,7 @@ ulong malloc_calls; ulong specialflag=0; ulong binlog_cache_use= 0, binlog_cache_disk_use= 0; ulong binlog_stmt_cache_use= 0, binlog_stmt_cache_disk_use= 0; +ulong binlog_gtid_index_hit= 0, binlog_gtid_index_miss= 0; ulong max_connections, max_connect_errors; uint max_password_errors; ulong extra_max_connections; @@ -896,7 +901,7 @@ PSI_file_key key_file_binlog, key_file_binlog_cache, key_file_binlog_index, PSI_file_key key_file_query_log, key_file_slow_log; PSI_file_key key_file_relaylog, key_file_relaylog_index, key_file_relaylog_cache, key_file_relaylog_index_cache; -PSI_file_key key_file_binlog_state; +PSI_file_key key_file_binlog_state, key_file_gtid_index; #ifdef HAVE_PSI_INTERFACE #ifdef HAVE_MMAP @@ -921,6 +926,7 @@ PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list, key_LOCK_status, key_LOCK_temp_pool, key_LOCK_system_variables_hash, key_LOCK_thd_data, key_LOCK_thd_kill, key_LOCK_user_conn, key_LOCK_uuid_short_generator, 
key_LOG_LOCK_log, + key_gtid_index_lock, key_master_info_data_lock, key_master_info_run_lock, key_master_info_sleep_lock, key_master_info_start_stop_lock, key_master_info_start_alter_lock, @@ -1007,6 +1013,7 @@ static PSI_mutex_info all_server_mutexes[]= { &key_LOCK_user_conn, "LOCK_user_conn", PSI_FLAG_GLOBAL}, { &key_LOCK_uuid_short_generator, "LOCK_uuid_short_generator", PSI_FLAG_GLOBAL}, { &key_LOG_LOCK_log, "LOG::LOCK_log", 0}, + { &key_gtid_index_lock, "Gtid_index_writer::gtid_index_mutex", 0}, { &key_master_info_data_lock, "Master_info::data_lock", 0}, { &key_master_info_start_stop_lock, "Master_info::start_stop_lock", 0}, { &key_master_info_run_lock, "Master_info::run_lock", 0}, @@ -2011,6 +2018,7 @@ static void clean_up(bool print_message) injector::free_instance(); mysql_bin_log.cleanup(); + Gtid_index_writer::gtid_index_cleanup(); my_tz_free(); my_dboptions_cache_free(); @@ -3962,6 +3970,7 @@ static int init_common_variables() inited before MY_INIT(). So we do it here. */ mysql_bin_log.init_pthread_objects(); + Gtid_index_writer::gtid_index_init(); /* TODO: remove this when my_time_t is 64 bit compatible */ if (!IS_TIME_T_VALID_FOR_TIMESTAMP(server_start_time)) @@ -7396,6 +7405,8 @@ SHOW_VAR status_vars[]= { {"Binlog_bytes_written", (char*) offsetof(STATUS_VAR, binlog_bytes_written), SHOW_LONGLONG_STATUS}, {"Binlog_cache_disk_use", (char*) &binlog_cache_disk_use, SHOW_LONG}, {"Binlog_cache_use", (char*) &binlog_cache_use, SHOW_LONG}, + {"Binlog_gtid_index_hit", (char*) &binlog_gtid_index_hit, SHOW_LONG}, + {"Binlog_gtid_index_miss", (char*) &binlog_gtid_index_miss, SHOW_LONG}, {"Binlog_stmt_cache_disk_use",(char*) &binlog_stmt_cache_disk_use, SHOW_LONG}, {"Binlog_stmt_cache_use", (char*) &binlog_stmt_cache_use, SHOW_LONG}, {"Busy_time", (char*) offsetof(STATUS_VAR, busy_time), SHOW_DOUBLE_STATUS}, @@ -7821,6 +7832,7 @@ static int mysql_init_variables(void) delayed_insert_errors= thread_created= 0; specialflag= 0; binlog_cache_use= binlog_cache_disk_use= 
0; + binlog_gtid_index_hit= binlog_gtid_index_miss= 0; max_used_connections= slow_launch_threads = 0; max_used_connections_time= 0; mysqld_user= mysqld_chroot= opt_init_file= opt_bin_logname = 0; @@ -9219,7 +9231,8 @@ static PSI_file_info all_server_files[]= { &key_file_trg, "trigger_name", 0}, { &key_file_trn, "trigger", 0}, { &key_file_init, "init", 0}, - { &key_file_binlog_state, "binlog_state", 0} + { &key_file_binlog_state, "binlog_state", 0}, + { &key_file_gtid_index, "gtid_index", 0} }; #endif /* HAVE_PSI_INTERFACE */ @@ -9413,6 +9426,7 @@ PSI_memory_key key_memory_acl_cache; PSI_memory_key key_memory_acl_mem; PSI_memory_key key_memory_acl_memex; PSI_memory_key key_memory_binlog_cache_mngr; +PSI_memory_key key_memory_binlog_gtid_index; PSI_memory_key key_memory_binlog_pos; PSI_memory_key key_memory_binlog_recover_exec; PSI_memory_key key_memory_binlog_statement_buffer; @@ -9652,6 +9666,7 @@ static PSI_memory_info all_server_memory[]= // { &key_memory_Slave_job_group_group_relay_log_name, "Slave_job_group::group_relay_log_name", 0}, { &key_memory_Relay_log_info_group_relay_log_name, "Relay_log_info::group_relay_log_name", 0}, { &key_memory_binlog_cache_mngr, "binlog_cache_mngr", 0}, + { &key_memory_binlog_gtid_index, "binlog_gtid_index", 0}, { &key_memory_Row_data_memory_memory, "Row_data_memory::memory", 0}, // { &key_memory_Gtid_set_to_string, "Gtid_set::to_string", 0}, // { &key_memory_Gtid_state_to_string, "Gtid_state::to_string", 0}, diff --git a/sql/mysqld.h b/sql/mysqld.h index 11e1ee1c8da..2967de9d728 100644 --- a/sql/mysqld.h +++ b/sql/mysqld.h @@ -217,6 +217,7 @@ extern ulonglong thd_startup_options; extern my_thread_id global_thread_id; extern ulong binlog_cache_use, binlog_cache_disk_use; extern ulong binlog_stmt_cache_use, binlog_stmt_cache_disk_use; +extern ulong binlog_gtid_index_hit, binlog_gtid_index_miss; extern ulong aborted_threads, aborted_connects, aborted_connects_preauth; extern ulong delayed_insert_timeout; extern ulong 
delayed_insert_limit, delayed_queue_size; @@ -249,6 +250,9 @@ extern ulonglong slave_max_statement_time; extern double slave_max_statement_time_double; extern ulong opt_binlog_rows_event_max_size; extern ulong binlog_row_metadata; +extern my_bool opt_binlog_gtid_index; +extern uint opt_binlog_gtid_index_page_size; +extern uint opt_binlog_gtid_index_span_min; extern ulong thread_cache_size; extern ulong stored_program_cache_size; extern ulong opt_slave_parallel_threads; @@ -333,7 +337,7 @@ extern PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list, key_LOCK_rpl_status, key_LOCK_server_started, key_LOCK_status, key_LOCK_optimizer_costs, key_LOCK_thd_data, key_LOCK_thd_kill, - key_LOCK_user_conn, key_LOG_LOCK_log, + key_LOCK_user_conn, key_LOG_LOCK_log, key_gtid_index_lock, key_master_info_data_lock, key_master_info_run_lock, key_master_info_sleep_lock, key_master_info_start_stop_lock, key_master_info_start_alter_lock, @@ -411,7 +415,7 @@ extern PSI_file_key key_file_relaylog, key_file_relaylog_index, key_file_relaylog_cache, key_file_relaylog_index_cache; extern PSI_socket_key key_socket_tcpip, key_socket_unix, key_socket_client_connection; -extern PSI_file_key key_file_binlog_state; +extern PSI_file_key key_file_binlog_state, key_file_gtid_index; #ifdef HAVE_PSI_INTERFACE void init_server_psi_keys(); @@ -456,6 +460,7 @@ extern PSI_memory_key key_memory_user_var_entry_value; extern PSI_memory_key key_memory_Slave_job_group_group_relay_log_name; extern PSI_memory_key key_memory_Relay_log_info_group_relay_log_name; extern PSI_memory_key key_memory_binlog_cache_mngr; +extern PSI_memory_key key_memory_binlog_gtid_index; extern PSI_memory_key key_memory_Row_data_memory_memory; extern PSI_memory_key key_memory_errmsgs; extern PSI_memory_key key_memory_Event_queue_element_for_exec_names; diff --git a/sql/privilege.h b/sql/privilege.h index 143eebd125e..dedabc32c38 100644 --- a/sql/privilege.h +++ b/sql/privilege.h @@ -376,6 +376,15 @@ constexpr privilege_t 
PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_ROW_METADATA= constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_LEGACY_EVENT_POS= SUPER_ACL | BINLOG_ADMIN_ACL; +constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX= + BINLOG_ADMIN_ACL; + +constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_PAGE_SIZE= + BINLOG_ADMIN_ACL; + +constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_SPAN_MIN= + BINLOG_ADMIN_ACL; + constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_EXPIRE_LOGS_DAYS= BINLOG_ADMIN_ACL; diff --git a/sql/rpl_gtid.cc b/sql/rpl_gtid.cc index 20188d6c11d..27c10f21c00 100644 --- a/sql/rpl_gtid.cc +++ b/sql/rpl_gtid.cc @@ -1542,19 +1542,18 @@ rpl_slave_state::alloc_gtid_pos_table(LEX_CSTRING *table_name, void *hton, } -void rpl_binlog_state::init() +void +rpl_binlog_state_base::init() { my_hash_init(PSI_INSTRUMENT_ME, &hash, &my_charset_bin, 32, offsetof(element, domain_id), sizeof(element::domain_id), NULL, my_free, HASH_UNIQUE); - my_init_dynamic_array(PSI_INSTRUMENT_ME, &gtid_sort_array, sizeof(rpl_gtid), 8, 8, MYF(0)); - mysql_mutex_init(key_LOCK_binlog_state, &LOCK_binlog_state, - MY_MUTEX_INIT_SLOW); initialized= 1; } + void -rpl_binlog_state::reset_nolock() +rpl_binlog_state_base::reset_nolock() { uint32 i; @@ -1564,6 +1563,267 @@ rpl_binlog_state::reset_nolock() } +void +rpl_binlog_state_base::free() +{ + if (initialized) + { + initialized= 0; + reset_nolock(); + my_hash_free(&hash); + } +} + + +rpl_binlog_state_base::~rpl_binlog_state_base() +{ + free(); +} + + +bool +rpl_binlog_state_base::load_nolock(struct rpl_gtid *list, uint32 count) +{ + uint32 i; + bool res= false; + + reset_nolock(); + for (i= 0; i < count; ++i) + { + if (update_nolock(&(list[i]))) + { + res= true; + break; + } + } + return res; +} + + +bool +rpl_binlog_state_base::load_nolock(rpl_binlog_state_base *orig_state) +{ + ulong i, j; + HASH *h1= &orig_state->hash; + + reset_nolock(); + for (i= 0; i < h1->records; ++i) + { + element *e= (element
*)my_hash_element(h1, i); + HASH *h2= &e->hash; + const rpl_gtid *last_gtid= e->last_gtid; + for (j= 0; j < h2->records; ++j) + { + const rpl_gtid *gtid= (const rpl_gtid *)my_hash_element(h2, j); + if (gtid == last_gtid) + continue; + if (update_nolock(gtid)) + return true; + } + if (likely(last_gtid) && update_nolock(last_gtid)) + return true; + } + + return false; +} + + +/* + Update replication state with a new GTID. + + If the (domain_id, server_id) pair already exists, then the new GTID replaces + the old one for that domain id. Else a new entry is inserted. + + Note that rpl_binlog_state_base::update_nolock() does not call my_error() + for out-of-memory, caller must do that if needed (eg. ER_OUT_OF_RESOURCES). + + Returns 0 for ok, 1 for error. +*/ +int +rpl_binlog_state_base::update_nolock(const struct rpl_gtid *gtid) +{ + element *elem; + + if ((elem= (element *)my_hash_search(&hash, + (const uchar *)(&gtid->domain_id), + sizeof(gtid->domain_id)))) + { + if (elem->seq_no_counter < gtid->seq_no) + elem->seq_no_counter= gtid->seq_no; + if (!elem->update_element(gtid)) + return 0; + } + else if (!alloc_element_nolock(gtid)) + return 0; + + return 1; +} + + +int +rpl_binlog_state_base::alloc_element_nolock(const rpl_gtid *gtid) +{ + element *elem; + rpl_gtid *lookup_gtid; + + /* First time we see this domain_id; allocate a new element. */ + elem= (element *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*elem), MYF(0)); + lookup_gtid= (rpl_gtid *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*lookup_gtid), + MYF(0)); + if (elem && lookup_gtid) + { + elem->domain_id= gtid->domain_id; + my_hash_init(PSI_INSTRUMENT_ME, &elem->hash, &my_charset_bin, 32, + offsetof(rpl_gtid, server_id), sizeof(rpl_gtid::domain_id), + NULL, my_free, HASH_UNIQUE); + elem->last_gtid= lookup_gtid; + elem->seq_no_counter= gtid->seq_no; + memcpy(lookup_gtid, gtid, sizeof(*lookup_gtid)); + if (0 == my_hash_insert(&elem->hash, (const uchar *)lookup_gtid)) + { + lookup_gtid= NULL; /* Do not free.
*/ + if (0 == my_hash_insert(&hash, (const uchar *)elem)) + return 0; + } + my_hash_free(&elem->hash); + } + + /* An error. */ + if (elem) + my_free(elem); + if (lookup_gtid) + my_free(lookup_gtid); + return 1; +} + + +uint32 +rpl_binlog_state_base::count_nolock() +{ + uint32 c= 0; + uint32 i; + + for (i= 0; i < hash.records; ++i) + c+= ((element *)my_hash_element(&hash, i))->hash.records; + + return c; +} + + +int +rpl_binlog_state_base::get_gtid_list_nolock(rpl_gtid *gtid_list, uint32 list_size) +{ + uint32 i, j, pos; + + pos= 0; + for (i= 0; i < hash.records; ++i) + { + element *e= (element *)my_hash_element(&hash, i); + if (!e->last_gtid) + { + DBUG_ASSERT(e->hash.records==0); + continue; + } + for (j= 0; j <= e->hash.records; ++j) + { + const rpl_gtid *gtid; + if (j < e->hash.records) + { + gtid= (rpl_gtid *)my_hash_element(&e->hash, j); + if (gtid == e->last_gtid) + continue; + } + else + gtid= e->last_gtid; + + if (pos >= list_size) + return 1; + memcpy(&gtid_list[pos++], gtid, sizeof(*gtid)); + } + } + + return 0; +} + + +rpl_gtid * +rpl_binlog_state_base::find_nolock(uint32 domain_id, uint32 server_id) +{ + element *elem; + if (!(elem= (element *)my_hash_search(&hash, (const uchar *)&domain_id, + sizeof(domain_id)))) + return NULL; + return (rpl_gtid *)my_hash_search(&elem->hash, (const uchar *)&server_id, + sizeof(server_id)); +} + + +/* + Return true if this binlog state is before the position specified by the + passed-in slave_connection_state, false otherwise. + Note that if the GTID D-S-N is the last GTID added to the state in the + domain D, then the state is considered to come before the position D-S-N + within domain D. +*/ +bool +rpl_binlog_state_base::is_before_pos(slave_connection_state *pos) +{ + /* + First check each GTID in the slave position, if it comes after what is + in the state.
+ */ + for (uint32 i= 0; i < pos->hash.records; ++i) + { + const slave_connection_state::entry *e= + (const slave_connection_state::entry *)my_hash_element(&pos->hash, i); + /* + IF we have an entry with the same (domain_id, server_id), + AND either + ( we are ahead in that server_id + OR we are identical, but there's some other server_id after) + THEN that position lies before our state. + */ + element *elem; + if ((elem= (element *)my_hash_search(&hash, + (const uchar *)&e->gtid.domain_id, + sizeof(e->gtid.domain_id)))) + { + const rpl_gtid *g= (rpl_gtid *) + my_hash_search(&elem->hash, (const uchar *)&e->gtid.server_id, + sizeof(e->gtid.server_id)); + if (g != nullptr && + ( g->seq_no > e->gtid.seq_no || + ( g->seq_no == e->gtid.seq_no && g != elem->last_gtid) )) + return false; + } + } + + /* + Then check the state, if there are any domains present that are missing + from the position. + */ + for (uint32 i= 0; i < hash.records; ++i) + { + const element *elem= (const element *) my_hash_element(&hash, i); + if (likely(elem->hash.records > 0) && + !pos->find(elem->domain_id)) + return false; + } + + /* Nothing in our state lies after anything in the position. 
*/ + return true; +} + + +void rpl_binlog_state::init() +{ + rpl_binlog_state_base::init(); + my_init_dynamic_array(PSI_INSTRUMENT_ME, &gtid_sort_array, sizeof(rpl_gtid), 8, 8, MYF(0)); + mysql_mutex_init(key_LOCK_binlog_state, &LOCK_binlog_state, + MY_MUTEX_INIT_SLOW); +} + + void rpl_binlog_state::reset() { @@ -1577,32 +1837,27 @@ void rpl_binlog_state::free() { if (initialized) { - initialized= 0; - reset_nolock(); - my_hash_free(&hash); + rpl_binlog_state_base::free(); delete_dynamic(&gtid_sort_array); mysql_mutex_destroy(&LOCK_binlog_state); } } +rpl_binlog_state::~rpl_binlog_state() +{ + free(); +} + + bool rpl_binlog_state::load(struct rpl_gtid *list, uint32 count) { - uint32 i; - bool res= false; - mysql_mutex_lock(&LOCK_binlog_state); - reset_nolock(); - for (i= 0; i < count; ++i) - { - if (update_nolock(&(list[i]), false)) - { - res= true; - break; - } - } + bool res= load_nolock(list, count); mysql_mutex_unlock(&LOCK_binlog_state); + if (res) + my_error(ER_OUT_OF_RESOURCES, MYF(0)); return res; } @@ -1610,7 +1865,7 @@ rpl_binlog_state::load(struct rpl_gtid *list, uint32 count) static int rpl_binlog_state_load_cb(rpl_gtid *gtid, void *data) { rpl_binlog_state *self= (rpl_binlog_state *)data; - return self->update_nolock(gtid, false); + return self->update_nolock(gtid); } @@ -1622,31 +1877,22 @@ rpl_binlog_state::load(rpl_slave_state *slave_pos) mysql_mutex_lock(&LOCK_binlog_state); reset_nolock(); if (slave_pos->iterate(rpl_binlog_state_load_cb, this, NULL, 0, false)) + { + my_error(ER_OUT_OF_RESOURCES, MYF(0)); res= true; + } mysql_mutex_unlock(&LOCK_binlog_state); return res; } -rpl_binlog_state::~rpl_binlog_state() -{ - free(); -} - - -/* - Update replication state with a new GTID. - - If the (domain_id, server_id) pair already exists, then the new GTID replaces - the old one for that domain id. Else a new entry is inserted. - - Returns 0 for ok, 1 for error.
-*/ int -rpl_binlog_state::update_nolock(const struct rpl_gtid *gtid, bool strict) +rpl_binlog_state::update(const struct rpl_gtid *gtid, bool strict) { + int res= 0; element *elem; + mysql_mutex_lock(&LOCK_binlog_state); if ((elem= (element *)my_hash_search(&hash, (const uchar *)(&gtid->domain_id), sizeof(gtid->domain_id)))) @@ -1656,27 +1902,21 @@ rpl_binlog_state::update_nolock(const struct rpl_gtid *gtid, bool strict) my_error(ER_GTID_STRICT_OUT_OF_ORDER, MYF(0), gtid->domain_id, gtid->server_id, gtid->seq_no, elem->last_gtid->domain_id, elem->last_gtid->server_id, elem->last_gtid->seq_no); - return 1; + res= 1; + } + else + { + if (elem->seq_no_counter < gtid->seq_no) + elem->seq_no_counter= gtid->seq_no; + if (elem->update_element(gtid)) + res= 1; } - if (elem->seq_no_counter < gtid->seq_no) - elem->seq_no_counter= gtid->seq_no; - if (!elem->update_element(gtid)) - return 0; } - else if (!alloc_element_nolock(gtid)) - return 0; - - my_error(ER_OUT_OF_RESOURCES, MYF(0)); - return 1; -} - - -int -rpl_binlog_state::update(const struct rpl_gtid *gtid, bool strict) -{ - int res; - mysql_mutex_lock(&LOCK_binlog_state); - res= update_nolock(gtid, strict); + else if (alloc_element_nolock(gtid)) + { + my_error(ER_OUT_OF_RESOURCES, MYF(0)); + res= 1; + } mysql_mutex_unlock(&LOCK_binlog_state); return res; } @@ -1762,43 +2002,6 @@ rpl_binlog_state::element::update_element(const rpl_gtid *gtid) } -int -rpl_binlog_state::alloc_element_nolock(const rpl_gtid *gtid) -{ - element *elem; - rpl_gtid *lookup_gtid; - - /* First time we see this domain_id; allocate a new element.
*/ - elem= (element *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*elem), MYF(MY_WME)); - lookup_gtid= (rpl_gtid *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*lookup_gtid), - MYF(MY_WME)); - if (elem && lookup_gtid) - { - elem->domain_id= gtid->domain_id; - my_hash_init(PSI_INSTRUMENT_ME, &elem->hash, &my_charset_bin, 32, - offsetof(rpl_gtid, server_id), sizeof(rpl_gtid::domain_id), - NULL, my_free, HASH_UNIQUE); - elem->last_gtid= lookup_gtid; - elem->seq_no_counter= gtid->seq_no; - memcpy(lookup_gtid, gtid, sizeof(*lookup_gtid)); - if (0 == my_hash_insert(&elem->hash, (const uchar *)lookup_gtid)) - { - lookup_gtid= NULL; /* Do not free. */ - if (0 == my_hash_insert(&hash, (const uchar *)elem)) - return 0; - } - my_hash_free(&elem->hash); - } - - /* An error. */ - if (elem) - my_free(elem); - if (lookup_gtid) - my_free(lookup_gtid); - return 1; -} - - /* Check that a new GTID can be logged without creating an out-of-order sequence number with existing GTIDs. @@ -1950,7 +2153,7 @@ rpl_binlog_state::read_from_iocache(IO_CACHE *src) p= buf; end= buf + len; if (gtid_parser_helper(&p, end, &gtid) || - update_nolock(&gtid, false)) + update_nolock(&gtid)) { res= 1; break; @@ -1961,17 +2164,6 @@ rpl_binlog_state::read_from_iocache(IO_CACHE *src) } -rpl_gtid * -rpl_binlog_state::find_nolock(uint32 domain_id, uint32 server_id) -{ - element *elem; - if (!(elem= (element *)my_hash_search(&hash, (const uchar *)&domain_id, - sizeof(domain_id)))) - return NULL; - return (rpl_gtid *)my_hash_search(&elem->hash, (const uchar *)&server_id, - sizeof(server_id)); -} - rpl_gtid * rpl_binlog_state::find(uint32 domain_id, uint32 server_id) { @@ -2002,12 +2194,8 @@ rpl_binlog_state::find_most_recent(uint32 domain_id) uint32 rpl_binlog_state::count() { - uint32 c= 0; - uint32 i; - mysql_mutex_lock(&LOCK_binlog_state); - for (i= 0; i < hash.records; ++i) - c+= ((element *)my_hash_element(&hash, i))->hash.records; + uint32 c= count_nolock(); mysql_mutex_unlock(&LOCK_binlog_state); return c; @@ -2017,41 +2205,8
@@ rpl_binlog_state::count() int rpl_binlog_state::get_gtid_list(rpl_gtid *gtid_list, uint32 list_size) { - uint32 i, j, pos; - int res= 0; - mysql_mutex_lock(&LOCK_binlog_state); - pos= 0; - for (i= 0; i < hash.records; ++i) - { - element *e= (element *)my_hash_element(&hash, i); - if (!e->last_gtid) - { - DBUG_ASSERT(e->hash.records==0); - continue; - } - for (j= 0; j <= e->hash.records; ++j) - { - const rpl_gtid *gtid; - if (j < e->hash.records) - { - gtid= (rpl_gtid *)my_hash_element(&e->hash, j); - if (gtid == e->last_gtid) - continue; - } - else - gtid= e->last_gtid; - - if (pos >= list_size) - { - res= 1; - goto end; - } - memcpy(&gtid_list[pos++], gtid, sizeof(*gtid)); - } - } - -end: + int res= get_gtid_list_nolock(gtid_list, list_size); mysql_mutex_unlock(&LOCK_binlog_state); return res; } diff --git a/sql/rpl_gtid.h b/sql/rpl_gtid.h index 7d25ee6e75d..8b697c79515 100644 --- a/sql/rpl_gtid.h +++ b/sql/rpl_gtid.h @@ -26,6 +26,11 @@ extern const LEX_CSTRING rpl_gtid_slave_state_table_name; class String; +#ifdef MYSQL_SERVER +struct TABLE; +#endif +struct slave_connection_state; + #define PARAM_GTID(G) G.domain_id, G.server_id, G.seq_no #define GTID_MAX_STR_LENGTH (10+1+10+1+20) @@ -296,8 +301,13 @@ struct rpl_slave_state to know where to start when a master is changed to a slave. As a side effect, it also allows to skip a hash lookup in the very common case of logging a new GTID with same server id as last GTID. + + The base class rpl_binlog_state_base contains just the basic data operations + to insert/update GTIDs, and is used eg. from Gtid_index_*. The main class + rpl_binlog_state builds server logic on top of that like mutex locking, + gtid_strict_mode handling, etc. */ -struct rpl_binlog_state +struct rpl_binlog_state_base { struct element { uint32 domain_id; @@ -309,29 +319,45 @@ struct rpl_binlog_state int update_element(const rpl_gtid *gtid); }; + /* Mapping from domain_id to collection of elements.
*/ HASH hash; + my_bool initialized; + + rpl_binlog_state_base() : initialized(0) {} + ~rpl_binlog_state_base(); + void init(); + void reset_nolock(); + void free(); + bool load_nolock(struct rpl_gtid *list, uint32 count); + bool load_nolock(rpl_binlog_state_base *orig_state); + int update_nolock(const struct rpl_gtid *gtid); + int alloc_element_nolock(const rpl_gtid *gtid); + uint32 count_nolock(); + int get_gtid_list_nolock(rpl_gtid *gtid_list, uint32 list_size); + rpl_gtid *find_nolock(uint32 domain_id, uint32 server_id); + bool is_before_pos(slave_connection_state *pos); +}; + +struct rpl_binlog_state : public rpl_binlog_state_base +{ /* Mutex protecting access to the state. */ mysql_mutex_t LOCK_binlog_state; - my_bool initialized; /* Auxiliary buffer to sort gtid list. */ DYNAMIC_ARRAY gtid_sort_array; - rpl_binlog_state() :initialized(0) {} + rpl_binlog_state() {} ~rpl_binlog_state(); void init(); - void reset_nolock(); void reset(); void free(); bool load(struct rpl_gtid *list, uint32 count); bool load(rpl_slave_state *slave_pos); - int update_nolock(const struct rpl_gtid *gtid, bool strict); int update(const struct rpl_gtid *gtid, bool strict); int update_with_next_gtid(uint32 domain_id, uint32 server_id, rpl_gtid *gtid); - int alloc_element_nolock(const rpl_gtid *gtid); bool check_strict_sequence(uint32 domain_id, uint32 server_id, uint64 seq_no, bool no_error= false); int bump_seq_no_if_needed(uint32 domain_id, uint64 seq_no); @@ -342,7 +368,6 @@ struct rpl_binlog_state int get_most_recent_gtid_list(rpl_gtid **list, uint32 *size); bool append_pos(String *str); bool append_state(String *str); - rpl_gtid *find_nolock(uint32 domain_id, uint32 server_id); rpl_gtid *find(uint32 domain_id, uint32 server_id); rpl_gtid *find_most_recent(uint32 domain_id); const char* drop_domain(DYNAMIC_ARRAY *ids, Gtid_list_log_event *glev, char*); diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc index cc6def9b8f3..9adf75228de 100644 --- a/sql/rpl_rli.cc +++ b/sql/rpl_rli.cc @@ 
-1547,7 +1547,7 @@ Relay_log_info::update_relay_log_state(rpl_gtid *gtid_list, uint32 count) int res= 0; while (count) { - if (relay_log_state.update_nolock(gtid_list, false)) + if (relay_log_state.update_nolock(gtid_list)) res= 1; ++gtid_list; --count; diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc index e7fdfea8b2b..fddf274fe02 100644 --- a/sql/sql_repl.cc +++ b/sql/sql_repl.cc @@ -31,6 +31,7 @@ #include "semisync_master.h" #include "semisync_slave.h" #include "mysys_err.h" +#include "gtid_index.h" enum enum_gtid_until_state { @@ -1286,6 +1287,100 @@ end: return err; } + +/* + Helper function for gtid_find_binlog_pos() below. + Check a binlog file against a slave position. Use a GTID index if present. + Returns: + 0 This is the binlog file that contains the position. If *out_start_seek + is non-zero, it is the offset found in the GTID index at which to start + scanning the binlog file for events to send to the slave. + 1 This binlog file is too new to contain the given slave position. + -1 Error, *out_errormsg contains error string. + + The *out_glev event must be deleted by the caller if set non-null. + */ +static int +gtid_check_binlog_file(slave_connection_state *state, + Gtid_index_reader_hot *reader, + const binlog_file_entry *list, + bool *found_in_index, uint32 *out_start_seek, + uint32 *found_count, + char *out_name, Gtid_list_log_event **out_glev, + const char **out_errormsg) +{ + Gtid_list_log_event *glev= nullptr; + char buf[FN_REFLEN]; + File file; + IO_CACHE cache; + int res= -1; + + *found_in_index= false; + *out_glev= nullptr; + *out_errormsg= nullptr; + /* + Try to lookup the GTID position in the gtid index. + If that doesn't work, read the Gtid_list_log_event at the start of the + binlog file to get the binlog state. 
+ */ + if (normalize_binlog_name(buf, list->name.str, false)) + { + *out_errormsg= "Failed to determine binlog file name while looking for " + "GTID position in binlog"; + goto end; + } + + if (likely(reader && !reader->open_index_file(buf))) + { + int lookup= reader->search_gtid_pos(state, out_start_seek, found_count); + reader->close_index_file(); + if (lookup >= 0) + { + statistic_increment(binlog_gtid_index_hit, &LOCK_status); + if (lookup == 0) + res= 1; + else + { + strmake(out_name, buf, FN_REFLEN); + *found_in_index= true; + res= 0; + } + goto end; + } + /* + Error in the index lookup; fall back to reading the GTID_LIST event from + the binlog file and scan it from the beginning. + */ + } + statistic_increment(binlog_gtid_index_miss, &LOCK_status); + + bzero((char*) &cache, sizeof(cache)); + if (unlikely((file= open_binlog(&cache, buf, out_errormsg)) == (File)-1)) + goto end; + *out_errormsg= get_gtid_list_event(&cache, &glev); + end_io_cache(&cache); + mysql_file_close(file, MYF(MY_WME)); + if (unlikely(*out_errormsg)) + goto end; + + if (!glev || contains_all_slave_gtid(state, glev)) + { + strmake(out_name, buf, FN_REFLEN); + *out_glev= glev; + *out_errormsg= nullptr; + res= 0; + } + else + { + delete glev; + res= 1; + } + +end: + return res; +} + + /* Find the name of the binlog file to start reading for a slave that connects using GTID state. @@ -1314,14 +1409,17 @@ end: the requested GTID that was already purged. 
*/ static const char * -gtid_find_binlog_file(slave_connection_state *state, char *out_name, - slave_connection_state *until_gtid_state) +gtid_find_binlog_pos(slave_connection_state *state, char *out_name, + slave_connection_state *until_gtid_state, + rpl_binlog_state *until_binlog_state, + bool *found_in_index, uint32 *out_start_seek) { MEM_ROOT memroot; binlog_file_entry *list; Gtid_list_log_event *glev= NULL; const char *errormsg= NULL; - char buf[FN_REFLEN]; + Gtid_index_reader_hot *reader= NULL; + *found_in_index= false; init_alloc_root(PSI_INSTRUMENT_ME, &memroot, 10*(FN_REFLEN+sizeof(binlog_file_entry)), 0, @@ -1332,48 +1430,41 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name, goto end; } + if (opt_binlog_gtid_index) + reader= new Gtid_index_reader_hot(); + while (list) { - File file; - IO_CACHE cache; - - if (!list->next) - { - /* - It should be safe to read the currently used binlog, as we will only - read the header part that is already written. - - But if that does not work on windows, then we will need to cache the - event somewhere in memory I suppose - that could work too. - */ - } - /* - Read the Gtid_list_log_event at the start of the binlog file to - get the binlog state. 
- */ - if (normalize_binlog_name(buf, list->name.str, false)) - { - errormsg= "Failed to determine binlog file name while looking for " - "GTID position in binlog"; + uint32 found_count; + int res= gtid_check_binlog_file(state, reader, list, found_in_index, + out_start_seek, &found_count, + out_name, &glev, &errormsg); + if (res < 0) goto end; - } - bzero((char*) &cache, sizeof(cache)); - if (unlikely((file= open_binlog(&cache, buf, &errormsg)) == (File)-1)) - goto end; - errormsg= get_gtid_list_event(&cache, &glev); - end_io_cache(&cache); - mysql_file_close(file, MYF(MY_WME)); - if (unlikely(errormsg)) - goto end; - - if (!glev || contains_all_slave_gtid(state, glev)) + if (res == 0) { - strmake(out_name, buf, FN_REFLEN); - - if (glev) + if (*found_in_index || glev) { uint32 i; + uint32 count; + rpl_gtid *gtids; + if (*found_in_index) + { + count= found_count; + gtids= reader->search_gtid_list(); + /* + Load the initial GTID state corresponding to the position found in + the GTID index, as we will not have a GTID_LIST event to load it + from. + */ + until_binlog_state->load(gtids, count); + } + else + { + count= glev->count; + gtids= glev->list; + } /* As a special case, we allow to start from binlog file N if the requested GTID is the last event (in the corresponding domain) in @@ -1385,9 +1476,9 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name, from the UNTIL hash, to mark that such domains have already reached their UNTIL condition. */ - for (i= 0; i < glev->count; ++i) + for (i= 0; i < count; ++i) { - const rpl_gtid *gtid= state->find(glev->list[i].domain_id); + const rpl_gtid *gtid= state->find(gtids[i].domain_id); if (!gtid) { /* @@ -1400,8 +1491,8 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name, further GTIDs in the Gtid_list. 
*/ DBUG_ASSERT(0); - } else if (gtid->server_id == glev->list[i].server_id && - gtid->seq_no == glev->list[i].seq_no) + } else if (gtid->server_id == gtids[i].server_id && + gtid->seq_no == gtids[i].seq_no) { /* The slave requested to start from the very beginning of this @@ -1412,9 +1503,9 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name, } if (until_gtid_state && - (gtid= until_gtid_state->find(glev->list[i].domain_id)) && - gtid->server_id == glev->list[i].server_id && - gtid->seq_no <= glev->list[i].seq_no) + (gtid= until_gtid_state->find(gtids[i].domain_id)) && + gtid->server_id == gtids[i].server_id && + gtid->seq_no <= gtids[i].seq_no) { /* We've already reached the stop position in UNTIL for this domain, @@ -1427,8 +1518,6 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name, goto end; } - delete glev; - glev= NULL; list= list->next; } @@ -1441,11 +1530,56 @@ end: if (glev) delete glev; + if (reader) + delete reader; + free_root(&memroot, MYF(0)); return errormsg; } +static bool +gtid_index_lookup_pos(const char *name, uint32 offset, uint32 *out_start_seek, + slave_connection_state *out_gtid_state) +{ + Gtid_index_reader_hot *reader= nullptr; + bool opened= false; + bool found= false; + uint32 found_offset, found_gtid_count; + rpl_gtid *found_gtids; + int res; + + if (!(reader= new Gtid_index_reader_hot()) || + reader->open_index_file(name)) + { + statistic_increment(binlog_gtid_index_miss, &LOCK_status); + goto err; + } + opened= true; + res= reader->search_offset(offset, &found_offset, &found_gtid_count); + if (res <= 0) + { + statistic_increment(binlog_gtid_index_miss, &LOCK_status); + goto err; + } + statistic_increment(binlog_gtid_index_hit, &LOCK_status); + + /* We found the position, initialize the state from the index. 
*/ + found_gtids= reader->search_gtid_list(); + if (out_gtid_state->load(found_gtids, found_gtid_count)) + goto err; + *out_start_seek= found_offset; + found= true; + +err: + if (opened) + reader->close_index_file(); + if (reader) + delete reader; + return found; +} + + /* Given an old-style binlog position with file name and file offset, find the corresponding gtid position. If the offset is not at an event boundary, give @@ -1469,8 +1603,22 @@ gtid_state_from_pos(const char *name, uint32 offset, int err; String packet; Format_description_log_event *fdev= NULL; + bool found_in_index; + uint32 UNINIT_VAR(start_seek); + bool seek_done= false; - if (unlikely(gtid_state->load((const rpl_gtid *)NULL, 0))) + /* + Try to lookup the position in the binlog gtid index. If found (as it will + usually be unless the index is corrupted somehow), we can seek directly to + a point at or just before the desired location, saving an expensive scan + of the binlog file from the start. + */ + found_in_index= opt_binlog_gtid_index ? + gtid_index_lookup_pos(name, offset, &start_seek, gtid_state) : + false; + if (found_in_index) + found_gtid_list_event= true; + else if (unlikely(gtid_state->load((const rpl_gtid *)NULL, 0))) { errormsg= "Internal error (out of memory?) initializing slave state " "while scanning binlog to find start position"; @@ -1559,6 +1707,25 @@ gtid_state_from_pos(const char *name, uint32 offset, errormsg= "Could not start decryption of binlog."; goto end; } + if (found_in_index && !seek_done) + { + /* + Just to avoid a redundant event read before hitting the next branch. + ToDo: share this code with the below somehow. + */ + my_b_seek(&cache, start_seek); + seek_done= true; + } + } + else if (found_in_index && !seek_done) + { + /* + After reading the format_description event and possibly + start_encryption, we can seek forward to avoid most or all of the scan + (depending on the sparseness of the index). 
+ */ + my_b_seek(&cache, start_seek); + seek_done= true; } else if (unlikely(typ != FORMAT_DESCRIPTION_EVENT && !found_format_description_event)) @@ -1570,7 +1737,7 @@ gtid_state_from_pos(const char *name, uint32 offset, else if (typ == ROTATE_EVENT || typ == STOP_EVENT || typ == BINLOG_CHECKPOINT_EVENT) continue; /* Continue looking */ - else if (typ == GTID_LIST_EVENT) + else if (typ == GTID_LIST_EVENT && !found_in_index) { rpl_gtid *gtid_list; bool status; @@ -1798,7 +1965,7 @@ send_event_to_slave(binlog_send_info *info, Log_event_type event_type, } }); - if (info->until_binlog_state.update_nolock(&event_gtid, false)) + if (info->until_binlog_state.update_nolock(&event_gtid)) { info->error= ER_MASTER_FATAL_ERROR_READING_BINLOG; return "Failed in internal GTID book-keeping: Out of memory"; @@ -2198,6 +2365,8 @@ static int init_binlog_sender(binlog_send_info *info, char search_file_name[FN_REFLEN]; const char *name=search_file_name; + bool found_in_index= false; + uint32 start_seek= 0; if (info->using_gtid_state) { if (info->gtid_state.load(connect_gtid_state.ptr(), @@ -2223,16 +2392,26 @@ static int init_binlog_sender(binlog_send_info *info, info->error= error; return 1; } - if ((info->errmsg= gtid_find_binlog_file(&info->gtid_state, - search_file_name, - info->until_gtid_state))) + if ((info->errmsg= gtid_find_binlog_pos(&info->gtid_state, + search_file_name, + info->until_gtid_state, + &info->until_binlog_state, + &found_in_index, &start_seek))) { info->error= ER_MASTER_FATAL_ERROR_READING_BINLOG; return 1; } - /* start from beginning of binlog file */ - *pos = 4; + if (found_in_index) + { + /* Start from a position looked up in the binlog gtid index. 
*/ + *pos = start_seek; + } + else + { + /* start from beginning of binlog file */ + *pos = 4; + } } else { @@ -2865,6 +3044,7 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos, ushort flags) { LOG_INFO linfo; + ulong ev_offset; IO_CACHE log; File file = -1; @@ -2990,6 +3170,34 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos, if (info->until_gtid_state && info->until_gtid_state->count() == 0) info->gtid_until_group= GTID_UNTIL_STOP_AFTER_STANDALONE; + if (info->using_gtid_state && pos > BIN_LOG_HEADER_SIZE && + ( info->gtid_state.is_pos_reached() || + info->gtid_until_group == GTID_UNTIL_STOP_AFTER_STANDALONE ) ) + { + /* + We are starting a GTID connect from a point not at the start of the + binlog file (from a GTID index lookup). Send a fake GTID_LIST event + in place of the real GTID_LIST that would normally be sent from the + start of the binlog file. + + If we already reached the gtid UNTIL position, then set the + FLAG_UNTIL_REACHED in the GTID_LIST event and stop immediately. 
+ */ + uint32 flag= 0; + if (info->gtid_until_group == GTID_UNTIL_STOP_AFTER_STANDALONE) + { + flag= Gtid_list_log_event::FLAG_UNTIL_REACHED; + info->should_stop= true; + } + Gtid_list_log_event glev(&info->until_binlog_state, flag); + if (reset_transmit_packet(info, info->flags, &ev_offset, &info->errmsg) || + fake_gtid_list_event(info, &glev, &info->errmsg, (int32)pos)) + { + info->error= ER_MASTER_FATAL_ERROR_READING_BINLOG; + goto err; + } + } + THD_STAGE_INFO(thd, stage_sending_binlog_event_to_slave); if (send_one_binlog_file(info, &log, &linfo, pos)) break; diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc index ffcfc90aca9..6065bf2dd0a 100644 --- a/sql/sys_vars.cc +++ b/sql/sys_vars.cc @@ -6844,6 +6844,36 @@ Sys_binlog_row_metadata( ON_UPDATE(NULL)); +static Sys_var_on_access_global +Sys_binlog_gtid_index( + "binlog_gtid_index", + "Enable the creation of a GTID index for every binlog file, and the use " + "of such index for speeding up GTID lookup in the binlog.", + GLOBAL_VAR(opt_binlog_gtid_index), CMD_LINE(OPT_ARG), + DEFAULT(TRUE)); + + +static Sys_var_on_access_global +Sys_binlog_gtid_index_page_size( + "binlog_gtid_index_page_size", + "Page size to use for the binlog GTID index.", + GLOBAL_VAR(opt_binlog_gtid_index_page_size), CMD_LINE(REQUIRED_ARG), + VALID_RANGE(64, 1<<24), DEFAULT(4096), BLOCK_SIZE(1)); + + +static Sys_var_on_access_global +Sys_binlog_gtid_index_span_min( + "binlog_gtid_index_span_min", + "Control sparseness of the binlog GTID index. If set to N, at most one " + "index record will be added for every N bytes of binlog file written, " + "to reduce the size of the index. Normally does not need tuning.", + GLOBAL_VAR(opt_binlog_gtid_index_span_min), CMD_LINE(REQUIRED_ARG), + VALID_RANGE(1, 1024*1024L*1024L), DEFAULT(65536), BLOCK_SIZE(1)); + + static bool check_pseudo_slave_mode(sys_var *self, THD *thd, set_var *var) { longlong previous_val= thd->variables.pseudo_slave_mode;