From b89de2b2ce84f41351a66d5832c3967b80f9dc31 Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Tue, 23 Jun 2015 14:10:14 +0200 Subject: [PATCH] MDEV-8354: out-of-order error with --gtid-ignore-duplicates and row-based replication The --gtid-ignore-duplicates option was not working correctly with row-based replication. When a row event was completed, but before committing, there was a small window where another multi-source SQL thread could wrongly try to re-execute the same transaction, without properly ignoring the duplicate GTID. This would lead to duplicate key error or out-of-order GTID error or similar. Thanks to Matt Neth for reporting this and giving an easy way to reproduce the issue. --- .../multi_source/gtid_ignore_duplicates.cnf | 4 + .../gtid_ignore_duplicates.result | 139 ++++++++++++++++++ .../multi_source/gtid_ignore_duplicates.test | 108 ++++++++++++++ sql/rpl_rli.cc | 19 ++- 4 files changed, 263 insertions(+), 7 deletions(-) diff --git a/mysql-test/suite/multi_source/gtid_ignore_duplicates.cnf b/mysql-test/suite/multi_source/gtid_ignore_duplicates.cnf index b47ebb2cf30..ba1ffee4d9d 100644 --- a/mysql-test/suite/multi_source/gtid_ignore_duplicates.cnf +++ b/mysql-test/suite/multi_source/gtid_ignore_duplicates.cnf @@ -3,21 +3,25 @@ [mysqld.1] log-slave-updates loose-innodb +binlog-format=mixed [mysqld.2] log-slave-updates loose-innodb +binlog-format=mixed [mysqld.3] log-bin=server3-bin log-slave-updates loose-innodb +binlog-format=mixed [mysqld.4] server-id=4 log-bin=server4-bin log-slave-updates loose-innodb +binlog-format=mixed [ENV] SERVER_MYPORT_4= @mysqld.4.port diff --git a/mysql-test/suite/multi_source/gtid_ignore_duplicates.result b/mysql-test/suite/multi_source/gtid_ignore_duplicates.result index 5426091b635..a43eea47ded 100644 --- a/mysql-test/suite/multi_source/gtid_ignore_duplicates.result +++ b/mysql-test/suite/multi_source/gtid_ignore_duplicates.result @@ -242,6 +242,145 @@ a 24 25 26 +*** MDEV-8354: out-of-order error with 
--gtid-ignore-duplicates and row-based replication *** +SET default_master_connection = "b2a"; +STOP SLAVE; +include/wait_for_slave_to_stop.inc +SET default_master_connection = "c2a"; +STOP SLAVE; +include/wait_for_slave_to_stop.inc +SET default_master_connection = "c2b"; +STOP SLAVE; +include/wait_for_slave_to_stop.inc +SET default_master_connection = "b2c"; +STOP SLAVE; +include/wait_for_slave_to_stop.inc +SET @old_slave_mode=@@GLOBAL.slave_exec_mode; +SET GLOBAL slave_exec_mode=IDEMPOTENT; +SET @old_strict=@@GLOBAL.gtid_strict_mode; +SET GLOBAL gtid_strict_mode=1; +SET @old_dbug=@@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug="+d,inject_sleep_gtid_100_x_x"; +SET @old_domain=@@SESSION.gtid_domain_id; +SET @old_format=@@SESSION.binlog_format; +SET SESSION gtid_domain_id=100; +SET SESSION binlog_format='row'; +INSERT INTO t1 VALUES (30); +INSERT INTO t1 VALUES (31); +INSERT INTO t1 VALUES (32); +INSERT INTO t1 VALUES (33); +INSERT INTO t1 VALUES (34); +INSERT INTO t1 VALUES (35); +INSERT INTO t1 VALUES (36); +INSERT INTO t1 VALUES (37); +INSERT INTO t1 VALUES (38); +INSERT INTO t1 VALUES (39); +INSERT INTO t1 VALUES (40); +INSERT INTO t1 VALUES (41); +INSERT INTO t1 VALUES (42); +INSERT INTO t1 VALUES (43); +INSERT INTO t1 VALUES (44); +INSERT INTO t1 VALUES (45); +INSERT INTO t1 VALUES (46); +INSERT INTO t1 VALUES (47); +INSERT INTO t1 VALUES (48); +INSERT INTO t1 VALUES (49); +SET SESSION gtid_domain_id=@old_domain; +SET SESSION binlog_format=@old_format; +include/save_master_gtid.inc +include/sync_with_master_gtid.inc +INSERT INTO t1 VALUES (50); +include/save_master_gtid.inc +SET default_master_connection = "b2c"; +START SLAVE; +include/wait_for_slave_to_start.inc +SELECT MASTER_GTID_WAIT("GTID", 30); +MASTER_GTID_WAIT("GTID", 30) +0 +SET default_master_connection = "b2a"; +START SLAVE; +include/wait_for_slave_to_start.inc +SET default_master_connection = "c2a"; +START SLAVE; +include/wait_for_slave_to_start.inc +include/sync_with_master_gtid.inc +SELECT * FROM t1 
WHERE a >= 30 ORDER BY a; +a +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +SET default_master_connection = "c2b"; +START SLAVE; +include/wait_for_slave_to_start.inc +include/sync_with_master_gtid.inc +SELECT * FROM t1 WHERE a >= 30 ORDER BY a; +a +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +include/sync_with_master_gtid.inc +SET GLOBAL debug_dbug=@old_dbug; +SELECT * FROM t1 WHERE a >= 30 ORDER BY a; +a +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +SET GLOBAL slave_exec_mode=@old_slave_mode; +SET GLOBAL gtid_strict_mode=@old_strict; SET GLOBAL gtid_domain_id=0; STOP ALL SLAVES; Warnings: diff --git a/mysql-test/suite/multi_source/gtid_ignore_duplicates.test b/mysql-test/suite/multi_source/gtid_ignore_duplicates.test index cf1c750fc19..b2c42e03335 100644 --- a/mysql-test/suite/multi_source/gtid_ignore_duplicates.test +++ b/mysql-test/suite/multi_source/gtid_ignore_duplicates.test @@ -1,5 +1,7 @@ --source include/not_embedded.inc --source include/have_innodb.inc +--source include/have_debug.inc + --echo *** Test all-to-all replication with --gtid-ignore-duplicates *** @@ -258,6 +260,112 @@ SELECT * FROM t1 WHERE a >= 20 ORDER BY a; SELECT * FROM t1 WHERE a >= 20 ORDER BY a; +--echo *** MDEV-8354: out-of-order error with --gtid-ignore-duplicates and row-based replication *** + +# Have only A->C A->B initially. 
+--connection server_1 +SET default_master_connection = "b2a"; +STOP SLAVE; +--source include/wait_for_slave_to_stop.inc +SET default_master_connection = "c2a"; +STOP SLAVE; +--source include/wait_for_slave_to_stop.inc + +--connection server_2 +SET default_master_connection = "c2b"; +STOP SLAVE; +--source include/wait_for_slave_to_stop.inc + +--connection server_3 +SET default_master_connection = "b2c"; +STOP SLAVE; +--source include/wait_for_slave_to_stop.inc +SET @old_slave_mode=@@GLOBAL.slave_exec_mode; +SET GLOBAL slave_exec_mode=IDEMPOTENT; +SET @old_strict=@@GLOBAL.gtid_strict_mode; +SET GLOBAL gtid_strict_mode=1; + +SET @old_dbug=@@GLOBAL.debug_dbug; +# This will inject a small sleep that helps trigger the race. I did not manage +# to create a non-sleeping version with debug_sync for this; the problem is +# that once the bug is fixed, the race becomes impossible, so even with +# debug_sync at best we can check that the debug_sync times out. Which is +# just another way of adding a sleep. +# +# The bug was a race at this point where another multi-source connection +# could incorrectly re-apply the same GTID, in case of row-based replication. 
+SET GLOBAL debug_dbug="+d,inject_sleep_gtid_100_x_x"; + +--connection server_1 +SET @old_domain=@@SESSION.gtid_domain_id; +SET @old_format=@@SESSION.binlog_format; +SET SESSION gtid_domain_id=100; +SET SESSION binlog_format='row'; +INSERT INTO t1 VALUES (30); +INSERT INTO t1 VALUES (31); +INSERT INTO t1 VALUES (32); +INSERT INTO t1 VALUES (33); +INSERT INTO t1 VALUES (34); +INSERT INTO t1 VALUES (35); +INSERT INTO t1 VALUES (36); +INSERT INTO t1 VALUES (37); +INSERT INTO t1 VALUES (38); +INSERT INTO t1 VALUES (39); +INSERT INTO t1 VALUES (40); +INSERT INTO t1 VALUES (41); +INSERT INTO t1 VALUES (42); +INSERT INTO t1 VALUES (43); +INSERT INTO t1 VALUES (44); +INSERT INTO t1 VALUES (45); +INSERT INTO t1 VALUES (46); +INSERT INTO t1 VALUES (47); +INSERT INTO t1 VALUES (48); +INSERT INTO t1 VALUES (49); +SET SESSION gtid_domain_id=@old_domain; +SET SESSION binlog_format=@old_format; +--source include/save_master_gtid.inc + +--connection server_2 +--source include/sync_with_master_gtid.inc +INSERT INTO t1 VALUES (50); +--let $gtid=`SELECT @@last_gtid` +--source include/save_master_gtid.inc + +--connection server_3 +SET default_master_connection = "b2c"; +START SLAVE; +--source include/wait_for_slave_to_start.inc +--replace_result $gtid GTID +eval SELECT MASTER_GTID_WAIT("$gtid", 30); +# The bug occurred here, the slave would get an out-of-order binlog error +# due to trying to re-apply the 100-x-x transaction. + +# Restart stopped multi-source connections, and sync up. 
+--connection server_1 +SET default_master_connection = "b2a"; +START SLAVE; +--source include/wait_for_slave_to_start.inc +SET default_master_connection = "c2a"; +START SLAVE; +--source include/wait_for_slave_to_start.inc +--source include/sync_with_master_gtid.inc +SELECT * FROM t1 WHERE a >= 30 ORDER BY a; + +--connection server_2 +SET default_master_connection = "c2b"; +START SLAVE; +--source include/wait_for_slave_to_start.inc +--source include/sync_with_master_gtid.inc +SELECT * FROM t1 WHERE a >= 30 ORDER BY a; + +--connection server_3 +--source include/sync_with_master_gtid.inc +SET GLOBAL debug_dbug=@old_dbug; +SELECT * FROM t1 WHERE a >= 30 ORDER BY a; +SET GLOBAL slave_exec_mode=@old_slave_mode; +SET GLOBAL gtid_strict_mode=@old_strict; + + # Clean up. --connection server_1 SET GLOBAL gtid_domain_id=0; diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc index a324c3c30da..9ed388265be 100644 --- a/sql/rpl_rli.cc +++ b/sql/rpl_rli.cc @@ -1788,6 +1788,13 @@ void rpl_group_info::cleanup_context(THD *thd, bool error) rli->clear_flag(Relay_log_info::IN_STMT); rli->clear_flag(Relay_log_info::IN_TRANSACTION); } + + /* + Ensure we always release the domain for others to process, when using + --gtid-ignore-duplicates. + */ + if (gtid_ignore_duplicate_state != GTID_DUPLICATE_NULL) + rpl_global_gtid_slave_state.release_domain_owner(this); } /* @@ -1796,13 +1803,6 @@ void rpl_group_info::cleanup_context(THD *thd, bool error) thd->variables.option_bits&= ~OPTION_NO_FOREIGN_KEY_CHECKS; thd->variables.option_bits&= ~OPTION_RELAXED_UNIQUE_CHECKS; - /* - Ensure we always release the domain for others to process, when using - --gtid-ignore-duplicates. 
- */ - if (gtid_ignore_duplicate_state != GTID_DUPLICATE_NULL) - rpl_global_gtid_slave_state.release_domain_owner(this); - /* Reset state related to long_find_row notes in the error log: - timestamp @@ -1811,6 +1811,11 @@ void rpl_group_info::cleanup_context(THD *thd, bool error) reset_row_stmt_start_timestamp(); unset_long_find_row_note_printed(); + DBUG_EXECUTE_IF("inject_sleep_gtid_100_x_x", { + if (current_gtid.domain_id == 100) + my_sleep(50000); + };); + DBUG_VOID_RETURN; }