diff --git a/mysql-test/suite/rpl/r/rpl_parallel_deadlock_victim.result b/mysql-test/suite/rpl/r/rpl_parallel_deadlock_victim.result new file mode 100644 index 00000000000..f438f335283 --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_parallel_deadlock_victim.result @@ -0,0 +1,51 @@ +include/master-slave.inc +[connection master] +MDEV-31655: Parallel replication deadlock victim preference code erroneously removed +connection server_1; +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB; +BEGIN; +COMMIT; +include/save_master_gtid.inc +connection server_2; +include/sync_with_master_gtid.inc +include/stop_slave.inc +SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads; +SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode; +set @@global.slave_parallel_threads= 5; +set @@global.slave_parallel_mode= conservative; +SET @old_dbug= @@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug= "+d,rpl_mdev31655_zero_retries"; +connection server_1; +SET @old_dbug= @@SESSION.debug_dbug; +SET SESSION debug_dbug="+d,binlog_force_commit_id"; +SET @commit_id= 1+1000; +SET @commit_id= 2+1000; +SET @commit_id= 3+1000; +SET @commit_id= 4+1000; +SET @commit_id= 5+1000; +SET @commit_id= 6+1000; +SET @commit_id= 7+1000; +SET @commit_id= 8+1000; +SET @commit_id= 9+1000; +SET @commit_id= 10+1000; +SET SESSION debug_dbug= @old_dbug; +SELECT COUNT(*), SUM(a*100*b) FROM t1; +COUNT(*) SUM(a*100*b) +10 225000 +include/save_master_gtid.inc +connection server_2; +include/start_slave.inc +include/sync_with_master_gtid.inc +SET GLOBAL debug_dbug= @old_dbug; +SELECT COUNT(*), SUM(a*100*b) FROM t1; +COUNT(*) SUM(a*100*b) +10 225000 +connection server_2; +include/stop_slave.inc +SET GLOBAL slave_parallel_threads=@old_parallel_threads; +SET GLOBAL slave_parallel_mode=@old_parallel_mode; +include/start_slave.inc +connection server_1; +DROP TABLE t1; +include/rpl_end.inc diff --git a/mysql-test/suite/rpl/r/rpl_parallel_deadlock_victim2.result b/mysql-test/suite/rpl/r/rpl_parallel_deadlock_victim2.result new file mode 100644 index 00000000000..c26944b3321 --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_parallel_deadlock_victim2.result @@ -0,0 +1,50 @@ +include/master-slave.inc +[connection master] +connection master; +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CREATE TABLE t1(a INT) ENGINE=INNODB; +INSERT INTO t1 VALUES(1); +include/save_master_gtid.inc +connection slave; +include/sync_with_master_gtid.inc +include/stop_slave.inc +set @@global.slave_parallel_threads= 2; +set @@global.slave_parallel_mode= OPTIMISTIC; +set @@global.slave_transaction_retries= 2; +*** MDEV-28776: rpl.rpl_mark_optimize_tbl_ddl fails with timeout on sync_with_master +connection master; +SET @@gtid_seq_no= 100; +INSERT INTO t1 SELECT 1+a FROM t1; +SET @@gtid_seq_no= 200; +INSERT INTO t1 SELECT 2+a FROM t1; +SELECT * FROM t1 ORDER BY a; +a +1 +2 +3 +4 +include/save_master_gtid.inc +connection slave; +SET @save_dbug= @@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug="+d,rpl_parallel_delay_gtid_0_x_100_start"; +SET GLOBAL debug_dbug="+d,rpl_write_record_small_sleep_gtid_100_200"; +SET GLOBAL debug_dbug="+d,small_sleep_after_lock_wait"; +SET GLOBAL debug_dbug="+d,rpl_delay_deadlock_kill"; +include/start_slave.inc +include/sync_with_master_gtid.inc +SET GLOBAL debug_dbug= @save_dbug; +SELECT * FROM t1 ORDER BY a; +a +1 +2 +3 +4 +connection slave; +include/stop_slave.inc +SET @@global.slave_parallel_threads= 0; +SET @@global.slave_parallel_mode= optimistic; +SET @@global.slave_transaction_retries= 10; +include/start_slave.inc +connection master; +DROP TABLE t1; +include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_parallel_deadlock_victim.test b/mysql-test/suite/rpl/t/rpl_parallel_deadlock_victim.test new file mode 100644 index 00000000000..ab634d2953e --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_parallel_deadlock_victim.test @@ -0,0 +1,86 @@ +--source include/have_innodb.inc +--source include/have_debug.inc +--source include/master-slave.inc + + +--echo MDEV-31655: Parallel replication deadlock victim preference code erroneously removed +# The problem was that InnoDB would choose the wrong deadlock victim. +# Create a lot of transactions that can cause deadlocks, and use error +# injection to check that the first transactions in each group is never +# selected as deadlock victim. +--let $rows= 10 +--let $transactions= 5 +--let $gcos= 10 + +--connection server_1 +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB; +BEGIN; +--disable_query_log +--let $i= 0 +while ($i < 10) { + eval INSERT INTO t1 VALUES ($i, 0); + inc $i; +} +--enable_query_log +COMMIT; +--source include/save_master_gtid.inc + +--connection server_2 +--source include/sync_with_master_gtid.inc +--source include/stop_slave.inc +SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads; +SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode; +eval set @@global.slave_parallel_threads= $transactions; +set @@global.slave_parallel_mode= conservative; +SET @old_dbug= @@GLOBAL.debug_dbug; +# This error injection will allow no retries for GTIDs divisible by 1000. +SET GLOBAL debug_dbug= "+d,rpl_mdev31655_zero_retries"; + +--connection server_1 +SET @old_dbug= @@SESSION.debug_dbug; +SET SESSION debug_dbug="+d,binlog_force_commit_id"; + +--let $j= 1 +while ($j <= $gcos) { + eval SET @commit_id= $j+1000; + --let $i= 0 + while ($i < $transactions) { + --disable_query_log + eval SET SESSION gtid_seq_no= 1000 + 1000*$j + $i; + BEGIN; + --let $k= 0 + while ($k < $rows) { + eval UPDATE t1 SET b=b+1 WHERE a=(($i+$k) MOD $rows); + inc $k; + } + COMMIT; + --enable_query_log + inc $i; + } + inc $j; +} + +SET SESSION debug_dbug= @old_dbug; +SELECT COUNT(*), SUM(a*100*b) FROM t1; + +--source include/save_master_gtid.inc + +--connection server_2 +--source include/start_slave.inc +--source include/sync_with_master_gtid.inc +SET GLOBAL debug_dbug= @old_dbug; +SELECT COUNT(*), SUM(a*100*b) FROM t1; + + +# Clean up. +--connection server_2 +--source include/stop_slave.inc +SET GLOBAL slave_parallel_threads=@old_parallel_threads; +SET GLOBAL slave_parallel_mode=@old_parallel_mode; +--source include/start_slave.inc + +--connection server_1 +DROP TABLE t1; + +--source include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_parallel_deadlock_victim2.test b/mysql-test/suite/rpl/t/rpl_parallel_deadlock_victim2.test new file mode 100644 index 00000000000..522cec18bbc --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_parallel_deadlock_victim2.test @@ -0,0 +1,83 @@ +--source include/master-slave.inc +--source include/have_innodb.inc +--source include/have_debug.inc +--source include/have_binlog_format_statement.inc + +--connection master +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CREATE TABLE t1(a INT) ENGINE=INNODB; +INSERT INTO t1 VALUES(1); +--source include/save_master_gtid.inc + +--connection slave +--source include/sync_with_master_gtid.inc +--source include/stop_slave.inc +--let $save_transaction_retries= `SELECT @@global.slave_transaction_retries` +--let $save_slave_parallel_threads= `SELECT @@global.slave_parallel_threads` +--let $save_slave_parallel_mode= `SELECT @@global.slave_parallel_mode` +set @@global.slave_parallel_threads= 2; +set @@global.slave_parallel_mode= OPTIMISTIC; +set @@global.slave_transaction_retries= 2; + +--echo *** MDEV-28776: rpl.rpl_mark_optimize_tbl_ddl fails with timeout on sync_with_master +# This was a failure where a transaction T1 could deadlock multiple times +# with T2, eventually exceeding the default --slave-transaction-retries=10. +# Root cause was MDEV-31655, causing InnoDB to wrongly choose T1 as deadlock +# victim over T2. If thread scheduling is right, it was possible for T1 to +# repeatedly deadlock, roll back, and have time to grab an S lock again before +# T2 woke up and got its waiting X lock, thus repeating the same deadlock over +# and over. +# Once the bug is fixed, it is not possible to re-create the same execution +# and thread scheduling. Instead we inject small sleeps in a way that +# triggered the problem when the bug was there, to demonstrate that the +# problem no longer occurs. + +--connection master +# T1 +SET @@gtid_seq_no= 100; +INSERT INTO t1 SELECT 1+a FROM t1; +# T2 +SET @@gtid_seq_no= 200; +INSERT INTO t1 SELECT 2+a FROM t1; + +SELECT * FROM t1 ORDER BY a; +--source include/save_master_gtid.inc + +--connection slave +SET @save_dbug= @@GLOBAL.debug_dbug; + +# Inject various delays to hint thread scheduling to happen in the way that +# triggered MDEV-28776. + +# Small delay starting T1 so it will be the youngest trx and be chosen over +# T2 as the deadlock victim by default in InnoDB. +SET GLOBAL debug_dbug="+d,rpl_parallel_delay_gtid_0_x_100_start"; + +# Small delay before taking insert X lock to give time for both T1 and T2 to +# get the S lock first and cause a deadlock. +SET GLOBAL debug_dbug="+d,rpl_write_record_small_sleep_gtid_100_200"; + +# Small delay after T2's wait on the X lock, to give time for T1 retry to +# re-aquire the T1 S lock first. +SET GLOBAL debug_dbug="+d,small_sleep_after_lock_wait"; + +# Delay deadlock kill of T2. +SET GLOBAL debug_dbug="+d,rpl_delay_deadlock_kill"; + +--source include/start_slave.inc +--source include/sync_with_master_gtid.inc +SET GLOBAL debug_dbug= @save_dbug; +SELECT * FROM t1 ORDER BY a; + +# Cleanup. +--connection slave +--source include/stop_slave.inc +eval SET @@global.slave_parallel_threads= $save_slave_parallel_threads; +eval SET @@global.slave_parallel_mode= $save_slave_parallel_mode; +eval SET @@global.slave_transaction_retries= $save_transaction_retries; +--source include/start_slave.inc + +--connection master +DROP TABLE t1; + +--source include/rpl_end.inc diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 8034fe7718a..c044defd000 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -1301,6 +1301,11 @@ handle_rpl_parallel_thread(void *arg) bool did_enter_cond= false; PSI_stage_info old_stage; + DBUG_EXECUTE_IF("rpl_parallel_delay_gtid_0_x_100_start", { + if (rgi->current_gtid.domain_id==0 && + rgi->current_gtid.seq_no == 100) + my_sleep(10000); + }); #ifdef ENABLED_DEBUG_SYNC DBUG_EXECUTE_IF("hold_worker_on_schedule", { if (rgi->current_gtid.domain_id == 0 && @@ -1480,8 +1485,13 @@ handle_rpl_parallel_thread(void *arg) err= dbug_simulate_tmp_error(rgi, thd);); if (unlikely(err)) { + ulong max_retries= slave_trans_retries; convert_kill_to_deadlock_error(rgi); - if (has_temporary_error(thd) && slave_trans_retries > 0) + DBUG_EXECUTE_IF("rpl_mdev31655_zero_retries", + if ((rgi->current_gtid.seq_no % 1000) == 0) + max_retries= 0; + ); + if (has_temporary_error(thd) && max_retries > 0) err= retry_event_group(rgi, rpt, qev); } } diff --git a/sql/slave.cc b/sql/slave.cc index 4b24d281d5d..5ad1a446ae6 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -506,6 +506,7 @@ static void bg_rpl_load_gtid_slave_state(void *) static void bg_slave_kill(void *victim) { THD *to_kill= (THD *)victim; + DBUG_EXECUTE_IF("rpl_delay_deadlock_kill", my_sleep(1500000);); to_kill->awake(KILL_CONNECTION); mysql_mutex_lock(&to_kill->LOCK_wakeup_ready); to_kill->rgi_slave->killed_for_retry= rpl_group_info::RETRY_KILL_KILLED; diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 8e2f34ac53d..0c929837957 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -5516,6 +5516,49 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd) return 0; } + +/* + If the storage engine detects a deadlock, and needs to choose a victim + transaction to roll back, it can call this function to ask the upper + server layer for which of two possible transactions is prefered to be + aborted and rolled back. + + In parallel replication, if two transactions are running in parallel and + one is fixed to commit before the other, then the one that commits later + will be prefered as the victim - chosing the early transaction as a victim + will not resolve the deadlock anyway, as the later transaction still needs + to wait for the earlier to commit. + + The return value is -1 if the first transaction is prefered as a deadlock + victim, 1 if the second transaction is prefered, or 0 for no preference (in + which case the storage engine can make the choice as it prefers). +*/ +extern "C" int +thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2) +{ + rpl_group_info *rgi1, *rgi2; + + if (!thd1 || !thd2) + return 0; + + /* + If the transactions are participating in the same replication domain in + parallel replication, then request to select the one that will commit + later (in the fixed commit order from the master) as the deadlock victim. + */ + rgi1= thd1->rgi_slave; + rgi2= thd2->rgi_slave; + if (rgi1 && rgi2 && + rgi1->is_parallel_exec && + rgi1->rli == rgi2->rli && + rgi1->current_gtid.domain_id == rgi2->current_gtid.domain_id) + return rgi1->gtid_sub_id < rgi2->gtid_sub_id ? 1 : -1; + + /* No preferences, let the storage engine decide. */ + return 0; +} + + extern "C" int thd_non_transactional_update(const MYSQL_THD thd) { return(thd->transaction->all.modified_non_trans_table); diff --git a/sql/sql_insert.cc b/sql/sql_insert.cc index 3ca8fc3303b..132be65a848 100644 --- a/sql/sql_insert.cc +++ b/sql/sql_insert.cc @@ -80,6 +80,7 @@ #include "debug_sync.h" // DEBUG_SYNC #include "debug.h" // debug_crash_here #include +#include "rpl_rli.h" #ifdef WITH_WSREP #include "wsrep_trans_observer.h" /* wsrep_start_transction() */ @@ -1828,6 +1829,12 @@ int write_record(THD *thd, TABLE *table, COPY_INFO *info, select_result *sink) save_read_set= table->read_set; save_write_set= table->write_set; + DBUG_EXECUTE_IF("rpl_write_record_small_sleep_gtid_100_200", + { + if (thd->rgi_slave && (thd->rgi_slave->current_gtid.seq_no == 100 || + thd->rgi_slave->current_gtid.seq_no == 200)) + my_sleep(20000); + }); if (info->handle_duplicates == DUP_REPLACE || info->handle_duplicates == DUP_UPDATE) { diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 0bb14f5fcf7..a794e2bbf03 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -63,6 +63,7 @@ ulong innodb_deadlock_report; extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd); extern "C" int thd_need_wait_reports(const MYSQL_THD thd); extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd); +extern "C" int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2); #endif /** Functor for accessing the embedded node within a table lock. */ @@ -1958,6 +1959,14 @@ check_trx_error: end_wait: mysql_mutex_unlock(&lock_sys.wait_mutex); + DBUG_EXECUTE_IF("small_sleep_after_lock_wait", + { + if (!(type_mode & LOCK_TABLE) && + (type_mode & LOCK_MODE_MASK) == LOCK_X && + trx->error_state != DB_DEADLOCK && !trx_is_interrupted(trx)) { + my_sleep(20000); + } + }); thd_wait_end(trx->mysql_thd); return trx->error_state; @@ -6309,6 +6318,28 @@ namespace Deadlock } } + ATTRIBUTE_COLD + /** Calculate a number used to compare deadlock victim candidates. +Bit 62 is used to prefer transaction that did not modified non-transactional +tables. Bits 1-61 are set to TRX_WEIGHT to prefer transactions with less locks +and less modified rows. Bit 0 is used to prefer orig_trx in case of a tie. + @param trx Transaction + @return a 64-bit unsigned, the lower the more preferred TRX is as a deadlock + victim */ + static undo_no_t calc_victim_weight(trx_t *trx, const trx_t *orig_trx) + { + const undo_no_t trx_weight= (trx != orig_trx) | (TRX_WEIGHT(trx) << 1) | + (trx->mysql_thd && +#ifdef WITH_WSREP + (thd_has_edited_nontrans_tables(trx->mysql_thd) || + (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))) +#else + thd_has_edited_nontrans_tables(trx->mysql_thd) +#endif /* WITH_WSREP */ + ? 1ULL << 62 : 0); + return trx_weight; + } + ATTRIBUTE_COLD /** Report a deadlock (cycle in the waits-for graph). @param trx transaction waiting for a lock in this thread @@ -6332,24 +6363,7 @@ namespace Deadlock static const char rollback_msg[]= "*** WE ROLL BACK TRANSACTION (%u)\n"; char buf[9 + sizeof rollback_msg]; - - /* If current_trx=true, trx is owned by this thread, and we can - safely invoke these without holding trx->mutex or lock_sys.latch. - If current_trx=false, a concurrent commit is protected by both - lock_sys.latch and lock_sys.wait_mutex. */ - const undo_no_t trx_weight= TRX_WEIGHT(trx) | - (trx->mysql_thd && -#ifdef WITH_WSREP - (thd_has_edited_nontrans_tables(trx->mysql_thd) || - (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))) -#else - thd_has_edited_nontrans_tables(trx->mysql_thd) -#endif /* WITH_WSREP */ - ? 1ULL << 63 : 0); - trx_t *victim= nullptr; - undo_no_t victim_weight= ~0ULL; - unsigned victim_pos= 0, trx_pos= 0; /* Here, lock elision does not make sense, because for the output we are going to invoke system calls, @@ -6362,43 +6376,52 @@ namespace Deadlock } { - unsigned l= 0; + unsigned l= 1; /* Now that we are holding lock_sys.wait_mutex again, check whether a cycle still exists. */ trx_t *cycle= find_cycle(trx); if (!cycle) goto func_exit; /* One of the transactions was already aborted. */ + + victim= cycle; + undo_no_t victim_weight= calc_victim_weight(victim, trx); + unsigned victim_pos= l; for (trx_t *next= cycle;;) { next= next->lock.wait_trx; l++; - const undo_no_t next_weight= TRX_WEIGHT(next) | - (next->mysql_thd && -#ifdef WITH_WSREP - (thd_has_edited_nontrans_tables(next->mysql_thd) || - (next->is_wsrep() && wsrep_thd_is_BF(next->mysql_thd, false))) + const undo_no_t next_weight= calc_victim_weight(next, trx); +#ifdef HAVE_REPLICATION + const int pref= + thd_deadlock_victim_preference(victim->mysql_thd, next->mysql_thd); + /* Set bit 63 for any non-preferred victim to make such preference take + priority in the weight comparison. + -1 means victim is preferred. 1 means next is preferred. */ + undo_no_t victim_not_pref= (1ULL << 63) & (undo_no_t)(int64_t)(-pref); + undo_no_t next_not_pref= (1ULL << 63) & (undo_no_t)(int64_t)pref; #else - thd_has_edited_nontrans_tables(next->mysql_thd) -#endif /* WITH_WSREP */ - ? 1ULL << 63 : 0); - if (next_weight < victim_weight) + undo_no_t victim_not_pref= 0; + undo_no_t next_not_pref= 0; +#endif + /* Single comparison to decide which of two transactions is preferred + as a deadlock victim. + - If thd_deadlock_victim_preference() returned non-zero, bit 63 + comparison will decide the preferred one. + - Else if exactly one of them modified non-transactional tables, + bit 62 will decide. + - Else the TRX_WEIGHT in bits 1-61 will decide, if not equal. + - Else, if one of them is the original trx, bit 0 will decide. + - If all is equal, previous victim will arbitrarily be chosen. */ + if ((next_weight|next_not_pref) < (victim_weight|victim_not_pref)) { victim_weight= next_weight; victim= next; victim_pos= l; } - if (next == victim) - trx_pos= l; if (next == cycle) break; } - if (trx_pos && trx_weight == victim_weight) - { - victim= trx; - victim_pos= trx_pos; - } - /* Finally, display the deadlock */ switch (const auto r= static_cast(innodb_deadlock_report)) { case REPORT_OFF: