From 11a1c83c03285ccf6e28a7849f272323a493424b Mon Sep 17 00:00:00 2001 From: "guilhem@mysql.com" <> Date: Wed, 2 Mar 2005 11:29:48 +0100 Subject: [PATCH 1/2] Fix for BUG#8325 "Deadlock in replication thread stops replication": in slave SQL thread: if a transaction fails because of InnoDB deadlock or innodb_lock_wait_timeout exceeded, optionally retry the transaction a certain number of times (new variable --slave_transaction_retries). --- mysql-test/r/rpl_deadlock.result | 81 +++++++++++++++++++++ mysql-test/t/rpl_deadlock-slave.opt | 1 + mysql-test/t/rpl_deadlock.test | 107 ++++++++++++++++++++++++++++ sql/mysql_priv.h | 2 +- sql/mysqld.cc | 12 +++- sql/set_var.cc | 4 ++ sql/slave.cc | 57 +++++++++++++++ sql/slave.h | 4 +- 8 files changed, 263 insertions(+), 5 deletions(-) create mode 100644 mysql-test/r/rpl_deadlock.result create mode 100644 mysql-test/t/rpl_deadlock-slave.opt create mode 100644 mysql-test/t/rpl_deadlock.test diff --git a/mysql-test/r/rpl_deadlock.result b/mysql-test/r/rpl_deadlock.result new file mode 100644 index 00000000000..366d18f3e05 --- /dev/null +++ b/mysql-test/r/rpl_deadlock.result @@ -0,0 +1,81 @@ +stop slave; +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; +reset master; +reset slave; +drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9; +start slave; +create table t1 (a int not null, key(a)) engine=innodb; +create table t2 (a int not null, key(a)) engine=innodb; +create table t3 (a int) engine=innodb; +create table t4 (a int) engine=innodb; +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) NOT NULL default '0', + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 +show create table t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) NOT NULL default '0', + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 +stop slave; +begin; +insert into t3 select * from t2 for update; +insert into t1 values(1); +commit; +begin; +select * from t1 for update; +a +start slave; +insert into t2 values(22); +commit; +select * from t1; +a +1 +select * from t2; +a +22 +show slave status; +Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master +# 127.0.0.1 root MASTER_MYPORT 1 master-bin.000001 13110 # # master-bin.000001 Yes Yes 0 0 13110 # None 0 No # +stop slave; +change master to master_log_pos=401; +begin; +select * from t2 for update; +a +22 +start slave; +commit; +select * from t1; +a +1 +1 +select * from t2; +a +22 +show slave status; +Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master +# 127.0.0.1 root MASTER_MYPORT 1 master-bin.000001 13110 # # master-bin.000001 Yes Yes 0 0 13110 # None 0 No # +set global max_relay_log_size=0; +stop slave; +change master to master_log_pos=401; +begin; +select * from t2 for update; +a +22 +start slave; +commit; +select * from t1; +a +1 +1 +1 +select * from t2; +a +22 +show slave status; +Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master +# 127.0.0.1 root MASTER_MYPORT 1 master-bin.000001 13110 # # master-bin.000001 Yes Yes 0 0 13110 # None 0 No # +drop table t1,t2; diff --git a/mysql-test/t/rpl_deadlock-slave.opt b/mysql-test/t/rpl_deadlock-slave.opt new file mode 100644 index 00000000000..f4a8c640458 --- /dev/null +++ b/mysql-test/t/rpl_deadlock-slave.opt @@ -0,0 +1 @@ +--innodb --loose-innodb_lock_wait_timeout=4 --slave-transaction-retries=2 --max-relay-log-size=4096 diff --git a/mysql-test/t/rpl_deadlock.test b/mysql-test/t/rpl_deadlock.test new file mode 100644 index 00000000000..82470e8ebd0 --- /dev/null +++ b/mysql-test/t/rpl_deadlock.test @@ -0,0 +1,107 @@ +# See if slave restarts the transaction after failing on an InnoDB deadlock error. + +# Note: testing what happens when too many retries is possible, but +# needs large waits when running with --debug, so we don't do it. +# The same way, this test may not test what is expected when run +# under Valgrind, timings are too short then (with --valgrind I +# (Guilhem) have seen the test manage to provoke lock wait timeout +# error but not deadlock error; that is ok as code deals with the two +# errors in exactly the same way. + +source include/have_innodb.inc; +source include/master-slave.inc; + +connection master; +create table t1 (a int not null, key(a)) engine=innodb; +create table t2 (a int not null, key(a)) engine=innodb; +create table t3 (a int) engine=innodb; +create table t4 (a int) engine=innodb; +sync_slave_with_master; + +show create table t1; +show create table t2; +stop slave; + +# 1) Test deadlock + +connection master; +begin; +# Let's keep BEGIN and the locked statement in two different relay logs. +let $1=200; +disable_query_log; +while ($1) +{ + eval insert into t3 values( $1 ); + dec $1; +} +enable_query_log; +insert into t3 select * from t2 for update; +insert into t1 values(1); +commit; +save_master_pos; + +connection slave; +begin; +# Let's make our transaction large so that it's slave who is chosen as +# victim +let $1=1000; +disable_query_log; +while ($1) +{ + eval insert into t4 values( $1 ); + dec $1; +} +enable_query_log; +select * from t1 for update; +start slave; +--sleep 3; # hope that slave is blocked now +insert into t2 values(22); # provoke deadlock, slave should be victim +commit; +sync_with_master; +select * from t1; # check that slave succeeded finally +select * from t2; +# check that no error is reported +--replace_column 1 # 8 # 9 # 23 # 33 # +--replace_result $MASTER_MYPORT MASTER_MYPORT +show slave status; + +# 2) Test lock wait timeout + +stop slave; +change master to master_log_pos=401; # the BEGIN log event +begin; +select * from t2 for update; # hold lock +start slave; +--sleep 10; # slave should have blocked, and be retrying +commit; +sync_with_master; +select * from t1; # check that slave succeeded finally +select * from t2; +# check that no error is reported +--replace_column 1 # 8 # 9 # 23 # 33 # +--replace_result $MASTER_MYPORT MASTER_MYPORT +show slave status; + +# Now we repeat 2), but with BEGIN in the same relay log as +# COMMIT (to see if seeking into hot log is ok). + +set global max_relay_log_size=0; + +# This is really copy-paste of 2) of above +stop slave; +change master to master_log_pos=401; +begin; +select * from t2 for update; +start slave; +--sleep 10; +commit; +sync_with_master; +select * from t1; +select * from t2; +--replace_column 1 # 8 # 9 # 23 # 33 # +--replace_result $MASTER_MYPORT MASTER_MYPORT +show slave status; + +connection master; +drop table t1,t2; +sync_slave_with_master; diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h index 6c77a8934dd..9e395aa37da 100644 --- a/sql/mysql_priv.h +++ b/sql/mysql_priv.h @@ -890,7 +890,7 @@ extern ulong ha_read_first_count, ha_read_last_count; extern ulong ha_read_rnd_count, ha_read_rnd_next_count, ha_discover_count; extern ulong ha_commit_count, ha_rollback_count,table_cache_size; extern ulong max_connections,max_connect_errors, connect_timeout; -extern ulong slave_net_timeout; +extern ulong slave_net_timeout, slave_trans_retries; extern ulong max_user_connections; extern ulong long_query_count, what_to_log,flush_time; extern ulong query_buff_size, thread_stack,thread_stack_min; diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 1591b205d46..dba39299907 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -311,7 +311,7 @@ ulong server_id, thd_startup_options; ulong table_cache_size, thread_stack, thread_stack_min, what_to_log; ulong query_buff_size, slow_launch_time, slave_open_temp_tables; ulong open_files_limit, max_binlog_size, max_relay_log_size; -ulong slave_net_timeout; +ulong slave_net_timeout, slave_trans_retries; ulong thread_cache_size=0, binlog_cache_size=0, max_binlog_cache_size=0; ulong query_cache_size=0; ulong com_stat[(uint) SQLCOM_END], com_other; @@ -4096,7 +4096,7 @@ enum options_mysqld OPT_QUERY_CACHE_TYPE, OPT_QUERY_CACHE_WLOCK_INVALIDATE, OPT_RECORD_BUFFER, OPT_RECORD_RND_BUFFER, OPT_RELAY_LOG_SPACE_LIMIT, OPT_RELAY_LOG_PURGE, OPT_SLAVE_NET_TIMEOUT, OPT_SLAVE_COMPRESSED_PROTOCOL, OPT_SLOW_LAUNCH_TIME, - OPT_READONLY, OPT_DEBUGGING, + OPT_SLAVE_TRANS_RETRIES, OPT_READONLY, OPT_DEBUGGING, OPT_SORT_BUFFER, OPT_TABLE_CACHE, OPT_THREAD_CONCURRENCY, OPT_THREAD_CACHE_SIZE, OPT_TMP_TABLE_SIZE, OPT_THREAD_STACK, @@ -5049,7 +5049,7 @@ The minimum value for this variable is 4096.", (gptr*) &max_system_variables.max_length_for_sort_data, 0, GET_ULONG, REQUIRED_ARG, 1024, 4, 8192*1024L, 0, 1, 0}, {"max_relay_log_size", OPT_MAX_RELAY_LOG_SIZE, - "If non-zero: relay log will be rotated automatically when the size exceeds this value; if zero (the default): when the size exceeds max_binlog_size. 0 expected, the minimum value for this variable is 4096.", + "If non-zero: relay log will be rotated automatically when the size exceeds this value; if zero (the default): when the size exceeds max_binlog_size. 0 excepted, the minimum value for this variable is 4096.", (gptr*) &max_relay_log_size, (gptr*) &max_relay_log_size, 0, GET_ULONG, REQUIRED_ARG, 0L, 0L, 1024*1024L*1024L, 0, IO_SIZE, 0}, { "max_seeks_for_key", OPT_MAX_SEEKS_FOR_KEY, @@ -5220,6 +5220,12 @@ The minimum value for this variable is 4096.", "Number of seconds to wait for more data from a master/slave connection before aborting the read.", (gptr*) &slave_net_timeout, (gptr*) &slave_net_timeout, 0, GET_ULONG, REQUIRED_ARG, SLAVE_NET_TIMEOUT, 1, LONG_TIMEOUT, 0, 1, 0}, + {"slave_transaction_retries", OPT_SLAVE_TRANS_RETRIES, + "Number of times the slave SQL thread will retry a transaction in case " + "it failed with a deadlock or elapsed lock wait timeout, " + "before giving up and stopping.", + (gptr*) &slave_trans_retries, (gptr*) &slave_trans_retries, 0, + GET_ULONG, REQUIRED_ARG, 0L, 0L, (longlong) ULONG_MAX, 0, 1, 0}, #endif /* HAVE_REPLICATION */ {"slow_launch_time", OPT_SLOW_LAUNCH_TIME, "If creating the thread takes longer than this value (in seconds), the Slow_launch_threads counter will be incremented.", diff --git a/sql/set_var.cc b/sql/set_var.cc index 29ebb2c8817..02b53f701cc 100644 --- a/sql/set_var.cc +++ b/sql/set_var.cc @@ -320,6 +320,8 @@ sys_var_bool_ptr sys_slave_compressed_protocol("slave_compressed_protocol", #ifdef HAVE_REPLICATION sys_var_long_ptr sys_slave_net_timeout("slave_net_timeout", &slave_net_timeout); +sys_var_long_ptr sys_slave_trans_retries("slave_transaction_retries", + &slave_trans_retries); #endif sys_var_long_ptr sys_slow_launch_time("slow_launch_time", &slow_launch_time); @@ -604,6 +606,7 @@ sys_var *sys_variables[]= #ifdef HAVE_REPLICATION &sys_slave_compressed_protocol, &sys_slave_net_timeout, + &sys_slave_trans_retries, &sys_slave_skip_counter, #endif &sys_slow_launch_time, @@ -855,6 +858,7 @@ struct show_var_st init_vars[]= { {"skip_show_database", (char*) &opt_skip_show_db, SHOW_BOOL}, #ifdef HAVE_REPLICATION {sys_slave_net_timeout.name,(char*) &sys_slave_net_timeout, SHOW_SYS}, + {sys_slave_trans_retries.name,(char*) &sys_slave_trans_retries, SHOW_SYS}, #endif {sys_slow_launch_time.name, (char*) &sys_slow_launch_time, SHOW_SYS}, #ifdef HAVE_SYS_UN_H diff --git a/sql/slave.cc b/sql/slave.cc index a39cbdbe14b..145f4295075 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -2959,6 +2959,62 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli) exec_res = ev->exec_event(rli); DBUG_ASSERT(rli->sql_thd==thd); delete ev; + if (slave_trans_retries) + { + if (exec_res && + (thd->net.last_errno == ER_LOCK_DEADLOCK || + thd->net.last_errno == ER_LOCK_WAIT_TIMEOUT) && + !thd->is_fatal_error) + { + const char *errmsg; + /* + We were in a transaction which has been rolled back because of a + deadlock (currently, InnoDB deadlock detected by InnoDB) or lock + wait timeout (innodb_lock_wait_timeout exceeded); let's seek back to + BEGIN log event and retry it all again. + We have to not only seek but also + a) init_master_info(), to seek back to hot relay log's start for later + (for when we will come back to this hot log after re-processing the + possibly existing old logs where BEGIN is: check_binlog_magic() will + then need the cache to be at position 0 (see comments at beginning of + init_master_info()). + b) init_relay_log_pos(), because the BEGIN may be an older relay log. + */ + if (rli->trans_retries--) + { + sql_print_information("Slave SQL thread retries transaction"); + if (init_master_info(rli->mi, 0, 0, 0, SLAVE_SQL)) + sql_print_error("Failed to initialize the master info structure"); + else if (init_relay_log_pos(rli, + rli->group_relay_log_name, + rli->group_relay_log_pos, + 1, &errmsg)) + sql_print_error("Error initializing relay log position: %s", + errmsg); + else + { + exec_res= 0; + sleep(2); // chance for concurrent connection to get more locks + } + } + else + sql_print_error("Slave SQL thread retried transaction %lu time(s) " + "in vain, giving up. Consider raising the value of " + "the slave_transaction_retries variable.", + slave_trans_retries); + } + if (!((thd->options & OPTION_BEGIN) && opt_using_transactions)) + { + rli->trans_retries= slave_trans_retries; // restart from fresh + /* + TODO: when merged into 5.0, when slave does auto-rollback if + corrupted binlog, this should reset the retry counter too + (any rollback should). In fact it will work, as here we are just out + of a Format_description_log_event::exec_event() which rolled back. + But check repl code in 5.0 for new ha_rollback calls, just in case. + */ + } + } return exec_res; } else @@ -3370,6 +3426,7 @@ slave_begin: pthread_mutex_lock(&rli->log_space_lock); rli->ignore_log_space_limit= 0; pthread_mutex_unlock(&rli->log_space_lock); + rli->trans_retries= slave_trans_retries; // start from "no error" if (init_relay_log_pos(rli, rli->group_relay_log_name, diff --git a/sql/slave.h b/sql/slave.h index bcd79dd4a39..fb7560ec738 100644 --- a/sql/slave.h +++ b/sql/slave.h @@ -294,7 +294,9 @@ typedef struct st_relay_log_info UNTIL_LOG_NAMES_CMP_UNKNOWN= -2, UNTIL_LOG_NAMES_CMP_LESS= -1, UNTIL_LOG_NAMES_CMP_EQUAL= 0, UNTIL_LOG_NAMES_CMP_GREATER= 1 } until_log_names_cmp_result; - + + ulong trans_retries; + st_relay_log_info(); ~st_relay_log_info(); From c4ac232d1759633bd7022c70dfd65ce8d1e7a661 Mon Sep 17 00:00:00 2001 From: "guilhem@mysql.com" <> Date: Wed, 2 Mar 2005 16:37:54 +0100 Subject: [PATCH 2/2] First part of fix for BUG#7998 "Replication should be more clever about when to replicate RELEASE_LOCK()" (next part is updating test results) --- sql/item_func.cc | 22 ++++++++++------------ sql/log_event.cc | 10 +++++----- sql/mysqld.cc | 2 +- sql/slave.cc | 13 ++----------- 4 files changed, 18 insertions(+), 29 deletions(-) diff --git a/sql/item_func.cc b/sql/item_func.cc index 19228108bfc..5bd606ab4a9 100644 --- a/sql/item_func.cc +++ b/sql/item_func.cc @@ -2857,18 +2857,6 @@ void item_user_lock_free(void) void item_user_lock_release(User_level_lock *ull) { ull->locked=0; - if (mysql_bin_log.is_open()) - { - char buf[256]; - const char *command="DO RELEASE_LOCK(\""; - String tmp(buf,sizeof(buf), system_charset_info); - tmp.copy(command, strlen(command), tmp.charset()); - tmp.append(ull->key,ull->key_length); - tmp.append("\")", 2); - Query_log_event qev(current_thd, tmp.ptr(), tmp.length(), 0, FALSE); - qev.error_code=0; // this query is always safe to run on slave - mysql_bin_log.write(&qev); - } if (--ull->count) pthread_cond_signal(&ull->cond); else @@ -2992,6 +2980,16 @@ longlong Item_func_get_lock::val_int() User_level_lock *ull; int error=0; + /* + In slave thread no need to get locks, everything is serialized. Anyway + there is no way to make GET_LOCK() work on slave like it did on master + (i.e. make it return exactly the same value) because we don't have the + same other concurrent threads environment. No matter what we return here, + it's not guaranteed to be same as on master. + */ + if (thd->slave_thread) + return 1; + pthread_mutex_lock(&LOCK_user_locks); if (!res || !res->length()) diff --git a/sql/log_event.cc b/sql/log_event.cc index 7f92159208b..889550465bc 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -1616,9 +1616,9 @@ end: probably, so data_buf will be freed, so the thd->... listed above will be pointers to freed memory. So we must set them to 0, so that those bad pointers values are not later - used. Note that "cleanup" queries (automatic DO RELEASE_LOCK() and DROP - TEMPORARY TABLE don't suffer from these assignments to 0 as DROP TEMPORARY - TABLE uses the db.table syntax). + used. Note that "cleanup" queries like automatic DROP TEMPORARY TABLE + don't suffer from these assignments to 0 as DROP TEMPORARY + TABLE uses the db.table syntax. */ thd->db= thd->catalog= 0; // prevent db from being freed thd->query= 0; // just to be sure @@ -3663,8 +3663,8 @@ void Stop_log_event::print(FILE* file, bool short_form, LAST_EVENT_INFO* last_ev The master stopped. We used to clean up all temporary tables but this is useless as, as the - master has shut down properly, it has written all DROP TEMPORARY TABLE and DO - RELEASE_LOCK (prepared statements' deletion is TODO). + master has shut down properly, it has written all DROP TEMPORARY TABLE + (prepared statements' deletion is TODO only when we binlog prep stmts). We used to clean up slave_load_tmpdir, but this is useless as it has been cleared at the end of LOAD DATA INFILE. So we have nothing to do here. diff --git a/sql/mysqld.cc b/sql/mysqld.cc index d72d22f6da2..ce70b05761a 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -5799,7 +5799,7 @@ static void mysql_init_variables(void) opt_log= opt_update_log= opt_bin_log= opt_slow_log= 0; opt_disable_networking= opt_skip_show_db=0; opt_logname= opt_update_logname= opt_binlog_index_name= opt_slow_logname= 0; - opt_tc_log_file= "tc.log"; // no hostname in tc_log file name ! + opt_tc_log_file= (char *)"tc.log"; // no hostname in tc_log file name ! opt_secure_auth= 0; opt_bootstrap= opt_myisam_log= 0; mqh_used= 0; diff --git a/sql/slave.cc b/sql/slave.cc index 8865bf25d52..ef88f103b63 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -3257,7 +3257,7 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli) else if (init_relay_log_pos(rli, rli->group_relay_log_name, rli->group_relay_log_pos, - 1, &errmsg)) + 1, &errmsg, 1)) sql_print_error("Error initializing relay log position: %s", errmsg); else @@ -3273,16 +3273,7 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli) slave_trans_retries); } if (!((thd->options & OPTION_BEGIN) && opt_using_transactions)) - { rli->trans_retries= slave_trans_retries; // restart from fresh - /* - TODO: when merged into 5.0, when slave does auto-rollback if - corrupted binlog, this should reset the retry counter too - (any rollback should). In fact it will work, as here we are just out - of a Format_description_log_event::exec_event() which rolled back. - But check repl code in 5.0 for new ha_rollback calls, just in case. - */ - } } return exec_res; } @@ -4245,7 +4236,7 @@ int queue_event(MASTER_INFO* mi,const char* buf, ulong event_len) master server shutdown. The only thing this does is cleaning. But cleaning is already done on a per-master-thread basis (as the master server is shutting down cleanly, it has written all DROP TEMPORARY TABLE - and DO RELEASE_LOCK; prepared statements' deletion are TODO). + prepared statements' deletion are TODO only when we binlog prep stmts). We don't even increment mi->master_log_pos, because we may be just after a Rotate event. Btw, in a few milliseconds we are going to have a Start