New variable rli->ignore_log_space_limit to resolve
a deadlock between I/O and SQL threads in replication when relay_log_space is too small. This fixes bug #79. sql/log.cc: New variable rli->ignore_log_space_limit to resolve a deadlock between I/O and SQL threads in replication when relay_log_space is too small. sql/slave.cc: New variable rli->ignore_log_space_limit to resolve a deadlock between I/O and SQL threads in replication when relay_log_space is too small. sql/slave.h: New variable rli->ignore_log_space_limit to resolve a deadlock between I/O and SQL threads in replication when relay_log_space is too small. sql/sql_repl.cc: New variable rli->ignore_log_space_limit to resolve a deadlock between I/O and SQL threads in replication when relay_log_space is too small.
This commit is contained in:
parent
2103479670
commit
40c0b2c6c8
13
mysql-test/r/rpl_relayspace.result
Normal file
13
mysql-test/r/rpl_relayspace.result
Normal file
@ -0,0 +1,13 @@
|
||||
slave stop;
|
||||
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
|
||||
reset master;
|
||||
reset slave;
|
||||
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
|
||||
slave start;
|
||||
stop slave;
|
||||
create table t1 (a int);
|
||||
reset slave;
|
||||
start slave;
|
||||
select master_pos_wait('master-bin.001',5000,45)=-1;
|
||||
master_pos_wait('master-bin.001',5000,45)=-1
|
||||
0
|
1
mysql-test/t/rpl_relayspace-slave.opt
Normal file
1
mysql-test/t/rpl_relayspace-slave.opt
Normal file
@ -0,0 +1 @@
|
||||
-O relay_log_space_limit=1024
|
33
mysql-test/t/rpl_relayspace.test
Normal file
33
mysql-test/t/rpl_relayspace.test
Normal file
@ -0,0 +1,33 @@
|
||||
# The slave is started with relay_log_space_limit=1024 bytes,
|
||||
# to force the deadlock
|
||||
|
||||
source include/master-slave.inc;
|
||||
connection slave;
|
||||
stop slave;
|
||||
connection master;
|
||||
create table t1 (a int);
|
||||
let $1=200;
|
||||
disable_query_log;
|
||||
while ($1)
|
||||
{
|
||||
# eval means expand $ expressions
|
||||
eval insert into t1 values( $1 );
|
||||
dec $1;
|
||||
}
|
||||
# This will generate a master binlog of about 10kB
|
||||
enable_query_log;
|
||||
save_master_pos;
|
||||
connection slave;
|
||||
reset slave;
|
||||
start slave;
|
||||
# The I/O thread stops filling the relay log when
|
||||
# it's 1kB. And the SQL thread cannot purge this relay log
|
||||
# as purge is done only when the SQL thread switches to another
|
||||
# relay log, which does not exist here.
|
||||
# So we should have a deadlock.
|
||||
# if it is not resolved automatically we'll detect
|
||||
# it with master_pos_wait, which waits for a position beyond 1kB;
|
||||
# it will time out after 45 seconds;
|
||||
# also the slave will probably not cooperate to shutdown
|
||||
# (as 2 threads are locked)
|
||||
select master_pos_wait('master-bin.001',5000,45)=-1;
|
@ -645,6 +645,8 @@ int MYSQL_LOG::purge_first_log(struct st_relay_log_info* rli)
|
||||
*/
|
||||
pthread_mutex_lock(&rli->log_space_lock);
|
||||
rli->log_space_total -= rli->relay_log_pos;
|
||||
//tell the I/O thread to take the relay_log_space_limit into account
|
||||
rli->ignore_log_space_limit= 0;
|
||||
pthread_mutex_unlock(&rli->log_space_lock);
|
||||
pthread_cond_broadcast(&rli->log_space_cond);
|
||||
|
||||
|
60
sql/slave.cc
60
sql/slave.cc
@ -238,7 +238,7 @@ int init_relay_log_pos(RELAY_LOG_INFO* rli,const char* log,
|
||||
if (log) // If not first log
|
||||
{
|
||||
if (strcmp(log, rli->linfo.log_file_name))
|
||||
rli->skip_log_purge=1; // Different name; Don't purge
|
||||
rli->skip_log_purge= 1; // Different name; Don't purge
|
||||
if (rli->relay_log.find_log_pos(&rli->linfo, log, 1))
|
||||
{
|
||||
*errmsg="Could not find target log during relay log initialization";
|
||||
@ -273,6 +273,12 @@ int init_relay_log_pos(RELAY_LOG_INFO* rli,const char* log,
|
||||
my_b_seek(rli->cur_log,(off_t)pos);
|
||||
|
||||
err:
|
||||
/*
|
||||
If we don't purge, we can't honour relay_log_space_limit;
|
||||
silently discard it
|
||||
*/
|
||||
if (rli->skip_log_purge)
|
||||
rli->log_space_limit= 0;
|
||||
pthread_cond_broadcast(&rli->data_cond);
|
||||
if (need_data_lock)
|
||||
pthread_mutex_unlock(&rli->data_lock);
|
||||
@ -1312,7 +1318,8 @@ static bool wait_for_relay_log_space(RELAY_LOG_INFO* rli)
|
||||
save_proc_info = thd->proc_info;
|
||||
thd->proc_info = "Waiting for relay log space to free";
|
||||
while (rli->log_space_limit < rli->log_space_total &&
|
||||
!(slave_killed=io_slave_killed(thd,mi)))
|
||||
!(slave_killed=io_slave_killed(thd,mi)) &&
|
||||
!rli->ignore_log_space_limit)
|
||||
{
|
||||
pthread_cond_wait(&rli->log_space_cond, &rli->log_space_lock);
|
||||
}
|
||||
@ -1588,7 +1595,7 @@ bool flush_master_info(MASTER_INFO* mi)
|
||||
|
||||
st_relay_log_info::st_relay_log_info()
|
||||
:info_fd(-1), cur_log_fd(-1), master_log_pos(0), save_temporary_tables(0),
|
||||
cur_log_old_open_count(0), log_space_total(0),
|
||||
cur_log_old_open_count(0), log_space_total(0), ignore_log_space_limit(0),
|
||||
slave_skip_counter(0), abort_pos_wait(0), slave_run_id(0),
|
||||
sql_thd(0), last_slave_errno(0), inited(0), abort_slave(0),
|
||||
slave_running(0), skip_log_purge(0),
|
||||
@ -2296,7 +2303,8 @@ reconnect done to recover from failed read");
|
||||
}
|
||||
flush_master_info(mi);
|
||||
if (mi->rli.log_space_limit && mi->rli.log_space_limit <
|
||||
mi->rli.log_space_total)
|
||||
mi->rli.log_space_total &&
|
||||
!mi->rli.ignore_log_space_limit)
|
||||
if (wait_for_relay_log_space(&mi->rli))
|
||||
{
|
||||
sql_print_error("Slave I/O thread aborted while waiting for relay \
|
||||
@ -2408,6 +2416,10 @@ slave_begin:
|
||||
pthread_cond_broadcast(&rli->start_cond);
|
||||
// This should always be set to 0 when the slave thread is started
|
||||
rli->pending = 0;
|
||||
|
||||
//tell the I/O thread to take relay_log_space_limit into account from now on
|
||||
rli->ignore_log_space_limit= 0;
|
||||
|
||||
if (init_relay_log_pos(rli,
|
||||
rli->relay_log_name,
|
||||
rli->relay_log_pos,
|
||||
@ -3086,11 +3098,41 @@ Log_event* next_event(RELAY_LOG_INFO* rli)
|
||||
update. If we do not, show slave status will block
|
||||
*/
|
||||
pthread_mutex_unlock(&rli->data_lock);
|
||||
/* Note that wait_for_update unlocks lock_log ! */
|
||||
rli->relay_log.wait_for_update(rli->sql_thd);
|
||||
|
||||
// re-acquire data lock since we released it earlier
|
||||
pthread_mutex_lock(&rli->data_lock);
|
||||
|
||||
/*
|
||||
Possible deadlock :
|
||||
- the I/O thread has reached log_space_limit
|
||||
- the SQL thread has read all relay logs, but cannot purge for some
|
||||
reason:
|
||||
* it has already purged all logs except the current one
|
||||
* there are other logs than the current one but they're involved in
|
||||
a transaction that finishes in the current one (or is not finished)
|
||||
Solution :
|
||||
Wake up the possibly waiting I/O thread, and set a boolean asking
|
||||
the I/O thread to temporarily ignore the log_space_limit
|
||||
constraint, because we do not want the I/O thread to block because of
|
||||
space (it's ok if it blocks for any other reason (e.g. because the
|
||||
master does not send anything)). Then the I/O thread stops waiting
|
||||
and reads more events.
|
||||
The SQL thread decides when the I/O thread should take log_space_limit
|
||||
into account again : ignore_log_space_limit is reset to 0
|
||||
in purge_first_log (when the SQL thread purges the just-read relay
|
||||
log), and also when the SQL thread starts. We should also reset
|
||||
ignore_log_space_limit to 0 when the user does RESET SLAVE, but in
|
||||
fact, no need as RESET SLAVE requires that the slave
|
||||
be stopped, and when the SQL thread is later restarted
|
||||
ignore_log_space_limit will be reset to 0.
|
||||
*/
|
||||
pthread_mutex_lock(&rli->log_space_lock);
|
||||
// prevent the I/O thread from blocking on the space limit from now on
|
||||
rli->ignore_log_space_limit= 1;
|
||||
// If the I/O thread is blocked, unblock it
|
||||
pthread_cond_broadcast(&rli->log_space_cond);
|
||||
pthread_mutex_unlock(&rli->log_space_lock);
|
||||
// Note that wait_for_update unlocks lock_log !
|
||||
rli->relay_log.wait_for_update(rli->sql_thd);
|
||||
// re-acquire data lock since we released it earlier
|
||||
pthread_mutex_lock(&rli->data_lock);
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
|
@ -137,7 +137,14 @@ typedef struct st_relay_log_info
|
||||
offset. pending stored the extra offset to be added to the position.
|
||||
*/
|
||||
ulonglong relay_log_pos, pending;
|
||||
|
||||
/*
|
||||
Handling of the relay_log_space_limit optional constraint.
|
||||
ignore_log_space_limit is used to resolve a deadlock between I/O and SQL
|
||||
threads; it makes the I/O thread temporarily forget about the constraint.
|
||||
*/
|
||||
ulonglong log_space_limit,log_space_total;
|
||||
bool ignore_log_space_limit;
|
||||
|
||||
/*
|
||||
InnoDB internally stores the master log position it has processed
|
||||
|
@ -858,22 +858,21 @@ int change_master(THD* thd, MASTER_INFO* mi)
|
||||
|
||||
if (lex_mi->relay_log_name)
|
||||
{
|
||||
need_relay_log_purge = 0;
|
||||
mi->rli.skip_log_purge=1;
|
||||
need_relay_log_purge= 0;
|
||||
strmake(mi->rli.relay_log_name,lex_mi->relay_log_name,
|
||||
sizeof(mi->rli.relay_log_name)-1);
|
||||
}
|
||||
|
||||
if (lex_mi->relay_log_pos)
|
||||
{
|
||||
need_relay_log_purge=0;
|
||||
need_relay_log_purge= 0;
|
||||
mi->rli.relay_log_pos=lex_mi->relay_log_pos;
|
||||
}
|
||||
|
||||
flush_master_info(mi);
|
||||
if (need_relay_log_purge)
|
||||
{
|
||||
mi->rli.skip_log_purge=0;
|
||||
mi->rli.skip_log_purge= 0;
|
||||
thd->proc_info="purging old relay logs";
|
||||
if (purge_relay_logs(&mi->rli, thd,
|
||||
0 /* not only reset, but also reinit */,
|
||||
@ -887,6 +886,7 @@ int change_master(THD* thd, MASTER_INFO* mi)
|
||||
else
|
||||
{
|
||||
const char* msg;
|
||||
mi->rli.skip_log_purge= 1;
|
||||
/* Relay log is already initialized */
|
||||
if (init_relay_log_pos(&mi->rli,
|
||||
mi->rli.relay_log_name,
|
||||
|
Loading…
x
Reference in New Issue
Block a user