diff --git a/mysql-test/suite/binlog/r/binlog_commit_wait.result b/mysql-test/suite/binlog/r/binlog_commit_wait.result new file mode 100644 index 00000000000..e3d4b9e5d97 --- /dev/null +++ b/mysql-test/suite/binlog/r/binlog_commit_wait.result @@ -0,0 +1,57 @@ +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB; +SET @old_count= @@GLOBAL.binlog_commit_wait_count; +SET GLOBAL binlog_commit_wait_count= 3; +SET @old_usec= @@GLOBAL.binlog_commit_wait_usec; +SET GLOBAL binlog_commit_wait_usec= 20000000; +SET @a= current_timestamp(); +BEGIN; +INSERT INTO t1 VALUES (1,0); +COMMIT; +INSERT INTO t1 VALUES (1,1); +SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a); +SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20")); +IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20")) +Ok +ERROR 23000: Duplicate entry '1' for key 'PRIMARY' +SET @a= current_timestamp(); +INSERT INTO t1 VALUES (2,0); +INSERT INTO t1 VALUES (3,0); +INSERT INTO t1 VALUES (4,0); +SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a); +SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20")); +IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20")) +Ok +SET @a= current_timestamp(); +INSERT INTO t1 VALUES (6,0); +BEGIN; +UPDATE t1 SET b=b+1 WHERE a=1; +UPDATE t1 SET b=b+10 WHERE a=1; +SELECT SLEEP(0.25); +SLEEP(0.25) +0 +UPDATE t1 SET b=b+1 WHERE a=3; +COMMIT; +SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a); +SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20")); +IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20")) +Ok +SET @a= current_timestamp(); +INSERT INTO t1 VALUES (7,0); +INSERT INTO t1 VALUES (8,0); +SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a); +SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20")); +IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20")) +Ok +SELECT * FROM t1 ORDER BY a; +a b +1 11 +2 0 +3 1 +4 0 +6 0 +7 0 +8 0 +DROP TABLE t1; +SET GLOBAL binlog_commit_wait_count= @old_count; +SET GLOBAL binlog_commit_wait_usec= @old_usec; diff --git a/mysql-test/suite/binlog/t/binlog_commit_wait.test b/mysql-test/suite/binlog/t/binlog_commit_wait.test new file mode 100644 index 00000000000..5b3fb0a5e25 --- /dev/null +++ b/mysql-test/suite/binlog/t/binlog_commit_wait.test @@ -0,0 +1,123 @@ +--source include/have_innodb.inc +--source include/have_log_bin.inc + +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB; + +SET @old_count= @@GLOBAL.binlog_commit_wait_count; +SET GLOBAL binlog_commit_wait_count= 3; +SET @old_usec= @@GLOBAL.binlog_commit_wait_usec; +SET GLOBAL binlog_commit_wait_usec= 20000000; + +connect(con1,localhost,root,,test); +connect(con2,localhost,root,,test); +connect(con3,localhost,root,,test); + +# Check that if T2 goes to wait for a row lock of T1 while T1 is waiting for +# more transactions to arrive for group commit, the commit of T1 will complete +# immediately. +# We test this by setting a very high timeout (20 seconds), and testing that +# that much time does not elapse. + +--connection default +SET @a= current_timestamp(); + +--connection con1 +BEGIN; +INSERT INTO t1 VALUES (1,0); +send COMMIT; + +--connection con2 +send INSERT INTO t1 VALUES (1,1); + +--connection con1 +reap; + +--connection default +SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a); +SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20")); + +--connection con2 +--error ER_DUP_ENTRY +reap; + + +# Test that the commit triggers when sufficient commits have queued up. +--connection default +SET @a= current_timestamp(); + +--connection con1 +send INSERT INTO t1 VALUES (2,0); + +--connection con2 +send INSERT INTO t1 VALUES (3,0); + +--connection con3 +INSERT INTO t1 VALUES (4,0); + +--connection con1 +reap; +--connection con2 +reap; + +--connection default +SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a); +SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20")); + + +# Test that commit triggers immediately if there is already a transaction +# waiting on another transaction that reaches its commit. + +--connection default +SET @a= current_timestamp(); + +--connection con1 +send INSERT INTO t1 VALUES (6,0); + +--connection con2 +BEGIN; +UPDATE t1 SET b=b+1 WHERE a=1; + +--connection con3 +send UPDATE t1 SET b=b+10 WHERE a=1; + +--connection con2 +# A small sleep to let con3 have time to wait on con2. +# The sleep might be too small on loaded host, but that is not a big problem; +# it only means we will trigger a different code path (con3 waits after con2 +# is ready to commit rather than before); and either path should work the same. +# So we will not get false positive in case of different timing; at worst false +# negative. +SELECT SLEEP(0.25); +UPDATE t1 SET b=b+1 WHERE a=3; +COMMIT; + +--connection con1 +reap; + +--connection default +SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a); +SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20")); + +--connection default +SET @a= current_timestamp(); + +# Now con3 will be waiting for a following group commit to trigger. +--connection con1 +send INSERT INTO t1 VALUES (7,0); +--connection con2 +INSERT INTO t1 VALUES (8,0); +--connection con3 +reap; + +--connection default +SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a); +SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20")); + +--connection default +SELECT * FROM t1 ORDER BY a; + +--connection default +DROP TABLE t1; +SET GLOBAL binlog_commit_wait_count= @old_count; +SET GLOBAL binlog_commit_wait_usec= @old_usec; diff --git a/sql/handler.cc b/sql/handler.cc index 9a2ba6bb732..70bce6f3963 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -1545,7 +1545,10 @@ commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans) } /* Free resources and perform other cleanup even for 'empty' transactions. */ if (is_real_trans) + { + thd->has_waiter= false; thd->transaction.cleanup(); + } DBUG_RETURN(error); } @@ -1626,7 +1629,10 @@ int ha_rollback_trans(THD *thd, bool all) /* Always cleanup. Even if nht==0. There may be savepoints. */ if (is_real_trans) + { + thd->has_waiter= false; thd->transaction.cleanup(); + } if (all) thd->transaction_rollback_request= FALSE; diff --git a/sql/log.cc b/sql/log.cc index 6fa6156c489..30ecd51ab77 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -7029,6 +7029,14 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry) } } + /* + Handle the heuristics that if another transaction is waiting for this + transaction (or if it does so later), then we want to trigger group + commit immediately, without waiting for the binlog_commit_wait_usec + timeout to expire. + */ + entry->thd->waiting_on_group_commit= true; + /* Add the entry to the group commit queue. */ next_entry= entry->next; entry->next= group_commit_queue; @@ -7044,7 +7052,7 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry) cur= entry->thd->wait_for_commit_ptr; } - if (opt_binlog_commit_wait_count > 0) + if (opt_binlog_commit_wait_count > 0 && orig_queue != NULL) mysql_cond_signal(&COND_prepare_ordered); mysql_mutex_unlock(&LOCK_prepare_ordered); DEBUG_SYNC(orig_entry->thd, "commit_after_release_LOCK_prepare_ordered"); @@ -7218,6 +7226,11 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) while (current) { group_commit_entry *next= current->next; + /* + Now that group commit is started, we can clear the flag; there is no + longer any use in waiters on this commit trying to trigger it early. + */ + current->thd->waiting_on_group_commit= false; current->next= queue; queue= current; current= next; @@ -7530,7 +7543,7 @@ MYSQL_BIN_LOG::wait_for_sufficient_commits() mysql_mutex_assert_owner(&LOCK_prepare_ordered); for (e= last_head= group_commit_queue, count= 0; e; e= e->next) - if (++count >= opt_binlog_commit_wait_count) + if (++count >= opt_binlog_commit_wait_count || unlikely(e->thd->has_waiter)) return; mysql_mutex_unlock(&LOCK_log); @@ -7545,13 +7558,20 @@ MYSQL_BIN_LOG::wait_for_sufficient_commits() &wait_until); if (err == ETIMEDOUT) break; + if (unlikely(last_head->thd->has_waiter)) + break; head= group_commit_queue; for (e= head; e && e != last_head; e= e->next) + { ++count; + if (unlikely(e->thd->has_waiter)) + goto after_loop; + } if (count >= opt_binlog_commit_wait_count) break; last_head= head; } +after_loop: /* We must not wait for LOCK_log while holding LOCK_prepare_ordered. @@ -7575,6 +7595,42 @@ MYSQL_BIN_LOG::wait_for_sufficient_commits() } +void +MYSQL_BIN_LOG::binlog_trigger_immediate_group_commit() +{ + group_commit_entry *head; + mysql_mutex_lock(&LOCK_prepare_ordered); + head= group_commit_queue; + if (head) + { + head->thd->has_waiter= true; + mysql_cond_signal(&COND_prepare_ordered); + } + mysql_mutex_unlock(&LOCK_prepare_ordered); +} + + +/* + This function is called when a transaction T1 goes to wait for another + transaction T2. It is used to cut short any binlog group commit delay from + --binlog-commit-wait-count in the case where another transaction is stalled + on the wait due to conflicting row locks. + + If T2 is already ready to group commit, any waiting group commit will be + signalled to proceed immediately. Otherwise, a flag will be set in T2, and + when T2 later becomes ready, immediate group commit will be triggered. +*/ +void +binlog_report_wait_for(THD *thd1, THD *thd2) +{ + if (opt_binlog_commit_wait_count == 0) + return; + thd2->has_waiter= true; + if (thd2->waiting_on_group_commit) + mysql_bin_log.binlog_trigger_immediate_group_commit(); +} + + /** Wait until we get a signal that the relay log has been updated. diff --git a/sql/log.h b/sql/log.h index a93e2e1c0b9..78a55f90292 100644 --- a/sql/log.h +++ b/sql/log.h @@ -690,6 +690,7 @@ public: void set_max_size(ulong max_size_arg); void signal_update(); void wait_for_sufficient_commits(); + void binlog_trigger_immediate_group_commit(); void wait_for_update_relay_log(THD* thd); int wait_for_update_bin_log(THD* thd, const struct timespec * timeout); void init(ulong max_size); @@ -1005,6 +1006,7 @@ bool general_log_print(THD *thd, enum enum_server_command command, bool general_log_write(THD *thd, enum enum_server_command command, const char *query, uint query_length); +void binlog_report_wait_for(THD *thd, THD *other_thd); void sql_perror(const char *message); bool flush_error_log(); diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 9cdf5cec54d..8c52d5dd92d 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -887,6 +887,7 @@ THD::THD() in_lock_tables(0), bootstrap(0), derived_tables_processing(FALSE), + waiting_on_group_commit(FALSE), has_waiter(FALSE), spcont(NULL), m_parser_state(NULL), #if defined(ENABLED_DEBUG_SYNC) @@ -4233,6 +4234,8 @@ thd_need_wait_for(const MYSQL_THD thd) { rpl_group_info *rgi; + if (mysql_bin_log.is_open() && opt_binlog_commit_wait_count > 0) + return true; if (!thd) return false; rgi= thd->rgi_slave; @@ -4267,13 +4270,14 @@ thd_need_wait_for(const MYSQL_THD thd) not harmful, but could lead to unnecessary kill and retry, so best avoided). */ extern "C" void -thd_report_wait_for(const MYSQL_THD thd, MYSQL_THD other_thd) +thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd) { rpl_group_info *rgi; rpl_group_info *other_rgi; if (!thd || !other_thd) return; + binlog_report_wait_for(thd, other_thd); rgi= thd->rgi_slave; other_rgi= other_thd->rgi_slave; if (!rgi || !other_rgi) diff --git a/sql/sql_class.h b/sql/sql_class.h index 5d64837a2c1..3252bc1bbe2 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -2674,6 +2674,18 @@ public: it returned an error on master, and this is OK on the slave. */ bool is_slave_error; + /* + True when a transaction is queued up for binlog group commit. + Used so that if another transaction needs to wait for a row lock held by + this transaction, it can signal to trigger the group commit immediately, + skipping the normal --binlog-commit-wait-count wait. + */ + bool waiting_on_group_commit; + /* + Set true when another transaction goes to wait on a row lock held by this + transaction. Used together with waiting_on_group_commit. + */ + bool has_waiter; /* In case of a slave, set to the error code the master got when executing the query. 0 if no error on the master. diff --git a/sql/transaction.cc b/sql/transaction.cc index 51d8a08e981..a70c075e142 100644 --- a/sql/transaction.cc +++ b/sql/transaction.cc @@ -149,6 +149,8 @@ bool trans_begin(THD *thd, uint flags) when we come here. We should at some point change this to an assert. */ thd->transaction.all.modified_non_trans_table= FALSE; + thd->has_waiter= false; + thd->waiting_on_group_commit= false; if (res) DBUG_RETURN(TRUE); diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index b822b400153..850d81cad02 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -375,7 +375,7 @@ struct lock_stack_t { ulint heap_no; /*!< heap number if rec lock */ }; -extern "C" void thd_report_wait_for(const MYSQL_THD thd, MYSQL_THD other_thd); +extern "C" void thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd); extern "C" int thd_need_wait_for(const MYSQL_THD thd); extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd); diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc index aa36202f76f..f9ea52470d5 100644 --- a/storage/xtradb/lock/lock0lock.cc +++ b/storage/xtradb/lock/lock0lock.cc @@ -375,7 +375,7 @@ struct lock_stack_t { ulint heap_no; /*!< heap number if rec lock */ }; -extern "C" void thd_report_wait_for(const MYSQL_THD thd, MYSQL_THD other_thd); +extern "C" void thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd); extern "C" int thd_need_wait_for(const MYSQL_THD thd); extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);