From 595a007d4e570f935ffc396aeb62480c2f414741 Mon Sep 17 00:00:00 2001 From: Luis Soares Date: Fri, 11 Nov 2011 17:26:56 +0000 Subject: [PATCH] BUG#11760927: 53375: RBR + NO PK => HIGH LOAD ON SLAVE (TABLE SCAN/CPU) => SLAVE FAILURE When a statement containing a large amount of ROWs to be applied on the slave, and the slave's table does not contain a PK, it can take a considerable amount of time to find and change all the rows that are to be changed. The proper slave enhancement will be implemented in WL 5597. However, in this bug we are making it clear to the user what the problem is, by printing a message to the error log if the execution time, for a given statement in RBR, takes more than LONG_FIND_ROW_THRESHOLD (set to 60 seconds). This shall help the DBA to diagnose what's happening when facing a slave server that is quiet for no apparent reason... The note is only printed to the error log if log_warnings is set to be greater than 1. --- .../suite/rpl/r/rpl_row_find_row_debug.result | 18 ++++++ .../suite/rpl/t/rpl_row_find_row_debug.test | 62 ++++++++++++++++++ sql/log_event.cc | 64 +++++++++++++++++++ sql/log_event.h | 1 + sql/rpl_rli.cc | 9 +++ sql/rpl_rli.h | 41 ++++++++++++ 6 files changed, 195 insertions(+) create mode 100644 mysql-test/suite/rpl/r/rpl_row_find_row_debug.result create mode 100644 mysql-test/suite/rpl/t/rpl_row_find_row_debug.test diff --git a/mysql-test/suite/rpl/r/rpl_row_find_row_debug.result b/mysql-test/suite/rpl/r/rpl_row_find_row_debug.result new file mode 100644 index 00000000000..295c053563d --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_row_find_row_debug.result @@ -0,0 +1,18 @@ +include/master-slave.inc +[connection master] +include/stop_slave.inc +SET GLOBAL log_warnings = 2; +SET GLOBAL debug="d,inject_long_find_row_note"; +include/start_slave.inc +CREATE TABLE t1 (c1 INT); +INSERT INTO t1 VALUES (1), (2); +UPDATE t1 SET c1= 1000 WHERE c1=2; +DELETE FROM t1; +DROP TABLE t1; +# Check if any note related to long DELETE_ROWS and UPDATE_ROWS appears in the error log +Occurrences: update=1, delete=1 +include/stop_slave.inc +SET GLOBAL debug = ''; +SET GLOBAL log_warnings = 1; +include/start_slave.inc +include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_row_find_row_debug.test b/mysql-test/suite/rpl/t/rpl_row_find_row_debug.test new file mode 100644 index 00000000000..a8d9e878ba2 --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_row_find_row_debug.test @@ -0,0 +1,62 @@ +# +# Bug#11760927: 53375: RBR + NO PK => HIGH LOAD ON SLAVE (TABLE SCAN/CPU) => SLAVE FAILURE +# +--source include/master-slave.inc +--source include/have_binlog_format_row.inc +--source include/have_debug.inc + +# SETUP +# - setup log_warnings and debug +--connection slave +--source include/stop_slave.inc +--let $debug_save= `SELECT @@GLOBAL.debug` +--let $log_warnings_save= `SELECT @@GLOBAL.log_warnings` + +SET GLOBAL log_warnings = 2; + +let $log_error_= `SELECT @@GLOBAL.log_error`; +if(!$log_error_) +{ + # MySQL Server on windows is started with --console and thus + # does not know the location of its .err log, use default location + let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.2.err; +} + +# Assign env variable LOG_ERROR +let LOG_ERROR=$log_error_; + +# force printing the notes to the error log +SET GLOBAL debug="d,inject_long_find_row_note"; +--source include/start_slave.inc + +# test +--connection master +CREATE TABLE t1 (c1 INT); +--sync_slave_with_master +--connection master + +INSERT INTO t1 VALUES (1), (2); +UPDATE t1 SET c1= 1000 WHERE c1=2; +DELETE FROM t1; +DROP TABLE t1; +--sync_slave_with_master + +--echo # Check if any note related to long DELETE_ROWS and UPDATE_ROWS appears in the error log +perl; + use strict; + my $log_error= $ENV{'LOG_ERROR'} or die "LOG_ERROR not set"; + open(FILE, "$log_error") or die("Unable to open $log_error: $!\n"); + my $upd_count = () = grep(/The slave is applying a ROW event on behalf of an UPDATE statement on table t1 and is currently taking a considerable amount/g,); + seek(FILE, 0, 0) or die "Can't seek to beginning of file: $!"; + my $del_count = () = grep(/The slave is applying a ROW event on behalf of a DELETE statement on table t1 and is currently taking a considerable amount/g,); + print "Occurrences: update=$upd_count, delete=$del_count\n"; + close(FILE); +EOF + +# cleanup +--source include/stop_slave.inc +--eval SET GLOBAL debug = '$debug_save' +--eval SET GLOBAL log_warnings = $log_warnings_save +--source include/start_slave.inc + +--source include/rpl_end.inc diff --git a/sql/log_event.cc b/sql/log_event.cc index b29bcfdb536..d994058f037 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -7761,6 +7761,12 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli) // row processing loop + /* + set the initial time of this ROWS statement if it was not done + before in some other ROWS event. + */ + const_cast(rli)->set_row_stmt_start_timestamp(); + while (error == 0 && m_curr_row < m_rows_end) { /* in_use can have been set to NULL in close_tables_for_reopen */ @@ -9252,6 +9258,51 @@ record_compare_exit: return result; } +/* + Check if we are already spending too much time on this statement. + if we are, warn user that it might be because table does not have + a PK, but only if the warning was not printed before for this STMT. + + @param type The event type code. + @param table_name The name of the table that the slave is + operating. + @param is_index_scan States whether the slave is doing an index scan + or not. + @param rli The relay metadata info. +*/ +static inline +void issue_long_find_row_warning(Log_event_type type, + const char *table_name, + bool is_index_scan, + const Relay_log_info *rli) +{ + if ((global_system_variables.log_warnings > 1 && + !const_cast(rli)->is_long_find_row_note_printed())) + { + time_t now= my_time(0); + time_t stmt_ts= const_cast(rli)->get_row_stmt_start_timestamp(); + + DBUG_EXECUTE_IF("inject_long_find_row_note", + stmt_ts-=(LONG_FIND_ROW_THRESHOLD*2);); + + time_t delta= (now - stmt_ts); + + if (delta > LONG_FIND_ROW_THRESHOLD) + { + const_cast(rli)->set_long_find_row_note_printed(); + const char* evt_type= type == DELETE_ROWS_EVENT ? " DELETE" : "n UPDATE"; + const char* scan_type= is_index_scan ? "scanning an index" : "scanning the table"; + + sql_print_information("The slave is applying a ROW event on behalf of a%s statement " + "on table %s and is currently taking a considerable amount " + "of time (%lu seconds). This is due to the fact that it is %s " + "while looking up records to be processed. Consider adding a " + "primary key (or unique key) to the table to improve " + "performance.", evt_type, table_name, delta, scan_type); + } + } +} + /** Locate the current row in event's table. @@ -9287,6 +9338,7 @@ int Rows_log_event::find_row(const Relay_log_info *rli) TABLE *table= m_table; int error= 0; + bool is_table_scan= false, is_index_scan= false; /* rpl_row_tabledefs.test specifies that @@ -9452,6 +9504,8 @@ int Rows_log_event::find_row(const Relay_log_info *rli) } } + is_index_scan=true; + /* In case key is not unique, we still have to iterate over records found and find the one which is identical to the row given. A copy of the @@ -9508,6 +9562,8 @@ int Rows_log_event::find_row(const Relay_log_info *rli) goto err; } + is_table_scan= true; + /* Continue until we find the right record or have made a full loop */ do { @@ -9561,10 +9617,18 @@ int Rows_log_event::find_row(const Relay_log_info *rli) goto err; } ok: + if (is_table_scan || is_index_scan) + issue_long_find_row_warning(get_type_code(), m_table->alias, + is_index_scan, rli); + table->default_column_bitmaps(); DBUG_RETURN(0); err: + if (is_table_scan || is_index_scan) + issue_long_find_row_warning(get_type_code(), m_table->alias, + is_index_scan, rli); + table->default_column_bitmaps(); DBUG_RETURN(error); } diff --git a/sql/log_event.h b/sql/log_event.h index 07656ebaa72..d65f2d2c85d 100644 --- a/sql/log_event.h +++ b/sql/log_event.h @@ -53,6 +53,7 @@ class String; #define PREFIX_SQL_LOAD "SQL_LOAD-" +#define LONG_FIND_ROW_THRESHOLD 60 /* seconds */ /** Either assert or return an error. diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc index e1c764a4248..0000ba9abb7 100644 --- a/sql/rpl_rli.cc +++ b/sql/rpl_rli.cc @@ -1245,6 +1245,15 @@ void Relay_log_info::cleanup_context(THD *thd, bool error) */ thd->variables.option_bits&= ~OPTION_NO_FOREIGN_KEY_CHECKS; thd->variables.option_bits&= ~OPTION_RELAXED_UNIQUE_CHECKS; + + /* + Reset state related to long_find_row notes in the error log: + - timestamp + - flag that decides whether the slave prints or not + */ + reset_row_stmt_start_timestamp(); + unset_long_find_row_note_printed(); + DBUG_VOID_RETURN; } diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h index fb62279c4c1..c8a0f549e0f 100644 --- a/sql/rpl_rli.h +++ b/sql/rpl_rli.h @@ -452,8 +452,49 @@ public: (m_flags & (1UL << IN_STMT)); } + time_t get_row_stmt_start_timestamp() + { + return row_stmt_start_timestamp; + } + + time_t set_row_stmt_start_timestamp() + { + if (row_stmt_start_timestamp == 0) + row_stmt_start_timestamp= my_time(0); + + return row_stmt_start_timestamp; + } + + void reset_row_stmt_start_timestamp() + { + row_stmt_start_timestamp= 0; + } + + void set_long_find_row_note_printed() + { + long_find_row_note_printed= true; + } + + void unset_long_find_row_note_printed() + { + long_find_row_note_printed= false; + } + + bool is_long_find_row_note_printed() + { + return long_find_row_note_printed; + } + private: + uint32 m_flags; + + /* + Runtime state for printing a note when slave is taking + too long while processing a row event. + */ + time_t row_stmt_start_timestamp; + bool long_find_row_note_printed; };