Replication:

Now the I/O thread (in flush_master_info()) flushes the relay log to disk after reading every event. Slower but provides additional safety in case of a brutal crash. I had to make the flush optional (i.e. add an if(some_bool_argument) in the function) because sometimes flush_master_info() is called when there is no usable relay log (the relay log's IO_CACHE is not initialized so can't be flushed). mysql-test/r/rpl_loaddata_rule_m.result: avoid a harmless error in the .err file; we don't need a slave in this test (even though it's called 'rpl' because it's testing binlog-ignore-db). mysql-test/t/rpl_loaddata_rule_m.test: result update sql/repl_failsafe.cc: update call to flush_master_info() according to new prototype. sql/slave.cc: - Now the I/O thread (in flush_master_info()) flushes the relay log to disk after reading every event. Slower but provides additional safety in case of a brutal crash. I had to make the flush optional (i.e. add an if(some_bool_argument) in the function) because sometimes flush_master_info() is called when there is no usable relay log (the relay log's IO_CACHE is not initialized so can't be flushed). - Update version in message. - Remove warning about bug as it's not true anymore (since this changeset). sql/slave.h: new prototype sql/sql_repl.cc: update call to flush_master_info() according to new prototype.
2003-11-23 17:02:59 +01:00 · 2003-11-23 17:02:59 +01:00 · bd6a70019e
commit bd6a70019e
parent d0d8ba7815
6 changed files with 35 additions and 36 deletions
--- a/mysql-test/r/rpl_loaddata_rule_m.result
+++ b/mysql-test/r/rpl_loaddata_rule_m.result
@ -5,7 +5,7 @@ reset slave;
 drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
 start slave;
 drop database if exists mysqltest;
-reset master;
+stop slave;
 create database mysqltest;
 create table t1(a int, b int, unique(b));
 use mysqltest;
--- a/mysql-test/t/rpl_loaddata_rule_m.test
+++ b/mysql-test/t/rpl_loaddata_rule_m.test
@ -9,7 +9,7 @@ drop database if exists mysqltest;
 --enable_warnings
 connection slave;
-reset master;
+stop slave; # don't need slave for this test
 # Test logging on master
--- a/sql/repl_failsafe.cc
+++ b/sql/repl_failsafe.cc
@ -908,7 +908,12 @@ int load_master_data(THD* thd)
 	// don't hit the magic number
 	if (active_mi->master_log_pos < BIN_LOG_HEADER_SIZE)
 	  active_mi->master_log_pos = BIN_LOG_HEADER_SIZE;
-	flush_master_info(active_mi);
+        /*
          Relay log's IO_CACHE may not be inited (even if we are sure that some
          host was specified): there could have been a problem when replication
          started, which led to relay log's IO_CACHE not being inited.
        */
 	flush_master_info(active_mi, 0);
      }
      mysql_free_result(master_status_res);
    }
--- a/sql/slave.cc
+++ b/sql/slave.cc
@ -1107,7 +1107,7 @@ static int get_master_version_and_clock(MYSQL* mysql, MASTER_INFO* mi)
    break;
  default:
    /* 5.0 is not supported */
-    errmsg = "Master reported an unrecognized MySQL version. Note that 4.0 \
+    errmsg = "Master reported an unrecognized MySQL version. Note that 4.1 \
 slaves can't replicate a 5.0 or newer master.";
    break;
  }
@ -1368,32 +1368,9 @@ int init_relay_log_info(RELAY_LOG_INFO* rli, const char* info_fname)
  }
  /*
-    The relay log will now be opened, as a SEQ_READ_APPEND IO_CACHE. It is
+    The relay log will now be opened, as a SEQ_READ_APPEND IO_CACHE.
-    notable that the last kilobytes of it (8 kB for example) may live in
+    Note that the I/O thread flushes it to disk after writing every event, in
-    memory, not on disk (depending on what the thread using it does). While
+    flush_master_info(mi, 1).
    this is efficient, it has a side-effect one must know: 
    The size of the relay log on disk (displayed by 'ls -l' on Unix) can be a
    few kilobytes less than one would expect by doing SHOW SLAVE STATUS; this
    happens when only the IO thread is started (not the SQL thread). The
    "missing" kilobytes are in memory, are preserved during 'STOP SLAVE; START
    SLAVE IO_THREAD', and are flushed to disk when the slave's mysqld stops. So
    this does not cause any bug. Example of how disk size grows by leaps:
     Read_Master_Log_Pos: 7811 -rw-rw----    1 guilhem  qq              4 Jun  5 16:19 gbichot2-relay-bin.002
     ...later...
     Read_Master_Log_Pos: 9744 -rw-rw----    1 guilhem  qq           8192 Jun  5 16:27 gbichot2-relay-bin.002
    See how 4 is less than 7811 and 8192 is less than 9744.
    WARNING: this is risky because the slave can stay like this for a long
    time; then if it has a power failure, master.info says the I/O thread has
    read until 9744 while the relay-log contains only until 8192 (the
    in-memory part from 8192 to 9744 has been lost), so the SQL slave thread
    will miss some events, silently breaking replication.
    Ideally we would like to flush master.info only when we know that the relay
    log has no in-memory tail.
    Note that the above problem may arise only when only the IO thread is
    started, which is unlikely.
  */
  /*
@ -1850,7 +1827,7 @@ file '%s')", fname);
  mi->inited = 1;
  // now change cache READ -> WRITE - must do this before flush_master_info
  reinit_io_cache(&mi->file, WRITE_CACHE,0L,0,1);
-  if ((error=test(flush_master_info(mi))))
+  if ((error=test(flush_master_info(mi, 1))))
    sql_print_error("Failed to flush master info file");
  pthread_mutex_unlock(&mi->data_lock);
  DBUG_RETURN(error);
@ -2100,7 +2077,7 @@ int show_master_info(THD* thd, MASTER_INFO* mi)
 }
-bool flush_master_info(MASTER_INFO* mi)
+bool flush_master_info(MASTER_INFO* mi, bool flush_relay_log_cache)
 {
  IO_CACHE* file = &mi->file;
  char lbuf[22];
@ -2124,6 +2101,20 @@ bool flush_master_info(MASTER_INFO* mi)
              (int)(mi->ssl), mi->ssl_ca, mi->ssl_capath, mi->ssl_cert,
              mi->ssl_cipher, mi->ssl_key);
  flush_io_cache(file);
  /*
    Flush the relay log to disk. If we don't do it, then the relay log will
    have some part (its last kilobytes) in memory only, so if the slave server
    dies now, with, say, from master's position 100 to 150 in memory only (not
    on disk), and with position 150 in master.info, then when the slave
    restarts, the I/O thread will fetch binlogs from 150, so in the relay log
    we will have "[0, 100] U [150, infinity[" and nobody will notice it, so the
    SQL thread will jump from 100 to 150, and replication will silently break.
    When we come to this place in code, relay log may or may not be initialized;
    the caller is responsible for setting 'flush_relay_log_cache' accordingly.
  */
  if (flush_relay_log_cache)
    flush_io_cache(mi->rli.relay_log.get_log_file());
  DBUG_RETURN(0);
 }
@ -2982,7 +2973,7 @@ reconnect done to recover from failed read");
 	sql_print_error("Slave I/O thread could not queue event from master");
 	goto err;
      }
-      flush_master_info(mi);
+      flush_master_info(mi, 1); /* sure that we can flush the relay log */
      /*
        See if the relay logs take too much space.
        We don't lock mi->rli.log_space_lock here; this dirty read saves time
--- a/sql/slave.h
+++ b/sql/slave.h
@ -461,7 +461,7 @@ typedef struct st_table_rule_ent
 int init_slave();
 void init_slave_skip_errors(const char* arg);
-bool flush_master_info(MASTER_INFO* mi);
+bool flush_master_info(MASTER_INFO* mi, bool flush_relay_log_cache);
 bool flush_relay_log_info(RELAY_LOG_INFO* rli);
 int register_slave_on_master(MYSQL* mysql);
 int terminate_slave_threads(MASTER_INFO* mi, int thread_mask,
--- a/sql/sql_repl.cc
+++ b/sql/sql_repl.cc
@ -1085,8 +1085,11 @@ int change_master(THD* thd, MASTER_INFO* mi)
     strmake(mi->master_log_name, mi->rli.group_master_log_name,
             sizeof(mi->master_log_name)-1);
  }
-
+  /*
-  flush_master_info(mi);
+    Relay log's IO_CACHE may not be inited, if rli->inited==0 (server was never
    a slave before).
  */
  flush_master_info(mi, 0);
  if (need_relay_log_purge)
  {
    relay_log_purge= 1;