diff --git a/extra/mariabackup/fil_cur.cc b/extra/mariabackup/fil_cur.cc index e0a4711a2aa..2932fa6d5a6 100644 --- a/extra/mariabackup/fil_cur.cc +++ b/extra/mariabackup/fil_cur.cc @@ -199,12 +199,6 @@ xb_fil_cur_open( return(XB_FIL_CUR_SKIP); } - if (srv_file_flush_method == SRV_O_DIRECT - || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) { - - os_file_set_nocache(cursor->file, node->name, "OPEN"); - } - posix_fadvise(cursor->file, 0, 0, POSIX_FADV_SEQUENTIAL); cursor->page_size = node->space->physical_size(); diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index 67560ec03aa..27bb10bf82e 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -311,6 +311,8 @@ extern const char *innodb_checksum_algorithm_names[]; extern TYPELIB innodb_checksum_algorithm_typelib; extern const char *innodb_flush_method_names[]; extern TYPELIB innodb_flush_method_typelib; +/** Ignored option */ +static ulong innodb_flush_method; static const char *binlog_info_values[] = {"off", "lockless", "on", "auto", NullS}; @@ -1032,6 +1034,8 @@ enum options_xtrabackup #if defined __linux__ || defined _WIN32 OPT_INNODB_LOG_FILE_BUFFERING, #endif + OPT_INNODB_DATA_FILE_BUFFERING, + OPT_INNODB_DATA_FILE_WRITE_THROUGH, OPT_INNODB_LOG_FILE_SIZE, OPT_INNODB_OPEN_FILES, OPT_XTRA_DEBUG_SYNC, @@ -1583,10 +1587,10 @@ struct my_option xb_server_options[] = FALSE, 0, 0, 0, 0, 0}, {"innodb_flush_method", OPT_INNODB_FLUSH_METHOD, - "With which method to flush data.", - &srv_file_flush_method, &srv_file_flush_method, + "Ignored parameter with no effect", + &innodb_flush_method, &innodb_flush_method, &innodb_flush_method_typelib, GET_ENUM, REQUIRED_ARG, - IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT), 0, 0, 0, 0, 0}, + 4/* O_DIRECT */, 0, 0, 0, 0, 0}, {"innodb_log_buffer_size", OPT_INNODB_LOG_BUFFER_SIZE, "Redo log buffer size in bytes.", @@ -1600,6 +1604,16 @@ struct my_option xb_server_options[] = (G_PTR*) &log_sys.log_buffered, 0, GET_BOOL, NO_ARG, TRUE, 0, 0, 0, 0, 0}, #endif + {"innodb_data_file_buffering", OPT_INNODB_DATA_FILE_BUFFERING, + "Whether the file system cache for data files is enabled during --backup", + (G_PTR*) &fil_system.buffered, + (G_PTR*) &fil_system.buffered, 0, GET_BOOL, NO_ARG, + FALSE, 0, 0, 0, 0, 0}, + {"innodb_data_file_write_through", OPT_INNODB_DATA_FILE_WRITE_THROUGH, + "Whether each write to data files writes through", + (G_PTR*) &fil_system.write_through, + (G_PTR*) &fil_system.write_through, 0, GET_BOOL, NO_ARG, + FALSE, 0, 0, 0, 0, 0}, {"innodb_log_file_size", OPT_INNODB_LOG_FILE_SIZE, "Ignored for mysqld option compatibility", (G_PTR*) &srv_log_file_size, (G_PTR*) &srv_log_file_size, 0, @@ -1917,12 +1931,6 @@ xb_get_one_option(const struct my_option *opt, ADD_PRINT_PARAM_OPT(srv_log_group_home_dir); break; - case OPT_INNODB_FLUSH_METHOD: - ut_a(srv_file_flush_method - <= IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT_NO_FSYNC)); - ADD_PRINT_PARAM_OPT(innodb_flush_method_names[srv_file_flush_method]); - break; - case OPT_INNODB_PAGE_SIZE: ADD_PRINT_PARAM_OPT(innobase_page_size); diff --git a/mysql-test/lib/My/Debugger.pm b/mysql-test/lib/My/Debugger.pm index c2062c2eaba..412c028cfc5 100644 --- a/mysql-test/lib/My/Debugger.pm +++ b/mysql-test/lib/My/Debugger.pm @@ -78,7 +78,7 @@ my %debuggers = ( options => '-f -o {log} {exe} {args}', }, rr => { - options => '_RR_TRACE_DIR={log} rr record {exe} {args} --loose-skip-innodb-use-native-aio --loose-innodb-flush-method=fsync', + options => '_RR_TRACE_DIR={log} rr record {exe} {args}', run => 'env', pre => sub { ::mtr_error('rr requires kernel.perf_event_paranoid <= 1') diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index 6eface8c097..998d82587b6 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -355,6 +355,18 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO COMMAND_LINE_ARGUMENT OPTIONAL +VARIABLE_NAME INNODB_DATA_FILE_BUFFERING +SESSION_VALUE NULL +DEFAULT_VALUE OFF +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BOOLEAN +VARIABLE_COMMENT Whether the file system cache for data files is enabled +NUMERIC_MIN_VALUE NULL +NUMERIC_MAX_VALUE NULL +NUMERIC_BLOCK_SIZE NULL +ENUM_VALUE_LIST OFF,ON +READ_ONLY NO +COMMAND_LINE_ARGUMENT OPTIONAL VARIABLE_NAME INNODB_DATA_FILE_PATH SESSION_VALUE NULL DEFAULT_VALUE ibdata1:12M:autoextend @@ -379,6 +391,18 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY YES COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME INNODB_DATA_FILE_WRITE_THROUGH +SESSION_VALUE NULL +DEFAULT_VALUE OFF +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BOOLEAN +VARIABLE_COMMENT Whether each write to data files writes through +NUMERIC_MIN_VALUE NULL +NUMERIC_MAX_VALUE NULL +NUMERIC_BLOCK_SIZE NULL +ENUM_VALUE_LIST OFF,ON +READ_ONLY NO +COMMAND_LINE_ARGUMENT OPTIONAL VARIABLE_NAME INNODB_DATA_HOME_DIR SESSION_VALUE NULL DEFAULT_VALUE @@ -1015,6 +1039,18 @@ NUMERIC_BLOCK_SIZE 4096 ENUM_VALUE_LIST NULL READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME INNODB_LOG_FILE_WRITE_THROUGH +SESSION_VALUE NULL +DEFAULT_VALUE OFF +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BOOLEAN +VARIABLE_COMMENT Whether each write to ib_logfile0 is write through +NUMERIC_MIN_VALUE NULL +NUMERIC_MAX_VALUE NULL +NUMERIC_BLOCK_SIZE NULL +ENUM_VALUE_LIST OFF,ON +READ_ONLY NO +COMMAND_LINE_ARGUMENT OPTIONAL VARIABLE_NAME INNODB_LOG_GROUP_HOME_DIR SESSION_VALUE NULL DEFAULT_VALUE diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 6bf03d3e72a..d71cbfbf743 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1724,7 +1724,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept resize_log.write(CHECKPOINT_1, {c, get_block_size()}); } - if (srv_file_flush_method != SRV_O_DSYNC) + if (!log_write_through) ut_a(log.flush()); latch.wr_lock(SRW_LOCK_CALL); ut_ad(checkpoint_pending); @@ -1756,7 +1756,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept if (!is_pmem()) { - if (srv_file_flush_method != SRV_O_DSYNC) + if (!log_write_through) ut_a(resize_log.flush()); IF_WIN(log.close(),); } @@ -1902,13 +1902,7 @@ static bool log_checkpoint() if (recv_recovery_is_on()) recv_sys.apply(true); - switch (srv_file_flush_method) { - case SRV_NOSYNC: - case SRV_O_DIRECT_NO_FSYNC: - break; - default: - fil_flush_file_spaces(); - } + fil_flush_file_spaces(); log_sys.latch.wr_lock(SRW_LOCK_CALL); const lsn_t end_lsn= log_sys.get_lsn(); @@ -2060,13 +2054,7 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn) MONITOR_FLUSH_SYNC_PAGES, n_flushed); } - switch (srv_file_flush_method) { - case SRV_NOSYNC: - case SRV_O_DIRECT_NO_FSYNC: - break; - default: - fil_flush_file_spaces(); - } + fil_flush_file_spaces(); log_sys.latch.wr_lock(SRW_LOCK_CALL); const lsn_t newest_lsn= log_sys.get_lsn(); diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 481a2dbce53..cecda94cac4 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -499,6 +499,9 @@ void fil_space_t::flush_low() break; } + if (fil_system.is_write_through()) + goto skip_flush; + fil_n_pending_tablespace_flushes++; for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node; node= UT_LIST_GET_NEXT(chain, node)) @@ -523,8 +526,9 @@ void fil_space_t::flush_low() mysql_mutex_unlock(&fil_system.mutex); } - clear_flush(); fil_n_pending_tablespace_flushes--; +skip_flush: + clear_flush(); } /** Try to extend a tablespace. @@ -753,7 +757,6 @@ inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle) { if (space->is_in_unflushed_spaces) { - ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC); space->is_in_unflushed_spaces= false; fil_system.unflushed_spaces.remove(*space); } @@ -786,7 +789,6 @@ pfs_os_file_t fil_system_t::detach(fil_space_t *space, bool detach_handle) if (space->is_in_unflushed_spaces) { - ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC); space->is_in_unflushed_spaces= false; unflushed_spaces.remove(*space); } @@ -1320,6 +1322,120 @@ ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size() mysql_mutex_unlock(&mutex); } +ATTRIBUTE_COLD void fil_space_t::reopen_all() +{ + mysql_mutex_assert_owner(&fil_system.mutex); + fil_system.freeze_space_list++; + + for (fil_space_t &space : fil_system.space_list) + { + for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + if (node->is_open()) + goto need_to_close; + continue; + + need_to_close: + uint32_t p= space.n_pending.fetch_or(CLOSING, std::memory_order_acquire); + if (p & (STOPPING | CLOSING)) + continue; + + for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + { + if (!node->is_open()) + continue; + + ulint type= OS_DATA_FILE; + + switch (FSP_FLAGS_GET_ZIP_SSIZE(space.flags)) { + case 1: case 2: + type= OS_DATA_FILE_NO_O_DIRECT; + } + + for (ulint count= 10000; count--;) + { + p= space.pending(); + + if (!(p & CLOSING) || (p & STOPPING)) + break; + + if (!(p & PENDING) && !node->being_extended) + { + space.reacquire(); + mysql_mutex_unlock(&fil_system.mutex); + /* Unconditionally flush the file, because + fil_system.write_through was updated prematurely, + potentially causing some flushes to be lost. */ + os_file_flush(node->handle); + mysql_mutex_lock(&fil_system.mutex); + p= space.n_pending.fetch_sub(1, std::memory_order_relaxed) - 1; + + if (!(p & CLOSING) || (p & STOPPING)) + break; + + if (!(p & PENDING) && !node->being_extended) + { + ut_a(os_file_close(node->handle)); + bool success; + node->handle= os_file_create(innodb_data_file_key, node->name, + node->is_raw_disk + ? OS_FILE_OPEN_RAW : OS_FILE_OPEN, + OS_FILE_AIO, type, + srv_read_only_mode, &success); + ut_a(success); + goto next_file; + } + } + + space.reacquire(); + mysql_mutex_unlock(&fil_system.mutex); + std::this_thread::sleep_for(std::chrono::microseconds(100)); + mysql_mutex_lock(&fil_system.mutex); + space.release(); + + if (!node->is_open()) + goto next_file; + } + + if (!(p & CLOSING) || (p & STOPPING)) + next_file: + continue; + + sql_print_error("InnoDB: Failed to reopen file '%s' due to " UINT32PF + " operations", node->name, p & PENDING); + } + } + + fil_system.freeze_space_list--; +} + +void fil_system_t::set_write_through(bool write_through) +{ + mysql_mutex_lock(&mutex); + + if (write_through != this->write_through) + { + this->write_through= write_through; + fil_space_t::reopen_all(); + } + + mysql_mutex_unlock(&mutex); +} + +void fil_system_t::set_buffered(bool buffered) +{ + mysql_mutex_lock(&mutex); + + if (buffered != this->buffered) + { + this->buffered= buffered; + fil_space_t::reopen_all(); + } + + mysql_mutex_unlock(&mutex); +} + /** Close all tablespace files at shutdown */ void fil_space_t::close_all() { @@ -1340,12 +1456,9 @@ void fil_space_t::close_all() for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node != NULL; node= UT_LIST_GET_NEXT(chain, node)) { - if (!node->is_open()) - { next: continue; - } for (ulint count= 10000; count--;) { @@ -1361,8 +1474,8 @@ void fil_space_t::close_all() goto next; } - ib::error() << "File '" << node->name << "' has " << space.referenced() - << " operations"; + sql_print_error("InnoDB: File '%s' has " UINT32PF " operations", + node->name, space.referenced()); } fil_system.detach(&space); @@ -2598,7 +2711,7 @@ inline void fil_node_t::complete_write() mysql_mutex_assert_not_owner(&fil_system.mutex); if (space->purpose != FIL_TYPE_TEMPORARY && - srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC && + (!fil_system.is_write_through() && !my_disable_sync) && space->set_needs_flush()) { mysql_mutex_lock(&fil_system.mutex); @@ -2774,14 +2887,6 @@ write_completed: possibly cached by the OS. */ void fil_flush_file_spaces() { - if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) - { - ut_d(mysql_mutex_lock(&fil_system.mutex)); - ut_ad(fil_system.unflushed_spaces.empty()); - ut_d(mysql_mutex_unlock(&fil_system.mutex)); - return; - } - rescan: mysql_mutex_lock(&fil_system.mutex); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 86fc747faed..a51e30e28ce 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -366,6 +366,8 @@ const char* innodb_flush_method_names[] = { NullS }; +static constexpr ulong innodb_flush_method_default = IF_WIN(6,4); + /** Enumeration of innodb_flush_method */ TYPELIB innodb_flush_method_typelib = { array_elements(innodb_flush_method_names) - 1, @@ -374,6 +376,9 @@ TYPELIB innodb_flush_method_typelib = { NULL }; +/** Deprecated parameter */ +static ulong innodb_flush_method; + /** Names of allowed values of innodb_deadlock_report */ static const char *innodb_deadlock_report_names[]= { "off", /* Do not report any details of deadlocks */ @@ -4005,22 +4010,27 @@ static int innodb_init_params() data_mysql_default_charset_coll = (ulint) default_charset_info->number; -#ifndef _WIN32 - if (srv_use_atomic_writes && my_may_have_atomic_write) { - /* - Force O_DIRECT on Unixes (on Windows writes are always - unbuffered) - */ - switch (srv_file_flush_method) { - case SRV_O_DIRECT: - case SRV_O_DIRECT_NO_FSYNC: - break; - default: - srv_file_flush_method = SRV_O_DIRECT; - fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n"); - } - } + if (innodb_flush_method == 1 /* O_DSYNC */) { + log_sys.log_write_through = true; + fil_system.write_through = true; + fil_system.buffered = false; +#if defined __linux__ || defined _WIN32 + log_sys.log_buffered = false; + goto skip_buffering_tweak; #endif + } else if (innodb_flush_method >= 4 /* O_DIRECT */ + IF_WIN(&& innodb_flush_method < 8 /* normal */,)) { + /* O_DIRECT and similar settings do nothing */ +#ifndef _WIN32 + } else if (srv_use_atomic_writes && my_may_have_atomic_write) { + /* If atomic writes are enabled, do the same as with + innodb_flush_method=O_DIRECT: retain the default settings */ +#endif + } else { + log_sys.log_write_through = false; + fil_system.write_through = false; + fil_system.buffered = true; + } #if defined __linux__ || defined _WIN32 if (srv_flush_log_at_trx_commit == 2) { @@ -4028,6 +4038,7 @@ static int innodb_init_params() innodb_flush_log_at_trx_commit=2. */ log_sys.log_buffered = true; } +skip_buffering_tweak: #endif if (srv_read_only_mode) { @@ -4035,12 +4046,6 @@ static int innodb_init_params() srv_use_doublewrite_buf = FALSE; } -#if !defined LINUX_NATIVE_AIO && !defined HAVE_URING && !defined _WIN32 - /* Currently native AIO is supported only on windows and linux - and that also when the support is compiled in. In all other - cases, we ignore the setting of innodb_use_native_aio. */ - srv_use_native_aio = FALSE; -#endif #ifdef HAVE_URING if (srv_use_native_aio && io_uring_may_be_unsafe) { sql_print_warning("innodb_use_native_aio may cause " @@ -4048,22 +4053,13 @@ static int innodb_init_params() "https://jira.mariadb.org/browse/MDEV-26674", io_uring_may_be_unsafe); } +#elif !defined LINUX_NATIVE_AIO && !defined _WIN32 + /* Currently native AIO is supported only on windows and linux + and that also when the support is compiled in. In all other + cases, we ignore the setting of innodb_use_native_aio. */ + srv_use_native_aio = FALSE; #endif -#ifndef _WIN32 - ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC); -#else - switch (srv_file_flush_method) { - case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */: - srv_file_flush_method = SRV_ALL_O_DIRECT_FSYNC; - break; - case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */: - srv_file_flush_method = SRV_FSYNC; - break; - default: - ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC); - } -#endif innodb_buffer_pool_size_init(); srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift); @@ -18409,7 +18405,7 @@ buffer_pool_load_abort( } #if defined __linux__ || defined _WIN32 -static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*, +static void innodb_log_file_buffering_update(THD *, st_mysql_sys_var*, void *, const void *save) { mysql_mutex_unlock(&LOCK_global_system_variables); @@ -18418,6 +18414,30 @@ static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*, } #endif +static void innodb_log_file_write_through_update(THD *, st_mysql_sys_var*, + void *, const void *save) +{ + mysql_mutex_unlock(&LOCK_global_system_variables); + log_sys.set_write_through(*static_cast(save)); + mysql_mutex_lock(&LOCK_global_system_variables); +} + +static void innodb_data_file_buffering_update(THD *, st_mysql_sys_var*, + void *, const void *save) +{ + mysql_mutex_unlock(&LOCK_global_system_variables); + fil_system.set_buffered(*static_cast(save)); + mysql_mutex_lock(&LOCK_global_system_variables); +} + +static void innodb_data_file_write_through_update(THD *, st_mysql_sys_var*, + void *, const void *save) +{ + mysql_mutex_unlock(&LOCK_global_system_variables); + fil_system.set_write_through(*static_cast(save)); + mysql_mutex_lock(&LOCK_global_system_variables); +} + static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*, void *var, const void *save) { @@ -18876,11 +18896,10 @@ static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit, " guarantees in case of crash. 0 and 2 can be faster than 1 or 3.", NULL, NULL, 1, 0, 3, 0); -static MYSQL_SYSVAR_ENUM(flush_method, srv_file_flush_method, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, +static MYSQL_SYSVAR_ENUM(flush_method, innodb_flush_method, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_DEPRECATED, "With which method to flush data.", - NULL, NULL, IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT), - &innodb_flush_method_typelib); + NULL, NULL, innodb_flush_method_default, &innodb_flush_method_typelib); static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -19312,6 +19331,21 @@ static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered, nullptr, innodb_log_file_buffering_update, FALSE); #endif +static MYSQL_SYSVAR_BOOL(log_file_write_through, log_sys.log_write_through, + PLUGIN_VAR_OPCMDARG, + "Whether each write to ib_logfile0 is write through", + nullptr, innodb_log_file_write_through_update, FALSE); + +static MYSQL_SYSVAR_BOOL(data_file_buffering, fil_system.buffered, + PLUGIN_VAR_OPCMDARG, + "Whether the file system cache for data files is enabled", + nullptr, innodb_data_file_buffering_update, FALSE); + +static MYSQL_SYSVAR_BOOL(data_file_write_through, fil_system.write_through, + PLUGIN_VAR_OPCMDARG, + "Whether each write to data files writes through", + nullptr, innodb_data_file_write_through_update, FALSE); + static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size, PLUGIN_VAR_RQCMDARG, "Redo log size in bytes.", @@ -19756,6 +19790,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { #if defined __linux__ || defined _WIN32 MYSQL_SYSVAR(log_file_buffering), #endif + MYSQL_SYSVAR(log_file_write_through), + MYSQL_SYSVAR(data_file_buffering), + MYSQL_SYSVAR(data_file_write_through), MYSQL_SYSVAR(log_file_size), MYSQL_SYSVAR(log_group_home_dir), MYSQL_SYSVAR(max_dirty_pages_pct), diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 940e1b68458..210f365ddd8 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -51,35 +51,6 @@ using space_list_t= ilist; // Forward declaration extern my_bool srv_use_doublewrite_buf; -/** Possible values of innodb_flush_method */ -enum srv_flush_t -{ - /** fsync, the default */ - SRV_FSYNC= 0, - /** open log files in O_DSYNC mode */ - SRV_O_DSYNC, - /** do not call os_file_flush() when writing data files, but do flush - after writing to log files */ - SRV_LITTLESYNC, - /** do not flush after writing */ - SRV_NOSYNC, - /** invoke os_file_set_nocache() on data files. This implies using - unbuffered I/O but still fdatasync(), because some filesystems might - not flush meta-data on write completion */ - SRV_O_DIRECT, - /** Like O_DIRECT, but skip fdatasync(), assuming that the data is - durable on write completion */ - SRV_O_DIRECT_NO_FSYNC -#ifdef _WIN32 - /** Traditional Windows appoach to open all files without caching, - and do FileFlushBuffers() */ - ,SRV_ALL_O_DIRECT_FSYNC -#endif -}; - -/** innodb_flush_method */ -extern ulong srv_file_flush_method; - /** Undo tablespaces starts with space_id. */ extern uint32_t srv_undo_space_id_start; /** The number of UNDO tablespaces that are open and ready to use. */ @@ -631,6 +602,8 @@ private: } public: + /** Reopen all files on set_write_through() or set_buffered(). */ + static void reopen_all(); /** Try to close a file to adhere to the innodb_open_files limit. @param print_info whether to diagnose why a file cannot be closed @return whether a file was closed */ @@ -1414,6 +1387,20 @@ public: fil_space_t* temp_space; /*!< The innodb_temporary tablespace */ /** Map of fil_space_t::id to fil_space_t* */ hash_table_t spaces; + + /** whether each write to data files is durable (O_DSYNC) */ + my_bool write_through; + /** whether data files are buffered (not O_DIRECT) */ + my_bool buffered; + + /** Try to enable or disable write-through of data files */ + void set_write_through(bool write_through); + /** Try to enable or disable file system caching of data files */ + void set_buffered(bool buffered); + + TPOOL_SUPPRESS_TSAN bool is_write_through() const { return write_through; } + TPOOL_SUPPRESS_TSAN bool is_buffered() const { return buffered; } + /** tablespaces for which fil_space_t::needs_flush() holds */ sized_ilist unflushed_spaces; /** number of currently open files; protected by mutex */ @@ -1527,12 +1514,7 @@ template inline void fil_space_t::flush() mysql_mutex_assert_not_owner(&fil_system.mutex); ut_ad(!have_reference || (pending() & PENDING)); ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT); - if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) - { - ut_ad(!is_in_unflushed_spaces); - ut_ad(!needs_flush()); - } - else if (have_reference) + if (have_reference) flush_low(); else { diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 09e4ece8894..8afa92abc93 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -275,6 +275,8 @@ public: bool log_maybe_unbuffered; # endif #endif + /** whether each write to ib_logfile0 is durable (O_DSYNC) */ + my_bool log_write_through; /** Fields involved in checkpoints @{ */ lsn_t log_capacity; /*!< capacity of the log; if @@ -362,6 +364,8 @@ public: /** Try to enable or disable file system caching (update log_buffered) */ void set_buffered(bool buffered); #endif + /** Try to enable or disable durable writes (update log_write_through) */ + void set_write_through(bool write_through); void attach(log_file_t file, os_offset_t size); diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 2804143721c..4e9ed1263f6 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -401,6 +401,31 @@ void log_t::set_buffered(bool buffered) } #endif + /** Try to enable or disable durable writes (update log_write_through) */ +void log_t::set_write_through(bool write_through) +{ + if (is_pmem() || high_level_read_only) + return; + log_resize_acquire(); + if (!resize_in_progress() && is_opened() && + bool(log_write_through) != write_through) + { + os_file_close_func(log.m_file); + log.m_file= OS_FILE_CLOSED; + std::string path{get_log_file_path()}; + log_write_through= write_through; + bool success; + log.m_file= os_file_create_func(path.c_str(), + OS_FILE_OPEN, OS_FILE_NORMAL, OS_LOG_FILE, + false, &success); + ut_a(log.m_file != OS_FILE_CLOSED); + sql_print_information(log_write_through + ? "InnoDB: Log writes write through" + : "InnoDB: Log writes may be cached"); + } + log_resize_release(); +} + /** Start resizing the log and release the exclusive latch. @param size requested new file_size @return whether the resizing was started successfully */ @@ -852,7 +877,7 @@ bool log_t::flush(lsn_t lsn) noexcept { ut_ad(lsn >= get_flushed_lsn()); flush_lock.set_pending(lsn); - const bool success{srv_file_flush_method == SRV_O_DSYNC || log.flush()}; + const bool success{log_write_through || log.flush()}; if (UNIV_LIKELY(success)) { flushed_to_disk_lsn.store(lsn, std::memory_order_release); diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index d4cfb6207bf..6141c9dcc37 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -65,7 +65,9 @@ Created 10/21/1995 Heikki Tuuri #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */ #ifdef _WIN32 -#include +# include +#elif !defined O_DSYNC +# define O_DSYNC O_SYNC #endif // my_test_if_atomic_write() , my_win_secattr() @@ -931,6 +933,8 @@ bool os_file_flush_func( os_file_t file) { + if (UNIV_UNLIKELY(my_disable_sync)) return true; + int ret; ret = os_file_sync_posix(file); @@ -981,40 +985,19 @@ os_file_create_simple_func( *success = false; - int create_flag; - const char* mode_str = NULL; + int create_flag = O_RDONLY; ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); - if (create_mode == OS_FILE_OPEN) { - mode_str = "OPEN"; - - if (access_type == OS_FILE_READ_ONLY) { - - create_flag = O_RDONLY; - - } else if (read_only) { - - create_flag = O_RDONLY; - - } else { + if (read_only) { + } else if (create_mode == OS_FILE_OPEN) { + if (access_type != OS_FILE_READ_ONLY) { create_flag = O_RDWR; } - - } else if (read_only) { - - mode_str = "OPEN"; - create_flag = O_RDONLY; - } else if (create_mode == OS_FILE_CREATE) { - - mode_str = "CREATE"; create_flag = O_RDWR | O_CREAT | O_EXCL; - } else if (create_mode == OS_FILE_CREATE_PATH) { - - mode_str = "CREATE PATH"; /* Create subdirs along the path if needed. */ *success = os_file_create_subdirs_if_needed(name); @@ -1040,40 +1023,32 @@ os_file_create_simple_func( return(OS_FILE_CLOSED); } - bool retry; + create_flag |= O_CLOEXEC; + if (fil_system.is_write_through()) create_flag |= O_DSYNC; + int direct_flag = fil_system.is_buffered() ? 0 : O_DIRECT; - do { - file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); + for (;;) { + file = open(name, create_flag | direct_flag, os_innodb_umask); if (file == -1) { + if (direct_flag && errno == EINVAL) { + direct_flag = 0; + continue; + } + *success = false; - retry = os_file_handle_error( + if (!os_file_handle_error( name, create_mode == OS_FILE_OPEN - ? "open" : "create"); + ? "open" : "create")) { + break; + } } else { *success = true; - retry = false; - } - - } while (retry); - - /* This function is always called for data files, we should disable - OS caching (O_DIRECT) here as we do in os_file_create_func(), so - we open the same file in the same mode, see man page of open(2). */ - if (!srv_read_only_mode && *success) { - switch (srv_file_flush_method) { - case SRV_O_DSYNC: - case SRV_O_DIRECT: - case SRV_O_DIRECT_NO_FSYNC: - os_file_set_nocache(file, name, mode_str); - break; - default: break; } } -#ifndef _WIN32 if (!read_only && *success && access_type == OS_FILE_READ_WRITE @@ -1084,7 +1059,6 @@ os_file_create_simple_func( close(file); file = -1; } -#endif /* !_WIN32 */ return(file); } @@ -1156,8 +1130,8 @@ os_file_create_func( return(OS_FILE_CLOSED); ); - int create_flag; - const char* mode_str = NULL; + int create_flag = O_RDONLY | O_CLOEXEC; + const char* mode_str = "OPEN"; on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT ? true : false; @@ -1167,30 +1141,17 @@ os_file_create_func( create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT)); - if (create_mode == OS_FILE_OPEN - || create_mode == OS_FILE_OPEN_RAW - || create_mode == OS_FILE_OPEN_RETRY) { - - mode_str = "OPEN"; - - create_flag = read_only ? O_RDONLY : O_RDWR; - - } else if (read_only) { - - mode_str = "OPEN"; - - create_flag = O_RDONLY; - + if (read_only) { + } else if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RAW + || create_mode == OS_FILE_OPEN_RETRY) { + create_flag = O_RDWR | O_CLOEXEC; } else if (create_mode == OS_FILE_CREATE) { - mode_str = "CREATE"; - create_flag = O_RDWR | O_CREAT | O_EXCL; - + create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC; } else if (create_mode == OS_FILE_OVERWRITE) { - mode_str = "OVERWRITE"; - create_flag = O_RDWR | O_CREAT | O_TRUNC; - + create_flag = O_RDWR | O_CREAT | O_TRUNC | O_CLOEXEC; } else { ib::error() << "Unknown file create mode (" << create_mode << ")" @@ -1205,25 +1166,30 @@ os_file_create_func( ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL); - /* We let O_DSYNC only affect log files */ + create_flag |= O_CLOEXEC; - if (!read_only - && type == OS_LOG_FILE - && srv_file_flush_method == SRV_O_DSYNC) { -#ifdef O_DSYNC + int direct_flag = type == OS_DATA_FILE && create_mode != OS_FILE_CREATE + && !fil_system.is_buffered() + ? O_DIRECT : 0; + + if (read_only) { + } else if ((type == OS_LOG_FILE) + ? log_sys.log_write_through + : fil_system.is_write_through()) { create_flag |= O_DSYNC; -#else - create_flag |= O_SYNC; -#endif } os_file_t file; - bool retry; - do { - file = open(name, create_flag | O_CLOEXEC, os_innodb_umask); + for (;;) { + file = open(name, create_flag | direct_flag, os_innodb_umask); if (file == -1) { + if (direct_flag && errno == EINVAL) { + direct_flag = 0; + continue; + } + const char* operation; operation = (create_mode == OS_FILE_CREATE @@ -1232,39 +1198,30 @@ os_file_create_func( *success = false; if (on_error_no_exit) { - retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + if (os_file_handle_error_no_exit( + name, operation, on_error_silent)) + continue; } else { - retry = os_file_handle_error(name, operation); + if (os_file_handle_error(name, operation)) + continue; } + + return file; } else { *success = true; - retry = false; + break; } - - } while (retry); - - if (!*success) { - return file; } #if (defined __sun__ && defined DIRECTIO_ON) || defined O_DIRECT - if (type == OS_DATA_FILE) { - switch (srv_file_flush_method) { - case SRV_O_DSYNC: - case SRV_O_DIRECT: - case SRV_O_DIRECT_NO_FSYNC: + if (type == OS_DATA_FILE && create_mode == OS_FILE_CREATE + && !fil_system.is_buffered()) { # ifdef __linux__ use_o_direct: # endif - os_file_set_nocache(file, name, mode_str); - break; - default: - break; - } - } + os_file_set_nocache(file, name, mode_str); # ifdef __linux__ - else if (type == OS_LOG_FILE && !log_sys.is_opened()) { + } else if (type == OS_LOG_FILE && !log_sys.is_opened()) { struct stat st; char b[20 + sizeof "/sys/dev/block/" ":" "/../queue/physical_block_size"]; @@ -1316,11 +1273,10 @@ skip_o_direct: log_sys.log_buffered= true; log_sys.set_block_size(512); } - } # endif + } #endif -#ifndef _WIN32 if (!read_only && create_mode != OS_FILE_OPEN_RAW && !my_disable_locking @@ -1348,7 +1304,6 @@ skip_o_direct: close(file); file = -1; } -#endif /* !_WIN32 */ return(file); } @@ -1786,6 +1741,9 @@ Flushes the write buffers of a given file to the disk. @return true if success */ bool os_file_flush_func(os_file_t file) { + if (UNIV_UNLIKELY(my_disable_sync)) + return true; + ++os_n_fsyncs; static bool disable_datasync; @@ -2011,6 +1969,11 @@ os_file_create_simple_func( return(OS_FILE_CLOSED); } + if (fil_system.is_write_through()) + attributes |= FILE_FLAG_WRITE_THROUGH; + if (!fil_system.is_buffered()) + attributes |= FILE_FLAG_NO_BUFFERING; + bool retry; do { @@ -2182,27 +2145,16 @@ os_file_create_func( if (!log_sys.is_opened() && !log_sys.log_buffered) { attributes|= FILE_FLAG_NO_BUFFERING; } - if (srv_file_flush_method == SRV_O_DSYNC) + if (log_sys.log_write_through) + attributes|= FILE_FLAG_WRITE_THROUGH; + } else { + if (type == OS_DATA_FILE && !fil_system.is_buffered()) + attributes|= FILE_FLAG_NO_BUFFERING; + if (fil_system.is_write_through()) attributes|= FILE_FLAG_WRITE_THROUGH; } - else if (type == OS_DATA_FILE) - { - switch (srv_file_flush_method) - { - case SRV_FSYNC: - case SRV_LITTLESYNC: - case SRV_NOSYNC: - break; - default: - attributes|= FILE_FLAG_NO_BUFFERING; - } - } - DWORD access = GENERIC_READ; - - if (!read_only) { - access |= GENERIC_WRITE; - } + DWORD access = read_only ? GENERIC_READ : GENERIC_READ | GENERIC_WRITE; for (;;) { const char *operation; diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index ffb7f53c15c..02c7367cc93 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -223,9 +223,6 @@ ulong srv_read_ahead_threshold; buffer in terms of percentage of the buffer pool. */ uint srv_change_buffer_max_size; -ulong srv_file_flush_method; - - /** copy of innodb_open_files; @see innodb_init_params() */ ulint srv_max_n_open_files; diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 475ae887d23..0ff20b31771 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -1168,7 +1168,7 @@ static void trx_flush_log_if_needed_low(lsn_t lsn, const trx_t *trx) callback= &cb; } - log_write_up_to(lsn, srv_file_flush_method != SRV_NOSYNC && + log_write_up_to(lsn, !my_disable_sync && (srv_flush_log_at_trx_commit & 1), callback); }