From 5624a03c83adaa832cb25f2d3acbf52ce437b118 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Tue, 6 May 2014 13:17:49 -0400 Subject: [PATCH 01/46] #226 delete CMakeLists.in, no longer used --- storage/tokudb/CMakeLists.in | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 storage/tokudb/CMakeLists.in diff --git a/storage/tokudb/CMakeLists.in b/storage/tokudb/CMakeLists.in deleted file mode 100644 index 20c05126841..00000000000 --- a/storage/tokudb/CMakeLists.in +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (C) 2006 MySQL AB -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; version 2 of the License. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTOKUDB_VERSION=\\\"TOKUDB_VERSION_REPLACE_ME\\\"") -SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DSAFEMALLOC -DSAFE_MUTEX") -SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DSAFEMALLOC -DSAFE_MUTEX") - -INCLUDE_DIRECTORIES(TOKUDB_DIR_REPLACE_ME/windows - TOKUDB_DIR_REPLACE_ME/src - TOKUDB_DIR_REPLACE_ME/include - TOKUDB_DIR_REPLACE_ME/toku_include) - -INCLUDE("${PROJECT_SOURCE_DIR}/storage/mysql_storage_engine.cmake") -SET(TOKUDB_SOURCES hatoku_hton.cc ha_tokudb.cc hatoku_cmp.cc) -MYSQL_STORAGE_ENGINE(TOKUDB) - -TARGET_LINK_LIBRARIES(ha_tokudb PowrProf optimized TOKUDB_OBJ_DIR_REPLACE_ME/opt/ipo_libtokudb optimized TOKUDB_OBJ_DIR_REPLACE_ME/opt/libtokuportability debug TOKUDB_OBJ_DIR_REPLACE_ME/debug/static_libtokudb debug TOKUDB_OBJ_DIR_REPLACE_ME/debug/libtokuportability) From b0493252e01a2afc64abe3238bcdb51cf7bdbd8f Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Wed, 7 May 2014 08:20:41 -0400 Subject: [PATCH 02/46] #228 use thd_get/set_ha_data for tokudb_trx data --- storage/tokudb/ha_tokudb.cc | 49 +++++++++--------------- storage/tokudb/ha_tokudb_alter_56.cc | 2 +- storage/tokudb/ha_tokudb_alter_common.cc | 2 +- storage/tokudb/hatoku_hton.cc | 25 ++++++------ 4 files changed, 32 insertions(+), 46 deletions(-) diff --git a/storage/tokudb/ha_tokudb.cc b/storage/tokudb/ha_tokudb.cc index 296c472d36b..3f44c46afe5 100644 --- a/storage/tokudb/ha_tokudb.cc +++ b/storage/tokudb/ha_tokudb.cc @@ -120,14 +120,6 @@ extern "C" { #include "hatoku_defines.h" #include "hatoku_cmp.h" -static inline void *thd_data_get(THD *thd, int slot) { - return thd->ha_data[slot].ha_ptr; -} - -static inline void thd_data_set(THD *thd, int slot, void *data) { - thd->ha_data[slot].ha_ptr = data; -} - static inline uint 
get_key_parts(const KEY *key); #undef PACKAGE @@ -1016,8 +1008,7 @@ static uchar* pack_toku_field_blob( static int create_tokudb_trx_data_instance(tokudb_trx_data** out_trx) { int error; - tokudb_trx_data* trx = NULL; - trx = (tokudb_trx_data *) tokudb_my_malloc(sizeof(*trx), MYF(MY_ZEROFILL)); + tokudb_trx_data* trx = (tokudb_trx_data *) tokudb_my_malloc(sizeof(*trx), MYF(MY_ZEROFILL)); if (!trx) { error = ENOMEM; goto cleanup; @@ -1614,8 +1605,7 @@ int ha_tokudb::initialize_share( DB_TXN* txn = NULL; bool do_commit = false; THD* thd = ha_thd(); - tokudb_trx_data *trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(ha_thd(), tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton); if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) { txn = trx->sub_sp_level; } @@ -3260,7 +3250,7 @@ void ha_tokudb::start_bulk_insert(ha_rows rows) { TOKUDB_HANDLER_DBUG_ENTER("%llu txn %p", (unsigned long long) rows, transaction); #endif THD* thd = ha_thd(); - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); delay_updating_ai_metadata = true; ai_metadata_update_required = false; abort_loader = false; @@ -3328,7 +3318,7 @@ int ha_tokudb::end_bulk_insert(bool abort) { TOKUDB_HANDLER_DBUG_ENTER(""); int error = 0; THD* thd = ha_thd(); - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); bool using_loader = (loader != NULL); if (ai_metadata_update_required) { tokudb_pthread_mutex_lock(&share->mutex); @@ -4060,7 +4050,7 @@ int ha_tokudb::write_row(uchar * record) { } } - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (!error) { added_rows++; trx->stmt_progress.inserted++; @@ -4117,7 +4107,7 @@ int 
ha_tokudb::update_row(const uchar * old_row, uchar * new_row) { THD* thd = ha_thd(); DB_TXN* sub_trans = NULL; DB_TXN* txn = NULL; - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); uint curr_num_DBs; LINT_INIT(error); @@ -4291,7 +4281,7 @@ int ha_tokudb::delete_row(const uchar * record) { bool has_null; THD* thd = ha_thd(); uint curr_num_DBs; - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);; + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);; ha_statistic_increment(&SSV::ha_delete_count); @@ -4855,7 +4845,7 @@ int ha_tokudb::index_read(uchar * buf, const uchar * key, uint key_len, enum ha_ int error = 0; uint32_t flags = 0; THD* thd = ha_thd(); - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);; + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);; struct smart_dbt_info info; struct index_read_info ir_info; @@ -5333,7 +5323,7 @@ int ha_tokudb::get_next(uchar* buf, int direction, DBT* key_to_compare, bool do_ int error = 0; uint32_t flags = SET_PRELOCK_FLAG(0); THD* thd = ha_thd(); - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);; + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);; bool need_val; HANDLE_INVALID_CURSOR(); @@ -5486,7 +5476,7 @@ int ha_tokudb::index_first(uchar * buf) { struct smart_dbt_info info; uint32_t flags = SET_PRELOCK_FLAG(0); THD* thd = ha_thd(); - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);; + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);; HANDLE_INVALID_CURSOR(); ha_statistic_increment(&SSV::ha_read_first_count); @@ -5529,7 +5519,7 @@ int ha_tokudb::index_last(uchar * buf) { struct smart_dbt_info info; uint32_t flags = SET_PRELOCK_FLAG(0); THD* thd = ha_thd(); - tokudb_trx_data* 
trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);; + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);; HANDLE_INVALID_CURSOR(); ha_statistic_increment(&SSV::ha_read_last_count); @@ -5620,7 +5610,7 @@ int ha_tokudb::rnd_next(uchar * buf) { void ha_tokudb::track_progress(THD* thd) { - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (trx) { ulonglong num_written = trx->stmt_progress.inserted + trx->stmt_progress.updated + trx->stmt_progress.deleted; bool update_status = @@ -6205,12 +6195,11 @@ int ha_tokudb::external_lock(THD * thd, int lock_type) { } int error = 0; - tokudb_trx_data *trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (!trx) { error = create_tokudb_trx_data_instance(&trx); if (error) { goto cleanup; } - thd_data_set(thd, tokudb_hton->slot, trx); + thd_set_ha_data(thd, tokudb_hton, trx); } if (trx->all == NULL) { trx->sp_level = NULL; @@ -6284,7 +6273,7 @@ int ha_tokudb::start_stmt(THD * thd, thr_lock_type lock_type) { TOKUDB_HANDLER_TRACE("q %s", thd->query()); int error = 0; - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); DBUG_ASSERT(trx); /* @@ -6898,7 +6887,7 @@ int ha_tokudb::create(const char *name, TABLE * form, HA_CREATE_INFO * create_in newname = (char *)tokudb_my_malloc(get_max_dict_name_path_length(name),MYF(MY_WME)); if (newname == NULL){ error = ENOMEM; goto cleanup;} - trx = (tokudb_trx_data *) thd_data_get(ha_thd(), tokudb_hton->slot); + trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton); if (trx && trx->sub_sp_level && thd_sql_command(thd) == SQLCOM_CREATE_TABLE) { txn = trx->sub_sp_level; } @@ -7088,7 +7077,7 @@ int 
ha_tokudb::delete_or_rename_table (const char* from_name, const char* to_nam DB_TXN *parent_txn = NULL; tokudb_trx_data *trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (thd_sql_command(ha_thd()) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) { parent_txn = trx->sub_sp_level; } @@ -8234,12 +8223,12 @@ void ha_tokudb::cleanup_txn(DB_TXN *txn) { } void ha_tokudb::add_to_trx_handler_list() { - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(ha_thd(), tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton); trx->handlers = list_add(trx->handlers, &trx_handler_list); } void ha_tokudb::remove_from_trx_handler_list() { - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(ha_thd(), tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton); trx->handlers = list_delete(trx->handlers, &trx_handler_list); } diff --git a/storage/tokudb/ha_tokudb_alter_56.cc b/storage/tokudb/ha_tokudb_alter_56.cc index e0e1e7deee4..dbfce8764bc 100644 --- a/storage/tokudb/ha_tokudb_alter_56.cc +++ b/storage/tokudb/ha_tokudb_alter_56.cc @@ -752,7 +752,7 @@ bool ha_tokudb::commit_inplace_alter_table(TABLE *altered_table, Alter_inplace_i if (!commit) { // abort the alter transaction NOW so that any alters are rolled back. this allows the following restores to work. - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); assert(ctx->alter_txn == trx->stmt); assert(trx->tokudb_lock_count > 0); // for partitioned tables, we use a single transaction to do all of the partition changes. 
the tokudb_lock_count diff --git a/storage/tokudb/ha_tokudb_alter_common.cc b/storage/tokudb/ha_tokudb_alter_common.cc index ecef0fb7415..414e8280daf 100644 --- a/storage/tokudb/ha_tokudb_alter_common.cc +++ b/storage/tokudb/ha_tokudb_alter_common.cc @@ -814,7 +814,7 @@ int ha_tokudb::write_frm_data(const uchar *frm_data, size_t frm_len) { if (TOKU_PARTITION_WRITE_FRM_DATA || table->part_info == NULL) { // write frmdata to status THD *thd = ha_thd(); - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); assert(trx); DB_TXN *txn = trx->stmt; // use alter table transaction assert(txn); diff --git a/storage/tokudb/hatoku_hton.cc b/storage/tokudb/hatoku_hton.cc index 28394f2502c..bf7319b1203 100644 --- a/storage/tokudb/hatoku_hton.cc +++ b/storage/tokudb/hatoku_hton.cc @@ -624,8 +624,7 @@ int tokudb_end(handlerton * hton, ha_panic_function type) { static int tokudb_close_connection(handlerton * hton, THD * thd) { int error = 0; - tokudb_trx_data* trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (trx && trx->checkpoint_lock_taken) { error = db_env->checkpointing_resume(db_env); } @@ -723,7 +722,7 @@ static int tokudb_commit(handlerton * hton, THD * thd, bool all) { TOKUDB_DBUG_ENTER(""); DBUG_PRINT("trans", ("ending transaction %s", all ? "all" : "stmt")); uint32_t syncflag = THDVAR(thd, commit_sync) ? 0 : DB_TXN_NOSYNC; - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, hton); DB_TXN **txn = all ? 
&trx->all : &trx->stmt; DB_TXN *this_txn = *txn; if (this_txn) { @@ -752,7 +751,7 @@ static int tokudb_commit(handlerton * hton, THD * thd, bool all) { static int tokudb_rollback(handlerton * hton, THD * thd, bool all) { TOKUDB_DBUG_ENTER(""); DBUG_PRINT("trans", ("aborting transaction %s", all ? "all" : "stmt")); - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, hton); DB_TXN **txn = all ? &trx->all : &trx->stmt; DB_TXN *this_txn = *txn; if (this_txn) { @@ -782,7 +781,7 @@ static int tokudb_xa_prepare(handlerton* hton, THD* thd, bool all) { TOKUDB_DBUG_ENTER(""); int r = 0; DBUG_PRINT("trans", ("preparing transaction %s", all ? "all" : "stmt")); - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, hton); DB_TXN* txn = all ? trx->all : trx->stmt; if (txn) { if (tokudb_debug & TOKUDB_DEBUG_TXN) { @@ -861,7 +860,7 @@ static int tokudb_savepoint(handlerton * hton, THD * thd, void *savepoint) { TOKUDB_DBUG_ENTER(""); int error; SP_INFO save_info = (SP_INFO)savepoint; - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, hton); if (thd->in_sub_stmt) { assert(trx->stmt); error = txn_begin(db_env, trx->sub_sp_level, &(save_info->txn), DB_INHERIT_ISOLATION, thd); @@ -892,7 +891,7 @@ static int tokudb_rollback_to_savepoint(handlerton * hton, THD * thd, void *save DB_TXN* parent = NULL; DB_TXN* txn_to_rollback = save_info->txn; - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, hton); parent = txn_to_rollback->parent; if (!(error = txn_to_rollback->abort(txn_to_rollback))) { if (save_info->in_sub_stmt) { @@ -914,7 +913,7 @@ static int tokudb_release_savepoint(handlerton * hton, THD * thd, void *savepoin DB_TXN* parent = 
NULL; DB_TXN* txn_to_commit = save_info->txn; - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, hton); parent = txn_to_commit->parent; if (!(error = txn_to_commit->commit(txn_to_commit, 0))) { if (save_info->in_sub_stmt) { @@ -974,7 +973,7 @@ static int tokudb_discover3(handlerton *hton, THD* thd, const char *db, const ch bool do_commit; #if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099 - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) { do_commit = false; txn = trx->sub_sp_level; @@ -1129,15 +1128,14 @@ static bool tokudb_show_engine_status(THD * thd, stat_print_fn * stat_print) { static void tokudb_checkpoint_lock(THD * thd) { int error; const char *old_proc_info; - tokudb_trx_data* trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (!trx) { error = create_tokudb_trx_data_instance(&trx); // // can only fail due to memory allocation, so ok to assert // assert(!error); - thd_data_set(thd, tokudb_hton->slot, trx); + thd_set_ha_data(thd, tokudb_hton, trx); } if (trx->checkpoint_lock_taken) { @@ -1161,8 +1159,7 @@ cleanup: static void tokudb_checkpoint_unlock(THD * thd) { int error; const char *old_proc_info; - tokudb_trx_data* trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (!trx) { error = 0; goto cleanup; From 09a284742a69f214e118131bc8bccaa616141cf2 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Thu, 8 May 2014 15:03:10 -0400 Subject: [PATCH 03/46] #230 disable the tokudb bulk loader in the tokudb locks schema tests --- 
.../suite/tokudb/r/i_s_tokudb_lock_waits_released.result | 2 ++ .../suite/tokudb/r/i_s_tokudb_lock_waits_timeout.result | 1 + mysql-test/suite/tokudb/r/i_s_tokudb_locks.result | 2 +- mysql-test/suite/tokudb/r/i_s_tokudb_locks_released.result | 1 + .../suite/tokudb/t/i_s_tokudb_lock_waits_released.test | 2 ++ mysql-test/suite/tokudb/t/i_s_tokudb_lock_waits_timeout.test | 1 + mysql-test/suite/tokudb/t/i_s_tokudb_locks.test | 2 +- mysql-test/suite/tokudb/t/i_s_tokudb_locks_released.test | 5 +---- 8 files changed, 10 insertions(+), 6 deletions(-) diff --git a/mysql-test/suite/tokudb/r/i_s_tokudb_lock_waits_released.result b/mysql-test/suite/tokudb/r/i_s_tokudb_lock_waits_released.result index f84be01163f..db63d23e382 100644 --- a/mysql-test/suite/tokudb/r/i_s_tokudb_lock_waits_released.result +++ b/mysql-test/suite/tokudb/r/i_s_tokudb_lock_waits_released.result @@ -9,6 +9,7 @@ locks_trx_id locks_mysql_thread_id locks_dname locks_key_left locks_key_right select * from information_schema.tokudb_lock_waits; requesting_trx_id blocking_trx_id lock_waits_dname lock_waits_key_left lock_waits_key_right lock_waits_start_time set autocommit=0; +set tokudb_prelock_empty=OFF; insert into t values (1); set autocommit=0; insert into t values (1); @@ -38,6 +39,7 @@ locks_trx_id locks_mysql_thread_id locks_dname locks_key_left locks_key_right select * from information_schema.tokudb_lock_waits; requesting_trx_id blocking_trx_id lock_waits_dname lock_waits_key_left lock_waits_key_right lock_waits_start_time set autocommit=0; +set tokudb_prelock_empty=OFF; replace into t values (1); set autocommit=0; replace into t values (1); diff --git a/mysql-test/suite/tokudb/r/i_s_tokudb_lock_waits_timeout.result b/mysql-test/suite/tokudb/r/i_s_tokudb_lock_waits_timeout.result index 1e0668164ff..10e3830506d 100644 --- a/mysql-test/suite/tokudb/r/i_s_tokudb_lock_waits_timeout.result +++ b/mysql-test/suite/tokudb/r/i_s_tokudb_lock_waits_timeout.result @@ -9,6 +9,7 @@ locks_trx_id locks_mysql_thread_id 
locks_dname locks_key_left locks_key_right select * from information_schema.tokudb_lock_waits; requesting_trx_id blocking_trx_id lock_waits_dname lock_waits_key_left lock_waits_key_right lock_waits_start_time set autocommit=0; +set tokudb_prelock_empty=OFF; insert into t values (1); set autocommit=0; insert into t values (1); diff --git a/mysql-test/suite/tokudb/r/i_s_tokudb_locks.result b/mysql-test/suite/tokudb/r/i_s_tokudb_locks.result index ad252da448f..9fce0695983 100644 --- a/mysql-test/suite/tokudb/r/i_s_tokudb_locks.result +++ b/mysql-test/suite/tokudb/r/i_s_tokudb_locks.result @@ -12,7 +12,7 @@ set autocommit=0; insert into t values (2); insert into t values (4); insert into t values (6); -select * from information_schema.tokudb_locks order by locks_trx_id; +select * from information_schema.tokudb_locks order by locks_trx_id,locks_key_left; locks_trx_id locks_mysql_thread_id locks_dname locks_key_left locks_key_right TRX_ID MYSQL_ID ./test/t-main 0001000000 0001000000 TRX_ID MYSQL_ID ./test/t-main 0003000000 0003000000 diff --git a/mysql-test/suite/tokudb/r/i_s_tokudb_locks_released.result b/mysql-test/suite/tokudb/r/i_s_tokudb_locks_released.result index 21a6b5d308c..628ff46ffc4 100644 --- a/mysql-test/suite/tokudb/r/i_s_tokudb_locks_released.result +++ b/mysql-test/suite/tokudb/r/i_s_tokudb_locks_released.result @@ -6,6 +6,7 @@ set autocommit=0; select * from information_schema.tokudb_locks; locks_trx_id locks_mysql_thread_id locks_dname locks_key_left locks_key_right set autocommit=0; +set tokudb_prelock_empty=OFF; insert into t values (1); set autocommit=0; insert into t values (1); diff --git a/mysql-test/suite/tokudb/t/i_s_tokudb_lock_waits_released.test b/mysql-test/suite/tokudb/t/i_s_tokudb_lock_waits_released.test index 25a62c08a14..f259c5fe6bc 100644 --- a/mysql-test/suite/tokudb/t/i_s_tokudb_lock_waits_released.test +++ b/mysql-test/suite/tokudb/t/i_s_tokudb_lock_waits_released.test @@ -19,6 +19,7 @@ select * from 
information_schema.tokudb_lock_waits; connect (conn_a,localhost,root,,); set autocommit=0; +set tokudb_prelock_empty=OFF; # disable the bulk loader insert into t values (1); connect (conn_b,localhost,root,,); @@ -68,6 +69,7 @@ select * from information_schema.tokudb_lock_waits; connect (conn_a,localhost,root,,); set autocommit=0; +set tokudb_prelock_empty=OFF; # disable the bulk loader replace into t values (1); connect (conn_b,localhost,root,,); diff --git a/mysql-test/suite/tokudb/t/i_s_tokudb_lock_waits_timeout.test b/mysql-test/suite/tokudb/t/i_s_tokudb_lock_waits_timeout.test index ea7eb9a2c89..d7925733a0f 100644 --- a/mysql-test/suite/tokudb/t/i_s_tokudb_lock_waits_timeout.test +++ b/mysql-test/suite/tokudb/t/i_s_tokudb_lock_waits_timeout.test @@ -16,6 +16,7 @@ select * from information_schema.tokudb_lock_waits; connect (conn_a,localhost,root,,); set autocommit=0; +set tokudb_prelock_empty=OFF; insert into t values (1); connect (conn_b,localhost,root,,); diff --git a/mysql-test/suite/tokudb/t/i_s_tokudb_locks.test b/mysql-test/suite/tokudb/t/i_s_tokudb_locks.test index a3745b5471b..e5a67559b1a 100644 --- a/mysql-test/suite/tokudb/t/i_s_tokudb_locks.test +++ b/mysql-test/suite/tokudb/t/i_s_tokudb_locks.test @@ -29,7 +29,7 @@ insert into t values (6); # should find 3 locks for 2 transactions connection default; replace_column 1 TRX_ID 2 MYSQL_ID; -eval select * from information_schema.tokudb_locks order by locks_trx_id; +eval select * from information_schema.tokudb_locks order by locks_trx_id,locks_key_left; connection conn_a; commit; diff --git a/mysql-test/suite/tokudb/t/i_s_tokudb_locks_released.test b/mysql-test/suite/tokudb/t/i_s_tokudb_locks_released.test index be475d93c2c..4d654244682 100644 --- a/mysql-test/suite/tokudb/t/i_s_tokudb_locks_released.test +++ b/mysql-test/suite/tokudb/t/i_s_tokudb_locks_released.test @@ -14,20 +14,17 @@ let $default_id=`select connection_id()`; # should be empty select * from information_schema.tokudb_locks; - connect 
(conn_a,localhost,root,,); set autocommit=0; -let $a_id=`select connection_id()`; +set tokudb_prelock_empty=OFF; # disable bulk loader insert into t values (1); connect (conn_b,localhost,root,,); set autocommit=0; -let $b_id=`select connection_id()`; send insert into t values (1); - # should find the presence of a lock on 2nd transaction connection default; let $wait_condition= select count(*)=1 from information_schema.processlist where info='insert into t values (1)' and state='update'; From fb9681b9572941dc5cc545f7922ef68909a86909 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Thu, 8 May 2014 17:39:29 -0400 Subject: [PATCH 04/46] #231 change lock_uniq_key_empty test to work without the bulk insert avoidance patch --- mysql-test/suite/tokudb.bugs/r/lock_uniq_key_empty.result | 2 ++ mysql-test/suite/tokudb.bugs/t/lock_uniq_key_empty.test | 2 ++ 2 files changed, 4 insertions(+) diff --git a/mysql-test/suite/tokudb.bugs/r/lock_uniq_key_empty.result b/mysql-test/suite/tokudb.bugs/r/lock_uniq_key_empty.result index 6966aa24ff8..325aef46afe 100644 --- a/mysql-test/suite/tokudb.bugs/r/lock_uniq_key_empty.result +++ b/mysql-test/suite/tokudb.bugs/r/lock_uniq_key_empty.result @@ -1,6 +1,7 @@ set default_storage_engine=tokudb; drop table if exists t; create table t (id int, unique key(id)); +set tokudb_prelock_empty=OFF; begin; insert into t values (1); begin; @@ -13,6 +14,7 @@ id 2 drop table if exists t; create table t (id int not null, unique key(id)); +set tokudb_prelock_empty=OFF; begin; insert into t values (1); begin; diff --git a/mysql-test/suite/tokudb.bugs/t/lock_uniq_key_empty.test b/mysql-test/suite/tokudb.bugs/t/lock_uniq_key_empty.test index 3f8d7113dff..0a001c2736d 100644 --- a/mysql-test/suite/tokudb.bugs/t/lock_uniq_key_empty.test +++ b/mysql-test/suite/tokudb.bugs/t/lock_uniq_key_empty.test @@ -7,6 +7,7 @@ enable_warnings; create table t (id int, unique key(id)); connect(c1,localhost,root,,); +set tokudb_prelock_empty=OFF; # disable the tokudb bulk 
loader begin; insert into t values (1); connect(c2,localhost,root,,); @@ -24,6 +25,7 @@ drop table if exists t; create table t (id int not null, unique key(id)); connect(c1,localhost,root,,); +set tokudb_prelock_empty=OFF; # disable the tokudb bulk loader begin; insert into t values (1); connect(c2,localhost,root,,); From 39b08e022202a8a8d7069dedafcb3e8c7754bf49 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Sat, 10 May 2014 15:53:31 -0400 Subject: [PATCH 05/46] #232 compile in jemalloc detector --- storage/tokudb/hatoku_defines.h | 4 ++++ storage/tokudb/hatoku_hton.cc | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/storage/tokudb/hatoku_defines.h b/storage/tokudb/hatoku_defines.h index cf983b5b8cb..444ae425b2d 100644 --- a/storage/tokudb/hatoku_defines.h +++ b/storage/tokudb/hatoku_defines.h @@ -96,6 +96,10 @@ PATENT RIGHTS GRANT: #pragma interface /* gcc class implementation */ #endif +#if !defined(TOKUDB_CHECK_JEMALLOC) +#define TOKUDB_CHECK_JEMALLOC 1 +#endif + #if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099 // mariadb 10.0 #define TOKU_USE_DB_TYPE_TOKUDB 1 diff --git a/storage/tokudb/hatoku_hton.cc b/storage/tokudb/hatoku_hton.cc index bf7319b1203..9a6eef92d6a 100644 --- a/storage/tokudb/hatoku_hton.cc +++ b/storage/tokudb/hatoku_hton.cc @@ -576,9 +576,6 @@ static int tokudb_done_func(void *p) { toku_global_status_rows = NULL; my_hash_free(&tokudb_open_tables); tokudb_pthread_mutex_destroy(&tokudb_mutex); -#if defined(_WIN64) - toku_ydb_destroy(); -#endif TOKUDB_DBUG_RETURN(0); } From f39c22282aa116a6bc6eae5a0f50b32078a15e01 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Tue, 13 May 2014 08:56:06 -0400 Subject: [PATCH 06/46] #221 fix tokudb::estimate_num_rows --- storage/tokudb/ha_tokudb.cc | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/storage/tokudb/ha_tokudb.cc b/storage/tokudb/ha_tokudb.cc index 3f44c46afe5..87b54256965 100644 --- a/storage/tokudb/ha_tokudb.cc +++ 
b/storage/tokudb/ha_tokudb.cc @@ -1706,7 +1706,7 @@ int ha_tokudb::initialize_share( } share->ref_length = ref_length; - error = estimate_num_rows(share->file,&num_rows, txn); + error = estimate_num_rows(share->file, &num_rows, txn); // // estimate_num_rows should not fail under normal conditions // @@ -1916,7 +1916,6 @@ exit: // int ha_tokudb::estimate_num_rows(DB* db, uint64_t* num_rows, DB_TXN* txn) { int error = ENOSYS; - DBC* crsr = NULL; bool do_commit = false; DB_BTREE_STAT64 dict_stats; DB_TXN* txn_to_use = NULL; @@ -1930,21 +1929,12 @@ int ha_tokudb::estimate_num_rows(DB* db, uint64_t* num_rows, DB_TXN* txn) { txn_to_use = txn; } - error = db->stat64( - share->file, - txn_to_use, - &dict_stats - ); + error = db->stat64(db, txn_to_use, &dict_stats); if (error) { goto cleanup; } *num_rows = dict_stats.bt_ndata; error = 0; cleanup: - if (crsr != NULL) { - int r = crsr->c_close(crsr); - assert(r==0); - crsr = NULL; - } if (do_commit) { commit_txn(txn_to_use, 0); txn_to_use = NULL; From 6f5239cd294dcda8cdb5dd7f20365e63d61ee33f Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Tue, 13 May 2014 16:05:00 -0400 Subject: [PATCH 07/46] #237 fix various bulk loader bugs related to nproc ulimit exceeded --- ft/ftloader-internal.h | 4 +- ft/ftloader.cc | 159 ++++++++------- ft/ftloader.h | 3 +- ft/tests/ftloader-test-bad-generate.cc | 2 +- ft/tests/ftloader-test-extractor-errors.cc | 2 +- ft/tests/ftloader-test-extractor.cc | 2 +- ft/tests/ftloader-test-merge-files-dbufio.cc | 7 +- ft/tests/ftloader-test-open.cc | 2 +- src/loader.cc | 31 ++- src/tests/loader-nproc-close.cc | 201 ++++++++++++++++++ src/tests/loader-nproc-create.cc | 202 +++++++++++++++++++ 11 files changed, 522 insertions(+), 93 deletions(-) create mode 100644 src/tests/loader-nproc-close.cc create mode 100644 src/tests/loader-nproc-create.cc diff --git a/ft/ftloader-internal.h b/ft/ftloader-internal.h index be1ded59890..d60537490dd 100644 --- a/ft/ftloader-internal.h +++ b/ft/ftloader-internal.h @@ 
-245,6 +245,7 @@ struct ft_loader_s { CACHETABLE cachetable; bool did_reserve_memory; bool compress_intermediates; + bool allow_puts; uint64_t reserved_memory; // how much memory are we allowed to use? /* To make it easier to recover from errors, we don't use FILE*, instead we use an index into the file_infos. */ @@ -346,7 +347,8 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp, TOKUTXN txn, bool reserve_memory, uint64_t reserve_memory_size, - bool compress_intermediates); + bool compress_intermediates, + bool allow_puts); void toku_ft_loader_internal_destroy (FTLOADER bl, bool is_error); diff --git a/ft/ftloader.cc b/ft/ftloader.cc index 2df6d0a1cda..7214ee10039 100644 --- a/ft/ftloader.cc +++ b/ft/ftloader.cc @@ -420,6 +420,10 @@ void toku_ft_loader_internal_destroy (FTLOADER bl, bool is_error) { } destroy_rowset(&bl->primary_rowset); + if (bl->primary_rowset_queue) { + queue_destroy(bl->primary_rowset_queue); + bl->primary_rowset_queue = nullptr; + } for (int i=0; iN; i++) { if ( bl->fractal_queues ) { @@ -543,7 +547,8 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp, TOKUTXN txn, bool reserve_memory, uint64_t reserve_memory_size, - bool compress_intermediates) + bool compress_intermediates, + bool allow_puts) // Effect: Allocate and initialize a FTLOADER, but do not create the extractor thread. { FTLOADER CALLOC(bl); // initialized to all zeros (hence CALLOC) @@ -560,10 +565,7 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp, bl->reserved_memory = 512*1024*1024; // if no cache table use 512MB. 
} bl->compress_intermediates = compress_intermediates; - if (0) { // debug - fprintf(stderr, "%s Reserved memory=%" PRId64 "\n", __FUNCTION__, bl->reserved_memory); - } - + bl->allow_puts = allow_puts; bl->src_db = src_db; bl->N = N; bl->load_lsn = load_lsn; @@ -628,7 +630,6 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp, { int r = queue_create(&bl->primary_rowset_queue, EXTRACTOR_QUEUE_DEPTH); if (r!=0) { toku_ft_loader_internal_destroy(bl, true); return r; } } - //printf("%s:%d toku_pthread_create\n", __FILE__, __LINE__); { ft_loader_lock_init(bl); } @@ -650,34 +651,38 @@ int toku_ft_loader_open (/* out */ FTLOADER *blp, TOKUTXN txn, bool reserve_memory, uint64_t reserve_memory_size, - bool compress_intermediates) -/* Effect: called by DB_ENV->create_loader to create an ft loader. - * Arguments: - * blp Return the ft loader here. - * g The function for generating a row - * src_db The source database. Needed by g. May be NULL if that's ok with g. - * N The number of dbs to create. - * dbs An array of open databases. Used by g. The data will be put in these database. - * new_fnames The file names (these strings are owned by the caller: we make a copy for our own purposes). - * temp_file_template A template suitable for mkstemp() - * Return value: 0 on success, an error number otherwise. - */ -{ + bool compress_intermediates, + bool allow_puts) { +// Effect: called by DB_ENV->create_loader to create a brt loader. +// Arguments: +// blp Return the brt loader here. +// g The function for generating a row +// src_db The source database. Needed by g. May be NULL if that's ok with g. +// N The number of dbs to create. +// dbs An array of open databases. Used by g. The data will be put in these database. +// new_fnames The file names (these strings are owned by the caller: we make a copy for our own purposes). +// temp_file_template A template suitable for mkstemp() +// reserve_memory Cause the loader to reserve memory for its use from the cache table. 
+// compress_intermediates Cause the loader to compress intermediate loader files. +// allow_puts Prepare the loader for rows to insert. When puts are disabled, the loader does not run the +// extractor or the fractal tree writer threads. +// Return value: 0 on success, an error number otherwise. int result = 0; { int r = toku_ft_loader_internal_init(blp, cachetable, g, src_db, - N, fts, dbs, - new_fnames_in_env, - bt_compare_functions, - temp_file_template, - load_lsn, - txn, - reserve_memory, - reserve_memory_size, - compress_intermediates); + N, fts, dbs, + new_fnames_in_env, + bt_compare_functions, + temp_file_template, + load_lsn, + txn, + reserve_memory, + reserve_memory_size, + compress_intermediates, + allow_puts); if (r!=0) result = r; } - if (result==0) { + if (result==0 && allow_puts) { FTLOADER bl = *blp; int r = toku_pthread_create(&bl->extractor_thread, NULL, extractor_thread, (void*)bl); if (r==0) { @@ -1213,6 +1218,7 @@ finish_extractor (FTLOADER bl) { { int r = queue_destroy(bl->primary_rowset_queue); invariant(r==0); + bl->primary_rowset_queue = nullptr; } rval = ft_loader_fi_close_all(&bl->file_infos); @@ -1374,10 +1380,9 @@ int toku_ft_loader_put (FTLOADER bl, DBT *key, DBT *val) * Return value: 0 on success, an error number otherwise. */ { - if (ft_loader_get_error(&bl->error_callback)) + if (!bl->allow_puts || ft_loader_get_error(&bl->error_callback)) return EINVAL; // previous panic bl->n_rows++; -// return loader_write_row(key, val, bl->fprimary_rows, &bl->fprimary_offset, bl); return loader_do_put(bl, key, val); } @@ -2714,12 +2719,7 @@ static int loader_do_i (FTLOADER bl, struct rowset *rows = &(bl->rows[which_db]); invariant(rows->data==NULL); // the rows should be all cleaned up already - // a better allocation would be to figure out roughly how many merge passes we'll need. 
- int allocation_for_merge = (2*progress_allocation)/3; - progress_allocation -= allocation_for_merge; - - int r; - r = queue_create(&bl->fractal_queues[which_db], FRACTAL_WRITER_QUEUE_DEPTH); + int r = queue_create(&bl->fractal_queues[which_db], FRACTAL_WRITER_QUEUE_DEPTH); if (r) goto error; { @@ -2740,49 +2740,62 @@ static int loader_do_i (FTLOADER bl, r = dest_db->get_fanout(dest_db, &target_fanout); invariant_zero(r); - // This structure must stay live until the join below. - struct fractal_thread_args fta = { bl, - descriptor, - fd, - progress_allocation, - bl->fractal_queues[which_db], - bl->extracted_datasizes[which_db], - 0, - which_db, - target_nodesize, - target_basementnodesize, - target_compression_method, - target_fanout - }; + if (bl->allow_puts) { + // a better allocation would be to figure out roughly how many merge passes we'll need. + int allocation_for_merge = (2*progress_allocation)/3; + progress_allocation -= allocation_for_merge; + + // This structure must stay live until the join below. 
+ struct fractal_thread_args fta = { + bl, + descriptor, + fd, + progress_allocation, + bl->fractal_queues[which_db], + bl->extracted_datasizes[which_db], + 0, + which_db, + target_nodesize, + target_basementnodesize, + target_compression_method, + target_fanout + }; - r = toku_pthread_create(bl->fractal_threads+which_db, NULL, fractal_thread, (void*)&fta); - if (r) { - int r2 __attribute__((__unused__)) = queue_destroy(bl->fractal_queues[which_db]); - // ignore r2, since we already have an error - goto error; - } - invariant(bl->fractal_threads_live[which_db]==false); - bl->fractal_threads_live[which_db] = true; + r = toku_pthread_create(bl->fractal_threads+which_db, NULL, fractal_thread, (void*)&fta); + if (r) { + int r2 __attribute__((__unused__)) = queue_destroy(bl->fractal_queues[which_db]); + // ignore r2, since we already have an error + bl->fractal_queues[which_db] = nullptr; + goto error; + } + invariant(bl->fractal_threads_live[which_db]==false); + bl->fractal_threads_live[which_db] = true; - r = merge_files(fs, bl, which_db, dest_db, compare, allocation_for_merge, bl->fractal_queues[which_db]); + r = merge_files(fs, bl, which_db, dest_db, compare, allocation_for_merge, bl->fractal_queues[which_db]); - { - void *toku_pthread_retval; - int r2 = toku_pthread_join(bl->fractal_threads[which_db], &toku_pthread_retval); - invariant(fta.bl==bl); // this is a gratuitous assertion to make sure that the fta struct is still live here. A previous bug but that struct into a C block statement. - resource_assert_zero(r2); - invariant(toku_pthread_retval==NULL); - invariant(bl->fractal_threads_live[which_db]); - bl->fractal_threads_live[which_db] = false; - if (r == 0) r = fta.errno_result; + { + void *toku_pthread_retval; + int r2 = toku_pthread_join(bl->fractal_threads[which_db], &toku_pthread_retval); + invariant(fta.bl==bl); // this is a gratuitous assertion to make sure that the fta struct is still live here. A previous bug put that struct into a C block statement. 
+ resource_assert_zero(r2); + invariant(toku_pthread_retval==NULL); + invariant(bl->fractal_threads_live[which_db]); + bl->fractal_threads_live[which_db] = false; + if (r == 0) r = fta.errno_result; + } + } else { + queue_eof(bl->fractal_queues[which_db]); + r = toku_loader_write_ft_from_q(bl, descriptor, fd, progress_allocation, + bl->fractal_queues[which_db], bl->extracted_datasizes[which_db], which_db, + target_nodesize, target_basementnodesize, target_compression_method, target_fanout); } } error: // this is the cleanup code. Even if r==0 (no error) we fall through to here. - { + if (bl->fractal_queues[which_db]) { int r2 = queue_destroy(bl->fractal_queues[which_db]); invariant(r2==0); - bl->fractal_queues[which_db]=NULL; + bl->fractal_queues[which_db] = nullptr; } // if we get here we need to free up the merge_fileset and the rowset, as well as the keys @@ -2851,6 +2864,10 @@ int toku_ft_loader_close (FTLOADER bl, if (r) result = r; invariant(!bl->extractor_live); + } else { + r = finish_primary_rows(bl); + if (r) + result = r; } // check for an error during extraction diff --git a/ft/ftloader.h b/ft/ftloader.h index c3376c90e91..c920b4c5362 100644 --- a/ft/ftloader.h +++ b/ft/ftloader.h @@ -113,7 +113,8 @@ int toku_ft_loader_open (FTLOADER *bl, TOKUTXN txn, bool reserve_memory, uint64_t reserve_memory_size, - bool compress_intermediates); + bool compress_intermediates, + bool allow_puts); int toku_ft_loader_put (FTLOADER bl, DBT *key, DBT *val); diff --git a/ft/tests/ftloader-test-bad-generate.cc b/ft/tests/ftloader-test-bad-generate.cc index 1ecae89da78..9ae24f7c4ec 100644 --- a/ft/tests/ftloader-test-bad-generate.cc +++ b/ft/tests/ftloader-test-bad-generate.cc @@ -170,7 +170,7 @@ static void test_extractor(int nrows, int nrowsets, bool expect_fail) { } FTLOADER loader; - r = toku_ft_loader_open(&loader, NULL, generate, NULL, N, fts, dbs, fnames, compares, "tempXXXXXX", ZERO_LSN, nullptr, true, 0, false); + r = toku_ft_loader_open(&loader, NULL, generate, 
NULL, N, fts, dbs, fnames, compares, "tempXXXXXX", ZERO_LSN, nullptr, true, 0, false, true); assert(r == 0); struct rowset *rowset[nrowsets]; diff --git a/ft/tests/ftloader-test-extractor-errors.cc b/ft/tests/ftloader-test-extractor-errors.cc index 4dcd7fb2f8c..007fd39fe08 100644 --- a/ft/tests/ftloader-test-extractor-errors.cc +++ b/ft/tests/ftloader-test-extractor-errors.cc @@ -180,7 +180,7 @@ static void test_extractor(int nrows, int nrowsets, bool expect_fail, const char sprintf(temp, "%s/%s", testdir, "tempXXXXXX"); FTLOADER loader; - r = toku_ft_loader_open(&loader, NULL, generate, NULL, N, fts, dbs, fnames, compares, "tempXXXXXX", ZERO_LSN, nullptr, true, 0, false); + r = toku_ft_loader_open(&loader, NULL, generate, NULL, N, fts, dbs, fnames, compares, "tempXXXXXX", ZERO_LSN, nullptr, true, 0, false, true); assert(r == 0); struct rowset *rowset[nrowsets]; diff --git a/ft/tests/ftloader-test-extractor.cc b/ft/tests/ftloader-test-extractor.cc index 0a8ce157269..afba44a7a22 100644 --- a/ft/tests/ftloader-test-extractor.cc +++ b/ft/tests/ftloader-test-extractor.cc @@ -402,7 +402,7 @@ static void test_extractor(int nrows, int nrowsets, const char *testdir) { sprintf(temp, "%s/%s", testdir, "tempXXXXXX"); FTLOADER loader; - r = toku_ft_loader_open(&loader, NULL, generate, NULL, N, fts, dbs, fnames, compares, temp, ZERO_LSN, nullptr, true, 0, false); + r = toku_ft_loader_open(&loader, NULL, generate, NULL, N, fts, dbs, fnames, compares, temp, ZERO_LSN, nullptr, true, 0, false, true); assert(r == 0); struct rowset *rowset[nrowsets]; diff --git a/ft/tests/ftloader-test-merge-files-dbufio.cc b/ft/tests/ftloader-test-merge-files-dbufio.cc index 82583595470..cdd4c1d6691 100644 --- a/ft/tests/ftloader-test-merge-files-dbufio.cc +++ b/ft/tests/ftloader-test-merge-files-dbufio.cc @@ -412,7 +412,7 @@ static void test (const char *directory, bool is_error) { bt_compare_functions, "tempxxxxxx", *lsnp, - nullptr, true, 0, false); + nullptr, true, 0, false, true); assert(r==0); 
} @@ -500,11 +500,6 @@ static void test (const char *directory, bool is_error) { assert(cthunk.n_read == N_RECORDS); } } - //printf("%s:%d Destroying\n", __FILE__, __LINE__); - { - int r = queue_destroy(bl->primary_rowset_queue); - assert(r==0); - } { int r = queue_destroy(q); assert(r==0); diff --git a/ft/tests/ftloader-test-open.cc b/ft/tests/ftloader-test-open.cc index f2919f04d3d..cdf0a14ab00 100644 --- a/ft/tests/ftloader-test-open.cc +++ b/ft/tests/ftloader-test-open.cc @@ -143,7 +143,7 @@ static void test_loader_open(int ndbs) { for (i = 0; ; i++) { set_my_malloc_trigger(i+1); - r = toku_ft_loader_open(&loader, NULL, NULL, NULL, ndbs, fts, dbs, fnames, compares, "", ZERO_LSN, nullptr, true, 0, false); + r = toku_ft_loader_open(&loader, NULL, NULL, NULL, ndbs, fts, dbs, fnames, compares, "", ZERO_LSN, nullptr, true, 0, false, true); if (r == 0) break; } diff --git a/src/loader.cc b/src/loader.cc index 88db258e1ff..ecad190e03a 100644 --- a/src/loader.cc +++ b/src/loader.cc @@ -172,6 +172,13 @@ struct __toku_loader_internal { char **inames_in_env; /* [N] inames of new files to be created */ }; +static void free_inames(char **inames, int n) { + for (int i = 0; i < n; i++) { + toku_free(inames[i]); + } + toku_free(inames); +} + /* * free_loader_resources() frees all of the resources associated with * struct __toku_loader_internal @@ -185,16 +192,15 @@ static void free_loader_resources(DB_LOADER *loader) toku_destroy_dbt(&loader->i->err_val); if (loader->i->inames_in_env) { - for (int i=0; ii->N; i++) { - if (loader->i->inames_in_env[i]) toku_free(loader->i->inames_in_env[i]); - } - toku_free(loader->i->inames_in_env); + free_inames(loader->i->inames_in_env, loader->i->N); + loader->i->inames_in_env = nullptr; } - if (loader->i->temp_file_template) toku_free(loader->i->temp_file_template); + toku_free(loader->i->temp_file_template); + loader->i->temp_file_template = nullptr; // loader->i toku_free(loader->i); - loader->i = NULL; + loader->i = nullptr; } } @@ 
-306,6 +312,9 @@ toku_loader_create_loader(DB_ENV *env, // time to open the big kahuna char **XMALLOC_N(N, new_inames_in_env); + for (int i = 0; i < N; i++) { + new_inames_in_env[i] = nullptr; + } FT_HANDLE *XMALLOC_N(N, fts); for (int i=0; ii->ft_handle; @@ -313,7 +322,7 @@ toku_loader_create_loader(DB_ENV *env, LSN load_lsn; rval = locked_load_inames(env, txn, N, dbs, new_inames_in_env, &load_lsn, puts_allowed); if ( rval!=0 ) { - toku_free(new_inames_in_env); + free_inames(new_inames_in_env, N); toku_free(fts); goto create_exit; } @@ -331,12 +340,14 @@ toku_loader_create_loader(DB_ENV *env, ttxn, puts_allowed, env->get_loader_memory_size(env), - compress_intermediates); + compress_intermediates, + puts_allowed); if ( rval!=0 ) { - toku_free(new_inames_in_env); + free_inames(new_inames_in_env, N); toku_free(fts); goto create_exit; } + loader->i->inames_in_env = new_inames_in_env; toku_free(fts); @@ -441,7 +452,7 @@ static void redirect_loader_to_empty_dictionaries(DB_LOADER *loader) { loader->i->dbs, loader->i->db_flags, loader->i->dbt_flags, - 0, + LOADER_DISALLOW_PUTS, false ); lazy_assert_zero(r); diff --git a/src/tests/loader-nproc-close.cc b/src/tests/loader-nproc-close.cc new file mode 100644 index 00000000000..eee15faa1d3 --- /dev/null +++ b/src/tests/loader-nproc-close.cc @@ -0,0 +1,201 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). 
+ + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2013 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. 
+ + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved." 
+#ident "$Id$" + +#include "test.h" +#include +#include + +static int loader_flags = 0; +static const char *envdir = TOKU_TEST_FILENAME; + +static int put_multiple_generate(DB *UU(dest_db), DB *UU(src_db), DBT_ARRAY *UU(dest_keys), DBT_ARRAY *UU(dest_vals), const DBT *UU(src_key), const DBT *UU(src_val)) { + return ENOMEM; +} + +static void loader_open_close(int ndb) { + int r; + + char rmcmd[32 + strlen(envdir)]; + snprintf(rmcmd, sizeof rmcmd, "rm -rf %s", envdir); + r = system(rmcmd); CKERR(r); + r = toku_os_mkdir(envdir, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + + DB_ENV *env; + r = db_env_create(&env, 0); CKERR(r); + r = env->set_generate_row_callback_for_put(env, put_multiple_generate); + CKERR(r); + int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; + r = env->open(env, envdir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + + DB *dbs[ndb]; + uint32_t db_flags[ndb]; + uint32_t dbt_flags[ndb]; + for (int i = 0; i < ndb; i++) { + db_flags[i] = DB_NOOVERWRITE; + dbt_flags[i] = 0; + r = db_create(&dbs[i], env, 0); CKERR(r); + char name[32]; + sprintf(name, "db%d", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + + DB_TXN *txn; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + + DB_LOADER *loader; + r = env->create_loader(env, txn, &loader, ndb > 0 ? 
dbs[0] : NULL, ndb, dbs, db_flags, dbt_flags, loader_flags); CKERR(r); + + struct rlimit current_nproc_limit; + r = getrlimit(RLIMIT_NPROC, ¤t_nproc_limit); + assert(r == 0); + + struct rlimit new_nproc_limit = current_nproc_limit; + new_nproc_limit.rlim_cur = 0; + r = setrlimit(RLIMIT_NPROC, &new_nproc_limit); + assert(r == 0); + + r = loader->close(loader); + + if (loader_flags & LOADER_DISALLOW_PUTS) + CKERR(r); + else + CKERR2(r, EAGAIN); + + r = setrlimit(RLIMIT_NPROC, ¤t_nproc_limit); + assert(r == 0); + + r = txn->abort(txn); CKERR(r); + + for (int i = 0; i < ndb; i++) { + r = dbs[i]->close(dbs[i], 0); CKERR(r); + } + + r = env->close(env, 0); CKERR(r); +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + while (argc>0) { + if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: %s -h -v -q -p\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "-p") == 0) { + loader_flags |= LOADER_DISALLOW_PUTS; + } else if (strcmp(argv[0], "-z") == 0) { + loader_flags |= LOADER_COMPRESS_INTERMEDIATES; + } else if (strcmp(argv[0], "-e") == 0) { + argc--; argv++; + if (argc > 0) + envdir = argv[0]; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} + +int test_main(int argc, char * const *argv) { + do_args(argc, argv); + loader_open_close(1); + return 0; +} diff --git a/src/tests/loader-nproc-create.cc b/src/tests/loader-nproc-create.cc new file mode 100644 index 00000000000..09f57c9019c --- /dev/null +++ b/src/tests/loader-nproc-create.cc @@ -0,0 +1,202 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it 
and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). + + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2013 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. 
+ + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. + + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved." 
+#ident "$Id$" + +#include "test.h" +#include +#include + +static int loader_flags = 0; +static const char *envdir = TOKU_TEST_FILENAME; + +static int put_multiple_generate(DB *UU(dest_db), DB *UU(src_db), DBT_ARRAY *UU(dest_keys), DBT_ARRAY *UU(dest_vals), const DBT *UU(src_key), const DBT *UU(src_val)) { + return ENOMEM; +} + +static void loader_open_close(int ndb) { + int r; + + char rmcmd[32 + strlen(envdir)]; + snprintf(rmcmd, sizeof rmcmd, "rm -rf %s", envdir); + r = system(rmcmd); CKERR(r); + r = toku_os_mkdir(envdir, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + + DB_ENV *env; + r = db_env_create(&env, 0); CKERR(r); + r = env->set_generate_row_callback_for_put(env, put_multiple_generate); + CKERR(r); + int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; + r = env->open(env, envdir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + + DB *dbs[ndb]; + uint32_t db_flags[ndb]; + uint32_t dbt_flags[ndb]; + for (int i = 0; i < ndb; i++) { + db_flags[i] = DB_NOOVERWRITE; + dbt_flags[i] = 0; + r = db_create(&dbs[i], env, 0); CKERR(r); + char name[32]; + sprintf(name, "db%d", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + + DB_TXN *txn; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + + struct rlimit current_nproc_limit; + r = getrlimit(RLIMIT_NPROC, ¤t_nproc_limit); + assert(r == 0); + + struct rlimit new_nproc_limit = current_nproc_limit; + new_nproc_limit.rlim_cur = 0; + r = setrlimit(RLIMIT_NPROC, &new_nproc_limit); + assert(r == 0); + + DB_LOADER *loader; + int loader_r = env->create_loader(env, txn, &loader, ndb > 0 ? 
dbs[0] : NULL, ndb, dbs, db_flags, dbt_flags, loader_flags); + + r = setrlimit(RLIMIT_NPROC, ¤t_nproc_limit); + assert(r == 0); + + if (loader_flags & LOADER_DISALLOW_PUTS) { + CKERR(loader_r); + loader_r = loader->close(loader); + CKERR(loader_r); + } else { + CKERR2(loader_r, EAGAIN); + } + + r = txn->abort(txn); CKERR(r); + + for (int i = 0; i < ndb; i++) { + r = dbs[i]->close(dbs[i], 0); CKERR(r); + } + + r = env->close(env, 0); CKERR(r); +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + while (argc>0) { + if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: %s -h -v -q -p\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "-p") == 0) { + loader_flags |= LOADER_DISALLOW_PUTS; + } else if (strcmp(argv[0], "-z") == 0) { + loader_flags |= LOADER_COMPRESS_INTERMEDIATES; + } else if (strcmp(argv[0], "-e") == 0) { + argc--; argv++; + if (argc > 0) + envdir = argv[0]; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} + +int test_main(int argc, char * const *argv) { + do_args(argc, argv); + loader_open_close(1); + return 0; +} From 2006f3bf58e880e502f586418366186c442ca683 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Wed, 14 May 2014 14:43:44 -0400 Subject: [PATCH 08/46] #206 merge mariadb 10.0.11 changes --- storage/tokudb/ha_tokudb_alter_56.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/storage/tokudb/ha_tokudb_alter_56.cc b/storage/tokudb/ha_tokudb_alter_56.cc index dbfce8764bc..5289779bb32 100644 --- a/storage/tokudb/ha_tokudb_alter_56.cc +++ b/storage/tokudb/ha_tokudb_alter_56.cc @@ -219,6 +219,11 @@ static bool change_type_is_supported(TABLE *table, TABLE *altered_table, Alter_i static ulong fix_handler_flags(THD 
*thd, TABLE *table, TABLE *altered_table, Alter_inplace_info *ha_alter_info) { ulong handler_flags = ha_alter_info->handler_flags; +#if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099 + // This is automatically supported, hide the flag from later checks + handler_flags &= ~Alter_inplace_info::ALTER_PARTITIONED; +#endif + // workaround for fill_alter_inplace_info bug (#5193) // the function erroneously sets the ADD_INDEX and DROP_INDEX flags for a column addition that does not // change the keys. the following code turns the ADD_INDEX and DROP_INDEX flags so that we can do hot @@ -728,7 +733,8 @@ bool ha_tokudb::commit_inplace_alter_table(TABLE *altered_table, Alter_inplace_i if (commit) { #if (50613 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \ - (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799) + (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799) || \ + (100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099) if (ha_alter_info->group_commit_ctx) { ha_alter_info->group_commit_ctx = NULL; } From 4d2c3ffbb629c30fc09e26c13e9a37752e277f80 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Thu, 15 May 2014 08:33:30 -0400 Subject: [PATCH 09/46] #240 make the toku thread pool handle transient thread creation errors --- util/tests/threadpool-nproc-limit.cc | 171 +++++++++++++++++++++++++++ util/threadpool.cc | 25 ++-- 2 files changed, 186 insertions(+), 10 deletions(-) create mode 100644 util/tests/threadpool-nproc-limit.cc diff --git a/util/tests/threadpool-nproc-limit.cc b/util/tests/threadpool-nproc-limit.cc new file mode 100644 index 00000000000..f1ba10dad84 --- /dev/null +++ b/util/tests/threadpool-nproc-limit.cc @@ -0,0 +1,171 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + 
published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). + + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2013 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. 
+ + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. + + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved." + +// this test verifies that the toku thread pool is resilient when hitting the nproc limit. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +int verbose = 0; + +static int usage(void) { + fprintf(stderr, "[-q] [-v] [--verbose] (%d)\n", verbose); + return 1; +} + +static void *f(void *arg) { + return arg; +} + +static int dotest(int the_limit) { + if (verbose) + fprintf(stderr, "%s:%u %d\n", __FILE__, __LINE__, the_limit); + int r; + struct toku_thread_pool *pool = nullptr; + r = toku_thread_pool_create(&pool, 10); + assert(r == 0 && pool != nullptr); + + struct rlimit current_nproc_limit; + r = getrlimit(RLIMIT_NPROC, ¤t_nproc_limit); + assert(r == 0); + + struct rlimit new_nproc_limit = current_nproc_limit; + new_nproc_limit.rlim_cur = the_limit; + r = setrlimit(RLIMIT_NPROC, &new_nproc_limit); + assert(r == 0); + + int want_n = 20; + int got_n = want_n; + r = toku_thread_pool_run(pool, 0, &got_n, f, nullptr); + if (r == 0) + assert(want_n == got_n); + else { + assert(r == EWOULDBLOCK); + assert(got_n <= want_n); + } + + r = setrlimit(RLIMIT_NPROC, ¤t_nproc_limit); + assert(r == 0); + + if (verbose) + toku_thread_pool_print(pool, stderr); + toku_thread_pool_destroy(&pool); + return got_n > 0; +} + +int main(int argc, char *argv[]) { + // parse args + for (int i = 1; i < argc; i++) { + char *arg = argv[i]; + if (arg[0] != '-') + break; + if (strcmp(arg, "-v") == 0 || strcmp(arg, "--verbose") == 0) { + verbose = verbose+1; + continue; + } + if (strcmp(arg, "-q") == 0) { + verbose = verbose > 0 ? 
verbose-1 : 0; + continue; + } + return usage(); + } + // set increasing nproc limits until the test succeeds in hitting the limit after > 0 threads are created + for (int i = 0; 1; i++) { + if (dotest(i)) + break; + } + return 0; +} diff --git a/util/threadpool.cc b/util/threadpool.cc index d6652b7a71c..4f1105d83c2 100644 --- a/util/threadpool.cc +++ b/util/threadpool.cc @@ -132,13 +132,18 @@ static int toku_thread_create(struct toku_thread_pool *pool, struct toku_thread **toku_thread_return) { int r; struct toku_thread *MALLOC(thread); - if (thread == NULL) { + if (thread == nullptr) { r = get_error_errno(); } else { memset(thread, 0, sizeof *thread); thread->pool = pool; - toku_cond_init(&thread->wait, NULL); - r = toku_pthread_create(&thread->tid, NULL, toku_thread_run_internal, thread); resource_assert_zero(r); + toku_cond_init(&thread->wait, nullptr); + r = toku_pthread_create(&thread->tid, nullptr, toku_thread_run_internal, thread); + if (r) { + toku_cond_destroy(&thread->wait); + toku_free(thread); + thread = nullptr; + } *toku_thread_return = thread; } return r; @@ -192,7 +197,7 @@ toku_thread_run_internal(void *arg) { if (doexit) break; toku_thread_pool_lock(pool); - thread->f = NULL; + thread->f = nullptr; toku_list_push(&pool->free_threads, &thread->free_link); } return arg; @@ -202,13 +207,13 @@ int toku_thread_pool_create(struct toku_thread_pool **pool_return, int max_threads) { int r; struct toku_thread_pool *CALLOC(pool); - if (pool == NULL) { + if (pool == nullptr) { r = get_error_errno(); } else { - toku_mutex_init(&pool->lock, NULL); + toku_mutex_init(&pool->lock, nullptr); toku_list_init(&pool->free_threads); toku_list_init(&pool->all_threads); - toku_cond_init(&pool->wait_free, NULL); + toku_cond_init(&pool->wait_free, nullptr); pool->cur_threads = 0; pool->max_threads = max_threads; *pool_return = pool; @@ -230,7 +235,7 @@ toku_thread_pool_unlock(struct toku_thread_pool *pool) { void toku_thread_pool_destroy(struct toku_thread_pool **poolptr) 
{ struct toku_thread_pool *pool = *poolptr; - *poolptr = NULL; + *poolptr = nullptr; // ask the threads to exit toku_thread_pool_lock(pool); @@ -260,7 +265,7 @@ toku_thread_pool_destroy(struct toku_thread_pool **poolptr) { static int toku_thread_pool_add(struct toku_thread_pool *pool) { - struct toku_thread *thread = NULL; + struct toku_thread *thread = nullptr; int r = toku_thread_create(pool, &thread); if (r == 0) { pool->cur_threads += 1; @@ -294,7 +299,7 @@ toku_thread_pool_get_one(struct toku_thread_pool *pool, int dowait, struct toku_ struct toku_thread *thread = toku_list_struct(list, struct toku_thread, free_link); *toku_thread_return = thread; } else - *toku_thread_return = NULL; + *toku_thread_return = nullptr; toku_thread_pool_unlock(pool); return r; } From 1a17a12c10c872362a3e0e005830b9235081c5e3 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Thu, 15 May 2014 10:00:41 -0400 Subject: [PATCH 10/46] #239 fix dbremove crash when NOFILE limit is exceeded --- src/tests/dbremove-nofile-limit.cc | 177 +++++++++++++++++++++++++++++ src/ydb.cc | 8 +- 2 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 src/tests/dbremove-nofile-limit.cc diff --git a/src/tests/dbremove-nofile-limit.cc b/src/tests/dbremove-nofile-limit.cc new file mode 100644 index 00000000000..eb5c6b80b63 --- /dev/null +++ b/src/tests/dbremove-nofile-limit.cc @@ -0,0 +1,177 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and 
the PATENT RIGHTS + GRANT (below). + + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2013 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. 
+ + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. + + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved." +#ident "$Id$" + +// This test verifies that the env->dbremove function returns an error rather than +// crash when the NOFILE resource limit is exceeded. 
+ +#include "test.h" +#include +#include + +static const char *envdir = TOKU_TEST_FILENAME; + +static void test_dbremove() { + int r; + + char rmcmd[32 + strlen(envdir)]; + snprintf(rmcmd, sizeof rmcmd, "rm -rf %s", envdir); + r = system(rmcmd); CKERR(r); + r = toku_os_mkdir(envdir, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + + DB_ENV *env; + r = db_env_create(&env, 0); CKERR(r); + int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; + r = env->open(env, envdir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + + DB *db; + r = db_create(&db, env, 0); CKERR(r); + char fname[32]; + sprintf(fname, "db%d", 0); + r = db->open(db, nullptr, fname, nullptr, DB_BTREE, DB_CREATE, 0666); CKERR(r); + + r = db->close(db, 0); CKERR(r); + + DB_TXN *txn; + r = env->txn_begin(env, nullptr, &txn, 0); CKERR(r); + + struct rlimit current_limit; + r = getrlimit(RLIMIT_NOFILE, ¤t_limit); + assert(r == 0); + + struct rlimit new_limit = current_limit; + new_limit.rlim_cur = 0; + r = setrlimit(RLIMIT_NOFILE, &new_limit); + assert(r == 0); + + r = env->dbremove(env, txn, fname, nullptr, 0); + CKERR2(r, EMFILE); + + r = setrlimit(RLIMIT_NOFILE, ¤t_limit); + assert(r == 0); + + r = env->dbremove(env, txn, fname, nullptr, 0); + CKERR(r); + + r = txn->commit(txn, 0); CKERR(r); + + r = env->close(env, 0); CKERR(r); +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + while (argc>0) { + if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: %s -h -v -q\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} + +int test_main(int argc, char * const *argv) { + do_args(argc, argv); + test_dbremove(); + 
return 0; +} diff --git a/src/ydb.cc b/src/ydb.cc index a2bb221a40b..4a01c37bea6 100644 --- a/src/ydb.cc +++ b/src/ydb.cc @@ -2901,7 +2901,13 @@ env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, u r = toku_db_create(&db, env, 0); lazy_assert_zero(r); r = toku_db_open_iname(db, txn, iname, 0, 0); - lazy_assert_zero(r); + if (txn && r) { + if (r == EMFILE || r == ENFILE) + r = toku_ydb_do_error(env, r, "toku dbremove failed because open file limit reached\n"); + else + r = toku_ydb_do_error(env, r, "toku dbremove failed\n"); + goto exit; + } if (txn) { // Now that we have a writelock on dname, verify that there are still no handles open. (to prevent race conditions) if (env_is_db_with_dname_open(env, dname)) { From 6658f8996cc1ab16b7aac34ba05fe3dd1e3c56f0 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Mon, 19 May 2014 10:48:17 -0400 Subject: [PATCH 11/46] #242 fix loader creation bug that unlinks the wrong fractal tree files --- src/loader.cc | 21 +- ...c-close.cc => loader-close-nproc-limit.cc} | 0 src/tests/loader-create-commit-nproc-limit.cc | 211 ++++++++++++++++++ ...create.cc => loader-create-nproc-limit.cc} | 0 src/ydb_db.cc | 26 +-- 5 files changed, 232 insertions(+), 26 deletions(-) rename src/tests/{loader-nproc-close.cc => loader-close-nproc-limit.cc} (100%) create mode 100644 src/tests/loader-create-commit-nproc-limit.cc rename src/tests/{loader-nproc-create.cc => loader-create-nproc-limit.cc} (100%) diff --git a/src/loader.cc b/src/loader.cc index ecad190e03a..62b4f0b6cef 100644 --- a/src/loader.cc +++ b/src/loader.cc @@ -251,6 +251,7 @@ toku_loader_create_loader(DB_ENV *env, bool check_empty) { int rval; HANDLE_READ_ONLY_TXN(txn); + DB_TXN *loader_txn = nullptr; *blp = NULL; // set later when created @@ -305,6 +306,13 @@ toku_loader_create_loader(DB_ENV *env, } { + if (env->i->open_flags & DB_INIT_TXN) { + rval = env->txn_begin(env, txn, &loader_txn, 0); + if (rval) { + goto create_exit; + } + } + ft_compare_func 
compare_functions[N]; for (int i=0; ii->bt_compare; @@ -320,13 +328,13 @@ toku_loader_create_loader(DB_ENV *env, fts[i] = dbs[i]->i->ft_handle; } LSN load_lsn; - rval = locked_load_inames(env, txn, N, dbs, new_inames_in_env, &load_lsn, puts_allowed); + rval = locked_load_inames(env, loader_txn, N, dbs, new_inames_in_env, &load_lsn, puts_allowed); if ( rval!=0 ) { free_inames(new_inames_in_env, N); toku_free(fts); goto create_exit; } - TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL; + TOKUTXN ttxn = loader_txn ? db_txn_struct_i(loader_txn)->tokutxn : NULL; rval = toku_ft_loader_open(&loader->i->ft_loader, env->i->cachetable, env->i->generate_row_for_put, @@ -359,10 +367,19 @@ toku_loader_create_loader(DB_ENV *env, rval = 0; } + rval = loader_txn->commit(loader_txn, 0); + assert_zero(rval); + loader_txn = nullptr; + rval = 0; } *blp = loader; create_exit: + if (loader_txn) { + int r = loader_txn->abort(loader_txn); + assert_zero(r); + loader_txn = nullptr; + } if (rval == 0) { (void) toku_sync_fetch_and_add(&STATUS_VALUE(LOADER_CREATE), 1); (void) toku_sync_fetch_and_add(&STATUS_VALUE(LOADER_CURRENT), 1); diff --git a/src/tests/loader-nproc-close.cc b/src/tests/loader-close-nproc-limit.cc similarity index 100% rename from src/tests/loader-nproc-close.cc rename to src/tests/loader-close-nproc-limit.cc diff --git a/src/tests/loader-create-commit-nproc-limit.cc b/src/tests/loader-create-commit-nproc-limit.cc new file mode 100644 index 00000000000..26ce5e478ed --- /dev/null +++ b/src/tests/loader-create-commit-nproc-limit.cc @@ -0,0 +1,211 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source 
code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). + + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2013 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. 
+ + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. + + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved." +#ident "$Id$" + +// This test crashes if a failed loader creation causes the db to be corrupted by unlinking +// the underlying fractal tree files. This unlinking occurs because the txn that logs the +// load log entries is committed rather than aborted. 
+ +#include "test.h" +#include +#include + +static int loader_flags = 0; +static const char *envdir = TOKU_TEST_FILENAME; + +static void loader_create_commit(int ndb) { + int r; + + char rmcmd[32 + strlen(envdir)]; + snprintf(rmcmd, sizeof rmcmd, "rm -rf %s", envdir); + r = system(rmcmd); CKERR(r); + r = toku_os_mkdir(envdir, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + + DB_ENV *env; + r = db_env_create(&env, 0); CKERR(r); + int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; + r = env->open(env, envdir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + + DB *dbs[ndb]; + uint32_t db_flags[ndb]; + uint32_t dbt_flags[ndb]; + for (int i = 0; i < ndb; i++) { + db_flags[i] = DB_NOOVERWRITE; + dbt_flags[i] = 0; + r = db_create(&dbs[i], env, 0); CKERR(r); + char name[32]; + sprintf(name, "db%d", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + + DB_TXN *txn; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + + struct rlimit current_nproc_limit; + r = getrlimit(RLIMIT_NPROC, ¤t_nproc_limit); + assert(r == 0); + + struct rlimit new_nproc_limit = current_nproc_limit; + new_nproc_limit.rlim_cur = 0; + r = setrlimit(RLIMIT_NPROC, &new_nproc_limit); + assert(r == 0); + + DB_LOADER *loader; + int loader_r = env->create_loader(env, txn, &loader, ndb > 0 ? 
dbs[0] : NULL, ndb, dbs, db_flags, dbt_flags, loader_flags); + + r = setrlimit(RLIMIT_NPROC, ¤t_nproc_limit); + assert(r == 0); + + if (loader_flags & LOADER_DISALLOW_PUTS) { + CKERR(loader_r); + loader_r = loader->close(loader); + CKERR(loader_r); + } else { + CKERR2(loader_r, EAGAIN); + } + + r = txn->commit(txn, 0); CKERR(r); + + for (int i = 0; i < ndb; i++) { + r = dbs[i]->close(dbs[i], 0); CKERR(r); + } + + for (int i = 0; i < ndb; i++) { + r = db_create(&dbs[i], env, 0); CKERR(r); + char name[32]; + sprintf(name, "db%d", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, 0, 0666); CKERR(r); + } + + for (int i = 0; i < ndb; i++) { + r = dbs[i]->close(dbs[i], 0); CKERR(r); + } + + r = env->close(env, 0); CKERR(r); +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + while (argc>0) { + if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: %s -h -v -q -p\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "-p") == 0) { + loader_flags |= LOADER_DISALLOW_PUTS; + } else if (strcmp(argv[0], "-z") == 0) { + loader_flags |= LOADER_COMPRESS_INTERMEDIATES; + } else if (strcmp(argv[0], "-e") == 0) { + argc--; argv++; + if (argc > 0) + envdir = argv[0]; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} + +int test_main(int argc, char * const *argv) { + do_args(argc, argv); + loader_create_commit(1); + return 0; +} diff --git a/src/tests/loader-nproc-create.cc b/src/tests/loader-create-nproc-limit.cc similarity index 100% rename from src/tests/loader-nproc-create.cc rename to src/tests/loader-create-nproc-limit.cc diff --git a/src/ydb_db.cc b/src/ydb_db.cc index 78e08705ac6..b9fa32eb4a0 100644 --- a/src/ydb_db.cc +++ b/src/ydb_db.cc @@ -1221,36 +1221,14 @@ 
load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[/*N*/], const char * new int locked_load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[/*N*/], char * new_inames_in_env[/*N*/], LSN *load_lsn, bool mark_as_loader) { - int ret, r; + int r; HANDLE_READ_ONLY_TXN(txn); - DB_TXN *child_txn = NULL; - int using_txns = env->i->open_flags & DB_INIT_TXN; - if (using_txns) { - ret = toku_txn_begin(env, txn, &child_txn, 0); - invariant_zero(ret); - } - // cannot begin a checkpoint toku_multi_operation_client_lock(); - r = load_inames(env, child_txn, N, dbs, (const char **) new_inames_in_env, load_lsn, mark_as_loader); + r = load_inames(env, txn, N, dbs, (const char **) new_inames_in_env, load_lsn, mark_as_loader); toku_multi_operation_client_unlock(); - if (using_txns) { - if (r == 0) { - ret = locked_txn_commit(child_txn, DB_TXN_NOSYNC); - invariant_zero(ret); - } else { - ret = locked_txn_abort(child_txn); - invariant_zero(ret); - for (int i = 0; i < N; i++) { - if (new_inames_in_env[i]) { - toku_free(new_inames_in_env[i]); - new_inames_in_env[i] = NULL; - } - } - } - } return r; } From b62e8d10fcdc8115eb363370dbeb2021e3e530e1 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Tue, 20 May 2014 08:18:13 -0400 Subject: [PATCH 12/46] #236 mysqld_safe should use libjemalloc.so if it exists in the tarball --- scripts/setup.mysql.bash | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/setup.mysql.bash b/scripts/setup.mysql.bash index 6ae604e34c1..e97e4a4f562 100755 --- a/scripts/setup.mysql.bash +++ b/scripts/setup.mysql.bash @@ -180,6 +180,10 @@ if [ $startup -ne 0 ] ; then else default_arg="--defaults-file=$defaultsfile" fi + j=/usr/local/mysql/lib/mysql/libjemalloc.so + if [ -f $j ] ; then + default_arg="$default_arg --malloc-lib=$j" + fi $sudo -b bash -c "$ldpath /usr/local/mysql/bin/mysqld_safe $default_arg $mysqld_args" >/dev/null 2>&1 & fi sleep $sleeptime From ac31894df60b1cef31729b9d538823a2296f2f2c Mon Sep 17 00:00:00 2001 From: John Esmet Date: 
Tue, 20 May 2014 13:51:32 -0400 Subject: [PATCH 13/46] Support gcc 4.9 in cmake, fix uninitialized value warnings --- cmake_modules/TokuSetupCompiler.cmake | 8 ++++++++ ft/log_upgrade.cc | 4 ++-- ft/logger.cc | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/cmake_modules/TokuSetupCompiler.cmake b/cmake_modules/TokuSetupCompiler.cmake index 4b8a600f141..316d0757fdc 100644 --- a/cmake_modules/TokuSetupCompiler.cmake +++ b/cmake_modules/TokuSetupCompiler.cmake @@ -137,6 +137,14 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL Clang) set(CMAKE_C_FLAGS_RELEASE "-g -O3 ${CMAKE_C_FLAGS_RELEASE} -UNDEBUG") set(CMAKE_CXX_FLAGS_RELEASE "-g -O3 ${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") else () + if (CMAKE_CXX_COMPILER_ID MATCHES GNU) + ## Versions of gcc >= 4.9.0 require special version of 'ar' and 'ranlib' for + ## link-time optimizations to work properly. + if (NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9.0")) + set(CMAKE_AR "gcc-ar") + set(CMAKE_RANLIB "gcc-ranlib") + endif() + endif() # we overwrite this because the default passes -DNDEBUG and we don't want that set(CMAKE_C_FLAGS_RELWITHDEBINFO "-flto -fuse-linker-plugin ${CMAKE_C_FLAGS_RELWITHDEBINFO} -g -O3 -UNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-flto -fuse-linker-plugin ${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -g -O3 -UNDEBUG") diff --git a/ft/log_upgrade.cc b/ft/log_upgrade.cc index e5a36a88cff..8dba57e9d8d 100644 --- a/ft/log_upgrade.cc +++ b/ft/log_upgrade.cc @@ -321,8 +321,8 @@ toku_maybe_upgrade_log(const char *env_dir, const char *log_dir, LSN * lsn_of_cl r = 0; //Logs are up to date else { FOOTPRINT(4); - LSN last_lsn; - TXNID last_xid; + LSN last_lsn = ZERO_LSN; + TXNID last_xid = TXNID_NONE; r = verify_clean_shutdown_of_log_version(log_dir, version_of_logs_on_disk, &last_lsn, &last_xid); if (r != 0) { goto cleanup; diff --git a/ft/logger.cc b/ft/logger.cc index e4fd854c637..bbac5cf7de3 100644 --- a/ft/logger.cc +++ b/ft/logger.cc @@ -621,7 +621,7 @@ int toku_logger_find_next_unused_log_file(const 
char *directory, long long *resu if (d==0) return get_error_errno(); while ((de=readdir(d))) { if (de==0) return get_error_errno(); - long long thisl; + long long thisl = -1; if ( is_a_logfile(de->d_name, &thisl) ) { if ((long long)thisl > maxf) maxf = thisl; } From bfe318a43372d1ebc974baf2bad26fb65017d9cb Mon Sep 17 00:00:00 2001 From: Leif Walsh Date: Tue, 20 May 2014 14:39:56 -0400 Subject: [PATCH 14/46] moved/cleaned up gcc-ar/gcc-ranlib checking #245 --- CMakeLists.txt | 25 +++++++++++++++++++++++++ cmake_modules/TokuSetupCompiler.cmake | 8 -------- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 30de495271e..5a5a9713b4e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,31 @@ project(TokuDB) set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS "") set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "") +## Versions of gcc >= 4.9.0 require special version of 'ar' and 'ranlib' for +## link-time optimizations to work properly. +## +## From https://gcc.gnu.org/gcc-4.9/changes.html: +## +## When using a linker plugin, compiling with the -flto option now +## generates slim objects files (.o) which only contain intermediate +## language representation for LTO. Use -ffat-lto-objects to create +## files which contain additionally the object code. To generate +## static libraries suitable for LTO processing, use gcc-ar and +## gcc-ranlib; to list symbols from a slim object file use +## gcc-nm. (Requires that ar, ranlib and nm have been compiled with +## plugin support.) 
+if ((CMAKE_CXX_COMPILER_ID STREQUAL GNU) AND + NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9.0")) + find_program(gcc_ar "gcc-ar") + if (gcc_ar) + set(CMAKE_AR "${gcc_ar}") + endif () + find_program(gcc_ranlib "gcc-ranlib") + if (gcc_ranlib) + set(CMAKE_RANLIB "${gcc_ranlib}") + endif () +endif() + include(TokuFeatureDetection) include(TokuSetupCompiler) include(TokuSetupCTest) diff --git a/cmake_modules/TokuSetupCompiler.cmake b/cmake_modules/TokuSetupCompiler.cmake index 316d0757fdc..4b8a600f141 100644 --- a/cmake_modules/TokuSetupCompiler.cmake +++ b/cmake_modules/TokuSetupCompiler.cmake @@ -137,14 +137,6 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL Clang) set(CMAKE_C_FLAGS_RELEASE "-g -O3 ${CMAKE_C_FLAGS_RELEASE} -UNDEBUG") set(CMAKE_CXX_FLAGS_RELEASE "-g -O3 ${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") else () - if (CMAKE_CXX_COMPILER_ID MATCHES GNU) - ## Versions of gcc >= 4.9.0 require special version of 'ar' and 'ranlib' for - ## link-time optimizations to work properly. - if (NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9.0")) - set(CMAKE_AR "gcc-ar") - set(CMAKE_RANLIB "gcc-ranlib") - endif() - endif() # we overwrite this because the default passes -DNDEBUG and we don't want that set(CMAKE_C_FLAGS_RELWITHDEBINFO "-flto -fuse-linker-plugin ${CMAKE_C_FLAGS_RELWITHDEBINFO} -g -O3 -UNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-flto -fuse-linker-plugin ${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -g -O3 -UNDEBUG") From 412f1d9f36166c6a10bbd26f6b318f845994a9b4 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Wed, 21 May 2014 11:51:26 -0400 Subject: [PATCH 15/46] #244 skip jemalloc build if it is not in the third party directory --- cmake_modules/TokuThirdParty.cmake | 53 +++++++++++++++--------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/cmake_modules/TokuThirdParty.cmake b/cmake_modules/TokuThirdParty.cmake index ca1640bc525..0e8be69bf37 100644 --- a/cmake_modules/TokuThirdParty.cmake +++ b/cmake_modules/TokuThirdParty.cmake @@ -3,35 +3,34 @@ 
include(ExternalProject) if (NOT DEFINED LIBJEMALLOC) ## add jemalloc with an external project set(JEMALLOC_SOURCE_DIR "${TokuDB_SOURCE_DIR}/third_party/jemalloc" CACHE FILEPATH "Where to find jemalloc sources.") - if (NOT EXISTS "${JEMALLOC_SOURCE_DIR}/configure") - message(FATAL_ERROR "Can't find jemalloc sources. Please check them out to ${JEMALLOC_SOURCE_DIR} or modify JEMALLOC_SOURCE_DIR.") - endif () - set(jemalloc_configure_opts "CC=${CMAKE_C_COMPILER}" "--with-jemalloc-prefix=" "--with-private-namespace=tokudb_jemalloc_internal_" "--enable-cc-silence") - option(JEMALLOC_DEBUG "Build jemalloc with --enable-debug." OFF) - if (JEMALLOC_DEBUG) - list(APPEND jemalloc_configure_opts --enable-debug) - endif () - ExternalProject_Add(build_jemalloc - PREFIX jemalloc - SOURCE_DIR "${JEMALLOC_SOURCE_DIR}" - CONFIGURE_COMMAND - "${JEMALLOC_SOURCE_DIR}/configure" ${jemalloc_configure_opts} - "--prefix=${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc" - ) + if (EXISTS "${JEMALLOC_SOURCE_DIR}/configure") + set(jemalloc_configure_opts "CC=${CMAKE_C_COMPILER}" "--with-jemalloc-prefix=" "--with-private-namespace=tokudb_jemalloc_internal_" "--enable-cc-silence") + option(JEMALLOC_DEBUG "Build jemalloc with --enable-debug." 
OFF) + if (JEMALLOC_DEBUG) + list(APPEND jemalloc_configure_opts --enable-debug) + endif () + ExternalProject_Add(build_jemalloc + PREFIX jemalloc + SOURCE_DIR "${JEMALLOC_SOURCE_DIR}" + CONFIGURE_COMMAND + "${JEMALLOC_SOURCE_DIR}/configure" ${jemalloc_configure_opts} + "--prefix=${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc" + ) - add_library(jemalloc STATIC IMPORTED GLOBAL) - set_target_properties(jemalloc PROPERTIES IMPORTED_LOCATION - "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib/libjemalloc_pic.a") - add_dependencies(jemalloc build_jemalloc) - add_library(jemalloc_nopic STATIC IMPORTED GLOBAL) - set_target_properties(jemalloc_nopic PROPERTIES IMPORTED_LOCATION - "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib/libjemalloc.a") - add_dependencies(jemalloc_nopic build_jemalloc) + add_library(jemalloc STATIC IMPORTED GLOBAL) + set_target_properties(jemalloc PROPERTIES IMPORTED_LOCATION + "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib/libjemalloc_pic.a") + add_dependencies(jemalloc build_jemalloc) + add_library(jemalloc_nopic STATIC IMPORTED GLOBAL) + set_target_properties(jemalloc_nopic PROPERTIES IMPORTED_LOCATION + "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib/libjemalloc.a") + add_dependencies(jemalloc_nopic build_jemalloc) - # detect when we are being built as a subproject - if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) - install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib" DESTINATION . - COMPONENT tokukv_libs_extra) + # detect when we are being built as a subproject + if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) + install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib" DESTINATION . 
+ COMPONENT tokukv_libs_extra) + endif () endif () endif () From 462f6012367c1f74829bc3c24eb73187972c661e Mon Sep 17 00:00:00 2001 From: John Esmet Date: Thu, 22 May 2014 18:57:56 -0400 Subject: [PATCH 16/46] fixes #248 Convert to a tree on omt clone if it must support marks --- util/omt.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/util/omt.cc b/util/omt.cc index 92cda38aefe..709c7eab4c3 100644 --- a/util/omt.cc +++ b/util/omt.cc @@ -207,6 +207,9 @@ void omt::clone(const omt &src) { src.fill_array_with_subtree_values(&this->d.a.values[0], src.d.t.root); } this->d.a.num_values = src.size(); + if (supports_marks) { + this->convert_to_tree(); + } } template From 71c2d3a170441699b933268fd21acc69300dddbe Mon Sep 17 00:00:00 2001 From: John Esmet Date: Thu, 22 May 2014 18:57:56 -0400 Subject: [PATCH 17/46] fixes #226 When serializing a nonleaf node, include the offsets stored in each message tree. This removes a sort during deserialization, which can be expensive when there are many messages and I/O is fast. This change supports auto-upgrade from older versions. --- ft/ft-ops.cc | 29 +++++++++--- ft/ft-serialize.cc | 1 + ft/ft_layout_version.h | 1 + ft/ft_node-serialize.cc | 100 ++++++++++++++++++++++++++++++++++++---- 4 files changed, 117 insertions(+), 14 deletions(-) diff --git a/ft/ft-ops.cc b/ft/ft-ops.cc index 64b6b498c9a..ab7de1a0a2c 100644 --- a/ft/ft-ops.cc +++ b/ft/ft-ops.cc @@ -890,6 +890,11 @@ void toku_ftnode_clone_callback( for (int i = 0; i < node->n_children-1; i++) { toku_clone_dbt(&cloned_node->childkeys[i], node->childkeys[i]); } + if (node->height > 0) { + // need to move messages here so that we don't serialize stale + // messages to the fresh tree - ft verify code complains otherwise. 
+ toku_move_ftnode_messages_to_stale(ft, node); + } // clone partition ftnode_clone_partitions(node, cloned_node); @@ -932,11 +937,14 @@ void toku_ftnode_flush_callback( int height = ftnode->height; if (write_me) { toku_assert_entire_node_in_memory(ftnode); - if (height == 0) { + if (height > 0 && !is_clone) { + // cloned nodes already had their stale messages moved, see toku_ftnode_clone_callback() + toku_move_ftnode_messages_to_stale(h, ftnode); + } else if (height == 0) { ft_leaf_run_gc(h, ftnode); - } - if (height == 0 && !is_clone) { - ftnode_update_disk_stats(ftnode, h, for_checkpoint); + if (!is_clone) { + ftnode_update_disk_stats(ftnode, h, for_checkpoint); + } } int r = toku_serialize_ftnode_to(fd, ftnode->thisnodename, ftnode, ndd, !is_clone, h, for_checkpoint); assert_zero(r); @@ -1150,11 +1158,20 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext if (BP_STATE(node,i) == PT_AVAIL) { if (BP_SHOULD_EVICT(node,i)) { NONLEAF_CHILDINFO bnc; - if (ft_compress_buffers_before_eviction) { - // When partially evicting, always compress with quicklz + if (ft_compress_buffers_before_eviction && + // We may not serialize and compress a partition in memory if its + // in memory layout version is different than what's on disk (and + // therefore requires upgrade). + // + // Auto-upgrade code assumes that if a node's layout version read + // from disk is not current, it MUST require upgrade. Breaking + // this rule would cause upgrade code to upgrade this partition + // again after we serialize it as the current version, which is bad. 
+ node->layout_version == node->layout_version_read_from_disk) { bnc = compress_internal_node_partition( node, i, + // Always compress with quicklz TOKU_QUICKLZ_METHOD ); } else { diff --git a/ft/ft-serialize.cc b/ft/ft-serialize.cc index 4a4817e7f6c..1879561f20a 100644 --- a/ft/ft-serialize.cc +++ b/ft/ft-serialize.cc @@ -462,6 +462,7 @@ serialize_ft_min_size (uint32_t version) { size_t size = 0; switch(version) { + case FT_LAYOUT_VERSION_27: case FT_LAYOUT_VERSION_26: case FT_LAYOUT_VERSION_25: case FT_LAYOUT_VERSION_24: diff --git a/ft/ft_layout_version.h b/ft/ft_layout_version.h index e9c6a68328b..01c7363e98d 100644 --- a/ft/ft_layout_version.h +++ b/ft/ft_layout_version.h @@ -120,6 +120,7 @@ enum ft_layout_version_e { FT_LAYOUT_VERSION_24 = 24, // Riddler: change logentries that log transactions to store TXNID_PAIRs instead of TXNIDs FT_LAYOUT_VERSION_25 = 25, // SecretSquirrel: ROLLBACK_LOG_NODES (on disk and in memory) now just use blocknum (instead of blocknum + hash) to point to other log nodes. same for xstillopen log entry FT_LAYOUT_VERSION_26 = 26, // Hojo: basements store key/vals separately on disk for fixed klpair length BNs + FT_LAYOUT_VERSION_27 = 27, // serialize message trees with nonleaf buffers to avoid key, msn sort on deserialize FT_NEXT_VERSION, // the version after the current version FT_LAYOUT_VERSION = FT_NEXT_VERSION-1, // A hack so I don't have to change this line. 
FT_LAYOUT_MIN_SUPPORTED_VERSION = FT_LAYOUT_VERSION_13, // Minimum version supported diff --git a/ft/ft_node-serialize.cc b/ft/ft_node-serialize.cc index fcb38f11834..2eca2891bbc 100644 --- a/ft/ft_node-serialize.cc +++ b/ft/ft_node-serialize.cc @@ -291,8 +291,13 @@ serialize_ftnode_partition_size (FTNODE node, int i) paranoid_invariant(node->bp[i].state == PT_AVAIL); result++; // Byte that states what the partition is if (node->height > 0) { - result += 4; // size of bytes in buffer table - result += toku_bnc_nbytesinbuf(BNC(node, i)); + NONLEAF_CHILDINFO bnc = BNC(node, i); + // number of messages (4 bytes) plus size of the buffer + result += (4 + toku_bnc_nbytesinbuf(bnc)); + // number of offsets (4 bytes) plus an array of 4 byte offsets, for each message tree + result += (4 + (4 * bnc->fresh_message_tree.size())); + result += (4 + (4 * bnc->stale_message_tree.size())); + result += (4 + (4 * bnc->broadcast_list.size())); } else { result += 4 + bn_data::HEADER_LENGTH; // n_entries in buffer table + basement header @@ -305,8 +310,14 @@ serialize_ftnode_partition_size (FTNODE node, int i) #define FTNODE_PARTITION_DMT_LEAVES 0xaa #define FTNODE_PARTITION_FIFO_MSG 0xbb +static int +wbuf_write_offset(const int32_t &offset, const uint32_t UU(idx), struct wbuf *const wb) { + wbuf_nocrc_int(wb, offset); + return 0; +} + static void -serialize_nonleaf_childinfo(NONLEAF_CHILDINFO bnc, struct wbuf *wb) +serialize_child_buffer(NONLEAF_CHILDINFO bnc, struct wbuf *wb) { unsigned char ch = FTNODE_PARTITION_FIFO_MSG; wbuf_nocrc_char(wb, ch); @@ -323,6 +334,17 @@ serialize_nonleaf_childinfo(NONLEAF_CHILDINFO bnc, struct wbuf *wb) wbuf_nocrc_bytes(wb, key, keylen); wbuf_nocrc_bytes(wb, data, datalen); }); + + // serialize the message trees (num entries, offsets array): + // fresh, stale, broadcast + wbuf_nocrc_int(wb, bnc->fresh_message_tree.size()); + bnc->fresh_message_tree.iterate(wb); + + wbuf_nocrc_int(wb, bnc->stale_message_tree.size()); + 
bnc->stale_message_tree.iterate(wb); + + wbuf_nocrc_int(wb, bnc->broadcast_list.size()); + bnc->broadcast_list.iterate(wb); } // @@ -346,7 +368,7 @@ serialize_ftnode_partition(FTNODE node, int i, struct sub_block *sb) { wbuf_init(&wb, sb->uncompressed_ptr, sb->uncompressed_size); if (node->height > 0) { // TODO: (Zardosht) possibly exit early if there are no messages - serialize_nonleaf_childinfo(BNC(node, i), &wb); + serialize_child_buffer(BNC(node, i), &wb); } else { unsigned char ch = FTNODE_PARTITION_DMT_LEAVES; @@ -1024,8 +1046,8 @@ toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DA } static void -deserialize_child_buffer(NONLEAF_CHILDINFO bnc, struct rbuf *rbuf, - DESCRIPTOR desc, ft_compare_func cmp) { +deserialize_child_buffer_v26(NONLEAF_CHILDINFO bnc, struct rbuf *rbuf, + DESCRIPTOR desc, ft_compare_func cmp) { int r; int n_in_this_buffer = rbuf_int(rbuf); int32_t *fresh_offsets = NULL, *stale_offsets = NULL; @@ -1090,6 +1112,59 @@ deserialize_child_buffer(NONLEAF_CHILDINFO bnc, struct rbuf *rbuf, } } +// effect: deserialize a single message from rbuf and enque the result into the given fifo +static void +fifo_deserialize_msg_from_rbuf(FIFO fifo, struct rbuf *rbuf) { + bytevec key, val; + ITEMLEN keylen, vallen; + enum ft_msg_type type = (enum ft_msg_type) rbuf_char(rbuf); + bool is_fresh = rbuf_char(rbuf); + MSN msn = rbuf_msn(rbuf); + XIDS xids; + xids_create_from_buffer(rbuf, &xids); + rbuf_bytes(rbuf, &key, &keylen); /* Returns a pointer into the rbuf. 
*/ + rbuf_bytes(rbuf, &val, &vallen); + int r = toku_fifo_enq(fifo, key, keylen, val, vallen, type, msn, xids, is_fresh, nullptr); + lazy_assert_zero(r); + xids_destroy(&xids); +} + +static void +deserialize_child_buffer(NONLEAF_CHILDINFO bnc, struct rbuf *rbuf) { + int n_in_this_buffer = rbuf_int(rbuf); + int nfresh = 0, nstale = 0, nbroadcast_offsets = 0; + int32_t *XMALLOC_N(n_in_this_buffer, stale_offsets); + int32_t *XMALLOC_N(n_in_this_buffer, fresh_offsets); + int32_t *XMALLOC_N(n_in_this_buffer, broadcast_offsets); + + toku_fifo_resize(bnc->buffer, rbuf->size + 64); + for (int i = 0; i < n_in_this_buffer; i++) { + fifo_deserialize_msg_from_rbuf(bnc->buffer, rbuf); + } + + // read in each message tree (fresh, stale, broadcast) + nfresh = rbuf_int(rbuf); + for (int i = 0; i < nfresh; i++) { + fresh_offsets[i] = rbuf_int(rbuf); + } + nstale = rbuf_int(rbuf); + for (int i = 0; i < nstale; i++) { + stale_offsets[i] = rbuf_int(rbuf); + } + nbroadcast_offsets = rbuf_int(rbuf); + for (int i = 0; i < nbroadcast_offsets; i++) { + broadcast_offsets[i] = rbuf_int(rbuf); + } + + // build OMTs out of each offset array + bnc->fresh_message_tree.destroy(); + bnc->fresh_message_tree.create_steal_sorted_array(&fresh_offsets, nfresh, n_in_this_buffer); + bnc->stale_message_tree.destroy(); + bnc->stale_message_tree.create_steal_sorted_array(&stale_offsets, nstale, n_in_this_buffer); + bnc->broadcast_list.destroy(); + bnc->broadcast_list.create_steal_sorted_array(&broadcast_offsets, nbroadcast_offsets, n_in_this_buffer); +} + // dump a buffer to stderr // no locking around this for now void @@ -1161,13 +1236,16 @@ NONLEAF_CHILDINFO toku_create_empty_nl(void) { return cn; } -// does NOT create OMTs, just the FIFO +// must clone the OMTs, since we serialize them along with the FIFO NONLEAF_CHILDINFO toku_clone_nl(NONLEAF_CHILDINFO orig_childinfo) { NONLEAF_CHILDINFO XMALLOC(cn); toku_fifo_clone(orig_childinfo->buffer, &cn->buffer); cn->fresh_message_tree.create_no_array(); + 
cn->fresh_message_tree.clone(orig_childinfo->fresh_message_tree); cn->stale_message_tree.create_no_array(); + cn->stale_message_tree.clone(orig_childinfo->stale_message_tree); cn->broadcast_list.create_no_array(); + cn->broadcast_list.clone(orig_childinfo->broadcast_list); memset(cn->flow, 0, sizeof cn->flow); return cn; } @@ -1513,7 +1591,13 @@ deserialize_ftnode_partition( if (node->height > 0) { assert(ch == FTNODE_PARTITION_FIFO_MSG); - deserialize_child_buffer(BNC(node, childnum), &rb, desc, cmp); + NONLEAF_CHILDINFO bnc = BNC(node, childnum); + if (node->layout_version_read_from_disk <= FT_LAYOUT_VERSION_26) { + // Layout version <= 26 did not serialize sorted message trees to disk. + deserialize_child_buffer_v26(bnc, &rb, desc, cmp); + } else { + deserialize_child_buffer(bnc, &rb); + } BP_WORKDONE(node, childnum) = 0; } else { From 1d6d6e3ce81529eb720f8662324c821d7275195c Mon Sep 17 00:00:00 2001 From: Leif Walsh Date: Fri, 23 May 2014 08:42:19 -0400 Subject: [PATCH 18/46] changed CHECKPOINT_DURATION[_LAST] to UINT64 #249 fixes #249 --- ft/checkpoint.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ft/checkpoint.cc b/ft/checkpoint.cc index 3d26c3a460e..bc4629a1d08 100644 --- a/ft/checkpoint.cc +++ b/ft/checkpoint.cc @@ -158,8 +158,8 @@ status_init(void) { STATUS_INIT(CP_TIME_LAST_CHECKPOINT_BEGIN, CHECKPOINT_LAST_BEGAN, UNIXTIME, "last checkpoint began ", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(CP_TIME_LAST_CHECKPOINT_BEGIN_COMPLETE, CHECKPOINT_LAST_COMPLETE_BEGAN, UNIXTIME, "last complete checkpoint began ", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(CP_TIME_LAST_CHECKPOINT_END, CHECKPOINT_LAST_COMPLETE_ENDED, UNIXTIME, "last complete checkpoint ended", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); - STATUS_INIT(CP_TIME_CHECKPOINT_DURATION, CHECKPOINT_DURATION, UNIXTIME, "time spent during checkpoint (begin and end phases)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); - STATUS_INIT(CP_TIME_CHECKPOINT_DURATION_LAST, 
CHECKPOINT_DURATION_LAST, UNIXTIME, "time spent during last checkpoint (begin and end phases)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(CP_TIME_CHECKPOINT_DURATION, CHECKPOINT_DURATION, UINT64, "time spent during checkpoint (begin and end phases)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(CP_TIME_CHECKPOINT_DURATION_LAST, CHECKPOINT_DURATION_LAST, UINT64, "time spent during last checkpoint (begin and end phases)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(CP_LAST_LSN, nullptr, UINT64, "last complete checkpoint LSN", TOKU_ENGINE_STATUS); STATUS_INIT(CP_CHECKPOINT_COUNT, CHECKPOINT_TAKEN, UINT64, "checkpoints taken ", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(CP_CHECKPOINT_COUNT_FAIL, CHECKPOINT_FAILED, UINT64, "checkpoints failed", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); @@ -381,8 +381,8 @@ toku_checkpoint(CHECKPOINTER cp, TOKULOGGER logger, STATUS_VALUE(CP_LONG_BEGIN_TIME) += duration; STATUS_VALUE(CP_LONG_BEGIN_COUNT) += 1; } - STATUS_VALUE(CP_TIME_CHECKPOINT_DURATION) += ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END)) - ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN)); - STATUS_VALUE(CP_TIME_CHECKPOINT_DURATION_LAST) = ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END)) - ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN)); + STATUS_VALUE(CP_TIME_CHECKPOINT_DURATION) += (uint64_t) ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END)) - ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN)); + STATUS_VALUE(CP_TIME_CHECKPOINT_DURATION_LAST) = (uint64_t) ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END)) - ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN)); STATUS_VALUE(CP_FOOTPRINT) = 0; checkpoint_safe_checkpoint_unlock(); From 1e9f2f07b987f7ba89a9bb4d9f36d0ed1a07f015 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Fri, 23 May 2014 10:56:18 -0400 Subject: [PATCH 19/46] #247 fix loader->close fd leak when NOFILE limit exceeded --- ft/ftloader.cc | 4 ++++ src/tests/loader-close-nproc-limit.cc | 13 
+++++-------- src/tests/loader-create-close.cc | 14 ++++---------- src/tests/loader-create-commit-nproc-limit.cc | 4 ++-- src/tests/loader-create-nproc-limit.cc | 13 +++++-------- 5 files changed, 20 insertions(+), 28 deletions(-) diff --git a/ft/ftloader.cc b/ft/ftloader.cc index 7214ee10039..67b3cf9905e 100644 --- a/ft/ftloader.cc +++ b/ft/ftloader.cc @@ -356,6 +356,8 @@ int ft_loader_open_temp_file (FTLOADER bl, FIDX *file_idx) */ { int result = 0; + if (result) // debug hack + return result; FILE *f = NULL; int fd = -1; char *fname = toku_strdup(bl->temp_file_template); @@ -2430,6 +2432,8 @@ static int toku_loader_write_ft_from_q (FTLOADER bl, if (r) { result = r; drain_writer_q(q); + r = toku_os_close(fd); + assert_zero(r); return result; } FILE *pivots_stream = toku_bl_fidx2file(bl, pivots_file); diff --git a/src/tests/loader-close-nproc-limit.cc b/src/tests/loader-close-nproc-limit.cc index eee15faa1d3..3ef2b0541f7 100644 --- a/src/tests/loader-close-nproc-limit.cc +++ b/src/tests/loader-close-nproc-limit.cc @@ -85,6 +85,9 @@ PATENT RIGHTS GRANT: under this License. */ +// Verify that loader->close works correctly (does not crash, does not leak memory, returns the right error code) +// when the NPROC limit is exceeded. + #ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved." 
#ident "$Id$" @@ -95,11 +98,7 @@ PATENT RIGHTS GRANT: static int loader_flags = 0; static const char *envdir = TOKU_TEST_FILENAME; -static int put_multiple_generate(DB *UU(dest_db), DB *UU(src_db), DBT_ARRAY *UU(dest_keys), DBT_ARRAY *UU(dest_vals), const DBT *UU(src_key), const DBT *UU(src_val)) { - return ENOMEM; -} - -static void loader_open_close(int ndb) { +static void run_test(int ndb) { int r; char rmcmd[32 + strlen(envdir)]; @@ -109,8 +108,6 @@ static void loader_open_close(int ndb) { DB_ENV *env; r = db_env_create(&env, 0); CKERR(r); - r = env->set_generate_row_callback_for_put(env, put_multiple_generate); - CKERR(r); int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; r = env->open(env, envdir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); env->set_errfile(env, stderr); @@ -196,6 +193,6 @@ static void do_args(int argc, char * const argv[]) { int test_main(int argc, char * const *argv) { do_args(argc, argv); - loader_open_close(1); + run_test(1); return 0; } diff --git a/src/tests/loader-create-close.cc b/src/tests/loader-create-close.cc index 6a04387152f..4d66a9df004 100644 --- a/src/tests/loader-create-close.cc +++ b/src/tests/loader-create-close.cc @@ -97,11 +97,7 @@ PATENT RIGHTS GRANT: static int loader_flags = 0; static const char *envdir = TOKU_TEST_FILENAME; -static int put_multiple_generate(DB *UU(dest_db), DB *UU(src_db), DBT_ARRAY *UU(dest_keys), DBT_ARRAY *UU(dest_vals), const DBT *UU(src_key), const DBT *UU(src_val)) { - return ENOMEM; -} - -static void loader_open_abort(int ndb) { +static void test_loader_create_close(int ndb) { int r; char rmcmd[32 + strlen(envdir)]; @@ -111,8 +107,6 @@ static void loader_open_abort(int ndb) { DB_ENV *env; r = db_env_create(&env, 0); CKERR(r); - r = env->set_generate_row_callback_for_put(env, put_multiple_generate); - CKERR(r); int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; r = env->open(env, envdir, envflags, 
S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); env->set_errfile(env, stderr); @@ -181,8 +175,8 @@ static void do_args(int argc, char * const argv[]) { int test_main(int argc, char * const *argv) { do_args(argc, argv); - loader_open_abort(0); - loader_open_abort(1); - loader_open_abort(2); + test_loader_create_close(0); + test_loader_create_close(1); + test_loader_create_close(2); return 0; } diff --git a/src/tests/loader-create-commit-nproc-limit.cc b/src/tests/loader-create-commit-nproc-limit.cc index 26ce5e478ed..091809a8551 100644 --- a/src/tests/loader-create-commit-nproc-limit.cc +++ b/src/tests/loader-create-commit-nproc-limit.cc @@ -99,7 +99,7 @@ PATENT RIGHTS GRANT: static int loader_flags = 0; static const char *envdir = TOKU_TEST_FILENAME; -static void loader_create_commit(int ndb) { +static void run_test(int ndb) { int r; char rmcmd[32 + strlen(envdir)]; @@ -206,6 +206,6 @@ static void do_args(int argc, char * const argv[]) { int test_main(int argc, char * const *argv) { do_args(argc, argv); - loader_create_commit(1); + run_test(1); return 0; } diff --git a/src/tests/loader-create-nproc-limit.cc b/src/tests/loader-create-nproc-limit.cc index 09f57c9019c..7a61fce7799 100644 --- a/src/tests/loader-create-nproc-limit.cc +++ b/src/tests/loader-create-nproc-limit.cc @@ -85,6 +85,9 @@ PATENT RIGHTS GRANT: under this License. */ +// Verify that env->create_loader works correctly (does not crash, does not leak memory, returns the right error code) +// when the NPROC limit is exceeded. + #ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved." 
#ident "$Id$" @@ -95,11 +98,7 @@ PATENT RIGHTS GRANT: static int loader_flags = 0; static const char *envdir = TOKU_TEST_FILENAME; -static int put_multiple_generate(DB *UU(dest_db), DB *UU(src_db), DBT_ARRAY *UU(dest_keys), DBT_ARRAY *UU(dest_vals), const DBT *UU(src_key), const DBT *UU(src_val)) { - return ENOMEM; -} - -static void loader_open_close(int ndb) { +static void run_test(int ndb) { int r; char rmcmd[32 + strlen(envdir)]; @@ -109,8 +108,6 @@ static void loader_open_close(int ndb) { DB_ENV *env; r = db_env_create(&env, 0); CKERR(r); - r = env->set_generate_row_callback_for_put(env, put_multiple_generate); - CKERR(r); int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; r = env->open(env, envdir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); env->set_errfile(env, stderr); @@ -197,6 +194,6 @@ static void do_args(int argc, char * const argv[]) { int test_main(int argc, char * const *argv) { do_args(argc, argv); - loader_open_close(1); + run_test(1); return 0; } From 598105c433bc55ee0b4fb8fbf5ffaaa2f9a63a35 Mon Sep 17 00:00:00 2001 From: Leif Walsh Date: Fri, 23 May 2014 14:02:53 -0400 Subject: [PATCH 20/46] vectorized loops in new deserialization code #226 also a typo --- ft/ft_node-serialize.cc | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/ft/ft_node-serialize.cc b/ft/ft_node-serialize.cc index 2eca2891bbc..4e4f231c21d 100644 --- a/ft/ft_node-serialize.cc +++ b/ft/ft_node-serialize.cc @@ -1112,7 +1112,7 @@ deserialize_child_buffer_v26(NONLEAF_CHILDINFO bnc, struct rbuf *rbuf, } } -// effect: deserialize a single message from rbuf and enque the result into the given fifo +// effect: deserialize a single message from rbuf and enqueue the result into the given fifo static void fifo_deserialize_msg_from_rbuf(FIFO fifo, struct rbuf *rbuf) { bytevec key, val; @@ -1144,16 +1144,25 @@ deserialize_child_buffer(NONLEAF_CHILDINFO bnc, struct rbuf *rbuf) { // read in each message tree 
(fresh, stale, broadcast) nfresh = rbuf_int(rbuf); + bytevec fresh_offsets_src_v; + rbuf_literal_bytes(rbuf, &fresh_offsets_v, nfresh * (sizeof *fresh_offsets)); + const int32_t *fresh_offsets_src = (const int32_t *) fresh_offsets_src_v; for (int i = 0; i < nfresh; i++) { - fresh_offsets[i] = rbuf_int(rbuf); + fresh_offsets[i] = toku_dtoh32(fresh_offsets_src[i]); } nstale = rbuf_int(rbuf); + bytevec stale_offsets_src_v; + rbuf_literal_bytes(rbuf, &stale_offsets_v, nstale * (sizeof *stale_offsets)); + const int32_t *stale_offsets_src = (const int32_t *) stale_offsets_src_v; for (int i = 0; i < nstale; i++) { - stale_offsets[i] = rbuf_int(rbuf); + stale_offsets[i] = toku_dtoh32(stale_offsets_src[i]); } nbroadcast_offsets = rbuf_int(rbuf); + bytevec broadcast_offsets_src_v; + rbuf_literal_bytes(rbuf, &broadcast_offsets_v, nbroadcast_offsets * (sizeof *broadcast_offsets)); + const int32_t *broadcast_offsets_src = (const int32_t *) broadcast_offsets_src_v; for (int i = 0; i < nbroadcast_offsets; i++) { - broadcast_offsets[i] = rbuf_int(rbuf); + broadcast_offsets[i] = toku_dtoh32(broadcast_offsets_src[i]); } // build OMTs out of each offset array From 97d542a3b4216209f7006f08c884658508147af6 Mon Sep 17 00:00:00 2001 From: Leif Walsh Date: Fri, 23 May 2014 14:25:54 -0400 Subject: [PATCH 21/46] fixed typo #226 --- ft/ft_node-serialize.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ft/ft_node-serialize.cc b/ft/ft_node-serialize.cc index 4e4f231c21d..1090bca6ca0 100644 --- a/ft/ft_node-serialize.cc +++ b/ft/ft_node-serialize.cc @@ -1145,21 +1145,21 @@ deserialize_child_buffer(NONLEAF_CHILDINFO bnc, struct rbuf *rbuf) { // read in each message tree (fresh, stale, broadcast) nfresh = rbuf_int(rbuf); bytevec fresh_offsets_src_v; - rbuf_literal_bytes(rbuf, &fresh_offsets_v, nfresh * (sizeof *fresh_offsets)); + rbuf_literal_bytes(rbuf, &fresh_offsets_src_v, nfresh * (sizeof *fresh_offsets)); const int32_t *fresh_offsets_src = (const int32_t *) 
fresh_offsets_src_v; for (int i = 0; i < nfresh; i++) { fresh_offsets[i] = toku_dtoh32(fresh_offsets_src[i]); } nstale = rbuf_int(rbuf); bytevec stale_offsets_src_v; - rbuf_literal_bytes(rbuf, &stale_offsets_v, nstale * (sizeof *stale_offsets)); + rbuf_literal_bytes(rbuf, &stale_offsets_src_v, nstale * (sizeof *stale_offsets)); const int32_t *stale_offsets_src = (const int32_t *) stale_offsets_src_v; for (int i = 0; i < nstale; i++) { stale_offsets[i] = toku_dtoh32(stale_offsets_src[i]); } nbroadcast_offsets = rbuf_int(rbuf); bytevec broadcast_offsets_src_v; - rbuf_literal_bytes(rbuf, &broadcast_offsets_v, nbroadcast_offsets * (sizeof *broadcast_offsets)); + rbuf_literal_bytes(rbuf, &broadcast_offsets_src_v, nbroadcast_offsets * (sizeof *broadcast_offsets)); const int32_t *broadcast_offsets_src = (const int32_t *) broadcast_offsets_src_v; for (int i = 0; i < nbroadcast_offsets; i++) { broadcast_offsets[i] = toku_dtoh32(broadcast_offsets_src[i]); From c92c8cc9673a5b639cf3e234140dbeab92bd82ab Mon Sep 17 00:00:00 2001 From: John Esmet Date: Fri, 23 May 2014 18:10:29 -0400 Subject: [PATCH 22/46] FT-242 Begin breaking up fttypes.h by moving many things to their appropriate headers --- ft/block_table.h | 2 +- ft/cachetable.h | 39 +++++++- ft/fifo.cc | 2 +- ft/fifo.h | 28 +----- ft/ft-cachetable-wrappers.h | 4 +- ft/ft-flusher.h | 22 ++++- ft/ft-internal.h | 90 +++++------------- ft/ft-ops.cc | 31 +++++- ft/ft-ops.h | 6 +- ft/ft-search.h | 1 + ft/ft_msg.h | 116 ++++++++++++++++++++--- ft/ft_node-serialize.cc | 2 + ft/ftloader.h | 2 + ft/fttypes.h | 183 +----------------------------------- ft/le-cursor.cc | 7 +- ft/leafentry.h | 7 +- ft/log-internal.h | 6 ++ ft/logger.h | 5 +- ft/rollback.h | 10 +- ft/sub_block.h | 1 + ft/txn_manager.h | 17 ++++ 21 files changed, 277 insertions(+), 304 deletions(-) diff --git a/ft/block_table.h b/ft/block_table.h index a9f17ad0e7e..72c914988fa 100644 --- a/ft/block_table.h +++ b/ft/block_table.h @@ -92,7 +92,7 @@ PATENT RIGHTS GRANT: 
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." #include "fttypes.h" - +#include "ft/ft-internal.h" typedef struct block_table *BLOCK_TABLE; diff --git a/ft/cachetable.h b/ft/cachetable.h index 9c11db02e00..c4290b6f6d7 100644 --- a/ft/cachetable.h +++ b/ft/cachetable.h @@ -111,6 +111,42 @@ PATENT RIGHTS GRANT: typedef BLOCKNUM CACHEKEY; +class checkpointer; +typedef class checkpointer *CHECKPOINTER; +typedef struct cachetable *CACHETABLE; +typedef struct cachefile *CACHEFILE; +typedef struct ctpair *PAIR; + +// This struct hold information about values stored in the cachetable. +// As one can tell from the names, we are probably violating an +// abstraction layer by placing names. +// +// The purpose of having this struct is to have a way for the +// cachetable to accumulate the some totals we are interested in. +// Breaking this abstraction layer by having these names was the +// easiest way. 
+// +typedef struct pair_attr_s { + long size; // size PAIR's value takes in memory + long nonleaf_size; // size if PAIR is a nonleaf node, 0 otherwise, used only for engine status + long leaf_size; // size if PAIR is a leaf node, 0 otherwise, used only for engine status + long rollback_size; // size of PAIR is a rollback node, 0 otherwise, used only for engine status + long cache_pressure_size; // amount PAIR contributes to cache pressure, is sum of buffer sizes and workdone counts + bool is_valid; +} PAIR_ATTR; + +static inline PAIR_ATTR make_pair_attr(long size) { + PAIR_ATTR result={ + .size = size, + .nonleaf_size = 0, + .leaf_size = 0, + .rollback_size = 0, + .cache_pressure_size = 0, + .is_valid = true + }; + return result; +} + void toku_set_cleaner_period (CACHETABLE ct, uint32_t new_period); uint32_t toku_get_cleaner_period_unlocked (CACHETABLE ct); void toku_set_cleaner_iterations (CACHETABLE ct, uint32_t new_iterations); @@ -394,8 +430,9 @@ struct unlockers { bool locked; void (*f)(void* extra); void *extra; - UNLOCKERS next; + struct unlockers *next; }; +typedef struct unlockers *UNLOCKERS; // Effect: If the block is in the cachetable, then return it. // Otherwise call the functions in unlockers, fetch the data (but don't pin it, since we'll just end up pinning it again later), and return TOKUDB_TRY_AGAIN. 
diff --git a/ft/fifo.cc b/ft/fifo.cc index 07d7baec2a1..6acd29be67c 100644 --- a/ft/fifo.cc +++ b/ft/fifo.cc @@ -172,7 +172,7 @@ int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *d toku_fifo_resize(fifo, next_2); } struct fifo_entry *entry = (struct fifo_entry *)(fifo->memory + fifo->memory_used); - fifo_entry_set_msg_type(entry, type); + entry->type = (unsigned char) type; entry->msn = msn; xids_cpy(&entry->xids_s, xids); entry->is_fresh = is_fresh; diff --git a/ft/fifo.h b/ft/fifo.h index e9f53248b98..5333ca905a7 100644 --- a/ft/fifo.h +++ b/ft/fifo.h @@ -91,10 +91,10 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." -#include "fttypes.h" -#include "xids-internal.h" -#include "xids.h" - +#include "ft/fttypes.h" +#include "ft/xids-internal.h" +#include "ft/xids.h" +#include "ft/ft_msg.h" // If the fifo_entry is unpacked, the compiler aligns the xids array and we waste a lot of space struct __attribute__((__packed__)) fifo_entry { @@ -106,24 +106,6 @@ struct __attribute__((__packed__)) fifo_entry { XIDS_S xids_s; }; -// get and set the ft message type for a fifo entry. -// it is internally stored as a single unsigned char. 
-static inline enum ft_msg_type -fifo_entry_get_msg_type(const struct fifo_entry * entry) -{ - enum ft_msg_type msg_type; - msg_type = (enum ft_msg_type) entry->type; - return msg_type; -} - -static inline void -fifo_entry_set_msg_type(struct fifo_entry * entry, - enum ft_msg_type msg_type) -{ - unsigned char type = (unsigned char) msg_type; - entry->type = type; -} - typedef struct fifo *FIFO; int toku_fifo_create(FIFO *); @@ -150,7 +132,7 @@ void toku_fifo_iterate(FIFO, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,IT struct fifo_entry *e = toku_fifo_iterate_internal_get_entry(fifo, fifo_iterate_off); \ ITEMLEN keylenvar = e->keylen; \ ITEMLEN datalenvar = e->vallen; \ - enum ft_msg_type typevar = fifo_entry_get_msg_type(e); \ + enum ft_msg_type typevar = (enum ft_msg_type) e->type; \ MSN msnvar = e->msn; \ XIDS xidsvar = &e->xids_s; \ bytevec keyvar = xids_get_end_of_array(xidsvar); \ diff --git a/ft/ft-cachetable-wrappers.h b/ft/ft-cachetable-wrappers.h index 9a56f4ff220..eb33f7cfe05 100644 --- a/ft/ft-cachetable-wrappers.h +++ b/ft/ft-cachetable-wrappers.h @@ -92,8 +92,8 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." -#include -#include "cachetable.h" +#include "ft/ft-internal.h" +#include "ft/cachetable.h" /** * Put an empty node (that is, no fields filled) into the cachetable. diff --git a/ft/ft-flusher.h b/ft/ft-flusher.h index 0788bf665d3..0861669157a 100644 --- a/ft/ft-flusher.h +++ b/ft/ft-flusher.h @@ -91,8 +91,7 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." 
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." -// This must be first to make the 64-bit file mode work right in Linux -#include "fttypes.h" +#include "ft/ft-internal.h" typedef enum { FT_FLUSHER_CLEANER_TOTAL_NODES = 0, // total number of nodes whose buffers are potentially flushed by cleaner thread @@ -158,6 +157,23 @@ toku_ft_flush_node_on_background_thread( FTNODE parent ); +// Given pinned node and pinned child, split child into two +// and update node with information about its new child. +void toku_ft_split_child( + FT h, + FTNODE node, + int childnum, + FTNODE child, + enum split_mode split_mode + ); +// Given pinned node, merge childnum with a neighbor and update node with +// information about the change +void toku_ft_merge_child( + FT ft, + FTNODE node, + int childnum + ); + /** * Effect: Split a leaf node. * Argument "node" is node to be split. @@ -166,6 +182,7 @@ toku_ft_flush_node_on_background_thread( * nodea is the left node that results from the split * splitk is the right-most key of nodea */ +// TODO: Rename toku_ft_leaf_split void ftleaf_split( FT h, @@ -189,6 +206,7 @@ ftleaf_split( * but it does not guarantee that the resulting nodes are smaller than nodesize. */ void +// TODO: Rename toku_ft_nonleaf_split ft_nonleaf_split( FT h, FTNODE node, diff --git a/ft/ft-internal.h b/ft/ft-internal.h index 42d27638330..9d065a3f5de 100644 --- a/ft/ft-internal.h +++ b/ft/ft-internal.h @@ -115,7 +115,9 @@ PATENT RIGHTS GRANT: #include "compress.h" #include #include -#include "bndata.h" +#include "ft/bndata.h" +#include "ft/rollback.h" +#include "ft/ft-search.h" enum { KEY_VALUE_OVERHEAD = 8 }; /* Must store the two lengths. 
*/ enum { FT_MSG_OVERHEAD = (2 + sizeof(MSN)) }; // the type plus freshness plus MSN @@ -136,6 +138,18 @@ enum ftnode_fetch_type { ftnode_fetch_keymatch, // one child is needed if it holds both keys }; +enum split_mode { + SPLIT_EVENLY, + SPLIT_LEFT_HEAVY, + SPLIT_RIGHT_HEAVY +}; + +enum reactivity { + RE_STABLE, + RE_FUSIBLE, + RE_FISSIBLE +}; + static bool is_valid_ftnode_fetch_type(enum ftnode_fetch_type type) UU(); static bool is_valid_ftnode_fetch_type(enum ftnode_fetch_type type) { switch (type) { @@ -187,6 +201,7 @@ struct ftnode_fetch_extra { tokutime_t decompress_time; tokutime_t deserialize_time; }; +typedef struct ftnode_fetch_extra *FTNODE_FETCH_EXTRA; struct toku_fifo_entry_key_msn_heaviside_extra { DESCRIPTOR desc; @@ -573,6 +588,7 @@ struct ft { // is this ft a blackhole? if so, all messages are dropped. bool blackhole; }; +typedef struct ft *FT; // Allocate a DB struct off the stack and only set its comparison // descriptor. We don't bother setting any other fields because @@ -738,22 +754,6 @@ int toku_ftnode_cleaner_callback( void *ftnode_pv, BLOCKNUM blocknum, uint32_t f void toku_evict_bn_from_memory(FTNODE node, int childnum, FT h); BASEMENTNODE toku_detach_bn(FTNODE node, int childnum); -// Given pinned node and pinned child, split child into two -// and update node with information about its new child. 
-void toku_ft_split_child( - FT h, - FTNODE node, - int childnum, - FTNODE child, - enum split_mode split_mode - ); -// Given pinned node, merge childnum with a neighbor and update node with -// information about the change -void toku_ft_merge_child( - FT ft, - FTNODE node, - int childnum - ); static inline CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_node(FT h) { CACHETABLE_WRITE_CALLBACK wc; wc.flush_callback = toku_ftnode_flush_callback; @@ -766,27 +766,6 @@ static inline CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_node(FT h) { return wc; } -static const FTNODE null_ftnode=0; - -/* an ft cursor is represented as a kv pair in a tree */ -struct ft_cursor { - struct toku_list cursors_link; - FT_HANDLE ft_handle; - DBT key, val; // The key-value pair that the cursor currently points to - DBT range_lock_left_key, range_lock_right_key; - bool prefetching; - bool left_is_neg_infty, right_is_pos_infty; - bool is_snapshot_read; // true if query is read_committed, false otherwise - bool is_leaf_mode; - bool disable_prefetching; - bool is_temporary; - int out_of_range_error; - int direction; - TOKUTXN ttxn; - FT_CHECK_INTERRUPT_CALLBACK interrupt_cb; - void *interrupt_cb_extra; -}; - // // Helper function to fill a ftnode_fetch_extra with data // that will tell the fetch callback that the entire node is @@ -922,43 +901,22 @@ static inline void destroy_bfe_for_prefetch(struct ftnode_fetch_extra *bfe) { } // this is in a strange place because it needs the cursor struct to be defined -static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe, - FT h, - FT_CURSOR c) { - paranoid_invariant(h->h->type == FT_CURRENT); - bfe->type = ftnode_fetch_prefetch; - bfe->h = h; - bfe->search = NULL; - toku_init_dbt(&bfe->range_lock_left_key); - toku_init_dbt(&bfe->range_lock_right_key); - const DBT *left = &c->range_lock_left_key; - if (left->data) { - toku_clone_dbt(&bfe->range_lock_left_key, *left); - } - const DBT *right = &c->range_lock_right_key; - if 
(right->data) { - toku_clone_dbt(&bfe->range_lock_right_key, *right); - } - bfe->left_is_neg_infty = c->left_is_neg_infty; - bfe->right_is_pos_infty = c->right_is_pos_infty; - bfe->child_to_read = -1; - bfe->disable_prefetching = c->disable_prefetching; - bfe->read_all_partitions = false; - bfe->bytes_read = 0; - bfe->io_time = 0; - bfe->deserialize_time = 0; - bfe->decompress_time = 0; -} +void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe, + FT h, + struct ft_cursor *c); struct ancestors { FTNODE node; // This is the root node if next is NULL. int childnum; // which buffer holds messages destined to the node whose ancestors this list represents. - ANCESTORS next; // Parent of this node (so next->node.(next->childnum) refers to this node). + struct ancestors *next; // Parent of this node (so next->node.(next->childnum) refers to this node). }; +typedef struct ancestors *ANCESTORS; + struct pivot_bounds { const DBT * const lower_bound_exclusive; const DBT * const upper_bound_inclusive; // NULL to indicate negative or positive infinity (which are in practice exclusive since there are now transfinite keys in messages). 
}; +typedef struct pivot_bounds const * const PIVOT_BOUNDS; __attribute__((nonnull)) void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node); diff --git a/ft/ft-ops.cc b/ft/ft-ops.cc index ab7de1a0a2c..dacb7c15448 100644 --- a/ft/ft-ops.cc +++ b/ft/ft-ops.cc @@ -419,6 +419,7 @@ toku_ft_get_status(FT_STATUS s) { } \ } while (0) + void toku_note_deserialized_basement_node(bool fixed_key_size) { if (fixed_key_size) { STATUS_INC(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, 1); @@ -4397,7 +4398,7 @@ do_bn_apply_msg(FT_HANDLE t, BASEMENTNODE bn, struct fifo_entry *entry, txn_gc_i if (entry->msn.msn > bn->max_msn_applied.msn) { ITEMLEN keylen = entry->keylen; ITEMLEN vallen = entry->vallen; - enum ft_msg_type type = fifo_entry_get_msg_type(entry); + enum ft_msg_type type = (enum ft_msg_type) entry->type; MSN msn = entry->msn; const XIDS xids = (XIDS) &entry->xids_s; bytevec key = xids_get_end_of_array(xids); @@ -5129,6 +5130,34 @@ ftnode_pf_callback_and_free_bfe(void *ftnode_pv, void* disk_data, void *read_ext return r; } +void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe, + FT h, + struct ft_cursor *c) { + paranoid_invariant(h->h->type == FT_CURRENT); + bfe->type = ftnode_fetch_prefetch; + bfe->h = h; + bfe->search = NULL; + toku_init_dbt(&bfe->range_lock_left_key); + toku_init_dbt(&bfe->range_lock_right_key); + const DBT *left = &c->range_lock_left_key; + if (left->data) { + toku_clone_dbt(&bfe->range_lock_left_key, *left); + } + const DBT *right = &c->range_lock_right_key; + if (right->data) { + toku_clone_dbt(&bfe->range_lock_right_key, *right); + } + bfe->left_is_neg_infty = c->left_is_neg_infty; + bfe->right_is_pos_infty = c->right_is_pos_infty; + bfe->child_to_read = -1; + bfe->disable_prefetching = c->disable_prefetching; + bfe->read_all_partitions = false; + bfe->bytes_read = 0; + bfe->io_time = 0; + bfe->deserialize_time = 0; + bfe->decompress_time = 0; +} + static void ft_node_maybe_prefetch(FT_HANDLE ft_handle, FTNODE node, int childnum, FT_CURSOR 
ftcursor, bool *doprefetch) { // the number of nodes to prefetch diff --git a/ft/ft-ops.h b/ft/ft-ops.h index b482d2b8206..b00eefe2f6d 100644 --- a/ft/ft-ops.h +++ b/ft/ft-ops.h @@ -100,6 +100,8 @@ PATENT RIGHTS GRANT: #include "log.h" #include "ft-search.h" #include "compress.h" +#include "ft_msg.h" +#include "ft/cursor.h" // A callback function is invoked with the key, and the data. // The pointers (to the bytevecs) must not be modified. The data must be copied out before the callback function returns. @@ -114,8 +116,6 @@ PATENT RIGHTS GRANT: // When lock_only is true, the callback only does optional lock tree locking. typedef int(*FT_GET_CALLBACK_FUNCTION)(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only); -typedef bool(*FT_CHECK_INTERRUPT_CALLBACK)(void* extra); - int toku_open_ft_handle (const char *fname, int is_create, FT_HANDLE *, int nodesize, int basementnodesize, enum toku_compression_method compression_method, CACHETABLE, TOKUTXN, int(*)(DB *,const DBT*,const DBT*)) __attribute__ ((warn_unused_result)); // effect: changes the descriptor for the ft of the given handle. 
@@ -246,6 +246,7 @@ void toku_ft_maybe_delete (FT_HANDLE ft_h, DBT *k, TOKUTXN txn, bool oplsn_valid TXNID toku_ft_get_oldest_referenced_xid_estimate(FT_HANDLE ft_h); TXN_MANAGER toku_ft_get_txn_manager(FT_HANDLE ft_h); +class txn_gc_info; void toku_ft_send_insert(FT_HANDLE ft_h, DBT *key, DBT *val, XIDS xids, enum ft_msg_type type, txn_gc_info *gc_info); void toku_ft_send_delete(FT_HANDLE ft_h, DBT *key, XIDS xids, txn_gc_info *gc_info); void toku_ft_send_commit_any(FT_HANDLE ft_h, DBT *key, XIDS xids, txn_gc_info *gc_info); @@ -258,7 +259,6 @@ extern int toku_ft_debug_mode; int toku_verify_ft (FT_HANDLE ft_h) __attribute__ ((warn_unused_result)); int toku_verify_ft_with_progress (FT_HANDLE ft_h, int (*progress_callback)(void *extra, float progress), void *extra, int verbose, int keep_going) __attribute__ ((warn_unused_result)); -typedef struct ft_cursor *FT_CURSOR; int toku_ft_cursor (FT_HANDLE, FT_CURSOR*, TOKUTXN, bool, bool) __attribute__ ((warn_unused_result)); void toku_ft_cursor_set_leaf_mode(FT_CURSOR); // Sets a boolean on the ft cursor that prevents uncessary copying of diff --git a/ft/ft-search.h b/ft/ft-search.h index 9c26be456de..8e8fece6a3c 100644 --- a/ft/ft-search.h +++ b/ft/ft-search.h @@ -92,6 +92,7 @@ PATENT RIGHTS GRANT: #ifndef FT_SEARCH_H #define FT_SEARCH_H +#include "ft/ybt.h" enum ft_search_direction_e { FT_SEARCH_LEFT = 1, /* search left -> right, finds min xy as defined by the compare function */ diff --git a/ft/ft_msg.h b/ft/ft_msg.h index f468d7f647b..2f996c6558f 100644 --- a/ft/ft_msg.h +++ b/ft/ft_msg.h @@ -97,6 +97,107 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." 
+/* tree command types */ +enum ft_msg_type { + FT_NONE = 0, + FT_INSERT = 1, + FT_DELETE_ANY = 2, // Delete any matching key. This used to be called FT_DELETE. + //FT_DELETE_BOTH = 3, + FT_ABORT_ANY = 4, // Abort any commands on any matching key. + //FT_ABORT_BOTH = 5, // Abort commands that match both the key and the value + FT_COMMIT_ANY = 6, + //FT_COMMIT_BOTH = 7, + FT_COMMIT_BROADCAST_ALL = 8, // Broadcast to all leafentries, (commit all transactions). + FT_COMMIT_BROADCAST_TXN = 9, // Broadcast to all leafentries, (commit specific transaction). + FT_ABORT_BROADCAST_TXN = 10, // Broadcast to all leafentries, (commit specific transaction). + FT_INSERT_NO_OVERWRITE = 11, + FT_OPTIMIZE = 12, // Broadcast + FT_OPTIMIZE_FOR_UPGRADE = 13, // same as FT_OPTIMIZE, but record version number in leafnode + FT_UPDATE = 14, + FT_UPDATE_BROADCAST_ALL = 15 +}; + +static inline bool +ft_msg_type_applies_once(enum ft_msg_type type) +{ + bool ret_val; + switch (type) { + case FT_INSERT_NO_OVERWRITE: + case FT_INSERT: + case FT_DELETE_ANY: + case FT_ABORT_ANY: + case FT_COMMIT_ANY: + case FT_UPDATE: + ret_val = true; + break; + case FT_COMMIT_BROADCAST_ALL: + case FT_COMMIT_BROADCAST_TXN: + case FT_ABORT_BROADCAST_TXN: + case FT_OPTIMIZE: + case FT_OPTIMIZE_FOR_UPGRADE: + case FT_UPDATE_BROADCAST_ALL: + case FT_NONE: + ret_val = false; + break; + default: + assert(false); + } + return ret_val; +} + +static inline bool +ft_msg_type_applies_all(enum ft_msg_type type) +{ + bool ret_val; + switch (type) { + case FT_NONE: + case FT_INSERT_NO_OVERWRITE: + case FT_INSERT: + case FT_DELETE_ANY: + case FT_ABORT_ANY: + case FT_COMMIT_ANY: + case FT_UPDATE: + ret_val = false; + break; + case FT_COMMIT_BROADCAST_ALL: + case FT_COMMIT_BROADCAST_TXN: + case FT_ABORT_BROADCAST_TXN: + case FT_OPTIMIZE: + case FT_OPTIMIZE_FOR_UPGRADE: + case FT_UPDATE_BROADCAST_ALL: + ret_val = true; + break; + default: + assert(false); + } + return ret_val; +} + +static inline bool 
+ft_msg_type_does_nothing(enum ft_msg_type type) +{ + return (type == FT_NONE); +} + +typedef struct xids_t *XIDS; + +/* tree commands */ +struct ft_msg { + enum ft_msg_type type; + MSN msn; // message sequence number + XIDS xids; + union { + /* insert or delete */ + struct ft_msg_insert_delete { + const DBT *key; // for insert, delete, upsertdel + const DBT *val; // for insert, delete, (and it is the "extra" for upsertdel, upsertdel_broadcast_all) + } id; + } u; +}; + +// Message sent into the ft to implement insert, delete, update, etc +typedef struct ft_msg FT_MSG_S; +typedef struct ft_msg *FT_MSG; uint32_t ft_msg_get_keylen(FT_MSG ft_msg); @@ -104,21 +205,10 @@ uint32_t ft_msg_get_vallen(FT_MSG ft_msg); XIDS ft_msg_get_xids(FT_MSG ft_msg); -void * ft_msg_get_key(FT_MSG ft_msg); +void *ft_msg_get_key(FT_MSG ft_msg); -void * ft_msg_get_val(FT_MSG ft_msg); +void *ft_msg_get_val(FT_MSG ft_msg); enum ft_msg_type ft_msg_get_type(FT_MSG ft_msg); -void ft_msg_from_fifo_msg(FT_MSG ft_msg, FIFO_MSG fifo_msg); - -#if 0 - -void ft_msg_from_dbts(FT_MSG ft_msg, DBT *key, DBT *val, XIDS xids, enum ft_msg_type type); - #endif - - - -#endif // FT_MSG_H - diff --git a/ft/ft_node-serialize.cc b/ft/ft_node-serialize.cc index 1090bca6ca0..13512be6e50 100644 --- a/ft/ft_node-serialize.cc +++ b/ft/ft_node-serialize.cc @@ -98,6 +98,8 @@ PATENT RIGHTS GRANT: #include "ft.h" #include #include +#include "ft/cachetable.h" +#include "ft/rollback.h" static FT_UPGRADE_STATUS_S ft_upgrade_status; diff --git a/ft/ftloader.h b/ft/ftloader.h index c920b4c5362..ab78af34ea2 100644 --- a/ft/ftloader.h +++ b/ft/ftloader.h @@ -92,6 +92,8 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 
11/760379 and to the patents and/or patent applications resulting from it." +#include "ft/cachetable.h" + // The loader callbacks are C functions and need to be defined as such typedef void (*ft_loader_error_func)(DB *, int which_db, int err, DBT *key, DBT *val, void *extra); diff --git a/ft/fttypes.h b/ft/fttypes.h index 73e228cf6ff..c176eb24896 100644 --- a/ft/fttypes.h +++ b/ft/fttypes.h @@ -115,9 +115,6 @@ typedef struct ft *FT; typedef struct ft_header *FT_HEADER; typedef struct ft_options *FT_OPTIONS; -struct wbuf; -struct dbuf; - typedef unsigned int ITEMLEN; typedef const void *bytevec; @@ -143,36 +140,6 @@ typedef TOKU_XA_XID *XIDP; // this is the type that's passed to the logger code static inline BLOCKNUM make_blocknum(int64_t b) { BLOCKNUM result={b}; return result; } -// This struct hold information about values stored in the cachetable. -// As one can tell from the names, we are probably violating an -// abstraction layer by placing names. -// -// The purpose of having this struct is to have a way for the -// cachetable to accumulate the some totals we are interested in. -// Breaking this abstraction layer by having these names was the -// easiest way. 
-// -typedef struct pair_attr_s { - long size; // size PAIR's value takes in memory - long nonleaf_size; // size if PAIR is a nonleaf node, 0 otherwise, used only for engine status - long leaf_size; // size if PAIR is a leaf node, 0 otherwise, used only for engine status - long rollback_size; // size of PAIR is a rollback node, 0 otherwise, used only for engine status - long cache_pressure_size; // amount PAIR contributes to cache pressure, is sum of buffer sizes and workdone counts - bool is_valid; -} PAIR_ATTR; - -static inline PAIR_ATTR make_pair_attr(long size) { - PAIR_ATTR result={ - .size = size, - .nonleaf_size = 0, - .leaf_size = 0, - .rollback_size = 0, - .cache_pressure_size = 0, - .is_valid = true - }; - return result; -} - typedef struct { uint32_t len; char *data; @@ -221,162 +188,14 @@ typedef struct tokutxn *TOKUTXN; typedef struct txninfo *TXNINFO; #define NULL_TXN ((TOKUTXN)0) -struct logged_btt_pair { - DISKOFF off; - int32_t size; -}; - -typedef struct cachetable *CACHETABLE; -typedef struct cachefile *CACHEFILE; -typedef struct ctpair *PAIR; -typedef class checkpointer *CHECKPOINTER; -class bn_data; - -/* tree command types */ -enum ft_msg_type { - FT_NONE = 0, - FT_INSERT = 1, - FT_DELETE_ANY = 2, // Delete any matching key. This used to be called FT_DELETE. - //FT_DELETE_BOTH = 3, - FT_ABORT_ANY = 4, // Abort any commands on any matching key. - //FT_ABORT_BOTH = 5, // Abort commands that match both the key and the value - FT_COMMIT_ANY = 6, - //FT_COMMIT_BOTH = 7, - FT_COMMIT_BROADCAST_ALL = 8, // Broadcast to all leafentries, (commit all transactions). - FT_COMMIT_BROADCAST_TXN = 9, // Broadcast to all leafentries, (commit specific transaction). - FT_ABORT_BROADCAST_TXN = 10, // Broadcast to all leafentries, (commit specific transaction). 
- FT_INSERT_NO_OVERWRITE = 11, - FT_OPTIMIZE = 12, // Broadcast - FT_OPTIMIZE_FOR_UPGRADE = 13, // same as FT_OPTIMIZE, but record version number in leafnode - FT_UPDATE = 14, - FT_UPDATE_BROADCAST_ALL = 15 -}; - -static inline bool -ft_msg_type_applies_once(enum ft_msg_type type) -{ - bool ret_val; - switch (type) { - case FT_INSERT_NO_OVERWRITE: - case FT_INSERT: - case FT_DELETE_ANY: - case FT_ABORT_ANY: - case FT_COMMIT_ANY: - case FT_UPDATE: - ret_val = true; - break; - case FT_COMMIT_BROADCAST_ALL: - case FT_COMMIT_BROADCAST_TXN: - case FT_ABORT_BROADCAST_TXN: - case FT_OPTIMIZE: - case FT_OPTIMIZE_FOR_UPGRADE: - case FT_UPDATE_BROADCAST_ALL: - case FT_NONE: - ret_val = false; - break; - default: - assert(false); - } - return ret_val; -} - -static inline bool -ft_msg_type_applies_all(enum ft_msg_type type) -{ - bool ret_val; - switch (type) { - case FT_NONE: - case FT_INSERT_NO_OVERWRITE: - case FT_INSERT: - case FT_DELETE_ANY: - case FT_ABORT_ANY: - case FT_COMMIT_ANY: - case FT_UPDATE: - ret_val = false; - break; - case FT_COMMIT_BROADCAST_ALL: - case FT_COMMIT_BROADCAST_TXN: - case FT_ABORT_BROADCAST_TXN: - case FT_OPTIMIZE: - case FT_OPTIMIZE_FOR_UPGRADE: - case FT_UPDATE_BROADCAST_ALL: - ret_val = true; - break; - default: - assert(false); - } - return ret_val; -} - -static inline bool -ft_msg_type_does_nothing(enum ft_msg_type type) -{ - return (type == FT_NONE); -} - typedef struct xids_t *XIDS; -typedef struct fifo_msg_t *FIFO_MSG; -/* tree commands */ -struct ft_msg { - enum ft_msg_type type; - MSN msn; // message sequence number - XIDS xids; - union { - /* insert or delete */ - struct ft_msg_insert_delete { - const DBT *key; // for insert, delete, upsertdel - const DBT *val; // for insert, delete, (and it is the "extra" for upsertdel, upsertdel_broadcast_all) - } id; - } u; -}; - -// Message sent into the ft to implement insert, delete, update, etc -typedef struct ft_msg FT_MSG_S; -typedef struct ft_msg *FT_MSG; typedef int (*ft_compare_func)(DB *, 
const DBT *, const DBT *); typedef void (*setval_func)(const DBT *, void *); typedef int (*ft_update_func)(DB *, const DBT *, const DBT *, const DBT *, setval_func, void *); -typedef void (*on_redirect_callback)(FT_HANDLE, void*); typedef void (*remove_ft_ref_callback)(FT, void*); +typedef void (*on_redirect_callback)(FT_HANDLE, void*); #define UU(x) x __attribute__((__unused__)) -typedef struct memarena *MEMARENA; -typedef struct rollback_log_node *ROLLBACK_LOG_NODE; -typedef struct serialized_rollback_log_node *SERIALIZED_ROLLBACK_LOG_NODE; - -// -// Types of snapshots that can be taken by a tokutxn -// - TXN_SNAPSHOT_NONE: means that there is no snapshot. Reads do not use snapshot reads. -// used for SERIALIZABLE and READ UNCOMMITTED -// - TXN_SNAPSHOT_ROOT: means that all tokutxns use their root transaction's snapshot -// used for REPEATABLE READ -// - TXN_SNAPSHOT_CHILD: means that each child tokutxn creates its own snapshot -// used for READ COMMITTED -// - -typedef enum __TXN_SNAPSHOT_TYPE { - TXN_SNAPSHOT_NONE=0, - TXN_SNAPSHOT_ROOT=1, - TXN_SNAPSHOT_CHILD=2 -} TXN_SNAPSHOT_TYPE; - -typedef struct ancestors *ANCESTORS; -typedef struct pivot_bounds const * const PIVOT_BOUNDS; -typedef struct ftnode_fetch_extra *FTNODE_FETCH_EXTRA; -typedef struct unlockers *UNLOCKERS; - -enum reactivity { - RE_STABLE, - RE_FUSIBLE, - RE_FISSIBLE -}; - -enum split_mode { - SPLIT_EVENLY, - SPLIT_LEFT_HEAVY, - SPLIT_RIGHT_HEAVY -}; - #endif diff --git a/ft/le-cursor.cc b/ft/le-cursor.cc index b08fc62632c..3eb73f1345b 100644 --- a/ft/le-cursor.cc +++ b/ft/le-cursor.cc @@ -89,9 +89,10 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 
11/760379 and to the patents and/or patent applications resulting from it." -#include "ft.h" -#include "ft-internal.h" -#include "le-cursor.h" +#include "ft/ft.h" +#include "ft/ft-internal.h" +#include "ft/le-cursor.h" +#include "ft/cursor.h" // A LE_CURSOR is a special purpose FT_CURSOR that: // - enables prefetching diff --git a/ft/leafentry.h b/ft/leafentry.h index 5c525db5c19..e3c29b6e224 100644 --- a/ft/leafentry.h +++ b/ft/leafentry.h @@ -98,8 +98,9 @@ PATENT RIGHTS GRANT: #include #include -#include "txn_manager.h" -#include "rbuf.h" +#include "ft/txn_manager.h" +#include "ft/rbuf.h" +#include "ft/ft_msg.h" /* Memory format of packed leaf entry @@ -241,6 +242,8 @@ toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry, // NULL if there was no stored size_t *new_leafentry_memorysize, LEAFENTRY *new_leafentry_p); +class bn_data; + void toku_le_apply_msg(FT_MSG msg, LEAFENTRY old_leafentry, // NULL if there was no stored data. diff --git a/ft/log-internal.h b/ft/log-internal.h index be8ab7a53da..cd191571a67 100644 --- a/ft/log-internal.h +++ b/ft/log-internal.h @@ -119,6 +119,7 @@ using namespace toku; #define LOGGER_MIN_BUF_SIZE (1<<24) +// TODO: Remove mylock, it has no value struct mylock { toku_mutex_t lock; }; @@ -283,6 +284,7 @@ struct tokutxn { // txn to not transition to commit or abort uint64_t client_id; }; +typedef struct tokutxn *TOKUTXN; static inline int txn_has_current_rollback_log(TOKUTXN txn) { @@ -305,6 +307,10 @@ struct txninfo { BLOCKNUM spilled_rollback_tail; BLOCKNUM current_rollback; }; +typedef struct txninfo *TXNINFO; + +// TODO: Remove null txn +#define NULL_TXN ((TOKUTXN)0) static inline int toku_logsizeof_uint8_t (uint32_t v __attribute__((__unused__))) { return 1; diff --git a/ft/logger.h b/ft/logger.h index 6488ec0707d..11731fb5b97 100644 --- a/ft/logger.h +++ b/ft/logger.h @@ -92,8 +92,9 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." 
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." -#include "fttypes.h" -#include "ft_layout_version.h" +#include "ft/fttypes.h" +#include "ft/ft-internal.h" +#include "ft/ft_layout_version.h" enum { TOKU_LOG_VERSION_1 = 1, diff --git a/ft/rollback.h b/ft/rollback.h index 2e9493b0e6b..e9cb528b7a9 100644 --- a/ft/rollback.h +++ b/ft/rollback.h @@ -92,8 +92,13 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." 
-#include -#include "sub_block.h" +#include "ft/sub_block.h" +#include "ft/cachetable.h" + +#include "util/memarena.h" + +typedef struct rollback_log_node *ROLLBACK_LOG_NODE; +typedef struct serialized_rollback_log_node *SERIALIZED_ROLLBACK_LOG_NODE; void toku_poll_txn_progress_function(TOKUTXN txn, uint8_t is_commit, uint8_t stall_for_checkpoint); @@ -172,6 +177,7 @@ struct serialized_rollback_log_node { BLOCKNUM blocknum; struct sub_block sub_block[max_sub_blocks]; }; +typedef struct serialized_rollback_log_node *SERIALIZED_ROLLBACK_LOG_NODE; static inline void toku_static_serialized_rollback_log_destroy(SERIALIZED_ROLLBACK_LOG_NODE log) { diff --git a/ft/sub_block.h b/ft/sub_block.h index 23fad83c966..d00df6fa51a 100644 --- a/ft/sub_block.h +++ b/ft/sub_block.h @@ -112,6 +112,7 @@ struct sub_block { uint32_t xsum; // sub block checksum }; +typedef struct sub_block *SUB_BLOCK; struct stored_sub_block { uint32_t uncompressed_size; diff --git a/ft/txn_manager.h b/ft/txn_manager.h index 12267297a0e..58d7555dc05 100644 --- a/ft/txn_manager.h +++ b/ft/txn_manager.h @@ -123,6 +123,7 @@ struct txn_manager { TXNID last_xid_seen_for_recover; TXNID last_calculated_oldest_referenced_xid; }; +typedef struct txn_manager *TXN_MANAGER; struct txn_manager_state { txn_manager_state(TXN_MANAGER mgr) : @@ -189,6 +190,22 @@ TXNID toku_txn_manager_get_oldest_living_xid(TXN_MANAGER txn_manager); TXNID toku_txn_manager_get_oldest_referenced_xid_estimate(TXN_MANAGER txn_manager); +// +// Types of snapshots that can be taken by a tokutxn +// - TXN_SNAPSHOT_NONE: means that there is no snapshot. Reads do not use snapshot reads. 
+// used for SERIALIZABLE and READ UNCOMMITTED +// - TXN_SNAPSHOT_ROOT: means that all tokutxns use their root transaction's snapshot +// used for REPEATABLE READ +// - TXN_SNAPSHOT_CHILD: means that each child tokutxn creates its own snapshot +// used for READ COMMITTED +// + +typedef enum __TXN_SNAPSHOT_TYPE { + TXN_SNAPSHOT_NONE=0, + TXN_SNAPSHOT_ROOT=1, + TXN_SNAPSHOT_CHILD=2 +} TXN_SNAPSHOT_TYPE; + void toku_txn_manager_handle_snapshot_create_for_child_txn( TOKUTXN txn, TXN_MANAGER txn_manager, From 4680500514b424c4fa142a47a0d0c592c268d120 Mon Sep 17 00:00:00 2001 From: John Esmet Date: Fri, 23 May 2014 18:13:32 -0400 Subject: [PATCH 23/46] TMX-1 Rename TokuKV to TokuFT --- CMakeLists.txt | 2 +- README.md | 30 +++++++++++++++--------------- buildheader/CMakeLists.txt | 4 ++-- cmake_modules/TokuThirdParty.cmake | 2 +- examples/CMakeLists.txt | 2 +- ft/CMakeLists.txt | 2 +- portability/CMakeLists.txt | 8 ++++---- src/CMakeLists.txt | 4 ++-- util/CMakeLists.txt | 2 +- 9 files changed, 28 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a5a9713b4e..6d2b8773e6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,7 +83,7 @@ add_subdirectory(examples) install( FILES README.md README-TOKUDB DESTINATION . - COMPONENT tokukv_misc + COMPONENT tokuft_misc ) ## build tags diff --git a/README.md b/README.md index 72b8988165a..12d6e6e4eed 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@ -TokuKV +TokuFT ====== -TokuKV is a high-performance, transactional key-value store, used in the +TokuFT is a high-performance, transactional key-value store, used in the TokuDB storage engine for MySQL and MariaDB and in TokuMX, the high-performance MongoDB distribution. -TokuKV is provided as a shared library with an interface similar to +TokuFT is provided as a shared library with an interface similar to Berkeley DB. To build the full MySQL product, see the instructions for [Tokutek/ft-engine][ft-engine]. 
To build TokuMX, see the instructions -for [Tokutek/mongo][mongo]. This document covers TokuKV only. +for [Tokutek/mongo][mongo]. This document covers TokuFT only. [ft-engine]: https://github.com/Tokutek/ft-engine [mongo]: https://github.com/Tokutek/mongo @@ -19,7 +19,7 @@ for [Tokutek/mongo][mongo]. This document covers TokuKV only. Building -------- -TokuKV is built using CMake >= 2.8.9. Out-of-source builds are +TokuFT is built using CMake >= 2.8.9. Out-of-source builds are recommended. You need a C++11 compiler, though only GCC >= 4.7 and Apple's Clang are tested. You also need zlib development packages (`yum install zlib-devel` or `apt-get install zlib1g-dev`). @@ -50,14 +50,14 @@ to that if you are planning to run benchmarks or in production. ### Platforms -TokuKV is supported on 64-bit Centos, should work on other 64-bit linux -distributions, and may work on OSX 10.8 and FreeBSD. TokuKV is not +TokuFT is supported on 64-bit Centos, should work on other 64-bit linux +distributions, and may work on OSX 10.8 and FreeBSD. TokuFT is not supported on 32-bit systems. [Transparent hugepages][transparent-hugepages] is a feature in newer linux kernel versions that causes problems for the memory usage tracking -calculations in TokuKV and can lead to memory overcommit. If you have -this feature enabled, TokuKV will not start, and you should turn it off. +calculations in TokuFT and can lead to memory overcommit. If you have +this feature enabled, TokuFT will not start, and you should turn it off. If you want to run with transparent hugepages on, you can set an environment variable `TOKU_HUGE_PAGES_OK=1`, but only do this for testing, and only with a small cache size. @@ -68,23 +68,23 @@ and only with a small cache size. Examples -------- -There are some sample programs that can use either TokuKV or Berkeley DB +There are some sample programs that can use either TokuFT or Berkeley DB in the `examples/` directory. 
Follow the above instructions to build and -install TokuKV, and then look in the installed `examples/` directory for +install TokuFT, and then look in the installed `examples/` directory for instructions on building and running them. Testing ------- -TokuKV uses CTest for testing. The CDash testing dashboard is not +TokuFT uses CTest for testing. The CDash testing dashboard is not currently public, but you can run the tests without submitting them. There are some large data files not stored in the git repository, that will be made available soon. For now, the tests that use these files will not run. -Many of the tests are linked with both TokuKV and Berkeley DB, as a sanity +Many of the tests are linked with both TokuFT and Berkeley DB, as a sanity check on the tests themselves. To build these tests, you will need Berkeley DB and its header files installed. If you do not have Berkeley DB installed, just don't pass `USE_BDB=ON`. @@ -103,7 +103,7 @@ ctest -D ExperimentalStart \ Contributing ------------ -Please report bugs in TokuKV here on github. +Please report bugs in TokuFT here on github. We have two publicly accessible mailing lists for TokuDB: @@ -125,7 +125,7 @@ We are also available on IRC on freenode.net, in the #tokutek channel. License ------- -TokuKV is available under the GPL version 2, with slight modifications. +TokuFT is available under the GPL version 2, with slight modifications. See [README-TOKUDB][license]. 
[license]: http://github.com/Tokutek/ft-index/blob/master/README-TOKUDB diff --git a/buildheader/CMakeLists.txt b/buildheader/CMakeLists.txt index d9629f2cc68..68d7ddb5412 100644 --- a/buildheader/CMakeLists.txt +++ b/buildheader/CMakeLists.txt @@ -19,11 +19,11 @@ if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) FILES "${CMAKE_CURRENT_BINARY_DIR}/db.h" DESTINATION include RENAME tokudb.h - COMPONENT tokukv_headers + COMPONENT tokuft_headers ) install( FILES "${CMAKE_CURRENT_BINARY_DIR}/db.h" DESTINATION include - COMPONENT tokukv_headers + COMPONENT tokuft_headers ) endif () \ No newline at end of file diff --git a/cmake_modules/TokuThirdParty.cmake b/cmake_modules/TokuThirdParty.cmake index 0e8be69bf37..4aefb569f6c 100644 --- a/cmake_modules/TokuThirdParty.cmake +++ b/cmake_modules/TokuThirdParty.cmake @@ -29,7 +29,7 @@ if (NOT DEFINED LIBJEMALLOC) # detect when we are being built as a subproject if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib" DESTINATION . 
- COMPONENT tokukv_libs_extra) + COMPONENT tokuft_libs_extra) endif () endif () endif () diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 01ad01aa8d2..9e2a35143ff 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -11,6 +11,6 @@ if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) DESTINATION examples COMPONENT - tokukv_examples + tokuft_examples ) endif () \ No newline at end of file diff --git a/ft/CMakeLists.txt b/ft/CMakeLists.txt index 95d7866cb9d..07cc4e3fbe4 100644 --- a/ft/CMakeLists.txt +++ b/ft/CMakeLists.txt @@ -102,7 +102,7 @@ target_link_libraries(ftverify m) install( TARGETS tokuftdump DESTINATION bin - COMPONENT tokukv_tools + COMPONENT tokuft_tools ) add_subdirectory(tests) diff --git a/portability/CMakeLists.txt b/portability/CMakeLists.txt index 93dcf1d1675..c690a9aee50 100644 --- a/portability/CMakeLists.txt +++ b/portability/CMakeLists.txt @@ -38,22 +38,22 @@ if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) install( FILES toku_os_types.h toku_time.h toku_list.h toku_os.h DESTINATION include - COMPONENT tokukv_headers + COMPONENT tokuft_headers ) install( FILES "${CMAKE_CURRENT_BINARY_DIR}/toku_config.h" DESTINATION include - COMPONENT tokukv_headers + COMPONENT tokuft_headers ) install( TARGETS ${LIBTOKUPORTABILITY}_static DESTINATION ${INSTALL_LIBDIR} - COMPONENT tokukv_libs_static + COMPONENT tokuft_libs_static ) install( TARGETS ${LIBTOKUPORTABILITY} DESTINATION ${INSTALL_LIBDIR} - COMPONENT tokukv_libs_shared + COMPONENT tokuft_libs_shared ) endif () diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 28e51c3f8fe..284d798ea87 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -44,12 +44,12 @@ if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) install( TARGETS ${LIBTOKUDB} DESTINATION ${INSTALL_LIBDIR} - COMPONENT tokukv_libs_shared + COMPONENT tokuft_libs_shared ) install( TARGETS ${LIBTOKUDB}_static DESTINATION ${INSTALL_LIBDIR} - COMPONENT tokukv_libs_static + COMPONENT tokuft_libs_static ) endif () 
diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index 6a0bb6208a5..decbcff8e85 100644 --- a/util/CMakeLists.txt +++ b/util/CMakeLists.txt @@ -24,7 +24,7 @@ if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) install( FILES partitioned_counter.h DESTINATION include - COMPONENT tokukv_headers + COMPONENT tokuft_headers ) endif () From f66c44230fdc98a56a27c156ed8f919192623923 Mon Sep 17 00:00:00 2001 From: John Esmet Date: Fri, 23 May 2014 18:13:59 -0400 Subject: [PATCH 24/46] TMX-242 Add cursor.h, which missed the last commit --- ft/cursor.h | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 ft/cursor.h diff --git a/ft/cursor.h b/ft/cursor.h new file mode 100644 index 00000000000..e8417697dfc --- /dev/null +++ b/ft/cursor.h @@ -0,0 +1,115 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: + +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). + + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2014 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. 
+ + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. 
+*/ + +#pragma once + +#include + +#include "portability/toku_list.h" + +typedef bool(*FT_CHECK_INTERRUPT_CALLBACK)(void* extra); + +/* an ft cursor is represented as a kv pair in a tree */ +struct ft_cursor { + struct toku_list cursors_link; + FT_HANDLE ft_handle; + DBT key, val; // The key-value pair that the cursor currently points to + DBT range_lock_left_key, range_lock_right_key; + bool prefetching; + bool left_is_neg_infty, right_is_pos_infty; + bool is_snapshot_read; // true if query is read_committed, false otherwise + bool is_leaf_mode; + bool disable_prefetching; + bool is_temporary; + int out_of_range_error; + int direction; + TOKUTXN ttxn; + FT_CHECK_INTERRUPT_CALLBACK interrupt_cb; + void *interrupt_cb_extra; +}; +typedef struct ft_cursor *FT_CURSOR; From 17c6b1650cc6b8249ca23dec7659f0bd0fa4853d Mon Sep 17 00:00:00 2001 From: John Esmet Date: Fri, 23 May 2014 18:29:09 -0400 Subject: [PATCH 25/46] Revert "TMX-242 Add cursor.h, which missed the last commit" We will wait until after TokuMX 1.5.0 for these changes This reverts commit 917a23bd59d2e2e2e35991e3266e284072489b5c. 
--- ft/cursor.h | 115 ---------------------------------------------------- 1 file changed, 115 deletions(-) delete mode 100644 ft/cursor.h diff --git a/ft/cursor.h b/ft/cursor.h deleted file mode 100644 index e8417697dfc..00000000000 --- a/ft/cursor.h +++ /dev/null @@ -1,115 +0,0 @@ -/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ -// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: - -/* -COPYING CONDITIONS NOTICE: - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation, and provided that the - following conditions are met: - - * Redistributions of source code must retain this COPYING - CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the - DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the - PATENT MARKING NOTICE (below), and the PATENT RIGHTS - GRANT (below). - - * Redistributions in binary form must reproduce this COPYING - CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the - DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the - PATENT MARKING NOTICE (below), and the PATENT RIGHTS - GRANT (below) in the documentation and/or other materials - provided with the distribution. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. - -COPYRIGHT NOTICE: - - TokuDB, Tokutek Fractal Tree Indexing Library. - Copyright (C) 2014 Tokutek, Inc. - -DISCLAIMER: - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- -UNIVERSITY PATENT NOTICE: - - The technology is licensed by the Massachusetts Institute of - Technology, Rutgers State University of New Jersey, and the Research - Foundation of State University of New York at Stony Brook under - United States of America Serial No. 11/760379 and to the patents - and/or patent applications resulting from it. - -PATENT MARKING NOTICE: - - This software is covered by US Patent No. 8,185,551. - This software is covered by US Patent No. 8,489,638. - -PATENT RIGHTS GRANT: - - "THIS IMPLEMENTATION" means the copyrightable works distributed by - Tokutek as part of the Fractal Tree project. - - "PATENT CLAIMS" means the claims of patents that are owned or - licensable by Tokutek, both currently or in the future; and that in - the absence of this license would be infringed by THIS - IMPLEMENTATION or by using or running THIS IMPLEMENTATION. - - "PATENT CHALLENGE" shall mean a challenge to the validity, - patentability, enforceability and/or non-infringement of any of the - PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. - - Tokutek hereby grants to you, for the term and geographical scope of - the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, - irrevocable (except as stated in this section) patent license to - make, have made, use, offer to sell, sell, import, transfer, and - otherwise run, modify, and propagate the contents of THIS - IMPLEMENTATION, where such license applies only to the PATENT - CLAIMS. This grant does not include claims that would be infringed - only as a consequence of further modifications of THIS - IMPLEMENTATION. 
If you or your agent or licensee institute or order - or agree to the institution of patent litigation against any entity - (including a cross-claim or counterclaim in a lawsuit) alleging that - THIS IMPLEMENTATION constitutes direct or contributory patent - infringement, or inducement of patent infringement, then any rights - granted to you under this License shall terminate as of the date - such litigation is filed. If you or your agent or exclusive - licensee institute or order or agree to the institution of a PATENT - CHALLENGE, then Tokutek may terminate any rights granted to you - under this License. -*/ - -#pragma once - -#include - -#include "portability/toku_list.h" - -typedef bool(*FT_CHECK_INTERRUPT_CALLBACK)(void* extra); - -/* an ft cursor is represented as a kv pair in a tree */ -struct ft_cursor { - struct toku_list cursors_link; - FT_HANDLE ft_handle; - DBT key, val; // The key-value pair that the cursor currently points to - DBT range_lock_left_key, range_lock_right_key; - bool prefetching; - bool left_is_neg_infty, right_is_pos_infty; - bool is_snapshot_read; // true if query is read_committed, false otherwise - bool is_leaf_mode; - bool disable_prefetching; - bool is_temporary; - int out_of_range_error; - int direction; - TOKUTXN ttxn; - FT_CHECK_INTERRUPT_CALLBACK interrupt_cb; - void *interrupt_cb_extra; -}; -typedef struct ft_cursor *FT_CURSOR; From 1338d907873de38c3ad3d75e271b655b5675bf08 Mon Sep 17 00:00:00 2001 From: John Esmet Date: Fri, 23 May 2014 18:29:24 -0400 Subject: [PATCH 26/46] Revert "TMX-1 Rename TokuKV to TokuFT" We will wait until after TokuMX 1.5.0 for these changes This reverts commit b1f4a0aa62e8bbf89f7ce8651110c175d63f1c09. 
--- CMakeLists.txt | 2 +- README.md | 30 +++++++++++++++--------------- buildheader/CMakeLists.txt | 4 ++-- cmake_modules/TokuThirdParty.cmake | 2 +- examples/CMakeLists.txt | 2 +- ft/CMakeLists.txt | 2 +- portability/CMakeLists.txt | 8 ++++---- src/CMakeLists.txt | 4 ++-- util/CMakeLists.txt | 2 +- 9 files changed, 28 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d2b8773e6c..5a5a9713b4e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,7 +83,7 @@ add_subdirectory(examples) install( FILES README.md README-TOKUDB DESTINATION . - COMPONENT tokuft_misc + COMPONENT tokukv_misc ) ## build tags diff --git a/README.md b/README.md index 12d6e6e4eed..72b8988165a 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@ -TokuFT +TokuKV ====== -TokuFT is a high-performance, transactional key-value store, used in the +TokuKV is a high-performance, transactional key-value store, used in the TokuDB storage engine for MySQL and MariaDB and in TokuMX, the high-performance MongoDB distribution. -TokuFT is provided as a shared library with an interface similar to +TokuKV is provided as a shared library with an interface similar to Berkeley DB. To build the full MySQL product, see the instructions for [Tokutek/ft-engine][ft-engine]. To build TokuMX, see the instructions -for [Tokutek/mongo][mongo]. This document covers TokuFT only. +for [Tokutek/mongo][mongo]. This document covers TokuKV only. [ft-engine]: https://github.com/Tokutek/ft-engine [mongo]: https://github.com/Tokutek/mongo @@ -19,7 +19,7 @@ for [Tokutek/mongo][mongo]. This document covers TokuFT only. Building -------- -TokuFT is built using CMake >= 2.8.9. Out-of-source builds are +TokuKV is built using CMake >= 2.8.9. Out-of-source builds are recommended. You need a C++11 compiler, though only GCC >= 4.7 and Apple's Clang are tested. You also need zlib development packages (`yum install zlib-devel` or `apt-get install zlib1g-dev`). 
@@ -50,14 +50,14 @@ to that if you are planning to run benchmarks or in production. ### Platforms -TokuFT is supported on 64-bit Centos, should work on other 64-bit linux -distributions, and may work on OSX 10.8 and FreeBSD. TokuFT is not +TokuKV is supported on 64-bit Centos, should work on other 64-bit linux +distributions, and may work on OSX 10.8 and FreeBSD. TokuKV is not supported on 32-bit systems. [Transparent hugepages][transparent-hugepages] is a feature in newer linux kernel versions that causes problems for the memory usage tracking -calculations in TokuFT and can lead to memory overcommit. If you have -this feature enabled, TokuFT will not start, and you should turn it off. +calculations in TokuKV and can lead to memory overcommit. If you have +this feature enabled, TokuKV will not start, and you should turn it off. If you want to run with transparent hugepages on, you can set an environment variable `TOKU_HUGE_PAGES_OK=1`, but only do this for testing, and only with a small cache size. @@ -68,23 +68,23 @@ and only with a small cache size. Examples -------- -There are some sample programs that can use either TokuFT or Berkeley DB +There are some sample programs that can use either TokuKV or Berkeley DB in the `examples/` directory. Follow the above instructions to build and -install TokuFT, and then look in the installed `examples/` directory for +install TokuKV, and then look in the installed `examples/` directory for instructions on building and running them. Testing ------- -TokuFT uses CTest for testing. The CDash testing dashboard is not +TokuKV uses CTest for testing. The CDash testing dashboard is not currently public, but you can run the tests without submitting them. There are some large data files not stored in the git repository, that will be made available soon. For now, the tests that use these files will not run. 
-Many of the tests are linked with both TokuFT and Berkeley DB, as a sanity +Many of the tests are linked with both TokuKV and Berkeley DB, as a sanity check on the tests themselves. To build these tests, you will need Berkeley DB and its header files installed. If you do not have Berkeley DB installed, just don't pass `USE_BDB=ON`. @@ -103,7 +103,7 @@ ctest -D ExperimentalStart \ Contributing ------------ -Please report bugs in TokuFT here on github. +Please report bugs in TokuKV here on github. We have two publicly accessible mailing lists for TokuDB: @@ -125,7 +125,7 @@ We are also available on IRC on freenode.net, in the #tokutek channel. License ------- -TokuFT is available under the GPL version 2, with slight modifications. +TokuKV is available under the GPL version 2, with slight modifications. See [README-TOKUDB][license]. [license]: http://github.com/Tokutek/ft-index/blob/master/README-TOKUDB diff --git a/buildheader/CMakeLists.txt b/buildheader/CMakeLists.txt index 68d7ddb5412..d9629f2cc68 100644 --- a/buildheader/CMakeLists.txt +++ b/buildheader/CMakeLists.txt @@ -19,11 +19,11 @@ if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) FILES "${CMAKE_CURRENT_BINARY_DIR}/db.h" DESTINATION include RENAME tokudb.h - COMPONENT tokuft_headers + COMPONENT tokukv_headers ) install( FILES "${CMAKE_CURRENT_BINARY_DIR}/db.h" DESTINATION include - COMPONENT tokuft_headers + COMPONENT tokukv_headers ) endif () \ No newline at end of file diff --git a/cmake_modules/TokuThirdParty.cmake b/cmake_modules/TokuThirdParty.cmake index 4aefb569f6c..0e8be69bf37 100644 --- a/cmake_modules/TokuThirdParty.cmake +++ b/cmake_modules/TokuThirdParty.cmake @@ -29,7 +29,7 @@ if (NOT DEFINED LIBJEMALLOC) # detect when we are being built as a subproject if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib" DESTINATION . 
- COMPONENT tokuft_libs_extra) + COMPONENT tokukv_libs_extra) endif () endif () endif () diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9e2a35143ff..01ad01aa8d2 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -11,6 +11,6 @@ if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) DESTINATION examples COMPONENT - tokuft_examples + tokukv_examples ) endif () \ No newline at end of file diff --git a/ft/CMakeLists.txt b/ft/CMakeLists.txt index 07cc4e3fbe4..95d7866cb9d 100644 --- a/ft/CMakeLists.txt +++ b/ft/CMakeLists.txt @@ -102,7 +102,7 @@ target_link_libraries(ftverify m) install( TARGETS tokuftdump DESTINATION bin - COMPONENT tokuft_tools + COMPONENT tokukv_tools ) add_subdirectory(tests) diff --git a/portability/CMakeLists.txt b/portability/CMakeLists.txt index c690a9aee50..93dcf1d1675 100644 --- a/portability/CMakeLists.txt +++ b/portability/CMakeLists.txt @@ -38,22 +38,22 @@ if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) install( FILES toku_os_types.h toku_time.h toku_list.h toku_os.h DESTINATION include - COMPONENT tokuft_headers + COMPONENT tokukv_headers ) install( FILES "${CMAKE_CURRENT_BINARY_DIR}/toku_config.h" DESTINATION include - COMPONENT tokuft_headers + COMPONENT tokukv_headers ) install( TARGETS ${LIBTOKUPORTABILITY}_static DESTINATION ${INSTALL_LIBDIR} - COMPONENT tokuft_libs_static + COMPONENT tokukv_libs_static ) install( TARGETS ${LIBTOKUPORTABILITY} DESTINATION ${INSTALL_LIBDIR} - COMPONENT tokuft_libs_shared + COMPONENT tokukv_libs_shared ) endif () diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 284d798ea87..28e51c3f8fe 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -44,12 +44,12 @@ if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) install( TARGETS ${LIBTOKUDB} DESTINATION ${INSTALL_LIBDIR} - COMPONENT tokuft_libs_shared + COMPONENT tokukv_libs_shared ) install( TARGETS ${LIBTOKUDB}_static DESTINATION ${INSTALL_LIBDIR} - COMPONENT tokuft_libs_static + COMPONENT tokukv_libs_static ) endif () 
diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index decbcff8e85..6a0bb6208a5 100644 --- a/util/CMakeLists.txt +++ b/util/CMakeLists.txt @@ -24,7 +24,7 @@ if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) install( FILES partitioned_counter.h DESTINATION include - COMPONENT tokuft_headers + COMPONENT tokukv_headers ) endif () From e61903510a39068b9306e1cbb19b66edf87fbd05 Mon Sep 17 00:00:00 2001 From: John Esmet Date: Fri, 23 May 2014 18:29:27 -0400 Subject: [PATCH 27/46] Revert "FT-242 Begin breaking up fttypes.h by moving many things to their" We will wait until after TokuMX 1.5.0 for these changes This reverts commit 1927c6dd3822300d555a2556fc63ecb7ff03735d. --- ft/block_table.h | 2 +- ft/cachetable.h | 39 +------- ft/fifo.cc | 2 +- ft/fifo.h | 28 +++++- ft/ft-cachetable-wrappers.h | 4 +- ft/ft-flusher.h | 22 +---- ft/ft-internal.h | 90 +++++++++++++----- ft/ft-ops.cc | 31 +----- ft/ft-ops.h | 6 +- ft/ft-search.h | 1 - ft/ft_msg.h | 116 +++-------------------- ft/ft_node-serialize.cc | 2 - ft/ftloader.h | 2 - ft/fttypes.h | 183 +++++++++++++++++++++++++++++++++++- ft/le-cursor.cc | 7 +- ft/leafentry.h | 7 +- ft/log-internal.h | 6 -- ft/logger.h | 5 +- ft/rollback.h | 10 +- ft/sub_block.h | 1 - ft/txn_manager.h | 17 ---- 21 files changed, 304 insertions(+), 277 deletions(-) diff --git a/ft/block_table.h b/ft/block_table.h index 72c914988fa..a9f17ad0e7e 100644 --- a/ft/block_table.h +++ b/ft/block_table.h @@ -92,7 +92,7 @@ PATENT RIGHTS GRANT: #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." 
#include "fttypes.h" -#include "ft/ft-internal.h" + typedef struct block_table *BLOCK_TABLE; diff --git a/ft/cachetable.h b/ft/cachetable.h index c4290b6f6d7..9c11db02e00 100644 --- a/ft/cachetable.h +++ b/ft/cachetable.h @@ -111,42 +111,6 @@ PATENT RIGHTS GRANT: typedef BLOCKNUM CACHEKEY; -class checkpointer; -typedef class checkpointer *CHECKPOINTER; -typedef struct cachetable *CACHETABLE; -typedef struct cachefile *CACHEFILE; -typedef struct ctpair *PAIR; - -// This struct hold information about values stored in the cachetable. -// As one can tell from the names, we are probably violating an -// abstraction layer by placing names. -// -// The purpose of having this struct is to have a way for the -// cachetable to accumulate the some totals we are interested in. -// Breaking this abstraction layer by having these names was the -// easiest way. -// -typedef struct pair_attr_s { - long size; // size PAIR's value takes in memory - long nonleaf_size; // size if PAIR is a nonleaf node, 0 otherwise, used only for engine status - long leaf_size; // size if PAIR is a leaf node, 0 otherwise, used only for engine status - long rollback_size; // size of PAIR is a rollback node, 0 otherwise, used only for engine status - long cache_pressure_size; // amount PAIR contributes to cache pressure, is sum of buffer sizes and workdone counts - bool is_valid; -} PAIR_ATTR; - -static inline PAIR_ATTR make_pair_attr(long size) { - PAIR_ATTR result={ - .size = size, - .nonleaf_size = 0, - .leaf_size = 0, - .rollback_size = 0, - .cache_pressure_size = 0, - .is_valid = true - }; - return result; -} - void toku_set_cleaner_period (CACHETABLE ct, uint32_t new_period); uint32_t toku_get_cleaner_period_unlocked (CACHETABLE ct); void toku_set_cleaner_iterations (CACHETABLE ct, uint32_t new_iterations); @@ -430,9 +394,8 @@ struct unlockers { bool locked; void (*f)(void* extra); void *extra; - struct unlockers *next; + UNLOCKERS next; }; -typedef struct unlockers *UNLOCKERS; // Effect: If the 
block is in the cachetable, then return it. // Otherwise call the functions in unlockers, fetch the data (but don't pin it, since we'll just end up pinning it again later), and return TOKUDB_TRY_AGAIN. diff --git a/ft/fifo.cc b/ft/fifo.cc index 6acd29be67c..07d7baec2a1 100644 --- a/ft/fifo.cc +++ b/ft/fifo.cc @@ -172,7 +172,7 @@ int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *d toku_fifo_resize(fifo, next_2); } struct fifo_entry *entry = (struct fifo_entry *)(fifo->memory + fifo->memory_used); - entry->type = (unsigned char) type; + fifo_entry_set_msg_type(entry, type); entry->msn = msn; xids_cpy(&entry->xids_s, xids); entry->is_fresh = is_fresh; diff --git a/ft/fifo.h b/ft/fifo.h index 5333ca905a7..e9f53248b98 100644 --- a/ft/fifo.h +++ b/ft/fifo.h @@ -91,10 +91,10 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." -#include "ft/fttypes.h" -#include "ft/xids-internal.h" -#include "ft/xids.h" -#include "ft/ft_msg.h" +#include "fttypes.h" +#include "xids-internal.h" +#include "xids.h" + // If the fifo_entry is unpacked, the compiler aligns the xids array and we waste a lot of space struct __attribute__((__packed__)) fifo_entry { @@ -106,6 +106,24 @@ struct __attribute__((__packed__)) fifo_entry { XIDS_S xids_s; }; +// get and set the ft message type for a fifo entry. +// it is internally stored as a single unsigned char. 
+static inline enum ft_msg_type +fifo_entry_get_msg_type(const struct fifo_entry * entry) +{ + enum ft_msg_type msg_type; + msg_type = (enum ft_msg_type) entry->type; + return msg_type; +} + +static inline void +fifo_entry_set_msg_type(struct fifo_entry * entry, + enum ft_msg_type msg_type) +{ + unsigned char type = (unsigned char) msg_type; + entry->type = type; +} + typedef struct fifo *FIFO; int toku_fifo_create(FIFO *); @@ -132,7 +150,7 @@ void toku_fifo_iterate(FIFO, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,IT struct fifo_entry *e = toku_fifo_iterate_internal_get_entry(fifo, fifo_iterate_off); \ ITEMLEN keylenvar = e->keylen; \ ITEMLEN datalenvar = e->vallen; \ - enum ft_msg_type typevar = (enum ft_msg_type) e->type; \ + enum ft_msg_type typevar = fifo_entry_get_msg_type(e); \ MSN msnvar = e->msn; \ XIDS xidsvar = &e->xids_s; \ bytevec keyvar = xids_get_end_of_array(xidsvar); \ diff --git a/ft/ft-cachetable-wrappers.h b/ft/ft-cachetable-wrappers.h index eb33f7cfe05..9a56f4ff220 100644 --- a/ft/ft-cachetable-wrappers.h +++ b/ft/ft-cachetable-wrappers.h @@ -92,8 +92,8 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." -#include "ft/ft-internal.h" -#include "ft/cachetable.h" +#include +#include "cachetable.h" /** * Put an empty node (that is, no fields filled) into the cachetable. diff --git a/ft/ft-flusher.h b/ft/ft-flusher.h index 0861669157a..0788bf665d3 100644 --- a/ft/ft-flusher.h +++ b/ft/ft-flusher.h @@ -91,7 +91,8 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." 
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." -#include "ft/ft-internal.h" +// This must be first to make the 64-bit file mode work right in Linux +#include "fttypes.h" typedef enum { FT_FLUSHER_CLEANER_TOTAL_NODES = 0, // total number of nodes whose buffers are potentially flushed by cleaner thread @@ -157,23 +158,6 @@ toku_ft_flush_node_on_background_thread( FTNODE parent ); -// Given pinned node and pinned child, split child into two -// and update node with information about its new child. -void toku_ft_split_child( - FT h, - FTNODE node, - int childnum, - FTNODE child, - enum split_mode split_mode - ); -// Given pinned node, merge childnum with a neighbor and update node with -// information about the change -void toku_ft_merge_child( - FT ft, - FTNODE node, - int childnum - ); - /** * Effect: Split a leaf node. * Argument "node" is node to be split. @@ -182,7 +166,6 @@ void toku_ft_merge_child( * nodea is the left node that results from the split * splitk is the right-most key of nodea */ -// TODO: Rename toku_ft_leaf_split void ftleaf_split( FT h, @@ -206,7 +189,6 @@ ftleaf_split( * but it does not guarantee that the resulting nodes are smaller than nodesize. */ void -// TODO: Rename toku_ft_nonleaf_split ft_nonleaf_split( FT h, FTNODE node, diff --git a/ft/ft-internal.h b/ft/ft-internal.h index 9d065a3f5de..42d27638330 100644 --- a/ft/ft-internal.h +++ b/ft/ft-internal.h @@ -115,9 +115,7 @@ PATENT RIGHTS GRANT: #include "compress.h" #include #include -#include "ft/bndata.h" -#include "ft/rollback.h" -#include "ft/ft-search.h" +#include "bndata.h" enum { KEY_VALUE_OVERHEAD = 8 }; /* Must store the two lengths. 
*/ enum { FT_MSG_OVERHEAD = (2 + sizeof(MSN)) }; // the type plus freshness plus MSN @@ -138,18 +136,6 @@ enum ftnode_fetch_type { ftnode_fetch_keymatch, // one child is needed if it holds both keys }; -enum split_mode { - SPLIT_EVENLY, - SPLIT_LEFT_HEAVY, - SPLIT_RIGHT_HEAVY -}; - -enum reactivity { - RE_STABLE, - RE_FUSIBLE, - RE_FISSIBLE -}; - static bool is_valid_ftnode_fetch_type(enum ftnode_fetch_type type) UU(); static bool is_valid_ftnode_fetch_type(enum ftnode_fetch_type type) { switch (type) { @@ -201,7 +187,6 @@ struct ftnode_fetch_extra { tokutime_t decompress_time; tokutime_t deserialize_time; }; -typedef struct ftnode_fetch_extra *FTNODE_FETCH_EXTRA; struct toku_fifo_entry_key_msn_heaviside_extra { DESCRIPTOR desc; @@ -588,7 +573,6 @@ struct ft { // is this ft a blackhole? if so, all messages are dropped. bool blackhole; }; -typedef struct ft *FT; // Allocate a DB struct off the stack and only set its comparison // descriptor. We don't bother setting any other fields because @@ -754,6 +738,22 @@ int toku_ftnode_cleaner_callback( void *ftnode_pv, BLOCKNUM blocknum, uint32_t f void toku_evict_bn_from_memory(FTNODE node, int childnum, FT h); BASEMENTNODE toku_detach_bn(FTNODE node, int childnum); +// Given pinned node and pinned child, split child into two +// and update node with information about its new child. 
+void toku_ft_split_child( + FT h, + FTNODE node, + int childnum, + FTNODE child, + enum split_mode split_mode + ); +// Given pinned node, merge childnum with a neighbor and update node with +// information about the change +void toku_ft_merge_child( + FT ft, + FTNODE node, + int childnum + ); static inline CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_node(FT h) { CACHETABLE_WRITE_CALLBACK wc; wc.flush_callback = toku_ftnode_flush_callback; @@ -766,6 +766,27 @@ static inline CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_node(FT h) { return wc; } +static const FTNODE null_ftnode=0; + +/* an ft cursor is represented as a kv pair in a tree */ +struct ft_cursor { + struct toku_list cursors_link; + FT_HANDLE ft_handle; + DBT key, val; // The key-value pair that the cursor currently points to + DBT range_lock_left_key, range_lock_right_key; + bool prefetching; + bool left_is_neg_infty, right_is_pos_infty; + bool is_snapshot_read; // true if query is read_committed, false otherwise + bool is_leaf_mode; + bool disable_prefetching; + bool is_temporary; + int out_of_range_error; + int direction; + TOKUTXN ttxn; + FT_CHECK_INTERRUPT_CALLBACK interrupt_cb; + void *interrupt_cb_extra; +}; + // // Helper function to fill a ftnode_fetch_extra with data // that will tell the fetch callback that the entire node is @@ -901,22 +922,43 @@ static inline void destroy_bfe_for_prefetch(struct ftnode_fetch_extra *bfe) { } // this is in a strange place because it needs the cursor struct to be defined -void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe, - FT h, - struct ft_cursor *c); +static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe, + FT h, + FT_CURSOR c) { + paranoid_invariant(h->h->type == FT_CURRENT); + bfe->type = ftnode_fetch_prefetch; + bfe->h = h; + bfe->search = NULL; + toku_init_dbt(&bfe->range_lock_left_key); + toku_init_dbt(&bfe->range_lock_right_key); + const DBT *left = &c->range_lock_left_key; + if (left->data) { + 
toku_clone_dbt(&bfe->range_lock_left_key, *left); + } + const DBT *right = &c->range_lock_right_key; + if (right->data) { + toku_clone_dbt(&bfe->range_lock_right_key, *right); + } + bfe->left_is_neg_infty = c->left_is_neg_infty; + bfe->right_is_pos_infty = c->right_is_pos_infty; + bfe->child_to_read = -1; + bfe->disable_prefetching = c->disable_prefetching; + bfe->read_all_partitions = false; + bfe->bytes_read = 0; + bfe->io_time = 0; + bfe->deserialize_time = 0; + bfe->decompress_time = 0; +} struct ancestors { FTNODE node; // This is the root node if next is NULL. int childnum; // which buffer holds messages destined to the node whose ancestors this list represents. - struct ancestors *next; // Parent of this node (so next->node.(next->childnum) refers to this node). + ANCESTORS next; // Parent of this node (so next->node.(next->childnum) refers to this node). }; -typedef struct ancestors *ANCESTORS; - struct pivot_bounds { const DBT * const lower_bound_exclusive; const DBT * const upper_bound_inclusive; // NULL to indicate negative or positive infinity (which are in practice exclusive since there are now transfinite keys in messages). 
}; -typedef struct pivot_bounds const * const PIVOT_BOUNDS; __attribute__((nonnull)) void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node); diff --git a/ft/ft-ops.cc b/ft/ft-ops.cc index dacb7c15448..ab7de1a0a2c 100644 --- a/ft/ft-ops.cc +++ b/ft/ft-ops.cc @@ -419,7 +419,6 @@ toku_ft_get_status(FT_STATUS s) { } \ } while (0) - void toku_note_deserialized_basement_node(bool fixed_key_size) { if (fixed_key_size) { STATUS_INC(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, 1); @@ -4398,7 +4397,7 @@ do_bn_apply_msg(FT_HANDLE t, BASEMENTNODE bn, struct fifo_entry *entry, txn_gc_i if (entry->msn.msn > bn->max_msn_applied.msn) { ITEMLEN keylen = entry->keylen; ITEMLEN vallen = entry->vallen; - enum ft_msg_type type = (enum ft_msg_type) entry->type; + enum ft_msg_type type = fifo_entry_get_msg_type(entry); MSN msn = entry->msn; const XIDS xids = (XIDS) &entry->xids_s; bytevec key = xids_get_end_of_array(xids); @@ -5130,34 +5129,6 @@ ftnode_pf_callback_and_free_bfe(void *ftnode_pv, void* disk_data, void *read_ext return r; } -void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe, - FT h, - struct ft_cursor *c) { - paranoid_invariant(h->h->type == FT_CURRENT); - bfe->type = ftnode_fetch_prefetch; - bfe->h = h; - bfe->search = NULL; - toku_init_dbt(&bfe->range_lock_left_key); - toku_init_dbt(&bfe->range_lock_right_key); - const DBT *left = &c->range_lock_left_key; - if (left->data) { - toku_clone_dbt(&bfe->range_lock_left_key, *left); - } - const DBT *right = &c->range_lock_right_key; - if (right->data) { - toku_clone_dbt(&bfe->range_lock_right_key, *right); - } - bfe->left_is_neg_infty = c->left_is_neg_infty; - bfe->right_is_pos_infty = c->right_is_pos_infty; - bfe->child_to_read = -1; - bfe->disable_prefetching = c->disable_prefetching; - bfe->read_all_partitions = false; - bfe->bytes_read = 0; - bfe->io_time = 0; - bfe->deserialize_time = 0; - bfe->decompress_time = 0; -} - static void ft_node_maybe_prefetch(FT_HANDLE ft_handle, FTNODE node, int childnum, FT_CURSOR 
ftcursor, bool *doprefetch) { // the number of nodes to prefetch diff --git a/ft/ft-ops.h b/ft/ft-ops.h index b00eefe2f6d..b482d2b8206 100644 --- a/ft/ft-ops.h +++ b/ft/ft-ops.h @@ -100,8 +100,6 @@ PATENT RIGHTS GRANT: #include "log.h" #include "ft-search.h" #include "compress.h" -#include "ft_msg.h" -#include "ft/cursor.h" // A callback function is invoked with the key, and the data. // The pointers (to the bytevecs) must not be modified. The data must be copied out before the callback function returns. @@ -116,6 +114,8 @@ PATENT RIGHTS GRANT: // When lock_only is true, the callback only does optional lock tree locking. typedef int(*FT_GET_CALLBACK_FUNCTION)(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only); +typedef bool(*FT_CHECK_INTERRUPT_CALLBACK)(void* extra); + int toku_open_ft_handle (const char *fname, int is_create, FT_HANDLE *, int nodesize, int basementnodesize, enum toku_compression_method compression_method, CACHETABLE, TOKUTXN, int(*)(DB *,const DBT*,const DBT*)) __attribute__ ((warn_unused_result)); // effect: changes the descriptor for the ft of the given handle. 
@@ -246,7 +246,6 @@ void toku_ft_maybe_delete (FT_HANDLE ft_h, DBT *k, TOKUTXN txn, bool oplsn_valid TXNID toku_ft_get_oldest_referenced_xid_estimate(FT_HANDLE ft_h); TXN_MANAGER toku_ft_get_txn_manager(FT_HANDLE ft_h); -class txn_gc_info; void toku_ft_send_insert(FT_HANDLE ft_h, DBT *key, DBT *val, XIDS xids, enum ft_msg_type type, txn_gc_info *gc_info); void toku_ft_send_delete(FT_HANDLE ft_h, DBT *key, XIDS xids, txn_gc_info *gc_info); void toku_ft_send_commit_any(FT_HANDLE ft_h, DBT *key, XIDS xids, txn_gc_info *gc_info); @@ -259,6 +258,7 @@ extern int toku_ft_debug_mode; int toku_verify_ft (FT_HANDLE ft_h) __attribute__ ((warn_unused_result)); int toku_verify_ft_with_progress (FT_HANDLE ft_h, int (*progress_callback)(void *extra, float progress), void *extra, int verbose, int keep_going) __attribute__ ((warn_unused_result)); +typedef struct ft_cursor *FT_CURSOR; int toku_ft_cursor (FT_HANDLE, FT_CURSOR*, TOKUTXN, bool, bool) __attribute__ ((warn_unused_result)); void toku_ft_cursor_set_leaf_mode(FT_CURSOR); // Sets a boolean on the ft cursor that prevents uncessary copying of diff --git a/ft/ft-search.h b/ft/ft-search.h index 8e8fece6a3c..9c26be456de 100644 --- a/ft/ft-search.h +++ b/ft/ft-search.h @@ -92,7 +92,6 @@ PATENT RIGHTS GRANT: #ifndef FT_SEARCH_H #define FT_SEARCH_H -#include "ft/ybt.h" enum ft_search_direction_e { FT_SEARCH_LEFT = 1, /* search left -> right, finds min xy as defined by the compare function */ diff --git a/ft/ft_msg.h b/ft/ft_msg.h index 2f996c6558f..f468d7f647b 100644 --- a/ft/ft_msg.h +++ b/ft/ft_msg.h @@ -97,107 +97,6 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." 
-/* tree command types */ -enum ft_msg_type { - FT_NONE = 0, - FT_INSERT = 1, - FT_DELETE_ANY = 2, // Delete any matching key. This used to be called FT_DELETE. - //FT_DELETE_BOTH = 3, - FT_ABORT_ANY = 4, // Abort any commands on any matching key. - //FT_ABORT_BOTH = 5, // Abort commands that match both the key and the value - FT_COMMIT_ANY = 6, - //FT_COMMIT_BOTH = 7, - FT_COMMIT_BROADCAST_ALL = 8, // Broadcast to all leafentries, (commit all transactions). - FT_COMMIT_BROADCAST_TXN = 9, // Broadcast to all leafentries, (commit specific transaction). - FT_ABORT_BROADCAST_TXN = 10, // Broadcast to all leafentries, (commit specific transaction). - FT_INSERT_NO_OVERWRITE = 11, - FT_OPTIMIZE = 12, // Broadcast - FT_OPTIMIZE_FOR_UPGRADE = 13, // same as FT_OPTIMIZE, but record version number in leafnode - FT_UPDATE = 14, - FT_UPDATE_BROADCAST_ALL = 15 -}; - -static inline bool -ft_msg_type_applies_once(enum ft_msg_type type) -{ - bool ret_val; - switch (type) { - case FT_INSERT_NO_OVERWRITE: - case FT_INSERT: - case FT_DELETE_ANY: - case FT_ABORT_ANY: - case FT_COMMIT_ANY: - case FT_UPDATE: - ret_val = true; - break; - case FT_COMMIT_BROADCAST_ALL: - case FT_COMMIT_BROADCAST_TXN: - case FT_ABORT_BROADCAST_TXN: - case FT_OPTIMIZE: - case FT_OPTIMIZE_FOR_UPGRADE: - case FT_UPDATE_BROADCAST_ALL: - case FT_NONE: - ret_val = false; - break; - default: - assert(false); - } - return ret_val; -} - -static inline bool -ft_msg_type_applies_all(enum ft_msg_type type) -{ - bool ret_val; - switch (type) { - case FT_NONE: - case FT_INSERT_NO_OVERWRITE: - case FT_INSERT: - case FT_DELETE_ANY: - case FT_ABORT_ANY: - case FT_COMMIT_ANY: - case FT_UPDATE: - ret_val = false; - break; - case FT_COMMIT_BROADCAST_ALL: - case FT_COMMIT_BROADCAST_TXN: - case FT_ABORT_BROADCAST_TXN: - case FT_OPTIMIZE: - case FT_OPTIMIZE_FOR_UPGRADE: - case FT_UPDATE_BROADCAST_ALL: - ret_val = true; - break; - default: - assert(false); - } - return ret_val; -} - -static inline bool 
-ft_msg_type_does_nothing(enum ft_msg_type type) -{ - return (type == FT_NONE); -} - -typedef struct xids_t *XIDS; - -/* tree commands */ -struct ft_msg { - enum ft_msg_type type; - MSN msn; // message sequence number - XIDS xids; - union { - /* insert or delete */ - struct ft_msg_insert_delete { - const DBT *key; // for insert, delete, upsertdel - const DBT *val; // for insert, delete, (and it is the "extra" for upsertdel, upsertdel_broadcast_all) - } id; - } u; -}; - -// Message sent into the ft to implement insert, delete, update, etc -typedef struct ft_msg FT_MSG_S; -typedef struct ft_msg *FT_MSG; uint32_t ft_msg_get_keylen(FT_MSG ft_msg); @@ -205,10 +104,21 @@ uint32_t ft_msg_get_vallen(FT_MSG ft_msg); XIDS ft_msg_get_xids(FT_MSG ft_msg); -void *ft_msg_get_key(FT_MSG ft_msg); +void * ft_msg_get_key(FT_MSG ft_msg); -void *ft_msg_get_val(FT_MSG ft_msg); +void * ft_msg_get_val(FT_MSG ft_msg); enum ft_msg_type ft_msg_get_type(FT_MSG ft_msg); +void ft_msg_from_fifo_msg(FT_MSG ft_msg, FIFO_MSG fifo_msg); + +#if 0 + +void ft_msg_from_dbts(FT_MSG ft_msg, DBT *key, DBT *val, XIDS xids, enum ft_msg_type type); + #endif + + + +#endif // FT_MSG_H + diff --git a/ft/ft_node-serialize.cc b/ft/ft_node-serialize.cc index 13512be6e50..1090bca6ca0 100644 --- a/ft/ft_node-serialize.cc +++ b/ft/ft_node-serialize.cc @@ -98,8 +98,6 @@ PATENT RIGHTS GRANT: #include "ft.h" #include #include -#include "ft/cachetable.h" -#include "ft/rollback.h" static FT_UPGRADE_STATUS_S ft_upgrade_status; diff --git a/ft/ftloader.h b/ft/ftloader.h index ab78af34ea2..c920b4c5362 100644 --- a/ft/ftloader.h +++ b/ft/ftloader.h @@ -92,8 +92,6 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 
11/760379 and to the patents and/or patent applications resulting from it." -#include "ft/cachetable.h" - // The loader callbacks are C functions and need to be defined as such typedef void (*ft_loader_error_func)(DB *, int which_db, int err, DBT *key, DBT *val, void *extra); diff --git a/ft/fttypes.h b/ft/fttypes.h index c176eb24896..73e228cf6ff 100644 --- a/ft/fttypes.h +++ b/ft/fttypes.h @@ -115,6 +115,9 @@ typedef struct ft *FT; typedef struct ft_header *FT_HEADER; typedef struct ft_options *FT_OPTIONS; +struct wbuf; +struct dbuf; + typedef unsigned int ITEMLEN; typedef const void *bytevec; @@ -140,6 +143,36 @@ typedef TOKU_XA_XID *XIDP; // this is the type that's passed to the logger code static inline BLOCKNUM make_blocknum(int64_t b) { BLOCKNUM result={b}; return result; } +// This struct hold information about values stored in the cachetable. +// As one can tell from the names, we are probably violating an +// abstraction layer by placing names. +// +// The purpose of having this struct is to have a way for the +// cachetable to accumulate the some totals we are interested in. +// Breaking this abstraction layer by having these names was the +// easiest way. 
+// +typedef struct pair_attr_s { + long size; // size PAIR's value takes in memory + long nonleaf_size; // size if PAIR is a nonleaf node, 0 otherwise, used only for engine status + long leaf_size; // size if PAIR is a leaf node, 0 otherwise, used only for engine status + long rollback_size; // size of PAIR is a rollback node, 0 otherwise, used only for engine status + long cache_pressure_size; // amount PAIR contributes to cache pressure, is sum of buffer sizes and workdone counts + bool is_valid; +} PAIR_ATTR; + +static inline PAIR_ATTR make_pair_attr(long size) { + PAIR_ATTR result={ + .size = size, + .nonleaf_size = 0, + .leaf_size = 0, + .rollback_size = 0, + .cache_pressure_size = 0, + .is_valid = true + }; + return result; +} + typedef struct { uint32_t len; char *data; @@ -188,14 +221,162 @@ typedef struct tokutxn *TOKUTXN; typedef struct txninfo *TXNINFO; #define NULL_TXN ((TOKUTXN)0) +struct logged_btt_pair { + DISKOFF off; + int32_t size; +}; + +typedef struct cachetable *CACHETABLE; +typedef struct cachefile *CACHEFILE; +typedef struct ctpair *PAIR; +typedef class checkpointer *CHECKPOINTER; +class bn_data; + +/* tree command types */ +enum ft_msg_type { + FT_NONE = 0, + FT_INSERT = 1, + FT_DELETE_ANY = 2, // Delete any matching key. This used to be called FT_DELETE. + //FT_DELETE_BOTH = 3, + FT_ABORT_ANY = 4, // Abort any commands on any matching key. + //FT_ABORT_BOTH = 5, // Abort commands that match both the key and the value + FT_COMMIT_ANY = 6, + //FT_COMMIT_BOTH = 7, + FT_COMMIT_BROADCAST_ALL = 8, // Broadcast to all leafentries, (commit all transactions). + FT_COMMIT_BROADCAST_TXN = 9, // Broadcast to all leafentries, (commit specific transaction). + FT_ABORT_BROADCAST_TXN = 10, // Broadcast to all leafentries, (commit specific transaction). 
+ FT_INSERT_NO_OVERWRITE = 11, + FT_OPTIMIZE = 12, // Broadcast + FT_OPTIMIZE_FOR_UPGRADE = 13, // same as FT_OPTIMIZE, but record version number in leafnode + FT_UPDATE = 14, + FT_UPDATE_BROADCAST_ALL = 15 +}; + +static inline bool +ft_msg_type_applies_once(enum ft_msg_type type) +{ + bool ret_val; + switch (type) { + case FT_INSERT_NO_OVERWRITE: + case FT_INSERT: + case FT_DELETE_ANY: + case FT_ABORT_ANY: + case FT_COMMIT_ANY: + case FT_UPDATE: + ret_val = true; + break; + case FT_COMMIT_BROADCAST_ALL: + case FT_COMMIT_BROADCAST_TXN: + case FT_ABORT_BROADCAST_TXN: + case FT_OPTIMIZE: + case FT_OPTIMIZE_FOR_UPGRADE: + case FT_UPDATE_BROADCAST_ALL: + case FT_NONE: + ret_val = false; + break; + default: + assert(false); + } + return ret_val; +} + +static inline bool +ft_msg_type_applies_all(enum ft_msg_type type) +{ + bool ret_val; + switch (type) { + case FT_NONE: + case FT_INSERT_NO_OVERWRITE: + case FT_INSERT: + case FT_DELETE_ANY: + case FT_ABORT_ANY: + case FT_COMMIT_ANY: + case FT_UPDATE: + ret_val = false; + break; + case FT_COMMIT_BROADCAST_ALL: + case FT_COMMIT_BROADCAST_TXN: + case FT_ABORT_BROADCAST_TXN: + case FT_OPTIMIZE: + case FT_OPTIMIZE_FOR_UPGRADE: + case FT_UPDATE_BROADCAST_ALL: + ret_val = true; + break; + default: + assert(false); + } + return ret_val; +} + +static inline bool +ft_msg_type_does_nothing(enum ft_msg_type type) +{ + return (type == FT_NONE); +} + typedef struct xids_t *XIDS; +typedef struct fifo_msg_t *FIFO_MSG; +/* tree commands */ +struct ft_msg { + enum ft_msg_type type; + MSN msn; // message sequence number + XIDS xids; + union { + /* insert or delete */ + struct ft_msg_insert_delete { + const DBT *key; // for insert, delete, upsertdel + const DBT *val; // for insert, delete, (and it is the "extra" for upsertdel, upsertdel_broadcast_all) + } id; + } u; +}; + +// Message sent into the ft to implement insert, delete, update, etc +typedef struct ft_msg FT_MSG_S; +typedef struct ft_msg *FT_MSG; typedef int (*ft_compare_func)(DB *, 
const DBT *, const DBT *); typedef void (*setval_func)(const DBT *, void *); typedef int (*ft_update_func)(DB *, const DBT *, const DBT *, const DBT *, setval_func, void *); -typedef void (*remove_ft_ref_callback)(FT, void*); typedef void (*on_redirect_callback)(FT_HANDLE, void*); +typedef void (*remove_ft_ref_callback)(FT, void*); #define UU(x) x __attribute__((__unused__)) +typedef struct memarena *MEMARENA; +typedef struct rollback_log_node *ROLLBACK_LOG_NODE; +typedef struct serialized_rollback_log_node *SERIALIZED_ROLLBACK_LOG_NODE; + +// +// Types of snapshots that can be taken by a tokutxn +// - TXN_SNAPSHOT_NONE: means that there is no snapshot. Reads do not use snapshot reads. +// used for SERIALIZABLE and READ UNCOMMITTED +// - TXN_SNAPSHOT_ROOT: means that all tokutxns use their root transaction's snapshot +// used for REPEATABLE READ +// - TXN_SNAPSHOT_CHILD: means that each child tokutxn creates its own snapshot +// used for READ COMMITTED +// + +typedef enum __TXN_SNAPSHOT_TYPE { + TXN_SNAPSHOT_NONE=0, + TXN_SNAPSHOT_ROOT=1, + TXN_SNAPSHOT_CHILD=2 +} TXN_SNAPSHOT_TYPE; + +typedef struct ancestors *ANCESTORS; +typedef struct pivot_bounds const * const PIVOT_BOUNDS; +typedef struct ftnode_fetch_extra *FTNODE_FETCH_EXTRA; +typedef struct unlockers *UNLOCKERS; + +enum reactivity { + RE_STABLE, + RE_FUSIBLE, + RE_FISSIBLE +}; + +enum split_mode { + SPLIT_EVENLY, + SPLIT_LEFT_HEAVY, + SPLIT_RIGHT_HEAVY +}; + #endif diff --git a/ft/le-cursor.cc b/ft/le-cursor.cc index 3eb73f1345b..b08fc62632c 100644 --- a/ft/le-cursor.cc +++ b/ft/le-cursor.cc @@ -89,10 +89,9 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." 
-#include "ft/ft.h" -#include "ft/ft-internal.h" -#include "ft/le-cursor.h" -#include "ft/cursor.h" +#include "ft.h" +#include "ft-internal.h" +#include "le-cursor.h" // A LE_CURSOR is a special purpose FT_CURSOR that: // - enables prefetching diff --git a/ft/leafentry.h b/ft/leafentry.h index e3c29b6e224..5c525db5c19 100644 --- a/ft/leafentry.h +++ b/ft/leafentry.h @@ -98,9 +98,8 @@ PATENT RIGHTS GRANT: #include #include -#include "ft/txn_manager.h" -#include "ft/rbuf.h" -#include "ft/ft_msg.h" +#include "txn_manager.h" +#include "rbuf.h" /* Memory format of packed leaf entry @@ -242,8 +241,6 @@ toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry, // NULL if there was no stored size_t *new_leafentry_memorysize, LEAFENTRY *new_leafentry_p); -class bn_data; - void toku_le_apply_msg(FT_MSG msg, LEAFENTRY old_leafentry, // NULL if there was no stored data. diff --git a/ft/log-internal.h b/ft/log-internal.h index cd191571a67..be8ab7a53da 100644 --- a/ft/log-internal.h +++ b/ft/log-internal.h @@ -119,7 +119,6 @@ using namespace toku; #define LOGGER_MIN_BUF_SIZE (1<<24) -// TODO: Remove mylock, it has no value struct mylock { toku_mutex_t lock; }; @@ -284,7 +283,6 @@ struct tokutxn { // txn to not transition to commit or abort uint64_t client_id; }; -typedef struct tokutxn *TOKUTXN; static inline int txn_has_current_rollback_log(TOKUTXN txn) { @@ -307,10 +305,6 @@ struct txninfo { BLOCKNUM spilled_rollback_tail; BLOCKNUM current_rollback; }; -typedef struct txninfo *TXNINFO; - -// TODO: Remove null txn -#define NULL_TXN ((TOKUTXN)0) static inline int toku_logsizeof_uint8_t (uint32_t v __attribute__((__unused__))) { return 1; diff --git a/ft/logger.h b/ft/logger.h index 11731fb5b97..6488ec0707d 100644 --- a/ft/logger.h +++ b/ft/logger.h @@ -92,9 +92,8 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." 
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." -#include "ft/fttypes.h" -#include "ft/ft-internal.h" -#include "ft/ft_layout_version.h" +#include "fttypes.h" +#include "ft_layout_version.h" enum { TOKU_LOG_VERSION_1 = 1, diff --git a/ft/rollback.h b/ft/rollback.h index e9cb528b7a9..2e9493b0e6b 100644 --- a/ft/rollback.h +++ b/ft/rollback.h @@ -92,13 +92,8 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." 
-#include "ft/sub_block.h" -#include "ft/cachetable.h" - -#include "util/memarena.h" - -typedef struct rollback_log_node *ROLLBACK_LOG_NODE; -typedef struct serialized_rollback_log_node *SERIALIZED_ROLLBACK_LOG_NODE; +#include +#include "sub_block.h" void toku_poll_txn_progress_function(TOKUTXN txn, uint8_t is_commit, uint8_t stall_for_checkpoint); @@ -177,7 +172,6 @@ struct serialized_rollback_log_node { BLOCKNUM blocknum; struct sub_block sub_block[max_sub_blocks]; }; -typedef struct serialized_rollback_log_node *SERIALIZED_ROLLBACK_LOG_NODE; static inline void toku_static_serialized_rollback_log_destroy(SERIALIZED_ROLLBACK_LOG_NODE log) { diff --git a/ft/sub_block.h b/ft/sub_block.h index d00df6fa51a..23fad83c966 100644 --- a/ft/sub_block.h +++ b/ft/sub_block.h @@ -112,7 +112,6 @@ struct sub_block { uint32_t xsum; // sub block checksum }; -typedef struct sub_block *SUB_BLOCK; struct stored_sub_block { uint32_t uncompressed_size; diff --git a/ft/txn_manager.h b/ft/txn_manager.h index 58d7555dc05..12267297a0e 100644 --- a/ft/txn_manager.h +++ b/ft/txn_manager.h @@ -123,7 +123,6 @@ struct txn_manager { TXNID last_xid_seen_for_recover; TXNID last_calculated_oldest_referenced_xid; }; -typedef struct txn_manager *TXN_MANAGER; struct txn_manager_state { txn_manager_state(TXN_MANAGER mgr) : @@ -190,22 +189,6 @@ TXNID toku_txn_manager_get_oldest_living_xid(TXN_MANAGER txn_manager); TXNID toku_txn_manager_get_oldest_referenced_xid_estimate(TXN_MANAGER txn_manager); -// -// Types of snapshots that can be taken by a tokutxn -// - TXN_SNAPSHOT_NONE: means that there is no snapshot. Reads do not use snapshot reads. 
-// used for SERIALIZABLE and READ UNCOMMITTED -// - TXN_SNAPSHOT_ROOT: means that all tokutxns use their root transaction's snapshot -// used for REPEATABLE READ -// - TXN_SNAPSHOT_CHILD: means that each child tokutxn creates its own snapshot -// used for READ COMMITTED -// - -typedef enum __TXN_SNAPSHOT_TYPE { - TXN_SNAPSHOT_NONE=0, - TXN_SNAPSHOT_ROOT=1, - TXN_SNAPSHOT_CHILD=2 -} TXN_SNAPSHOT_TYPE; - void toku_txn_manager_handle_snapshot_create_for_child_txn( TOKUTXN txn, TXN_MANAGER txn_manager, From ebc4a08a62525b719f3873cfd1cda7d14cd43ce3 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Sun, 25 May 2014 08:44:04 -0400 Subject: [PATCH 28/46] TokuDB 7.1.6 is released --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7d4ebcefce1..1deb3699c5a 100644 --- a/README.md +++ b/README.md @@ -24,14 +24,14 @@ working MySQL or MariaDB with Tokutek patches, and with the TokuDB storage engine, called `make.mysql.bash`. This script will download copies of the needed source code from github and build everything. -To build MySQL 5.5.36 with TokuDB 7.1.5: +To build MySQL 5.5.37 with TokuDB 7.1.6: ```sh -scripts/make.mysql.bash --mysqlbuild=mysql-5.5.36-tokudb-7.1.5-linux-x86_64 +scripts/make.mysql.bash --mysqlbuild=mysql-5.5.37-tokudb-7.1.6-linux-x86_64 ``` -To build MariaDB 5.5.36 with TokuDB 7.1.5: +To build MariaDB 5.5.37 with TokuDB 7.1.6: ```sh -scripts/make.mysql.bash --mysqlbuild=mariadb-5.5.36-tokudb-7.1.5-linux-x86_64 +scripts/make.mysql.bash --mysqlbuild=mariadb-5.5.37-tokudb-7.1.6-linux-x86_64 ``` Before you start, make sure you have a C++11-compatible compiler (GCC >= From e0de0b2d9679eb4d2689578877c554a3e5254e50 Mon Sep 17 00:00:00 2001 From: John Esmet Date: Sun, 25 May 2014 12:42:52 -0400 Subject: [PATCH 29/46] fixes #158 Use promotion to record the blocknum of the rightmost non-root leaf node in each FT. 
When the FT detects a rightmost insertion pattern, it attempts to do inserts and unique checks directly into the rightmost leaf node, greatly optimizing sequential insert speed. --- ft/ft-cachetable-wrappers.cc | 22 ++ ft/ft-cachetable-wrappers.h | 3 + ft/ft-flusher.cc | 25 +- ft/ft-internal.h | 23 ++ ft/ft-ops.cc | 352 ++++++++++++++++-- ft/ft-ops.h | 3 + ...test_rightmost_leaf_seqinsert_heuristic.cc | 183 +++++++++ ft/tests/test_rightmost_leaf_split_merge.cc | 212 +++++++++++ src/tests/test_insert_unique.cc | 202 ++++++++++ src/ydb_write.cc | 82 ++-- 10 files changed, 1049 insertions(+), 58 deletions(-) create mode 100644 ft/tests/test_rightmost_leaf_seqinsert_heuristic.cc create mode 100644 ft/tests/test_rightmost_leaf_split_merge.cc create mode 100644 src/tests/test_insert_unique.cc diff --git a/ft/ft-cachetable-wrappers.cc b/ft/ft-cachetable-wrappers.cc index 1f3aa3e0baa..91a0040b02e 100644 --- a/ft/ft-cachetable-wrappers.cc +++ b/ft/ft-cachetable-wrappers.cc @@ -403,3 +403,25 @@ toku_unpin_ftnode_read_only(FT ft, FTNODE node) ); assert(r==0); } + +void toku_ftnode_swap_pair_values(FTNODE a, FTNODE b) +// Effect: Swap the blocknum, fullhash, and PAIR for for a and b +// Requires: Both nodes are pinned +{ + BLOCKNUM tmp_blocknum = a->thisnodename; + uint32_t tmp_fullhash = a->fullhash; + PAIR tmp_pair = a->ct_pair; + + a->thisnodename = b->thisnodename; + a->fullhash = b->fullhash; + a->ct_pair = b->ct_pair; + + b->thisnodename = tmp_blocknum; + b->fullhash = tmp_fullhash; + b->ct_pair = tmp_pair; + + // A and B swapped pair pointers, but we still have to swap + // the actual pair values (ie: the FTNODEs they represent) + // in the cachetable. 
+ toku_cachetable_swap_pair_values(a->ct_pair, b->ct_pair); +} diff --git a/ft/ft-cachetable-wrappers.h b/ft/ft-cachetable-wrappers.h index 9a56f4ff220..dc84d7f006b 100644 --- a/ft/ft-cachetable-wrappers.h +++ b/ft/ft-cachetable-wrappers.h @@ -190,4 +190,7 @@ int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pai void toku_unpin_ftnode(FT h, FTNODE node); void toku_unpin_ftnode_read_only(FT ft, FTNODE node); +// Effect: Swaps pair values of two pinned nodes +void toku_ftnode_swap_pair_values(FTNODE nodea, FTNODE nodeb); + #endif diff --git a/ft/ft-flusher.cc b/ft/ft-flusher.cc index 0fe556aec0f..dc4096a7993 100644 --- a/ft/ft-flusher.cc +++ b/ft/ft-flusher.cc @@ -565,6 +565,7 @@ static bool may_node_be_reactive(FT ft, FTNODE node) */ static void handle_split_of_child( + FT ft, FTNODE node, int childnum, FTNODE childa, @@ -607,8 +608,20 @@ handle_split_of_child( paranoid_invariant(BP_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child + // We never set the rightmost blocknum to be the root. + // Instead, we wait for the root to split and let promotion initialize the rightmost + // blocknum to be the first non-root leaf node on the right extreme to recieve an insert. + invariant(ft->h->root_blocknum.b != ft->rightmost_blocknum.b); + if (childa->thisnodename.b == ft->rightmost_blocknum.b) { + // The rightmost leaf (a) split into (a) and (b). We want (b) to swap pair values + // with (a), now that it is the new rightmost leaf. This keeps the rightmost blocknum + // constant, the same the way we keep the root blocknum constant. 
+ toku_ftnode_swap_pair_values(childa, childb); + BP_BLOCKNUM(node, childnum) = childa->thisnodename; + } + BP_BLOCKNUM(node, childnum+1) = childb->thisnodename; - BP_WORKDONE(node, childnum+1) = 0; + BP_WORKDONE(node, childnum+1) = 0; BP_STATE(node,childnum+1) = PT_AVAIL; NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl(); @@ -1071,7 +1084,7 @@ ft_split_child( ft_nonleaf_split(h, child, &nodea, &nodeb, &splitk, 2, dep_nodes); } // printf("%s:%d child did split\n", __FILE__, __LINE__); - handle_split_of_child (node, childnum, nodea, nodeb, &splitk); + handle_split_of_child (h, node, childnum, nodea, nodeb, &splitk); // for test call_flusher_thread_callback(flt_flush_during_split); @@ -1489,6 +1502,14 @@ ft_merge_child( &node->childkeys[childnuma+1], (node->n_children-childnumb)*sizeof(node->childkeys[0])); REALLOC_N(node->n_children-1, node->childkeys); + + // Handle a merge of the rightmost leaf node. + if (did_merge && childb->thisnodename.b == h->rightmost_blocknum.b) { + invariant(childb->thisnodename.b != h->h->root_blocknum.b); + toku_ftnode_swap_pair_values(childa, childb); + BP_BLOCKNUM(node, childnuma) = childa->thisnodename; + } + paranoid_invariant(BP_BLOCKNUM(node, childnuma).b == childa->thisnodename.b); childa->dirty = 1; // just to make sure childb->dirty = 1; // just to make sure diff --git a/ft/ft-internal.h b/ft/ft-internal.h index 42d27638330..f182a4f6aed 100644 --- a/ft/ft-internal.h +++ b/ft/ft-internal.h @@ -123,6 +123,10 @@ enum { FT_DEFAULT_FANOUT = 16 }; enum { FT_DEFAULT_NODE_SIZE = 4 * 1024 * 1024 }; enum { FT_DEFAULT_BASEMENT_NODE_SIZE = 128 * 1024 }; +// We optimize for a sequential insert pattern if 100 consecutive injections +// happen into the rightmost leaf node due to promotion. +enum { FT_SEQINSERT_SCORE_THRESHOLD = 100 }; + // // Field in ftnode_fetch_extra that tells the // partial fetch callback what piece of the node @@ -572,6 +576,22 @@ struct ft { // is this ft a blackhole? if so, all messages are dropped. 
bool blackhole; + + // The blocknum of the rightmost leaf node in the tree. Stays constant through splits + // and merges using pair-swapping (like the root node, see toku_ftnode_swap_pair_values()) + // + // This field only transitions from RESERVED_BLOCKNUM_NULL to non-null, never back. + // We initialize it when promotion inserts into a non-root leaf node on the right extreme. + // We use the blocktable lock to protect the initialize transition, though it's not really + // necessary since all threads should be setting it to the same value. We maintain that invariant + // on first initialization, see ft_set_or_verify_rightmost_blocknum() + BLOCKNUM rightmost_blocknum; + + // sequential access pattern heuristic + // - when promotion pushes a message directly into the rightmost leaf, the score goes up. + // - if the score is high enough, we optimistically attempt to insert directly into the rightmost leaf + // - if our attempt fails because the key was not in range of the rightmost leaf, we reset the score back to 0 + uint32_t seqinsert_score; }; // Allocate a DB struct off the stack and only set its comparison @@ -1186,6 +1206,9 @@ typedef enum { FT_PRO_NUM_DIDNT_WANT_PROMOTE, FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, // how many basement nodes were deserialized with a fixed keysize FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, // how many basement nodes were deserialized with a variable keysize + FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS, + FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS, + FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE, FT_STATUS_NUM_ROWS } ft_status_entry; diff --git a/ft/ft-ops.cc b/ft/ft-ops.cc index ab7de1a0a2c..9521a9228a7 100644 --- a/ft/ft-ops.cc +++ b/ft/ft-ops.cc @@ -367,6 +367,9 @@ status_init(void) STATUS_INIT(FT_PRO_NUM_DIDNT_WANT_PROMOTE, PROMOTION_STOPPED_AFTER_LOCKING_CHILD, PARCOUNT, "promotion: stopped anyway, after locking the child", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, 
BASEMENT_DESERIALIZATION_FIXED_KEY, PARCOUNT, "basement nodes deserialized with fixed-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, BASEMENT_DESERIALIZATION_VARIABLE_KEY, PARCOUNT, "basement nodes deserialized with variable-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS, nullptr, PARCOUNT, "promotion: succeeded in using the rightmost leaf shortcut", TOKU_ENGINE_STATUS); + STATUS_INIT(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS, nullptr, PARCOUNT, "promotion: tried the rightmost leaf shorcut but failed (out-of-bounds)", TOKU_ENGINE_STATUS); + STATUS_INIT(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE,nullptr, PARCOUNT, "promotion: tried the rightmost leaf shorcut but failed (child reactive)", TOKU_ENGINE_STATUS); ft_status.initialized = true; } @@ -1643,12 +1646,10 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp) BLOCKNUM old_blocknum = oldroot->thisnodename; uint32_t old_fullhash = oldroot->fullhash; - PAIR old_pair = oldroot->ct_pair; int new_height = oldroot->height+1; uint32_t new_fullhash; BLOCKNUM new_blocknum; - PAIR new_pair = NULL; cachetable_put_empty_node_with_dep_nodes( ft, @@ -1658,7 +1659,6 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp) &new_fullhash, &newroot ); - new_pair = newroot->ct_pair; assert(newroot); assert(new_height > 0); @@ -1670,22 +1670,18 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp) ft->h->layout_version, ft->h->flags ); + newroot->fullhash = new_fullhash; MSN msna = oldroot->max_msn_applied_to_node_on_disk; newroot->max_msn_applied_to_node_on_disk = msna; BP_STATE(newroot,0) = PT_AVAIL; newroot->dirty = 1; - // now do the "switcheroo" - BP_BLOCKNUM(newroot,0) = new_blocknum; - newroot->thisnodename = old_blocknum; - newroot->fullhash = old_fullhash; - newroot->ct_pair = old_pair; - - oldroot->thisnodename = new_blocknum; - oldroot->fullhash = new_fullhash; - oldroot->ct_pair = new_pair; - 
- toku_cachetable_swap_pair_values(old_pair, new_pair); + // Set the first child to have the new blocknum, + // and then swap newroot with oldroot. The new root + // will inherit the hash/blocknum/pair from oldroot, + // keeping the root blocknum constant. + BP_BLOCKNUM(newroot, 0) = new_blocknum; + toku_ftnode_swap_pair_values(newroot, oldroot); toku_ft_split_child( ft, @@ -2774,6 +2770,16 @@ static void inject_message_in_locked_node( // verify that msn of latest message was captured in root node paranoid_invariant(msg->msn.msn == node->max_msn_applied_to_node_on_disk.msn); + if (node->thisnodename.b == ft->rightmost_blocknum.b) { + if (ft->seqinsert_score < FT_SEQINSERT_SCORE_THRESHOLD) { + // we promoted to the rightmost leaf node and the seqinsert score has not yet saturated. + toku_sync_fetch_and_add(&ft->seqinsert_score, 1); + } + } else if (ft->seqinsert_score != 0) { + // we promoted to something other than the rightmost leaf node and the score should reset + ft->seqinsert_score = 0; + } + // if we call toku_ft_flush_some_child, then that function unpins the root // otherwise, we unpin ourselves if (node->height > 0 && toku_ft_nonleaf_is_gorged(node, ft->h->nodesize)) { @@ -2930,6 +2936,21 @@ static inline bool should_inject_in_node(seqinsert_loc loc, int height, int dept return (height == 0 || (loc == NEITHER_EXTREME && (height <= 1 || depth >= 2))); } +static void ft_set_or_verify_rightmost_blocknum(FT ft, BLOCKNUM b) +// Given: 'b', the _definitive_ and constant rightmost blocknum of 'ft' +{ + if (ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL) { + toku_ft_lock(ft); + if (ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL) { + ft->rightmost_blocknum = b; + } + toku_ft_unlock(ft); + } + // The rightmost blocknum only transitions from RESERVED_BLOCKNUM_NULL to non-null. 
+ // If it's already set, verify that the stored value is consistent with 'b' + invariant(ft->rightmost_blocknum.b == b.b); +} + static void push_something_in_subtree( FT ft, FTNODE subtree_root, @@ -2977,6 +2998,14 @@ static void push_something_in_subtree( default: STATUS_INC(FT_PRO_NUM_INJECT_DEPTH_GT3, 1); break; } + // If the target node is a non-root leaf node on the right extreme, + // set the rightmost blocknum. We know there are no messages above us + // because promotion would not choose to inject directly into this leaf + // otherwise. We explicitly skip the root node because then we don't have + // to worry about changing the rightmost blocknum when the root splits. + if (subtree_root->height == 0 && loc == RIGHT_EXTREME && subtree_root->thisnodename.b != ft->h->root_blocknum.b) { + ft_set_or_verify_rightmost_blocknum(ft, subtree_root->thisnodename); + } inject_message_in_locked_node(ft, subtree_root, target_childnum, msg, flow_deltas, gc_info); } else { int r; @@ -3247,7 +3276,260 @@ void toku_ft_root_put_msg( } } -// Effect: Insert the key-val pair into ft. +static int ft_compare_keys(FT ft, const DBT *a, const DBT *b) +// Effect: Compare two keys using the given fractal tree's comparator/descriptor +{ + FAKE_DB(db, &ft->cmp_descriptor); + return ft->compare_fun(&db, a, b); +} + +static LEAFENTRY bn_get_le_and_key(BASEMENTNODE bn, int idx, DBT *key) +// Effect: Gets the i'th leafentry from the given basement node and +// fill its key in *key +// Requires: The i'th leafentry exists. +{ + LEAFENTRY le; + uint32_t le_len; + void *le_key; + int r = bn->data_buffer.fetch_klpair(idx, &le, &le_len, &le_key); + invariant_zero(r); + toku_fill_dbt(key, le_key, le_len); + return le; +} + +static LEAFENTRY ft_leaf_leftmost_le_and_key(FTNODE leaf, DBT *leftmost_key) +// Effect: If a leftmost key exists in the given leaf, toku_fill_dbt() +// the key into *leftmost_key +// Requires: Leaf is fully in memory and pinned for read or write. 
+// Return: leafentry if it exists, nullptr otherwise +{ + for (int i = 0; i < leaf->n_children; i++) { + BASEMENTNODE bn = BLB(leaf, i); + if (bn->data_buffer.num_klpairs() > 0) { + // Get the first (leftmost) leafentry and its key + return bn_get_le_and_key(bn, 0, leftmost_key); + } + } + return nullptr; +} + +static LEAFENTRY ft_leaf_rightmost_le_and_key(FTNODE leaf, DBT *rightmost_key) +// Effect: If a rightmost key exists in the given leaf, toku_fill_dbt() +// the key into *rightmost_key +// Requires: Leaf is fully in memory and pinned for read or write. +// Return: leafentry if it exists, nullptr otherwise +{ + for (int i = leaf->n_children - 1; i >= 0; i--) { + BASEMENTNODE bn = BLB(leaf, i); + size_t num_les = bn->data_buffer.num_klpairs(); + if (num_les > 0) { + // Get the last (rightmost) leafentry and its key + return bn_get_le_and_key(bn, num_les - 1, rightmost_key); + } + } + return nullptr; +} + +static int ft_leaf_get_relative_key_pos(FT ft, FTNODE leaf, const DBT *key, bool *nondeleted_key_found, int *target_childnum) +// Effect: Determines what the relative position of the given key is with +// respect to a leaf node, and if it exists. +// Requires: Leaf is fully in memory and pinned for read or write. +// Requires: target_childnum is non-null +// Return: < 0 if key is less than the leftmost key in the leaf OR the relative position is unknown, for any reason. +// 0 if key is in the bounds [leftmost_key, rightmost_key] for this leaf or the leaf is empty +// > 0 if key is greater than the rightmost key in the leaf +// *nondeleted_key_found is set (if non-null) if the target key was found and is not deleted, unmodified otherwise +// *target_childnum is set to the child that (does or would) contain the key, if calculated, unmodified otherwise +{ + DBT rightmost_key; + LEAFENTRY rightmost_le = ft_leaf_rightmost_le_and_key(leaf, &rightmost_key); + if (rightmost_le == nullptr) { + // If we can't get a rightmost key then the leaf is empty. 
+ // In such a case, we don't have any information about what keys would be in this leaf. + // We have to assume the leaf node that would contain this key is to the left. + return -1; + } + // We have a rightmost leafentry, so it must exist in some child node + invariant(leaf->n_children > 0); + + int relative_pos = 0; + int c = ft_compare_keys(ft, key, &rightmost_key); + if (c > 0) { + relative_pos = 1; + *target_childnum = leaf->n_children - 1; + } else if (c == 0) { + if (nondeleted_key_found != nullptr && !le_latest_is_del(rightmost_le)) { + *nondeleted_key_found = true; + } + relative_pos = 0; + *target_childnum = leaf->n_children - 1; + } else { + // The key is less than the rightmost. It may still be in bounds if it's >= the leftmost. + DBT leftmost_key; + LEAFENTRY leftmost_le = ft_leaf_leftmost_le_and_key(leaf, &leftmost_key); + invariant_notnull(leftmost_le); // Must exist because a rightmost exists + c = ft_compare_keys(ft, key, &leftmost_key); + if (c > 0) { + if (nondeleted_key_found != nullptr) { + // The caller wants to know if a nondeleted key can be found. 
+ LEAFENTRY target_le; + int childnum = toku_ftnode_which_child(leaf, key, &ft->cmp_descriptor, ft->compare_fun); + BASEMENTNODE bn = BLB(leaf, childnum); + struct msg_leafval_heaviside_extra extra = { ft->compare_fun, &ft->cmp_descriptor, key }; + int r = bn->data_buffer.find_zero( + extra, + &target_le, + nullptr, nullptr, nullptr + ); + *target_childnum = childnum; + if (r == 0 && !le_latest_is_del(leftmost_le)) { + *nondeleted_key_found = true; + } + } + relative_pos = 0; + } else if (c == 0) { + if (nondeleted_key_found != nullptr && !le_latest_is_del(leftmost_le)) { + *nondeleted_key_found = true; + } + relative_pos = 0; + *target_childnum = 0; + } else { + relative_pos = -1; + } + } + + return relative_pos; +} + +static void ft_insert_directly_into_leaf(FT ft, FTNODE leaf, int target_childnum, DBT *key, DBT *val, + XIDS message_xids, enum ft_msg_type type, txn_gc_info *gc_info); +static int getf_nothing(ITEMLEN, bytevec, ITEMLEN, bytevec, void *, bool); + +static int ft_maybe_insert_into_rightmost_leaf(FT ft, DBT *key, DBT *val, XIDS message_xids, enum ft_msg_type type, + txn_gc_info *gc_info, bool unique) +// Effect: Pins the rightmost leaf node and attempts to do an insert. +// There are three reasons why we may not succeed. +// - The rightmost leaf is too full and needs a split. +// - The key to insert is not within the provable bounds of this leaf node. +// - The key is within bounds, but it already exists. +// Return: 0 if this function did insert, DB_KEYEXIST if a unique key constraint exists and +// some nondeleted leafentry with the same key exists +// < 0 if this function did not insert, for a reason other than DB_KEYEXIST. +// Note: Treat this function as a possible, but not necessary, optimization for insert. +// Rationale: We want O(1) insertions down the rightmost path of the tree. 
+{ + int r = -1; + + uint32_t rightmost_fullhash; + BLOCKNUM rightmost_blocknum = ft->rightmost_blocknum; + FTNODE rightmost_leaf = nullptr; + + // Don't do the optimization if our heuristic suggests that + // insertion pattern is not sequential. + if (ft->seqinsert_score < FT_SEQINSERT_SCORE_THRESHOLD) { + goto cleanup; + } + + // We know the seqinsert score is high enough that we should + // attempt to directly insert into the rightmost leaf. Because + // the score is non-zero, the rightmost blocknum must have been + // set. See inject_message_in_locked_node(), which only increases + // the score if the target node blocknum == rightmost_blocknum + invariant(rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL); + + // Pin the rightmost leaf with a write lock. + rightmost_fullhash = toku_cachetable_hash(ft->cf, rightmost_blocknum); + struct ftnode_fetch_extra bfe; + fill_bfe_for_full_read(&bfe, ft); + toku_pin_ftnode(ft, rightmost_blocknum, rightmost_fullhash, &bfe, PL_WRITE_CHEAP, &rightmost_leaf, true); + + // The rightmost blocknum never changes once it is initialized to something + // other than null. Verify that the pinned node has the correct blocknum. + invariant(rightmost_leaf->thisnodename.b == rightmost_blocknum.b); + + // If the rightmost leaf is reactive, bail out and let the normal promotion pass + // take care of it. This also ensures that if any of our ancestors are reactive, + // they'll be taken care of too. + if (get_leaf_reactivity(rightmost_leaf, ft->h->nodesize) != RE_STABLE) { + STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE, 1); + goto cleanup; + } + + // The groundwork has been laid for an insertion directly into the rightmost + // leaf node. We know that it is pinned for write, fully in memory, has + // no messages above it, and is not reactive. + // + // Now, two more things must be true for this insertion to actually happen: + // 1. The key to insert is within the bounds of this leafnode, or to the right. + // 2. 
If there is a uniqueness constraint, it passes. + bool nondeleted_key_found; + int relative_pos; + int target_childnum; + + nondeleted_key_found = false; + target_childnum = -1; + relative_pos = ft_leaf_get_relative_key_pos(ft, rightmost_leaf, key, + unique ? &nondeleted_key_found : nullptr, + &target_childnum); + if (relative_pos >= 0) { + STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS, 1); + if (unique && nondeleted_key_found) { + r = DB_KEYEXIST; + } else { + ft_insert_directly_into_leaf(ft, rightmost_leaf, target_childnum, + key, val, message_xids, type, gc_info); + r = 0; + } + } else { + STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS, 1); + r = -1; + } + +cleanup: + // If we did the insert, the rightmost leaf was unpinned for us. + if (r != 0 && rightmost_leaf != nullptr) { + toku_unpin_ftnode(ft, rightmost_leaf); + } + + return r; +} + +static void ft_txn_log_insert(FT ft, DBT *key, DBT *val, TOKUTXN txn, bool do_logging, enum ft_msg_type type); + +int toku_ft_insert_unique(FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool do_logging) { +// Effect: Insert a unique key-val pair into the fractal tree. +// Return: 0 on success, DB_KEYEXIST if the overwrite constraint failed + XIDS message_xids = txn != nullptr ? toku_txn_get_xids(txn) : xids_get_root_xids(); + + TXN_MANAGER txn_manager = toku_ft_get_txn_manager(ft_h); + txn_manager_state txn_state_for_gc(txn_manager); + + TXNID oldest_referenced_xid_estimate = toku_ft_get_oldest_referenced_xid_estimate(ft_h); + txn_gc_info gc_info(&txn_state_for_gc, + oldest_referenced_xid_estimate, + // no messages above us, we can implicitly promote uxrs based on this xid + oldest_referenced_xid_estimate, + true); + int r = ft_maybe_insert_into_rightmost_leaf(ft_h->ft, key, val, message_xids, FT_INSERT, &gc_info, true); + if (r != 0 && r != DB_KEYEXIST) { + // Default to a regular unique check + insert algorithm if we couldn't + // do it based on the rightmost leaf alone. 
+ int lookup_r = toku_ft_lookup(ft_h, key, getf_nothing, nullptr); + if (lookup_r == DB_NOTFOUND) { + toku_ft_send_insert(ft_h, key, val, message_xids, FT_INSERT, &gc_info); + r = 0; + } else { + r = DB_KEYEXIST; + } + } + + if (r == 0) { + ft_txn_log_insert(ft_h->ft, key, val, txn, do_logging, FT_INSERT); + } + return r; +} + +// Effect: Insert the key-val pair into an ft. void toku_ft_insert (FT_HANDLE ft_handle, DBT *key, DBT *val, TOKUTXN txn) { toku_ft_maybe_insert(ft_handle, key, val, txn, false, ZERO_LSN, true, FT_INSERT); } @@ -3373,32 +3655,38 @@ TXNID toku_ft_get_oldest_referenced_xid_estimate(FT_HANDLE ft_h) { return txn_manager != nullptr ? toku_txn_manager_get_oldest_referenced_xid_estimate(txn_manager) : TXNID_NONE; } -void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, enum ft_msg_type type) { - paranoid_invariant(type==FT_INSERT || type==FT_INSERT_NO_OVERWRITE); - XIDS message_xids = xids_get_root_xids(); //By default use committed messages +static void ft_txn_log_insert(FT ft, DBT *key, DBT *val, TOKUTXN txn, bool do_logging, enum ft_msg_type type) { + paranoid_invariant(type == FT_INSERT || type == FT_INSERT_NO_OVERWRITE); + + //By default use committed messages TXNID_PAIR xid = toku_txn_get_txnid(txn); if (txn) { BYTESTRING keybs = {key->size, (char *) key->data}; - toku_logger_save_rollback_cmdinsert(txn, toku_cachefile_filenum(ft_h->ft->cf), &keybs); - toku_txn_maybe_note_ft(txn, ft_h->ft); - message_xids = toku_txn_get_xids(txn); + toku_logger_save_rollback_cmdinsert(txn, toku_cachefile_filenum(ft->cf), &keybs); + toku_txn_maybe_note_ft(txn, ft); } TOKULOGGER logger = toku_txn_logger(txn); if (do_logging && logger) { BYTESTRING keybs = {.len=key->size, .data=(char *) key->data}; BYTESTRING valbs = {.len=val->size, .data=(char *) val->data}; if (type == FT_INSERT) { - toku_log_enq_insert(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft_h->ft->cf), xid, keybs, valbs); + 
toku_log_enq_insert(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft->cf), xid, keybs, valbs); } else { - toku_log_enq_insert_no_overwrite(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft_h->ft->cf), xid, keybs, valbs); + toku_log_enq_insert_no_overwrite(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft->cf), xid, keybs, valbs); } } +} + +void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, enum ft_msg_type type) { + ft_txn_log_insert(ft_h->ft, key, val, txn, do_logging, type); LSN treelsn; if (oplsn_valid && oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) { // do nothing } else { + XIDS message_xids = txn ? toku_txn_get_xids(txn) : xids_get_root_xids(); + TXN_MANAGER txn_manager = toku_ft_get_txn_manager(ft_h); txn_manager_state txn_state_for_gc(txn_manager); @@ -3408,10 +3696,26 @@ void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool // no messages above us, we can implicitly promote uxrs based on this xid oldest_referenced_xid_estimate, txn != nullptr ? !txn->for_recovery : false); - toku_ft_send_insert(ft_h, key, val, message_xids, type, &gc_info); + int r = ft_maybe_insert_into_rightmost_leaf(ft_h->ft, key, val, message_xids, FT_INSERT, &gc_info, false); + if (r != 0) { + toku_ft_send_insert(ft_h, key, val, message_xids, type, &gc_info); + } } } +static void ft_insert_directly_into_leaf(FT ft, FTNODE leaf, int target_childnum, DBT *key, DBT *val, + XIDS message_xids, enum ft_msg_type type, txn_gc_info *gc_info) +// Effect: Insert directly into a leaf node a fractal tree. Does not do any logging. +// Requires: Leaf is fully in memory and pinned for write. +// Requires: If this insertion were to happen through the root node, the promotion +// algorithm would have selected the given leaf node as the point of injection. +// That means this function relies on the current implementation of promotion. 
+{ + FT_MSG_S ftcmd = { type, ZERO_MSN, message_xids, .u = { .id = { key, val } } }; + size_t flow_deltas[] = { 0, 0 }; + inject_message_in_locked_node(ft, leaf, target_childnum, &ftcmd, flow_deltas, gc_info); +} + static void ft_send_update_msg(FT_HANDLE ft_h, FT_MSG_S *msg, TOKUTXN txn) { msg->xids = (txn diff --git a/ft/ft-ops.h b/ft/ft-ops.h index b482d2b8206..cfa6ba20f6f 100644 --- a/ft/ft-ops.h +++ b/ft/ft-ops.h @@ -213,6 +213,9 @@ int toku_ft_lookup (FT_HANDLE ft_h, DBT *k, FT_GET_CALLBACK_FUNCTION getf, void // Effect: Insert a key and data pair into an ft void toku_ft_insert (FT_HANDLE ft_h, DBT *k, DBT *v, TOKUTXN txn); +// Returns: 0 if the key was inserted, DB_KEYEXIST if the key already exists +int toku_ft_insert_unique(FT_HANDLE ft, DBT *k, DBT *v, TOKUTXN txn, bool do_logging); + // Effect: Optimize the ft void toku_ft_optimize (FT_HANDLE ft_h); diff --git a/ft/tests/test_rightmost_leaf_seqinsert_heuristic.cc b/ft/tests/test_rightmost_leaf_seqinsert_heuristic.cc new file mode 100644 index 00000000000..100e5153636 --- /dev/null +++ b/ft/tests/test_rightmost_leaf_seqinsert_heuristic.cc @@ -0,0 +1,183 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). 
+ + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2014 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. 
+ + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved." + +#include "test.h" + +#include +#include + +// Each FT maintains a sequential insert heuristic to determine if its +// worth trying to insert directly into a well-known rightmost leaf node. +// +// The heuristic is only maintained when a rightmost leaf node is known. +// +// This test verifies that sequential inserts increase the seqinsert score +// and that a single non-sequential insert resets the score. 
+ +static void test_seqinsert_heuristic(void) { + int r = 0; + char name[TOKU_PATH_MAX + 1]; + toku_path_join(name, 2, TOKU_TEST_FILENAME, "ftdata"); + toku_os_recursive_delete(TOKU_TEST_FILENAME); + r = toku_os_mkdir(TOKU_TEST_FILENAME, S_IRWXU); CKERR(r); + + FT_HANDLE ft_handle; + CACHETABLE ct; + toku_cachetable_create(&ct, 0, ZERO_LSN, NULL_LOGGER); + r = toku_open_ft_handle(name, 1, &ft_handle, + 4*1024*1024, 64*1024, + TOKU_DEFAULT_COMPRESSION_METHOD, ct, NULL, + toku_builtin_compare_fun); CKERR(r); + FT ft = ft_handle->ft; + + int k; + DBT key, val; + const int val_size = 1024 * 1024; + char *XMALLOC_N(val_size, val_buf); + memset(val_buf, 'x', val_size); + toku_fill_dbt(&val, val_buf, val_size); + + // Insert many rows sequentially. This is enough data to: + // - force the root to split (the rightmost leaf will then be known) + // - raise the seqinsert score high enough to enable direct rightmost injections + const int rows_to_insert = 200; + for (int i = 0; i < rows_to_insert; i++) { + k = toku_htonl(i); + toku_fill_dbt(&key, &k, sizeof(k)); + toku_ft_insert(ft_handle, &key, &val, NULL); + } + invariant(ft->rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL); + invariant(ft->seqinsert_score == FT_SEQINSERT_SCORE_THRESHOLD); + + // Insert on the left extreme. The seq insert score is high enough + // that we will attempt to insert into the rightmost leaf. We won't + // be successful because key 0 won't be in the bounds of the rightmost leaf. + // This failure should reset the seqinsert score back to 0. + k = toku_htonl(0); + toku_fill_dbt(&key, &k, sizeof(k)); + toku_ft_insert(ft_handle, &key, &val, NULL); + invariant(ft->seqinsert_score == 0); + + // Insert in the middle. The score should not go up. + k = toku_htonl(rows_to_insert / 2); + toku_fill_dbt(&key, &k, sizeof(k)); + toku_ft_insert(ft_handle, &key, &val, NULL); + invariant(ft->seqinsert_score == 0); + + // Insert on the right extreme. The score should go up. 
+ k = toku_htonl(rows_to_insert); + toku_fill_dbt(&key, &k, sizeof(k)); + toku_ft_insert(ft_handle, &key, &val, NULL); + invariant(ft->seqinsert_score == 1); + + // Insert again on the right extreme again, the score should go up. + k = toku_htonl(rows_to_insert + 1); + toku_fill_dbt(&key, &k, sizeof(k)); + toku_ft_insert(ft_handle, &key, &val, NULL); + invariant(ft->seqinsert_score == 2); + + // Insert close to, but not at, the right extreme. The score should reset. + // -- the magic number 4 derives from the fact that vals are 1mb and nodes are 4mb + k = toku_htonl(rows_to_insert - 4); + toku_fill_dbt(&key, &k, sizeof(k)); + toku_ft_insert(ft_handle, &key, &val, NULL); + invariant(ft->seqinsert_score == 0); + + toku_free(val_buf); + toku_ft_handle_close(ft_handle); + toku_cachetable_close(&ct); + toku_os_recursive_delete(TOKU_TEST_FILENAME); +} + +int test_main(int argc, const char *argv[]) { + default_parse_args(argc, argv); + test_seqinsert_heuristic(); + return 0; +} diff --git a/ft/tests/test_rightmost_leaf_split_merge.cc b/ft/tests/test_rightmost_leaf_split_merge.cc new file mode 100644 index 00000000000..517fc277fd3 --- /dev/null +++ b/ft/tests/test_rightmost_leaf_split_merge.cc @@ -0,0 +1,212 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). 
+ + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2014 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. 
+ + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved." + +#include "test.h" + +#include +#include + +// Promotion tracks the rightmost blocknum in the FT when a message +// is successfully promoted to a non-root leaf node on the right extreme. +// +// This test verifies that a split or merge of the rightmost leaf properly +// maintains the rightmost blocknum (which is constant - the pair's swap values, +// like the root blocknum). 
+ +static void test_split_merge(void) { + int r = 0; + char name[TOKU_PATH_MAX + 1]; + toku_path_join(name, 2, TOKU_TEST_FILENAME, "ftdata"); + toku_os_recursive_delete(TOKU_TEST_FILENAME); + r = toku_os_mkdir(TOKU_TEST_FILENAME, S_IRWXU); CKERR(r); + + FT_HANDLE ft_handle; + CACHETABLE ct; + toku_cachetable_create(&ct, 0, ZERO_LSN, NULL_LOGGER); + r = toku_open_ft_handle(name, 1, &ft_handle, + 4*1024*1024, 64*1024, + TOKU_DEFAULT_COMPRESSION_METHOD, ct, NULL, + toku_builtin_compare_fun); CKERR(r); + + // We have a root blocknum, but no rightmost blocknum yet. + FT ft = ft_handle->ft; + invariant(ft->h->root_blocknum.b != RESERVED_BLOCKNUM_NULL); + invariant(ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL); + + int k; + DBT key, val; + const int val_size = 1 * 1024 * 1024; + char *XMALLOC_N(val_size, val_buf); + memset(val_buf, 'x', val_size); + toku_fill_dbt(&val, val_buf, val_size); + + // Insert 16 rows (should induce a few splits) + const int rows_to_insert = 16; + for (int i = 0; i < rows_to_insert; i++) { + k = toku_htonl(i); + toku_fill_dbt(&key, &k, sizeof(k)); + toku_ft_insert(ft_handle, &key, &val, NULL); + } + + // rightmost blocknum should be set, because the root split and promotion + // did a rightmost insertion directly into the rightmost leaf, lazily + // initializing the rightmost blocknum. 
+ invariant(ft->rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL); + + BLOCKNUM root_blocknum = ft->h->root_blocknum; + FTNODE root_node; + struct ftnode_fetch_extra bfe; + fill_bfe_for_full_read(&bfe, ft); + toku_pin_ftnode(ft, root_blocknum, + toku_cachetable_hash(ft->cf, ft->h->root_blocknum), + &bfe, PL_WRITE_EXPENSIVE, &root_node, true); + // root blocknum should be consistent + invariant(root_node->thisnodename.b == ft->h->root_blocknum.b); + // root should have split at least once, and it should now be at height 1 + invariant(root_node->n_children > 1); + invariant(root_node->height == 1); + // rightmost blocknum should no longer be the root, since the root split + invariant(ft->h->root_blocknum.b != ft->rightmost_blocknum.b); + // the right child should have the rightmost blocknum + invariant(BP_BLOCKNUM(root_node, root_node->n_children - 1).b == ft->rightmost_blocknum.b); + + BLOCKNUM rightmost_blocknum_before_merge = ft->rightmost_blocknum; + const int num_children_before_merge = root_node->n_children; + + // delete the last 6 rows. 
+ // - 1mb each, so 6mb deleted + // - should be enough to delete the entire rightmost leaf + some of its neighbor + const int rows_to_delete = 6; + toku_unpin_ftnode(ft, root_node); + for (int i = 0; i < rows_to_delete; i++) { + k = toku_htonl(rows_to_insert - i); + toku_fill_dbt(&key, &k, sizeof(k)); + toku_ft_delete(ft_handle, &key, NULL); + } + toku_pin_ftnode(ft, root_blocknum, + toku_cachetable_hash(ft->cf, root_blocknum), + &bfe, PL_WRITE_EXPENSIVE, &root_node, true); + + // - rightmost leaf should be fusible after those deletes (which were promoted directly to the leaf) + FTNODE rightmost_leaf; + toku_pin_ftnode(ft, rightmost_blocknum_before_merge, + toku_cachetable_hash(ft->cf, rightmost_blocknum_before_merge), + &bfe, PL_WRITE_EXPENSIVE, &rightmost_leaf, true); + invariant(get_node_reactivity(ft, rightmost_leaf) == RE_FUSIBLE); + toku_unpin_ftnode(ft, rightmost_leaf); + + // - merge the rightmost child now that it's fusible + toku_ft_merge_child(ft, root_node, root_node->n_children - 1); + toku_pin_ftnode(ft, root_blocknum, + toku_cachetable_hash(ft->cf, root_blocknum), + &bfe, PL_WRITE_EXPENSIVE, &root_node, true); + + // the merge should have worked, and the root should still be at height 1 + invariant(root_node->n_children < num_children_before_merge); + invariant(root_node->height == 1); + // the rightmost child of the root has the rightmost blocknum + invariant(BP_BLOCKNUM(root_node, root_node->n_children - 1).b == ft->rightmost_blocknum.b); + // the value for rightmost blocknum itself should not have changed + // (we keep it constant, like the root blocknum) + invariant(rightmost_blocknum_before_merge.b == ft->rightmost_blocknum.b); + + toku_unpin_ftnode(ft, root_node); + + toku_free(val_buf); + toku_ft_handle_close(ft_handle); + toku_cachetable_close(&ct); + toku_os_recursive_delete(TOKU_TEST_FILENAME); +} + +int test_main(int argc, const char *argv[]) { + default_parse_args(argc, argv); + test_split_merge(); + return 0; +} diff --git 
a/src/tests/test_insert_unique.cc b/src/tests/test_insert_unique.cc new file mode 100644 index 00000000000..29439f9d704 --- /dev/null +++ b/src/tests/test_insert_unique.cc @@ -0,0 +1,202 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). + + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2013 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. + + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. 
If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." +#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." +/** + * Test that unique inserts work correctly. This exercises the rightmost leaf inject optimization. 
+ */ + +#include + +#include "test.h" + +static char random_buf[8]; +static struct random_data random_data; + +static void test_simple_unique_insert(DB_ENV *env) { + int r; + DB *db; + r = db_create(&db, env, 0); CKERR(r); + r = db->open(db, NULL, "db", NULL, DB_BTREE, DB_CREATE, 0644); CKERR(r); + + DBT key1, key2, key3; + dbt_init(&key1, "a", sizeof("a")); + dbt_init(&key2, "b", sizeof("b")); + dbt_init(&key3, "c", sizeof("c")); + r = db->put(db, NULL, &key1, &key1, DB_NOOVERWRITE); CKERR(r); + r = db->put(db, NULL, &key1, &key1, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + r = db->put(db, NULL, &key3, &key3, DB_NOOVERWRITE); CKERR(r); + r = db->put(db, NULL, &key3, &key3, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + r = db->put(db, NULL, &key2, &key2, DB_NOOVERWRITE); CKERR(r); + r = db->put(db, NULL, &key2, &key2, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + // sanity check + r = db->put(db, NULL, &key1, &key1, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + r = db->put(db, NULL, &key1, &key3, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + + r = db->close(db, 0); CKERR(r); + r = env->dbremove(env, NULL, "db", NULL, 0); CKERR(r); +} + +static void test_large_sequential_insert_unique(DB_ENV *env) { + int r; + DB *db; + r = db_create(&db, env, 0); CKERR(r); + + // very small nodes/basements to make a taller tree + r = db->set_pagesize(db, 8 * 1024); CKERR(r); + r = db->set_readpagesize(db, 2 * 1024); CKERR(r); + r = db->open(db, NULL, "db", NULL, DB_BTREE, DB_CREATE, 0644); CKERR(r); + + const int val_size = 1024; + char *XMALLOC_N(val_size, val_buf); + memset(val_buf, 'k', val_size); + DBT val; + dbt_init(&val, val_buf, val_size); + + // grow a tree to about depth 3, taking sanity checks along the way + const int start_num_rows = (64 * 1024 * 1024) / val_size; + for (int i = 0; i < start_num_rows; i++) { + DBT key; + int k = toku_htonl(i); + dbt_init(&key, &k, sizeof(k)); + r = db->put(db, NULL, &key, &val, DB_NOOVERWRITE); CKERR(r); + if (i % 50 == 0) { + // sanity check - 
should not be able to insert this key twice in a row + r = db->put(db, NULL, &key, &val, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + + // .. but re-inserting is okay, if we provisionally deleted the row + DB_TXN *txn; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + r = db->del(db, NULL, &key, DB_DELETE_ANY); CKERR(r); + r = db->put(db, NULL, &key, &val, DB_NOOVERWRITE); CKERR(r); + r = txn->commit(txn, 0); CKERR(r); + } + if (i > 0 && i % 250 == 0) { + // sanity check - unique checks on random keys we already inserted should + // fail (exercises middle-of-the-tree checks) + for (int check_i = 0; check_i < 4; check_i++) { + DBT rand_key; + int rand_k = toku_htonl(myrandom_r(&random_data) % i); + dbt_init(&rand_key, &rand_k, sizeof(rand_k)); + r = db->put(db, NULL, &rand_key, &val, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + } + } + } + + toku_free(val_buf); + r = db->close(db, 0); CKERR(r); + r = env->dbremove(env, NULL, "db", NULL, 0); CKERR(r); +} + + +int test_main(int argc, char * const argv[]) { + default_parse_args(argc, argv); + + int r; + const int envflags = DB_INIT_MPOOL | DB_CREATE | DB_THREAD | + DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_TXN | DB_PRIVATE; + + // startup + DB_ENV *env; + toku_os_recursive_delete(TOKU_TEST_FILENAME); + r = toku_os_mkdir(TOKU_TEST_FILENAME, 0755); CKERR(r); + r = db_env_create(&env, 0); CKERR(r); + r = env->open(env, TOKU_TEST_FILENAME, envflags, 0755); + + r = myinitstate_r(random(), random_buf, 8, &random_data); CKERR(r); + + test_simple_unique_insert(env); + test_large_sequential_insert_unique(env); + + // cleanup + r = env->close(env, 0); CKERR(r); + + return 0; +} + diff --git a/src/ydb_write.cc b/src/ydb_write.cc index 4826e418ab5..82fbf439885 100644 --- a/src/ydb_write.cc +++ b/src/ydb_write.cc @@ -253,6 +253,30 @@ toku_db_del(DB *db, DB_TXN *txn, DBT *key, uint32_t flags, bool holds_mo_lock) { return r; } +static int +db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, int flags, bool do_log) { + int r = 0; + bool unique 
= false; + enum ft_msg_type type = FT_INSERT; + if (flags == DB_NOOVERWRITE) { + unique = true; + } else if (flags == DB_NOOVERWRITE_NO_ERROR) { + type = FT_INSERT_NO_OVERWRITE; + } else if (flags != 0) { + // All other non-zero flags are unsupported + r = EINVAL; + } + if (r == 0) { + TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : nullptr; + if (unique) { + r = toku_ft_insert_unique(db->i->ft_handle, key, val, ttxn, do_log); + } else { + toku_ft_maybe_insert(db->i->ft_handle, key, val, ttxn, false, ZERO_LSN, do_log, type); + } + invariant(r == DB_KEYEXIST || r == 0); + } + return r; +} int toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_mo_lock) { @@ -265,25 +289,16 @@ toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_ flags &= ~lock_flags; r = db_put_check_size_constraints(db, key, val); - if (r == 0) { - //Do any checking required by the flags. - r = db_put_check_overwrite_constraint(db, txn, key, lock_flags, flags); - } - //Do locking if necessary. Do not grab the lock again if this DB had a unique - //check performed because the lock was already grabbed by its cursor callback. + + //Do locking if necessary. bool do_locking = (bool)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE)); - if (r == 0 && do_locking && !(flags & DB_NOOVERWRITE)) { + if (r == 0 && do_locking) { r = toku_db_get_point_write_lock(db, txn, key); } if (r == 0) { //Insert into the ft. - TOKUTXN ttxn = txn ? 
db_txn_struct_i(txn)->tokutxn : NULL; - enum ft_msg_type type = FT_INSERT; - if (flags==DB_NOOVERWRITE_NO_ERROR) { - type = FT_INSERT_NO_OVERWRITE; - } if (!holds_mo_lock) toku_multi_operation_client_lock(); - toku_ft_maybe_insert(db->i->ft_handle, key, val, ttxn, false, ZERO_LSN, true, type); + r = db_put(db, txn, key, val, flags, true); if (!holds_mo_lock) toku_multi_operation_client_unlock(); } @@ -635,9 +650,11 @@ log_put_multiple(DB_TXN *txn, DB *src_db, const DBT *src_key, const DBT *src_val } } +// Requires: If remaining_flags is non-null, this function performs any required uniqueness checks +// Otherwise, the caller is responsible. static int -do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[], DBT_ARRAY vals[], DB *src_db, const DBT *src_key, bool indexer_shortcut) { - TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; +do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[], DBT_ARRAY vals[], uint32_t *remaining_flags, DB *src_db, const DBT *src_key, bool indexer_shortcut) { + int r = 0; for (uint32_t which_db = 0; which_db < num_dbs; which_db++) { DB *db = db_array[which_db]; @@ -666,16 +683,21 @@ do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[], } if (do_put) { for (uint32_t i = 0; i < keys[which_db].size; i++) { - // if db is being indexed by an indexer, then put into that db if the src key is to the left or equal to the - // indexers cursor. we have to get the src_db from the indexer and find it in the db_array. 
- toku_ft_maybe_insert(db->i->ft_handle, - &keys[which_db].dbts[i], &vals[which_db].dbts[i], - ttxn, false, ZERO_LSN, false, FT_INSERT); + int flags = 0; + if (remaining_flags != nullptr) { + flags = remaining_flags[which_db]; + invariant(!(flags & DB_NOOVERWRITE_NO_ERROR)); + } + r = db_put(db, txn, &keys[which_db].dbts[i], &vals[which_db].dbts[i], flags, false); + if (r != 0) { + goto done; + } } } } } - return 0; +done: + return r; } static int @@ -754,20 +776,14 @@ env_put_multiple_internal( r = db_put_check_size_constraints(db, &put_key, &put_val); if (r != 0) goto cleanup; - //Check overwrite constraints - r = db_put_check_overwrite_constraint(db, txn, - &put_key, - lock_flags[which_db], remaining_flags[which_db]); - if (r != 0) goto cleanup; if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) { //put_multiple does not support delaying the no error, since we would //have to log the flag in the put_multiple. r = EINVAL; goto cleanup; } - //Do locking if necessary. Do not grab the lock again if this DB had a unique - //check performed because the lock was already grabbed by its cursor callback. - if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE) && !(remaining_flags[which_db] & DB_NOOVERWRITE)) { + //Do locking if necessary. 
+ if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) { //Needs locking r = toku_db_get_point_write_lock(db, txn, &put_key); if (r != 0) goto cleanup; @@ -790,8 +806,10 @@ env_put_multiple_internal( } } toku_multi_operation_client_lock(); - log_put_multiple(txn, src_db, src_key, src_val, num_dbs, fts); - r = do_put_multiple(txn, num_dbs, db_array, put_keys, put_vals, src_db, src_key, indexer_shortcut); + r = do_put_multiple(txn, num_dbs, db_array, put_keys, put_vals, remaining_flags, src_db, src_key, indexer_shortcut); + if (r == 0) { + log_put_multiple(txn, src_db, src_key, src_val, num_dbs, fts); + } toku_multi_operation_client_unlock(); if (indexer_lock_taken) { toku_indexer_unlock(indexer); @@ -1075,7 +1093,7 @@ env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, // recovery so we don't end up losing data. // So unlike env->put_multiple, we ONLY log a 'put_multiple' log entry. log_put_multiple(txn, src_db, new_src_key, new_src_data, n_put_dbs, put_fts); - r = do_put_multiple(txn, n_put_dbs, put_dbs, put_key_arrays, put_val_arrays, src_db, new_src_key, indexer_shortcut); + r = do_put_multiple(txn, n_put_dbs, put_dbs, put_key_arrays, put_val_arrays, nullptr, src_db, new_src_key, indexer_shortcut); } toku_multi_operation_client_unlock(); if (indexer_lock_taken) { From fc87324fed9966dd9c70f814df6d5a4a7e86f37d Mon Sep 17 00:00:00 2001 From: John Esmet Date: Tue, 27 May 2014 16:44:08 -0400 Subject: [PATCH 30/46] refs #226 Fix a benign (but nevertheless important) bug where nonleaf partial eviction would fail to move stale messages out of the fresh message tree before serializing them to memory. 
--- ft/ft-ops.cc | 27 +++++++++++++++------------ ft/ft_node-serialize.cc | 23 +++++++++++++++++++++++ 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/ft/ft-ops.cc b/ft/ft-ops.cc index 9521a9228a7..f9701ec34b1 100644 --- a/ft/ft-ops.cc +++ b/ft/ft-ops.cc @@ -1090,9 +1090,10 @@ exit: return; } +static void ft_bnc_move_messages_to_stale(FT ft, NONLEAF_CHILDINFO bnc); + // replace the child buffer with a compressed version of itself. -// @return the old child buffer -static NONLEAF_CHILDINFO +static void compress_internal_node_partition(FTNODE node, int i, enum toku_compression_method compression_method) { // if we should evict, compress the @@ -1103,11 +1104,9 @@ compress_internal_node_partition(FTNODE node, int i, enum toku_compression_metho sub_block_init(sb); toku_create_compressed_partition_from_available(node, i, compression_method, sb); - // now set the state to compressed and return the old, available partition - NONLEAF_CHILDINFO bnc = BNC(node, i); + // now set the state to compressed set_BSB(node, i, sb); BP_STATE(node,i) = PT_COMPRESSED; - return bnc; } void toku_evict_bn_from_memory(FTNODE node, int childnum, FT h) { @@ -1160,7 +1159,7 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext for (int i = 0; i < node->n_children; i++) { if (BP_STATE(node,i) == PT_AVAIL) { if (BP_SHOULD_EVICT(node,i)) { - NONLEAF_CHILDINFO bnc; + NONLEAF_CHILDINFO bnc = BNC(node, i); if (ft_compress_buffers_before_eviction && // We may not serialize and compress a partition in memory if its // in memory layout version is different than what's on disk (and @@ -1171,7 +1170,8 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext // this rule would cause upgrade code to upgrade this partition // again after we serialize it as the current version, which is bad. 
node->layout_version == node->layout_version_read_from_disk) { - bnc = compress_internal_node_partition( + ft_bnc_move_messages_to_stale(ft, bnc); + compress_internal_node_partition( node, i, // Always compress with quicklz @@ -1180,7 +1180,6 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext } else { // We're not compressing buffers before eviction. Simply // detach the buffer and set the child's state to on-disk. - bnc = BNC(node, i); set_BNULL(node, i); BP_STATE(node, i) = PT_ON_DISK; } @@ -5215,6 +5214,13 @@ int copy_to_stale(const int32_t &offset, const uint32_t UU(idx), struct copy_to_ return 0; } +static void ft_bnc_move_messages_to_stale(FT ft, NONLEAF_CHILDINFO bnc) { + struct copy_to_stale_extra cts_extra = { .ft = ft, .bnc = bnc }; + int r = bnc->fresh_message_tree.iterate_over_marked(&cts_extra); + invariant_zero(r); + bnc->fresh_message_tree.delete_all_marked(); +} + __attribute__((nonnull)) void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node) { @@ -5227,10 +5233,7 @@ toku_move_ftnode_messages_to_stale(FT ft, FTNODE node) { // We can't delete things out of the fresh tree inside the above // procedures because we're still looking at the fresh tree. Instead // we have to move messages after we're done looking at it. 
- struct copy_to_stale_extra cts_extra = { .ft = ft, .bnc = bnc }; - int r = bnc->fresh_message_tree.iterate_over_marked(&cts_extra); - invariant_zero(r); - bnc->fresh_message_tree.delete_all_marked(); + ft_bnc_move_messages_to_stale(ft, bnc); } } diff --git a/ft/ft_node-serialize.cc b/ft/ft_node-serialize.cc index 1090bca6ca0..91ea0890c30 100644 --- a/ft/ft_node-serialize.cc +++ b/ft/ft_node-serialize.cc @@ -310,6 +310,27 @@ serialize_ftnode_partition_size (FTNODE node, int i) #define FTNODE_PARTITION_DMT_LEAVES 0xaa #define FTNODE_PARTITION_FIFO_MSG 0xbb +UU() static int +assert_fresh(const int32_t &offset, const uint32_t UU(idx), struct fifo *const f) { + struct fifo_entry *entry = toku_fifo_get_entry(f, offset); + assert(entry->is_fresh); + return 0; +} + +UU() static int +assert_stale(const int32_t &offset, const uint32_t UU(idx), struct fifo *const f) { + struct fifo_entry *entry = toku_fifo_get_entry(f, offset); + assert(!entry->is_fresh); + return 0; +} + +static void bnc_verify_message_trees(NONLEAF_CHILDINFO UU(bnc)) { +#ifdef TOKU_DEBUG_PARANOID + bnc->fresh_message_tree.iterate(bnc->buffer); + bnc->stale_message_tree.iterate(bnc->buffer); +#endif +} + static int wbuf_write_offset(const int32_t &offset, const uint32_t UU(idx), struct wbuf *const wb) { wbuf_nocrc_int(wb, offset); @@ -335,6 +356,8 @@ serialize_child_buffer(NONLEAF_CHILDINFO bnc, struct wbuf *wb) wbuf_nocrc_bytes(wb, data, datalen); }); + bnc_verify_message_trees(bnc); + // serialize the message trees (num entries, offsets array): // fresh, stale, broadcast wbuf_nocrc_int(wb, bnc->fresh_message_tree.size()); From 98c81ef1a16e769c3a3dfde88d681923c0796972 Mon Sep 17 00:00:00 2001 From: Leif Walsh Date: Wed, 28 May 2014 12:09:49 -0400 Subject: [PATCH 31/46] changed ft-verify to work with promotion #250 fixes #250 --- ft/ft-internal.h | 2 +- ft/ft-verify.cc | 26 +++++++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/ft/ft-internal.h b/ft/ft-internal.h index 
f182a4f6aed..378e8921328 100644 --- a/ft/ft-internal.h +++ b/ft/ft-internal.h @@ -1057,7 +1057,7 @@ toku_get_node_for_verify( int toku_verify_ftnode (FT_HANDLE ft_h, - MSN rootmsn, MSN parentmsn, bool messages_exist_above, + MSN rootmsn, MSN parentmsn_with_messages, bool messages_exist_above, FTNODE node, int height, const DBT *lesser_pivot, // Everything in the subtree should be > lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.) const DBT *greatereq_pivot, // Everything in the subtree should be <= lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.) diff --git a/ft/ft-verify.cc b/ft/ft-verify.cc index 506a54a07a0..7e8d241cce2 100644 --- a/ft/ft-verify.cc +++ b/ft/ft-verify.cc @@ -310,7 +310,7 @@ toku_get_node_for_verify( static int toku_verify_ftnode_internal(FT_HANDLE ft_handle, - MSN rootmsn, MSN parentmsn, bool messages_exist_above, + MSN rootmsn, MSN parentmsn_with_messages, bool messages_exist_above, FTNODE node, int height, const DBT *lesser_pivot, // Everything in the subtree should be > lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.) const DBT *greatereq_pivot, // Everything in the subtree should be <= lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.) @@ -328,7 +328,7 @@ toku_verify_ftnode_internal(FT_HANDLE ft_handle, invariant(height == node->height); // this is a bad failure if wrong } if (node->height > 0 && messages_exist_above) { - VERIFY_ASSERTION((parentmsn.msn >= this_msn.msn), 0, "node msn must be descending down tree, newest messages at top"); + VERIFY_ASSERTION((parentmsn_with_messages.msn >= this_msn.msn), 0, "node msn must be descending down tree, newest messages at top"); } // Verify that all the pivot keys are in order. 
for (int i = 0; i < node->n_children-2; i++) { @@ -450,7 +450,7 @@ done: // input is a pinned node, on exit, node is unpinned int toku_verify_ftnode (FT_HANDLE ft_handle, - MSN rootmsn, MSN parentmsn, bool messages_exist_above, + MSN rootmsn, MSN parentmsn_with_messages, bool messages_exist_above, FTNODE node, int height, const DBT *lesser_pivot, // Everything in the subtree should be > lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.) const DBT *greatereq_pivot, // Everything in the subtree should be <= lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.) @@ -469,7 +469,7 @@ toku_verify_ftnode (FT_HANDLE ft_handle, // Otherwise we'll just do the next call result = toku_verify_ftnode_internal( - ft_handle, rootmsn, parentmsn, messages_exist_above, node, height, lesser_pivot, greatereq_pivot, + ft_handle, rootmsn, parentmsn_with_messages, messages_exist_above, node, height, lesser_pivot, greatereq_pivot, verbose, keep_going_on_failure, false); if (result != 0 && (!keep_going_on_failure || result != TOKUDB_NEEDS_REPAIR)) goto done; } @@ -477,7 +477,7 @@ toku_verify_ftnode (FT_HANDLE ft_handle, toku_move_ftnode_messages_to_stale(ft_handle->ft, node); } result2 = toku_verify_ftnode_internal( - ft_handle, rootmsn, parentmsn, messages_exist_above, node, height, lesser_pivot, greatereq_pivot, + ft_handle, rootmsn, parentmsn_with_messages, messages_exist_above, node, height, lesser_pivot, greatereq_pivot, verbose, keep_going_on_failure, true); if (result == 0) { result = result2; @@ -489,12 +489,16 @@ toku_verify_ftnode (FT_HANDLE ft_handle, for (int i = 0; i < node->n_children; i++) { FTNODE child_node; toku_get_node_for_verify(BP_BLOCKNUM(node, i), ft_handle, &child_node); - int r = toku_verify_ftnode(ft_handle, rootmsn, this_msn, messages_exist_above || toku_bnc_n_entries(BNC(node, i)) > 0, - child_node, node->height-1, - (i==0) ? lesser_pivot : &node->childkeys[i-1], - (i==node->n_children-1) ? 
greatereq_pivot : &node->childkeys[i], - progress_callback, progress_extra, - recurse, verbose, keep_going_on_failure); + int r = toku_verify_ftnode(ft_handle, rootmsn, + (toku_bnc_n_entries(BNC(node, i)) > 0 + ? this_msn + : parentmsn_with_messages), + messages_exist_above || toku_bnc_n_entries(BNC(node, i)) > 0, + child_node, node->height-1, + (i==0) ? lesser_pivot : &node->childkeys[i-1], + (i==node->n_children-1) ? greatereq_pivot : &node->childkeys[i], + progress_callback, progress_extra, + recurse, verbose, keep_going_on_failure); if (r) { result = r; if (!keep_going_on_failure || result != TOKUDB_NEEDS_REPAIR) goto done; From e7496641ced39c2cd1e5eca324c4e948f768c7a5 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Thu, 29 May 2014 07:41:16 -0400 Subject: [PATCH 32/46] #241 unique key check should avoid relocking keys if the table is already prelocked by the loader --- storage/tokudb/ha_tokudb.cc | 105 +++++++----------------------------- storage/tokudb/ha_tokudb.h | 2 +- 2 files changed, 20 insertions(+), 87 deletions(-) diff --git a/storage/tokudb/ha_tokudb.cc b/storage/tokudb/ha_tokudb.cc index 87b54256965..ddacb6d1382 100644 --- a/storage/tokudb/ha_tokudb.cc +++ b/storage/tokudb/ha_tokudb.cc @@ -3342,12 +3342,8 @@ int ha_tokudb::end_bulk_insert(bool abort) { if (i == primary_key && !share->pk_has_string) { continue; } - error = is_index_unique( - &is_unique, - transaction, - share->key_file[i], - &table->key_info[i] - ); + error = is_index_unique(&is_unique, transaction, share->key_file[i], &table->key_info[i], + DB_PRELOCKED_WRITE); if (error) goto cleanup; if (!is_unique) { error = HA_ERR_FOUND_DUPP_KEY; @@ -3394,7 +3390,7 @@ int ha_tokudb::end_bulk_insert() { return end_bulk_insert( false ); } -int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info) { +int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info, int lock_flags) { int error; DBC* tmp_cursor1 = NULL; DBC* tmp_cursor2 = NULL; @@ 
-3410,49 +3406,23 @@ int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_in memset(&packed_key2, 0, sizeof(packed_key2)); *is_unique = true; - error = db->cursor( - db, - txn, - &tmp_cursor1, - DB_SERIALIZABLE - ); + error = db->cursor(db, txn, &tmp_cursor1, DB_SERIALIZABLE); if (error) { goto cleanup; } - error = db->cursor( - db, - txn, - &tmp_cursor2, - DB_SERIALIZABLE - ); + error = db->cursor(db, txn, &tmp_cursor2, DB_SERIALIZABLE); if (error) { goto cleanup; } - - error = tmp_cursor1->c_get( - tmp_cursor1, - &key1, - &val, - DB_NEXT - ); + error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags); if (error == DB_NOTFOUND) { *is_unique = true; error = 0; goto cleanup; } else if (error) { goto cleanup; } - error = tmp_cursor2->c_get( - tmp_cursor2, - &key2, - &val, - DB_NEXT - ); + error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags); if (error) { goto cleanup; } - error = tmp_cursor2->c_get( - tmp_cursor2, - &key2, - &val, - DB_NEXT - ); + error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags); if (error == DB_NOTFOUND) { *is_unique = true; error = 0; @@ -3464,59 +3434,25 @@ int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_in bool has_null1; bool has_null2; int cmp; - place_key_into_mysql_buff( - key_info, - table->record[0], - (uchar *) key1.data + 1 - ); - place_key_into_mysql_buff( - key_info, - table->record[1], - (uchar *) key2.data + 1 - ); + place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key1.data + 1); + place_key_into_mysql_buff(key_info, table->record[1], (uchar *) key2.data + 1); - create_dbt_key_for_lookup( - &packed_key1, - key_info, - key_buff, - table->record[0], - &has_null1 - ); - create_dbt_key_for_lookup( - &packed_key2, - key_info, - key_buff2, - table->record[1], - &has_null2 - ); + create_dbt_key_for_lookup(&packed_key1, key_info, key_buff, table->record[0], &has_null1); + 
create_dbt_key_for_lookup(&packed_key2, key_info, key_buff2, table->record[1], &has_null2); if (!has_null1 && !has_null2) { cmp = tokudb_prefix_cmp_dbt_key(db, &packed_key1, &packed_key2); if (cmp == 0) { memcpy(key_buff, key1.data, key1.size); - place_key_into_mysql_buff( - key_info, - table->record[0], - (uchar *) key_buff + 1 - ); + place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key_buff + 1); *is_unique = false; break; } } - error = tmp_cursor1->c_get( - tmp_cursor1, - &key1, - &val, - DB_NEXT - ); + error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags); if (error) { goto cleanup; } - error = tmp_cursor2->c_get( - tmp_cursor2, - &key2, - &val, - DB_NEXT - ); + error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags); if (error && (error != DB_NOTFOUND)) { goto cleanup; } cnt++; @@ -7766,7 +7702,8 @@ int ha_tokudb::tokudb_add_index( num_processed++; if ((num_processed % 1000) == 0) { - sprintf(status_msg, "Adding indexes: Fetched %llu of about %llu rows, loading of data still remains.", num_processed, (long long unsigned) share->rows); + sprintf(status_msg, "Adding indexes: Fetched %llu of about %llu rows, loading of data still remains.", + num_processed, (long long unsigned) share->rows); thd_proc_info(thd, status_msg); #ifdef HA_TOKUDB_HAS_THD_PROGRESS @@ -7798,12 +7735,8 @@ int ha_tokudb::tokudb_add_index( for (uint i = 0; i < num_of_keys; i++, curr_index++) { if (key_info[i].flags & HA_NOSAME) { bool is_unique; - error = is_index_unique( - &is_unique, - txn, - share->key_file[curr_index], - &key_info[i] - ); + error = is_index_unique(&is_unique, txn, share->key_file[curr_index], &key_info[i], + creating_hot_index ? 
0 : DB_PRELOCKED_WRITE); if (error) goto cleanup; if (!is_unique) { error = HA_ERR_FOUND_DUPP_KEY; diff --git a/storage/tokudb/ha_tokudb.h b/storage/tokudb/ha_tokudb.h index 47dd0f7c478..1ad602ff9b0 100644 --- a/storage/tokudb/ha_tokudb.h +++ b/storage/tokudb/ha_tokudb.h @@ -475,7 +475,7 @@ private: ); int create_main_dictionary(const char* name, TABLE* form, DB_TXN* txn, KEY_AND_COL_INFO* kc_info, toku_compression_method compression_method); void trace_create_table_info(const char *name, TABLE * form); - int is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info); + int is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info, int lock_flags); int is_val_unique(bool* is_unique, uchar* record, KEY* key_info, uint dict_index, DB_TXN* txn); int do_uniqueness_checks(uchar* record, DB_TXN* txn, THD* thd); void set_main_dict_put_flags(THD* thd, bool opt_eligible, uint32_t* put_flags); From 6a1387c0da2b285a63606f2f3e8e0424bfc2c66c Mon Sep 17 00:00:00 2001 From: Leif Walsh Date: Thu, 29 May 2014 11:06:33 -0400 Subject: [PATCH 33/46] added test_stress_with_verify to stress test runner --- scripts/run.stress-tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/run.stress-tests.py b/scripts/run.stress-tests.py index fbbf5ee6472..003870d88c9 100755 --- a/scripts/run.stress-tests.py +++ b/scripts/run.stress-tests.py @@ -735,6 +735,7 @@ if __name__ == '__main__': 'test_stress6.tdb', 'test_stress7.tdb', 'test_stress_hot_indexing.tdb', + 'test_stress_with_verify.tdb', 'test_stress_openclose.tdb'] default_recover_testnames = ['recover-test_stress1.tdb', 'recover-test_stress2.tdb', From 5c6565f1a0ab32b2ca8397b838878289bb4cc3c6 Mon Sep 17 00:00:00 2001 From: Leif Walsh Date: Thu, 29 May 2014 11:06:45 -0400 Subject: [PATCH 34/46] added 7.1.6 data set to stress test runner --- scripts/run.stress-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/run.stress-tests.py b/scripts/run.stress-tests.py index 
003870d88c9..d4245a7c4b4 100755 --- a/scripts/run.stress-tests.py +++ b/scripts/run.stress-tests.py @@ -767,8 +767,8 @@ if __name__ == '__main__': help="skip the tests that don't involve upgrade [default=False]") upgrade_group.add_option('--double_upgrade', action='store_true', dest='double_upgrade', default=False, help='run the upgrade tests twice in a row [default=False]') - upgrade_group.add_option('--add_old_version', action='append', type='choice', dest='old_versions', choices=['4.2.0', '5.0.8', '5.2.7', '6.0.0', '6.1.0', '6.5.1', '6.6.3'], - help='which old versions to use for running the stress tests in upgrade mode. can be specified multiple times [options=4.2.0, 5.0.8, 5.2.7, 6.0.0, 6.1.0, 6.5.1, 6.6.3]') + upgrade_group.add_option('--add_old_version', action='append', type='choice', dest='old_versions', choices=['4.2.0', '5.0.8', '5.2.7', '6.0.0', '6.1.0', '6.5.1', '6.6.3', '7.1.6'], + help='which old versions to use for running the stress tests in upgrade mode. can be specified multiple times [options=4.2.0, 5.0.8, 5.2.7, 6.0.0, 6.1.0, 6.5.1, 6.6.3, 7.1.6]') upgrade_group.add_option('--old_environments_dir', type='string', dest='old_environments_dir', default=('%s/old-stress-test-envs' % default_tokudb_data), help='directory containing old version environments (should contain 5.0.8/, 5.2.7/, etc, and the environments should be in those) [default=../../tokudb.data/stress_environments]') From 878828ead2683fcfa5644af5c8e04bec7adec11d Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Fri, 30 May 2014 12:58:28 -0400 Subject: [PATCH 35/46] #229 make ftdump easier to use --- ft/tokuftdump.cc | 425 ++++++++++++++++++++++++----------------------- 1 file changed, 219 insertions(+), 206 deletions(-) diff --git a/ft/tokuftdump.cc b/ft/tokuftdump.cc index f2d4fce83cb..a7d94f41d78 100644 --- a/ft/tokuftdump.cc +++ b/ft/tokuftdump.cc @@ -89,7 +89,7 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." 
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." -/* Tell me the diff between two FT files. */ +// Dump a fractal tree file #include "cachetable.h" #include "ft.h" @@ -102,20 +102,26 @@ PATENT RIGHTS GRANT: #include #include -static void -format_time(const uint64_t time_int, char *buf) { +static int do_dump_data = 1; +static int do_interactive = 0; +static int do_header = 0; +static int do_fragmentation = 0; +static int do_garbage = 0; +static int do_translation_table = 0; +static int do_rootnode = 0; +static int do_tsv = 0; + +static const char *arg0; +static const char *fname; + +static void format_time(const uint64_t time_int, char *buf) { time_t timer = (time_t) time_int; ctime_r(&timer, buf); assert(buf[24] == '\n'); buf[24] = 0; } -static int dump_data = 1; - -static CACHETABLE ct; - -static void -print_item (bytevec val, ITEMLEN len) { +static void print_item(bytevec val, ITEMLEN len) { printf("\""); ITEMLEN i; for (i=0; idbt.size); simple_hex_dump((unsigned char*) d->dbt.data, d->dbt.size); printf("\n"); } -static void -open_header (int f, FT *header, CACHEFILE cf) { +static void open_header(int fd, FT *header, CACHEFILE cf) { FT ft = NULL; int r; - r = toku_deserialize_ft_from (f, MAX_LSN, &ft); - assert(r==0); + r = toku_deserialize_ft_from (fd, MAX_LSN, &ft); + if (r != 0) { + fprintf(stderr, "%s: can not deserialize from %s error %d\n", arg0, fname, r); + exit(1); + } + assert_zero(r); ft->cf = cf; *header = ft; } -static void -dump_header(FT ft) { +static void dump_header(FT ft) { char timestr[26]; printf("ft:\n"); printf(" layout_version=%d\n", ft->h->layout_version); @@ -212,29 +217,19 @@ dump_header(FT ft) { printf(" estimated numbytes=%" PRId64 "\n", ft->in_memory_stats.numbytes); } 
-static int -print_le( - const void* key, - const uint32_t keylen, - const LEAFENTRY &le, - const uint32_t idx UU(), - void *const ai UU() - ) -{ +static int print_le(const void* key, const uint32_t keylen, const LEAFENTRY &le, const uint32_t idx UU(), void *const ai UU()) { print_klpair(stdout, key, keylen, le); printf("\n"); return 0; } - -static void -dump_node (int f, BLOCKNUM blocknum, FT h) { +static void dump_node(int fd, BLOCKNUM blocknum, FT h) { FTNODE n; struct ftnode_fetch_extra bfe; FTNODE_DISK_DATA ndd = NULL; fill_bfe_for_full_read(&bfe, h); - int r = toku_deserialize_ftnode_from (f, blocknum, 0 /*pass zero for hash, it doesn't matter*/, &n, &ndd, &bfe); - assert(r==0); + int r = toku_deserialize_ftnode_from (fd, blocknum, 0 /*pass zero for hash, it doesn't matter*/, &n, &ndd, &bfe); + assert_zero(r); assert(n!=0); printf("ftnode\n"); DISKOFF disksize, diskoffset; @@ -271,15 +266,16 @@ dump_node (int f, BLOCKNUM blocknum, FT h) { } printf(" children:\n"); for (int i=0; in_children; i++) { + printf(" child %d: ", i); if (n->height > 0) { - printf(" child %d: %" PRId64 "\n", i, BP_BLOCKNUM(n, i).b); + printf("%" PRId64 "\n", BP_BLOCKNUM(n, i).b); NONLEAF_CHILDINFO bnc = BNC(n, i); unsigned int n_bytes = toku_bnc_nbytesinbuf(bnc); int n_entries = toku_bnc_n_entries(bnc); if (n_bytes > 0 || n_entries > 0) { printf(" buffer contains %u bytes (%d items)\n", n_bytes, n_entries); } - if (dump_data) { + if (do_dump_data) { FIFO_ITERATE(bnc->buffer, key, keylen, data, datalen, typ, msn, xids, UU(is_fresh), { printf(" msn=%" PRIu64 " (0x%" PRIx64 ") ", msn.msn, msn.msn); @@ -316,7 +312,7 @@ dump_node (int f, BLOCKNUM blocknum, FT h) { } else { printf(" n_bytes_in_buffer= %" PRIu64 "", BLB_DATA(n, i)->get_disk_size()); printf(" items_in_buffer=%u\n", BLB_DATA(n, i)->num_klpairs()); - if (dump_data) { + if (do_dump_data) { BLB_DATA(n, i)->iterate(NULL); } } @@ -325,13 +321,11 @@ dump_node (int f, BLOCKNUM blocknum, FT h) { toku_free(ndd); } -static void 
-dump_block_translation(FT h, uint64_t offset) { +static void dump_block_translation(FT h, uint64_t offset) { toku_blocknum_dump_translation(h->blocktable, make_blocknum(offset)); } -static void -dump_fragmentation(int UU(f), FT h, int tsv) { +static void dump_fragmentation(int UU(f), FT h, int tsv) { int64_t used_space; int64_t total_space; toku_blocktable_internal_fragmentation(h->blocktable, &total_space, &used_space); @@ -349,21 +343,20 @@ dump_fragmentation(int UU(f), FT h, int tsv) { } typedef struct { - int f; + int fd; FT h; uint64_t blocksizes; uint64_t leafsizes; uint64_t leafblocks; } frag_help_extra; -static int -nodesizes_helper(BLOCKNUM b, int64_t size, int64_t UU(address), void *extra) { +static int nodesizes_helper(BLOCKNUM b, int64_t size, int64_t UU(address), void *extra) { frag_help_extra *CAST_FROM_VOIDP(info, extra); FTNODE n; FTNODE_DISK_DATA ndd = NULL; struct ftnode_fetch_extra bfe; fill_bfe_for_full_read(&bfe, info->h); - int r = toku_deserialize_ftnode_from(info->f, b, 0 /*pass zero for hash, it doesn't matter*/, &n, &ndd, &bfe); + int r = toku_deserialize_ftnode_from(info->fd, b, 0 /*pass zero for hash, it doesn't matter*/, &n, &ndd, &bfe); if (r==0) { info->blocksizes += size; if (n->height == 0) { @@ -376,11 +369,10 @@ nodesizes_helper(BLOCKNUM b, int64_t size, int64_t UU(address), void *extra) { return 0; } -static void -dump_nodesizes(int f, FT h) { +static void dump_nodesizes(int fd, FT h) { frag_help_extra info; memset(&info, 0, sizeof(info)); - info.f = f; + info.fd = fd; info.h = h; toku_blocktable_iterate(h->blocktable, TRANSLATION_CHECKPOINTED, nodesizes_helper, &info, true, true); @@ -389,36 +381,45 @@ dump_nodesizes(int f, FT h) { printf("leafsizes\t%" PRIu64 "\n", info.leafsizes); } -static void -dump_garbage_stats(int f, FT ft) { - invariant(f == toku_cachefile_get_fd(ft->cf)); +static void dump_garbage_stats(int fd, FT ft) { + assert(fd == toku_cachefile_get_fd(ft->cf)); uint64_t total_space = 0; uint64_t used_space = 0; 
toku_ft_get_garbage(ft, &total_space, &used_space); - printf("total_size\t%" PRIu64 "\n", total_space); - printf("used_size\t%" PRIu64 "\n", used_space); + printf("garbage total size\t%" PRIu64 "\n", total_space); + printf("garbage used size\t%" PRIu64 "\n", used_space); } -static uint32_t -get_unaligned_uint32(unsigned char *p) { - return *(uint32_t *)p; +typedef struct __dump_node_extra { + int fd; + FT h; +} dump_node_extra; + +static int dump_node_wrapper(BLOCKNUM b, int64_t UU(size), int64_t UU(address), void *extra) { + dump_node_extra *CAST_FROM_VOIDP(info, extra); + dump_node(info->fd, b, info->h); + return 0; +} + +static uint32_t get_unaligned_uint32(unsigned char *p) { + uint32_t n; + memcpy(&n, p, sizeof n); + return n; } struct dump_sub_block { - uint32_t compressed_size; - uint32_t uncompressed_size; - uint32_t xsum; + uint32_t compressed_size; + uint32_t uncompressed_size; + uint32_t xsum; }; -static void -sub_block_deserialize(struct dump_sub_block *sb, unsigned char *sub_block_header) { +static void sub_block_deserialize(struct dump_sub_block *sb, unsigned char *sub_block_header) { sb->compressed_size = toku_dtoh32(get_unaligned_uint32(sub_block_header+0)); sb->uncompressed_size = toku_dtoh32(get_unaligned_uint32(sub_block_header+4)); sb->xsum = toku_dtoh32(get_unaligned_uint32(sub_block_header+8)); } -static void -verify_block(unsigned char *cp, uint64_t file_offset, uint64_t size) { +static void verify_block(unsigned char *cp, uint64_t file_offset, uint64_t size) { // verify the header checksum const size_t node_header = 8 + sizeof (uint32_t) + sizeof (uint32_t) + sizeof (uint32_t); @@ -461,24 +462,22 @@ verify_block(unsigned char *cp, uint64_t file_offset, uint64_t size) { printf("offset %u expected %" PRIu64 "\n", offset, size); } -static void -dump_block(int f, BLOCKNUM blocknum, FT h) { +static void dump_block(int fd, BLOCKNUM blocknum, FT h) { DISKOFF offset, size; toku_translate_blocknum_to_offset_size(h->blocktable, blocknum, &offset, 
&size); printf("%" PRId64 " at %" PRId64 " size %" PRId64 "\n", blocknum.b, offset, size); unsigned char *CAST_FROM_VOIDP(vp, toku_malloc(size)); - uint64_t r = pread(f, vp, size, offset); + uint64_t r = pread(fd, vp, size, offset); if (r == (uint64_t)size) { verify_block(vp, offset, size); } toku_free(vp); } -static void -dump_file(int f, uint64_t offset, uint64_t size, FILE *outfp) { +static void dump_file(int fd, uint64_t offset, uint64_t size, FILE *outfp) { unsigned char *XMALLOC_N(size, vp); - uint64_t r = pread(f, vp, size, offset); + uint64_t r = pread(fd, vp, size, offset); if (r == size) { if (outfp == stdout) { hex_dump(vp, offset, size); @@ -490,13 +489,11 @@ dump_file(int f, uint64_t offset, uint64_t size, FILE *outfp) { toku_free(vp); } -static void -set_file(int f, uint64_t offset, unsigned char newc) { - toku_os_pwrite(f, &newc, sizeof newc, offset); +static void set_file(int fd, uint64_t offset, unsigned char newc) { + toku_os_pwrite(fd, &newc, sizeof newc, offset); } -static int -readline (char *line, int maxline) { +static int readline(char *line, int maxline) { int i = 0; int c; while ((c = getchar()) != EOF && c != '\n' && i < maxline) { @@ -506,8 +503,7 @@ readline (char *line, int maxline) { return c == EOF ? 
EOF : i; } -static int -split_fields (char *line, char *fields[], int maxfields) { +static int split_fields(char *line, char *fields[], int maxfields) { int i; for (i=0; if, b, info->h); - return 0; -} - -static void -interactive_help(void) { +static void interactive_help(void) { fprintf(stderr, "help\n"); fprintf(stderr, "header\n"); fprintf(stderr, "node NUMBER\n"); @@ -552,133 +538,160 @@ interactive_help(void) { fprintf(stderr, "quit\n"); } -static uint64_t -getuint64(const char *f) { - if (strncmp(f, "0x", 2) == 0 || strncmp(f, "0X", 2) == 0) - return strtoull(f, 0, 16); - else if (strncmp(f, "0", 1) == 0) - return strtoull(f, 0, 8); - else - return strtoull(f, 0, 10); +static void run_iteractive_loop(int fd, FT ft, CACHEFILE cf) { + while (1) { + printf("ftdump>"); fflush(stdout); + enum { maxline = 64}; + char line[maxline+1]; + int r = readline(line, maxline); + if (r == EOF) + break; + const int maxfields = 4; + char *fields[maxfields]; + int nfields = split_fields(line, fields, maxfields); + if (nfields == 0) + continue; + if (strcmp(fields[0], "help") == 0) { + interactive_help(); + } else if (strcmp(fields[0], "header") == 0) { + toku_ft_free(ft); + open_header(fd, &ft, cf); + dump_header(ft); + } else if (strcmp(fields[0], "block") == 0 && nfields == 2) { + BLOCKNUM blocknum = make_blocknum(getuint64(fields[1])); + dump_block(fd, blocknum, ft); + } else if (strcmp(fields[0], "node") == 0 && nfields == 2) { + BLOCKNUM off = make_blocknum(getuint64(fields[1])); + dump_node(fd, off, ft); + } else if (strcmp(fields[0], "dumpdata") == 0 && nfields == 2) { + do_dump_data = strtol(fields[1], NULL, 10); + } else if (strcmp(fields[0], "block_translation") == 0 || strcmp(fields[0], "bx") == 0) { + uint64_t offset = 0; + if (nfields == 2) + offset = getuint64(fields[1]); + dump_block_translation(ft, offset); + } else if (strcmp(fields[0], "fragmentation") == 0) { + dump_fragmentation(fd, ft, do_tsv); + } else if (strcmp(fields[0], "nodesizes") == 0) { + 
dump_nodesizes(fd, ft); + } else if (strcmp(fields[0], "garbage") == 0) { + dump_garbage_stats(fd, ft); + } else if (strcmp(fields[0], "file") == 0 && nfields >= 3) { + uint64_t offset = getuint64(fields[1]); + uint64_t size = getuint64(fields[2]); + FILE *outfp = stdout; + if (nfields >= 4) + outfp = fopen(fields[3], "w"); + dump_file(fd, offset, size, outfp); + } else if (strcmp(fields[0], "setfile") == 0 && nfields == 3) { + uint64_t offset = getuint64(fields[1]); + unsigned char newc = getuint64(fields[2]); + set_file(fd, offset, newc); + } else if (strcmp(fields[0], "quit") == 0 || strcmp(fields[0], "q") == 0) { + break; + } + } } -int -main (int argc, const char *const argv[]) { - int interactive = 0; - int fragmentation = 0; - int translation_table = 0; - int rootnode = 0; - int tsv = 0; +static int usage(void) { + fprintf(stderr, "Usage: %s ", arg0); + fprintf(stderr, "--interactive "); + fprintf(stderr, "--nodata "); + fprintf(stderr, "--dumpdata 0|1 "); + fprintf(stderr, "--header "); + fprintf(stderr, "--rootnode "); + fprintf(stderr, "--fragmentation "); + fprintf(stderr, "--garbage "); + fprintf(stderr, "--tsv "); + fprintf(stderr, "--translation-table "); + fprintf(stderr, "--tsv "); + fprintf(stderr, "ftfilename \n"); + return 1; +} - const char *arg0 = argv[0]; +int main (int argc, const char *const argv[]) { + arg0 = argv[0]; argc--; argv++; while (argc>0) { - if (strcmp(argv[0], "--nodata") == 0) { - dump_data = 0; - } else if (strcmp(argv[0], "--interactive") == 0 || strcmp(argv[0], "--i") == 0) { - interactive = 1; - } else if (strcmp(argv[0], "--fragmentation") == 0) { - fragmentation = 1; - } else if (strcmp(argv[0], "--tsv") == 0) { - tsv = 1; - } else if (strcmp(argv[0], "--translation-table") == 0) { - translation_table = 1; + if (strcmp(argv[0], "--interactive") == 0 || strcmp(argv[0], "--i") == 0) { + do_interactive = 1; + } else if (strcmp(argv[0], "--nodata") == 0) { + do_dump_data = 0; + } else if (strcmp(argv[0], "--dumpdata") == 0 && 
argc > 1) { + argc--; argv++; + do_dump_data = atoi(argv[0]); + } else if (strcmp(argv[0], "--header") == 0) { + do_header = 1; } else if (strcmp(argv[0], "--rootnode") == 0) { - rootnode = 1; - } else if (strcmp(argv[0], "--help") == 0) { - return usage(arg0); + do_rootnode = 1; + } else if (strcmp(argv[0], "--fragmentation") == 0) { + do_fragmentation = 1; + } else if (strcmp(argv[0], "--garbage") == 0) { + do_garbage = 1; + } else if (strcmp(argv[0], "--tsv") == 0) { + do_tsv = 1; + } else if (strcmp(argv[0], "--translation-table") == 0) { + do_translation_table = 1; + } else if (strcmp(argv[0], "--help") == 0 || strcmp(argv[0], "-?") == 0 || strcmp(argv[0], "-h") == 0) { + return usage(); } else { break; } argc--; argv++; } - if (argc != 1) return usage(arg0); + if (argc != 1) + return usage(); int r = toku_ft_layer_init(); - invariant_zero(r); + assert_zero(r); - const char *n = argv[0]; - int f = open(n, O_RDWR + O_BINARY); assert(f>=0); - FT ft; - // create a cachefile for the header - toku_cachetable_create(&ct, 1<<25, (LSN){0}, 0); - CACHEFILE cf = NULL; - r = toku_cachetable_openfd (&cf, ct, f, n); - assert(r==0); - open_header(f, &ft, cf); - if (!fragmentation && !translation_table) { - // quick fix for now, we want those two to have clean output - dump_header(ft); + fname = argv[0]; + int fd = open(fname, O_RDWR + O_BINARY); + if (fd < 0) { + fprintf(stderr, "%s: can not open %s errno %d\n", arg0, fname, errno); + return 1; } - if (interactive) { - while (1) { - printf("ftdump>"); fflush(stdout); - enum { maxline = 64}; - char line[maxline+1]; - r = readline(line, maxline); - if (r == EOF) - break; - const int maxfields = 4; - char *fields[maxfields]; - int nfields = split_fields(line, fields, maxfields); - if (nfields == 0) - continue; - if (strcmp(fields[0], "help") == 0) { - interactive_help(); - } else if (strcmp(fields[0], "header") == 0) { - toku_ft_free(ft); - open_header(f, &ft, cf); - dump_header(ft); - } else if (strcmp(fields[0], "block") == 
0 && nfields == 2) { - BLOCKNUM blocknum = make_blocknum(getuint64(fields[1])); - dump_block(f, blocknum, ft); - } else if (strcmp(fields[0], "node") == 0 && nfields == 2) { - BLOCKNUM off = make_blocknum(getuint64(fields[1])); - dump_node(f, off, ft); - } else if (strcmp(fields[0], "dumpdata") == 0 && nfields == 2) { - dump_data = strtol(fields[1], NULL, 10); - } else if (strcmp(fields[0], "block_translation") == 0 || strcmp(fields[0], "bx") == 0) { - uint64_t offset = 0; - if (nfields == 2) - offset = getuint64(fields[1]); - dump_block_translation(ft, offset); - } else if (strcmp(fields[0], "fragmentation") == 0) { - dump_fragmentation(f, ft, tsv); - } else if (strcmp(fields[0], "nodesizes") == 0) { - dump_nodesizes(f, ft); - } else if (strcmp(fields[0], "garbage") == 0) { - dump_garbage_stats(f, ft); - } else if (strcmp(fields[0], "file") == 0 && nfields >= 3) { - uint64_t offset = getuint64(fields[1]); - uint64_t size = getuint64(fields[2]); - FILE *outfp = stdout; - if (nfields >= 4) - outfp = fopen(fields[3], "w"); - dump_file(f, offset, size, outfp); - } else if (strcmp(fields[0], "setfile") == 0 && nfields == 3) { - uint64_t offset = getuint64(fields[1]); - unsigned char newc = getuint64(fields[2]); - set_file(f, offset, newc); - } else if (strcmp(fields[0], "quit") == 0 || strcmp(fields[0], "q") == 0) { - break; - } - } - } else if (rootnode) { - dump_node(f, ft->h->root_blocknum, ft); - } else if (fragmentation) { - dump_fragmentation(f, ft, tsv); - } else if (translation_table) { - toku_dump_translation_table_pretty(stdout, ft->blocktable); + + // create a cachefile for the header + CACHETABLE ct = NULL; + toku_cachetable_create(&ct, 1<<25, (LSN){0}, 0); + + CACHEFILE cf = NULL; + r = toku_cachetable_openfd (&cf, ct, fd, fname); + assert_zero(r); + + FT ft = NULL; + open_header(fd, &ft, cf); + + if (do_interactive) { + run_iteractive_loop(fd, ft, cf); } else { - printf("Block translation:"); - - toku_dump_translation_table(stdout, ft->blocktable); - - 
struct __dump_node_extra info; - info.f = f; - info.h = ft; - toku_blocktable_iterate(ft->blocktable, TRANSLATION_CHECKPOINTED, - dump_node_wrapper, &info, true, true); + if (do_header) { + dump_header(ft); + } + if (do_rootnode) { + dump_node(fd, ft->h->root_blocknum, ft); + } + if (do_fragmentation) { + dump_fragmentation(fd, ft, do_tsv); + } + if (do_translation_table) { + toku_dump_translation_table_pretty(stdout, ft->blocktable); + } + if (do_garbage) { + dump_garbage_stats(fd, ft); + } + if (!do_header && !do_rootnode && !do_fragmentation && !do_translation_table && !do_garbage) { + printf("Block translation:"); + + toku_dump_translation_table(stdout, ft->blocktable); + + struct __dump_node_extra info; + info.fd = fd; + info.h = ft; + toku_blocktable_iterate(ft->blocktable, TRANSLATION_CHECKPOINTED, + dump_node_wrapper, &info, true, true); + } } toku_cachefile_close(&cf, false, ZERO_LSN); toku_cachetable_close(&ct); From 676c38a886bbceb179794a2b2982fc08804fadb6 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Sat, 31 May 2014 12:16:56 -0400 Subject: [PATCH 36/46] #245 use row estimate parameter to start_bulk_insert to decide if a loader is used --- storage/tokudb/ha_tokudb.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/tokudb/ha_tokudb.cc b/storage/tokudb/ha_tokudb.cc index ddacb6d1382..020fa3b526d 100644 --- a/storage/tokudb/ha_tokudb.cc +++ b/storage/tokudb/ha_tokudb.cc @@ -3250,7 +3250,7 @@ void ha_tokudb::start_bulk_insert(ha_rows rows) { num_DBs_locked_in_bulk = true; lock_count = 0; - if (share->try_table_lock) { + if ((rows == 0 || rows > 1) && share->try_table_lock) { if (get_prelock_empty(thd) && may_table_be_empty(transaction)) { if (using_ignore || is_insert_ignore(thd) || thd->lex->duplicates != DUP_ERROR) { acquire_table_lock(transaction, lock_write); From 454e974ce453bc57748f4029827c540833e45d52 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Sun, 1 Jun 2014 07:49:28 -0400 Subject: [PATCH 37/46] #225 hot 
optimize for 5.6 and 10.0 using alter recreate --- storage/tokudb/ha_tokudb.cc | 41 +++++++++++----------------- storage/tokudb/ha_tokudb.h | 10 +------ storage/tokudb/ha_tokudb_admin.cc | 38 +++++++++++++++++++++----- storage/tokudb/ha_tokudb_alter_56.cc | 11 ++++++++ storage/tokudb/hatoku_defines.h | 10 ++++--- 5 files changed, 65 insertions(+), 45 deletions(-) diff --git a/storage/tokudb/ha_tokudb.cc b/storage/tokudb/ha_tokudb.cc index 020fa3b526d..292e94dca22 100644 --- a/storage/tokudb/ha_tokudb.cc +++ b/storage/tokudb/ha_tokudb.cc @@ -6299,7 +6299,7 @@ uint32_t ha_tokudb::get_cursor_isolation_flags(enum thr_lock_type lock_type, THD lock (if we don't want to use MySQL table locks at all) or add locks for many tables (like we do when we are using a MERGE handler). - Tokudb DB changes all WRITE locks to TL_WRITE_ALLOW_WRITE (which + TokuDB changes all WRITE locks to TL_WRITE_ALLOW_WRITE (which signals that we are doing WRITES, but we are still allowing other reader's and writer's. @@ -6321,31 +6321,22 @@ THR_LOCK_DATA **ha_tokudb::store_lock(THD * thd, THR_LOCK_DATA ** to, enum thr_l } if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) { - // if creating a hot index - if (thd_sql_command(thd)== SQLCOM_CREATE_INDEX && get_create_index_online(thd)) { - rw_rdlock(&share->num_DBs_lock); - if (share->num_DBs == (table->s->keys + tokudb_test(hidden_primary_key))) { - lock_type = TL_WRITE_ALLOW_WRITE; - } - lock.type = lock_type; - rw_unlock(&share->num_DBs_lock); - } - - // 5.5 supports reads concurrent with alter table. just use the default lock type. 
-#if MYSQL_VERSION_ID < 50500 - else if (thd_sql_command(thd)== SQLCOM_CREATE_INDEX || - thd_sql_command(thd)== SQLCOM_ALTER_TABLE || - thd_sql_command(thd)== SQLCOM_DROP_INDEX) { - // force alter table to lock out other readers - lock_type = TL_WRITE; - lock.type = lock_type; - } -#endif - else { - // If we are not doing a LOCK TABLE, then allow multiple writers - if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) && - !thd->in_lock_tables && thd_sql_command(thd) != SQLCOM_TRUNCATE && !thd_tablespace_op(thd)) { + enum_sql_command sql_command = (enum_sql_command) thd_sql_command(thd); + if (!thd->in_lock_tables) { + if (sql_command == SQLCOM_CREATE_INDEX && get_create_index_online(thd)) { + // hot indexing + rw_rdlock(&share->num_DBs_lock); + if (share->num_DBs == (table->s->keys + tokudb_test(hidden_primary_key))) { + lock_type = TL_WRITE_ALLOW_WRITE; + } + rw_unlock(&share->num_DBs_lock); + } else if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) && + sql_command != SQLCOM_TRUNCATE && !thd_tablespace_op(thd)) { + // allow concurrent writes lock_type = TL_WRITE_ALLOW_WRITE; + } else if (sql_command == SQLCOM_OPTIMIZE && lock_type == TL_READ_NO_INSERT) { + // hot optimize table + lock_type = TL_READ; } lock.type = lock_type; } diff --git a/storage/tokudb/ha_tokudb.h b/storage/tokudb/ha_tokudb.h index 1ad602ff9b0..f75d75bee4c 100644 --- a/storage/tokudb/ha_tokudb.h +++ b/storage/tokudb/ha_tokudb.h @@ -109,15 +109,6 @@ typedef struct loader_context { ha_tokudb* ha; } *LOADER_CONTEXT; -typedef struct hot_optimize_context { - THD *thd; - char* write_status_msg; - ha_tokudb *ha; - uint progress_stage; - uint current_table; - uint num_tables; -} *HOT_OPTIMIZE_CONTEXT; - // // This object stores table information that is to be shared // among all ha_tokudb objects. 
@@ -805,6 +796,7 @@ private: void remove_from_trx_handler_list(); private: + int do_optimize(THD *thd); int map_to_handler_error(int error); }; diff --git a/storage/tokudb/ha_tokudb_admin.cc b/storage/tokudb/ha_tokudb_admin.cc index 8d202eeda41..3f8f2b92662 100644 --- a/storage/tokudb/ha_tokudb_admin.cc +++ b/storage/tokudb/ha_tokudb_admin.cc @@ -130,6 +130,12 @@ int ha_tokudb::analyze(THD *thd, HA_CHECK_OPT *check_opt) { TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); uint64_t rec_per_key[table_share->key_parts]; int result = HA_ADMIN_OK; + + // stub out analyze if optimize is remapped to alter recreate + analyze + if (thd_sql_command(thd) != SQLCOM_ANALYZE) { + TOKUDB_HANDLER_DBUG_RETURN(result); + } + DB_TXN *txn = transaction; if (!txn) { result = HA_ADMIN_FAILED; @@ -171,6 +177,15 @@ int ha_tokudb::analyze(THD *thd, HA_CHECK_OPT *check_opt) { TOKUDB_HANDLER_DBUG_RETURN(result); } +typedef struct hot_optimize_context { + THD *thd; + char* write_status_msg; + ha_tokudb *ha; + uint progress_stage; + uint current_table; + uint num_tables; +} *HOT_OPTIMIZE_CONTEXT; + static int hot_poll_fun(void *extra, float progress) { HOT_OPTIMIZE_CONTEXT context = (HOT_OPTIMIZE_CONTEXT)extra; if (context->thd->killed) { @@ -193,10 +208,11 @@ static int hot_poll_fun(void *extra, float progress) { return 0; } -// flatten all DB's in this table, to do so, peform hot optimize on each db -int ha_tokudb::optimize(THD * thd, HA_CHECK_OPT * check_opt) { - TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); +volatile int ha_tokudb_optimize_wait; +// flatten all DB's in this table, to do so, peform hot optimize on each db +int ha_tokudb::do_optimize(THD *thd) { + TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); int error; uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key); @@ -206,9 +222,7 @@ int ha_tokudb::optimize(THD * thd, HA_CHECK_OPT * check_opt) { thd_progress_init(thd, curr_num_DBs); #endif - // // for each DB, run optimize and hot_optimize - // for (uint 
i = 0; i < curr_num_DBs; i++) { DB* db = share->key_file[i]; error = db->optimize(db); @@ -228,14 +242,24 @@ int ha_tokudb::optimize(THD * thd, HA_CHECK_OPT * check_opt) { goto cleanup; } } - error = 0; -cleanup: +cleanup: + while (ha_tokudb_optimize_wait) sleep(1); #ifdef HA_TOKUDB_HAS_THD_PROGRESS thd_progress_end(thd); #endif + TOKUDB_HANDLER_DBUG_RETURN(error); +} +int ha_tokudb::optimize(THD *thd, HA_CHECK_OPT *check_opt) { + TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); + int error; +#if TOKU_OPTIMIZE_WITH_RECREATE + error = HA_ADMIN_TRY_ALTER; +#else + error = do_optimize(thd); +#endif TOKUDB_HANDLER_DBUG_RETURN(error); } diff --git a/storage/tokudb/ha_tokudb_alter_56.cc b/storage/tokudb/ha_tokudb_alter_56.cc index 5289779bb32..5062a2ae67b 100644 --- a/storage/tokudb/ha_tokudb_alter_56.cc +++ b/storage/tokudb/ha_tokudb_alter_56.cc @@ -122,6 +122,7 @@ public: expand_varchar_update_needed(false), expand_fixed_update_needed(false), expand_blob_update_needed(false), + optimize_needed(false), table_kc_info(NULL), altered_table_kc_info(NULL) { } @@ -141,6 +142,7 @@ public: bool expand_varchar_update_needed; bool expand_fixed_update_needed; bool expand_blob_update_needed; + bool optimize_needed; Dynamic_array changed_fields; KEY_AND_COL_INFO *table_kc_info; KEY_AND_COL_INFO *altered_table_kc_info; @@ -439,7 +441,13 @@ enum_alter_inplace_result ha_tokudb::check_if_supported_inplace_alter(TABLE *alt result = HA_ALTER_INPLACE_EXCLUSIVE_LOCK; } } + } +#if TOKU_OPTIMIZE_WITH_RECREATE + else if (only_flags(ctx->handler_flags, Alter_inplace_info::RECREATE_TABLE + Alter_inplace_info::ALTER_COLUMN_DEFAULT)) { + ctx->optimize_needed = true; + result = HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE; } +#endif if (result != HA_ALTER_INPLACE_NOT_SUPPORTED && table->s->null_bytes != altered_table->s->null_bytes && (tokudb_debug & TOKUDB_DEBUG_ALTER_TABLE)) { @@ -522,6 +530,9 @@ bool ha_tokudb::inplace_alter_table(TABLE *altered_table, Alter_inplace_info *ha if (error == 0 && 
ctx->reset_card) { error = tokudb::set_card_from_status(share->status_block, ctx->alter_txn, table->s, altered_table->s); } + if (error == 0 && ctx->optimize_needed) { + error = do_optimize(ha_thd()); + } #if (50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \ (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799) diff --git a/storage/tokudb/hatoku_defines.h b/storage/tokudb/hatoku_defines.h index 444ae425b2d..a24aade6bb8 100644 --- a/storage/tokudb/hatoku_defines.h +++ b/storage/tokudb/hatoku_defines.h @@ -112,6 +112,7 @@ PATENT RIGHTS GRANT: #define TOKU_INCLUDE_EXTENDED_KEYS 1 #endif #define TOKU_INCLUDE_OPTION_STRUCTS 1 +#define TOKU_OPTIMIZE_WITH_RECREATE 1 #elif 50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799 // mysql 5.7 with no patches @@ -134,17 +135,18 @@ PATENT RIGHTS GRANT: #define TOKU_PARTITION_WRITE_FRM_DATA 0 #else // mysql 5.6 with tokutek patches -#define TOKU_USE_DB_TYPE_TOKUDB 1 /* has DB_TYPE_TOKUDB patch */ +#define TOKU_USE_DB_TYPE_TOKUDB 1 // has DB_TYPE_TOKUDB patch #define TOKU_INCLUDE_ALTER_56 1 -#define TOKU_INCLUDE_ROW_TYPE_COMPRESSION 1 /* has tokudb row format compression patch */ -#define TOKU_INCLUDE_XA 1 /* has patch that fixes TC_LOG_MMAP code */ +#define TOKU_INCLUDE_ROW_TYPE_COMPRESSION 1 // has tokudb row format compression patch +#define TOKU_INCLUDE_XA 1 // has patch that fixes TC_LOG_MMAP code #define TOKU_PARTITION_WRITE_FRM_DATA 0 #define TOKU_INCLUDE_WRITE_FRM_DATA 0 -#define TOKU_INCLUDE_UPSERT 1 /* has tokudb upsert patch */ +#define TOKU_INCLUDE_UPSERT 1 // has tokudb upsert patch #if defined(HTON_SUPPORTS_EXTENDED_KEYS) #define TOKU_INCLUDE_EXTENDED_KEYS 1 #endif #endif +#define TOKU_OPTIMIZE_WITH_RECREATE 1 #elif 50500 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50599 // mysql 5.5 and mariadb 5.5 From 39ef4081659aa20783d2cbfd8e85a7399434a46e Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Sun, 1 Jun 2014 12:48:53 -0400 Subject: [PATCH 38/46] #225 fix tokudb store lock to fix lock tables crash --- 
storage/tokudb/ha_tokudb.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/tokudb/ha_tokudb.cc b/storage/tokudb/ha_tokudb.cc index 292e94dca22..f64e4821108 100644 --- a/storage/tokudb/ha_tokudb.cc +++ b/storage/tokudb/ha_tokudb.cc @@ -6338,8 +6338,8 @@ THR_LOCK_DATA **ha_tokudb::store_lock(THD * thd, THR_LOCK_DATA ** to, enum thr_l // hot optimize table lock_type = TL_READ; } - lock.type = lock_type; } + lock.type = lock_type; } *to++ = &lock; if (tokudb_debug & TOKUDB_DEBUG_LOCK) From af53db60ccbdbe138b68183c80869ed6e14d345d Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Mon, 2 Jun 2014 16:24:47 -0400 Subject: [PATCH 39/46] #248 install PS+TokuDB tarballs --- scripts/setup.mysql.bash | 79 ++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/scripts/setup.mysql.bash b/scripts/setup.mysql.bash index e97e4a4f562..85132350289 100755 --- a/scripts/setup.mysql.bash +++ b/scripts/setup.mysql.bash @@ -5,6 +5,39 @@ function usage() { echo "--mysqlbuild=$mysqlbuild --shutdown=$shutdown --install=$install --startup=$startup" } +function download_file() { + local file=$1 + s3get $s3bucket $file $file +} + +function download_tarball() { + local tarball=$1 + if [ ! -f $tarball ] ; then + download_file $tarball + if [ $? -ne 0 ] ; then test 0 = 1; return; fi + fi + if [ ! -f $tarball.md5 ] ; then + download_file $tarball.md5 + if [ $? -ne 0 ] ; then test 0 = 1; return; fi + fi +} + +function install_tarball() { + local basedir=$1; local tarball=$2 + tar -x -z -f $basedir/$tarball + if [ $? -ne 0 ] ; then test 0 = 1; return; fi +} + +function check_md5() { + local tarball=$1 + md5sum --check $tarball.md5 + if [ $? -ne 0 ] ; then + # support jacksum md5 output which is almost the same as md5sum + diff -b <(cat $tarball.md5) <(md5sum $tarball) + if [ $? 
-ne 0 ] ; then test 0 = 1; return; fi + fi +} + mysqlbuild= shutdown=1 install=1 @@ -64,30 +97,24 @@ basedir=$PWD mysqltarball=$mysqlbuild.tar.gz -if [ -f $mysqlbuild.tar.gz ] ; then - compression=-z - mysqltarball=$mysqlbuild.tar.gz -elif [ -f $mysqlbuild.tar.bz2 ] ; then - compression=-j - mysqltarball=$mysqlbuild.tar.bz2 -fi - -# get the release -if [ ! -f $mysqltarball ] ; then - s3get $s3bucket $mysqltarball $mysqltarball - if [ $? -ne 0 ] ; then exit 1; fi -fi -if [ ! -f $mysqltarball.md5 ] ; then - s3get $s3bucket $mysqltarball.md5 $mysqltarball.md5 - if [ $? -ne 0 ] ; then exit 1; fi -fi +# get the tarball +download_tarball $mysqltarball +if [ $? -ne 0 ] ; then exit 1; fi # check the md5 sum -md5sum --check $mysqltarball.md5 -if [ $? -ne 0 ] ; then - # support jacksum md5 output which is almost the same as md5sum - diff -b <(cat $mysqltarball.md5) <(md5sum $mysqltarball) - if [ $? -ne 0 ] ; then exit 1; fi +check_md5 $mysqltarball +if [ $? -ne 0 ] ; then exit 1; fi + +tokudbtarball="" +if [[ $mysqltarball =~ ^(Percona-Server.*)\.(Linux\.x86_64.*)$ ]] ; then + tar tzf $mysqltarball | egrep ha_tokudb.so >/dev/null 2>&1 + if [ $? -ne 0 ] ; then + tokudbtarball=${BASH_REMATCH[1]}.TokuDB.${BASH_REMATCH[2]} + download_tarball $tokudbtarball + if [ $? -ne 0 ] ; then exit 1; fi + check_md5 $tokudbtarball + if [ $? -ne 0 ] ; then exit 1; fi + fi fi # set ldpath @@ -126,8 +153,14 @@ if [ ! -d $mysqlbuild ] || [ $install -ne 0 ] ; then rm mysql if [ -d $mysqlbuild ] ; then $sudo rm -rf $mysqlbuild; fi - tar -x $compression -f $basedir/$mysqltarball + install_tarball $basedir $mysqltarball if [ $? -ne 0 ] ; then exit 1; fi + + if [ $tokudbtarball ] ; then + install_tarball $basedir $tokudbtarball + if [ $? -ne 0 ] ; then exit 1; fi + fi + ln -s $mysqldir /usr/local/mysql if [ $? 
-ne 0 ] ; then exit 1; fi ln -s $mysqldir /usr/local/$mysqlbuild From 2f948975f6e6064db0b37735b2198ae1740f434a Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Wed, 4 Jun 2014 10:44:15 -0400 Subject: [PATCH 40/46] #250 restore proc info to valid pointers in commit, abort, analyze, and optimize --- storage/tokudb/ha_tokudb.cc | 15 +++++++-------- storage/tokudb/ha_tokudb_admin.cc | 11 ++++++----- storage/tokudb/hatoku_hton.cc | 10 ++++++---- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/storage/tokudb/ha_tokudb.cc b/storage/tokudb/ha_tokudb.cc index f64e4821108..9dc724a9dee 100644 --- a/storage/tokudb/ha_tokudb.cc +++ b/storage/tokudb/ha_tokudb.cc @@ -469,7 +469,6 @@ typedef struct index_read_info { DBT* orig_key; } *INDEX_READ_INFO; - static int ai_poll_fun(void *extra, float progress) { LOADER_CONTEXT context = (LOADER_CONTEXT)extra; if (context->thd->killed) { @@ -3322,10 +3321,10 @@ int ha_tokudb::end_bulk_insert(bool abort) { if (loader) { if (!abort_loader && !thd->killed) { DBUG_EXECUTE_IF("tokudb_end_bulk_insert_sleep", { - const char *old_proc_info = tokudb_thd_get_proc_info(thd); + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); thd_proc_info(thd, "DBUG sleep"); my_sleep(20000000); - thd_proc_info(thd, old_proc_info); + thd_proc_info(thd, orig_proc_info); }); error = loader->close(loader); loader = NULL; @@ -3398,7 +3397,7 @@ int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_in uint64_t cnt = 0; char status_msg[MAX_ALIAS_NAME + 200]; //buffer of 200 should be a good upper bound. 
THD* thd = ha_thd(); - const char *old_proc_info = tokudb_thd_get_proc_info(thd); + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); memset(&key1, 0, sizeof(key1)); memset(&key2, 0, sizeof(key2)); memset(&val, 0, sizeof(val)); @@ -3475,7 +3474,7 @@ int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_in error = 0; cleanup: - thd_proc_info(thd, old_proc_info); + thd_proc_info(thd, orig_proc_info); if (tmp_cursor1) { tmp_cursor1->c_close(tmp_cursor1); tmp_cursor1 = NULL; @@ -7429,7 +7428,7 @@ int ha_tokudb::tokudb_add_index( DBC* tmp_cursor = NULL; int cursor_ret_val = 0; DBT curr_pk_key, curr_pk_val; - THD* thd = ha_thd(); + THD* thd = ha_thd(); DB_LOADER* loader = NULL; DB_INDEXER* indexer = NULL; bool loader_save_space = get_load_save_space(thd); @@ -7467,7 +7466,7 @@ int ha_tokudb::tokudb_add_index( // // status message to be shown in "show process list" // - const char *old_proc_info = tokudb_thd_get_proc_info(thd); + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); char status_msg[MAX_ALIAS_NAME + 200]; //buffer of 200 should be a good upper bound. ulonglong num_processed = 0; //variable that stores number of elements inserted thus far thd_proc_info(thd, "Adding indexes"); @@ -7785,7 +7784,7 @@ cleanup: another transaction has accessed the table. \ To add indexes, make sure no transactions touch the table.", share->table_name); } - thd_proc_info(thd, old_proc_info); + thd_proc_info(thd, orig_proc_info); TOKUDB_HANDLER_DBUG_RETURN(error ? 
error : loader_error); } diff --git a/storage/tokudb/ha_tokudb_admin.cc b/storage/tokudb/ha_tokudb_admin.cc index 3f8f2b92662..80ae149dba3 100644 --- a/storage/tokudb/ha_tokudb_admin.cc +++ b/storage/tokudb/ha_tokudb_admin.cc @@ -128,6 +128,7 @@ static int analyze_progress(void *v_extra, uint64_t rows) { int ha_tokudb::analyze(THD *thd, HA_CHECK_OPT *check_opt) { TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); uint64_t rec_per_key[table_share->key_parts]; int result = HA_ADMIN_OK; @@ -174,6 +175,7 @@ int ha_tokudb::analyze(THD *thd, HA_CHECK_OPT *check_opt) { if (error) result = HA_ADMIN_FAILED; } + thd_proc_info(thd, orig_proc_info); TOKUDB_HANDLER_DBUG_RETURN(result); } @@ -254,12 +256,14 @@ cleanup: int ha_tokudb::optimize(THD *thd, HA_CHECK_OPT *check_opt) { TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); int error; #if TOKU_OPTIMIZE_WITH_RECREATE error = HA_ADMIN_TRY_ALTER; #else error = do_optimize(thd); #endif + thd_proc_info(thd, orig_proc_info); TOKUDB_HANDLER_DBUG_RETURN(error); } @@ -290,10 +294,7 @@ static void ha_tokudb_check_info(THD *thd, TABLE *table, const char *msg) { int ha_tokudb::check(THD *thd, HA_CHECK_OPT *check_opt) { TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); - - const char *old_proc_info = tokudb_thd_get_proc_info(thd); - thd_proc_info(thd, "tokudb::check"); - + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); int result = HA_ADMIN_OK; int r; @@ -345,6 +346,6 @@ int ha_tokudb::check(THD *thd, HA_CHECK_OPT *check_opt) { } } } - thd_proc_info(thd, old_proc_info); + thd_proc_info(thd, orig_proc_info); TOKUDB_HANDLER_DBUG_RETURN(result); } diff --git a/storage/tokudb/hatoku_hton.cc b/storage/tokudb/hatoku_hton.cc index 9a6eef92d6a..ad80b85e831 100644 --- a/storage/tokudb/hatoku_hton.cc +++ b/storage/tokudb/hatoku_hton.cc @@ -685,25 +685,27 @@ static void txn_progress_func(TOKU_TXN_PROGRESS 
progress, void* extra) { } static void commit_txn_with_progress(DB_TXN* txn, uint32_t flags, THD* thd) { - int r; + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); struct txn_progress_info info; info.thd = thd; - r = txn->commit_with_progress(txn, flags, txn_progress_func, &info); + int r = txn->commit_with_progress(txn, flags, txn_progress_func, &info); if (r != 0) { sql_print_error("tried committing transaction %p and got error code %d", txn, r); } assert(r == 0); + thd_proc_info(thd, orig_proc_info); } static void abort_txn_with_progress(DB_TXN* txn, THD* thd) { - int r; + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); struct txn_progress_info info; info.thd = thd; - r = txn->abort_with_progress(txn, txn_progress_func, &info); + int r = txn->abort_with_progress(txn, txn_progress_func, &info); if (r != 0) { sql_print_error("tried aborting transaction %p and got error code %d", txn, r); } assert(r == 0); + thd_proc_info(thd, orig_proc_info); } static void tokudb_cleanup_handlers(tokudb_trx_data *trx, DB_TXN *txn) { From 5cd670f646f413a4b720884dbcbe2ff8c7ac9151 Mon Sep 17 00:00:00 2001 From: Zardosht Kasheff Date: Wed, 4 Jun 2014 15:34:08 -0400 Subject: [PATCH 41/46] refs #255, have the fsync_log minicron shutdown before we close the logger in env_close --- src/ydb.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ydb.cc b/src/ydb.cc index 4a01c37bea6..df4fd6baf87 100644 --- a/src/ydb.cc +++ b/src/ydb.cc @@ -1160,6 +1160,7 @@ env_close(DB_ENV * env, uint32_t flags) { goto panic_and_quit_early; } } + env_fsync_log_cron_destroy(env); if (env->i->cachetable) { toku_cachetable_minicron_shutdown(env->i->cachetable); if (env->i->logger) { @@ -1200,7 +1201,6 @@ env_close(DB_ENV * env, uint32_t flags) { } env_fs_destroy(env); - env_fsync_log_cron_destroy(env); env->i->ltm.destroy(); if (env->i->data_dir) toku_free(env->i->data_dir); From 9f3d1221407ebd189cfea0405cb150cc29b6af9f Mon Sep 17 00:00:00 2001 From: Zardosht Kasheff 
Date: Wed, 4 Jun 2014 15:40:35 -0400 Subject: [PATCH 42/46] Revert "refs #255, have the fsync_log minicron shutdown before we close the logger in env_close" This reverts commit c58c4949b8b12969db37d00810891a4490cb5fe9. --- src/ydb.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ydb.cc b/src/ydb.cc index df4fd6baf87..4a01c37bea6 100644 --- a/src/ydb.cc +++ b/src/ydb.cc @@ -1160,7 +1160,6 @@ env_close(DB_ENV * env, uint32_t flags) { goto panic_and_quit_early; } } - env_fsync_log_cron_destroy(env); if (env->i->cachetable) { toku_cachetable_minicron_shutdown(env->i->cachetable); if (env->i->logger) { @@ -1201,6 +1200,7 @@ env_close(DB_ENV * env, uint32_t flags) { } env_fs_destroy(env); + env_fsync_log_cron_destroy(env); env->i->ltm.destroy(); if (env->i->data_dir) toku_free(env->i->data_dir); From bb70a4daa910cc9e232cfd38d73b8df94a07887d Mon Sep 17 00:00:00 2001 From: Zardosht Kasheff Date: Wed, 4 Jun 2014 15:42:11 -0400 Subject: [PATCH 43/46] refs #255, have the fsync_log minicron shutdown before we close the logger in env_close --- src/ydb.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ydb.cc b/src/ydb.cc index 4a01c37bea6..df4fd6baf87 100644 --- a/src/ydb.cc +++ b/src/ydb.cc @@ -1160,6 +1160,7 @@ env_close(DB_ENV * env, uint32_t flags) { goto panic_and_quit_early; } } + env_fsync_log_cron_destroy(env); if (env->i->cachetable) { toku_cachetable_minicron_shutdown(env->i->cachetable); if (env->i->logger) { @@ -1200,7 +1201,6 @@ env_close(DB_ENV * env, uint32_t flags) { } env_fs_destroy(env); - env_fsync_log_cron_destroy(env); env->i->ltm.destroy(); if (env->i->data_dir) toku_free(env->i->data_dir); From f16b3122abe9d509a588450d7bf5372d75ec6b4d Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Tue, 10 Jun 2014 18:55:57 -0400 Subject: [PATCH 44/46] #252 fix MDEV-6324 uninit var in discover3 --- storage/tokudb/hatoku_defines.h | 2 +- storage/tokudb/hatoku_hton.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) 
diff --git a/storage/tokudb/hatoku_defines.h b/storage/tokudb/hatoku_defines.h index a24aade6bb8..fcaeb2e4118 100644 --- a/storage/tokudb/hatoku_defines.h +++ b/storage/tokudb/hatoku_defines.h @@ -161,7 +161,7 @@ PATENT RIGHTS GRANT: #if defined(MARIADB_BASE_VERSION) #define TOKU_INCLUDE_EXTENDED_KEYS 1 #endif -#define TOKU_INCLUDE_HANDLERTON_HANDLE_FATAL_SIGNAL 1 +#define TOKU_INCLUDE_HANDLERTON_HANDLE_FATAL_SIGNAL 0 #else #error diff --git a/storage/tokudb/hatoku_hton.cc b/storage/tokudb/hatoku_hton.cc index ad80b85e831..276fc096d3f 100644 --- a/storage/tokudb/hatoku_hton.cc +++ b/storage/tokudb/hatoku_hton.cc @@ -969,7 +969,7 @@ static int tokudb_discover3(handlerton *hton, THD* thd, const char *db, const ch HA_METADATA_KEY curr_key = hatoku_frm_data; DBT key = {}; DBT value = {}; - bool do_commit; + bool do_commit = false; #if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099 tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); From eb00a1221ecf7511606d8fdfdd701528ba6ea8a6 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Fri, 13 Jun 2014 12:06:05 -0400 Subject: [PATCH 45/46] #250 reset thd proc info in end_bulk_insert to fix invalid proc info pointer inside of a deleted ha_tokudb object --- storage/tokudb/ha_tokudb.cc | 1 + storage/tokudb/ha_tokudb_admin.cc | 7 ++----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/storage/tokudb/ha_tokudb.cc b/storage/tokudb/ha_tokudb.cc index 9dc724a9dee..43de1c05aa5 100644 --- a/storage/tokudb/ha_tokudb.cc +++ b/storage/tokudb/ha_tokudb.cc @@ -3382,6 +3382,7 @@ cleanup: } } trx->stmt_progress.using_loader = false; + thd_proc_info(thd, 0); TOKUDB_HANDLER_DBUG_RETURN(error ? 
error : loader_error); } diff --git a/storage/tokudb/ha_tokudb_admin.cc b/storage/tokudb/ha_tokudb_admin.cc index 80ae149dba3..100c88a76a8 100644 --- a/storage/tokudb/ha_tokudb_admin.cc +++ b/storage/tokudb/ha_tokudb_admin.cc @@ -210,11 +210,10 @@ static int hot_poll_fun(void *extra, float progress) { return 0; } -volatile int ha_tokudb_optimize_wait; - // flatten all DB's in this table, to do so, peform hot optimize on each db int ha_tokudb::do_optimize(THD *thd) { TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); int error; uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key); @@ -247,23 +246,21 @@ int ha_tokudb::do_optimize(THD *thd) { error = 0; cleanup: - while (ha_tokudb_optimize_wait) sleep(1); #ifdef HA_TOKUDB_HAS_THD_PROGRESS thd_progress_end(thd); #endif + thd_proc_info(thd, orig_proc_info); TOKUDB_HANDLER_DBUG_RETURN(error); } int ha_tokudb::optimize(THD *thd, HA_CHECK_OPT *check_opt) { TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); - const char *orig_proc_info = tokudb_thd_get_proc_info(thd); int error; #if TOKU_OPTIMIZE_WITH_RECREATE error = HA_ADMIN_TRY_ALTER; #else error = do_optimize(thd); #endif - thd_proc_info(thd, orig_proc_info); TOKUDB_HANDLER_DBUG_RETURN(error); } From 4a993fc93f16fd478c2351799fbf8d02dbbc2209 Mon Sep 17 00:00:00 2001 From: Rich Prohaska Date: Fri, 13 Jun 2014 14:43:36 -0400 Subject: [PATCH 46/46] #252 fix the MDEV-6324 fix --- storage/tokudb/hatoku_defines.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/tokudb/hatoku_defines.h b/storage/tokudb/hatoku_defines.h index fcaeb2e4118..a24aade6bb8 100644 --- a/storage/tokudb/hatoku_defines.h +++ b/storage/tokudb/hatoku_defines.h @@ -161,7 +161,7 @@ PATENT RIGHTS GRANT: #if defined(MARIADB_BASE_VERSION) #define TOKU_INCLUDE_EXTENDED_KEYS 1 #endif -#define TOKU_INCLUDE_HANDLERTON_HANDLE_FATAL_SIGNAL 0 +#define TOKU_INCLUDE_HANDLERTON_HANDLE_FATAL_SIGNAL 1 #else #error