diff --git a/storage/tokudb/CMakeLists.in b/storage/tokudb/CMakeLists.in deleted file mode 100644 index 20c05126841..00000000000 --- a/storage/tokudb/CMakeLists.in +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (C) 2006 MySQL AB -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; version 2 of the License. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTOKUDB_VERSION=\\\"TOKUDB_VERSION_REPLACE_ME\\\"") -SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DSAFEMALLOC -DSAFE_MUTEX") -SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DSAFEMALLOC -DSAFE_MUTEX") - -INCLUDE_DIRECTORIES(TOKUDB_DIR_REPLACE_ME/windows - TOKUDB_DIR_REPLACE_ME/src - TOKUDB_DIR_REPLACE_ME/include - TOKUDB_DIR_REPLACE_ME/toku_include) - -INCLUDE("${PROJECT_SOURCE_DIR}/storage/mysql_storage_engine.cmake") -SET(TOKUDB_SOURCES hatoku_hton.cc ha_tokudb.cc hatoku_cmp.cc) -MYSQL_STORAGE_ENGINE(TOKUDB) - -TARGET_LINK_LIBRARIES(ha_tokudb PowrProf optimized TOKUDB_OBJ_DIR_REPLACE_ME/opt/ipo_libtokudb optimized TOKUDB_OBJ_DIR_REPLACE_ME/opt/libtokuportability debug TOKUDB_OBJ_DIR_REPLACE_ME/debug/static_libtokudb debug TOKUDB_OBJ_DIR_REPLACE_ME/debug/libtokuportability) diff --git a/storage/tokudb/README.md b/storage/tokudb/README.md index 7d4ebcefce1..1deb3699c5a 100644 --- a/storage/tokudb/README.md +++ b/storage/tokudb/README.md @@ -24,14 +24,14 @@ working MySQL or MariaDB with Tokutek patches, and with the TokuDB storage engine, 
called `make.mysql.bash`. This script will download copies of the needed source code from github and build everything. -To build MySQL 5.5.36 with TokuDB 7.1.5: +To build MySQL 5.5.37 with TokuDB 7.1.6: ```sh -scripts/make.mysql.bash --mysqlbuild=mysql-5.5.36-tokudb-7.1.5-linux-x86_64 +scripts/make.mysql.bash --mysqlbuild=mysql-5.5.37-tokudb-7.1.6-linux-x86_64 ``` -To build MariaDB 5.5.36 with TokuDB 7.1.5: +To build MariaDB 5.5.37 with TokuDB 7.1.6: ```sh -scripts/make.mysql.bash --mysqlbuild=mariadb-5.5.36-tokudb-7.1.5-linux-x86_64 +scripts/make.mysql.bash --mysqlbuild=mariadb-5.5.37-tokudb-7.1.6-linux-x86_64 ``` Before you start, make sure you have a C++11-compatible compiler (GCC >= diff --git a/storage/tokudb/ft-index/CMakeLists.txt b/storage/tokudb/ft-index/CMakeLists.txt index 1228da8c35d..f28e7745295 100644 --- a/storage/tokudb/ft-index/CMakeLists.txt +++ b/storage/tokudb/ft-index/CMakeLists.txt @@ -6,6 +6,31 @@ project(TokuDB) set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS "") set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "") +## Versions of gcc >= 4.9.0 require special version of 'ar' and 'ranlib' for +## link-time optimizations to work properly. +## +## From https://gcc.gnu.org/gcc-4.9/changes.html: +## +## When using a linker plugin, compiling with the -flto option now +## generates slim objects files (.o) which only contain intermediate +## language representation for LTO. Use -ffat-lto-objects to create +## files which contain additionally the object code. To generate +## static libraries suitable for LTO processing, use gcc-ar and +## gcc-ranlib; to list symbols from a slim object file use +## gcc-nm. (Requires that ar, ranlib and nm have been compiled with +## plugin support.) 
+if ((CMAKE_CXX_COMPILER_ID STREQUAL GNU) AND + NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9.0")) + find_program(gcc_ar "gcc-ar") + if (gcc_ar) + set(CMAKE_AR "${gcc_ar}") + endif () + find_program(gcc_ranlib "gcc-ranlib") + if (gcc_ranlib) + set(CMAKE_RANLIB "${gcc_ranlib}") + endif () +endif() + include(TokuFeatureDetection) include(TokuSetupCompiler) include(TokuSetupCTest) diff --git a/storage/tokudb/ft-index/cmake_modules/TokuThirdParty.cmake b/storage/tokudb/ft-index/cmake_modules/TokuThirdParty.cmake index 461390ffb7c..cb474c385af 100644 --- a/storage/tokudb/ft-index/cmake_modules/TokuThirdParty.cmake +++ b/storage/tokudb/ft-index/cmake_modules/TokuThirdParty.cmake @@ -3,35 +3,34 @@ include(ExternalProject) if (CMAKE_PROJECT_NAME STREQUAL TokuDB) ## add jemalloc with an external project set(JEMALLOC_SOURCE_DIR "${TokuDB_SOURCE_DIR}/third_party/jemalloc" CACHE FILEPATH "Where to find jemalloc sources.") - if (NOT EXISTS "${JEMALLOC_SOURCE_DIR}/configure") - message(FATAL_ERROR "Can't find jemalloc sources. Please check them out to ${JEMALLOC_SOURCE_DIR} or modify JEMALLOC_SOURCE_DIR.") - endif () - set(jemalloc_configure_opts "CC=${CMAKE_C_COMPILER}" "--with-jemalloc-prefix=" "--with-private-namespace=tokudb_jemalloc_internal_" "--enable-cc-silence") - option(JEMALLOC_DEBUG "Build jemalloc with --enable-debug." OFF) - if (JEMALLOC_DEBUG) - list(APPEND jemalloc_configure_opts --enable-debug) - endif () - ExternalProject_Add(build_jemalloc - PREFIX jemalloc - SOURCE_DIR "${JEMALLOC_SOURCE_DIR}" - CONFIGURE_COMMAND - "${JEMALLOC_SOURCE_DIR}/configure" ${jemalloc_configure_opts} - "--prefix=${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc" - ) + if (EXISTS "${JEMALLOC_SOURCE_DIR}/configure") + set(jemalloc_configure_opts "CC=${CMAKE_C_COMPILER}" "--with-jemalloc-prefix=" "--with-private-namespace=tokudb_jemalloc_internal_" "--enable-cc-silence") + option(JEMALLOC_DEBUG "Build jemalloc with --enable-debug." 
OFF) + if (JEMALLOC_DEBUG) + list(APPEND jemalloc_configure_opts --enable-debug) + endif () + ExternalProject_Add(build_jemalloc + PREFIX jemalloc + SOURCE_DIR "${JEMALLOC_SOURCE_DIR}" + CONFIGURE_COMMAND + "${JEMALLOC_SOURCE_DIR}/configure" ${jemalloc_configure_opts} + "--prefix=${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc" + ) - add_library(jemalloc STATIC IMPORTED GLOBAL) - set_target_properties(jemalloc PROPERTIES IMPORTED_LOCATION - "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib/libjemalloc_pic.a") - add_dependencies(jemalloc build_jemalloc) - add_library(jemalloc_nopic STATIC IMPORTED GLOBAL) - set_target_properties(jemalloc_nopic PROPERTIES IMPORTED_LOCATION - "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib/libjemalloc.a") - add_dependencies(jemalloc_nopic build_jemalloc) + add_library(jemalloc STATIC IMPORTED GLOBAL) + set_target_properties(jemalloc PROPERTIES IMPORTED_LOCATION + "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib/libjemalloc_pic.a") + add_dependencies(jemalloc build_jemalloc) + add_library(jemalloc_nopic STATIC IMPORTED GLOBAL) + set_target_properties(jemalloc_nopic PROPERTIES IMPORTED_LOCATION + "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib/libjemalloc.a") + add_dependencies(jemalloc_nopic build_jemalloc) - # detect when we are being built as a subproject - if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) - install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib" DESTINATION . - COMPONENT tokukv_libs_extra) + # detect when we are being built as a subproject + if (NOT DEFINED MYSQL_PROJECT_NAME_DOCSTRING) + install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/jemalloc/lib" DESTINATION . 
+ COMPONENT tokukv_libs_extra) + endif () endif () endif () diff --git a/storage/tokudb/ft-index/ft/checkpoint.cc b/storage/tokudb/ft-index/ft/checkpoint.cc index 3d26c3a460e..bc4629a1d08 100644 --- a/storage/tokudb/ft-index/ft/checkpoint.cc +++ b/storage/tokudb/ft-index/ft/checkpoint.cc @@ -158,8 +158,8 @@ status_init(void) { STATUS_INIT(CP_TIME_LAST_CHECKPOINT_BEGIN, CHECKPOINT_LAST_BEGAN, UNIXTIME, "last checkpoint began ", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(CP_TIME_LAST_CHECKPOINT_BEGIN_COMPLETE, CHECKPOINT_LAST_COMPLETE_BEGAN, UNIXTIME, "last complete checkpoint began ", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(CP_TIME_LAST_CHECKPOINT_END, CHECKPOINT_LAST_COMPLETE_ENDED, UNIXTIME, "last complete checkpoint ended", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); - STATUS_INIT(CP_TIME_CHECKPOINT_DURATION, CHECKPOINT_DURATION, UNIXTIME, "time spent during checkpoint (begin and end phases)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); - STATUS_INIT(CP_TIME_CHECKPOINT_DURATION_LAST, CHECKPOINT_DURATION_LAST, UNIXTIME, "time spent during last checkpoint (begin and end phases)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(CP_TIME_CHECKPOINT_DURATION, CHECKPOINT_DURATION, UINT64, "time spent during checkpoint (begin and end phases)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(CP_TIME_CHECKPOINT_DURATION_LAST, CHECKPOINT_DURATION_LAST, UINT64, "time spent during last checkpoint (begin and end phases)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(CP_LAST_LSN, nullptr, UINT64, "last complete checkpoint LSN", TOKU_ENGINE_STATUS); STATUS_INIT(CP_CHECKPOINT_COUNT, CHECKPOINT_TAKEN, UINT64, "checkpoints taken ", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(CP_CHECKPOINT_COUNT_FAIL, CHECKPOINT_FAILED, UINT64, "checkpoints failed", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); @@ -381,8 +381,8 @@ toku_checkpoint(CHECKPOINTER cp, TOKULOGGER logger, STATUS_VALUE(CP_LONG_BEGIN_TIME) += duration; STATUS_VALUE(CP_LONG_BEGIN_COUNT) += 1; 
} - STATUS_VALUE(CP_TIME_CHECKPOINT_DURATION) += ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END)) - ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN)); - STATUS_VALUE(CP_TIME_CHECKPOINT_DURATION_LAST) = ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END)) - ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN)); + STATUS_VALUE(CP_TIME_CHECKPOINT_DURATION) += (uint64_t) ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END)) - ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN)); + STATUS_VALUE(CP_TIME_CHECKPOINT_DURATION_LAST) = (uint64_t) ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END)) - ((time_t) STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN)); STATUS_VALUE(CP_FOOTPRINT) = 0; checkpoint_safe_checkpoint_unlock(); diff --git a/storage/tokudb/ft-index/ft/ft-cachetable-wrappers.cc b/storage/tokudb/ft-index/ft/ft-cachetable-wrappers.cc index 1f3aa3e0baa..91a0040b02e 100644 --- a/storage/tokudb/ft-index/ft/ft-cachetable-wrappers.cc +++ b/storage/tokudb/ft-index/ft/ft-cachetable-wrappers.cc @@ -403,3 +403,25 @@ toku_unpin_ftnode_read_only(FT ft, FTNODE node) ); assert(r==0); } + +void toku_ftnode_swap_pair_values(FTNODE a, FTNODE b) +// Effect: Swap the blocknum, fullhash, and PAIR for for a and b +// Requires: Both nodes are pinned +{ + BLOCKNUM tmp_blocknum = a->thisnodename; + uint32_t tmp_fullhash = a->fullhash; + PAIR tmp_pair = a->ct_pair; + + a->thisnodename = b->thisnodename; + a->fullhash = b->fullhash; + a->ct_pair = b->ct_pair; + + b->thisnodename = tmp_blocknum; + b->fullhash = tmp_fullhash; + b->ct_pair = tmp_pair; + + // A and B swapped pair pointers, but we still have to swap + // the actual pair values (ie: the FTNODEs they represent) + // in the cachetable. 
+ toku_cachetable_swap_pair_values(a->ct_pair, b->ct_pair); +} diff --git a/storage/tokudb/ft-index/ft/ft-cachetable-wrappers.h b/storage/tokudb/ft-index/ft/ft-cachetable-wrappers.h index 9a56f4ff220..dc84d7f006b 100644 --- a/storage/tokudb/ft-index/ft/ft-cachetable-wrappers.h +++ b/storage/tokudb/ft-index/ft/ft-cachetable-wrappers.h @@ -190,4 +190,7 @@ int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pai void toku_unpin_ftnode(FT h, FTNODE node); void toku_unpin_ftnode_read_only(FT ft, FTNODE node); +// Effect: Swaps pair values of two pinned nodes +void toku_ftnode_swap_pair_values(FTNODE nodea, FTNODE nodeb); + #endif diff --git a/storage/tokudb/ft-index/ft/ft-flusher.cc b/storage/tokudb/ft-index/ft/ft-flusher.cc index 0fe556aec0f..dc4096a7993 100644 --- a/storage/tokudb/ft-index/ft/ft-flusher.cc +++ b/storage/tokudb/ft-index/ft/ft-flusher.cc @@ -565,6 +565,7 @@ static bool may_node_be_reactive(FT ft, FTNODE node) */ static void handle_split_of_child( + FT ft, FTNODE node, int childnum, FTNODE childa, @@ -607,8 +608,20 @@ handle_split_of_child( paranoid_invariant(BP_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child + // We never set the rightmost blocknum to be the root. + // Instead, we wait for the root to split and let promotion initialize the rightmost + // blocknum to be the first non-root leaf node on the right extreme to recieve an insert. + invariant(ft->h->root_blocknum.b != ft->rightmost_blocknum.b); + if (childa->thisnodename.b == ft->rightmost_blocknum.b) { + // The rightmost leaf (a) split into (a) and (b). We want (b) to swap pair values + // with (a), now that it is the new rightmost leaf. This keeps the rightmost blocknum + // constant, the same the way we keep the root blocknum constant. 
+ toku_ftnode_swap_pair_values(childa, childb); + BP_BLOCKNUM(node, childnum) = childa->thisnodename; + } + BP_BLOCKNUM(node, childnum+1) = childb->thisnodename; - BP_WORKDONE(node, childnum+1) = 0; + BP_WORKDONE(node, childnum+1) = 0; BP_STATE(node,childnum+1) = PT_AVAIL; NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl(); @@ -1071,7 +1084,7 @@ ft_split_child( ft_nonleaf_split(h, child, &nodea, &nodeb, &splitk, 2, dep_nodes); } // printf("%s:%d child did split\n", __FILE__, __LINE__); - handle_split_of_child (node, childnum, nodea, nodeb, &splitk); + handle_split_of_child (h, node, childnum, nodea, nodeb, &splitk); // for test call_flusher_thread_callback(flt_flush_during_split); @@ -1489,6 +1502,14 @@ ft_merge_child( &node->childkeys[childnuma+1], (node->n_children-childnumb)*sizeof(node->childkeys[0])); REALLOC_N(node->n_children-1, node->childkeys); + + // Handle a merge of the rightmost leaf node. + if (did_merge && childb->thisnodename.b == h->rightmost_blocknum.b) { + invariant(childb->thisnodename.b != h->h->root_blocknum.b); + toku_ftnode_swap_pair_values(childa, childb); + BP_BLOCKNUM(node, childnuma) = childa->thisnodename; + } + paranoid_invariant(BP_BLOCKNUM(node, childnuma).b == childa->thisnodename.b); childa->dirty = 1; // just to make sure childb->dirty = 1; // just to make sure diff --git a/storage/tokudb/ft-index/ft/ft-internal.h b/storage/tokudb/ft-index/ft/ft-internal.h index 42d27638330..378e8921328 100644 --- a/storage/tokudb/ft-index/ft/ft-internal.h +++ b/storage/tokudb/ft-index/ft/ft-internal.h @@ -123,6 +123,10 @@ enum { FT_DEFAULT_FANOUT = 16 }; enum { FT_DEFAULT_NODE_SIZE = 4 * 1024 * 1024 }; enum { FT_DEFAULT_BASEMENT_NODE_SIZE = 128 * 1024 }; +// We optimize for a sequential insert pattern if 100 consecutive injections +// happen into the rightmost leaf node due to promotion. 
+enum { FT_SEQINSERT_SCORE_THRESHOLD = 100 }; + // // Field in ftnode_fetch_extra that tells the // partial fetch callback what piece of the node @@ -572,6 +576,22 @@ struct ft { // is this ft a blackhole? if so, all messages are dropped. bool blackhole; + + // The blocknum of the rightmost leaf node in the tree. Stays constant through splits + // and merges using pair-swapping (like the root node, see toku_ftnode_swap_pair_values()) + // + // This field only transitions from RESERVED_BLOCKNUM_NULL to non-null, never back. + // We initialize it when promotion inserts into a non-root leaf node on the right extreme. + // We use the blocktable lock to protect the initialize transition, though it's not really + // necessary since all threads should be setting it to the same value. We maintain that invariant + // on first initialization, see ft_set_or_verify_rightmost_blocknum() + BLOCKNUM rightmost_blocknum; + + // sequential access pattern heuristic + // - when promotion pushes a message directly into the rightmost leaf, the score goes up. + // - if the score is high enough, we optimistically attempt to insert directly into the rightmost leaf + // - if our attempt fails because the key was not in range of the rightmost leaf, we reset the score back to 0 + uint32_t seqinsert_score; }; // Allocate a DB struct off the stack and only set its comparison @@ -1037,7 +1057,7 @@ toku_get_node_for_verify( int toku_verify_ftnode (FT_HANDLE ft_h, - MSN rootmsn, MSN parentmsn, bool messages_exist_above, + MSN rootmsn, MSN parentmsn_with_messages, bool messages_exist_above, FTNODE node, int height, const DBT *lesser_pivot, // Everything in the subtree should be > lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.) const DBT *greatereq_pivot, // Everything in the subtree should be <= lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.) 
@@ -1186,6 +1206,9 @@ typedef enum { FT_PRO_NUM_DIDNT_WANT_PROMOTE, FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, // how many basement nodes were deserialized with a fixed keysize FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, // how many basement nodes were deserialized with a variable keysize + FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS, + FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS, + FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE, FT_STATUS_NUM_ROWS } ft_status_entry; diff --git a/storage/tokudb/ft-index/ft/ft-ops.cc b/storage/tokudb/ft-index/ft/ft-ops.cc index 64b6b498c9a..f9701ec34b1 100644 --- a/storage/tokudb/ft-index/ft/ft-ops.cc +++ b/storage/tokudb/ft-index/ft/ft-ops.cc @@ -367,6 +367,9 @@ status_init(void) STATUS_INIT(FT_PRO_NUM_DIDNT_WANT_PROMOTE, PROMOTION_STOPPED_AFTER_LOCKING_CHILD, PARCOUNT, "promotion: stopped anyway, after locking the child", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, BASEMENT_DESERIALIZATION_FIXED_KEY, PARCOUNT, "basement nodes deserialized with fixed-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, BASEMENT_DESERIALIZATION_VARIABLE_KEY, PARCOUNT, "basement nodes deserialized with variable-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); + STATUS_INIT(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS, nullptr, PARCOUNT, "promotion: succeeded in using the rightmost leaf shortcut", TOKU_ENGINE_STATUS); + STATUS_INIT(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS, nullptr, PARCOUNT, "promotion: tried the rightmost leaf shorcut but failed (out-of-bounds)", TOKU_ENGINE_STATUS); + STATUS_INIT(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE,nullptr, PARCOUNT, "promotion: tried the rightmost leaf shorcut but failed (child reactive)", TOKU_ENGINE_STATUS); ft_status.initialized = true; } @@ -890,6 +893,11 @@ void toku_ftnode_clone_callback( for (int i = 0; i < node->n_children-1; i++) { toku_clone_dbt(&cloned_node->childkeys[i], node->childkeys[i]); } + if (node->height > 0) { + 
// need to move messages here so that we don't serialize stale + // messages to the fresh tree - ft verify code complains otherwise. + toku_move_ftnode_messages_to_stale(ft, node); + } // clone partition ftnode_clone_partitions(node, cloned_node); @@ -932,11 +940,14 @@ void toku_ftnode_flush_callback( int height = ftnode->height; if (write_me) { toku_assert_entire_node_in_memory(ftnode); - if (height == 0) { + if (height > 0 && !is_clone) { + // cloned nodes already had their stale messages moved, see toku_ftnode_clone_callback() + toku_move_ftnode_messages_to_stale(h, ftnode); + } else if (height == 0) { ft_leaf_run_gc(h, ftnode); - } - if (height == 0 && !is_clone) { - ftnode_update_disk_stats(ftnode, h, for_checkpoint); + if (!is_clone) { + ftnode_update_disk_stats(ftnode, h, for_checkpoint); + } } int r = toku_serialize_ftnode_to(fd, ftnode->thisnodename, ftnode, ndd, !is_clone, h, for_checkpoint); assert_zero(r); @@ -1079,9 +1090,10 @@ exit: return; } +static void ft_bnc_move_messages_to_stale(FT ft, NONLEAF_CHILDINFO bnc); + // replace the child buffer with a compressed version of itself. 
-// @return the old child buffer -static NONLEAF_CHILDINFO +static void compress_internal_node_partition(FTNODE node, int i, enum toku_compression_method compression_method) { // if we should evict, compress the @@ -1092,11 +1104,9 @@ compress_internal_node_partition(FTNODE node, int i, enum toku_compression_metho sub_block_init(sb); toku_create_compressed_partition_from_available(node, i, compression_method, sb); - // now set the state to compressed and return the old, available partition - NONLEAF_CHILDINFO bnc = BNC(node, i); + // now set the state to compressed set_BSB(node, i, sb); BP_STATE(node,i) = PT_COMPRESSED; - return bnc; } void toku_evict_bn_from_memory(FTNODE node, int childnum, FT h) { @@ -1149,18 +1159,27 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext for (int i = 0; i < node->n_children; i++) { if (BP_STATE(node,i) == PT_AVAIL) { if (BP_SHOULD_EVICT(node,i)) { - NONLEAF_CHILDINFO bnc; - if (ft_compress_buffers_before_eviction) { - // When partially evicting, always compress with quicklz - bnc = compress_internal_node_partition( + NONLEAF_CHILDINFO bnc = BNC(node, i); + if (ft_compress_buffers_before_eviction && + // We may not serialize and compress a partition in memory if its + // in memory layout version is different than what's on disk (and + // therefore requires upgrade). + // + // Auto-upgrade code assumes that if a node's layout version read + // from disk is not current, it MUST require upgrade. Breaking + // this rule would cause upgrade code to upgrade this partition + // again after we serialize it as the current version, which is bad. + node->layout_version == node->layout_version_read_from_disk) { + ft_bnc_move_messages_to_stale(ft, bnc); + compress_internal_node_partition( node, i, + // Always compress with quicklz TOKU_QUICKLZ_METHOD ); } else { // We're not compressing buffers before eviction. Simply // detach the buffer and set the child's state to on-disk. 
- bnc = BNC(node, i); set_BNULL(node, i); BP_STATE(node, i) = PT_ON_DISK; } @@ -1626,12 +1645,10 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp) BLOCKNUM old_blocknum = oldroot->thisnodename; uint32_t old_fullhash = oldroot->fullhash; - PAIR old_pair = oldroot->ct_pair; int new_height = oldroot->height+1; uint32_t new_fullhash; BLOCKNUM new_blocknum; - PAIR new_pair = NULL; cachetable_put_empty_node_with_dep_nodes( ft, @@ -1641,7 +1658,6 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp) &new_fullhash, &newroot ); - new_pair = newroot->ct_pair; assert(newroot); assert(new_height > 0); @@ -1653,22 +1669,18 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp) ft->h->layout_version, ft->h->flags ); + newroot->fullhash = new_fullhash; MSN msna = oldroot->max_msn_applied_to_node_on_disk; newroot->max_msn_applied_to_node_on_disk = msna; BP_STATE(newroot,0) = PT_AVAIL; newroot->dirty = 1; - // now do the "switcheroo" - BP_BLOCKNUM(newroot,0) = new_blocknum; - newroot->thisnodename = old_blocknum; - newroot->fullhash = old_fullhash; - newroot->ct_pair = old_pair; - - oldroot->thisnodename = new_blocknum; - oldroot->fullhash = new_fullhash; - oldroot->ct_pair = new_pair; - - toku_cachetable_swap_pair_values(old_pair, new_pair); + // Set the first child to have the new blocknum, + // and then swap newroot with oldroot. The new root + // will inherit the hash/blocknum/pair from oldroot, + // keeping the root blocknum constant. 
+ BP_BLOCKNUM(newroot, 0) = new_blocknum; + toku_ftnode_swap_pair_values(newroot, oldroot); toku_ft_split_child( ft, @@ -2757,6 +2769,16 @@ static void inject_message_in_locked_node( // verify that msn of latest message was captured in root node paranoid_invariant(msg->msn.msn == node->max_msn_applied_to_node_on_disk.msn); + if (node->thisnodename.b == ft->rightmost_blocknum.b) { + if (ft->seqinsert_score < FT_SEQINSERT_SCORE_THRESHOLD) { + // we promoted to the rightmost leaf node and the seqinsert score has not yet saturated. + toku_sync_fetch_and_add(&ft->seqinsert_score, 1); + } + } else if (ft->seqinsert_score != 0) { + // we promoted to something other than the rightmost leaf node and the score should reset + ft->seqinsert_score = 0; + } + // if we call toku_ft_flush_some_child, then that function unpins the root // otherwise, we unpin ourselves if (node->height > 0 && toku_ft_nonleaf_is_gorged(node, ft->h->nodesize)) { @@ -2913,6 +2935,21 @@ static inline bool should_inject_in_node(seqinsert_loc loc, int height, int dept return (height == 0 || (loc == NEITHER_EXTREME && (height <= 1 || depth >= 2))); } +static void ft_set_or_verify_rightmost_blocknum(FT ft, BLOCKNUM b) +// Given: 'b', the _definitive_ and constant rightmost blocknum of 'ft' +{ + if (ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL) { + toku_ft_lock(ft); + if (ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL) { + ft->rightmost_blocknum = b; + } + toku_ft_unlock(ft); + } + // The rightmost blocknum only transitions from RESERVED_BLOCKNUM_NULL to non-null. + // If it's already set, verify that the stored value is consistent with 'b' + invariant(ft->rightmost_blocknum.b == b.b); +} + static void push_something_in_subtree( FT ft, FTNODE subtree_root, @@ -2960,6 +2997,14 @@ static void push_something_in_subtree( default: STATUS_INC(FT_PRO_NUM_INJECT_DEPTH_GT3, 1); break; } + // If the target node is a non-root leaf node on the right extreme, + // set the rightmost blocknum. 
We know there are no messages above us + // because promotion would not chose to inject directly into this leaf + // otherwise. We explicitly skip the root node because then we don't have + // to worry about changing the rightmost blocknum when the root splits. + if (subtree_root->height == 0 && loc == RIGHT_EXTREME && subtree_root->thisnodename.b != ft->h->root_blocknum.b) { + ft_set_or_verify_rightmost_blocknum(ft, subtree_root->thisnodename); + } inject_message_in_locked_node(ft, subtree_root, target_childnum, msg, flow_deltas, gc_info); } else { int r; @@ -3230,7 +3275,260 @@ void toku_ft_root_put_msg( } } -// Effect: Insert the key-val pair into ft. +static int ft_compare_keys(FT ft, const DBT *a, const DBT *b) +// Effect: Compare two keys using the given fractal tree's comparator/descriptor +{ + FAKE_DB(db, &ft->cmp_descriptor); + return ft->compare_fun(&db, a, b); +} + +static LEAFENTRY bn_get_le_and_key(BASEMENTNODE bn, int idx, DBT *key) +// Effect: Gets the i'th leafentry from the given basement node and +// fill its key in *key +// Requires: The i'th leafentry exists. +{ + LEAFENTRY le; + uint32_t le_len; + void *le_key; + int r = bn->data_buffer.fetch_klpair(idx, &le, &le_len, &le_key); + invariant_zero(r); + toku_fill_dbt(key, le_key, le_len); + return le; +} + +static LEAFENTRY ft_leaf_leftmost_le_and_key(FTNODE leaf, DBT *leftmost_key) +// Effect: If a leftmost key exists in the given leaf, toku_fill_dbt() +// the key into *leftmost_key +// Requires: Leaf is fully in memory and pinned for read or write. 
+// Return: leafentry if it exists, nullptr otherwise +{ + for (int i = 0; i < leaf->n_children; i++) { + BASEMENTNODE bn = BLB(leaf, i); + if (bn->data_buffer.num_klpairs() > 0) { + // Get the first (leftmost) leafentry and its key + return bn_get_le_and_key(bn, 0, leftmost_key); + } + } + return nullptr; +} + +static LEAFENTRY ft_leaf_rightmost_le_and_key(FTNODE leaf, DBT *rightmost_key) +// Effect: If a rightmost key exists in the given leaf, toku_fill_dbt() +// the key into *rightmost_key +// Requires: Leaf is fully in memory and pinned for read or write. +// Return: leafentry if it exists, nullptr otherwise +{ + for (int i = leaf->n_children - 1; i >= 0; i--) { + BASEMENTNODE bn = BLB(leaf, i); + size_t num_les = bn->data_buffer.num_klpairs(); + if (num_les > 0) { + // Get the last (rightmost) leafentry and its key + return bn_get_le_and_key(bn, num_les - 1, rightmost_key); + } + } + return nullptr; +} + +static int ft_leaf_get_relative_key_pos(FT ft, FTNODE leaf, const DBT *key, bool *nondeleted_key_found, int *target_childnum) +// Effect: Determines what the relative position of the given key is with +// respect to a leaf node, and if it exists. +// Requires: Leaf is fully in memory and pinned for read or write. +// Requires: target_childnum is non-null +// Return: < 0 if key is less than the leftmost key in the leaf OR the relative position is unknown, for any reason. +// 0 if key is in the bounds [leftmost_key, rightmost_key] for this leaf or the leaf is empty +// > 0 if key is greater than the rightmost key in the leaf +// *nondeleted_key_found is set (if non-null) if the target key was found and is not deleted, unmodified otherwise +// *target_childnum is set to the child that (does or would) contain the key, if calculated, unmodified otherwise +{ + DBT rightmost_key; + LEAFENTRY rightmost_le = ft_leaf_rightmost_le_and_key(leaf, &rightmost_key); + if (rightmost_le == nullptr) { + // If we can't get a rightmost key then the leaf is empty. 
+ // In such a case, we don't have any information about what keys would be in this leaf. + // We have to assume the leaf node that would contain this key is to the left. + return -1; + } + // We have a rightmost leafentry, so it must exist in some child node + invariant(leaf->n_children > 0); + + int relative_pos = 0; + int c = ft_compare_keys(ft, key, &rightmost_key); + if (c > 0) { + relative_pos = 1; + *target_childnum = leaf->n_children - 1; + } else if (c == 0) { + if (nondeleted_key_found != nullptr && !le_latest_is_del(rightmost_le)) { + *nondeleted_key_found = true; + } + relative_pos = 0; + *target_childnum = leaf->n_children - 1; + } else { + // The key is less than the rightmost. It may still be in bounds if it's >= the leftmost. + DBT leftmost_key; + LEAFENTRY leftmost_le = ft_leaf_leftmost_le_and_key(leaf, &leftmost_key); + invariant_notnull(leftmost_le); // Must exist because a rightmost exists + c = ft_compare_keys(ft, key, &leftmost_key); + if (c > 0) { + if (nondeleted_key_found != nullptr) { + // The caller wants to know if a nondeleted key can be found. 
+ LEAFENTRY target_le; + int childnum = toku_ftnode_which_child(leaf, key, &ft->cmp_descriptor, ft->compare_fun); + BASEMENTNODE bn = BLB(leaf, childnum); + struct msg_leafval_heaviside_extra extra = { ft->compare_fun, &ft->cmp_descriptor, key }; + int r = bn->data_buffer.find_zero( + extra, + &target_le, + nullptr, nullptr, nullptr + ); + *target_childnum = childnum; + if (r == 0 && !le_latest_is_del(leftmost_le)) { + *nondeleted_key_found = true; + } + } + relative_pos = 0; + } else if (c == 0) { + if (nondeleted_key_found != nullptr && !le_latest_is_del(leftmost_le)) { + *nondeleted_key_found = true; + } + relative_pos = 0; + *target_childnum = 0; + } else { + relative_pos = -1; + } + } + + return relative_pos; +} + +static void ft_insert_directly_into_leaf(FT ft, FTNODE leaf, int target_childnum, DBT *key, DBT *val, + XIDS message_xids, enum ft_msg_type type, txn_gc_info *gc_info); +static int getf_nothing(ITEMLEN, bytevec, ITEMLEN, bytevec, void *, bool); + +static int ft_maybe_insert_into_rightmost_leaf(FT ft, DBT *key, DBT *val, XIDS message_xids, enum ft_msg_type type, + txn_gc_info *gc_info, bool unique) +// Effect: Pins the rightmost leaf node and attempts to do an insert. +// There are three reasons why we may not succeed. +// - The rightmost leaf is too full and needs a split. +// - The key to insert is not within the provable bounds of this leaf node. +// - The key is within bounds, but it already exists. +// Return: 0 if this function did insert, DB_KEYEXIST if a unique key constraint exists and +// some nondeleted leafentry with the same key exists +// < 0 if this function did not insert, for a reason other than DB_KEYEXIST. +// Note: Treat this function as a possible, but not necessary, optimization for insert. +// Rationale: We want O(1) insertions down the rightmost path of the tree. 
+{ + int r = -1; + + uint32_t rightmost_fullhash; + BLOCKNUM rightmost_blocknum = ft->rightmost_blocknum; + FTNODE rightmost_leaf = nullptr; + + // Don't do the optimization if our heurstic suggests that + // insertion pattern is not sequential. + if (ft->seqinsert_score < FT_SEQINSERT_SCORE_THRESHOLD) { + goto cleanup; + } + + // We know the seqinsert score is high enough that we should + // attemp to directly insert into the right most leaf. Because + // the score is non-zero, the rightmost blocknum must have been + // set. See inject_message_in_locked_node(), which only increases + // the score if the target node blocknum == rightmost_blocknum + invariant(rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL); + + // Pin the rightmost leaf with a write lock. + rightmost_fullhash = toku_cachetable_hash(ft->cf, rightmost_blocknum); + struct ftnode_fetch_extra bfe; + fill_bfe_for_full_read(&bfe, ft); + toku_pin_ftnode(ft, rightmost_blocknum, rightmost_fullhash, &bfe, PL_WRITE_CHEAP, &rightmost_leaf, true); + + // The rightmost blocknum never chances once it is initialized to something + // other than null. Verify that the pinned node has the correct blocknum. + invariant(rightmost_leaf->thisnodename.b == rightmost_blocknum.b); + + // If the rightmost leaf is reactive, bail out out and let the normal promotion pass + // take care of it. This also ensures that if any of our ancestors are reactive, + // they'll be taken care of too. + if (get_leaf_reactivity(rightmost_leaf, ft->h->nodesize) != RE_STABLE) { + STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE, 1); + goto cleanup; + } + + // The groundwork has been laid for an insertion directly into the rightmost + // leaf node. We know that it is pinned for write, fully in memory, has + // no messages above it, and is not reactive. + // + // Now, two more things must be true for this insertion to actually happen: + // 1. The key to insert is within the bounds of this leafnode, or to the right. + // 2. 
If there is a uniqueness constraint, it passes. + bool nondeleted_key_found; + int relative_pos; + int target_childnum; + + nondeleted_key_found = false; + target_childnum = -1; + relative_pos = ft_leaf_get_relative_key_pos(ft, rightmost_leaf, key, + unique ? &nondeleted_key_found : nullptr, + &target_childnum); + if (relative_pos >= 0) { + STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS, 1); + if (unique && nondeleted_key_found) { + r = DB_KEYEXIST; + } else { + ft_insert_directly_into_leaf(ft, rightmost_leaf, target_childnum, + key, val, message_xids, type, gc_info); + r = 0; + } + } else { + STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS, 1); + r = -1; + } + +cleanup: + // If we did the insert, the rightmost leaf was unpinned for us. + if (r != 0 && rightmost_leaf != nullptr) { + toku_unpin_ftnode(ft, rightmost_leaf); + } + + return r; +} + +static void ft_txn_log_insert(FT ft, DBT *key, DBT *val, TOKUTXN txn, bool do_logging, enum ft_msg_type type); + +int toku_ft_insert_unique(FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool do_logging) { +// Effect: Insert a unique key-val pair into the fractal tree. +// Return: 0 on success, DB_KEYEXIST if the overwrite constraint failed + XIDS message_xids = txn != nullptr ? toku_txn_get_xids(txn) : xids_get_root_xids(); + + TXN_MANAGER txn_manager = toku_ft_get_txn_manager(ft_h); + txn_manager_state txn_state_for_gc(txn_manager); + + TXNID oldest_referenced_xid_estimate = toku_ft_get_oldest_referenced_xid_estimate(ft_h); + txn_gc_info gc_info(&txn_state_for_gc, + oldest_referenced_xid_estimate, + // no messages above us, we can implicitly promote uxrs based on this xid + oldest_referenced_xid_estimate, + true); + int r = ft_maybe_insert_into_rightmost_leaf(ft_h->ft, key, val, message_xids, FT_INSERT, &gc_info, true); + if (r != 0 && r != DB_KEYEXIST) { + // Default to a regular unique check + insert algorithm if we couldn't + // do it based on the rightmost leaf alone. 
+ int lookup_r = toku_ft_lookup(ft_h, key, getf_nothing, nullptr); + if (lookup_r == DB_NOTFOUND) { + toku_ft_send_insert(ft_h, key, val, message_xids, FT_INSERT, &gc_info); + r = 0; + } else { + r = DB_KEYEXIST; + } + } + + if (r == 0) { + ft_txn_log_insert(ft_h->ft, key, val, txn, do_logging, FT_INSERT); + } + return r; +} + +// Effect: Insert the key-val pair into an ft. void toku_ft_insert (FT_HANDLE ft_handle, DBT *key, DBT *val, TOKUTXN txn) { toku_ft_maybe_insert(ft_handle, key, val, txn, false, ZERO_LSN, true, FT_INSERT); } @@ -3356,32 +3654,38 @@ TXNID toku_ft_get_oldest_referenced_xid_estimate(FT_HANDLE ft_h) { return txn_manager != nullptr ? toku_txn_manager_get_oldest_referenced_xid_estimate(txn_manager) : TXNID_NONE; } -void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, enum ft_msg_type type) { - paranoid_invariant(type==FT_INSERT || type==FT_INSERT_NO_OVERWRITE); - XIDS message_xids = xids_get_root_xids(); //By default use committed messages +static void ft_txn_log_insert(FT ft, DBT *key, DBT *val, TOKUTXN txn, bool do_logging, enum ft_msg_type type) { + paranoid_invariant(type == FT_INSERT || type == FT_INSERT_NO_OVERWRITE); + + //By default use committed messages TXNID_PAIR xid = toku_txn_get_txnid(txn); if (txn) { BYTESTRING keybs = {key->size, (char *) key->data}; - toku_logger_save_rollback_cmdinsert(txn, toku_cachefile_filenum(ft_h->ft->cf), &keybs); - toku_txn_maybe_note_ft(txn, ft_h->ft); - message_xids = toku_txn_get_xids(txn); + toku_logger_save_rollback_cmdinsert(txn, toku_cachefile_filenum(ft->cf), &keybs); + toku_txn_maybe_note_ft(txn, ft); } TOKULOGGER logger = toku_txn_logger(txn); if (do_logging && logger) { BYTESTRING keybs = {.len=key->size, .data=(char *) key->data}; BYTESTRING valbs = {.len=val->size, .data=(char *) val->data}; if (type == FT_INSERT) { - toku_log_enq_insert(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft_h->ft->cf), xid, keybs, valbs); + 
toku_log_enq_insert(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft->cf), xid, keybs, valbs); } else { - toku_log_enq_insert_no_overwrite(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft_h->ft->cf), xid, keybs, valbs); + toku_log_enq_insert_no_overwrite(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft->cf), xid, keybs, valbs); } } +} + +void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, enum ft_msg_type type) { + ft_txn_log_insert(ft_h->ft, key, val, txn, do_logging, type); LSN treelsn; if (oplsn_valid && oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) { // do nothing } else { + XIDS message_xids = txn ? toku_txn_get_xids(txn) : xids_get_root_xids(); + TXN_MANAGER txn_manager = toku_ft_get_txn_manager(ft_h); txn_manager_state txn_state_for_gc(txn_manager); @@ -3391,10 +3695,26 @@ void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool // no messages above us, we can implicitly promote uxrs based on this xid oldest_referenced_xid_estimate, txn != nullptr ? !txn->for_recovery : false); - toku_ft_send_insert(ft_h, key, val, message_xids, type, &gc_info); + int r = ft_maybe_insert_into_rightmost_leaf(ft_h->ft, key, val, message_xids, FT_INSERT, &gc_info, false); + if (r != 0) { + toku_ft_send_insert(ft_h, key, val, message_xids, type, &gc_info); + } } } +static void ft_insert_directly_into_leaf(FT ft, FTNODE leaf, int target_childnum, DBT *key, DBT *val, + XIDS message_xids, enum ft_msg_type type, txn_gc_info *gc_info) +// Effect: Insert directly into a leaf node a fractal tree. Does not do any logging. +// Requires: Leaf is fully in memory and pinned for write. +// Requires: If this insertion were to happen through the root node, the promotion +// algorithm would have selected the given leaf node as the point of injection. +// That means this function relies on the current implementation of promotion. 
+{ + FT_MSG_S ftcmd = { type, ZERO_MSN, message_xids, .u = { .id = { key, val } } }; + size_t flow_deltas[] = { 0, 0 }; + inject_message_in_locked_node(ft, leaf, target_childnum, &ftcmd, flow_deltas, gc_info); +} + static void ft_send_update_msg(FT_HANDLE ft_h, FT_MSG_S *msg, TOKUTXN txn) { msg->xids = (txn @@ -4894,6 +5214,13 @@ int copy_to_stale(const int32_t &offset, const uint32_t UU(idx), struct copy_to_ return 0; } +static void ft_bnc_move_messages_to_stale(FT ft, NONLEAF_CHILDINFO bnc) { + struct copy_to_stale_extra cts_extra = { .ft = ft, .bnc = bnc }; + int r = bnc->fresh_message_tree.iterate_over_marked(&cts_extra); + invariant_zero(r); + bnc->fresh_message_tree.delete_all_marked(); +} + __attribute__((nonnull)) void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node) { @@ -4906,10 +5233,7 @@ toku_move_ftnode_messages_to_stale(FT ft, FTNODE node) { // We can't delete things out of the fresh tree inside the above // procedures because we're still looking at the fresh tree. Instead // we have to move messages after we're done looking at it. 
- struct copy_to_stale_extra cts_extra = { .ft = ft, .bnc = bnc }; - int r = bnc->fresh_message_tree.iterate_over_marked(&cts_extra); - invariant_zero(r); - bnc->fresh_message_tree.delete_all_marked(); + ft_bnc_move_messages_to_stale(ft, bnc); } } diff --git a/storage/tokudb/ft-index/ft/ft-ops.h b/storage/tokudb/ft-index/ft/ft-ops.h index b482d2b8206..cfa6ba20f6f 100644 --- a/storage/tokudb/ft-index/ft/ft-ops.h +++ b/storage/tokudb/ft-index/ft/ft-ops.h @@ -213,6 +213,9 @@ int toku_ft_lookup (FT_HANDLE ft_h, DBT *k, FT_GET_CALLBACK_FUNCTION getf, void // Effect: Insert a key and data pair into an ft void toku_ft_insert (FT_HANDLE ft_h, DBT *k, DBT *v, TOKUTXN txn); +// Returns: 0 if the key was inserted, DB_KEYEXIST if the key already exists +int toku_ft_insert_unique(FT_HANDLE ft, DBT *k, DBT *v, TOKUTXN txn, bool do_logging); + // Effect: Optimize the ft void toku_ft_optimize (FT_HANDLE ft_h); diff --git a/storage/tokudb/ft-index/ft/ft-serialize.cc b/storage/tokudb/ft-index/ft/ft-serialize.cc index 4a4817e7f6c..1879561f20a 100644 --- a/storage/tokudb/ft-index/ft/ft-serialize.cc +++ b/storage/tokudb/ft-index/ft/ft-serialize.cc @@ -462,6 +462,7 @@ serialize_ft_min_size (uint32_t version) { size_t size = 0; switch(version) { + case FT_LAYOUT_VERSION_27: case FT_LAYOUT_VERSION_26: case FT_LAYOUT_VERSION_25: case FT_LAYOUT_VERSION_24: diff --git a/storage/tokudb/ft-index/ft/ft-verify.cc b/storage/tokudb/ft-index/ft/ft-verify.cc index 506a54a07a0..7e8d241cce2 100644 --- a/storage/tokudb/ft-index/ft/ft-verify.cc +++ b/storage/tokudb/ft-index/ft/ft-verify.cc @@ -310,7 +310,7 @@ toku_get_node_for_verify( static int toku_verify_ftnode_internal(FT_HANDLE ft_handle, - MSN rootmsn, MSN parentmsn, bool messages_exist_above, + MSN rootmsn, MSN parentmsn_with_messages, bool messages_exist_above, FTNODE node, int height, const DBT *lesser_pivot, // Everything in the subtree should be > lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.) 
const DBT *greatereq_pivot, // Everything in the subtree should be <= lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.) @@ -328,7 +328,7 @@ toku_verify_ftnode_internal(FT_HANDLE ft_handle, invariant(height == node->height); // this is a bad failure if wrong } if (node->height > 0 && messages_exist_above) { - VERIFY_ASSERTION((parentmsn.msn >= this_msn.msn), 0, "node msn must be descending down tree, newest messages at top"); + VERIFY_ASSERTION((parentmsn_with_messages.msn >= this_msn.msn), 0, "node msn must be descending down tree, newest messages at top"); } // Verify that all the pivot keys are in order. for (int i = 0; i < node->n_children-2; i++) { @@ -450,7 +450,7 @@ done: // input is a pinned node, on exit, node is unpinned int toku_verify_ftnode (FT_HANDLE ft_handle, - MSN rootmsn, MSN parentmsn, bool messages_exist_above, + MSN rootmsn, MSN parentmsn_with_messages, bool messages_exist_above, FTNODE node, int height, const DBT *lesser_pivot, // Everything in the subtree should be > lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.) const DBT *greatereq_pivot, // Everything in the subtree should be <= lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.) 
@@ -469,7 +469,7 @@ toku_verify_ftnode (FT_HANDLE ft_handle, // Otherwise we'll just do the next call result = toku_verify_ftnode_internal( - ft_handle, rootmsn, parentmsn, messages_exist_above, node, height, lesser_pivot, greatereq_pivot, + ft_handle, rootmsn, parentmsn_with_messages, messages_exist_above, node, height, lesser_pivot, greatereq_pivot, verbose, keep_going_on_failure, false); if (result != 0 && (!keep_going_on_failure || result != TOKUDB_NEEDS_REPAIR)) goto done; } @@ -477,7 +477,7 @@ toku_verify_ftnode (FT_HANDLE ft_handle, toku_move_ftnode_messages_to_stale(ft_handle->ft, node); } result2 = toku_verify_ftnode_internal( - ft_handle, rootmsn, parentmsn, messages_exist_above, node, height, lesser_pivot, greatereq_pivot, + ft_handle, rootmsn, parentmsn_with_messages, messages_exist_above, node, height, lesser_pivot, greatereq_pivot, verbose, keep_going_on_failure, true); if (result == 0) { result = result2; @@ -489,12 +489,16 @@ toku_verify_ftnode (FT_HANDLE ft_handle, for (int i = 0; i < node->n_children; i++) { FTNODE child_node; toku_get_node_for_verify(BP_BLOCKNUM(node, i), ft_handle, &child_node); - int r = toku_verify_ftnode(ft_handle, rootmsn, this_msn, messages_exist_above || toku_bnc_n_entries(BNC(node, i)) > 0, - child_node, node->height-1, - (i==0) ? lesser_pivot : &node->childkeys[i-1], - (i==node->n_children-1) ? greatereq_pivot : &node->childkeys[i], - progress_callback, progress_extra, - recurse, verbose, keep_going_on_failure); + int r = toku_verify_ftnode(ft_handle, rootmsn, + (toku_bnc_n_entries(BNC(node, i)) > 0 + ? this_msn + : parentmsn_with_messages), + messages_exist_above || toku_bnc_n_entries(BNC(node, i)) > 0, + child_node, node->height-1, + (i==0) ? lesser_pivot : &node->childkeys[i-1], + (i==node->n_children-1) ? 
greatereq_pivot : &node->childkeys[i], + progress_callback, progress_extra, + recurse, verbose, keep_going_on_failure); if (r) { result = r; if (!keep_going_on_failure || result != TOKUDB_NEEDS_REPAIR) goto done; diff --git a/storage/tokudb/ft-index/ft/ft_layout_version.h b/storage/tokudb/ft-index/ft/ft_layout_version.h index e9c6a68328b..01c7363e98d 100644 --- a/storage/tokudb/ft-index/ft/ft_layout_version.h +++ b/storage/tokudb/ft-index/ft/ft_layout_version.h @@ -120,6 +120,7 @@ enum ft_layout_version_e { FT_LAYOUT_VERSION_24 = 24, // Riddler: change logentries that log transactions to store TXNID_PAIRs instead of TXNIDs FT_LAYOUT_VERSION_25 = 25, // SecretSquirrel: ROLLBACK_LOG_NODES (on disk and in memory) now just use blocknum (instead of blocknum + hash) to point to other log nodes. same for xstillopen log entry FT_LAYOUT_VERSION_26 = 26, // Hojo: basements store key/vals separately on disk for fixed klpair length BNs + FT_LAYOUT_VERSION_27 = 27, // serialize message trees with nonleaf buffers to avoid key, msn sort on deserialize FT_NEXT_VERSION, // the version after the current version FT_LAYOUT_VERSION = FT_NEXT_VERSION-1, // A hack so I don't have to change this line. 
FT_LAYOUT_MIN_SUPPORTED_VERSION = FT_LAYOUT_VERSION_13, // Minimum version supported diff --git a/storage/tokudb/ft-index/ft/ft_node-serialize.cc b/storage/tokudb/ft-index/ft/ft_node-serialize.cc index fcb38f11834..91ea0890c30 100644 --- a/storage/tokudb/ft-index/ft/ft_node-serialize.cc +++ b/storage/tokudb/ft-index/ft/ft_node-serialize.cc @@ -291,8 +291,13 @@ serialize_ftnode_partition_size (FTNODE node, int i) paranoid_invariant(node->bp[i].state == PT_AVAIL); result++; // Byte that states what the partition is if (node->height > 0) { - result += 4; // size of bytes in buffer table - result += toku_bnc_nbytesinbuf(BNC(node, i)); + NONLEAF_CHILDINFO bnc = BNC(node, i); + // number of messages (4 bytes) plus size of the buffer + result += (4 + toku_bnc_nbytesinbuf(bnc)); + // number of offsets (4 bytes) plus an array of 4 byte offsets, for each message tree + result += (4 + (4 * bnc->fresh_message_tree.size())); + result += (4 + (4 * bnc->stale_message_tree.size())); + result += (4 + (4 * bnc->broadcast_list.size())); } else { result += 4 + bn_data::HEADER_LENGTH; // n_entries in buffer table + basement header @@ -305,8 +310,35 @@ serialize_ftnode_partition_size (FTNODE node, int i) #define FTNODE_PARTITION_DMT_LEAVES 0xaa #define FTNODE_PARTITION_FIFO_MSG 0xbb +UU() static int +assert_fresh(const int32_t &offset, const uint32_t UU(idx), struct fifo *const f) { + struct fifo_entry *entry = toku_fifo_get_entry(f, offset); + assert(entry->is_fresh); + return 0; +} + +UU() static int +assert_stale(const int32_t &offset, const uint32_t UU(idx), struct fifo *const f) { + struct fifo_entry *entry = toku_fifo_get_entry(f, offset); + assert(!entry->is_fresh); + return 0; +} + +static void bnc_verify_message_trees(NONLEAF_CHILDINFO UU(bnc)) { +#ifdef TOKU_DEBUG_PARANOID + bnc->fresh_message_tree.iterate(bnc->buffer); + bnc->stale_message_tree.iterate(bnc->buffer); +#endif +} + +static int +wbuf_write_offset(const int32_t &offset, const uint32_t UU(idx), struct wbuf *const 
wb) { + wbuf_nocrc_int(wb, offset); + return 0; +} + static void -serialize_nonleaf_childinfo(NONLEAF_CHILDINFO bnc, struct wbuf *wb) +serialize_child_buffer(NONLEAF_CHILDINFO bnc, struct wbuf *wb) { unsigned char ch = FTNODE_PARTITION_FIFO_MSG; wbuf_nocrc_char(wb, ch); @@ -323,6 +355,19 @@ serialize_nonleaf_childinfo(NONLEAF_CHILDINFO bnc, struct wbuf *wb) wbuf_nocrc_bytes(wb, key, keylen); wbuf_nocrc_bytes(wb, data, datalen); }); + + bnc_verify_message_trees(bnc); + + // serialize the message trees (num entries, offsets array): + // fresh, stale, broadcast + wbuf_nocrc_int(wb, bnc->fresh_message_tree.size()); + bnc->fresh_message_tree.iterate(wb); + + wbuf_nocrc_int(wb, bnc->stale_message_tree.size()); + bnc->stale_message_tree.iterate(wb); + + wbuf_nocrc_int(wb, bnc->broadcast_list.size()); + bnc->broadcast_list.iterate(wb); } // @@ -346,7 +391,7 @@ serialize_ftnode_partition(FTNODE node, int i, struct sub_block *sb) { wbuf_init(&wb, sb->uncompressed_ptr, sb->uncompressed_size); if (node->height > 0) { // TODO: (Zardosht) possibly exit early if there are no messages - serialize_nonleaf_childinfo(BNC(node, i), &wb); + serialize_child_buffer(BNC(node, i), &wb); } else { unsigned char ch = FTNODE_PARTITION_DMT_LEAVES; @@ -1024,8 +1069,8 @@ toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DA } static void -deserialize_child_buffer(NONLEAF_CHILDINFO bnc, struct rbuf *rbuf, - DESCRIPTOR desc, ft_compare_func cmp) { +deserialize_child_buffer_v26(NONLEAF_CHILDINFO bnc, struct rbuf *rbuf, + DESCRIPTOR desc, ft_compare_func cmp) { int r; int n_in_this_buffer = rbuf_int(rbuf); int32_t *fresh_offsets = NULL, *stale_offsets = NULL; @@ -1090,6 +1135,68 @@ deserialize_child_buffer(NONLEAF_CHILDINFO bnc, struct rbuf *rbuf, } } +// effect: deserialize a single message from rbuf and enqueue the result into the given fifo +static void +fifo_deserialize_msg_from_rbuf(FIFO fifo, struct rbuf *rbuf) { + bytevec key, val; + ITEMLEN keylen, vallen; + enum 
ft_msg_type type = (enum ft_msg_type) rbuf_char(rbuf); + bool is_fresh = rbuf_char(rbuf); + MSN msn = rbuf_msn(rbuf); + XIDS xids; + xids_create_from_buffer(rbuf, &xids); + rbuf_bytes(rbuf, &key, &keylen); /* Returns a pointer into the rbuf. */ + rbuf_bytes(rbuf, &val, &vallen); + int r = toku_fifo_enq(fifo, key, keylen, val, vallen, type, msn, xids, is_fresh, nullptr); + lazy_assert_zero(r); + xids_destroy(&xids); +} + +static void +deserialize_child_buffer(NONLEAF_CHILDINFO bnc, struct rbuf *rbuf) { + int n_in_this_buffer = rbuf_int(rbuf); + int nfresh = 0, nstale = 0, nbroadcast_offsets = 0; + int32_t *XMALLOC_N(n_in_this_buffer, stale_offsets); + int32_t *XMALLOC_N(n_in_this_buffer, fresh_offsets); + int32_t *XMALLOC_N(n_in_this_buffer, broadcast_offsets); + + toku_fifo_resize(bnc->buffer, rbuf->size + 64); + for (int i = 0; i < n_in_this_buffer; i++) { + fifo_deserialize_msg_from_rbuf(bnc->buffer, rbuf); + } + + // read in each message tree (fresh, stale, broadcast) + nfresh = rbuf_int(rbuf); + bytevec fresh_offsets_src_v; + rbuf_literal_bytes(rbuf, &fresh_offsets_src_v, nfresh * (sizeof *fresh_offsets)); + const int32_t *fresh_offsets_src = (const int32_t *) fresh_offsets_src_v; + for (int i = 0; i < nfresh; i++) { + fresh_offsets[i] = toku_dtoh32(fresh_offsets_src[i]); + } + nstale = rbuf_int(rbuf); + bytevec stale_offsets_src_v; + rbuf_literal_bytes(rbuf, &stale_offsets_src_v, nstale * (sizeof *stale_offsets)); + const int32_t *stale_offsets_src = (const int32_t *) stale_offsets_src_v; + for (int i = 0; i < nstale; i++) { + stale_offsets[i] = toku_dtoh32(stale_offsets_src[i]); + } + nbroadcast_offsets = rbuf_int(rbuf); + bytevec broadcast_offsets_src_v; + rbuf_literal_bytes(rbuf, &broadcast_offsets_src_v, nbroadcast_offsets * (sizeof *broadcast_offsets)); + const int32_t *broadcast_offsets_src = (const int32_t *) broadcast_offsets_src_v; + for (int i = 0; i < nbroadcast_offsets; i++) { + broadcast_offsets[i] = toku_dtoh32(broadcast_offsets_src[i]); + } + + 
// build OMTs out of each offset array + bnc->fresh_message_tree.destroy(); + bnc->fresh_message_tree.create_steal_sorted_array(&fresh_offsets, nfresh, n_in_this_buffer); + bnc->stale_message_tree.destroy(); + bnc->stale_message_tree.create_steal_sorted_array(&stale_offsets, nstale, n_in_this_buffer); + bnc->broadcast_list.destroy(); + bnc->broadcast_list.create_steal_sorted_array(&broadcast_offsets, nbroadcast_offsets, n_in_this_buffer); +} + // dump a buffer to stderr // no locking around this for now void @@ -1161,13 +1268,16 @@ NONLEAF_CHILDINFO toku_create_empty_nl(void) { return cn; } -// does NOT create OMTs, just the FIFO +// must clone the OMTs, since we serialize them along with the FIFO NONLEAF_CHILDINFO toku_clone_nl(NONLEAF_CHILDINFO orig_childinfo) { NONLEAF_CHILDINFO XMALLOC(cn); toku_fifo_clone(orig_childinfo->buffer, &cn->buffer); cn->fresh_message_tree.create_no_array(); + cn->fresh_message_tree.clone(orig_childinfo->fresh_message_tree); cn->stale_message_tree.create_no_array(); + cn->stale_message_tree.clone(orig_childinfo->stale_message_tree); cn->broadcast_list.create_no_array(); + cn->broadcast_list.clone(orig_childinfo->broadcast_list); memset(cn->flow, 0, sizeof cn->flow); return cn; } @@ -1513,7 +1623,13 @@ deserialize_ftnode_partition( if (node->height > 0) { assert(ch == FTNODE_PARTITION_FIFO_MSG); - deserialize_child_buffer(BNC(node, childnum), &rb, desc, cmp); + NONLEAF_CHILDINFO bnc = BNC(node, childnum); + if (node->layout_version_read_from_disk <= FT_LAYOUT_VERSION_26) { + // Layout version <= 26 did not serialize sorted message trees to disk. 
+ deserialize_child_buffer_v26(bnc, &rb, desc, cmp); + } else { + deserialize_child_buffer(bnc, &rb); + } BP_WORKDONE(node, childnum) = 0; } else { diff --git a/storage/tokudb/ft-index/ft/ftloader-internal.h b/storage/tokudb/ft-index/ft/ftloader-internal.h index be1ded59890..d60537490dd 100644 --- a/storage/tokudb/ft-index/ft/ftloader-internal.h +++ b/storage/tokudb/ft-index/ft/ftloader-internal.h @@ -245,6 +245,7 @@ struct ft_loader_s { CACHETABLE cachetable; bool did_reserve_memory; bool compress_intermediates; + bool allow_puts; uint64_t reserved_memory; // how much memory are we allowed to use? /* To make it easier to recover from errors, we don't use FILE*, instead we use an index into the file_infos. */ @@ -346,7 +347,8 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp, TOKUTXN txn, bool reserve_memory, uint64_t reserve_memory_size, - bool compress_intermediates); + bool compress_intermediates, + bool allow_puts); void toku_ft_loader_internal_destroy (FTLOADER bl, bool is_error); diff --git a/storage/tokudb/ft-index/ft/ftloader.cc b/storage/tokudb/ft-index/ft/ftloader.cc index 2df6d0a1cda..67b3cf9905e 100644 --- a/storage/tokudb/ft-index/ft/ftloader.cc +++ b/storage/tokudb/ft-index/ft/ftloader.cc @@ -356,6 +356,8 @@ int ft_loader_open_temp_file (FTLOADER bl, FIDX *file_idx) */ { int result = 0; + if (result) // debug hack + return result; FILE *f = NULL; int fd = -1; char *fname = toku_strdup(bl->temp_file_template); @@ -420,6 +422,10 @@ void toku_ft_loader_internal_destroy (FTLOADER bl, bool is_error) { } destroy_rowset(&bl->primary_rowset); + if (bl->primary_rowset_queue) { + queue_destroy(bl->primary_rowset_queue); + bl->primary_rowset_queue = nullptr; + } for (int i=0; iN; i++) { if ( bl->fractal_queues ) { @@ -543,7 +549,8 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp, TOKUTXN txn, bool reserve_memory, uint64_t reserve_memory_size, - bool compress_intermediates) + bool compress_intermediates, + bool allow_puts) // Effect: 
Allocate and initialize a FTLOADER, but do not create the extractor thread. { FTLOADER CALLOC(bl); // initialized to all zeros (hence CALLOC) @@ -560,10 +567,7 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp, bl->reserved_memory = 512*1024*1024; // if no cache table use 512MB. } bl->compress_intermediates = compress_intermediates; - if (0) { // debug - fprintf(stderr, "%s Reserved memory=%" PRId64 "\n", __FUNCTION__, bl->reserved_memory); - } - + bl->allow_puts = allow_puts; bl->src_db = src_db; bl->N = N; bl->load_lsn = load_lsn; @@ -628,7 +632,6 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp, { int r = queue_create(&bl->primary_rowset_queue, EXTRACTOR_QUEUE_DEPTH); if (r!=0) { toku_ft_loader_internal_destroy(bl, true); return r; } } - //printf("%s:%d toku_pthread_create\n", __FILE__, __LINE__); { ft_loader_lock_init(bl); } @@ -650,34 +653,38 @@ int toku_ft_loader_open (/* out */ FTLOADER *blp, TOKUTXN txn, bool reserve_memory, uint64_t reserve_memory_size, - bool compress_intermediates) -/* Effect: called by DB_ENV->create_loader to create an ft loader. - * Arguments: - * blp Return the ft loader here. - * g The function for generating a row - * src_db The source database. Needed by g. May be NULL if that's ok with g. - * N The number of dbs to create. - * dbs An array of open databases. Used by g. The data will be put in these database. - * new_fnames The file names (these strings are owned by the caller: we make a copy for our own purposes). - * temp_file_template A template suitable for mkstemp() - * Return value: 0 on success, an error number otherwise. - */ -{ + bool compress_intermediates, + bool allow_puts) { +// Effect: called by DB_ENV->create_loader to create a brt loader. +// Arguments: +// blp Return the brt loader here. +// g The function for generating a row +// src_db The source database. Needed by g. May be NULL if that's ok with g. +// N The number of dbs to create. +// dbs An array of open databases. Used by g. 
The data will be put in these database. +// new_fnames The file names (these strings are owned by the caller: we make a copy for our own purposes). +// temp_file_template A template suitable for mkstemp() +// reserve_memory Cause the loader to reserve memory for its use from the cache table. +// compress_intermediates Cause the loader to compress intermediate loader files. +// allow_puts Prepare the loader for rows to insert. When puts are disabled, the loader does not run the +// extractor or the fractal tree writer threads. +// Return value: 0 on success, an error number otherwise. int result = 0; { int r = toku_ft_loader_internal_init(blp, cachetable, g, src_db, - N, fts, dbs, - new_fnames_in_env, - bt_compare_functions, - temp_file_template, - load_lsn, - txn, - reserve_memory, - reserve_memory_size, - compress_intermediates); + N, fts, dbs, + new_fnames_in_env, + bt_compare_functions, + temp_file_template, + load_lsn, + txn, + reserve_memory, + reserve_memory_size, + compress_intermediates, + allow_puts); if (r!=0) result = r; } - if (result==0) { + if (result==0 && allow_puts) { FTLOADER bl = *blp; int r = toku_pthread_create(&bl->extractor_thread, NULL, extractor_thread, (void*)bl); if (r==0) { @@ -1213,6 +1220,7 @@ finish_extractor (FTLOADER bl) { { int r = queue_destroy(bl->primary_rowset_queue); invariant(r==0); + bl->primary_rowset_queue = nullptr; } rval = ft_loader_fi_close_all(&bl->file_infos); @@ -1374,10 +1382,9 @@ int toku_ft_loader_put (FTLOADER bl, DBT *key, DBT *val) * Return value: 0 on success, an error number otherwise. 
*/ { - if (ft_loader_get_error(&bl->error_callback)) + if (!bl->allow_puts || ft_loader_get_error(&bl->error_callback)) return EINVAL; // previous panic bl->n_rows++; -// return loader_write_row(key, val, bl->fprimary_rows, &bl->fprimary_offset, bl); return loader_do_put(bl, key, val); } @@ -2425,6 +2432,8 @@ static int toku_loader_write_ft_from_q (FTLOADER bl, if (r) { result = r; drain_writer_q(q); + r = toku_os_close(fd); + assert_zero(r); return result; } FILE *pivots_stream = toku_bl_fidx2file(bl, pivots_file); @@ -2714,12 +2723,7 @@ static int loader_do_i (FTLOADER bl, struct rowset *rows = &(bl->rows[which_db]); invariant(rows->data==NULL); // the rows should be all cleaned up already - // a better allocation would be to figure out roughly how many merge passes we'll need. - int allocation_for_merge = (2*progress_allocation)/3; - progress_allocation -= allocation_for_merge; - - int r; - r = queue_create(&bl->fractal_queues[which_db], FRACTAL_WRITER_QUEUE_DEPTH); + int r = queue_create(&bl->fractal_queues[which_db], FRACTAL_WRITER_QUEUE_DEPTH); if (r) goto error; { @@ -2740,49 +2744,62 @@ static int loader_do_i (FTLOADER bl, r = dest_db->get_fanout(dest_db, &target_fanout); invariant_zero(r); - // This structure must stay live until the join below. - struct fractal_thread_args fta = { bl, - descriptor, - fd, - progress_allocation, - bl->fractal_queues[which_db], - bl->extracted_datasizes[which_db], - 0, - which_db, - target_nodesize, - target_basementnodesize, - target_compression_method, - target_fanout - }; + if (bl->allow_puts) { + // a better allocation would be to figure out roughly how many merge passes we'll need. + int allocation_for_merge = (2*progress_allocation)/3; + progress_allocation -= allocation_for_merge; + + // This structure must stay live until the join below. 
+ struct fractal_thread_args fta = { + bl, + descriptor, + fd, + progress_allocation, + bl->fractal_queues[which_db], + bl->extracted_datasizes[which_db], + 0, + which_db, + target_nodesize, + target_basementnodesize, + target_compression_method, + target_fanout + }; - r = toku_pthread_create(bl->fractal_threads+which_db, NULL, fractal_thread, (void*)&fta); - if (r) { - int r2 __attribute__((__unused__)) = queue_destroy(bl->fractal_queues[which_db]); - // ignore r2, since we already have an error - goto error; - } - invariant(bl->fractal_threads_live[which_db]==false); - bl->fractal_threads_live[which_db] = true; + r = toku_pthread_create(bl->fractal_threads+which_db, NULL, fractal_thread, (void*)&fta); + if (r) { + int r2 __attribute__((__unused__)) = queue_destroy(bl->fractal_queues[which_db]); + // ignore r2, since we already have an error + bl->fractal_queues[which_db] = nullptr; + goto error; + } + invariant(bl->fractal_threads_live[which_db]==false); + bl->fractal_threads_live[which_db] = true; - r = merge_files(fs, bl, which_db, dest_db, compare, allocation_for_merge, bl->fractal_queues[which_db]); + r = merge_files(fs, bl, which_db, dest_db, compare, allocation_for_merge, bl->fractal_queues[which_db]); - { - void *toku_pthread_retval; - int r2 = toku_pthread_join(bl->fractal_threads[which_db], &toku_pthread_retval); - invariant(fta.bl==bl); // this is a gratuitous assertion to make sure that the fta struct is still live here. A previous bug but that struct into a C block statement. - resource_assert_zero(r2); - invariant(toku_pthread_retval==NULL); - invariant(bl->fractal_threads_live[which_db]); - bl->fractal_threads_live[which_db] = false; - if (r == 0) r = fta.errno_result; + { + void *toku_pthread_retval; + int r2 = toku_pthread_join(bl->fractal_threads[which_db], &toku_pthread_retval); + invariant(fta.bl==bl); // this is a gratuitous assertion to make sure that the fta struct is still live here. A previous bug put that struct into a C block statement. 
+ resource_assert_zero(r2); + invariant(toku_pthread_retval==NULL); + invariant(bl->fractal_threads_live[which_db]); + bl->fractal_threads_live[which_db] = false; + if (r == 0) r = fta.errno_result; + } + } else { + queue_eof(bl->fractal_queues[which_db]); + r = toku_loader_write_ft_from_q(bl, descriptor, fd, progress_allocation, + bl->fractal_queues[which_db], bl->extracted_datasizes[which_db], which_db, + target_nodesize, target_basementnodesize, target_compression_method, target_fanout); } } error: // this is the cleanup code. Even if r==0 (no error) we fall through to here. - { + if (bl->fractal_queues[which_db]) { int r2 = queue_destroy(bl->fractal_queues[which_db]); invariant(r2==0); - bl->fractal_queues[which_db]=NULL; + bl->fractal_queues[which_db] = nullptr; } // if we get here we need to free up the merge_fileset and the rowset, as well as the keys @@ -2851,6 +2868,10 @@ int toku_ft_loader_close (FTLOADER bl, if (r) result = r; invariant(!bl->extractor_live); + } else { + r = finish_primary_rows(bl); + if (r) + result = r; } // check for an error during extraction diff --git a/storage/tokudb/ft-index/ft/ftloader.h b/storage/tokudb/ft-index/ft/ftloader.h index c3376c90e91..c920b4c5362 100644 --- a/storage/tokudb/ft-index/ft/ftloader.h +++ b/storage/tokudb/ft-index/ft/ftloader.h @@ -113,7 +113,8 @@ int toku_ft_loader_open (FTLOADER *bl, TOKUTXN txn, bool reserve_memory, uint64_t reserve_memory_size, - bool compress_intermediates); + bool compress_intermediates, + bool allow_puts); int toku_ft_loader_put (FTLOADER bl, DBT *key, DBT *val); diff --git a/storage/tokudb/ft-index/ft/log_upgrade.cc b/storage/tokudb/ft-index/ft/log_upgrade.cc index e5a36a88cff..8dba57e9d8d 100644 --- a/storage/tokudb/ft-index/ft/log_upgrade.cc +++ b/storage/tokudb/ft-index/ft/log_upgrade.cc @@ -321,8 +321,8 @@ toku_maybe_upgrade_log(const char *env_dir, const char *log_dir, LSN * lsn_of_cl r = 0; //Logs are up to date else { FOOTPRINT(4); - LSN last_lsn; - TXNID last_xid; + LSN 
last_lsn = ZERO_LSN; + TXNID last_xid = TXNID_NONE; r = verify_clean_shutdown_of_log_version(log_dir, version_of_logs_on_disk, &last_lsn, &last_xid); if (r != 0) { goto cleanup; diff --git a/storage/tokudb/ft-index/ft/logger.cc b/storage/tokudb/ft-index/ft/logger.cc index e4fd854c637..bbac5cf7de3 100644 --- a/storage/tokudb/ft-index/ft/logger.cc +++ b/storage/tokudb/ft-index/ft/logger.cc @@ -621,7 +621,7 @@ int toku_logger_find_next_unused_log_file(const char *directory, long long *resu if (d==0) return get_error_errno(); while ((de=readdir(d))) { if (de==0) return get_error_errno(); - long long thisl; + long long thisl = -1; if ( is_a_logfile(de->d_name, &thisl) ) { if ((long long)thisl > maxf) maxf = thisl; } diff --git a/storage/tokudb/ft-index/ft/tests/ftloader-test-bad-generate.cc b/storage/tokudb/ft-index/ft/tests/ftloader-test-bad-generate.cc index 1ecae89da78..9ae24f7c4ec 100644 --- a/storage/tokudb/ft-index/ft/tests/ftloader-test-bad-generate.cc +++ b/storage/tokudb/ft-index/ft/tests/ftloader-test-bad-generate.cc @@ -170,7 +170,7 @@ static void test_extractor(int nrows, int nrowsets, bool expect_fail) { } FTLOADER loader; - r = toku_ft_loader_open(&loader, NULL, generate, NULL, N, fts, dbs, fnames, compares, "tempXXXXXX", ZERO_LSN, nullptr, true, 0, false); + r = toku_ft_loader_open(&loader, NULL, generate, NULL, N, fts, dbs, fnames, compares, "tempXXXXXX", ZERO_LSN, nullptr, true, 0, false, true); assert(r == 0); struct rowset *rowset[nrowsets]; diff --git a/storage/tokudb/ft-index/ft/tests/ftloader-test-extractor-errors.cc b/storage/tokudb/ft-index/ft/tests/ftloader-test-extractor-errors.cc index 4dcd7fb2f8c..007fd39fe08 100644 --- a/storage/tokudb/ft-index/ft/tests/ftloader-test-extractor-errors.cc +++ b/storage/tokudb/ft-index/ft/tests/ftloader-test-extractor-errors.cc @@ -180,7 +180,7 @@ static void test_extractor(int nrows, int nrowsets, bool expect_fail, const char sprintf(temp, "%s/%s", testdir, "tempXXXXXX"); FTLOADER loader; - r = 
toku_ft_loader_open(&loader, NULL, generate, NULL, N, fts, dbs, fnames, compares, "tempXXXXXX", ZERO_LSN, nullptr, true, 0, false); + r = toku_ft_loader_open(&loader, NULL, generate, NULL, N, fts, dbs, fnames, compares, "tempXXXXXX", ZERO_LSN, nullptr, true, 0, false, true); assert(r == 0); struct rowset *rowset[nrowsets]; diff --git a/storage/tokudb/ft-index/ft/tests/ftloader-test-extractor.cc b/storage/tokudb/ft-index/ft/tests/ftloader-test-extractor.cc index 0a8ce157269..afba44a7a22 100644 --- a/storage/tokudb/ft-index/ft/tests/ftloader-test-extractor.cc +++ b/storage/tokudb/ft-index/ft/tests/ftloader-test-extractor.cc @@ -402,7 +402,7 @@ static void test_extractor(int nrows, int nrowsets, const char *testdir) { sprintf(temp, "%s/%s", testdir, "tempXXXXXX"); FTLOADER loader; - r = toku_ft_loader_open(&loader, NULL, generate, NULL, N, fts, dbs, fnames, compares, temp, ZERO_LSN, nullptr, true, 0, false); + r = toku_ft_loader_open(&loader, NULL, generate, NULL, N, fts, dbs, fnames, compares, temp, ZERO_LSN, nullptr, true, 0, false, true); assert(r == 0); struct rowset *rowset[nrowsets]; diff --git a/storage/tokudb/ft-index/ft/tests/ftloader-test-merge-files-dbufio.cc b/storage/tokudb/ft-index/ft/tests/ftloader-test-merge-files-dbufio.cc index 82583595470..cdd4c1d6691 100644 --- a/storage/tokudb/ft-index/ft/tests/ftloader-test-merge-files-dbufio.cc +++ b/storage/tokudb/ft-index/ft/tests/ftloader-test-merge-files-dbufio.cc @@ -412,7 +412,7 @@ static void test (const char *directory, bool is_error) { bt_compare_functions, "tempxxxxxx", *lsnp, - nullptr, true, 0, false); + nullptr, true, 0, false, true); assert(r==0); } @@ -500,11 +500,6 @@ static void test (const char *directory, bool is_error) { assert(cthunk.n_read == N_RECORDS); } } - //printf("%s:%d Destroying\n", __FILE__, __LINE__); - { - int r = queue_destroy(bl->primary_rowset_queue); - assert(r==0); - } { int r = queue_destroy(q); assert(r==0); diff --git 
a/storage/tokudb/ft-index/ft/tests/ftloader-test-open.cc b/storage/tokudb/ft-index/ft/tests/ftloader-test-open.cc index f2919f04d3d..cdf0a14ab00 100644 --- a/storage/tokudb/ft-index/ft/tests/ftloader-test-open.cc +++ b/storage/tokudb/ft-index/ft/tests/ftloader-test-open.cc @@ -143,7 +143,7 @@ static void test_loader_open(int ndbs) { for (i = 0; ; i++) { set_my_malloc_trigger(i+1); - r = toku_ft_loader_open(&loader, NULL, NULL, NULL, ndbs, fts, dbs, fnames, compares, "", ZERO_LSN, nullptr, true, 0, false); + r = toku_ft_loader_open(&loader, NULL, NULL, NULL, ndbs, fts, dbs, fnames, compares, "", ZERO_LSN, nullptr, true, 0, false, true); if (r == 0) break; } diff --git a/storage/tokudb/ft-index/ft/tests/test_rightmost_leaf_seqinsert_heuristic.cc b/storage/tokudb/ft-index/ft/tests/test_rightmost_leaf_seqinsert_heuristic.cc new file mode 100644 index 00000000000..100e5153636 --- /dev/null +++ b/storage/tokudb/ft-index/ft/tests/test_rightmost_leaf_seqinsert_heuristic.cc @@ -0,0 +1,183 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). + + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2014 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. 
+ + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved." + +#include "test.h" + +#include +#include + +// Each FT maintains a sequential insert heuristic to determine if its +// worth trying to insert directly into a well-known rightmost leaf node. +// +// The heuristic is only maintained when a rightmost leaf node is known. +// +// This test verifies that sequential inserts increase the seqinsert score +// and that a single non-sequential insert resets the score. 
+
+static void test_seqinsert_heuristic(void) {
+    int r = 0;
+    char name[TOKU_PATH_MAX + 1];
+    toku_path_join(name, 2, TOKU_TEST_FILENAME, "ftdata");
+    toku_os_recursive_delete(TOKU_TEST_FILENAME);
+    r = toku_os_mkdir(TOKU_TEST_FILENAME, S_IRWXU); CKERR(r);
+
+    FT_HANDLE ft_handle;
+    CACHETABLE ct;
+    toku_cachetable_create(&ct, 0, ZERO_LSN, NULL_LOGGER);
+    r = toku_open_ft_handle(name, 1, &ft_handle,
+                            4*1024*1024, 64*1024,
+                            TOKU_DEFAULT_COMPRESSION_METHOD, ct, NULL,
+                            toku_builtin_compare_fun); CKERR(r);
+    FT ft = ft_handle->ft;
+
+    int k;
+    DBT key, val;
+    const int val_size = 1024 * 1024;
+    char *XMALLOC_N(val_size, val_buf);
+    memset(val_buf, 'x', val_size);
+    toku_fill_dbt(&val, val_buf, val_size);
+
+    // Insert many rows sequentially. This is enough data to:
+    // - force the root to split (the rightmost leaf will then be known)
+    // - raise the seqinsert score high enough to enable direct rightmost injections
+    const int rows_to_insert = 200;
+    for (int i = 0; i < rows_to_insert; i++) {
+        k = toku_htonl(i);
+        toku_fill_dbt(&key, &k, sizeof(k));
+        toku_ft_insert(ft_handle, &key, &val, NULL);
+    }
+    invariant(ft->rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL);
+    invariant(ft->seqinsert_score == FT_SEQINSERT_SCORE_THRESHOLD);
+
+    // Insert on the left extreme. The seq insert score is high enough
+    // that we will attempt to insert into the rightmost leaf. We won't
+    // be successful because key 0 won't be in the bounds of the rightmost leaf.
+    // This failure should reset the seqinsert score back to 0.
+    k = toku_htonl(0);
+    toku_fill_dbt(&key, &k, sizeof(k));
+    toku_ft_insert(ft_handle, &key, &val, NULL);
+    invariant(ft->seqinsert_score == 0);
+
+    // Insert in the middle. The score should not go up.
+    k = toku_htonl(rows_to_insert / 2);
+    toku_fill_dbt(&key, &k, sizeof(k));
+    toku_ft_insert(ft_handle, &key, &val, NULL);
+    invariant(ft->seqinsert_score == 0);
+
+    // Insert on the right extreme. The score should go up.
+    k = toku_htonl(rows_to_insert);
+    toku_fill_dbt(&key, &k, sizeof(k));
+    toku_ft_insert(ft_handle, &key, &val, NULL);
+    invariant(ft->seqinsert_score == 1);
+
+    // Insert on the right extreme again, the score should go up.
+    k = toku_htonl(rows_to_insert + 1);
+    toku_fill_dbt(&key, &k, sizeof(k));
+    toku_ft_insert(ft_handle, &key, &val, NULL);
+    invariant(ft->seqinsert_score == 2);
+
+    // Insert close to, but not at, the right extreme. The score should reset.
+    // -- the magic number 4 derives from the fact that vals are 1mb and nodes are 4mb
+    k = toku_htonl(rows_to_insert - 4);
+    toku_fill_dbt(&key, &k, sizeof(k));
+    toku_ft_insert(ft_handle, &key, &val, NULL);
+    invariant(ft->seqinsert_score == 0);
+
+    toku_free(val_buf);
+    toku_ft_handle_close(ft_handle);
+    toku_cachetable_close(&ct);
+    toku_os_recursive_delete(TOKU_TEST_FILENAME);
+}
+
+int test_main(int argc, const char *argv[]) {
+    default_parse_args(argc, argv);
+    test_seqinsert_heuristic();
+    return 0;
+}
diff --git a/storage/tokudb/ft-index/ft/tests/test_rightmost_leaf_split_merge.cc b/storage/tokudb/ft-index/ft/tests/test_rightmost_leaf_split_merge.cc
new file mode 100644
index 00000000000..517fc277fd3
--- /dev/null
+++ b/storage/tokudb/ft-index/ft/tests/test_rightmost_leaf_split_merge.cc
@@ -0,0 +1,212 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*
+COPYING CONDITIONS NOTICE:
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation, and provided that the
+  following conditions are met:
+
+      * Redistributions of source code must retain this COPYING
+        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
+        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
+        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
+        GRANT (below).
+ + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2014 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. 
+ + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved." + +#include "test.h" + +#include +#include + +// Promotion tracks the rightmost blocknum in the FT when a message +// is successfully promoted to a non-root leaf node on the right extreme. +// +// This test verifies that a split or merge of the rightmost leaf properly +// maintains the rightmost blocknum (which is constant - the pair's swap values, +// like the root blocknum). 
+ +static void test_split_merge(void) { + int r = 0; + char name[TOKU_PATH_MAX + 1]; + toku_path_join(name, 2, TOKU_TEST_FILENAME, "ftdata"); + toku_os_recursive_delete(TOKU_TEST_FILENAME); + r = toku_os_mkdir(TOKU_TEST_FILENAME, S_IRWXU); CKERR(r); + + FT_HANDLE ft_handle; + CACHETABLE ct; + toku_cachetable_create(&ct, 0, ZERO_LSN, NULL_LOGGER); + r = toku_open_ft_handle(name, 1, &ft_handle, + 4*1024*1024, 64*1024, + TOKU_DEFAULT_COMPRESSION_METHOD, ct, NULL, + toku_builtin_compare_fun); CKERR(r); + + // We have a root blocknum, but no rightmost blocknum yet. + FT ft = ft_handle->ft; + invariant(ft->h->root_blocknum.b != RESERVED_BLOCKNUM_NULL); + invariant(ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL); + + int k; + DBT key, val; + const int val_size = 1 * 1024 * 1024; + char *XMALLOC_N(val_size, val_buf); + memset(val_buf, 'x', val_size); + toku_fill_dbt(&val, val_buf, val_size); + + // Insert 16 rows (should induce a few splits) + const int rows_to_insert = 16; + for (int i = 0; i < rows_to_insert; i++) { + k = toku_htonl(i); + toku_fill_dbt(&key, &k, sizeof(k)); + toku_ft_insert(ft_handle, &key, &val, NULL); + } + + // rightmost blocknum should be set, because the root split and promotion + // did a rightmost insertion directly into the rightmost leaf, lazily + // initializing the rightmost blocknum. 
+ invariant(ft->rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL); + + BLOCKNUM root_blocknum = ft->h->root_blocknum; + FTNODE root_node; + struct ftnode_fetch_extra bfe; + fill_bfe_for_full_read(&bfe, ft); + toku_pin_ftnode(ft, root_blocknum, + toku_cachetable_hash(ft->cf, ft->h->root_blocknum), + &bfe, PL_WRITE_EXPENSIVE, &root_node, true); + // root blocknum should be consistent + invariant(root_node->thisnodename.b == ft->h->root_blocknum.b); + // root should have split at least once, and it should now be at height 1 + invariant(root_node->n_children > 1); + invariant(root_node->height == 1); + // rightmost blocknum should no longer be the root, since the root split + invariant(ft->h->root_blocknum.b != ft->rightmost_blocknum.b); + // the right child should have the rightmost blocknum + invariant(BP_BLOCKNUM(root_node, root_node->n_children - 1).b == ft->rightmost_blocknum.b); + + BLOCKNUM rightmost_blocknum_before_merge = ft->rightmost_blocknum; + const int num_children_before_merge = root_node->n_children; + + // delete the last 6 rows. 
+ // - 1mb each, so 6mb deleted + // - should be enough to delete the entire rightmost leaf + some of its neighbor + const int rows_to_delete = 6; + toku_unpin_ftnode(ft, root_node); + for (int i = 0; i < rows_to_delete; i++) { + k = toku_htonl(rows_to_insert - i); + toku_fill_dbt(&key, &k, sizeof(k)); + toku_ft_delete(ft_handle, &key, NULL); + } + toku_pin_ftnode(ft, root_blocknum, + toku_cachetable_hash(ft->cf, root_blocknum), + &bfe, PL_WRITE_EXPENSIVE, &root_node, true); + + // - rightmost leaf should be fusible after those deletes (which were promoted directly to the leaf) + FTNODE rightmost_leaf; + toku_pin_ftnode(ft, rightmost_blocknum_before_merge, + toku_cachetable_hash(ft->cf, rightmost_blocknum_before_merge), + &bfe, PL_WRITE_EXPENSIVE, &rightmost_leaf, true); + invariant(get_node_reactivity(ft, rightmost_leaf) == RE_FUSIBLE); + toku_unpin_ftnode(ft, rightmost_leaf); + + // - merge the rightmost child now that it's fusible + toku_ft_merge_child(ft, root_node, root_node->n_children - 1); + toku_pin_ftnode(ft, root_blocknum, + toku_cachetable_hash(ft->cf, root_blocknum), + &bfe, PL_WRITE_EXPENSIVE, &root_node, true); + + // the merge should have worked, and the root should still be at height 1 + invariant(root_node->n_children < num_children_before_merge); + invariant(root_node->height == 1); + // the rightmost child of the root has the rightmost blocknum + invariant(BP_BLOCKNUM(root_node, root_node->n_children - 1).b == ft->rightmost_blocknum.b); + // the value for rightmost blocknum itself should not have changed + // (we keep it constant, like the root blocknum) + invariant(rightmost_blocknum_before_merge.b == ft->rightmost_blocknum.b); + + toku_unpin_ftnode(ft, root_node); + + toku_free(val_buf); + toku_ft_handle_close(ft_handle); + toku_cachetable_close(&ct); + toku_os_recursive_delete(TOKU_TEST_FILENAME); +} + +int test_main(int argc, const char *argv[]) { + default_parse_args(argc, argv); + test_split_merge(); + return 0; +} diff --git 
a/storage/tokudb/ft-index/ft/tokuftdump.cc b/storage/tokudb/ft-index/ft/tokuftdump.cc index f2d4fce83cb..a7d94f41d78 100644 --- a/storage/tokudb/ft-index/ft/tokuftdump.cc +++ b/storage/tokudb/ft-index/ft/tokuftdump.cc @@ -89,7 +89,7 @@ PATENT RIGHTS GRANT: #ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." -/* Tell me the diff between two FT files. */ +// Dump a fractal tree file #include "cachetable.h" #include "ft.h" @@ -102,20 +102,26 @@ PATENT RIGHTS GRANT: #include #include -static void -format_time(const uint64_t time_int, char *buf) { +static int do_dump_data = 1; +static int do_interactive = 0; +static int do_header = 0; +static int do_fragmentation = 0; +static int do_garbage = 0; +static int do_translation_table = 0; +static int do_rootnode = 0; +static int do_tsv = 0; + +static const char *arg0; +static const char *fname; + +static void format_time(const uint64_t time_int, char *buf) { time_t timer = (time_t) time_int; ctime_r(&timer, buf); assert(buf[24] == '\n'); buf[24] = 0; } -static int dump_data = 1; - -static CACHETABLE ct; - -static void -print_item (bytevec val, ITEMLEN len) { +static void print_item(bytevec val, ITEMLEN len) { printf("\""); ITEMLEN i; for (i=0; idbt.size); simple_hex_dump((unsigned char*) d->dbt.data, d->dbt.size); printf("\n"); } -static void -open_header (int f, FT *header, CACHEFILE cf) { +static void open_header(int fd, FT *header, CACHEFILE cf) { FT ft = NULL; int r; - r = toku_deserialize_ft_from (f, MAX_LSN, &ft); - assert(r==0); + r = toku_deserialize_ft_from (fd, MAX_LSN, &ft); + if (r != 0) { + fprintf(stderr, "%s: can not deserialize from %s error %d\n", arg0, fname, r); + exit(1); + } + 
assert_zero(r); ft->cf = cf; *header = ft; } -static void -dump_header(FT ft) { +static void dump_header(FT ft) { char timestr[26]; printf("ft:\n"); printf(" layout_version=%d\n", ft->h->layout_version); @@ -212,29 +217,19 @@ dump_header(FT ft) { printf(" estimated numbytes=%" PRId64 "\n", ft->in_memory_stats.numbytes); } -static int -print_le( - const void* key, - const uint32_t keylen, - const LEAFENTRY &le, - const uint32_t idx UU(), - void *const ai UU() - ) -{ +static int print_le(const void* key, const uint32_t keylen, const LEAFENTRY &le, const uint32_t idx UU(), void *const ai UU()) { print_klpair(stdout, key, keylen, le); printf("\n"); return 0; } - -static void -dump_node (int f, BLOCKNUM blocknum, FT h) { +static void dump_node(int fd, BLOCKNUM blocknum, FT h) { FTNODE n; struct ftnode_fetch_extra bfe; FTNODE_DISK_DATA ndd = NULL; fill_bfe_for_full_read(&bfe, h); - int r = toku_deserialize_ftnode_from (f, blocknum, 0 /*pass zero for hash, it doesn't matter*/, &n, &ndd, &bfe); - assert(r==0); + int r = toku_deserialize_ftnode_from (fd, blocknum, 0 /*pass zero for hash, it doesn't matter*/, &n, &ndd, &bfe); + assert_zero(r); assert(n!=0); printf("ftnode\n"); DISKOFF disksize, diskoffset; @@ -271,15 +266,16 @@ dump_node (int f, BLOCKNUM blocknum, FT h) { } printf(" children:\n"); for (int i=0; in_children; i++) { + printf(" child %d: ", i); if (n->height > 0) { - printf(" child %d: %" PRId64 "\n", i, BP_BLOCKNUM(n, i).b); + printf("%" PRId64 "\n", BP_BLOCKNUM(n, i).b); NONLEAF_CHILDINFO bnc = BNC(n, i); unsigned int n_bytes = toku_bnc_nbytesinbuf(bnc); int n_entries = toku_bnc_n_entries(bnc); if (n_bytes > 0 || n_entries > 0) { printf(" buffer contains %u bytes (%d items)\n", n_bytes, n_entries); } - if (dump_data) { + if (do_dump_data) { FIFO_ITERATE(bnc->buffer, key, keylen, data, datalen, typ, msn, xids, UU(is_fresh), { printf(" msn=%" PRIu64 " (0x%" PRIx64 ") ", msn.msn, msn.msn); @@ -316,7 +312,7 @@ dump_node (int f, BLOCKNUM blocknum, FT h) { } else { 
printf(" n_bytes_in_buffer= %" PRIu64 "", BLB_DATA(n, i)->get_disk_size()); printf(" items_in_buffer=%u\n", BLB_DATA(n, i)->num_klpairs()); - if (dump_data) { + if (do_dump_data) { BLB_DATA(n, i)->iterate(NULL); } } @@ -325,13 +321,11 @@ dump_node (int f, BLOCKNUM blocknum, FT h) { toku_free(ndd); } -static void -dump_block_translation(FT h, uint64_t offset) { +static void dump_block_translation(FT h, uint64_t offset) { toku_blocknum_dump_translation(h->blocktable, make_blocknum(offset)); } -static void -dump_fragmentation(int UU(f), FT h, int tsv) { +static void dump_fragmentation(int UU(f), FT h, int tsv) { int64_t used_space; int64_t total_space; toku_blocktable_internal_fragmentation(h->blocktable, &total_space, &used_space); @@ -349,21 +343,20 @@ dump_fragmentation(int UU(f), FT h, int tsv) { } typedef struct { - int f; + int fd; FT h; uint64_t blocksizes; uint64_t leafsizes; uint64_t leafblocks; } frag_help_extra; -static int -nodesizes_helper(BLOCKNUM b, int64_t size, int64_t UU(address), void *extra) { +static int nodesizes_helper(BLOCKNUM b, int64_t size, int64_t UU(address), void *extra) { frag_help_extra *CAST_FROM_VOIDP(info, extra); FTNODE n; FTNODE_DISK_DATA ndd = NULL; struct ftnode_fetch_extra bfe; fill_bfe_for_full_read(&bfe, info->h); - int r = toku_deserialize_ftnode_from(info->f, b, 0 /*pass zero for hash, it doesn't matter*/, &n, &ndd, &bfe); + int r = toku_deserialize_ftnode_from(info->fd, b, 0 /*pass zero for hash, it doesn't matter*/, &n, &ndd, &bfe); if (r==0) { info->blocksizes += size; if (n->height == 0) { @@ -376,11 +369,10 @@ nodesizes_helper(BLOCKNUM b, int64_t size, int64_t UU(address), void *extra) { return 0; } -static void -dump_nodesizes(int f, FT h) { +static void dump_nodesizes(int fd, FT h) { frag_help_extra info; memset(&info, 0, sizeof(info)); - info.f = f; + info.fd = fd; info.h = h; toku_blocktable_iterate(h->blocktable, TRANSLATION_CHECKPOINTED, nodesizes_helper, &info, true, true); @@ -389,36 +381,45 @@ 
dump_nodesizes(int f, FT h) { printf("leafsizes\t%" PRIu64 "\n", info.leafsizes); } -static void -dump_garbage_stats(int f, FT ft) { - invariant(f == toku_cachefile_get_fd(ft->cf)); +static void dump_garbage_stats(int fd, FT ft) { + assert(fd == toku_cachefile_get_fd(ft->cf)); uint64_t total_space = 0; uint64_t used_space = 0; toku_ft_get_garbage(ft, &total_space, &used_space); - printf("total_size\t%" PRIu64 "\n", total_space); - printf("used_size\t%" PRIu64 "\n", used_space); + printf("garbage total size\t%" PRIu64 "\n", total_space); + printf("garbage used size\t%" PRIu64 "\n", used_space); } -static uint32_t -get_unaligned_uint32(unsigned char *p) { - return *(uint32_t *)p; +typedef struct __dump_node_extra { + int fd; + FT h; +} dump_node_extra; + +static int dump_node_wrapper(BLOCKNUM b, int64_t UU(size), int64_t UU(address), void *extra) { + dump_node_extra *CAST_FROM_VOIDP(info, extra); + dump_node(info->fd, b, info->h); + return 0; +} + +static uint32_t get_unaligned_uint32(unsigned char *p) { + uint32_t n; + memcpy(&n, p, sizeof n); + return n; } struct dump_sub_block { - uint32_t compressed_size; - uint32_t uncompressed_size; - uint32_t xsum; + uint32_t compressed_size; + uint32_t uncompressed_size; + uint32_t xsum; }; -static void -sub_block_deserialize(struct dump_sub_block *sb, unsigned char *sub_block_header) { +static void sub_block_deserialize(struct dump_sub_block *sb, unsigned char *sub_block_header) { sb->compressed_size = toku_dtoh32(get_unaligned_uint32(sub_block_header+0)); sb->uncompressed_size = toku_dtoh32(get_unaligned_uint32(sub_block_header+4)); sb->xsum = toku_dtoh32(get_unaligned_uint32(sub_block_header+8)); } -static void -verify_block(unsigned char *cp, uint64_t file_offset, uint64_t size) { +static void verify_block(unsigned char *cp, uint64_t file_offset, uint64_t size) { // verify the header checksum const size_t node_header = 8 + sizeof (uint32_t) + sizeof (uint32_t) + sizeof (uint32_t); @@ -461,24 +462,22 @@ 
verify_block(unsigned char *cp, uint64_t file_offset, uint64_t size) { printf("offset %u expected %" PRIu64 "\n", offset, size); } -static void -dump_block(int f, BLOCKNUM blocknum, FT h) { +static void dump_block(int fd, BLOCKNUM blocknum, FT h) { DISKOFF offset, size; toku_translate_blocknum_to_offset_size(h->blocktable, blocknum, &offset, &size); printf("%" PRId64 " at %" PRId64 " size %" PRId64 "\n", blocknum.b, offset, size); unsigned char *CAST_FROM_VOIDP(vp, toku_malloc(size)); - uint64_t r = pread(f, vp, size, offset); + uint64_t r = pread(fd, vp, size, offset); if (r == (uint64_t)size) { verify_block(vp, offset, size); } toku_free(vp); } -static void -dump_file(int f, uint64_t offset, uint64_t size, FILE *outfp) { +static void dump_file(int fd, uint64_t offset, uint64_t size, FILE *outfp) { unsigned char *XMALLOC_N(size, vp); - uint64_t r = pread(f, vp, size, offset); + uint64_t r = pread(fd, vp, size, offset); if (r == size) { if (outfp == stdout) { hex_dump(vp, offset, size); @@ -490,13 +489,11 @@ dump_file(int f, uint64_t offset, uint64_t size, FILE *outfp) { toku_free(vp); } -static void -set_file(int f, uint64_t offset, unsigned char newc) { - toku_os_pwrite(f, &newc, sizeof newc, offset); +static void set_file(int fd, uint64_t offset, unsigned char newc) { + toku_os_pwrite(fd, &newc, sizeof newc, offset); } -static int -readline (char *line, int maxline) { +static int readline(char *line, int maxline) { int i = 0; int c; while ((c = getchar()) != EOF && c != '\n' && i < maxline) { @@ -506,8 +503,7 @@ readline (char *line, int maxline) { return c == EOF ? 
EOF : i; } -static int -split_fields (char *line, char *fields[], int maxfields) { +static int split_fields(char *line, char *fields[], int maxfields) { int i; for (i=0; if, b, info->h); - return 0; -} - -static void -interactive_help(void) { +static void interactive_help(void) { fprintf(stderr, "help\n"); fprintf(stderr, "header\n"); fprintf(stderr, "node NUMBER\n"); @@ -552,133 +538,160 @@ interactive_help(void) { fprintf(stderr, "quit\n"); } -static uint64_t -getuint64(const char *f) { - if (strncmp(f, "0x", 2) == 0 || strncmp(f, "0X", 2) == 0) - return strtoull(f, 0, 16); - else if (strncmp(f, "0", 1) == 0) - return strtoull(f, 0, 8); - else - return strtoull(f, 0, 10); +static void run_iteractive_loop(int fd, FT ft, CACHEFILE cf) { + while (1) { + printf("ftdump>"); fflush(stdout); + enum { maxline = 64}; + char line[maxline+1]; + int r = readline(line, maxline); + if (r == EOF) + break; + const int maxfields = 4; + char *fields[maxfields]; + int nfields = split_fields(line, fields, maxfields); + if (nfields == 0) + continue; + if (strcmp(fields[0], "help") == 0) { + interactive_help(); + } else if (strcmp(fields[0], "header") == 0) { + toku_ft_free(ft); + open_header(fd, &ft, cf); + dump_header(ft); + } else if (strcmp(fields[0], "block") == 0 && nfields == 2) { + BLOCKNUM blocknum = make_blocknum(getuint64(fields[1])); + dump_block(fd, blocknum, ft); + } else if (strcmp(fields[0], "node") == 0 && nfields == 2) { + BLOCKNUM off = make_blocknum(getuint64(fields[1])); + dump_node(fd, off, ft); + } else if (strcmp(fields[0], "dumpdata") == 0 && nfields == 2) { + do_dump_data = strtol(fields[1], NULL, 10); + } else if (strcmp(fields[0], "block_translation") == 0 || strcmp(fields[0], "bx") == 0) { + uint64_t offset = 0; + if (nfields == 2) + offset = getuint64(fields[1]); + dump_block_translation(ft, offset); + } else if (strcmp(fields[0], "fragmentation") == 0) { + dump_fragmentation(fd, ft, do_tsv); + } else if (strcmp(fields[0], "nodesizes") == 0) { + 
dump_nodesizes(fd, ft); + } else if (strcmp(fields[0], "garbage") == 0) { + dump_garbage_stats(fd, ft); + } else if (strcmp(fields[0], "file") == 0 && nfields >= 3) { + uint64_t offset = getuint64(fields[1]); + uint64_t size = getuint64(fields[2]); + FILE *outfp = stdout; + if (nfields >= 4) + outfp = fopen(fields[3], "w"); + dump_file(fd, offset, size, outfp); + } else if (strcmp(fields[0], "setfile") == 0 && nfields == 3) { + uint64_t offset = getuint64(fields[1]); + unsigned char newc = getuint64(fields[2]); + set_file(fd, offset, newc); + } else if (strcmp(fields[0], "quit") == 0 || strcmp(fields[0], "q") == 0) { + break; + } + } } -int -main (int argc, const char *const argv[]) { - int interactive = 0; - int fragmentation = 0; - int translation_table = 0; - int rootnode = 0; - int tsv = 0; +static int usage(void) { + fprintf(stderr, "Usage: %s ", arg0); + fprintf(stderr, "--interactive "); + fprintf(stderr, "--nodata "); + fprintf(stderr, "--dumpdata 0|1 "); + fprintf(stderr, "--header "); + fprintf(stderr, "--rootnode "); + fprintf(stderr, "--fragmentation "); + fprintf(stderr, "--garbage "); + fprintf(stderr, "--tsv "); + fprintf(stderr, "--translation-table "); + fprintf(stderr, "--tsv "); + fprintf(stderr, "ftfilename \n"); + return 1; +} - const char *arg0 = argv[0]; +int main (int argc, const char *const argv[]) { + arg0 = argv[0]; argc--; argv++; while (argc>0) { - if (strcmp(argv[0], "--nodata") == 0) { - dump_data = 0; - } else if (strcmp(argv[0], "--interactive") == 0 || strcmp(argv[0], "--i") == 0) { - interactive = 1; - } else if (strcmp(argv[0], "--fragmentation") == 0) { - fragmentation = 1; - } else if (strcmp(argv[0], "--tsv") == 0) { - tsv = 1; - } else if (strcmp(argv[0], "--translation-table") == 0) { - translation_table = 1; + if (strcmp(argv[0], "--interactive") == 0 || strcmp(argv[0], "--i") == 0) { + do_interactive = 1; + } else if (strcmp(argv[0], "--nodata") == 0) { + do_dump_data = 0; + } else if (strcmp(argv[0], "--dumpdata") == 0 && 
argc > 1) { + argc--; argv++; + do_dump_data = atoi(argv[0]); + } else if (strcmp(argv[0], "--header") == 0) { + do_header = 1; } else if (strcmp(argv[0], "--rootnode") == 0) { - rootnode = 1; - } else if (strcmp(argv[0], "--help") == 0) { - return usage(arg0); + do_rootnode = 1; + } else if (strcmp(argv[0], "--fragmentation") == 0) { + do_fragmentation = 1; + } else if (strcmp(argv[0], "--garbage") == 0) { + do_garbage = 1; + } else if (strcmp(argv[0], "--tsv") == 0) { + do_tsv = 1; + } else if (strcmp(argv[0], "--translation-table") == 0) { + do_translation_table = 1; + } else if (strcmp(argv[0], "--help") == 0 || strcmp(argv[0], "-?") == 0 || strcmp(argv[0], "-h") == 0) { + return usage(); } else { break; } argc--; argv++; } - if (argc != 1) return usage(arg0); + if (argc != 1) + return usage(); int r = toku_ft_layer_init(); - invariant_zero(r); + assert_zero(r); - const char *n = argv[0]; - int f = open(n, O_RDWR + O_BINARY); assert(f>=0); - FT ft; - // create a cachefile for the header - toku_cachetable_create(&ct, 1<<25, (LSN){0}, 0); - CACHEFILE cf = NULL; - r = toku_cachetable_openfd (&cf, ct, f, n); - assert(r==0); - open_header(f, &ft, cf); - if (!fragmentation && !translation_table) { - // quick fix for now, we want those two to have clean output - dump_header(ft); + fname = argv[0]; + int fd = open(fname, O_RDWR + O_BINARY); + if (fd < 0) { + fprintf(stderr, "%s: can not open %s errno %d\n", arg0, fname, errno); + return 1; } - if (interactive) { - while (1) { - printf("ftdump>"); fflush(stdout); - enum { maxline = 64}; - char line[maxline+1]; - r = readline(line, maxline); - if (r == EOF) - break; - const int maxfields = 4; - char *fields[maxfields]; - int nfields = split_fields(line, fields, maxfields); - if (nfields == 0) - continue; - if (strcmp(fields[0], "help") == 0) { - interactive_help(); - } else if (strcmp(fields[0], "header") == 0) { - toku_ft_free(ft); - open_header(f, &ft, cf); - dump_header(ft); - } else if (strcmp(fields[0], "block") == 
0 && nfields == 2) { - BLOCKNUM blocknum = make_blocknum(getuint64(fields[1])); - dump_block(f, blocknum, ft); - } else if (strcmp(fields[0], "node") == 0 && nfields == 2) { - BLOCKNUM off = make_blocknum(getuint64(fields[1])); - dump_node(f, off, ft); - } else if (strcmp(fields[0], "dumpdata") == 0 && nfields == 2) { - dump_data = strtol(fields[1], NULL, 10); - } else if (strcmp(fields[0], "block_translation") == 0 || strcmp(fields[0], "bx") == 0) { - uint64_t offset = 0; - if (nfields == 2) - offset = getuint64(fields[1]); - dump_block_translation(ft, offset); - } else if (strcmp(fields[0], "fragmentation") == 0) { - dump_fragmentation(f, ft, tsv); - } else if (strcmp(fields[0], "nodesizes") == 0) { - dump_nodesizes(f, ft); - } else if (strcmp(fields[0], "garbage") == 0) { - dump_garbage_stats(f, ft); - } else if (strcmp(fields[0], "file") == 0 && nfields >= 3) { - uint64_t offset = getuint64(fields[1]); - uint64_t size = getuint64(fields[2]); - FILE *outfp = stdout; - if (nfields >= 4) - outfp = fopen(fields[3], "w"); - dump_file(f, offset, size, outfp); - } else if (strcmp(fields[0], "setfile") == 0 && nfields == 3) { - uint64_t offset = getuint64(fields[1]); - unsigned char newc = getuint64(fields[2]); - set_file(f, offset, newc); - } else if (strcmp(fields[0], "quit") == 0 || strcmp(fields[0], "q") == 0) { - break; - } - } - } else if (rootnode) { - dump_node(f, ft->h->root_blocknum, ft); - } else if (fragmentation) { - dump_fragmentation(f, ft, tsv); - } else if (translation_table) { - toku_dump_translation_table_pretty(stdout, ft->blocktable); + + // create a cachefile for the header + CACHETABLE ct = NULL; + toku_cachetable_create(&ct, 1<<25, (LSN){0}, 0); + + CACHEFILE cf = NULL; + r = toku_cachetable_openfd (&cf, ct, fd, fname); + assert_zero(r); + + FT ft = NULL; + open_header(fd, &ft, cf); + + if (do_interactive) { + run_iteractive_loop(fd, ft, cf); } else { - printf("Block translation:"); - - toku_dump_translation_table(stdout, ft->blocktable); - - 
struct __dump_node_extra info; - info.f = f; - info.h = ft; - toku_blocktable_iterate(ft->blocktable, TRANSLATION_CHECKPOINTED, - dump_node_wrapper, &info, true, true); + if (do_header) { + dump_header(ft); + } + if (do_rootnode) { + dump_node(fd, ft->h->root_blocknum, ft); + } + if (do_fragmentation) { + dump_fragmentation(fd, ft, do_tsv); + } + if (do_translation_table) { + toku_dump_translation_table_pretty(stdout, ft->blocktable); + } + if (do_garbage) { + dump_garbage_stats(fd, ft); + } + if (!do_header && !do_rootnode && !do_fragmentation && !do_translation_table && !do_garbage) { + printf("Block translation:"); + + toku_dump_translation_table(stdout, ft->blocktable); + + struct __dump_node_extra info; + info.fd = fd; + info.h = ft; + toku_blocktable_iterate(ft->blocktable, TRANSLATION_CHECKPOINTED, + dump_node_wrapper, &info, true, true); + } } toku_cachefile_close(&cf, false, ZERO_LSN); toku_cachetable_close(&ct); diff --git a/storage/tokudb/ft-index/scripts/run.stress-tests.py b/storage/tokudb/ft-index/scripts/run.stress-tests.py index fbbf5ee6472..d4245a7c4b4 100755 --- a/storage/tokudb/ft-index/scripts/run.stress-tests.py +++ b/storage/tokudb/ft-index/scripts/run.stress-tests.py @@ -735,6 +735,7 @@ if __name__ == '__main__': 'test_stress6.tdb', 'test_stress7.tdb', 'test_stress_hot_indexing.tdb', + 'test_stress_with_verify.tdb', 'test_stress_openclose.tdb'] default_recover_testnames = ['recover-test_stress1.tdb', 'recover-test_stress2.tdb', @@ -766,8 +767,8 @@ if __name__ == '__main__': help="skip the tests that don't involve upgrade [default=False]") upgrade_group.add_option('--double_upgrade', action='store_true', dest='double_upgrade', default=False, help='run the upgrade tests twice in a row [default=False]') - upgrade_group.add_option('--add_old_version', action='append', type='choice', dest='old_versions', choices=['4.2.0', '5.0.8', '5.2.7', '6.0.0', '6.1.0', '6.5.1', '6.6.3'], - help='which old versions to use for running the stress tests in 
upgrade mode. can be specified multiple times [options=4.2.0, 5.0.8, 5.2.7, 6.0.0, 6.1.0, 6.5.1, 6.6.3]') + upgrade_group.add_option('--add_old_version', action='append', type='choice', dest='old_versions', choices=['4.2.0', '5.0.8', '5.2.7', '6.0.0', '6.1.0', '6.5.1', '6.6.3', '7.1.6'], + help='which old versions to use for running the stress tests in upgrade mode. can be specified multiple times [options=4.2.0, 5.0.8, 5.2.7, 6.0.0, 6.1.0, 6.5.1, 6.6.3, 7.1.6]') upgrade_group.add_option('--old_environments_dir', type='string', dest='old_environments_dir', default=('%s/old-stress-test-envs' % default_tokudb_data), help='directory containing old version environments (should contain 5.0.8/, 5.2.7/, etc, and the environments should be in those) [default=../../tokudb.data/stress_environments]') diff --git a/storage/tokudb/ft-index/src/loader.cc b/storage/tokudb/ft-index/src/loader.cc index 88db258e1ff..62b4f0b6cef 100644 --- a/storage/tokudb/ft-index/src/loader.cc +++ b/storage/tokudb/ft-index/src/loader.cc @@ -172,6 +172,13 @@ struct __toku_loader_internal { char **inames_in_env; /* [N] inames of new files to be created */ }; +static void free_inames(char **inames, int n) { + for (int i = 0; i < n; i++) { + toku_free(inames[i]); + } + toku_free(inames); +} + /* * free_loader_resources() frees all of the resources associated with * struct __toku_loader_internal @@ -185,16 +192,15 @@ static void free_loader_resources(DB_LOADER *loader) toku_destroy_dbt(&loader->i->err_val); if (loader->i->inames_in_env) { - for (int i=0; ii->N; i++) { - if (loader->i->inames_in_env[i]) toku_free(loader->i->inames_in_env[i]); - } - toku_free(loader->i->inames_in_env); + free_inames(loader->i->inames_in_env, loader->i->N); + loader->i->inames_in_env = nullptr; } - if (loader->i->temp_file_template) toku_free(loader->i->temp_file_template); + toku_free(loader->i->temp_file_template); + loader->i->temp_file_template = nullptr; // loader->i toku_free(loader->i); - loader->i = NULL; + 
loader->i = nullptr; } } @@ -245,6 +251,7 @@ toku_loader_create_loader(DB_ENV *env, bool check_empty) { int rval; HANDLE_READ_ONLY_TXN(txn); + DB_TXN *loader_txn = nullptr; *blp = NULL; // set later when created @@ -299,6 +306,13 @@ toku_loader_create_loader(DB_ENV *env, } { + if (env->i->open_flags & DB_INIT_TXN) { + rval = env->txn_begin(env, txn, &loader_txn, 0); + if (rval) { + goto create_exit; + } + } + ft_compare_func compare_functions[N]; for (int i=0; ii->bt_compare; @@ -306,18 +320,21 @@ toku_loader_create_loader(DB_ENV *env, // time to open the big kahuna char **XMALLOC_N(N, new_inames_in_env); + for (int i = 0; i < N; i++) { + new_inames_in_env[i] = nullptr; + } FT_HANDLE *XMALLOC_N(N, fts); for (int i=0; ii->ft_handle; } LSN load_lsn; - rval = locked_load_inames(env, txn, N, dbs, new_inames_in_env, &load_lsn, puts_allowed); + rval = locked_load_inames(env, loader_txn, N, dbs, new_inames_in_env, &load_lsn, puts_allowed); if ( rval!=0 ) { - toku_free(new_inames_in_env); + free_inames(new_inames_in_env, N); toku_free(fts); goto create_exit; } - TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL; + TOKUTXN ttxn = loader_txn ? 
db_txn_struct_i(loader_txn)->tokutxn : NULL; rval = toku_ft_loader_open(&loader->i->ft_loader, env->i->cachetable, env->i->generate_row_for_put, @@ -331,12 +348,14 @@ toku_loader_create_loader(DB_ENV *env, ttxn, puts_allowed, env->get_loader_memory_size(env), - compress_intermediates); + compress_intermediates, + puts_allowed); if ( rval!=0 ) { - toku_free(new_inames_in_env); + free_inames(new_inames_in_env, N); toku_free(fts); goto create_exit; } + loader->i->inames_in_env = new_inames_in_env; toku_free(fts); @@ -348,10 +367,19 @@ toku_loader_create_loader(DB_ENV *env, rval = 0; } + rval = loader_txn->commit(loader_txn, 0); + assert_zero(rval); + loader_txn = nullptr; + rval = 0; } *blp = loader; create_exit: + if (loader_txn) { + int r = loader_txn->abort(loader_txn); + assert_zero(r); + loader_txn = nullptr; + } if (rval == 0) { (void) toku_sync_fetch_and_add(&STATUS_VALUE(LOADER_CREATE), 1); (void) toku_sync_fetch_and_add(&STATUS_VALUE(LOADER_CURRENT), 1); @@ -441,7 +469,7 @@ static void redirect_loader_to_empty_dictionaries(DB_LOADER *loader) { loader->i->dbs, loader->i->db_flags, loader->i->dbt_flags, - 0, + LOADER_DISALLOW_PUTS, false ); lazy_assert_zero(r); diff --git a/storage/tokudb/ft-index/src/tests/dbremove-nofile-limit.cc b/storage/tokudb/ft-index/src/tests/dbremove-nofile-limit.cc new file mode 100644 index 00000000000..eb5c6b80b63 --- /dev/null +++ b/storage/tokudb/ft-index/src/tests/dbremove-nofile-limit.cc @@ -0,0 +1,177 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), 
the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). + + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2013 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. 
+ + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. + + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved." +#ident "$Id$" + +// This test verifies that the env->dbremove function returns an error rather than +// crash when the NOFILE resource limit is exceeded. 
+ +#include "test.h" +#include +#include + +static const char *envdir = TOKU_TEST_FILENAME; + +static void test_dbremove() { + int r; + + char rmcmd[32 + strlen(envdir)]; + snprintf(rmcmd, sizeof rmcmd, "rm -rf %s", envdir); + r = system(rmcmd); CKERR(r); + r = toku_os_mkdir(envdir, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + + DB_ENV *env; + r = db_env_create(&env, 0); CKERR(r); + int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; + r = env->open(env, envdir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + + DB *db; + r = db_create(&db, env, 0); CKERR(r); + char fname[32]; + sprintf(fname, "db%d", 0); + r = db->open(db, nullptr, fname, nullptr, DB_BTREE, DB_CREATE, 0666); CKERR(r); + + r = db->close(db, 0); CKERR(r); + + DB_TXN *txn; + r = env->txn_begin(env, nullptr, &txn, 0); CKERR(r); + + struct rlimit current_limit; + r = getrlimit(RLIMIT_NOFILE, ¤t_limit); + assert(r == 0); + + struct rlimit new_limit = current_limit; + new_limit.rlim_cur = 0; + r = setrlimit(RLIMIT_NOFILE, &new_limit); + assert(r == 0); + + r = env->dbremove(env, txn, fname, nullptr, 0); + CKERR2(r, EMFILE); + + r = setrlimit(RLIMIT_NOFILE, ¤t_limit); + assert(r == 0); + + r = env->dbremove(env, txn, fname, nullptr, 0); + CKERR(r); + + r = txn->commit(txn, 0); CKERR(r); + + r = env->close(env, 0); CKERR(r); +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + while (argc>0) { + if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: %s -h -v -q\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} + +int test_main(int argc, char * const *argv) { + do_args(argc, argv); + test_dbremove(); + 
return 0; +} diff --git a/storage/tokudb/ft-index/src/tests/loader-close-nproc-limit.cc b/storage/tokudb/ft-index/src/tests/loader-close-nproc-limit.cc new file mode 100644 index 00000000000..3ef2b0541f7 --- /dev/null +++ b/storage/tokudb/ft-index/src/tests/loader-close-nproc-limit.cc @@ -0,0 +1,198 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). + + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2013 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. + + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. 
If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +// Verify that loader->close works correctly (does not crash, does not leak memory, returns the right error code) +// when the NPROC limit is exceeded. + +#ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved." +#ident "$Id$" + +#include "test.h" +#include +#include + +static int loader_flags = 0; +static const char *envdir = TOKU_TEST_FILENAME; + +static void run_test(int ndb) { + int r; + + char rmcmd[32 + strlen(envdir)]; + snprintf(rmcmd, sizeof rmcmd, "rm -rf %s", envdir); + r = system(rmcmd); CKERR(r); + r = toku_os_mkdir(envdir, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + + DB_ENV *env; + r = db_env_create(&env, 0); CKERR(r); + int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; + r = env->open(env, envdir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + + DB *dbs[ndb]; + uint32_t db_flags[ndb]; + uint32_t dbt_flags[ndb]; + for (int i = 0; i < ndb; i++) { + db_flags[i] = DB_NOOVERWRITE; + dbt_flags[i] = 0; + r = db_create(&dbs[i], env, 0); CKERR(r); + char name[32]; + sprintf(name, "db%d", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + + DB_TXN *txn; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + + DB_LOADER *loader; + r = env->create_loader(env, txn, &loader, ndb > 0 ? 
dbs[0] : NULL, ndb, dbs, db_flags, dbt_flags, loader_flags); CKERR(r); + + struct rlimit current_nproc_limit; + r = getrlimit(RLIMIT_NPROC, ¤t_nproc_limit); + assert(r == 0); + + struct rlimit new_nproc_limit = current_nproc_limit; + new_nproc_limit.rlim_cur = 0; + r = setrlimit(RLIMIT_NPROC, &new_nproc_limit); + assert(r == 0); + + r = loader->close(loader); + + if (loader_flags & LOADER_DISALLOW_PUTS) + CKERR(r); + else + CKERR2(r, EAGAIN); + + r = setrlimit(RLIMIT_NPROC, ¤t_nproc_limit); + assert(r == 0); + + r = txn->abort(txn); CKERR(r); + + for (int i = 0; i < ndb; i++) { + r = dbs[i]->close(dbs[i], 0); CKERR(r); + } + + r = env->close(env, 0); CKERR(r); +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + while (argc>0) { + if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: %s -h -v -q -p\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "-p") == 0) { + loader_flags |= LOADER_DISALLOW_PUTS; + } else if (strcmp(argv[0], "-z") == 0) { + loader_flags |= LOADER_COMPRESS_INTERMEDIATES; + } else if (strcmp(argv[0], "-e") == 0) { + argc--; argv++; + if (argc > 0) + envdir = argv[0]; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} + +int test_main(int argc, char * const *argv) { + do_args(argc, argv); + run_test(1); + return 0; +} diff --git a/storage/tokudb/ft-index/src/tests/loader-create-close.cc b/storage/tokudb/ft-index/src/tests/loader-create-close.cc index 6a04387152f..4d66a9df004 100644 --- a/storage/tokudb/ft-index/src/tests/loader-create-close.cc +++ b/storage/tokudb/ft-index/src/tests/loader-create-close.cc @@ -97,11 +97,7 @@ PATENT RIGHTS GRANT: static int loader_flags = 0; static const char *envdir = TOKU_TEST_FILENAME; -static int 
put_multiple_generate(DB *UU(dest_db), DB *UU(src_db), DBT_ARRAY *UU(dest_keys), DBT_ARRAY *UU(dest_vals), const DBT *UU(src_key), const DBT *UU(src_val)) { - return ENOMEM; -} - -static void loader_open_abort(int ndb) { +static void test_loader_create_close(int ndb) { int r; char rmcmd[32 + strlen(envdir)]; @@ -111,8 +107,6 @@ static void loader_open_abort(int ndb) { DB_ENV *env; r = db_env_create(&env, 0); CKERR(r); - r = env->set_generate_row_callback_for_put(env, put_multiple_generate); - CKERR(r); int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; r = env->open(env, envdir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); env->set_errfile(env, stderr); @@ -181,8 +175,8 @@ static void do_args(int argc, char * const argv[]) { int test_main(int argc, char * const *argv) { do_args(argc, argv); - loader_open_abort(0); - loader_open_abort(1); - loader_open_abort(2); + test_loader_create_close(0); + test_loader_create_close(1); + test_loader_create_close(2); return 0; } diff --git a/storage/tokudb/ft-index/src/tests/loader-create-commit-nproc-limit.cc b/storage/tokudb/ft-index/src/tests/loader-create-commit-nproc-limit.cc new file mode 100644 index 00000000000..091809a8551 --- /dev/null +++ b/storage/tokudb/ft-index/src/tests/loader-create-commit-nproc-limit.cc @@ -0,0 +1,211 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). 
+ + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2013 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. 
+ + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved." +#ident "$Id$" + +// This test crashes if a failed loader creation causes the db to be corrupted by unlinking +// the underlying fractal tree files. This unlinking occurs because the txn that logs the +// load log entries is committed rather than aborted. 
+ +#include "test.h" +#include +#include + +static int loader_flags = 0; +static const char *envdir = TOKU_TEST_FILENAME; + +static void run_test(int ndb) { + int r; + + char rmcmd[32 + strlen(envdir)]; + snprintf(rmcmd, sizeof rmcmd, "rm -rf %s", envdir); + r = system(rmcmd); CKERR(r); + r = toku_os_mkdir(envdir, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + + DB_ENV *env; + r = db_env_create(&env, 0); CKERR(r); + int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; + r = env->open(env, envdir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + + DB *dbs[ndb]; + uint32_t db_flags[ndb]; + uint32_t dbt_flags[ndb]; + for (int i = 0; i < ndb; i++) { + db_flags[i] = DB_NOOVERWRITE; + dbt_flags[i] = 0; + r = db_create(&dbs[i], env, 0); CKERR(r); + char name[32]; + sprintf(name, "db%d", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + + DB_TXN *txn; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + + struct rlimit current_nproc_limit; + r = getrlimit(RLIMIT_NPROC, ¤t_nproc_limit); + assert(r == 0); + + struct rlimit new_nproc_limit = current_nproc_limit; + new_nproc_limit.rlim_cur = 0; + r = setrlimit(RLIMIT_NPROC, &new_nproc_limit); + assert(r == 0); + + DB_LOADER *loader; + int loader_r = env->create_loader(env, txn, &loader, ndb > 0 ? 
dbs[0] : NULL, ndb, dbs, db_flags, dbt_flags, loader_flags); + + r = setrlimit(RLIMIT_NPROC, &current_nproc_limit); + assert(r == 0); + + if (loader_flags & LOADER_DISALLOW_PUTS) { + CKERR(loader_r); + loader_r = loader->close(loader); + CKERR(loader_r); + } else { + CKERR2(loader_r, EAGAIN); + } + + r = txn->commit(txn, 0); CKERR(r); + + for (int i = 0; i < ndb; i++) { + r = dbs[i]->close(dbs[i], 0); CKERR(r); + } + + for (int i = 0; i < ndb; i++) { + r = db_create(&dbs[i], env, 0); CKERR(r); + char name[32]; + sprintf(name, "db%d", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, 0, 0666); CKERR(r); + } + + for (int i = 0; i < ndb; i++) { + r = dbs[i]->close(dbs[i], 0); CKERR(r); + } + + r = env->close(env, 0); CKERR(r); +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + while (argc>0) { + if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: %s -h -v -q -p\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "-p") == 0) { + loader_flags |= LOADER_DISALLOW_PUTS; + } else if (strcmp(argv[0], "-z") == 0) { + loader_flags |= LOADER_COMPRESS_INTERMEDIATES; + } else if (strcmp(argv[0], "-e") == 0) { + argc--; argv++; + if (argc > 0) + envdir = argv[0]; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} + +int test_main(int argc, char * const *argv) { + do_args(argc, argv); + run_test(1); + return 0; +} diff --git a/storage/tokudb/ft-index/src/tests/loader-create-nproc-limit.cc b/storage/tokudb/ft-index/src/tests/loader-create-nproc-limit.cc new file mode 100644 index 00000000000..7a61fce7799 --- /dev/null +++ b/storage/tokudb/ft-index/src/tests/loader-create-nproc-limit.cc @@ -0,0 +1,199 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*-
*/ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). + + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2013 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. 
+ +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. + + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +// Verify that env->create_loader works correctly (does not crash, does not leak memory, returns the right error code) +// when the NPROC limit is exceeded. + +#ident "Copyright (c) 2010-2013 Tokutek Inc. All rights reserved." 
+#ident "$Id$" + +#include "test.h" +#include +#include + +static int loader_flags = 0; +static const char *envdir = TOKU_TEST_FILENAME; + +static void run_test(int ndb) { + int r; + + char rmcmd[32 + strlen(envdir)]; + snprintf(rmcmd, sizeof rmcmd, "rm -rf %s", envdir); + r = system(rmcmd); CKERR(r); + r = toku_os_mkdir(envdir, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + + DB_ENV *env; + r = db_env_create(&env, 0); CKERR(r); + int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; + r = env->open(env, envdir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + + DB *dbs[ndb]; + uint32_t db_flags[ndb]; + uint32_t dbt_flags[ndb]; + for (int i = 0; i < ndb; i++) { + db_flags[i] = DB_NOOVERWRITE; + dbt_flags[i] = 0; + r = db_create(&dbs[i], env, 0); CKERR(r); + char name[32]; + sprintf(name, "db%d", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + + DB_TXN *txn; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + + struct rlimit current_nproc_limit; + r = getrlimit(RLIMIT_NPROC, &current_nproc_limit); + assert(r == 0); + + struct rlimit new_nproc_limit = current_nproc_limit; + new_nproc_limit.rlim_cur = 0; + r = setrlimit(RLIMIT_NPROC, &new_nproc_limit); + assert(r == 0); + + DB_LOADER *loader; + int loader_r = env->create_loader(env, txn, &loader, ndb > 0 ?
dbs[0] : NULL, ndb, dbs, db_flags, dbt_flags, loader_flags); + + r = setrlimit(RLIMIT_NPROC, &current_nproc_limit); + assert(r == 0); + + if (loader_flags & LOADER_DISALLOW_PUTS) { + CKERR(loader_r); + loader_r = loader->close(loader); + CKERR(loader_r); + } else { + CKERR2(loader_r, EAGAIN); + } + + r = txn->abort(txn); CKERR(r); + + for (int i = 0; i < ndb; i++) { + r = dbs[i]->close(dbs[i], 0); CKERR(r); + } + + r = env->close(env, 0); CKERR(r); +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + while (argc>0) { + if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: %s -h -v -q -p\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "-p") == 0) { + loader_flags |= LOADER_DISALLOW_PUTS; + } else if (strcmp(argv[0], "-z") == 0) { + loader_flags |= LOADER_COMPRESS_INTERMEDIATES; + } else if (strcmp(argv[0], "-e") == 0) { + argc--; argv++; + if (argc > 0) + envdir = argv[0]; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} + +int test_main(int argc, char * const *argv) { + do_args(argc, argv); + run_test(1); + return 0; +} diff --git a/storage/tokudb/ft-index/src/tests/test_insert_unique.cc b/storage/tokudb/ft-index/src/tests/test_insert_unique.cc new file mode 100644 index 00000000000..29439f9d704 --- /dev/null +++ b/storage/tokudb/ft-index/src/tests/test_insert_unique.cc @@ -0,0 +1,202 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the
+ following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). + + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2013 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. 
+ + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. + + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. +*/ + +#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved." +#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." 
+/** + * Test that unique inserts work correctly. This exercises the rightmost leaf inject optimization. + */ + +#include + +#include "test.h" + +static char random_buf[8]; +static struct random_data random_data; + +static void test_simple_unique_insert(DB_ENV *env) { + int r; + DB *db; + r = db_create(&db, env, 0); CKERR(r); + r = db->open(db, NULL, "db", NULL, DB_BTREE, DB_CREATE, 0644); CKERR(r); + + DBT key1, key2, key3; + dbt_init(&key1, "a", sizeof("a")); + dbt_init(&key2, "b", sizeof("b")); + dbt_init(&key3, "c", sizeof("c")); + r = db->put(db, NULL, &key1, &key1, DB_NOOVERWRITE); CKERR(r); + r = db->put(db, NULL, &key1, &key1, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + r = db->put(db, NULL, &key3, &key3, DB_NOOVERWRITE); CKERR(r); + r = db->put(db, NULL, &key3, &key3, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + r = db->put(db, NULL, &key2, &key2, DB_NOOVERWRITE); CKERR(r); + r = db->put(db, NULL, &key2, &key2, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + // sanity check + r = db->put(db, NULL, &key1, &key1, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + r = db->put(db, NULL, &key1, &key3, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + + r = db->close(db, 0); CKERR(r); + r = env->dbremove(env, NULL, "db", NULL, 0); CKERR(r); +} + +static void test_large_sequential_insert_unique(DB_ENV *env) { + int r; + DB *db; + r = db_create(&db, env, 0); CKERR(r); + + // very small nodes/basements to make a taller tree + r = db->set_pagesize(db, 8 * 1024); CKERR(r); + r = db->set_readpagesize(db, 2 * 1024); CKERR(r); + r = db->open(db, NULL, "db", NULL, DB_BTREE, DB_CREATE, 0644); CKERR(r); + + const int val_size = 1024; + char *XMALLOC_N(val_size, val_buf); + memset(val_buf, 'k', val_size); + DBT val; + dbt_init(&val, val_buf, val_size); + + // grow a tree to about depth 3, taking sanity checks along the way + const int start_num_rows = (64 * 1024 * 1024) / val_size; + for (int i = 0; i < start_num_rows; i++) { + DBT key; + int k = toku_htonl(i); + dbt_init(&key, &k, sizeof(k)); + 
r = db->put(db, NULL, &key, &val, DB_NOOVERWRITE); CKERR(r); + if (i % 50 == 0) { + // sanity check - should not be able to insert this key twice in a row + r = db->put(db, NULL, &key, &val, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + + // .. but re-inserting is okay, if we provisionally deleted the row + DB_TXN *txn; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + r = db->del(db, NULL, &key, DB_DELETE_ANY); CKERR(r); + r = db->put(db, NULL, &key, &val, DB_NOOVERWRITE); CKERR(r); + r = txn->commit(txn, 0); CKERR(r); + } + if (i > 0 && i % 250 == 0) { + // sanity check - unique checks on random keys we already inserted should + // fail (exercises middle-of-the-tree checks) + for (int check_i = 0; check_i < 4; check_i++) { + DBT rand_key; + int rand_k = toku_htonl(myrandom_r(&random_data) % i); + dbt_init(&rand_key, &rand_k, sizeof(rand_k)); + r = db->put(db, NULL, &rand_key, &val, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST); + } + } + } + + toku_free(val_buf); + r = db->close(db, 0); CKERR(r); + r = env->dbremove(env, NULL, "db", NULL, 0); CKERR(r); +} + + +int test_main(int argc, char * const argv[]) { + default_parse_args(argc, argv); + + int r; + const int envflags = DB_INIT_MPOOL | DB_CREATE | DB_THREAD | + DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_TXN | DB_PRIVATE; + + // startup + DB_ENV *env; + toku_os_recursive_delete(TOKU_TEST_FILENAME); + r = toku_os_mkdir(TOKU_TEST_FILENAME, 0755); CKERR(r); + r = db_env_create(&env, 0); CKERR(r); + r = env->open(env, TOKU_TEST_FILENAME, envflags, 0755); + + r = myinitstate_r(random(), random_buf, 8, &random_data); CKERR(r); + + test_simple_unique_insert(env); + test_large_sequential_insert_unique(env); + + // cleanup + r = env->close(env, 0); CKERR(r); + + return 0; +} + diff --git a/storage/tokudb/ft-index/src/ydb.cc b/storage/tokudb/ft-index/src/ydb.cc index a2bb221a40b..df4fd6baf87 100644 --- a/storage/tokudb/ft-index/src/ydb.cc +++ b/storage/tokudb/ft-index/src/ydb.cc @@ -1160,6 +1160,7 @@ env_close(DB_ENV * env, 
uint32_t flags) { goto panic_and_quit_early; } } + env_fsync_log_cron_destroy(env); if (env->i->cachetable) { toku_cachetable_minicron_shutdown(env->i->cachetable); if (env->i->logger) { @@ -1200,7 +1201,6 @@ env_close(DB_ENV * env, uint32_t flags) { } env_fs_destroy(env); - env_fsync_log_cron_destroy(env); env->i->ltm.destroy(); if (env->i->data_dir) toku_free(env->i->data_dir); @@ -2901,7 +2901,13 @@ env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, u r = toku_db_create(&db, env, 0); lazy_assert_zero(r); r = toku_db_open_iname(db, txn, iname, 0, 0); - lazy_assert_zero(r); + if (txn && r) { + if (r == EMFILE || r == ENFILE) + r = toku_ydb_do_error(env, r, "toku dbremove failed because open file limit reached\n"); + else + r = toku_ydb_do_error(env, r, "toku dbremove failed\n"); + goto exit; + } if (txn) { // Now that we have a writelock on dname, verify that there are still no handles open. (to prevent race conditions) if (env_is_db_with_dname_open(env, dname)) { diff --git a/storage/tokudb/ft-index/src/ydb_db.cc b/storage/tokudb/ft-index/src/ydb_db.cc index 78e08705ac6..b9fa32eb4a0 100644 --- a/storage/tokudb/ft-index/src/ydb_db.cc +++ b/storage/tokudb/ft-index/src/ydb_db.cc @@ -1221,36 +1221,14 @@ load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[/*N*/], const char * new int locked_load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[/*N*/], char * new_inames_in_env[/*N*/], LSN *load_lsn, bool mark_as_loader) { - int ret, r; + int r; HANDLE_READ_ONLY_TXN(txn); - DB_TXN *child_txn = NULL; - int using_txns = env->i->open_flags & DB_INIT_TXN; - if (using_txns) { - ret = toku_txn_begin(env, txn, &child_txn, 0); - invariant_zero(ret); - } - // cannot begin a checkpoint toku_multi_operation_client_lock(); - r = load_inames(env, child_txn, N, dbs, (const char **) new_inames_in_env, load_lsn, mark_as_loader); + r = load_inames(env, txn, N, dbs, (const char **) new_inames_in_env, load_lsn, mark_as_loader); 
toku_multi_operation_client_unlock(); - if (using_txns) { - if (r == 0) { - ret = locked_txn_commit(child_txn, DB_TXN_NOSYNC); - invariant_zero(ret); - } else { - ret = locked_txn_abort(child_txn); - invariant_zero(ret); - for (int i = 0; i < N; i++) { - if (new_inames_in_env[i]) { - toku_free(new_inames_in_env[i]); - new_inames_in_env[i] = NULL; - } - } - } - } return r; } diff --git a/storage/tokudb/ft-index/src/ydb_write.cc b/storage/tokudb/ft-index/src/ydb_write.cc index 4826e418ab5..82fbf439885 100644 --- a/storage/tokudb/ft-index/src/ydb_write.cc +++ b/storage/tokudb/ft-index/src/ydb_write.cc @@ -253,6 +253,30 @@ toku_db_del(DB *db, DB_TXN *txn, DBT *key, uint32_t flags, bool holds_mo_lock) { return r; } +static int +db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, int flags, bool do_log) { + int r = 0; + bool unique = false; + enum ft_msg_type type = FT_INSERT; + if (flags == DB_NOOVERWRITE) { + unique = true; + } else if (flags == DB_NOOVERWRITE_NO_ERROR) { + type = FT_INSERT_NO_OVERWRITE; + } else if (flags != 0) { + // All other non-zero flags are unsupported + r = EINVAL; + } + if (r == 0) { + TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : nullptr; + if (unique) { + r = toku_ft_insert_unique(db->i->ft_handle, key, val, ttxn, do_log); + } else { + toku_ft_maybe_insert(db->i->ft_handle, key, val, ttxn, false, ZERO_LSN, do_log, type); + } + invariant(r == DB_KEYEXIST || r == 0); + } + return r; +} int toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_mo_lock) { @@ -265,25 +289,16 @@ toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_ flags &= ~lock_flags; r = db_put_check_size_constraints(db, key, val); - if (r == 0) { - //Do any checking required by the flags. - r = db_put_check_overwrite_constraint(db, txn, key, lock_flags, flags); - } - //Do locking if necessary. 
Do not grab the lock again if this DB had a unique - //check performed because the lock was already grabbed by its cursor callback. + + //Do locking if necessary. bool do_locking = (bool)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE)); - if (r == 0 && do_locking && !(flags & DB_NOOVERWRITE)) { + if (r == 0 && do_locking) { r = toku_db_get_point_write_lock(db, txn, key); } if (r == 0) { //Insert into the ft. - TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL; - enum ft_msg_type type = FT_INSERT; - if (flags==DB_NOOVERWRITE_NO_ERROR) { - type = FT_INSERT_NO_OVERWRITE; - } if (!holds_mo_lock) toku_multi_operation_client_lock(); - toku_ft_maybe_insert(db->i->ft_handle, key, val, ttxn, false, ZERO_LSN, true, type); + r = db_put(db, txn, key, val, flags, true); if (!holds_mo_lock) toku_multi_operation_client_unlock(); } @@ -635,9 +650,11 @@ log_put_multiple(DB_TXN *txn, DB *src_db, const DBT *src_key, const DBT *src_val } } +// Requires: If remaining_flags is non-null, this function performs any required uniqueness checks +// Otherwise, the caller is responsible. static int -do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[], DBT_ARRAY vals[], DB *src_db, const DBT *src_key, bool indexer_shortcut) { - TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; +do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[], DBT_ARRAY vals[], uint32_t *remaining_flags, DB *src_db, const DBT *src_key, bool indexer_shortcut) { + int r = 0; for (uint32_t which_db = 0; which_db < num_dbs; which_db++) { DB *db = db_array[which_db]; @@ -666,16 +683,21 @@ do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[], } if (do_put) { for (uint32_t i = 0; i < keys[which_db].size; i++) { - // if db is being indexed by an indexer, then put into that db if the src key is to the left or equal to the - // indexers cursor. we have to get the src_db from the indexer and find it in the db_array. 
- toku_ft_maybe_insert(db->i->ft_handle, - &keys[which_db].dbts[i], &vals[which_db].dbts[i], - ttxn, false, ZERO_LSN, false, FT_INSERT); + int flags = 0; + if (remaining_flags != nullptr) { + flags = remaining_flags[which_db]; + invariant(!(flags & DB_NOOVERWRITE_NO_ERROR)); + } + r = db_put(db, txn, &keys[which_db].dbts[i], &vals[which_db].dbts[i], flags, false); + if (r != 0) { + goto done; + } } } } } - return 0; +done: + return r; } static int @@ -754,20 +776,14 @@ env_put_multiple_internal( r = db_put_check_size_constraints(db, &put_key, &put_val); if (r != 0) goto cleanup; - //Check overwrite constraints - r = db_put_check_overwrite_constraint(db, txn, - &put_key, - lock_flags[which_db], remaining_flags[which_db]); - if (r != 0) goto cleanup; if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) { //put_multiple does not support delaying the no error, since we would //have to log the flag in the put_multiple. r = EINVAL; goto cleanup; } - //Do locking if necessary. Do not grab the lock again if this DB had a unique - //check performed because the lock was already grabbed by its cursor callback. - if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE) && !(remaining_flags[which_db] & DB_NOOVERWRITE)) { + //Do locking if necessary. 
+ if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) { //Needs locking r = toku_db_get_point_write_lock(db, txn, &put_key); if (r != 0) goto cleanup; @@ -790,8 +806,10 @@ env_put_multiple_internal( } } toku_multi_operation_client_lock(); - log_put_multiple(txn, src_db, src_key, src_val, num_dbs, fts); - r = do_put_multiple(txn, num_dbs, db_array, put_keys, put_vals, src_db, src_key, indexer_shortcut); + r = do_put_multiple(txn, num_dbs, db_array, put_keys, put_vals, remaining_flags, src_db, src_key, indexer_shortcut); + if (r == 0) { + log_put_multiple(txn, src_db, src_key, src_val, num_dbs, fts); + } toku_multi_operation_client_unlock(); if (indexer_lock_taken) { toku_indexer_unlock(indexer); @@ -1075,7 +1093,7 @@ env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, // recovery so we don't end up losing data. // So unlike env->put_multiple, we ONLY log a 'put_multiple' log entry. log_put_multiple(txn, src_db, new_src_key, new_src_data, n_put_dbs, put_fts); - r = do_put_multiple(txn, n_put_dbs, put_dbs, put_key_arrays, put_val_arrays, src_db, new_src_key, indexer_shortcut); + r = do_put_multiple(txn, n_put_dbs, put_dbs, put_key_arrays, put_val_arrays, nullptr, src_db, new_src_key, indexer_shortcut); } toku_multi_operation_client_unlock(); if (indexer_lock_taken) { diff --git a/storage/tokudb/ft-index/util/omt.cc b/storage/tokudb/ft-index/util/omt.cc index 92cda38aefe..709c7eab4c3 100644 --- a/storage/tokudb/ft-index/util/omt.cc +++ b/storage/tokudb/ft-index/util/omt.cc @@ -207,6 +207,9 @@ void omt::clone(const omt &src) { src.fill_array_with_subtree_values(&this->d.a.values[0], src.d.t.root); } this->d.a.num_values = src.size(); + if (supports_marks) { + this->convert_to_tree(); + } } template diff --git a/storage/tokudb/ft-index/util/tests/threadpool-nproc-limit.cc b/storage/tokudb/ft-index/util/tests/threadpool-nproc-limit.cc new file mode 100644 index 00000000000..f1ba10dad84 --- /dev/null +++ 
b/storage/tokudb/ft-index/util/tests/threadpool-nproc-limit.cc @@ -0,0 +1,171 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/* +COPYING CONDITIONS NOTICE: + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation, and provided that the + following conditions are met: + + * Redistributions of source code must retain this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below). + + * Redistributions in binary form must reproduce this COPYING + CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the + DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the + PATENT MARKING NOTICE (below), and the PATENT RIGHTS + GRANT (below) in the documentation and/or other materials + provided with the distribution. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +COPYRIGHT NOTICE: + + TokuDB, Tokutek Fractal Tree Indexing Library. + Copyright (C) 2007-2013 Tokutek, Inc. + +DISCLAIMER: + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + +UNIVERSITY PATENT NOTICE: + + The technology is licensed by the Massachusetts Institute of + Technology, Rutgers State University of New Jersey, and the Research + Foundation of State University of New York at Stony Brook under + United States of America Serial No. 
11/760379 and to the patents + and/or patent applications resulting from it. + +PATENT MARKING NOTICE: + + This software is covered by US Patent No. 8,185,551. + This software is covered by US Patent No. 8,489,638. + +PATENT RIGHTS GRANT: + + "THIS IMPLEMENTATION" means the copyrightable works distributed by + Tokutek as part of the Fractal Tree project. + + "PATENT CLAIMS" means the claims of patents that are owned or + licensable by Tokutek, both currently or in the future; and that in + the absence of this license would be infringed by THIS + IMPLEMENTATION or by using or running THIS IMPLEMENTATION. + + "PATENT CHALLENGE" shall mean a challenge to the validity, + patentability, enforceability and/or non-infringement of any of the + PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS. + + Tokutek hereby grants to you, for the term and geographical scope of + the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free, + irrevocable (except as stated in this section) patent license to + make, have made, use, offer to sell, sell, import, transfer, and + otherwise run, modify, and propagate the contents of THIS + IMPLEMENTATION, where such license applies only to the PATENT + CLAIMS. This grant does not include claims that would be infringed + only as a consequence of further modifications of THIS + IMPLEMENTATION. If you or your agent or licensee institute or order + or agree to the institution of patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that + THIS IMPLEMENTATION constitutes direct or contributory patent + infringement, or inducement of patent infringement, then any rights + granted to you under this License shall terminate as of the date + such litigation is filed. If you or your agent or exclusive + licensee institute or order or agree to the institution of a PATENT + CHALLENGE, then Tokutek may terminate any rights granted to you + under this License. 
+*/ + +#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved." + +// this test verifies that the toku thread pool is resilient when hitting the nproc limit. + +#include +#include +#include +#include +#include +#include +#include +#include + +int verbose = 0; + +static int usage(void) { + fprintf(stderr, "[-q] [-v] [--verbose] (%d)\n", verbose); + return 1; +} + +static void *f(void *arg) { + return arg; +} + +static int dotest(int the_limit) { + if (verbose) + fprintf(stderr, "%s:%u %d\n", __FILE__, __LINE__, the_limit); + int r; + struct toku_thread_pool *pool = nullptr; + r = toku_thread_pool_create(&pool, 10); + assert(r == 0 && pool != nullptr); + + struct rlimit current_nproc_limit; + r = getrlimit(RLIMIT_NPROC, &current_nproc_limit); + assert(r == 0); + + struct rlimit new_nproc_limit = current_nproc_limit; + new_nproc_limit.rlim_cur = the_limit; + r = setrlimit(RLIMIT_NPROC, &new_nproc_limit); + assert(r == 0); + + int want_n = 20; + int got_n = want_n; + r = toku_thread_pool_run(pool, 0, &got_n, f, nullptr); + if (r == 0) + assert(want_n == got_n); + else { + assert(r == EWOULDBLOCK); + assert(got_n <= want_n); + } + + r = setrlimit(RLIMIT_NPROC, &current_nproc_limit); + assert(r == 0); + + if (verbose) + toku_thread_pool_print(pool, stderr); + toku_thread_pool_destroy(&pool); + return got_n > 0; +} + +int main(int argc, char *argv[]) { + // parse args + for (int i = 1; i < argc; i++) { + char *arg = argv[i]; + if (arg[0] != '-') + break; + if (strcmp(arg, "-v") == 0 || strcmp(arg, "--verbose") == 0) { + verbose = verbose+1; + continue; + } + if (strcmp(arg, "-q") == 0) { + verbose = verbose > 0 ?
verbose-1 : 0; + continue; + } + return usage(); + } + // set increasing nproc limits until the test succeeds in hitting the limit after > 0 threads are created + for (int i = 0; 1; i++) { + if (dotest(i)) + break; + } + return 0; +} diff --git a/storage/tokudb/ft-index/util/threadpool.cc b/storage/tokudb/ft-index/util/threadpool.cc index d6652b7a71c..4f1105d83c2 100644 --- a/storage/tokudb/ft-index/util/threadpool.cc +++ b/storage/tokudb/ft-index/util/threadpool.cc @@ -132,13 +132,18 @@ static int toku_thread_create(struct toku_thread_pool *pool, struct toku_thread **toku_thread_return) { int r; struct toku_thread *MALLOC(thread); - if (thread == NULL) { + if (thread == nullptr) { r = get_error_errno(); } else { memset(thread, 0, sizeof *thread); thread->pool = pool; - toku_cond_init(&thread->wait, NULL); - r = toku_pthread_create(&thread->tid, NULL, toku_thread_run_internal, thread); resource_assert_zero(r); + toku_cond_init(&thread->wait, nullptr); + r = toku_pthread_create(&thread->tid, nullptr, toku_thread_run_internal, thread); + if (r) { + toku_cond_destroy(&thread->wait); + toku_free(thread); + thread = nullptr; + } *toku_thread_return = thread; } return r; @@ -192,7 +197,7 @@ toku_thread_run_internal(void *arg) { if (doexit) break; toku_thread_pool_lock(pool); - thread->f = NULL; + thread->f = nullptr; toku_list_push(&pool->free_threads, &thread->free_link); } return arg; @@ -202,13 +207,13 @@ int toku_thread_pool_create(struct toku_thread_pool **pool_return, int max_threads) { int r; struct toku_thread_pool *CALLOC(pool); - if (pool == NULL) { + if (pool == nullptr) { r = get_error_errno(); } else { - toku_mutex_init(&pool->lock, NULL); + toku_mutex_init(&pool->lock, nullptr); toku_list_init(&pool->free_threads); toku_list_init(&pool->all_threads); - toku_cond_init(&pool->wait_free, NULL); + toku_cond_init(&pool->wait_free, nullptr); pool->cur_threads = 0; pool->max_threads = max_threads; *pool_return = pool; @@ -230,7 +235,7 @@ 
toku_thread_pool_unlock(struct toku_thread_pool *pool) { void toku_thread_pool_destroy(struct toku_thread_pool **poolptr) { struct toku_thread_pool *pool = *poolptr; - *poolptr = NULL; + *poolptr = nullptr; // ask the threads to exit toku_thread_pool_lock(pool); @@ -260,7 +265,7 @@ toku_thread_pool_destroy(struct toku_thread_pool **poolptr) { static int toku_thread_pool_add(struct toku_thread_pool *pool) { - struct toku_thread *thread = NULL; + struct toku_thread *thread = nullptr; int r = toku_thread_create(pool, &thread); if (r == 0) { pool->cur_threads += 1; @@ -294,7 +299,7 @@ toku_thread_pool_get_one(struct toku_thread_pool *pool, int dowait, struct toku_ struct toku_thread *thread = toku_list_struct(list, struct toku_thread, free_link); *toku_thread_return = thread; } else - *toku_thread_return = NULL; + *toku_thread_return = nullptr; toku_thread_pool_unlock(pool); return r; } diff --git a/storage/tokudb/ha_tokudb.cc b/storage/tokudb/ha_tokudb.cc index a47b8868b25..c6e3ce27c0f 100644 --- a/storage/tokudb/ha_tokudb.cc +++ b/storage/tokudb/ha_tokudb.cc @@ -120,14 +120,6 @@ extern "C" { #include "hatoku_defines.h" #include "hatoku_cmp.h" -static inline void *thd_data_get(THD *thd, int slot) { - return thd->ha_data[slot].ha_ptr; -} - -static inline void thd_data_set(THD *thd, int slot, void *data) { - thd->ha_data[slot].ha_ptr = data; -} - static inline uint get_key_parts(const KEY *key); #undef PACKAGE @@ -477,7 +469,6 @@ typedef struct index_read_info { DBT* orig_key; } *INDEX_READ_INFO; - static int ai_poll_fun(void *extra, float progress) { LOADER_CONTEXT context = (LOADER_CONTEXT)extra; if (context->thd->killed) { @@ -1016,8 +1007,7 @@ static uchar* pack_toku_field_blob( static int create_tokudb_trx_data_instance(tokudb_trx_data** out_trx) { int error; - tokudb_trx_data* trx = NULL; - trx = (tokudb_trx_data *) tokudb_my_malloc(sizeof(*trx), MYF(MY_ZEROFILL)); + tokudb_trx_data* trx = (tokudb_trx_data *) tokudb_my_malloc(sizeof(*trx), MYF(MY_ZEROFILL)); if 
(!trx) { error = ENOMEM; goto cleanup; @@ -1614,8 +1604,7 @@ int ha_tokudb::initialize_share( DB_TXN* txn = NULL; bool do_commit = false; THD* thd = ha_thd(); - tokudb_trx_data *trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(ha_thd(), tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton); if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) { txn = trx->sub_sp_level; } @@ -1727,7 +1716,7 @@ int ha_tokudb::initialize_share( } share->ref_length = ref_length; - error = estimate_num_rows(share->file,&num_rows, txn); + error = estimate_num_rows(share->file, &num_rows, txn); // // estimate_num_rows should not fail under normal conditions // @@ -1937,7 +1926,6 @@ exit: // int ha_tokudb::estimate_num_rows(DB* db, uint64_t* num_rows, DB_TXN* txn) { int error = ENOSYS; - DBC* crsr = NULL; bool do_commit = false; DB_BTREE_STAT64 dict_stats; DB_TXN* txn_to_use = NULL; @@ -1951,21 +1939,12 @@ int ha_tokudb::estimate_num_rows(DB* db, uint64_t* num_rows, DB_TXN* txn) { txn_to_use = txn; } - error = db->stat64( - share->file, - txn_to_use, - &dict_stats - ); + error = db->stat64(db, txn_to_use, &dict_stats); if (error) { goto cleanup; } *num_rows = dict_stats.bt_ndata; error = 0; cleanup: - if (crsr != NULL) { - int r = crsr->c_close(crsr); - assert(r==0); - crsr = NULL; - } if (do_commit) { commit_txn(txn_to_use, 0); txn_to_use = NULL; @@ -3271,7 +3250,7 @@ void ha_tokudb::start_bulk_insert(ha_rows rows) { TOKUDB_HANDLER_DBUG_ENTER("%llu txn %p", (unsigned long long) rows, transaction); #endif THD* thd = ha_thd(); - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); delay_updating_ai_metadata = true; ai_metadata_update_required = false; abort_loader = false; @@ -3281,7 +3260,7 @@ void ha_tokudb::start_bulk_insert(ha_rows rows) { num_DBs_locked_in_bulk = true; lock_count = 0; - if 
(share->try_table_lock) { + if ((rows == 0 || rows > 1) && share->try_table_lock) { if (get_prelock_empty(thd) && may_table_be_empty(transaction)) { if (using_ignore || is_insert_ignore(thd) || thd->lex->duplicates != DUP_ERROR || table->s->next_number_key_offset) { @@ -3340,7 +3319,7 @@ int ha_tokudb::end_bulk_insert(bool abort) { TOKUDB_HANDLER_DBUG_ENTER(""); int error = 0; THD* thd = ha_thd(); - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); bool using_loader = (loader != NULL); if (ai_metadata_update_required) { tokudb_pthread_mutex_lock(&share->mutex); @@ -3354,10 +3333,10 @@ int ha_tokudb::end_bulk_insert(bool abort) { if (loader) { if (!abort_loader && !thd->killed) { DBUG_EXECUTE_IF("tokudb_end_bulk_insert_sleep", { - const char *old_proc_info = tokudb_thd_get_proc_info(thd); + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); thd_proc_info(thd, "DBUG sleep"); my_sleep(20000000); - thd_proc_info(thd, old_proc_info); + thd_proc_info(thd, orig_proc_info); }); error = loader->close(loader); loader = NULL; @@ -3374,12 +3353,8 @@ int ha_tokudb::end_bulk_insert(bool abort) { if (i == primary_key && !share->pk_has_string) { continue; } - error = is_index_unique( - &is_unique, - transaction, - share->key_file[i], - &table->key_info[i] - ); + error = is_index_unique(&is_unique, transaction, share->key_file[i], &table->key_info[i], + DB_PRELOCKED_WRITE); if (error) goto cleanup; if (!is_unique) { error = HA_ERR_FOUND_DUPP_KEY; @@ -3419,6 +3394,7 @@ cleanup: } } trx->stmt_progress.using_loader = false; + thd_proc_info(thd, 0); TOKUDB_HANDLER_DBUG_RETURN(error ? 
error : loader_error); } @@ -3426,7 +3402,7 @@ int ha_tokudb::end_bulk_insert() { return end_bulk_insert( false ); } -int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info) { +int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info, int lock_flags) { int error; DBC* tmp_cursor1 = NULL; DBC* tmp_cursor2 = NULL; @@ -3434,7 +3410,7 @@ int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_in uint64_t cnt = 0; char status_msg[MAX_ALIAS_NAME + 200]; //buffer of 200 should be a good upper bound. THD* thd = ha_thd(); - const char *old_proc_info = tokudb_thd_get_proc_info(thd); + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); memset(&key1, 0, sizeof(key1)); memset(&key2, 0, sizeof(key2)); memset(&val, 0, sizeof(val)); @@ -3442,49 +3418,23 @@ int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_in memset(&packed_key2, 0, sizeof(packed_key2)); *is_unique = true; - error = db->cursor( - db, - txn, - &tmp_cursor1, - DB_SERIALIZABLE - ); + error = db->cursor(db, txn, &tmp_cursor1, DB_SERIALIZABLE); if (error) { goto cleanup; } - error = db->cursor( - db, - txn, - &tmp_cursor2, - DB_SERIALIZABLE - ); + error = db->cursor(db, txn, &tmp_cursor2, DB_SERIALIZABLE); if (error) { goto cleanup; } - - error = tmp_cursor1->c_get( - tmp_cursor1, - &key1, - &val, - DB_NEXT - ); + error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags); if (error == DB_NOTFOUND) { *is_unique = true; error = 0; goto cleanup; } else if (error) { goto cleanup; } - error = tmp_cursor2->c_get( - tmp_cursor2, - &key2, - &val, - DB_NEXT - ); + error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags); if (error) { goto cleanup; } - error = tmp_cursor2->c_get( - tmp_cursor2, - &key2, - &val, - DB_NEXT - ); + error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags); if (error == DB_NOTFOUND) { *is_unique = true; error = 0; @@ -3496,59 +3446,25 @@ 
int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_in bool has_null1; bool has_null2; int cmp; - place_key_into_mysql_buff( - key_info, - table->record[0], - (uchar *) key1.data + 1 - ); - place_key_into_mysql_buff( - key_info, - table->record[1], - (uchar *) key2.data + 1 - ); + place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key1.data + 1); + place_key_into_mysql_buff(key_info, table->record[1], (uchar *) key2.data + 1); - create_dbt_key_for_lookup( - &packed_key1, - key_info, - key_buff, - table->record[0], - &has_null1 - ); - create_dbt_key_for_lookup( - &packed_key2, - key_info, - key_buff2, - table->record[1], - &has_null2 - ); + create_dbt_key_for_lookup(&packed_key1, key_info, key_buff, table->record[0], &has_null1); + create_dbt_key_for_lookup(&packed_key2, key_info, key_buff2, table->record[1], &has_null2); if (!has_null1 && !has_null2) { cmp = tokudb_prefix_cmp_dbt_key(db, &packed_key1, &packed_key2); if (cmp == 0) { memcpy(key_buff, key1.data, key1.size); - place_key_into_mysql_buff( - key_info, - table->record[0], - (uchar *) key_buff + 1 - ); + place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key_buff + 1); *is_unique = false; break; } } - error = tmp_cursor1->c_get( - tmp_cursor1, - &key1, - &val, - DB_NEXT - ); + error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags); if (error) { goto cleanup; } - error = tmp_cursor2->c_get( - tmp_cursor2, - &key2, - &val, - DB_NEXT - ); + error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags); if (error && (error != DB_NOTFOUND)) { goto cleanup; } cnt++; @@ -3571,7 +3487,7 @@ int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_in error = 0; cleanup: - thd_proc_info(thd, old_proc_info); + thd_proc_info(thd, orig_proc_info); if (tmp_cursor1) { tmp_cursor1->c_close(tmp_cursor1); tmp_cursor1 = NULL; @@ -4072,7 +3988,7 @@ int ha_tokudb::write_row(uchar * record) { } } - trx = (tokudb_trx_data *) 
thd_data_get(thd, tokudb_hton->slot); + trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (!error) { added_rows++; trx->stmt_progress.inserted++; @@ -4129,7 +4045,7 @@ int ha_tokudb::update_row(const uchar * old_row, uchar * new_row) { THD* thd = ha_thd(); DB_TXN* sub_trans = NULL; DB_TXN* txn = NULL; - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); uint curr_num_DBs; LINT_INIT(error); @@ -4303,7 +4219,7 @@ int ha_tokudb::delete_row(const uchar * record) { bool has_null; THD* thd = ha_thd(); uint curr_num_DBs; - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);; + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);; ha_statistic_increment(&SSV::ha_delete_count); @@ -4870,7 +4786,7 @@ int ha_tokudb::index_read(uchar * buf, const uchar * key, uint key_len, enum ha_ int error = 0; uint32_t flags = 0; THD* thd = ha_thd(); - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);; + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);; struct smart_dbt_info info; struct index_read_info ir_info; @@ -5348,7 +5264,7 @@ int ha_tokudb::get_next(uchar* buf, int direction, DBT* key_to_compare, bool do_ int error = 0; uint32_t flags = SET_PRELOCK_FLAG(0); THD* thd = ha_thd(); - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);; + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);; bool need_val; HANDLE_INVALID_CURSOR(); @@ -5501,7 +5417,7 @@ int ha_tokudb::index_first(uchar * buf) { struct smart_dbt_info info; uint32_t flags = SET_PRELOCK_FLAG(0); THD* thd = ha_thd(); - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);; + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);; HANDLE_INVALID_CURSOR(); 
ha_statistic_increment(&SSV::ha_read_first_count); @@ -5544,7 +5460,7 @@ int ha_tokudb::index_last(uchar * buf) { struct smart_dbt_info info; uint32_t flags = SET_PRELOCK_FLAG(0); THD* thd = ha_thd(); - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot);; + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);; HANDLE_INVALID_CURSOR(); ha_statistic_increment(&SSV::ha_read_last_count); @@ -5635,7 +5551,7 @@ int ha_tokudb::rnd_next(uchar * buf) { void ha_tokudb::track_progress(THD* thd) { - tokudb_trx_data* trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (trx) { ulonglong num_written = trx->stmt_progress.inserted + trx->stmt_progress.updated + trx->stmt_progress.deleted; bool update_status = @@ -6225,12 +6141,11 @@ int ha_tokudb::external_lock(THD * thd, int lock_type) { } int error = 0; - tokudb_trx_data *trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (!trx) { error = create_tokudb_trx_data_instance(&trx); if (error) { goto cleanup; } - thd_data_set(thd, tokudb_hton->slot, trx); + thd_set_ha_data(thd, tokudb_hton, trx); } if (trx->all == NULL) { trx->sp_level = NULL; @@ -6304,7 +6219,7 @@ int ha_tokudb::start_stmt(THD * thd, thr_lock_type lock_type) { TOKUDB_HANDLER_TRACE("q %s", thd->query()); int error = 0; - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); DBUG_ASSERT(trx); /* @@ -6404,7 +6319,7 @@ uint32_t ha_tokudb::get_cursor_isolation_flags(enum thr_lock_type lock_type, THD lock (if we don't want to use MySQL table locks at all) or add locks for many tables (like we do when we are using a MERGE handler). 
- Tokudb DB changes all WRITE locks to TL_WRITE_ALLOW_WRITE (which + TokuDB changes all WRITE locks to TL_WRITE_ALLOW_WRITE (which signals that we are doing WRITES, but we are still allowing other reader's and writer's. @@ -6426,34 +6341,25 @@ THR_LOCK_DATA **ha_tokudb::store_lock(THD * thd, THR_LOCK_DATA ** to, enum thr_l } if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) { - // if creating a hot index - if (thd_sql_command(thd)== SQLCOM_CREATE_INDEX && get_create_index_online(thd)) { - rw_rdlock(&share->num_DBs_lock); - if (share->num_DBs == (table->s->keys + tokudb_test(hidden_primary_key))) { + enum_sql_command sql_command = (enum_sql_command) thd_sql_command(thd); + if (!thd->in_lock_tables) { + if (sql_command == SQLCOM_CREATE_INDEX && get_create_index_online(thd)) { + // hot indexing + rw_rdlock(&share->num_DBs_lock); + if (share->num_DBs == (table->s->keys + tokudb_test(hidden_primary_key))) { + lock_type = TL_WRITE_ALLOW_WRITE; + } + rw_unlock(&share->num_DBs_lock); + } else if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) && + sql_command != SQLCOM_TRUNCATE && !thd_tablespace_op(thd)) { + // allow concurrent writes lock_type = TL_WRITE_ALLOW_WRITE; + } else if (sql_command == SQLCOM_OPTIMIZE && lock_type == TL_READ_NO_INSERT) { + // hot optimize table + lock_type = TL_READ; } - lock.type = lock_type; - rw_unlock(&share->num_DBs_lock); - } - - // 5.5 supports reads concurrent with alter table. just use the default lock type. 
-#if MYSQL_VERSION_ID < 50500 - else if (thd_sql_command(thd)== SQLCOM_CREATE_INDEX || - thd_sql_command(thd)== SQLCOM_ALTER_TABLE || - thd_sql_command(thd)== SQLCOM_DROP_INDEX) { - // force alter table to lock out other readers - lock_type = TL_WRITE; - lock.type = lock_type; - } -#endif - else { - // If we are not doing a LOCK TABLE, then allow multiple writers - if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) && - !thd->in_lock_tables && thd_sql_command(thd) != SQLCOM_TRUNCATE && !thd_tablespace_op(thd)) { - lock_type = TL_WRITE_ALLOW_WRITE; - } - lock.type = lock_type; } + lock.type = lock_type; } *to++ = &lock; if (tokudb_debug & TOKUDB_DEBUG_LOCK) @@ -6903,7 +6809,7 @@ int ha_tokudb::create(const char *name, TABLE * form, HA_CREATE_INFO * create_in newname = (char *)tokudb_my_malloc(get_max_dict_name_path_length(name),MYF(MY_WME)); if (newname == NULL){ error = ENOMEM; goto cleanup;} - trx = (tokudb_trx_data *) thd_data_get(ha_thd(), tokudb_hton->slot); + trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton); if (trx && trx->sub_sp_level && thd_sql_command(thd) == SQLCOM_CREATE_TABLE) { txn = trx->sub_sp_level; } @@ -7093,7 +6999,7 @@ int ha_tokudb::delete_or_rename_table (const char* from_name, const char* to_nam DB_TXN *parent_txn = NULL; tokudb_trx_data *trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (thd_sql_command(ha_thd()) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) { parent_txn = trx->sub_sp_level; } @@ -7534,7 +7440,7 @@ int ha_tokudb::tokudb_add_index( DBC* tmp_cursor = NULL; int cursor_ret_val = 0; DBT curr_pk_key, curr_pk_val; - THD* thd = ha_thd(); + THD* thd = ha_thd(); DB_LOADER* loader = NULL; DB_INDEXER* indexer = NULL; bool loader_save_space = get_load_save_space(thd); @@ -7572,7 +7478,7 @@ int ha_tokudb::tokudb_add_index( // // status message to be shown in "show process list" // - const char 
*old_proc_info = tokudb_thd_get_proc_info(thd); + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); char status_msg[MAX_ALIAS_NAME + 200]; //buffer of 200 should be a good upper bound. ulonglong num_processed = 0; //variable that stores number of elements inserted thus far thd_proc_info(thd, "Adding indexes"); @@ -7798,7 +7704,8 @@ int ha_tokudb::tokudb_add_index( num_processed++; if ((num_processed % 1000) == 0) { - sprintf(status_msg, "Adding indexes: Fetched %llu of about %llu rows, loading of data still remains.", num_processed, (long long unsigned) share->rows); + sprintf(status_msg, "Adding indexes: Fetched %llu of about %llu rows, loading of data still remains.", + num_processed, (long long unsigned) share->rows); thd_proc_info(thd, status_msg); #ifdef HA_TOKUDB_HAS_THD_PROGRESS @@ -7830,12 +7737,8 @@ int ha_tokudb::tokudb_add_index( for (uint i = 0; i < num_of_keys; i++, curr_index++) { if (key_info[i].flags & HA_NOSAME) { bool is_unique; - error = is_index_unique( - &is_unique, - txn, - share->key_file[curr_index], - &key_info[i] - ); + error = is_index_unique(&is_unique, txn, share->key_file[curr_index], &key_info[i], + creating_hot_index ? 0 : DB_PRELOCKED_WRITE); if (error) goto cleanup; if (!is_unique) { error = HA_ERR_FOUND_DUPP_KEY; @@ -7893,7 +7796,7 @@ cleanup: another transaction has accessed the table. \ To add indexes, make sure no transactions touch the table.", share->table_name); } - thd_proc_info(thd, old_proc_info); + thd_proc_info(thd, orig_proc_info); TOKUDB_HANDLER_DBUG_RETURN(error ? 
error : loader_error); } @@ -8245,12 +8148,12 @@ void ha_tokudb::cleanup_txn(DB_TXN *txn) { } void ha_tokudb::add_to_trx_handler_list() { - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(ha_thd(), tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton); trx->handlers = list_add(trx->handlers, &trx_handler_list); } void ha_tokudb::remove_from_trx_handler_list() { - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(ha_thd(), tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton); trx->handlers = list_delete(trx->handlers, &trx_handler_list); } diff --git a/storage/tokudb/ha_tokudb.h b/storage/tokudb/ha_tokudb.h index 52d26d6ad26..be54e2f4f41 100644 --- a/storage/tokudb/ha_tokudb.h +++ b/storage/tokudb/ha_tokudb.h @@ -109,15 +109,6 @@ typedef struct loader_context { ha_tokudb* ha; } *LOADER_CONTEXT; -typedef struct hot_optimize_context { - THD *thd; - char* write_status_msg; - ha_tokudb *ha; - uint progress_stage; - uint current_table; - uint num_tables; -} *HOT_OPTIMIZE_CONTEXT; - // // This object stores table information that is to be shared // among all ha_tokudb objects. 
@@ -475,7 +466,7 @@ private: ); int create_main_dictionary(const char* name, TABLE* form, DB_TXN* txn, KEY_AND_COL_INFO* kc_info, toku_compression_method compression_method); void trace_create_table_info(const char *name, TABLE * form); - int is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info); + int is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info, int lock_flags); int is_val_unique(bool* is_unique, uchar* record, KEY* key_info, uint dict_index, DB_TXN* txn); int do_uniqueness_checks(uchar* record, DB_TXN* txn, THD* thd); void set_main_dict_put_flags(THD* thd, bool opt_eligible, uint32_t* put_flags); @@ -803,6 +794,7 @@ private: void remove_from_trx_handler_list(); private: + int do_optimize(THD *thd); int map_to_handler_error(int error); }; diff --git a/storage/tokudb/ha_tokudb_admin.cc b/storage/tokudb/ha_tokudb_admin.cc index 8d202eeda41..100c88a76a8 100644 --- a/storage/tokudb/ha_tokudb_admin.cc +++ b/storage/tokudb/ha_tokudb_admin.cc @@ -128,8 +128,15 @@ static int analyze_progress(void *v_extra, uint64_t rows) { int ha_tokudb::analyze(THD *thd, HA_CHECK_OPT *check_opt) { TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); uint64_t rec_per_key[table_share->key_parts]; int result = HA_ADMIN_OK; + + // stub out analyze if optimize is remapped to alter recreate + analyze + if (thd_sql_command(thd) != SQLCOM_ANALYZE) { + TOKUDB_HANDLER_DBUG_RETURN(result); + } + DB_TXN *txn = transaction; if (!txn) { result = HA_ADMIN_FAILED; @@ -168,9 +175,19 @@ int ha_tokudb::analyze(THD *thd, HA_CHECK_OPT *check_opt) { if (error) result = HA_ADMIN_FAILED; } + thd_proc_info(thd, orig_proc_info); TOKUDB_HANDLER_DBUG_RETURN(result); } +typedef struct hot_optimize_context { + THD *thd; + char* write_status_msg; + ha_tokudb *ha; + uint progress_stage; + uint current_table; + uint num_tables; +} *HOT_OPTIMIZE_CONTEXT; + static int hot_poll_fun(void *extra, float progress) { 
HOT_OPTIMIZE_CONTEXT context = (HOT_OPTIMIZE_CONTEXT)extra; if (context->thd->killed) { @@ -194,9 +211,9 @@ static int hot_poll_fun(void *extra, float progress) { } // flatten all DB's in this table, to do so, peform hot optimize on each db -int ha_tokudb::optimize(THD * thd, HA_CHECK_OPT * check_opt) { +int ha_tokudb::do_optimize(THD *thd) { TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); - + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); int error; uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key); @@ -206,9 +223,7 @@ int ha_tokudb::optimize(THD * thd, HA_CHECK_OPT * check_opt) { thd_progress_init(thd, curr_num_DBs); #endif - // // for each DB, run optimize and hot_optimize - // for (uint i = 0; i < curr_num_DBs; i++) { DB* db = share->key_file[i]; error = db->optimize(db); @@ -228,14 +243,24 @@ int ha_tokudb::optimize(THD * thd, HA_CHECK_OPT * check_opt) { goto cleanup; } } - error = 0; -cleanup: +cleanup: #ifdef HA_TOKUDB_HAS_THD_PROGRESS thd_progress_end(thd); #endif + thd_proc_info(thd, orig_proc_info); + TOKUDB_HANDLER_DBUG_RETURN(error); +} +int ha_tokudb::optimize(THD *thd, HA_CHECK_OPT *check_opt) { + TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); + int error; +#if TOKU_OPTIMIZE_WITH_RECREATE + error = HA_ADMIN_TRY_ALTER; +#else + error = do_optimize(thd); +#endif TOKUDB_HANDLER_DBUG_RETURN(error); } @@ -266,10 +291,7 @@ static void ha_tokudb_check_info(THD *thd, TABLE *table, const char *msg) { int ha_tokudb::check(THD *thd, HA_CHECK_OPT *check_opt) { TOKUDB_HANDLER_DBUG_ENTER("%s", share->table_name); - - const char *old_proc_info = tokudb_thd_get_proc_info(thd); - thd_proc_info(thd, "tokudb::check"); - + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); int result = HA_ADMIN_OK; int r; @@ -321,6 +343,6 @@ int ha_tokudb::check(THD *thd, HA_CHECK_OPT *check_opt) { } } } - thd_proc_info(thd, old_proc_info); + thd_proc_info(thd, orig_proc_info); TOKUDB_HANDLER_DBUG_RETURN(result); } diff --git 
a/storage/tokudb/ha_tokudb_alter_56.cc b/storage/tokudb/ha_tokudb_alter_56.cc index e0e1e7deee4..5062a2ae67b 100644 --- a/storage/tokudb/ha_tokudb_alter_56.cc +++ b/storage/tokudb/ha_tokudb_alter_56.cc @@ -122,6 +122,7 @@ public: expand_varchar_update_needed(false), expand_fixed_update_needed(false), expand_blob_update_needed(false), + optimize_needed(false), table_kc_info(NULL), altered_table_kc_info(NULL) { } @@ -141,6 +142,7 @@ public: bool expand_varchar_update_needed; bool expand_fixed_update_needed; bool expand_blob_update_needed; + bool optimize_needed; Dynamic_array changed_fields; KEY_AND_COL_INFO *table_kc_info; KEY_AND_COL_INFO *altered_table_kc_info; @@ -219,6 +221,11 @@ static bool change_type_is_supported(TABLE *table, TABLE *altered_table, Alter_i static ulong fix_handler_flags(THD *thd, TABLE *table, TABLE *altered_table, Alter_inplace_info *ha_alter_info) { ulong handler_flags = ha_alter_info->handler_flags; +#if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099 + // This is automatically supported, hide the flag from later checks + handler_flags &= ~Alter_inplace_info::ALTER_PARTITIONED; +#endif + // workaround for fill_alter_inplace_info bug (#5193) // the function erroneously sets the ADD_INDEX and DROP_INDEX flags for a column addition that does not // change the keys. 
the following code turns the ADD_INDEX and DROP_INDEX flags so that we can do hot @@ -434,7 +441,13 @@ enum_alter_inplace_result ha_tokudb::check_if_supported_inplace_alter(TABLE *alt result = HA_ALTER_INPLACE_EXCLUSIVE_LOCK; } } + } +#if TOKU_OPTIMIZE_WITH_RECREATE + else if (only_flags(ctx->handler_flags, Alter_inplace_info::RECREATE_TABLE + Alter_inplace_info::ALTER_COLUMN_DEFAULT)) { + ctx->optimize_needed = true; + result = HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE; } +#endif if (result != HA_ALTER_INPLACE_NOT_SUPPORTED && table->s->null_bytes != altered_table->s->null_bytes && (tokudb_debug & TOKUDB_DEBUG_ALTER_TABLE)) { @@ -517,6 +530,9 @@ bool ha_tokudb::inplace_alter_table(TABLE *altered_table, Alter_inplace_info *ha if (error == 0 && ctx->reset_card) { error = tokudb::set_card_from_status(share->status_block, ctx->alter_txn, table->s, altered_table->s); } + if (error == 0 && ctx->optimize_needed) { + error = do_optimize(ha_thd()); + } #if (50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \ (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799) @@ -728,7 +744,8 @@ bool ha_tokudb::commit_inplace_alter_table(TABLE *altered_table, Alter_inplace_i if (commit) { #if (50613 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \ - (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799) + (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799) || \ + (100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099) if (ha_alter_info->group_commit_ctx) { ha_alter_info->group_commit_ctx = NULL; } @@ -752,7 +769,7 @@ bool ha_tokudb::commit_inplace_alter_table(TABLE *altered_table, Alter_inplace_i if (!commit) { // abort the alter transaction NOW so that any alters are rolled back. this allows the following restores to work. 
- tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); assert(ctx->alter_txn == trx->stmt); assert(trx->tokudb_lock_count > 0); // for partitioned tables, we use a single transaction to do all of the partition changes. the tokudb_lock_count diff --git a/storage/tokudb/ha_tokudb_alter_common.cc b/storage/tokudb/ha_tokudb_alter_common.cc index ecef0fb7415..414e8280daf 100644 --- a/storage/tokudb/ha_tokudb_alter_common.cc +++ b/storage/tokudb/ha_tokudb_alter_common.cc @@ -814,7 +814,7 @@ int ha_tokudb::write_frm_data(const uchar *frm_data, size_t frm_len) { if (TOKU_PARTITION_WRITE_FRM_DATA || table->part_info == NULL) { // write frmdata to status THD *thd = ha_thd(); - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); assert(trx); DB_TXN *txn = trx->stmt; // use alter table transaction assert(txn); diff --git a/storage/tokudb/hatoku_defines.h b/storage/tokudb/hatoku_defines.h index 2986b182b59..308afb4035b 100644 --- a/storage/tokudb/hatoku_defines.h +++ b/storage/tokudb/hatoku_defines.h @@ -96,6 +96,10 @@ PATENT RIGHTS GRANT: #pragma interface /* gcc class implementation */ #endif +#if !defined(TOKUDB_CHECK_JEMALLOC) +#define TOKUDB_CHECK_JEMALLOC 1 +#endif + #if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099 // mariadb 10.0 #define TOKU_USE_DB_TYPE_TOKUDB 1 @@ -108,6 +112,7 @@ PATENT RIGHTS GRANT: #define TOKU_INCLUDE_EXTENDED_KEYS 1 #endif #define TOKU_INCLUDE_OPTION_STRUCTS 1 +#define TOKU_OPTIMIZE_WITH_RECREATE 1 #elif 50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799 // mysql 5.7 with no patches @@ -130,17 +135,18 @@ PATENT RIGHTS GRANT: #define TOKU_PARTITION_WRITE_FRM_DATA 0 #else // mysql 5.6 with tokutek patches -#define TOKU_USE_DB_TYPE_TOKUDB 1 /* has DB_TYPE_TOKUDB patch */ +#define TOKU_USE_DB_TYPE_TOKUDB 1 // has 
DB_TYPE_TOKUDB patch #define TOKU_INCLUDE_ALTER_56 1 -#define TOKU_INCLUDE_ROW_TYPE_COMPRESSION 1 /* has tokudb row format compression patch */ -#define TOKU_INCLUDE_XA 1 /* has patch that fixes TC_LOG_MMAP code */ +#define TOKU_INCLUDE_ROW_TYPE_COMPRESSION 1 // has tokudb row format compression patch +#define TOKU_INCLUDE_XA 1 // has patch that fixes TC_LOG_MMAP code #define TOKU_PARTITION_WRITE_FRM_DATA 0 #define TOKU_INCLUDE_WRITE_FRM_DATA 0 -#define TOKU_INCLUDE_UPSERT 1 /* has tokudb upsert patch */ +#define TOKU_INCLUDE_UPSERT 1 // has tokudb upsert patch #if defined(HTON_SUPPORTS_EXTENDED_KEYS) #define TOKU_INCLUDE_EXTENDED_KEYS 1 #endif #endif +#define TOKU_OPTIMIZE_WITH_RECREATE 1 #elif 50500 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50599 #define TOKU_USE_DB_TYPE_TOKUDB 1 diff --git a/storage/tokudb/hatoku_hton.cc b/storage/tokudb/hatoku_hton.cc index 2cb33e38547..9d814621521 100644 --- a/storage/tokudb/hatoku_hton.cc +++ b/storage/tokudb/hatoku_hton.cc @@ -589,9 +589,6 @@ static int tokudb_done_func(void *p) { toku_global_status_rows = NULL; my_hash_free(&tokudb_open_tables); tokudb_pthread_mutex_destroy(&tokudb_mutex); -#if defined(_WIN64) - toku_ydb_destroy(); -#endif TOKUDB_DBUG_RETURN(0); } @@ -637,8 +634,7 @@ int tokudb_end(handlerton * hton, ha_panic_function type) { static int tokudb_close_connection(handlerton * hton, THD * thd) { int error = 0; - tokudb_trx_data* trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (trx && trx->checkpoint_lock_taken) { error = db_env->checkpointing_resume(db_env); } @@ -702,25 +698,27 @@ static void txn_progress_func(TOKU_TXN_PROGRESS progress, void* extra) { } static void commit_txn_with_progress(DB_TXN* txn, uint32_t flags, THD* thd) { - int r; + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); struct txn_progress_info info; info.thd = thd; - r = txn->commit_with_progress(txn, flags, 
txn_progress_func, &info); + int r = txn->commit_with_progress(txn, flags, txn_progress_func, &info); if (r != 0) { sql_print_error("tried committing transaction %p and got error code %d", txn, r); } assert(r == 0); + thd_proc_info(thd, orig_proc_info); } static void abort_txn_with_progress(DB_TXN* txn, THD* thd) { - int r; + const char *orig_proc_info = tokudb_thd_get_proc_info(thd); struct txn_progress_info info; info.thd = thd; - r = txn->abort_with_progress(txn, txn_progress_func, &info); + int r = txn->abort_with_progress(txn, txn_progress_func, &info); if (r != 0) { sql_print_error("tried aborting transaction %p and got error code %d", txn, r); } assert(r == 0); + thd_proc_info(thd, orig_proc_info); } static void tokudb_cleanup_handlers(tokudb_trx_data *trx, DB_TXN *txn) { @@ -736,7 +734,7 @@ static int tokudb_commit(handlerton * hton, THD * thd, bool all) { TOKUDB_DBUG_ENTER(""); DBUG_PRINT("trans", ("ending transaction %s", all ? "all" : "stmt")); uint32_t syncflag = THDVAR(thd, commit_sync) ? 0 : DB_TXN_NOSYNC; - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, hton); DB_TXN **txn = all ? &trx->all : &trx->stmt; DB_TXN *this_txn = *txn; if (this_txn) { @@ -765,7 +763,7 @@ static int tokudb_commit(handlerton * hton, THD * thd, bool all) { static int tokudb_rollback(handlerton * hton, THD * thd, bool all) { TOKUDB_DBUG_ENTER(""); DBUG_PRINT("trans", ("aborting transaction %s", all ? "all" : "stmt")); - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, hton); DB_TXN **txn = all ? &trx->all : &trx->stmt; DB_TXN *this_txn = *txn; if (this_txn) { @@ -795,7 +793,7 @@ static int tokudb_xa_prepare(handlerton* hton, THD* thd, bool all) { TOKUDB_DBUG_ENTER(""); int r = 0; DBUG_PRINT("trans", ("preparing transaction %s", all ? 
"all" : "stmt")); - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, hton); DB_TXN* txn = all ? trx->all : trx->stmt; if (txn) { if (tokudb_debug & TOKUDB_DEBUG_TXN) { @@ -874,7 +872,7 @@ static int tokudb_savepoint(handlerton * hton, THD * thd, void *savepoint) { TOKUDB_DBUG_ENTER(""); int error; SP_INFO save_info = (SP_INFO)savepoint; - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, hton); if (thd->in_sub_stmt) { assert(trx->stmt); error = txn_begin(db_env, trx->sub_sp_level, &(save_info->txn), DB_INHERIT_ISOLATION, thd); @@ -905,7 +903,7 @@ static int tokudb_rollback_to_savepoint(handlerton * hton, THD * thd, void *save DB_TXN* parent = NULL; DB_TXN* txn_to_rollback = save_info->txn; - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, hton); parent = txn_to_rollback->parent; if (!(error = txn_to_rollback->abort(txn_to_rollback))) { if (save_info->in_sub_stmt) { @@ -927,7 +925,7 @@ static int tokudb_release_savepoint(handlerton * hton, THD * thd, void *savepoin DB_TXN* parent = NULL; DB_TXN* txn_to_commit = save_info->txn; - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(thd, hton); parent = txn_to_commit->parent; if (!(error = txn_to_commit->commit(txn_to_commit, 0))) { if (save_info->in_sub_stmt) { @@ -984,10 +982,10 @@ static int tokudb_discover3(handlerton *hton, THD* thd, const char *db, const ch HA_METADATA_KEY curr_key = hatoku_frm_data; DBT key = {}; DBT value = {}; - bool do_commit; + bool do_commit = false; #if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099 - tokudb_trx_data *trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data *trx = (tokudb_trx_data *) 
thd_get_ha_data(thd, tokudb_hton); if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) { do_commit = false; txn = trx->sub_sp_level; @@ -1142,15 +1140,14 @@ static bool tokudb_show_engine_status(THD * thd, stat_print_fn * stat_print) { static void tokudb_checkpoint_lock(THD * thd) { int error; const char *old_proc_info; - tokudb_trx_data* trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (!trx) { error = create_tokudb_trx_data_instance(&trx); // // can only fail due to memory allocation, so ok to assert // assert(!error); - thd_data_set(thd, tokudb_hton->slot, trx); + thd_set_ha_data(thd, tokudb_hton, trx); } if (trx->checkpoint_lock_taken) { @@ -1174,8 +1171,7 @@ cleanup: static void tokudb_checkpoint_unlock(THD * thd) { int error; const char *old_proc_info; - tokudb_trx_data* trx = NULL; - trx = (tokudb_trx_data *) thd_data_get(thd, tokudb_hton->slot); + tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton); if (!trx) { error = 0; goto cleanup; diff --git a/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_lock_waits_released.result b/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_lock_waits_released.result index f84be01163f..db63d23e382 100644 --- a/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_lock_waits_released.result +++ b/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_lock_waits_released.result @@ -9,6 +9,7 @@ locks_trx_id locks_mysql_thread_id locks_dname locks_key_left locks_key_right select * from information_schema.tokudb_lock_waits; requesting_trx_id blocking_trx_id lock_waits_dname lock_waits_key_left lock_waits_key_right lock_waits_start_time set autocommit=0; +set tokudb_prelock_empty=OFF; insert into t values (1); set autocommit=0; insert into t values (1); @@ -38,6 +39,7 @@ locks_trx_id locks_mysql_thread_id locks_dname locks_key_left locks_key_right select * from 
information_schema.tokudb_lock_waits; requesting_trx_id blocking_trx_id lock_waits_dname lock_waits_key_left lock_waits_key_right lock_waits_start_time set autocommit=0; +set tokudb_prelock_empty=OFF; replace into t values (1); set autocommit=0; replace into t values (1); diff --git a/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_lock_waits_timeout.result b/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_lock_waits_timeout.result index 1e0668164ff..10e3830506d 100644 --- a/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_lock_waits_timeout.result +++ b/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_lock_waits_timeout.result @@ -9,6 +9,7 @@ locks_trx_id locks_mysql_thread_id locks_dname locks_key_left locks_key_right select * from information_schema.tokudb_lock_waits; requesting_trx_id blocking_trx_id lock_waits_dname lock_waits_key_left lock_waits_key_right lock_waits_start_time set autocommit=0; +set tokudb_prelock_empty=OFF; insert into t values (1); set autocommit=0; insert into t values (1); diff --git a/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_locks.result b/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_locks.result index ad252da448f..9fce0695983 100644 --- a/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_locks.result +++ b/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_locks.result @@ -12,7 +12,7 @@ set autocommit=0; insert into t values (2); insert into t values (4); insert into t values (6); -select * from information_schema.tokudb_locks order by locks_trx_id; +select * from information_schema.tokudb_locks order by locks_trx_id,locks_key_left; locks_trx_id locks_mysql_thread_id locks_dname locks_key_left locks_key_right TRX_ID MYSQL_ID ./test/t-main 0001000000 0001000000 TRX_ID MYSQL_ID ./test/t-main 0003000000 0003000000 diff --git a/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_locks_released.result b/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_locks_released.result index 21a6b5d308c..628ff46ffc4 100644 --- 
a/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_locks_released.result +++ b/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_locks_released.result @@ -6,6 +6,7 @@ set autocommit=0; select * from information_schema.tokudb_locks; locks_trx_id locks_mysql_thread_id locks_dname locks_key_left locks_key_right set autocommit=0; +set tokudb_prelock_empty=OFF; insert into t values (1); set autocommit=0; insert into t values (1); diff --git a/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_lock_waits_released.test b/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_lock_waits_released.test index 6b4e5d88673..3b56660ff83 100644 --- a/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_lock_waits_released.test +++ b/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_lock_waits_released.test @@ -23,6 +23,7 @@ select * from information_schema.tokudb_lock_waits; connect (conn_a,localhost,root,,); set autocommit=0; +set tokudb_prelock_empty=OFF; # disable the bulk loader insert into t values (1); connect (conn_b,localhost,root,,); @@ -72,6 +73,7 @@ select * from information_schema.tokudb_lock_waits; connect (conn_a,localhost,root,,); set autocommit=0; +set tokudb_prelock_empty=OFF; # disable the bulk loader replace into t values (1); connect (conn_b,localhost,root,,); diff --git a/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_lock_waits_timeout.test b/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_lock_waits_timeout.test index ea7eb9a2c89..d7925733a0f 100644 --- a/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_lock_waits_timeout.test +++ b/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_lock_waits_timeout.test @@ -16,6 +16,7 @@ select * from information_schema.tokudb_lock_waits; connect (conn_a,localhost,root,,); set autocommit=0; +set tokudb_prelock_empty=OFF; insert into t values (1); connect (conn_b,localhost,root,,); diff --git a/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_locks.test b/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_locks.test index a3745b5471b..e5a67559b1a 100644 --- 
a/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_locks.test +++ b/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_locks.test @@ -29,7 +29,7 @@ insert into t values (6); # should find 3 locks for 2 transactions connection default; replace_column 1 TRX_ID 2 MYSQL_ID; -eval select * from information_schema.tokudb_locks order by locks_trx_id; +eval select * from information_schema.tokudb_locks order by locks_trx_id,locks_key_left; connection conn_a; commit; diff --git a/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_locks_released.test b/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_locks_released.test index 3a1cf2023da..f2ca9b8ed9c 100644 --- a/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_locks_released.test +++ b/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_locks_released.test @@ -18,20 +18,17 @@ let $default_id=`select connection_id()`; # should be empty select * from information_schema.tokudb_locks; - connect (conn_a,localhost,root,,); set autocommit=0; -let $a_id=`select connection_id()`; +set tokudb_prelock_empty=OFF; # disable bulk loader insert into t values (1); connect (conn_b,localhost,root,,); set autocommit=0; -let $b_id=`select connection_id()`; send insert into t values (1); - # should find the presence of a lock on 2nd transaction connection default; let $wait_condition= select count(*)=1 from information_schema.processlist where info='insert into t values (1)' and state='update'; diff --git a/storage/tokudb/mysql-test/tokudb_bugs/r/lock_uniq_key_empty.result b/storage/tokudb/mysql-test/tokudb_bugs/r/lock_uniq_key_empty.result index 6966aa24ff8..325aef46afe 100644 --- a/storage/tokudb/mysql-test/tokudb_bugs/r/lock_uniq_key_empty.result +++ b/storage/tokudb/mysql-test/tokudb_bugs/r/lock_uniq_key_empty.result @@ -1,6 +1,7 @@ set default_storage_engine=tokudb; drop table if exists t; create table t (id int, unique key(id)); +set tokudb_prelock_empty=OFF; begin; insert into t values (1); begin; @@ -13,6 +14,7 @@ id 2 drop table if exists t; create table t (id 
int not null, unique key(id)); +set tokudb_prelock_empty=OFF; begin; insert into t values (1); begin; diff --git a/storage/tokudb/mysql-test/tokudb_bugs/t/lock_uniq_key_empty.test b/storage/tokudb/mysql-test/tokudb_bugs/t/lock_uniq_key_empty.test index 3f8d7113dff..0a001c2736d 100644 --- a/storage/tokudb/mysql-test/tokudb_bugs/t/lock_uniq_key_empty.test +++ b/storage/tokudb/mysql-test/tokudb_bugs/t/lock_uniq_key_empty.test @@ -7,6 +7,7 @@ enable_warnings; create table t (id int, unique key(id)); connect(c1,localhost,root,,); +set tokudb_prelock_empty=OFF; # disable the tokudb bulk loader begin; insert into t values (1); connect(c2,localhost,root,,); @@ -24,6 +25,7 @@ drop table if exists t; create table t (id int not null, unique key(id)); connect(c1,localhost,root,,); +set tokudb_prelock_empty=OFF; # disable the tokudb bulk loader begin; insert into t values (1); connect(c2,localhost,root,,); diff --git a/storage/tokudb/scripts/setup.mysql.bash b/storage/tokudb/scripts/setup.mysql.bash index 6ae604e34c1..85132350289 100755 --- a/storage/tokudb/scripts/setup.mysql.bash +++ b/storage/tokudb/scripts/setup.mysql.bash @@ -5,6 +5,39 @@ function usage() { echo "--mysqlbuild=$mysqlbuild --shutdown=$shutdown --install=$install --startup=$startup" } +function download_file() { + local file=$1 + s3get $s3bucket $file $file +} + +function download_tarball() { + local tarball=$1 + if [ ! -f $tarball ] ; then + download_file $tarball + if [ $? -ne 0 ] ; then test 0 = 1; return; fi + fi + if [ ! -f $tarball.md5 ] ; then + download_file $tarball.md5 + if [ $? -ne 0 ] ; then test 0 = 1; return; fi + fi +} + +function install_tarball() { + local basedir=$1; local tarball=$2 + tar -x -z -f $basedir/$tarball + if [ $? -ne 0 ] ; then test 0 = 1; return; fi +} + +function check_md5() { + local tarball=$1 + md5sum --check $tarball.md5 + if [ $? -ne 0 ] ; then + # support jacksum md5 output which is almost the same as md5sum + diff -b <(cat $tarball.md5) <(md5sum $tarball) + if [ $? 
-ne 0 ] ; then test 0 = 1; return; fi + fi +} + mysqlbuild= shutdown=1 install=1 @@ -64,30 +97,24 @@ basedir=$PWD mysqltarball=$mysqlbuild.tar.gz -if [ -f $mysqlbuild.tar.gz ] ; then - compression=-z - mysqltarball=$mysqlbuild.tar.gz -elif [ -f $mysqlbuild.tar.bz2 ] ; then - compression=-j - mysqltarball=$mysqlbuild.tar.bz2 -fi - -# get the release -if [ ! -f $mysqltarball ] ; then - s3get $s3bucket $mysqltarball $mysqltarball - if [ $? -ne 0 ] ; then exit 1; fi -fi -if [ ! -f $mysqltarball.md5 ] ; then - s3get $s3bucket $mysqltarball.md5 $mysqltarball.md5 - if [ $? -ne 0 ] ; then exit 1; fi -fi +# get the tarball +download_tarball $mysqltarball +if [ $? -ne 0 ] ; then exit 1; fi # check the md5 sum -md5sum --check $mysqltarball.md5 -if [ $? -ne 0 ] ; then - # support jacksum md5 output which is almost the same as md5sum - diff -b <(cat $mysqltarball.md5) <(md5sum $mysqltarball) - if [ $? -ne 0 ] ; then exit 1; fi +check_md5 $mysqltarball +if [ $? -ne 0 ] ; then exit 1; fi + +tokudbtarball="" +if [[ $mysqltarball =~ ^(Percona-Server.*)\.(Linux\.x86_64.*)$ ]] ; then + tar tzf $mysqltarball | egrep ha_tokudb.so >/dev/null 2>&1 + if [ $? -ne 0 ] ; then + tokudbtarball=${BASH_REMATCH[1]}.TokuDB.${BASH_REMATCH[2]} + download_tarball $tokudbtarball + if [ $? -ne 0 ] ; then exit 1; fi + check_md5 $tokudbtarball + if [ $? -ne 0 ] ; then exit 1; fi + fi fi # set ldpath @@ -126,8 +153,14 @@ if [ ! -d $mysqlbuild ] || [ $install -ne 0 ] ; then rm mysql if [ -d $mysqlbuild ] ; then $sudo rm -rf $mysqlbuild; fi - tar -x $compression -f $basedir/$mysqltarball + install_tarball $basedir $mysqltarball if [ $? -ne 0 ] ; then exit 1; fi + + if [ $tokudbtarball ] ; then + install_tarball $basedir $tokudbtarball + if [ $? -ne 0 ] ; then exit 1; fi + fi + ln -s $mysqldir /usr/local/mysql if [ $? 
-ne 0 ] ; then exit 1; fi ln -s $mysqldir /usr/local/$mysqlbuild @@ -180,6 +213,10 @@ if [ $startup -ne 0 ] ; then else default_arg="--defaults-file=$defaultsfile" fi + j=/usr/local/mysql/lib/mysql/libjemalloc.so + if [ -f $j ] ; then + default_arg="$default_arg --malloc-lib=$j" + fi $sudo -b bash -c "$ldpath /usr/local/mysql/bin/mysqld_safe $default_arg $mysqld_args" >/dev/null 2>&1 & fi sleep $sleeptime