From 05c4a3518e26c8acd9cd8e2436da7e11ddbc6474 Mon Sep 17 00:00:00 2001 From: Vasil Dimov Date: Fri, 25 Feb 2011 11:50:18 +0200 Subject: [PATCH 1/5] Fix BUG#11798085 - INCORRECT INTEGER TYPES USED IN CALCULATION RESULT IN OVERFLOW Do not assign the result of the difference to a signed variable and checking whether it is negative afterwards because this limits the max diff to 2G on 32 bit systems. E.g. "signed = 3.5G - 1G" would be negative and the code would assume that 3.5G < 1G. Instead compare the two variables directly and assign to unsigned only if we know that the result of the subtraction will be positive. Discussed with: Jimmy and Sunny (via IRC) --- storage/innodb_plugin/buf/buf0buf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c index 6bbd5565c58..51a3a393d36 100644 --- a/storage/innodb_plugin/buf/buf0buf.c +++ b/storage/innodb_plugin/buf/buf0buf.c @@ -1893,16 +1893,19 @@ buf_block_align( /* TODO: protect buf_pool->chunks with a mutex (it will currently remain constant after buf_pool_init()) */ for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) { - lint offs = ptr - chunk->blocks->frame; + ulint offs; - if (UNIV_UNLIKELY(offs < 0)) { + if (UNIV_UNLIKELY(ptr < chunk->blocks->frame)) { continue; } + /* else */ + + offs = ptr - chunk->blocks->frame; offs >>= UNIV_PAGE_SIZE_SHIFT; - if (UNIV_LIKELY((ulint) offs < chunk->size)) { + if (UNIV_LIKELY(offs < chunk->size)) { buf_block_t* block = &chunk->blocks[offs]; /* The function buf_chunk_init() invokes From 8da1dfa9cb607a824c1e23c8dec44bb2c70095ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 28 Feb 2011 13:51:18 +0200 Subject: [PATCH 2/5] Bug #58549 Race condition in buf_LRU_drop_page_hash_for_tablespace() and compressed tables buf_LRU_drop_page_hash_for_tablespace(): after releasing and reacquiring the buffer pool mutex, do not dereference any block descriptor pointer that is not known to be a pointer to an uncompressed page frame (type buf_block_t; state == BUF_BLOCK_FILE_PAGE). Also, defer the acquisition of the block_mutex until it is needed. buf_page_get_gen(): Add mode == BUF_GET_IF_IN_POOL_PEEK for buffer-fixing a block without making it young in the LRU list. buf_page_get_gen(), buf_page_init(), buf_LRU_block_remove_hashed_page(): Set bpage->state = BUF_BLOCK_ZIP_FREE before buf_buddy_free(bpage), so that similar race conditions might be detected a little easier. btr_search_drop_page_hash_when_freed(): Use BUF_GET_IF_IN_POOL_PEEK when dropping the hash indexes. rb://528 approved by Jimmy Yang --- storage/innodb_plugin/ChangeLog | 6 ++ storage/innodb_plugin/btr/btr0sea.c | 4 +- storage/innodb_plugin/buf/buf0buf.c | 32 ++++++--- storage/innodb_plugin/buf/buf0lru.c | 87 +++++++++++++------------ storage/innodb_plugin/include/buf0buf.h | 4 +- 5 files changed, 81 insertions(+), 52 deletions(-) diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index 1b2747ab012..1ece3ad1825 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,9 @@ +2011-02-28 The InnoDB Team + + * btr/btr0sea.c, buf/buf0buf.c, buf/buf0lru.c: + Fix Bug#58549 Race condition in buf_LRU_drop_page_hash_for_tablespace() + and compressed tables + 2011-02-15 The InnoDB Team * sync/sync0rw.c, innodb_bug59307.test: diff --git a/storage/innodb_plugin/btr/btr0sea.c b/storage/innodb_plugin/btr/btr0sea.c index 9835efcf712..cd0eadbb1b8 100644 --- a/storage/innodb_plugin/btr/btr0sea.c +++ b/storage/innodb_plugin/btr/btr0sea.c @@ -1201,8 +1201,8 @@ btr_search_drop_page_hash_when_freed( having to fear a deadlock. */ block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH, NULL, - BUF_GET_IF_IN_POOL, __FILE__, __LINE__, - &mtr); + BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__, + &mtr); /* Because the buffer pool mutex was released by buf_page_peek_if_search_hashed(), it is possible that the block was removed from the buffer pool by another thread diff --git a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c index 51a3a393d36..14ec7b75911 100644 --- a/storage/innodb_plugin/buf/buf0buf.c +++ b/storage/innodb_plugin/buf/buf0buf.c @@ -2031,7 +2031,7 @@ buf_page_get_gen( ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ buf_block_t* guess, /*!< in: guessed block or NULL */ ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL, - BUF_GET_NO_LATCH */ + BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH */ const char* file, /*!< in: file name */ ulint line, /*!< in: line where called */ mtr_t* mtr) /*!< in: mini-transaction */ @@ -2047,9 +2047,19 @@ buf_page_get_gen( ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH) || (rw_latch == RW_NO_LATCH)); - ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH)); - ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL) - || (mode == BUF_GET_NO_LATCH)); +#ifdef UNIV_DEBUG + switch (mode) { + case BUF_GET_NO_LATCH: + ut_ad(rw_latch == RW_NO_LATCH); + break; + case BUF_GET: + case BUF_GET_IF_IN_POOL: + case BUF_PEEK_IF_IN_POOL: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ ut_ad(zip_size == fil_space_get_zip_size(space)); ut_ad(ut_is_2pow(zip_size)); #ifndef UNIV_LOG_DEBUG @@ -2091,7 +2101,8 @@ loop2: buf_pool_mutex_exit(); - if (mode == BUF_GET_IF_IN_POOL) { + if (mode == BUF_GET_IF_IN_POOL + || mode == BUF_PEEK_IF_IN_POOL) { return(NULL); } @@ -2130,7 +2141,8 @@ loop2: must_read = buf_block_get_io_fix(block) == BUF_IO_READ; - if (must_read && mode == BUF_GET_IF_IN_POOL) { + if (must_read && (mode == BUF_GET_IF_IN_POOL + || mode == BUF_PEEK_IF_IN_POOL)) { /* The page is only being read to buffer */ buf_pool_mutex_exit(); @@ -2248,6 +2260,7 @@ wait_until_unfixed: mutex_exit(&buf_pool_zip_mutex); buf_pool->n_pend_unzip++; + bpage->state = BUF_BLOCK_ZIP_FREE; buf_buddy_free(bpage, sizeof *bpage); buf_pool_mutex_exit(); @@ -2324,7 +2337,9 @@ wait_until_unfixed: buf_pool_mutex_exit(); - buf_page_set_accessed_make_young(&block->page, access_time); + if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL)) { + buf_page_set_accessed_make_young(&block->page, access_time); + } #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG ut_a(!block->page.file_page_was_freed); @@ -2377,7 +2392,7 @@ wait_until_unfixed: mtr_memo_push(mtr, block, fix_type); - if (!access_time) { + if (UNIV_LIKELY(mode != BUF_PEEK_IF_IN_POOL) && !access_time) { /* In the case of a first access, try to apply linear read-ahead */ @@ -2926,6 +2941,7 @@ err_exit: && UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) { /* The block was added by some other thread. */ + bpage->state = BUF_BLOCK_ZIP_FREE; buf_buddy_free(bpage, sizeof *bpage); buf_buddy_free(data, zip_size); diff --git a/storage/innodb_plugin/buf/buf0lru.c b/storage/innodb_plugin/buf/buf0lru.c index 39feb06ff23..a69b2658c51 100644 --- a/storage/innodb_plugin/buf/buf0lru.c +++ b/storage/innodb_plugin/buf/buf0lru.c @@ -246,71 +246,75 @@ buf_LRU_drop_page_hash_for_tablespace( page_arr = ut_malloc(sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE); buf_pool_mutex_enter(); + num_entries = 0; scan_again: - num_entries = 0; bpage = UT_LIST_GET_LAST(buf_pool->LRU); while (bpage != NULL) { - mutex_t* block_mutex = buf_page_get_mutex(bpage); buf_page_t* prev_bpage; + ibool is_fixed; - mutex_enter(block_mutex); prev_bpage = UT_LIST_GET_PREV(LRU, bpage); ut_a(buf_page_in_file(bpage)); if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE || bpage->space != id - || bpage->buf_fix_count > 0 || bpage->io_fix != BUF_IO_NONE) { - /* We leave the fixed pages as is in this scan. - To be dealt with later in the final scan. */ - mutex_exit(block_mutex); + /* Compressed pages are never hashed. + Skip blocks of other tablespaces. + Skip I/O-fixed blocks (to be dealt with later). */ +next_page: + bpage = prev_bpage; + continue; + } + + mutex_enter(&((buf_block_t*) bpage)->mutex); + is_fixed = bpage->buf_fix_count > 0 + || !((buf_block_t*) bpage)->is_hashed; + mutex_exit(&((buf_block_t*) bpage)->mutex); + + if (is_fixed) { goto next_page; } - if (((buf_block_t*) bpage)->is_hashed) { + /* Store the page number so that we can drop the hash + index in a batch later. */ + page_arr[num_entries] = bpage->offset; + ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE); + ++num_entries; - /* Store the offset(i.e.: page_no) in the array - so that we can drop hash index in a batch - later. */ - page_arr[num_entries] = bpage->offset; - mutex_exit(block_mutex); - ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE); - ++num_entries; - - if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) { - goto next_page; - } - /* Array full. We release the buf_pool_mutex to - obey the latching order. */ - buf_pool_mutex_exit(); - - buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, - num_entries); - num_entries = 0; - buf_pool_mutex_enter(); - } else { - mutex_exit(block_mutex); + if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) { + goto next_page; } -next_page: - /* Note that we may have released the buf_pool mutex - above after reading the prev_bpage during processing - of a page_hash_batch (i.e.: when the array was full). - This means that prev_bpage can change in LRU list. - This is OK because this function is a 'best effort' - to drop as many search hash entries as possible and - it does not guarantee that ALL such entries will be - dropped. */ - bpage = prev_bpage; + /* Array full. We release the buf_pool_mutex to + obey the latching order. */ + buf_pool_mutex_exit(); + buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, + num_entries); + buf_pool_mutex_enter(); + num_entries = 0; + + /* Note that we released the buf_pool mutex above + after reading the prev_bpage during processing of a + page_hash_batch (i.e.: when the array was full). + Because prev_bpage could belong to a compressed-only + block, it may have been relocated, and thus the + pointer cannot be trusted. Because bpage is of type + buf_block_t, it is safe to dereference. + + bpage can change in the LRU list. This is OK because + this function is a 'best effort' to drop as many + search hash entries as possible and it does not + guarantee that ALL such entries will be dropped. */ /* If, however, bpage has been removed from LRU list to the free list then we should restart the scan. bpage->state is protected by buf_pool mutex. */ - if (bpage && !buf_page_in_file(bpage)) { - ut_a(num_entries == 0); + if (bpage + && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { goto scan_again; } } @@ -1799,6 +1803,7 @@ buf_LRU_block_remove_hashed_page( buf_pool_mutex_exit_forbid(); buf_buddy_free(bpage->zip.data, page_zip_get_size(&bpage->zip)); + bpage->state = BUF_BLOCK_ZIP_FREE; buf_buddy_free(bpage, sizeof(*bpage)); buf_pool_mutex_exit_allow(); UNIV_MEM_UNDESC(bpage); diff --git a/storage/innodb_plugin/include/buf0buf.h b/storage/innodb_plugin/include/buf0buf.h index a16de67aa3a..05dead5ac9e 100644 --- a/storage/innodb_plugin/include/buf0buf.h +++ b/storage/innodb_plugin/include/buf0buf.h @@ -41,6 +41,8 @@ Created 11/5/1995 Heikki Tuuri /* @{ */ #define BUF_GET 10 /*!< get always */ #define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */ +#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make + the block young in the LRU list */ #define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but set no latch; we have separated this case, because @@ -284,7 +286,7 @@ buf_page_get_gen( ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ buf_block_t* guess, /*!< in: guessed block or NULL */ ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL, - BUF_GET_NO_LATCH */ + BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH */ const char* file, /*!< in: file name */ ulint line, /*!< in: line where called */ mtr_t* mtr); /*!< in: mini-transaction */ From 3d124532134b2442934f7ba5e8fd75d96be33c4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 15 Mar 2011 12:01:02 +0200 Subject: [PATCH 3/5] Bug#11849231 inflateInit() invoked without initializing all memory According to the zlib documentation, next_in and avail_in must be initialized before invoking inflateInit or inflateInit2. Furthermore, the zalloc function must clear the allocated memory. btr_copy_zblob_prefix(): Replace the d_stream parameter with buf,len and return the copied length. page_zip_decompress(): Invoke inflateInit2 a little later. page_zip_zalloc(): Rename from page_zip_alloc(). Invoke mem_heap_zalloc() instead of mem_heap_alloc(). rb:619 approved by Jimmy Yang --- storage/innodb_plugin/ChangeLog | 5 ++ storage/innodb_plugin/btr/btr0cur.c | 76 ++++++++++++++------------- storage/innodb_plugin/page/page0zip.c | 17 +++--- 3 files changed, 53 insertions(+), 45 deletions(-) diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index fdd29908192..7c82cd9c27f 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,8 @@ +2011-03-15 The InnoDB Team + + * btr/btr0cur.c, page/page0zip.c: + Fix Bug#11849231 inflateInit() invoked without initializing all memory + 2011-02-28 The InnoDB Team * btr/btr0sea.c, buf/buf0buf.c, buf/buf0lru.c: diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c index 86d77c79e7b..d7b5ed0d135 100644 --- a/storage/innodb_plugin/btr/btr0cur.c +++ b/storage/innodb_plugin/btr/btr0cur.c @@ -4627,27 +4627,45 @@ btr_copy_blob_prefix( /*******************************************************************//** Copies the prefix of a compressed BLOB. The clustered index record -that points to this BLOB must be protected by a lock or a page latch. */ +that points to this BLOB must be protected by a lock or a page latch. +@return number of bytes written to buf */ static -void +ulint btr_copy_zblob_prefix( /*==================*/ - z_stream* d_stream,/*!< in/out: the decompressing stream */ + byte* buf, /*!< out: the externally stored part of + the field, or a prefix of it */ + ulint len, /*!< in: length of buf, in bytes */ ulint zip_size,/*!< in: compressed BLOB page size */ ulint space_id,/*!< in: space id of the BLOB pages */ ulint page_no,/*!< in: page number of the first BLOB page */ ulint offset) /*!< in: offset on the first BLOB page */ { - ulint page_type = FIL_PAGE_TYPE_ZBLOB; + ulint page_type = FIL_PAGE_TYPE_ZBLOB; + mem_heap_t* heap; + int err; + z_stream d_stream; + + d_stream.next_out = buf; + d_stream.avail_out = len; + d_stream.next_in = Z_NULL; + d_stream.avail_in = 0; + + /* Zlib inflate needs 32 kilobytes for the default + window size, plus a few kilobytes for small objects. */ + heap = mem_heap_create(40000); + page_zip_set_alloc(&d_stream, heap); ut_ad(ut_is_2pow(zip_size)); ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE); ut_ad(zip_size <= UNIV_PAGE_SIZE); ut_ad(space_id); + err = inflateInit(&d_stream); + ut_a(err == Z_OK); + for (;;) { buf_page_t* bpage; - int err; ulint next_page_no; /* There is no latch on bpage directly. Instead, @@ -4663,7 +4681,7 @@ btr_copy_zblob_prefix( " compressed BLOB" " page %lu space %lu\n", (ulong) page_no, (ulong) space_id); - return; + goto func_exit; } if (UNIV_UNLIKELY @@ -4689,13 +4707,13 @@ btr_copy_zblob_prefix( offset += 4; } - d_stream->next_in = bpage->zip.data + offset; - d_stream->avail_in = zip_size - offset; + d_stream.next_in = bpage->zip.data + offset; + d_stream.avail_in = zip_size - offset; - err = inflate(d_stream, Z_NO_FLUSH); + err = inflate(&d_stream, Z_NO_FLUSH); switch (err) { case Z_OK: - if (!d_stream->avail_out) { + if (!d_stream.avail_out) { goto end_of_blob; } break; @@ -4712,13 +4730,13 @@ inflate_error: " compressed BLOB" " page %lu space %lu returned %d (%s)\n", (ulong) page_no, (ulong) space_id, - err, d_stream->msg); + err, d_stream.msg); case Z_BUF_ERROR: goto end_of_blob; } if (next_page_no == FIL_NULL) { - if (!d_stream->avail_in) { + if (!d_stream.avail_in) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: unexpected end of" @@ -4727,7 +4745,7 @@ inflate_error: (ulong) page_no, (ulong) space_id); } else { - err = inflate(d_stream, Z_FINISH); + err = inflate(&d_stream, Z_FINISH); switch (err) { case Z_STREAM_END: case Z_BUF_ERROR: @@ -4739,7 +4757,7 @@ inflate_error: end_of_blob: buf_page_release_zip(bpage); - return; + goto func_exit; } buf_page_release_zip(bpage); @@ -4751,6 +4769,12 @@ end_of_blob: offset = FIL_PAGE_NEXT; page_type = FIL_PAGE_TYPE_ZBLOB2; } + +func_exit: + inflateEnd(&d_stream); + mem_heap_free(heap); + UNIV_MEM_ASSERT_RW(buf, d_stream.total_out); + return(d_stream.total_out); } /*******************************************************************//** @@ -4776,28 +4800,8 @@ btr_copy_externally_stored_field_prefix_low( } if (UNIV_UNLIKELY(zip_size)) { - int err; - z_stream d_stream; - mem_heap_t* heap; - - /* Zlib inflate needs 32 kilobytes for the default - window size, plus a few kilobytes for small objects. */ - heap = mem_heap_create(40000); - page_zip_set_alloc(&d_stream, heap); - - err = inflateInit(&d_stream); - ut_a(err == Z_OK); - - d_stream.next_out = buf; - d_stream.avail_out = len; - d_stream.avail_in = 0; - - btr_copy_zblob_prefix(&d_stream, zip_size, - space_id, page_no, offset); - inflateEnd(&d_stream); - mem_heap_free(heap); - UNIV_MEM_ASSERT_RW(buf, d_stream.total_out); - return(d_stream.total_out); + return(btr_copy_zblob_prefix(buf, len, zip_size, + space_id, page_no, offset)); } else { return(btr_copy_blob_prefix(buf, len, space_id, page_no, offset)); diff --git a/storage/innodb_plugin/page/page0zip.c b/storage/innodb_plugin/page/page0zip.c index a1dd4177ba8..6e866b3f016 100644 --- a/storage/innodb_plugin/page/page0zip.c +++ b/storage/innodb_plugin/page/page0zip.c @@ -653,13 +653,13 @@ page_zip_dir_encode( Allocate memory for zlib. */ static void* -page_zip_malloc( +page_zip_zalloc( /*============*/ void* opaque, /*!< in/out: memory heap */ uInt items, /*!< in: number of items to allocate */ uInt size) /*!< in: size of an item in bytes */ { - return(mem_heap_alloc(opaque, items * size)); + return(mem_heap_zalloc(opaque, items * size)); } /**********************************************************************//** @@ -684,7 +684,7 @@ page_zip_set_alloc( { z_stream* strm = stream; - strm->zalloc = page_zip_malloc; + strm->zalloc = page_zip_zalloc; strm->zfree = page_zip_free; strm->opaque = heap; } @@ -2912,19 +2912,18 @@ zlib_error: page_zip_set_alloc(&d_stream, heap); - if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT) - != Z_OK)) { - ut_error; - } - d_stream.next_in = page_zip->data + PAGE_DATA; /* Subtract the space reserved for the page header and the end marker of the modification log. */ d_stream.avail_in = page_zip_get_size(page_zip) - (PAGE_DATA + 1); - d_stream.next_out = page + PAGE_ZIP_START; d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START; + if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT) + != Z_OK)) { + ut_error; + } + /* Decode the zlib header and the index information. */ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) { From 764871292ed6dcf86875ac20408c132bf2bba4cf Mon Sep 17 00:00:00 2001 From: Vasil Dimov Date: Mon, 28 Mar 2011 11:34:12 +0300 Subject: [PATCH 4/5] Store the '\0'-terminated query in row->trx_query This problem was introduced in marko.makela@oracle.com-20100514130815-ym7j7cfu88ro6km4 and is probably the reason for the following valgrind warning: from http://bugs.mysql.com/52691 , http://bugs.mysql.com/file.php?id=16880 : Version: '5.6.3-m5-valgrind-max-debug' socket: '/tmp/mysql.sock' port: 3306 Source distribution ==14947== Thread 18: ==14947== Conditional jump or move depends on uninitialised value(s) ==14947== at 0x4A06318: __GI_strlen (mc_replace_strmem.c:284) ==14947== by 0x9F3D7A: fill_innodb_trx_from_cache(trx_i_s_cache_struct*, THD*, TABLE*) (i_s.cc:591) ==14947== by 0x9F4D7D: trx_i_s_common_fill_table(THD*, TABLE_LIST*, Item*) (i_s.cc:1238) ==14947== by 0x7689F3: get_schema_tables_result(JOIN*, enum_schema_table_state) (sql_show.cc:6745) ==14947== by 0x715A75: JOIN::exec() (sql_select.cc:2861) ==14947== by 0x7185BD: mysql_select(THD*, Item***, TABLE_LIST*, unsigned int, List&, Item*, unsigned int, st_order*, st_order*, Item*, st_order*, unsigned long long, select_result*, st_select_lex_unit*, st_select_lex*) (sql_select.cc:3609) ==14947== by 0x70E823: handle_select(THD*, LEX*, select_result*, unsigned long) (sql_select.cc:319) ==14947== by 0x6F2305: execute_sqlcom_select(THD*, TABLE_LIST*) (sql_parse.cc:4557) ==14947== by 0x6EAED4: mysql_execute_command(THD*) (sql_parse.cc:2135) ==14947== by 0x6F44C9: mysql_parse(THD*, char*, unsigned int, Parser_state*) (sql_parse.cc:5597) ==14947== by 0x6E864B: dispatch_command(enum_server_command, THD*, char*, unsigned int) (sql_parse.cc:1093) ==14947== by 0x6E785E: do_command(THD*) (sql_parse.cc:815) ==14947== by 0x6C18DD: do_handle_one_connection(THD*) (sql_connect.cc:771) ==14947== by 0x6C146E: handle_one_connection (sql_connect.cc:707) ==14947== by 0x30E1807760: start_thread (pthread_create.c:301) ==14947== by 0x35EA670F: ??? ==14947== Uninitialised value was created by a heap allocation ==14947== at 0x4A0515D: malloc (vg_replace_malloc.c:195) ==14947== by 0xB4B948: mem_area_alloc (mem0pool.c:385) ==14947== by 0xB4A27C: mem_heap_create_block (mem0mem.c:333) ==14947== by 0xB4A530: mem_heap_add_block (mem0mem.c:446) ==14947== by 0xB0D2A4: mem_heap_alloc (mem0mem.ic:186) ==14947== by 0xB0D9C2: ha_storage_put_memlim (ha0storage.c:118) ==14947== by 0xA479D8: fill_trx_row (trx0i_s.c:521) ==14947== by 0xA490E9: fetch_data_into_cache (trx0i_s.c:1319) ==14947== by 0xA491BA: trx_i_s_possibly_fetch_data_into_cache (trx0i_s.c:1352) ==14947== by 0x9F4CE7: trx_i_s_common_fill_table(THD*, TABLE_LIST*, Item*) (i_s.cc:1221) ==14947== by 0x7689F3: get_schema_tables_result(JOIN*, enum_schema_table_state) (sql_show.cc:6745) ==14947== by 0x715A75: JOIN::exec() (sql_select.cc:2861) ==14947== by 0x7185BD: mysql_select(THD*, Item***, TABLE_LIST*, unsigned int, List&, Item*, unsigned int, st_order*, st_order*, Item*, st_order*, unsigned long long, select_result*, st_select_lex_unit*, st_select_lex*) (sql_select.cc:3609) ==14947== by 0x70E823: handle_select(THD*, LEX*, select_result*, unsigned long) (sql_select.cc:319) ==14947== by 0x6F2305: execute_sqlcom_select(THD*, TABLE_LIST*) (sql_parse.cc:4557) ==14947== by 0x6EAED4: mysql_execute_command(THD*) (sql_parse.cc:2135) ==14947== by 0x6F44C9: mysql_parse(THD*, char*, unsigned int, Parser_state*) (sql_parse.cc:5597) ==14947== by 0x6E864B: dispatch_command(enum_server_command, THD*, char*, unsigned int) (sql_parse.cc:1093) ==14947== by 0x6E785E: do_command(THD*) (sql_parse.cc:815) ==14947== by 0x6C18DD: do_handle_one_connection(THD*) (sql_connect.cc:771) ==14947== by 0x6C146E: handle_one_connection (sql_connect.cc:707) ==14947== by 0x30E1807760: start_thread (pthread_create.c:301) ==14947== by 0x35EA670F: ??? (gdb) bt #0 0x0000000004a06318 in _vgrZU_libcZdsoZa___GI_strlen (str=0x3026bfa0 "insert into `blobtest` set `data`='pkefxxpkalpabzgrczlxefkreqljeqbvzrcnhvhsjsfnvxzjsltfuincffigdkmhvvcmnseluzgbtedrfmxvnrdmzesbinjgwvharkpgjplrlnqudfidbqwgbykupycxzyikzqincnsjrxgncqzlgyqwjdbjulztgsffxpjgymsnntdibvklwqylmwhsmdskmllxuwafabdjnwlyofknwuixiyrgnplmerfdewgizkdhznitesfqepsqbbwkdepkmjoseyxjofmmjaqdipwopfrwidmhqbtovdslvayxcnpewzhppeetblccppniamezibuoinvlxkafpcmozawtplfpepxwlwhymsuraezcwvjqzwogsozodlsfzjiyrcaljjhqwdrcjawvelhefzzaexvcbyorlcyupqwgjuamiqpiputtndjwcsuyzdfhuxswuowhrzdvriwrxqmcqthvzzzvivbabbnhdbtcfdtgssvmirrcddnytnctcvqplwytxxzxelldhwahalzxvgynaiwjyezhxqhlsqudngekocfvlbqprxqhyhwbaomgqiwkpfguohuvlnhtrsszgacxhhzeppyqwfwabiqzgyzkperiidyunrykopysvlcxwhrcboetjltawdjergalsfvaxncmzoznryumrjmncvhvxqvqhhbznnifkguuiffmlrbmgwtzvnuwlaguixqadkupfhasbbxnwkrvsfhrqanfmvjtzfqodtutkjlxfcogtsjywrdgmzgszjtsmimaelsveayqrwviqwwefeziuaqsqpauxpnzhaxjtkdfvvodniwezskbxfxszyniyzkzxngcfwgjlyrlskmrzxqnptwlilsxybuguafxxkvryyjrnkhhcmxuusitaflaiuxjhyfnzkahlgmaszujqmfdhyppdnpweqanmvzgjfyzjolbmprhnuuxextcaxzicfvsuochprmlf"...) at mc_replace_strmem.c:284 #1 0x00000000009f3d7b in fill_innodb_trx_from_cache (cache=0x1462440, thd=0x2a495000, table=0x2a422500) at /home/sbester/build/bzr/mysql-trunk/storage/innobase/handler/i_s.cc:591 #2 0x00000000009f4d7e in trx_i_s_common_fill_table (thd=0x2a495000, tables=0x2a4c3ec0) at /home/sbester/build/bzr/mysql-trunk/storage/innobase/handler/i_s.cc:1238 #3 0x00000000007689f4 in get_schema_tables_result (join=0x30f90c40, executed_place=PROCESSED_BY_JOIN_EXEC) at /home/sbester/build/bzr/mysql-trunk/sql/sql_show.cc:6745 #4 0x0000000000715a76 in JOIN::exec (this=0x30f90c40) at /home/sbester/build/bzr/mysql-trunk/sql/sql_select.cc:2861 #5 0x00000000007185be in mysql_select (thd=0x2a495000, rref_pointer_array=0x2a497590, tables=0x2a4c3ec0, wild_num=1, fields=..., conds=0x0, og_num=0, order=0x0, group=0x0, having=0x0, proc_param=0x0, select_options=2684619520, result=0x30319720, unit=0x2a496d28, select_lex=0x2a497378) at /home/sbester/build/bzr/mysql-trunk/sql/sql_select.cc:3609 #6 0x000000000070e824 in handle_select (thd=0x2a495000, lex=0x2a496c78, result=0x30319720, setup_tables_done_option=0) at /home/sbester/build/bzr/mysql-trunk/sql/sql_select.cc:319 #7 0x00000000006f2306 in execute_sqlcom_select (thd=0x2a495000, all_tables=0x2a4c3ec0) at /home/sbester/build/bzr/mysql-trunk/sql/sql_parse.cc:4557 #8 0x00000000006eaed5 in mysql_execute_command (thd=0x2a495000) at /home/sbester/build/bzr/mysql-trunk/sql/sql_parse.cc:2135 #9 0x00000000006f44ca in mysql_parse (thd=0x2a495000, rawbuf=0x30d80060 "select * from innodb_trx", length=24, parser_state=0x35ea5540) at /home/sbester/build/bzr/mysql-trunk/sql/sql_parse.cc:5597 #10 0x00000000006e864c in dispatch_command (command=COM_QUERY, thd=0x2a495000, packet=0x30bb4e31 "select * from innodb_trx", packet_length=24) at /home/sbester/build/bzr/mysql-trunk/sql/sql_parse.cc:1093 #11 0x00000000006e785f in do_command (thd=0x2a495000) at /home/sbester/build/bzr/mysql-trunk/sql/sql_parse.cc:815 #12 0x00000000006c18de in do_handle_one_connection (thd_arg=0x2a495000) at /home/sbester/build/bzr/mysql-trunk/sql/sql_connect.cc:771 #13 0x00000000006c146f in handle_one_connection (arg=0x2a495000) at /home/sbester/build/bzr/mysql-trunk/sql/sql_connect.cc:707 #14 0x00000030e1807761 in start_thread (arg=0x35ea6710) at pthread_create.c:301 #15 0x00000030e14e14ed in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115 (gdb) frame 1 #1 0x00000000009f3d7b in fill_innodb_trx_from_cache (cache=0x1462440, thd=0x2a495000, table=0x2a422500) at /home/sbester/build/bzr/mysql-trunk/storage/innobase/handler/i_s.cc:591 591 row->trx_query_cs); (gdb) list 586 if (row->trx_query) { 587 /* store will do appropriate character set 588 conversion check */ 589 fields[IDX_TRX_QUERY]->store( 590 row->trx_query, strlen(row->trx_query), 591 row->trx_query_cs); 592 fields[IDX_TRX_QUERY]->set_notnull(); 593 } else { 594 fields[IDX_TRX_QUERY]->set_null(); 595 } --- storage/innodb_plugin/trx/trx0i_s.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/innodb_plugin/trx/trx0i_s.c b/storage/innodb_plugin/trx/trx0i_s.c index 267e91db22e..53f4dcb0bef 100644 --- a/storage/innodb_plugin/trx/trx0i_s.c +++ b/storage/innodb_plugin/trx/trx0i_s.c @@ -508,7 +508,7 @@ fill_trx_row( query[stmt_len] = '\0'; row->trx_query = ha_storage_put_memlim( - cache->storage, stmt, stmt_len + 1, + cache->storage, query, stmt_len + 1, MAX_ALLOWED_FOR_STORAGE(cache)); row->trx_query_cs = innobase_get_charset(trx->mysql_thd); From 4ffb26de4baba4fc1d012d3bb9a4033752f3d887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 30 Mar 2011 14:25:58 +0300 Subject: [PATCH 5/5] Bug#11877216 InnoDB too eager to commit suicide on a busy server sync_array_print_long_waits(): Return the longest waiting thread ID and the longest waited-for lock. Only if those remain unchanged between calls in srv_error_monitor_thread(), increment fatal_cnt. Otherwise, reset fatal_cnt. Background: There is a built-in watchdog in InnoDB whose purpose is to kill the server when some thread is stuck waiting for a mutex or rw-lock. Before this fix, the logic was flawed. The function sync_array_print_long_waits() returns TRUE if it finds a lock wait that exceeds 10 minutes (srv_fatal_semaphore_wait_threshold). The function srv_error_monitor_thread() will kill the server if this happens 10 times in a row (fatal_cnt reaches 10), checked every 30 seconds. This is wrong, because this situation does not mean that the server is hung. If the server is very busy for a little over 15 minutes, it will be killed. Consider this example. Thread T1 is waiting for mutex M. Some time later, threads T2..Tn start waiting for the same mutex M. If T1 keeps waiting for 600 seconds, fatal_cnt will be incremented to 1. So far, so good. Now, if M is granted to T1, the server was obviously not stuck. But, T2..Tn keeps waiting, and their wait time will be longer than 600 seconds. If 5 minutes later, some Tn has still been waiting for more than 10 minutes for the mutex M, the server can be killed, even though it is not stuck. rb:622 approved by Jimmy Yang --- storage/innobase/include/sync0arr.h | 11 +++++--- storage/innobase/srv/srv0srv.c | 19 +++++++++---- storage/innobase/sync/sync0arr.c | 36 ++++++++++++++++++------ storage/innodb_plugin/ChangeLog | 5 ++++ storage/innodb_plugin/include/sync0arr.h | 7 +++-- storage/innodb_plugin/srv/srv0srv.c | 11 +++++++- storage/innodb_plugin/sync/sync0arr.c | 32 ++++++++++++++++----- 7 files changed, 93 insertions(+), 28 deletions(-) diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h index fae26b7a63e..ec48059dbcb 100644 --- a/storage/innobase/include/sync0arr.h +++ b/storage/innobase/include/sync0arr.h @@ -93,10 +93,13 @@ sync_arr_wake_threads_if_sema_free(void); Prints warnings of long semaphore waits to stderr. */ ibool -sync_array_print_long_waits(void); -/*=============================*/ - /* out: TRUE if fatal semaphore wait threshold - was exceeded */ +sync_array_print_long_waits( +/*========================*/ + /* out: TRUE if fatal semaphore wait threshold + was exceeded */ + os_thread_id_t* waiter, /* out: longest waiting thread */ + const void** sema) /* out: longest-waited-for semaphore */ + __attribute__((nonnull)); /************************************************************************ Validates the integrity of the wait array. Checks that the number of reserved cells equals the count variable. */ diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c index 9c34e73109c..3f6f1982992 100644 --- a/storage/innobase/srv/srv0srv.c +++ b/storage/innobase/srv/srv0srv.c @@ -2180,9 +2180,15 @@ srv_error_monitor_thread( os_thread_create */ { /* number of successive fatal timeouts observed */ - ulint fatal_cnt = 0; - dulint old_lsn; - dulint new_lsn; + ulint fatal_cnt = 0; + dulint old_lsn; + dulint new_lsn; + /* longest waiting thread for a semaphore */ + os_thread_id_t waiter = os_thread_get_curr_id(); + os_thread_id_t old_waiter = waiter; + /* the semaphore that is being waited for */ + const void* sema = NULL; + const void* old_sema = NULL; old_lsn = srv_start_lsn; @@ -2224,10 +2230,11 @@ loop: /* In case mutex_exit is not a memory barrier, it is theoretically possible some threads are left waiting though the semaphore is already released. Wake up those threads: */ - + sync_arr_wake_threads_if_sema_free(); - if (sync_array_print_long_waits()) { + if (sync_array_print_long_waits(&waiter, &sema) + && sema == old_sema && os_thread_eq(waiter, old_waiter)) { fatal_cnt++; if (fatal_cnt > 10) { @@ -2242,6 +2249,8 @@ loop: } } else { fatal_cnt = 0; + old_waiter = waiter; + old_sema = sema; } /* Flush stderr so that a database user gets the output diff --git a/storage/innobase/sync/sync0arr.c b/storage/innobase/sync/sync0arr.c index 41d3492c8c9..93a7398f252 100644 --- a/storage/innobase/sync/sync0arr.c +++ b/storage/innobase/sync/sync0arr.c @@ -916,10 +916,12 @@ sync_arr_wake_threads_if_sema_free(void) Prints warnings of long semaphore waits to stderr. */ ibool -sync_array_print_long_waits(void) -/*=============================*/ - /* out: TRUE if fatal semaphore wait threshold - was exceeded */ +sync_array_print_long_waits( +/*========================*/ + /* out: TRUE if fatal semaphore wait threshold + was exceeded */ + os_thread_id_t* waiter, /* out: longest waiting thread */ + const void** sema) /* out: longest-waited-for semaphore */ { sync_cell_t* cell; ibool old_val; @@ -927,24 +929,40 @@ sync_array_print_long_waits(void) ulint i; ulint fatal_timeout = srv_fatal_semaphore_wait_threshold; ibool fatal = FALSE; + double longest_diff = 0; for (i = 0; i < sync_primary_wait_array->n_cells; i++) { + double diff; + void* wait_object; + cell = sync_array_get_nth_cell(sync_primary_wait_array, i); - if (cell->wait_object != NULL && cell->waiting - && difftime(time(NULL), cell->reservation_time) > 240) { + wait_object = cell->wait_object; + + if (wait_object == NULL || !cell->waiting) { + + continue; + } + + diff = difftime(time(NULL), cell->reservation_time); + + if (diff > 240) { fputs("InnoDB: Warning: a long semaphore wait:\n", stderr); sync_array_cell_print(stderr, cell); noticed = TRUE; } - if (cell->wait_object != NULL && cell->waiting - && difftime(time(NULL), cell->reservation_time) - > fatal_timeout) { + if (diff > fatal_timeout) { fatal = TRUE; } + + if (diff > longest_diff) { + longest_diff = diff; + *sema = wait_object; + *waiter = cell->thread; + } } if (noticed) { diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index 7c82cd9c27f..100cf3690ce 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,8 @@ +2011-03-30 The InnoDB Team + + * srv/srv0srv.c, sync/sync0arr.h, sync/sync0arr.c: + Fix Bug#11877216 InnoDB too eager to commit suicide on a busy server + 2011-03-15 The InnoDB Team * btr/btr0cur.c, page/page0zip.c: diff --git a/storage/innodb_plugin/include/sync0arr.h b/storage/innodb_plugin/include/sync0arr.h index 5f1280f5e28..6e931346238 100644 --- a/storage/innodb_plugin/include/sync0arr.h +++ b/storage/innodb_plugin/include/sync0arr.h @@ -115,8 +115,11 @@ Prints warnings of long semaphore waits to stderr. @return TRUE if fatal semaphore wait threshold was exceeded */ UNIV_INTERN ibool -sync_array_print_long_waits(void); -/*=============================*/ +sync_array_print_long_waits( +/*========================*/ + os_thread_id_t* waiter, /*!< out: longest waiting thread */ + const void** sema) /*!< out: longest-waited-for semaphore */ + __attribute__((nonnull)); /********************************************************************//** Validates the integrity of the wait array. Checks that the number of reserved cells equals the count variable. */ diff --git a/storage/innodb_plugin/srv/srv0srv.c b/storage/innodb_plugin/srv/srv0srv.c index 3cf17f33c40..b1fc1ac67fd 100644 --- a/storage/innodb_plugin/srv/srv0srv.c +++ b/storage/innodb_plugin/srv/srv0srv.c @@ -2236,6 +2236,12 @@ srv_error_monitor_thread( ulint fatal_cnt = 0; ib_uint64_t old_lsn; ib_uint64_t new_lsn; + /* longest waiting thread for a semaphore */ + os_thread_id_t waiter = os_thread_get_curr_id(); + os_thread_id_t old_waiter = waiter; + /* the semaphore that is being waited for */ + const void* sema = NULL; + const void* old_sema = NULL; old_lsn = srv_start_lsn; @@ -2284,7 +2290,8 @@ loop: sync_arr_wake_threads_if_sema_free(); - if (sync_array_print_long_waits()) { + if (sync_array_print_long_waits(&waiter, &sema) + && sema == old_sema && os_thread_eq(waiter, old_waiter)) { fatal_cnt++; if (fatal_cnt > 10) { @@ -2299,6 +2306,8 @@ loop: } } else { fatal_cnt = 0; + old_waiter = waiter; + old_sema = sema; } /* Flush stderr so that a database user gets the output diff --git a/storage/innodb_plugin/sync/sync0arr.c b/storage/innodb_plugin/sync/sync0arr.c index ad29b90d344..13970023573 100644 --- a/storage/innodb_plugin/sync/sync0arr.c +++ b/storage/innodb_plugin/sync/sync0arr.c @@ -914,8 +914,10 @@ Prints warnings of long semaphore waits to stderr. @return TRUE if fatal semaphore wait threshold was exceeded */ UNIV_INTERN ibool -sync_array_print_long_waits(void) -/*=============================*/ +sync_array_print_long_waits( +/*========================*/ + os_thread_id_t* waiter, /*!< out: longest waiting thread */ + const void** sema) /*!< out: longest-waited-for semaphore */ { sync_cell_t* cell; ibool old_val; @@ -923,24 +925,40 @@ sync_array_print_long_waits(void) ulint i; ulint fatal_timeout = srv_fatal_semaphore_wait_threshold; ibool fatal = FALSE; + double longest_diff = 0; for (i = 0; i < sync_primary_wait_array->n_cells; i++) { + double diff; + void* wait_object; + cell = sync_array_get_nth_cell(sync_primary_wait_array, i); - if (cell->wait_object != NULL && cell->waiting - && difftime(time(NULL), cell->reservation_time) > 240) { + wait_object = cell->wait_object; + + if (wait_object == NULL || !cell->waiting) { + + continue; + } + + diff = difftime(time(NULL), cell->reservation_time); + + if (diff > 240) { fputs("InnoDB: Warning: a long semaphore wait:\n", stderr); sync_array_cell_print(stderr, cell); noticed = TRUE; } - if (cell->wait_object != NULL && cell->waiting - && difftime(time(NULL), cell->reservation_time) - > fatal_timeout) { + if (diff > fatal_timeout) { fatal = TRUE; } + + if (diff > longest_diff) { + longest_diff = diff; + *sema = wait_object; + *waiter = cell->thread; + } } if (noticed) {