From 0f9c818545946dbee97bc7a8ff80ce09ed7e7cd1 Mon Sep 17 00:00:00 2001
From: Inaam Rana <inaam.rana@oracle.com>
Date: Fri, 30 Jul 2010 10:39:16 -0400
Subject: [PATCH 01/18] When the caller of buf_flush_list() provides the
 number of pages that it wants to flush, honor that value: do not go beyond
 it in our eagerness to flush the neighbors of the selected victim.

---
 storage/innobase/buf/buf0flu.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/storage/innobase/buf/buf0flu.c b/storage/innobase/buf/buf0flu.c
index 3737627301f..4131d863e6a 100644
--- a/storage/innobase/buf/buf0flu.c
+++ b/storage/innobase/buf/buf0flu.c
@@ -1248,8 +1248,12 @@ buf_flush_try_neighbors(
 /*====================*/
 	ulint		space,		/*!< in: space id */
 	ulint		offset,		/*!< in: page offset */
-	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU or
+	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
 					BUF_FLUSH_LIST */
+	ulint		n_flushed,	/*!< in: number of pages
+					flushed so far in this batch */
+	ulint		n_to_flush)	/*!< in: maximum number of pages
+					we are allowed to flush */
 {
 	ulint		i;
 	ulint		low;
@@ -1290,6 +1294,21 @@ buf_flush_try_neighbors(
 
 		buf_page_t*	bpage;
 
+		if ((count + n_flushed) >= n_to_flush) {
+
+			/* We have already flushed enough pages and
+			should call it a day. There is, however, one
+			exception. If the page whose neighbors we
+			are flushing has not been flushed yet then
+			we'll try to flush the victim that we
+			selected originally. */
+			if (i <= offset) {
+				i = offset;
+			} else {
+				break;
+			}
+		}
+
 		buf_pool = buf_pool_get(space, i);
 
 		buf_pool_mutex_enter(buf_pool);
@@ -1357,6 +1376,8 @@ buf_flush_page_and_try_neighbors(
 					buf_page_in_file(bpage) */
 	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU
 					or BUF_FLUSH_LIST */
+	ulint		n_to_flush,	/*!< in: number of pages to
+					flush */
 	ulint*		count)		/*!< in/out: number of pages
 					flushed */
 {
@@ -1390,7 +1411,11 @@ buf_flush_page_and_try_neighbors(
 		mutex_exit(block_mutex);
 
 		/* Try to flush also all the neighbors */
-		*count += buf_flush_try_neighbors(space, offset, flush_type);
+		*count += buf_flush_try_neighbors(space,
+						  offset,
+						  flush_type,
+						  *count,
+						  n_to_flush);
 
 		buf_pool_mutex_enter(buf_pool);
 		flushed = TRUE;
@@ -1430,7 +1455,7 @@ buf_flush_LRU_list_batch(
 		a page that isn't ready for flushing. */
 		while (bpage != NULL
 		       && !buf_flush_page_and_try_neighbors(
-				bpage, BUF_FLUSH_LRU, &count)) {
+				bpage, BUF_FLUSH_LRU, max, &count)) {
 
 			bpage = UT_LIST_GET_PREV(LRU, bpage);
 		}
@@ -1511,7 +1536,7 @@ buf_flush_flush_list_batch(
 		while (bpage != NULL
 		       && len > 0
 		       && !buf_flush_page_and_try_neighbors(
-				bpage, BUF_FLUSH_LIST, &count)) {
+				bpage, BUF_FLUSH_LIST, min_n, &count)) {
 
 			buf_flush_list_mutex_enter(buf_pool);
 

From b003544f6511d32320d6240de87823b719ab0a02 Mon Sep 17 00:00:00 2001
From: Calvin Sun <calvin.sun@oracle.com>
Date: Tue, 3 Aug 2010 01:12:03 -0500
Subject: [PATCH 02/18] Bug #54702: revert the default of innodb_strict_mode to
 false.

---
 storage/innobase/handler/ha_innodb.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index ab9df9a0272..e78f167beb6 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -425,7 +425,7 @@ static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG,
 
 static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG,
   "Use strict mode when evaluating create options.",
-  NULL, NULL, TRUE);
+  NULL, NULL, FALSE);
 
 static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
   "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",

From b4a25f462ebd91612d7faeaf0df82ed34d862f38 Mon Sep 17 00:00:00 2001
From: Inaam Rana <inaam.rana@oracle.com>
Date: Thu, 5 Aug 2010 11:09:05 -0400
Subject: [PATCH 03/18] Currently we do a full validation of the adaptive
 hash index (AHI) whenever CHECK TABLE is run on any table. This patch
 restricts the full validation to debug builds.

bug#55716
rb://423
approved by: Marko
---
 storage/innobase/btr/btr0sea.c     | 2 ++
 storage/innobase/ha/ha0ha.c        | 2 ++
 storage/innobase/include/btr0sea.h | 4 ++++
 storage/innobase/include/ha0ha.h   | 2 ++
 4 files changed, 10 insertions(+)

diff --git a/storage/innobase/btr/btr0sea.c b/storage/innobase/btr/btr0sea.c
index 06cc48c7c60..fb667bcae82 100644
--- a/storage/innobase/btr/btr0sea.c
+++ b/storage/innobase/btr/btr0sea.c
@@ -1746,6 +1746,7 @@ function_exit:
 	}
 }
 
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 /********************************************************************//**
 Validates the search system.
 @return	TRUE if ok */
@@ -1913,3 +1914,4 @@ btr_search_validate(void)
 
 	return(ok);
 }
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
diff --git a/storage/innobase/ha/ha0ha.c b/storage/innobase/ha/ha0ha.c
index f9e798012f8..7f11917de0a 100644
--- a/storage/innobase/ha/ha0ha.c
+++ b/storage/innobase/ha/ha0ha.c
@@ -354,6 +354,7 @@ ha_remove_all_nodes_to_page(
 #endif
 }
 
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 /*************************************************************//**
 Validates a given range of the cells in hash table.
 @return	TRUE if ok */
@@ -400,6 +401,7 @@ ha_validate(
 
 	return(ok);
 }
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
 
 /*************************************************************//**
 Prints info of a hash table. */
diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h
index 20a2be7f877..6493689a969 100644
--- a/storage/innobase/include/btr0sea.h
+++ b/storage/innobase/include/btr0sea.h
@@ -180,6 +180,7 @@ btr_search_update_hash_on_delete(
 	btr_cur_t*	cursor);/*!< in: cursor which was positioned on the
 				record to delete using btr_cur_search_...,
 				the record is not yet deleted */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 /********************************************************************//**
 Validates the search system.
 @return	TRUE if ok */
@@ -187,6 +188,9 @@ UNIV_INTERN
 ibool
 btr_search_validate(void);
 /*======================*/
+#else
+# define btr_search_validate()	TRUE
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
 
 /** Flag: has the search system been enabled?
 Protected by btr_search_latch and btr_search_enabled_mutex. */
diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h
index 1ffbd3440aa..3299000bf3c 100644
--- a/storage/innobase/include/ha0ha.h
+++ b/storage/innobase/include/ha0ha.h
@@ -186,6 +186,7 @@ ha_remove_all_nodes_to_page(
 	hash_table_t*	table,	/*!< in: hash table */
 	ulint		fold,	/*!< in: fold value */
 	const page_t*	page);	/*!< in: buffer page */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
 /*************************************************************//**
 Validates a given range of the cells in hash table.
 @return	TRUE if ok */
@@ -196,6 +197,7 @@ ha_validate(
 	hash_table_t*	table,		/*!< in: hash table */
 	ulint		start_index,	/*!< in: start index */
 	ulint		end_index);	/*!< in: end index */
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
 /*************************************************************//**
 Prints info of a hash table. */
 UNIV_INTERN

From a4c5cf7ca9efcf386600c5da5f049dcab9e86046 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= <marko.makela@oracle.com>
Date: Mon, 9 Aug 2010 11:58:37 +0300
Subject: [PATCH 04/18] Reduce the ibuf_mutex hold time. This does not fix the
 update regression in Bug #54914, but it does speed up the execution for
 innodb_change_buffering=inserts.

ibuf_add_ops(), ibuf_merge_or_delete_for_page(),
ibuf_delete_for_discarded_space(): Use atomic built-ins instead of
ibuf_mutex, when available.

ibuf_add_free_page(), ibuf_remove_free_page(), ibuf_contract_ext():
Release ibuf_mutex earlier.

ibuf_free_excess_pages(): Release ibuf_mutex before a conditional branch.

ibuf_insert_low(): Release ibuf_mutex before a conditional
branch. Create ibuf_entry before re-acquiring ibuf_mutex. Simplify a
loop to reduce code footprint. Release ibuf_mutex before mtr_commit()
[btr_pcur_close()].

ibuf_is_empty(): Release ibuf_mutex before mtr_commit().
---
 storage/innobase/ibuf/ibuf0ibuf.c | 123 +++++++++++++++++-------------
 1 file changed, 69 insertions(+), 54 deletions(-)

diff --git a/storage/innobase/ibuf/ibuf0ibuf.c b/storage/innobase/ibuf/ibuf0ibuf.c
index dc8e61e5070..1d162f82b93 100644
--- a/storage/innobase/ibuf/ibuf0ibuf.c
+++ b/storage/innobase/ibuf/ibuf0ibuf.c
@@ -1350,10 +1350,18 @@ ibuf_add_ops(
 	const ulint*	ops)	/*!< in: operation counts */
 
 {
+#ifndef HAVE_ATOMIC_BUILTINS
+	ut_ad(mutex_own(&ibuf_mutex));
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
 	ulint	i;
 
 	for (i = 0; i < IBUF_OP_COUNT; i++) {
+#ifdef HAVE_ATOMIC_BUILTINS
+		os_atomic_increment_ulint(&arr[i], ops[i]);
+#else /* HAVE_ATOMIC_BUILTINS */
 		arr[i] += ops[i];
+#endif /* HAVE_ATOMIC_BUILTINS */
 	}
 }
 
@@ -2096,13 +2104,13 @@ ibuf_add_free_page(void)
 	bitmap_page = ibuf_bitmap_get_map_page(
 		IBUF_SPACE_ID, page_no, zip_size, &mtr);
 
+	mutex_exit(&ibuf_mutex);
+
 	ibuf_bitmap_page_set_bits(
 		bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, TRUE, &mtr);
 
 	mtr_commit(&mtr);
 
-	mutex_exit(&ibuf_mutex);
-
 	ibuf_exit();
 
 	return(DB_SUCCESS);
@@ -2158,6 +2166,8 @@ ibuf_remove_free_page(void)
 
 	root = ibuf_tree_root_get(&mtr2);
 
+	mutex_exit(&ibuf_mutex);
+
 	page_no = flst_get_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
 				&mtr2).page;
 
@@ -2166,7 +2176,6 @@ ibuf_remove_free_page(void)
 	is a level 2 page. */
 
 	mtr_commit(&mtr2);
-	mutex_exit(&ibuf_mutex);
 
 	ibuf_exit();
 
@@ -2220,6 +2229,8 @@ ibuf_remove_free_page(void)
 	bitmap_page = ibuf_bitmap_get_map_page(
 		IBUF_SPACE_ID, page_no, zip_size, &mtr);
 
+	mutex_exit(&ibuf_mutex);
+
 	ibuf_bitmap_page_set_bits(
 		bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, FALSE, &mtr);
 
@@ -2228,8 +2239,6 @@ ibuf_remove_free_page(void)
 #endif
 	mtr_commit(&mtr);
 
-	mutex_exit(&ibuf_mutex);
-
 	ibuf_exit();
 }
 
@@ -2270,17 +2279,16 @@ ibuf_free_excess_pages(void)
 
 	for (i = 0; i < 4; i++) {
 
+		ibool	too_much_free;
+
 		mutex_enter(&ibuf_mutex);
+		too_much_free = ibuf_data_too_much_free();
+		mutex_exit(&ibuf_mutex);
 
-		if (!ibuf_data_too_much_free()) {
-
-			mutex_exit(&ibuf_mutex);
-
+		if (!too_much_free) {
 			return;
 		}
 
-		mutex_exit(&ibuf_mutex);
-
 		ibuf_remove_free_page();
 	}
 }
@@ -2486,8 +2494,8 @@ ibuf_contract_ext(
 	mutex_enter(&ibuf_mutex);
 
 	if (ibuf->empty) {
-ibuf_is_empty:
 		mutex_exit(&ibuf_mutex);
+ibuf_is_empty:
 
 #if 0 /* TODO */
 		if (srv_shutdown_state) {
@@ -2515,6 +2523,7 @@ ibuf_is_empty:
 	position within the leaf */
 
 	btr_pcur_open_at_rnd_pos(ibuf->index, BTR_SEARCH_LEAF, &pcur, &mtr);
+	mutex_exit(&ibuf_mutex);
 
 	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
 
@@ -2535,8 +2544,6 @@ ibuf_is_empty:
 		goto ibuf_is_empty;
 	}
 
-	mutex_exit(&ibuf_mutex);
-
 	sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur),
 					    space_ids, space_versions,
 					    page_nos, &n_stored);
@@ -3304,6 +3311,7 @@ ibuf_insert_low(
 	ulint		n_stored;
 	mtr_t		mtr;
 	mtr_t		bitmap_mtr;
+	ibool		too_big;
 
 	ut_a(!dict_index_is_clust(index));
 	ut_ad(dtuple_check_typed(entry));
@@ -3316,12 +3324,13 @@ ibuf_insert_low(
 	do_merge = FALSE;
 
 	mutex_enter(&ibuf_mutex);
+	too_big = ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT;
+	mutex_exit(&ibuf_mutex);
 
-	if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
+	if (too_big) {
 		/* Insert buffer is now too big, contract it but do not try
 		to insert */
 
-		mutex_exit(&ibuf_mutex);
 
 #ifdef UNIV_IBUF_DEBUG
 		fputs("Ibuf too big\n", stderr);
@@ -3332,40 +3341,6 @@ ibuf_insert_low(
 		return(DB_STRONG_FAIL);
 	}
 
-	mutex_exit(&ibuf_mutex);
-
-	if (mode == BTR_MODIFY_TREE) {
-		mutex_enter(&ibuf_pessimistic_insert_mutex);
-
-		ibuf_enter();
-
-		mutex_enter(&ibuf_mutex);
-
-		while (!ibuf_data_enough_free_for_insert()) {
-
-			mutex_exit(&ibuf_mutex);
-
-			ibuf_exit();
-
-			mutex_exit(&ibuf_pessimistic_insert_mutex);
-
-			err = ibuf_add_free_page();
-
-			if (err == DB_STRONG_FAIL) {
-
-				return(err);
-			}
-
-			mutex_enter(&ibuf_pessimistic_insert_mutex);
-
-			ibuf_enter();
-
-			mutex_enter(&ibuf_mutex);
-		}
-	} else {
-		ibuf_enter();
-	}
-
 	heap = mem_heap_create(512);
 
 	/* Build the entry which contains the space id and the page number
@@ -3384,6 +3359,37 @@ ibuf_insert_low(
 	the new entry to it without exceeding the free space limit for the
 	page. */
 
+	if (mode == BTR_MODIFY_TREE) {
+		for (;;) {
+			mutex_enter(&ibuf_pessimistic_insert_mutex);
+
+			ibuf_enter();
+
+			mutex_enter(&ibuf_mutex);
+
+			if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) {
+
+				break;
+			}
+
+			mutex_exit(&ibuf_mutex);
+
+			ibuf_exit();
+
+			mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+			err = ibuf_add_free_page();
+
+			if (UNIV_UNLIKELY(err == DB_STRONG_FAIL)) {
+
+				mem_heap_free(heap);
+				return(err);
+			}
+		}
+	} else {
+		ibuf_enter();
+	}
+
 	mtr_start(&mtr);
 
 	btr_pcur_open(ibuf->index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
@@ -4118,9 +4124,8 @@ ibuf_delete_rec(
 	btr_pcur_commit_specify_mtr(pcur, mtr);
 
 func_exit:
-	btr_pcur_close(pcur);
-
 	mutex_exit(&ibuf_mutex);
+	btr_pcur_close(pcur);
 
 	return(TRUE);
 }
@@ -4495,6 +4500,11 @@ reset_bit:
 	btr_pcur_close(&pcur);
 	mem_heap_free(heap);
 
+#ifdef HAVE_ATOMIC_BUILTINS
+	os_atomic_increment_ulint(&ibuf->n_merges, 1);
+	ibuf_add_ops(ibuf->n_merged_ops, mops);
+	ibuf_add_ops(ibuf->n_discarded_ops, dops);
+#else /* HAVE_ATOMIC_BUILTINS */
 	/* Protect our statistics keeping from race conditions */
 	mutex_enter(&ibuf_mutex);
 
@@ -4503,6 +4513,7 @@ reset_bit:
 	ibuf_add_ops(ibuf->n_discarded_ops, dops);
 
 	mutex_exit(&ibuf_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
 
 	if (update_ibuf_bitmap && !tablespace_being_deleted) {
 
@@ -4604,10 +4615,14 @@ leave_loop:
 	mtr_commit(&mtr);
 	btr_pcur_close(&pcur);
 
+#ifdef HAVE_ATOMIC_BUILTINS
+	ibuf_add_ops(ibuf->n_discarded_ops, dops);
+#else /* HAVE_ATOMIC_BUILTINS */
 	/* Protect our statistics keeping from race conditions */
 	mutex_enter(&ibuf_mutex);
 	ibuf_add_ops(ibuf->n_discarded_ops, dops);
 	mutex_exit(&ibuf_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
 
 	ibuf_exit();
 
@@ -4652,10 +4667,10 @@ ibuf_is_empty(void)
 		is_empty = FALSE;
 	}
 
-	mtr_commit(&mtr);
-
 	mutex_exit(&ibuf_mutex);
 
+	mtr_commit(&mtr);
+
 	ibuf_exit();
 
 	return(is_empty);

From 271e6ae34117db9475da877beb5ec2a0c7495872 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= <marko.makela@oracle.com>
Date: Tue, 10 Aug 2010 13:22:48 +0300
Subject: [PATCH 05/18] Bug#54914: InnoDB: performance drop with
 innodb_change_buffering=all

Reduce ibuf_mutex and ibuf_pessimistic_insert_mutex contention further.

Protect ibuf->empty by the insert buffer root page latch, not ibuf_mutex.

ibuf_tree_root_get(): Assert that ibuf_mutex is owned by the
caller. Assert that the stamped page number is correct. Assert that
ibuf->empty agrees with the root page.

ibuf_size_update(): Do not update ibuf->empty.

ibuf_init_at_db_start(): Update ibuf->empty while holding the root page latch.

ibuf_add_free_page(): Return TRUE/FALSE instead of DB_SUCCESS/DB_STRONG_FAIL.

ibuf_remove_free_page(): Release ibuf_pessimistic_insert_mutex as
early as possible.

ibuf_contract_ext(): Rely on a dirty read of ibuf->empty, unless the
server is being shut down. Never acquire ibuf_mutex. Eliminate n_stored.

ibuf_contract_after_insert(): Never acquire ibuf_mutex. Perform dirty
reads of ibuf->size and ibuf->max_size.

ibuf_insert_low(): Only acquire ibuf_mutex for mode==BTR_MODIFY_TREE.
Perform dirty reads of ibuf->size and ibuf->max_size. Update
ibuf->empty while holding the root page latch.

ibuf_delete_rec(): Update ibuf->empty while holding the root page latch.

ibuf_is_empty(): Release ibuf_mutex earlier.
---
 storage/innobase/ibuf/ibuf0ibuf.c     | 246 +++++++++++++-------------
 storage/innobase/include/ibuf0ibuf.ic |  11 +-
 2 files changed, 132 insertions(+), 125 deletions(-)

diff --git a/storage/innobase/ibuf/ibuf0ibuf.c b/storage/innobase/ibuf/ibuf0ibuf.c
index 1d162f82b93..a048de0e884 100644
--- a/storage/innobase/ibuf/ibuf0ibuf.c
+++ b/storage/innobase/ibuf/ibuf0ibuf.c
@@ -55,6 +55,7 @@ Created 7/19/1997 Heikki Tuuri
 #include "lock0lock.h"
 #include "log0recv.h"
 #include "que0que.h"
+#include "srv0start.h" /* srv_shutdown_state */
 
 /*	STRUCTURE OF AN INSERT BUFFER RECORD
 
@@ -395,8 +396,10 @@ ibuf_tree_root_get(
 	mtr_t*		mtr)	/*!< in: mtr */
 {
 	buf_block_t*	block;
+	page_t*		root;
 
 	ut_ad(ibuf_inside());
+	ut_ad(mutex_own(&ibuf_mutex));
 
 	mtr_x_lock(dict_index_get_lock(ibuf->index), mtr);
 
@@ -405,7 +408,13 @@ ibuf_tree_root_get(
 
 	buf_block_dbg_add_level(block, SYNC_TREE_NODE);
 
-	return(buf_block_get_frame(block));
+	root = buf_block_get_frame(block);
+
+	ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
+	ut_ad(page_get_page_no(root) == FSP_IBUF_TREE_ROOT_PAGE_NO);
+	ut_ad(ibuf->empty == (page_get_n_recs(root) == 0));
+
+	return(root);
 }
 
 #ifdef UNIV_IBUF_COUNT_DEBUG
@@ -482,8 +491,6 @@ ibuf_size_update(
 
 	/* the '1 +' is the ibuf header page */
 	ibuf->size = ibuf->seg_size - (1 + ibuf->free_list_len);
-
-	ibuf->empty = page_get_n_recs(root) == 0;
 }
 
 /******************************************************************//**
@@ -554,6 +561,7 @@ ibuf_init_at_db_start(void)
 	ibuf_size_update(root, &mtr);
 	mutex_exit(&ibuf_mutex);
 
+	ibuf->empty = (page_get_n_recs(root) == 0);
 	mtr_commit(&mtr);
 
 	ibuf_exit();
@@ -2025,9 +2033,9 @@ ibuf_data_too_much_free(void)
 /*********************************************************************//**
 Allocates a new page from the ibuf file segment and adds it to the free
 list.
-@return	DB_SUCCESS, or DB_STRONG_FAIL if no space left */
+@return	TRUE on success, FALSE if no space left */
 static
-ulint
+ibool
 ibuf_add_free_page(void)
 /*====================*/
 {
@@ -2063,10 +2071,10 @@ ibuf_add_free_page(void)
 		header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP,
 		&mtr);
 
-	if (page_no == FIL_NULL) {
+	if (UNIV_UNLIKELY(page_no == FIL_NULL)) {
 		mtr_commit(&mtr);
 
-		return(DB_STRONG_FAIL);
+		return(FALSE);
 	}
 
 	{
@@ -2113,7 +2121,7 @@ ibuf_add_free_page(void)
 
 	ibuf_exit();
 
-	return(DB_SUCCESS);
+	return(TRUE);
 }
 
 /*********************************************************************//**
@@ -2143,20 +2151,17 @@ ibuf_remove_free_page(void)
 	header_page = ibuf_header_page_get(&mtr);
 
 	/* Prevent pessimistic inserts to insert buffer trees for a while */
-	mutex_enter(&ibuf_pessimistic_insert_mutex);
-
 	ibuf_enter();
-
+	mutex_enter(&ibuf_pessimistic_insert_mutex);
 	mutex_enter(&ibuf_mutex);
 
 	if (!ibuf_data_too_much_free()) {
 
 		mutex_exit(&ibuf_mutex);
+		mutex_exit(&ibuf_pessimistic_insert_mutex);
 
 		ibuf_exit();
 
-		mutex_exit(&ibuf_pessimistic_insert_mutex);
-
 		mtr_commit(&mtr);
 
 		return;
@@ -2218,11 +2223,11 @@ ibuf_remove_free_page(void)
 	flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
 		    page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
 
+	mutex_exit(&ibuf_pessimistic_insert_mutex);
+
 	ibuf->seg_size--;
 	ibuf->free_list_len--;
 
-	mutex_exit(&ibuf_pessimistic_insert_mutex);
-
 	/* Set the bit indicating that this page is no more an ibuf tree page
 	(level 2 page) */
 
@@ -2484,17 +2489,19 @@ ibuf_contract_ext(
 	ulint		page_nos[IBUF_MAX_N_PAGES_MERGED];
 	ulint		space_ids[IBUF_MAX_N_PAGES_MERGED];
 	ib_int64_t	space_versions[IBUF_MAX_N_PAGES_MERGED];
-	ulint		n_stored;
 	ulint		sum_sizes;
 	mtr_t		mtr;
 
 	*n_pages = 0;
 	ut_ad(!ibuf_inside());
 
-	mutex_enter(&ibuf_mutex);
+	/* We perform a dirty read of ibuf->empty, without latching
+	the insert buffer root page. We trust this dirty read except
+	when a slow shutdown is being executed. During a slow
+	shutdown, the insert buffer merge must be completed. */
 
-	if (ibuf->empty) {
-		mutex_exit(&ibuf_mutex);
+	if (UNIV_UNLIKELY(ibuf->empty)
+	    && UNIV_LIKELY(!srv_shutdown_state)) {
 ibuf_is_empty:
 
 #if 0 /* TODO */
@@ -2523,18 +2530,18 @@ ibuf_is_empty:
 	position within the leaf */
 
 	btr_pcur_open_at_rnd_pos(ibuf->index, BTR_SEARCH_LEAF, &pcur, &mtr);
-	mutex_exit(&ibuf_mutex);
 
 	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
 
 	if (page_get_n_recs(btr_pcur_get_page(&pcur)) == 0) {
-		/* When the ibuf tree is emptied completely, the last record
-		is removed using an optimistic delete and ibuf_size_update
-		is not called, causing ibuf->empty to remain FALSE. If we do
-		not reset it to TRUE here then database shutdown will hang
-		in the loop in ibuf_contract_for_n_pages. */
-
-		ibuf->empty = TRUE;
+		/* If a B-tree page is empty, it must be the root page
+		and the whole B-tree must be empty. InnoDB does not
+		allow empty B-tree pages other than the root. */
+		ut_ad(ibuf->empty);
+		ut_ad(page_get_space_id(btr_pcur_get_page(&pcur))
+		      == IBUF_SPACE_ID);
+		ut_ad(page_get_page_no(btr_pcur_get_page(&pcur))
+		      == FSP_IBUF_TREE_ROOT_PAGE_NO);
 
 		ibuf_exit();
 
@@ -2546,10 +2553,10 @@ ibuf_is_empty:
 
 	sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur),
 					    space_ids, space_versions,
-					    page_nos, &n_stored);
+					    page_nos, n_pages);
 #if 0 /* defined UNIV_IBUF_DEBUG */
 	fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n",
-		sync, n_stored, sum_sizes);
+		sync, *n_pages, sum_sizes);
 #endif
 	ibuf_exit();
 
@@ -2557,8 +2564,7 @@ ibuf_is_empty:
 	btr_pcur_close(&pcur);
 
 	buf_read_ibuf_merge_pages(sync, space_ids, space_versions, page_nos,
-				  n_stored);
-	*n_pages = n_stored;
+				  *n_pages);
 
 	return(sum_sizes + 1);
 }
@@ -2628,33 +2634,33 @@ ibuf_contract_after_insert(
 	ibool	sync;
 	ulint	sum_sizes;
 	ulint	size;
+	ulint	max_size;
 
-	mutex_enter(&ibuf_mutex);
-
-	if (ibuf->size < ibuf->max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
-		mutex_exit(&ibuf_mutex);
+	/* Perform dirty reads of ibuf->size and ibuf->max_size, to
+	reduce ibuf_mutex contention. ibuf->max_size remains constant
+	after ibuf_init_at_db_start(), but ibuf->size should be
+	protected by ibuf_mutex. Given that ibuf->size fits in a
+	machine word, this should be OK; at worst we are doing some
+	excessive ibuf_contract() or occasionally skipping a
+	ibuf_contract(). */
+	size = ibuf->size;
+	max_size = ibuf->max_size;
 
+	if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
 		return;
 	}
 
-	sync = FALSE;
-
-	if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_ON_INSERT_SYNC) {
-
-		sync = TRUE;
-	}
-
-	mutex_exit(&ibuf_mutex);
+	sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC);
 
 	/* Contract at least entry_size many bytes */
 	sum_sizes = 0;
 	size = 1;
 
-	while ((size > 0) && (sum_sizes < entry_size)) {
+	do {
 
 		size = ibuf_contract(sync);
 		sum_sizes += size;
-	}
+	} while (size > 0 && sum_sizes < entry_size);
 }
 
 /*********************************************************************//**
@@ -3272,7 +3278,7 @@ ibuf_set_entry_counter(
 /*********************************************************************//**
 Buffer an operation in the insert/delete buffer, instead of doing it
 directly to the disk page, if this is possible.
-@return	DB_SUCCESS, DB_FAIL, DB_STRONG_FAIL */
+@return	DB_SUCCESS, DB_STRONG_FAIL or other error */
 static
 ulint
 ibuf_insert_low(
@@ -3302,6 +3308,7 @@ ibuf_insert_low(
 	rec_t*		ins_rec;
 	ibool		old_bit_value;
 	page_t*		bitmap_page;
+	buf_block_t*	block;
 	page_t*		root;
 	ulint		err;
 	ibool		do_merge;
@@ -3311,7 +3318,6 @@ ibuf_insert_low(
 	ulint		n_stored;
 	mtr_t		mtr;
 	mtr_t		bitmap_mtr;
-	ibool		too_big;
 
 	ut_a(!dict_index_is_clust(index));
 	ut_ad(dtuple_check_typed(entry));
@@ -3323,11 +3329,14 @@ ibuf_insert_low(
 
 	do_merge = FALSE;
 
-	mutex_enter(&ibuf_mutex);
-	too_big = ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT;
-	mutex_exit(&ibuf_mutex);
-
-	if (too_big) {
+	/* Perform dirty reads of ibuf->size and ibuf->max_size, to
+	reduce ibuf_mutex contention. ibuf->max_size remains constant
+	after ibuf_init_at_db_start(), but ibuf->size should be
+	protected by ibuf_mutex. Given that ibuf->size fits in a
+	machine word, this should be OK; at worst we are doing some
+	excessive ibuf_contract() or occasionally skipping a
+	ibuf_contract(). */
+	if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
 		/* Insert buffer is now too big, contract it but do not try
 		to insert */
 
@@ -3361,10 +3370,8 @@ ibuf_insert_low(
 
 	if (mode == BTR_MODIFY_TREE) {
 		for (;;) {
-			mutex_enter(&ibuf_pessimistic_insert_mutex);
-
 			ibuf_enter();
-
+			mutex_enter(&ibuf_pessimistic_insert_mutex);
 			mutex_enter(&ibuf_mutex);
 
 			if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) {
@@ -3373,17 +3380,13 @@ ibuf_insert_low(
 			}
 
 			mutex_exit(&ibuf_mutex);
-
+			mutex_exit(&ibuf_pessimistic_insert_mutex);
 			ibuf_exit();
 
-			mutex_exit(&ibuf_pessimistic_insert_mutex);
-
-			err = ibuf_add_free_page();
-
-			if (UNIV_UNLIKELY(err == DB_STRONG_FAIL)) {
+			if (UNIV_UNLIKELY(!ibuf_add_free_page())) {
 
 				mem_heap_free(heap);
-				return(err);
+				return(DB_STRONG_FAIL);
 			}
 		}
 	} else {
@@ -3423,9 +3426,14 @@ ibuf_insert_low(
 		before mtr_commit(&mtr).  We must not mtr_commit(&mtr)
 		until after the IBUF_OP_DELETE has been buffered. */
 
-		err = DB_STRONG_FAIL;
+fail_exit:
+		if (mode == BTR_MODIFY_TREE) {
+			mutex_exit(&ibuf_mutex);
+			mutex_exit(&ibuf_pessimistic_insert_mutex);
+		}
 
-		goto function_exit;
+		err = DB_STRONG_FAIL;
+		goto func_exit;
 	}
 
 	/* After this point, the page could still be loaded to the
@@ -3471,9 +3479,7 @@ ibuf_insert_low(
 				space_ids, space_versions,
 				page_nos, &n_stored);
 
-			err = DB_STRONG_FAIL;
-
-			goto function_exit;
+			goto fail_exit;
 		}
 	}
 
@@ -3484,11 +3490,9 @@ ibuf_insert_low(
 	    && !ibuf_set_entry_counter(ibuf_entry, space, page_no, &pcur,
 				       mode == BTR_MODIFY_PREV, &mtr)) {
 bitmap_fail:
-		err = DB_STRONG_FAIL;
-
 		mtr_commit(&bitmap_mtr);
 
-		goto function_exit;
+		goto fail_exit;
 	}
 
 	/* Set the bitmap bit denoting that the insert buffer contains
@@ -3512,10 +3516,19 @@ bitmap_fail:
 		err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG, cursor,
 						ibuf_entry, &ins_rec,
 						&dummy_big_rec, 0, thr, &mtr);
-		if (err == DB_SUCCESS && op != IBUF_OP_DELETE) {
-			/* Update the page max trx id field */
-			page_update_max_trx_id(btr_cur_get_block(cursor), NULL,
-					       thr_get_trx(thr)->id, &mtr);
+		block = btr_cur_get_block(cursor);
+		ut_ad(buf_block_get_space(block) == IBUF_SPACE_ID);
+
+		/* If this is the root page, update ibuf->empty. */
+		if (UNIV_UNLIKELY(buf_block_get_page_no(block)
+				  == FSP_IBUF_TREE_ROOT_PAGE_NO)) {
+			const page_t*	root = buf_block_get_frame(block);
+
+			ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
+			ut_ad(page_get_page_no(root)
+			      == FSP_IBUF_TREE_ROOT_PAGE_NO);
+
+			ibuf->empty = (page_get_n_recs(root) == 0);
 		}
 	} else {
 		ut_ad(mode == BTR_MODIFY_TREE);
@@ -3532,16 +3545,22 @@ bitmap_fail:
 						 cursor,
 						 ibuf_entry, &ins_rec,
 						 &dummy_big_rec, 0, thr, &mtr);
-		if (err == DB_SUCCESS && op != IBUF_OP_DELETE) {
-			/* Update the page max trx id field */
-			page_update_max_trx_id(btr_cur_get_block(cursor), NULL,
-					       thr_get_trx(thr)->id, &mtr);
-		}
-
+		mutex_exit(&ibuf_pessimistic_insert_mutex);
 		ibuf_size_update(root, &mtr);
+		mutex_exit(&ibuf_mutex);
+		ibuf->empty = (page_get_n_recs(root) == 0);
+
+		block = btr_cur_get_block(cursor);
+		ut_ad(buf_block_get_space(block) == IBUF_SPACE_ID);
 	}
 
-function_exit:
+	if (err == DB_SUCCESS && op != IBUF_OP_DELETE) {
+		/* Update the page max trx id field */
+		page_update_max_trx_id(block, NULL,
+				       thr_get_trx(thr)->id, &mtr);
+	}
+
+func_exit:
 #ifdef UNIV_IBUF_COUNT_DEBUG
 	if (err == DB_SUCCESS) {
 		fprintf(stderr,
@@ -3553,11 +3572,6 @@ function_exit:
 			       ibuf_count_get(space, page_no) + 1);
 	}
 #endif
-	if (mode == BTR_MODIFY_TREE) {
-
-		mutex_exit(&ibuf_mutex);
-		mutex_exit(&ibuf_pessimistic_insert_mutex);
-	}
 
 	mtr_commit(&mtr);
 	btr_pcur_close(&pcur);
@@ -3565,16 +3579,8 @@ function_exit:
 
 	mem_heap_free(heap);
 
-	if (err == DB_SUCCESS) {
-		mutex_enter(&ibuf_mutex);
-
-		ibuf->empty = FALSE;
-
-		mutex_exit(&ibuf_mutex);
-
-		if (mode == BTR_MODIFY_TREE) {
-			ibuf_contract_after_insert(entry_size);
-		}
+	if (err == DB_SUCCESS && mode == BTR_MODIFY_TREE) {
+		ibuf_contract_after_insert(entry_size);
 	}
 
 	if (do_merge) {
@@ -4081,6 +4087,22 @@ ibuf_delete_rec(
 	success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), mtr);
 
 	if (success) {
+		if (UNIV_UNLIKELY(!page_get_n_recs(btr_pcur_get_page(pcur)))) {
+			/* If a B-tree page is empty, it must be the root page
+			and the whole B-tree must be empty. InnoDB does not
+			allow empty B-tree pages other than the root. */
+			root = btr_pcur_get_page(pcur);
+
+			ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
+			ut_ad(page_get_page_no(root)
+			      == FSP_IBUF_TREE_ROOT_PAGE_NO);
+
+			/* ibuf->empty is protected by the root page latch.
+			Before the deletion, it had to be FALSE. */
+			ut_ad(!ibuf->empty);
+			ibuf->empty = TRUE;
+		}
+
 #ifdef UNIV_IBUF_COUNT_DEBUG
 		fprintf(stderr,
 			"Decrementing ibuf count of space %lu page %lu\n"
@@ -4108,6 +4130,7 @@ ibuf_delete_rec(
 	if (!ibuf_restore_pos(space, page_no, search_tuple,
 			      BTR_MODIFY_TREE, pcur, mtr)) {
 
+		mutex_exit(&ibuf_mutex);
 		goto func_exit;
 	}
 
@@ -4121,10 +4144,12 @@ ibuf_delete_rec(
 	ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) - 1);
 #endif
 	ibuf_size_update(root, mtr);
+	mutex_exit(&ibuf_mutex);
+
+	ibuf->empty = (page_get_n_recs(root) == 0);
 	btr_pcur_commit_specify_mtr(pcur, mtr);
 
 func_exit:
-	mutex_exit(&ibuf_mutex);
 	btr_pcur_close(pcur);
 
 	return(TRUE);
@@ -4642,37 +4667,18 @@ ibuf_is_empty(void)
 	mtr_t		mtr;
 
 	ibuf_enter();
-
-	mutex_enter(&ibuf_mutex);
-
 	mtr_start(&mtr);
 
+	mutex_enter(&ibuf_mutex);
 	root = ibuf_tree_root_get(&mtr);
-
-	if (page_get_n_recs(root) == 0) {
-
-		is_empty = TRUE;
-
-		if (ibuf->empty == FALSE) {
-			fprintf(stderr,
-				"InnoDB: Warning: insert buffer tree is empty"
-				" but the data struct does not\n"
-				"InnoDB: know it. This condition is legal"
-				" if the master thread has not yet\n"
-				"InnoDB: run to completion.\n");
-		}
-	} else {
-		ut_a(ibuf->empty == FALSE);
-
-		is_empty = FALSE;
-	}
-
 	mutex_exit(&ibuf_mutex);
 
+	is_empty = (page_get_n_recs(root) == 0);
+	/* ibuf->empty is protected by the root page latch. */
+	ut_a(is_empty == ibuf->empty);
 	mtr_commit(&mtr);
-
 	ibuf_exit();
 
 	return(is_empty);
 }
 
diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic
index aee27cf9739..e3fa6e3e929 100644
--- a/storage/innobase/include/ibuf0ibuf.ic
+++ b/storage/innobase/include/ibuf0ibuf.ic
@@ -46,11 +46,12 @@ struct ibuf_struct{
 	ulint		seg_size;	/*!< allocated pages of the file
 					segment containing ibuf header and
 					tree */
-	ibool		empty;		/*!< after an insert to the ibuf tree
-					is performed, this is set to FALSE,
-					and if a contract operation finds
-					the tree empty, this is set to
-					TRUE */
+	ibool		empty;		/*!< Protected by the page
+					latch of the root page of the
+					insert buffer tree
+					(FSP_IBUF_TREE_ROOT_PAGE_NO). TRUE
+					if and only if the insert
+					buffer tree is empty. */
 	ulint		free_list_len;	/*!< length of the free list */
 	ulint		height;		/*!< tree height */
 	dict_index_t*	index;		/*!< insert buffer index */

From 9d2a49d16d8f1a4bf0fb97c3718f05a59e9ed718 Mon Sep 17 00:00:00 2001
From: Vasil Dimov <vasil.dimov@oracle.com>
Date: Tue, 10 Aug 2010 17:18:21 +0300
Subject: [PATCH 06/18] Adjust tree name in .bzr-mysql/default.conf after
 rename

---
 .bzr-mysql/default.conf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.bzr-mysql/default.conf b/.bzr-mysql/default.conf
index df9a60f35ad..255e320de4a 100644
--- a/.bzr-mysql/default.conf
+++ b/.bzr-mysql/default.conf
@@ -1,4 +1,4 @@
 [MYSQL]
 post_commit_to = commits@lists.mysql.com, innodb_dev_ww@oracle.com
 post_push_to = commits@lists.mysql.com, innodb_dev_ww@oracle.com
-tree_name = "mysql-trunk-innodb"
+tree_name = "mysql-5.5-innodb"

From 34a05995dfd2b9248bc78f06f291c59fc6d456fb Mon Sep 17 00:00:00 2001
From: Sunny Bains <Sunny.Bains@Oracle.Com>
Date: Thu, 12 Aug 2010 20:00:07 +1000
Subject: [PATCH 07/18] Fix bug #52263	innodb does not compile on OpenSolaris
 with gcc4.3.2

Disable the GCC visibility attribute on all sun platforms.

Approved by Marko on IRC.
---
 storage/innobase/include/univ.i | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
index ac87942f255..5a5af76e175 100644
--- a/storage/innobase/include/univ.i
+++ b/storage/innobase/include/univ.i
@@ -254,8 +254,10 @@ by one. */
 			option off; also some ibuf tests are suppressed */
 
 /* Linkage specifier for non-static InnoDB symbols (variables and functions)
-that are only referenced from within InnoDB, not from MySQL */
-#if defined(__GNUC__) && (__GNUC__ >= 4) || defined(__INTEL_COMPILER)
+that are only referenced from within InnoDB, not from MySQL. We disable the
+GCC visibility directive on all Sun operating systems because there is no
+easy way to get it to work. See http://bugs.mysql.com/bug.php?id=52263. */
+#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(sun) || defined(__INTEL_COMPILER)
 # define UNIV_INTERN __attribute__((visibility ("hidden")))
 #else
 # define UNIV_INTERN

From 50af6a8aea0d74789bd6d210e874f34a932c7268 Mon Sep 17 00:00:00 2001
From: Inaam Rana <inaam.rana@oracle.com>
Date: Fri, 13 Aug 2010 12:14:59 -0400
Subject: [PATCH 08/18] Undo changes to innodb_strict_mode that went in by
 mistake in r3149

---
 storage/innobase/handler/ha_innodb.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index e78f167beb6..ab9df9a0272 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -425,7 +425,7 @@ static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG,
 
 static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG,
   "Use strict mode when evaluating create options.",
-  NULL, NULL, FALSE);
+  NULL, NULL, TRUE);
 
 static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
   "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",

From 1a649603f81c376fd8860fc636d2f4f185a5c945 Mon Sep 17 00:00:00 2001
From: Inaam Rana <inaam.rana@oracle.com>
Date: Fri, 13 Aug 2010 15:07:22 -0400
Subject: [PATCH 09/18] Change default for innodb_strict_mode to FALSE. Note
 that this was originally pushed by Calvin but was later reverted by
 mistake.

bug#54702
---
 .../sys_vars/r/innodb_strict_mode_basic.result   | 16 ++++++++--------
 storage/innobase/handler/ha_innodb.cc            |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/mysql-test/suite/sys_vars/r/innodb_strict_mode_basic.result b/mysql-test/suite/sys_vars/r/innodb_strict_mode_basic.result
index 5e55faa99c9..200f9166215 100644
--- a/mysql-test/suite/sys_vars/r/innodb_strict_mode_basic.result
+++ b/mysql-test/suite/sys_vars/r/innodb_strict_mode_basic.result
@@ -1,32 +1,32 @@
 SET @start_global_value = @@global.innodb_strict_mode;
 SELECT @start_global_value;
 @start_global_value
-1
+0
 Valid values are 'ON' and 'OFF' 
 select @@global.innodb_strict_mode in (0, 1);
 @@global.innodb_strict_mode in (0, 1)
 1
 select @@global.innodb_strict_mode;
 @@global.innodb_strict_mode
-1
+0
 select @@session.innodb_strict_mode in (0, 1);
 @@session.innodb_strict_mode in (0, 1)
 1
 select @@session.innodb_strict_mode;
 @@session.innodb_strict_mode
-1
+0
 show global variables like 'innodb_strict_mode';
 Variable_name	Value
-innodb_strict_mode	ON
+innodb_strict_mode	OFF
 show session variables like 'innodb_strict_mode';
 Variable_name	Value
-innodb_strict_mode	ON
+innodb_strict_mode	OFF
 select * from information_schema.global_variables where variable_name='innodb_strict_mode';
 VARIABLE_NAME	VARIABLE_VALUE
-INNODB_STRICT_MODE	ON
+INNODB_STRICT_MODE	OFF
 select * from information_schema.session_variables where variable_name='innodb_strict_mode';
 VARIABLE_NAME	VARIABLE_VALUE
-INNODB_STRICT_MODE	ON
+INNODB_STRICT_MODE	OFF
 set global innodb_strict_mode='OFF';
 set session innodb_strict_mode='OFF';
 select @@global.innodb_strict_mode;
@@ -117,4 +117,4 @@ INNODB_STRICT_MODE	ON
 SET @@global.innodb_strict_mode = @start_global_value;
 SELECT @@global.innodb_strict_mode;
 @@global.innodb_strict_mode
-1
+0
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index ab9df9a0272..e78f167beb6 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -425,7 +425,7 @@ static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG,
 
 static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG,
   "Use strict mode when evaluating create options.",
-  NULL, NULL, TRUE);
+  NULL, NULL, FALSE);
 
 static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
   "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",

From 7f62ec7b38a3bceb23132dad117a7e4f8b592898 Mon Sep 17 00:00:00 2001
From: Vasil Dimov <vasil.dimov@oracle.com>
Date: Mon, 16 Aug 2010 17:23:29 +0300
Subject: [PATCH 10/18] Fix Bug#53761 RANGE estimation for matched rows may be
 200 times different

Improve the range estimation algorithm.

Previously:
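For a given level the algo knows the number of pages in the requested range and the number of records on the leftmost and the rightmost page. Then it assumes all pages in between contain the average between the two border pages and multiplies this average number by the number of intermediate pages.

With this change:
Same idea, but peek a few (10) of the intermediate pages to get a better estimate of the average number of records per page. If there are less than 10 intermediate pages then all of them will be scanned and the result will be precise, not an estimation.

In the bug report one of the examples has a btree with a snippet of the leaf level like this:
page1(899 records), page2(1 record), page3(1 record), page4(1 record)
so when trying to estimate, the previous algo, assumed there are average (899+1)/2=450 records per page which went terribly wrong. With this change page2 and page3 will be read and the exact number of records will be returned.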
Fix Bug#53761 RANGE estimation for matched rows may be 200 times different

Improve the range estimation algorithm.

Previously:
For a given level the algo knows the number of pages in the requested range
and the number of records on the leftmost and the rightmost page. Then it
assumes all pages in between contain the average between the two border pages
and multiplies this average number by the number of intermediate pages.

With this change:
Same idea, but peek a few (10) of the intermediate pages to get a better
estimate of the average number of records per page. If there are less than 10
intermediate pages then all of them will be scanned and the result will be
precise, not an estimation.

In the bug report one of the examples has a btree with a snippet of the leaf
level like this:
page1(899 records), page2(1 record), page3(1 record), page4(1 record)
so when trying to estimate, the previous algo, assumed there are average
(899+1)/2=450 records per page which went terribly wrong. With this change
page2 and page3 will be read and the exact number of records will be returned.

Approved by:	Sunny (rb://401)
---
 storage/innobase/btr/btr0cur.c     | 176 +++++++++++++++++++++++++++--
 storage/innobase/include/btr0cur.h |   5 +
 2 files changed, 172 insertions(+), 9 deletions(-)

diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c
index 537d5f51184..2549589b0c7 100644
--- a/storage/innobase/btr/btr0cur.c
+++ b/storage/innobase/btr/btr0cur.c
@@ -3153,6 +3153,7 @@ btr_cur_add_path_info(
 {
 	btr_path_t*	slot;
 	rec_t*		rec;
+	page_t*		page;
 
 	ut_a(cursor->path_arr);
 
@@ -3175,8 +3176,155 @@ btr_cur_add_path_info(
 
 	slot = cursor->path_arr + (root_height - height);
 
+	page = page_align(rec);
+
 	slot->nth_rec = page_rec_get_n_recs_before(rec);
-	slot->n_recs = page_get_n_recs(page_align(rec));
+	slot->n_recs = page_get_n_recs(page);
+	slot->page_no = page_get_page_no(page);
+	slot->page_level = btr_page_get_level_low(page);
+}
+
+/*******************************************************************//**
+Estimate the number of rows between slot1 and slot2 for any level on a
+B-tree. This function starts from slot1->page and reads a few pages to
+the right, counting their records. If we reach slot2->page quickly then
+we know exactly how many records there are between slot1 and slot2 and
+we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
+then we calculate the average number of records in the pages scanned
+so far and assume that all pages that we did not scan up to slot2->page
+contain the same number of records, then we multiply that average to
+the number of pages between slot1->page and slot2->page (which is
+n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
+@return	number of rows (exact or estimated) */
+static
+ib_int64_t
+btr_estimate_n_rows_in_range_on_level(
+/*==================================*/
+	dict_index_t*	index,			/*!< in: index */
+	btr_path_t*	slot1,			/*!< in: left border */
+	btr_path_t*	slot2,			/*!< in: right border */
+	ib_int64_t	n_rows_on_prev_level,	/*!< in: number of rows
+						on the previous level for the
+						same descend paths; used to
+						determine the number of pages
+						on this level */
+	ibool*		is_n_rows_exact)	/*!< out: TRUE if the returned
+						value is exact i.e. not an
+						estimation */
+{
+	ulint		space;
+	ib_int64_t	n_rows;
+	ulint		n_pages_read;
+	ulint		page_no;
+	ulint		zip_size;
+	ulint		level;
+
+	space = dict_index_get_space(index);
+
+	n_rows = 0;
+	n_pages_read = 0;
+
+	/* Assume by default that we will scan all pages between
+	slot1->page_no and slot2->page_no */
+	*is_n_rows_exact = TRUE;
+
+	/* add records from slot1->page_no which are to the right of
+	the record which serves as a left border of the range, if any */
+	if (slot1->nth_rec < slot1->n_recs) {
+		n_rows += slot1->n_recs - slot1->nth_rec;
+	}
+
+	/* add records from slot2->page_no which are to the left of
+	the record which serves as a right border of the range, if any */
+	if (slot2->nth_rec > 1) {
+		n_rows += slot2->nth_rec - 1;
+	}
+
+	/* count the records in the pages between slot1->page_no and
+	slot2->page_no (non inclusive), if any */
+
+	zip_size = fil_space_get_zip_size(space);
+
+	/* Do not read more than this number of pages in order not to hurt
+	performance with this code which is just an estimation. If we read
+	this many pages before reaching slot2->page_no then we estimate the
+	average from the pages scanned so far */
+	#define N_PAGES_READ_LIMIT	10
+
+	page_no = slot1->page_no;
+	level = slot1->page_level;
+
+	do {
+		mtr_t		mtr;
+		page_t*		page;
+		buf_block_t*	block;
+
+		mtr_start(&mtr);
+
+		/* fetch the page */
+		block = buf_page_get(space, zip_size, page_no, RW_S_LATCH,
+				     &mtr);
+
+		page = buf_block_get_frame(block);
+
+		/* It is possible that the tree has been reorganized in the
+		meantime and this is a different page. If this happens the
+		calculated estimate will be bogus, which is not fatal as
+		this is only an estimate. We are sure that a page with
+		page_no exists because InnoDB never frees pages, only
+		reuses them. */
+		if (fil_page_get_type(page) != FIL_PAGE_INDEX
+		    || btr_page_get_index_id(page) != index->id
+		    || btr_page_get_level_low(page) != level) {
+			/* The page got reused: release the latch and bail out */
+			mtr_commit(&mtr);
+			goto inexact;
+		}
+
+		n_pages_read++;
+
+		if (page_no != slot1->page_no) {
+			/* Do not count the records on slot1->page_no,
+			we already counted them before this loop. */
+			n_rows += page_get_n_recs(page);
+		}
+
+		page_no = btr_page_get_next(page, &mtr);
+
+		mtr_commit(&mtr);
+
+		if (n_pages_read == N_PAGES_READ_LIMIT
+		    || page_no == FIL_NULL) {
+			/* Either we read too many pages or
+			we reached the end of the level without passing
+			through slot2->page_no, the tree must have changed
+			in the meantime */
+			goto inexact;
+		}
+
+	} while (page_no != slot2->page_no);
+
+	return(n_rows);
+
+inexact:
+
+	*is_n_rows_exact = FALSE;
+
+	/* We did interrupt before reaching slot2->page */
+
+	if (n_pages_read > 0) {
+		/* The number of pages on this level is
+		n_rows_on_prev_level, multiply it by the
+		average number of recs per page so far */
+		n_rows = n_rows_on_prev_level
+			* n_rows / n_pages_read;
+	} else {
+		/* The tree changed before we could even
+		start with slot1->page_no */
+		n_rows = 10;
+	}
+
+	return(n_rows);
+}
 
 /*******************************************************************//**
@@ -3201,6 +3349,7 @@ btr_estimate_n_rows_in_range(
 	ibool		diverged_lot;
 	ulint		divergence_level;
 	ib_int64_t	n_rows;
+	ibool		is_n_rows_exact;
 	ulint		i;
 	mtr_t		mtr;
 
@@ -3243,6 +3392,7 @@ btr_estimate_n_rows_in_range(
 	/* We have the path information for the range in path1 and path2 */
 
 	n_rows = 1;
+	is_n_rows_exact = TRUE;
 	diverged = FALSE;	    /* This becomes true when the path is not
 				    the same any more */
 	diverged_lot = FALSE;	    /* This becomes true when the paths are
@@ -3258,7 +3408,7 @@ btr_estimate_n_rows_in_range(
 		if (slot1->nth_rec == ULINT_UNDEFINED
 		    || slot2->nth_rec == ULINT_UNDEFINED) {
 
-			if (i > divergence_level + 1) {
+			if (i > divergence_level + 1 && !is_n_rows_exact) {
 				/* In trees whose height is > 1 our algorithm
 				tends to underestimate: multiply the estimate
 				by 2: */
@@ -3270,7 +3420,9 @@ btr_estimate_n_rows_in_range(
 			to over 1 / 2 of the estimated rows in the whole
 			table */
 
-			if (n_rows > index->table->stat_n_rows / 2) {
+			if (n_rows > index->table->stat_n_rows / 2
+			    && !is_n_rows_exact) {
+
 				n_rows = index->table->stat_n_rows / 2;
 
 				/* If there are just 0 or 1 rows in the table,
@@ -3296,10 +3448,15 @@ btr_estimate_n_rows_in_range(
 					divergence_level = i;
 				}
 			} else {
-				/* Maybe the tree has changed between
-				searches */
-
-				return(10);
+				/* It is possible that
+				slot1->nth_rec >= slot2->nth_rec
+				if, for example, we have a single page
+				tree which contains (inf, 5, 6, supr)
+				and we select where x > 20 and x < 30;
+				in this case slot1->nth_rec will point
+				to the supr record and slot2->nth_rec
+				will point to 6 */
+				n_rows = 0;
 			}
 
 		} else if (diverged && !diverged_lot) {
@@ -3323,8 +3480,9 @@ btr_estimate_n_rows_in_range(
 			}
 		} else if (diverged_lot) {
 
-			n_rows = (n_rows * (slot1->n_recs + slot2->n_recs))
-				/ 2;
+			n_rows = btr_estimate_n_rows_in_range_on_level(
+				index, slot1, slot2, n_rows,
+				&is_n_rows_exact);
 		}
 	}
 }
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
index 757477838ee..7cafa6e0df5 100644
--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -615,6 +615,11 @@ struct btr_path_struct{
 				order); value ULINT_UNDEFINED
 				denotes array end */
 	ulint	n_recs;		/*!< number of records on the page */
+	ulint	page_no;	/*!< no of the page containing the record */
+	ulint	page_level;	/*!< level of the page, if later we fetch
+				the page under page_no and it is on a different
+				level then we know that the tree has been
+				reorganized */
 };
 
 #define BTR_PATH_ARRAY_N_SLOTS	250	/*!< size of path array (in slots) */

From 393aaa4c515f724f110e962680db6148c0cf2a0a Mon Sep 17 00:00:00 2001
From: Vasil Dimov <vasil.dimov@oracle.com>
Date: Tue, 17 Aug 2010 09:17:04 +0300
Subject: [PATCH 11/18] Adjust innodb_mysql.result

This is a followup to vasil.dimov@oracle.com-20100816142329-yimenbuktd416z1a
which improved the sampling algorithm. I have manually checked that the new
values are actually the correct ones, for example:
-rows	16
+rows	32
the number of rows returned by the query is 32.
---
 mysql-test/suite/innodb/r/innodb_mysql.result | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/mysql-test/suite/innodb/r/innodb_mysql.result b/mysql-test/suite/innodb/r/innodb_mysql.result
index 9a677f83080..51beed66f0b 100644
--- a/mysql-test/suite/innodb/r/innodb_mysql.result
+++ b/mysql-test/suite/innodb/r/innodb_mysql.result
@@ -889,13 +889,13 @@ EXPLAIN SELECT * FROM t1 WHERE b BETWEEN 1 AND 2 ORDER BY a;
 id	1
 select_type	SIMPLE
 table	t1
-type	range
+type	index
 possible_keys	bkey
-key	bkey
-key_len	5
+key	PRIMARY
+key_len	4
 ref	NULL
-rows	16
-Extra	Using where; Using index; Using filesort
+rows	32
+Extra	Using where
 SELECT * FROM t1 WHERE b BETWEEN 1 AND 2 ORDER BY a;
 a	b
 1	2
@@ -934,12 +934,12 @@ EXPLAIN SELECT * FROM t1 WHERE b BETWEEN 1 AND 2 ORDER BY b,a;
 id	1
 select_type	SIMPLE
 table	t1
-type	range
+type	index
 possible_keys	bkey
 key	bkey
 key_len	5
 ref	NULL
-rows	16
+rows	32
 Extra	Using where; Using index
 SELECT * FROM t1 WHERE b BETWEEN 1 AND 2 ORDER BY b,a;
 a	b
@@ -989,7 +989,7 @@ possible_keys	bkey
 key	bkey
 key_len	5
 ref	const
-rows	8
+rows	16
 Extra	Using where; Using index; Using filesort
 SELECT * FROM t2 WHERE b=1 ORDER BY a;
 a	b	c
@@ -1018,7 +1018,7 @@ possible_keys	bkey
 key	bkey
 key_len	10
 ref	const,const
-rows	8
+rows	16
 Extra	Using where; Using index
 SELECT * FROM t2 WHERE b=1 AND c=1 ORDER BY a;
 a	b	c
@@ -1047,7 +1047,7 @@ possible_keys	bkey
 key	bkey
 key_len	10
 ref	const,const
-rows	8
+rows	16
 Extra	Using where; Using index
 SELECT * FROM t2 WHERE b=1 AND c=1 ORDER BY b,c,a;
 a	b	c
@@ -1076,7 +1076,7 @@ possible_keys	bkey
 key	bkey
 key_len	10
 ref	const,const
-rows	8
+rows	16
 Extra	Using where; Using index
 SELECT * FROM t2 WHERE b=1 AND c=1 ORDER BY c,a;
 a	b	c
@@ -1213,7 +1213,7 @@ possible_keys	b
 key	b
 key_len	5
 ref	const
-rows	1
+rows	2
 Extra	Using where; Using index
 SELECT * FROM t1 WHERE b=2 ORDER BY a ASC;
 a	b
@@ -1228,7 +1228,7 @@ possible_keys	b
 key	b
 key_len	5
 ref	const
-rows	1
+rows	2
 Extra	Using where; Using index
 SELECT * FROM t1 WHERE b=2 ORDER BY a DESC;
 a	b
@@ -1372,7 +1372,7 @@ INSERT INTO t1 (a,b,c) VALUES (1,1,1), (2,1,1), (3,1,1), (4,1,1);
 INSERT INTO t1 (a,b,c) SELECT a+4,b,c FROM t1;
 EXPLAIN SELECT a, b, c FROM t1 WHERE b = 1 ORDER BY a DESC LIMIT 5;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	index	t1_b	PRIMARY	4	NULL	8	Using where
+1	SIMPLE	t1	range	t1_b	t1_b	5	NULL	8	Using where
 SELECT a, b, c FROM t1 WHERE b = 1 ORDER BY a DESC LIMIT 5;
 a	b	c
 8	1	1
@@ -1735,7 +1735,7 @@ SELECT 1 FROM (SELECT COUNT(DISTINCT c1)
 FROM t1 WHERE c2 IN (1, 1) AND c3 = 2 GROUP BY c2) x;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	PRIMARY	<derived2>	system	NULL	NULL	NULL	NULL	1	
-2	DERIVED	t1	index	c3,c2	c2	10	NULL	5	
+2	DERIVED	t1	ALL	c3,c2	c3	5		5	Using filesort
 DROP TABLE t1;
 CREATE TABLE t1 (c1 REAL, c2 REAL, c3 REAL, KEY (c3), KEY (c2, c3))
 ENGINE=InnoDB;
@@ -1749,7 +1749,7 @@ SELECT 1 FROM (SELECT COUNT(DISTINCT c1)
 FROM t1 WHERE c2 IN (1, 1) AND c3 = 2 GROUP BY c2) x;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	PRIMARY	<derived2>	system	NULL	NULL	NULL	NULL	1	
-2	DERIVED	t1	index	c3,c2	c2	18	NULL	5	
+2	DERIVED	t1	ALL	c3,c2	c3	9		5	Using filesort
 DROP TABLE t1;
 CREATE TABLE t1 (c1 DECIMAL(12,2), c2 DECIMAL(12,2), c3 DECIMAL(12,2), 
 KEY (c3), KEY (c2, c3))
@@ -1764,7 +1764,7 @@ SELECT 1 FROM (SELECT COUNT(DISTINCT c1)
 FROM t1 WHERE c2 IN (1, 1) AND c3 = 2 GROUP BY c2) x;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	PRIMARY	<derived2>	system	NULL	NULL	NULL	NULL	1	
-2	DERIVED	t1	index	c3,c2	c2	14	NULL	5	
+2	DERIVED	t1	ALL	c3,c2	c3	7		5	Using filesort
 DROP TABLE t1;
 End of 5.1 tests
 #
@@ -1871,7 +1871,7 @@ possible_keys	b
 key	b
 key_len	5
 ref	NULL
-rows	3
+rows	5
 Extra	Using where; Using index
 EXPLAIN SELECT c FROM bar WHERE c>2;;
 id	1
@@ -2536,7 +2536,7 @@ f1	f2	f3	f4
 EXPLAIN SELECT * FROM t1 WHERE f2 = 1 AND f4 = TRUE
 ORDER BY f1 DESC LIMIT 5;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	f2,f4	f4	1	NULL	11	Using where
+1	SIMPLE	t1	range	f2,f4	f4	1	NULL	22	Using where
 DROP TABLE t1;
 #
 # Bug#54117 crash in thr_multi_unlock, temporary table

From aed93f872745f6b46b24dc2948a13074d1f368ea Mon Sep 17 00:00:00 2001
From: Vasil Dimov <vasil.dimov@oracle.com>
Date: Tue, 17 Aug 2010 09:24:33 +0300
Subject: [PATCH 12/18] Adjust innodb_gis.result

This is a followup to vasil.dimov@oracle.com-20100816142329-yimenbuktd416z1a
which improved the sampling algorithm.
---
 mysql-test/suite/innodb/r/innodb_gis.result | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mysql-test/suite/innodb/r/innodb_gis.result b/mysql-test/suite/innodb/r/innodb_gis.result
index 0ce1ebe56ad..5712d08c9fa 100644
--- a/mysql-test/suite/innodb/r/innodb_gis.result
+++ b/mysql-test/suite/innodb/r/innodb_gis.result
@@ -572,7 +572,7 @@ COUNT(*)
 EXPLAIN 
 SELECT COUNT(*) FROM t2 WHERE p=POINTFROMTEXT('POINT(1 2)');
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	ref	p	p	28	const	1	Using where
+1	SIMPLE	t2	ref	p	p	28	const	2	Using where
 SELECT COUNT(*) FROM t2 WHERE p=POINTFROMTEXT('POINT(1 2)');
 COUNT(*)
 2

From f0ba35c617945b0b767ce152d7ce0a91124bd13f Mon Sep 17 00:00:00 2001
From: Vasil Dimov <vasil.dimov@oracle.com>
Date: Tue, 17 Aug 2010 09:25:08 +0300
Subject: [PATCH 13/18] Adjust rowid_order_innodb.result

This is a followup to vasil.dimov@oracle.com-20100816142329-yimenbuktd416z1a
which improved the sampling algorithm.
---
 mysql-test/r/rowid_order_innodb.result | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mysql-test/r/rowid_order_innodb.result b/mysql-test/r/rowid_order_innodb.result
index e0796cd7ab5..dc339304041 100644
--- a/mysql-test/r/rowid_order_innodb.result
+++ b/mysql-test/r/rowid_order_innodb.result
@@ -15,7 +15,7 @@ insert into t1 values (-5, 1, 1),
 (10, 1, 1);
 explain select * from t1 force index(key1, key2) where key1 < 3 or key2 < 3;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	index_merge	key1,key2	key1,key2	5,5	NULL	4	Using sort_union(key1,key2); Using where
+1	SIMPLE	t1	index_merge	key1,key2	key1,key2	5,5	NULL	5	Using sort_union(key1,key2); Using where
 select * from t1 force index(key1, key2) where key1 < 3 or key2 < 3;
 pk1	key1	key2
 -100	1	1

From 8e168c5c27711b9ba36ce61ebfa674fc15190330 Mon Sep 17 00:00:00 2001
From: Vasil Dimov <vasil.dimov@oracle.com>
Date: Tue, 17 Aug 2010 09:26:41 +0300
Subject: [PATCH 14/18] Adjust type_bit_innodb.result

This is a followup to vasil.dimov@oracle.com-20100816142329-yimenbuktd416z1a
which improved the sampling algorithm.
---
 mysql-test/r/type_bit_innodb.result | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mysql-test/r/type_bit_innodb.result b/mysql-test/r/type_bit_innodb.result
index a9c3cae1770..909db576b27 100644
--- a/mysql-test/r/type_bit_innodb.result
+++ b/mysql-test/r/type_bit_innodb.result
@@ -233,7 +233,7 @@ a+0	b+0
 127	403
 explain select a+0, b+0 from t1 where a > 40 and b > 200 order by 1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	a	a	2	NULL	19	Using where; Using index; Using filesort
+1	SIMPLE	t1	range	a	a	2	NULL	27	Using where; Using index; Using filesort
 select a+0, b+0 from t1 where a > 40 and b > 200 order by 1;
 a+0	b+0
 44	307

From 524e0dc4d54205bebf7bd5d590246c215f0fb801 Mon Sep 17 00:00:00 2001
From: Vasil Dimov <vasil.dimov@oracle.com>
Date: Tue, 17 Aug 2010 09:34:30 +0300
Subject: [PATCH 15/18] Adjust endspace.result

This is a followup to vasil.dimov@oracle.com-20100816142329-yimenbuktd416z1a
which improved the sampling algorithm. The endspace test is non-deterministic
because it does not include ORDER BY clause in its queries.
---
 mysql-test/r/endspace.result | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mysql-test/r/endspace.result b/mysql-test/r/endspace.result
index 9c8d12362c4..25e2238e7bb 100644
--- a/mysql-test/r/endspace.result
+++ b/mysql-test/r/endspace.result
@@ -107,8 +107,8 @@ concat('|', text1, '|')
 |teststring |
 select concat('|', text1, '|') from t1 where text1='teststring' or text1 like 'teststring_%';
 concat('|', text1, '|')
-|teststring	|
 |teststring|
+|teststring	|
 |teststring |
 select concat('|', text1, '|') from t1 where text1='teststring' or text1 > 'teststring\t';
 concat('|', text1, '|')
@@ -203,13 +203,13 @@ teststring
 teststring 
 select text1, length(text1) from t1 where text1='teststring' or text1 like 'teststring_%';
 text1	length(text1)
-teststring		11
 teststring	10
+teststring		11
 teststring 	11
 select text1, length(text1) from t1 where text1='teststring' or text1 >= 'teststring\t';
 text1	length(text1)
-teststring		11
 teststring	10
+teststring		11
 teststring 	11
 select concat('|', text1, '|') from t1 order by text1;
 concat('|', text1, '|')

From b17b122b7daa2f6fbc04ab7a32269d6f2d22cbfe Mon Sep 17 00:00:00 2001
From: Jimmy Yang <jimmy.yang@oracle.com>
Date: Tue, 17 Aug 2010 01:19:24 -0700
Subject: [PATCH 16/18] Fix bug #53496 Use Lock_time in slow query log output
 for InnoDB row lock wait time. Including the InnoDB lock time in the existing
 "Lock_time" output.

---
 include/mysql/plugin.h                   |  1 +
 include/mysql/plugin.h.pp                |  1 +
 sql/sql_class.cc                         |  5 +++++
 sql/sql_class.h                          |  2 +-
 storage/innobase/handler/ha_innodb.cc    | 14 ++++++++++++++
 storage/innobase/include/ha_prototypes.h |  8 ++++++++
 storage/innobase/srv/srv0srv.c           |  3 +++
 7 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/mysql/plugin.h b/include/mysql/plugin.h
index 19cf0ed050d..15f7d785ead 100644
--- a/include/mysql/plugin.h
+++ b/include/mysql/plugin.h
@@ -528,6 +528,7 @@ long long thd_test_options(const MYSQL_THD thd, long long test_options);
 int thd_sql_command(const MYSQL_THD thd);
 const char *thd_proc_info(MYSQL_THD thd, const char *info);
 void **thd_ha_data(const MYSQL_THD thd, const struct handlerton *hton);
+void thd_storage_lock_wait(MYSQL_THD thd, long long value);
 int thd_tx_isolation(const MYSQL_THD thd);
 char *thd_security_context(MYSQL_THD thd, char *buffer, unsigned int length,
                            unsigned int max_query_len);
diff --git a/include/mysql/plugin.h.pp b/include/mysql/plugin.h.pp
index 3a1b03742da..9d2877be5a2 100644
--- a/include/mysql/plugin.h.pp
+++ b/include/mysql/plugin.h.pp
@@ -154,6 +154,7 @@ long long thd_test_options(const void* thd, long long test_options);
 int thd_sql_command(const void* thd);
 const char *thd_proc_info(void* thd, const char *info);
 void **thd_ha_data(const void* thd, const struct handlerton *hton);
+void thd_storage_lock_wait(void* thd, long long value);
 int thd_tx_isolation(const void* thd);
 char *thd_security_context(void* thd, char *buffer, unsigned int length,
                            unsigned int max_query_len);
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 1bec02afa96..28e86ecc67f 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -307,6 +307,11 @@ void **thd_ha_data(const THD *thd, const struct handlerton *hton)
   return (void **) &thd->ha_data[hton->slot].ha_ptr;
 }
 
+extern "C"
+void thd_storage_lock_wait(THD *thd, long long value)
+{
+  thd->utime_after_lock+= value; /* accumulate the reported lock wait */
+}
 
 /**
   Provide a handler data getter to simplify coding
diff --git a/sql/sql_class.h b/sql/sql_class.h
index c095fee6232..b135af41af0 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -1505,7 +1505,7 @@ public:
   // track down slow pthread_create
   ulonglong  prior_thr_create_utime, thr_create_utime;
   ulonglong  start_utime, utime_after_lock;
-  
+
   thr_lock_type update_lock_default;
   Delayed_insert *di;
 
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index e78f167beb6..a004cba9603 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -807,6 +807,20 @@ thd_lock_wait_timeout(
 	return(THDVAR((THD*) thd, lock_wait_timeout));
 }
 
+/******************************************************************//**
+Add value to the current query's lock wait time; a NULL thd is ignored. */
+extern "C" UNIV_INTERN
+void
+thd_set_lock_wait_time(
+/*===================*/
+	void*	thd,	/*!< in: thread handle (THD*) */
+	ulint	value)	/*!< in: time waited for the lock */
+{
+	if (thd) {
+		thd_storage_lock_wait((THD*)thd, value);
+	}
+}
+
 /********************************************************************//**
 Obtain the InnoDB transaction of a MySQL thread.
 @return	reference to transaction pointer */
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
index a9ee1d66b99..b75002944bd 100644
--- a/storage/innobase/include/ha_prototypes.h
+++ b/storage/innobase/include/ha_prototypes.h
@@ -267,5 +267,13 @@ thd_lock_wait_timeout(
 /*==================*/
 	void*	thd);	/*!< in: thread handle (THD*), or NULL to query
 			the global innodb_lock_wait_timeout */
+/******************************************************************//**
+Add up the time waited for the lock for the current query. */
+UNIV_INTERN
+void
+thd_set_lock_wait_time(
+/*===================*/
+        void*   thd,	/*!< in: thread handle (THD*) */
+        ulint   value);	/*!< in: time waited for the lock */
 
 #endif
diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c
index 97d699dde99..bea8d7f8fdc 100644
--- a/storage/innobase/srv/srv0srv.c
+++ b/storage/innobase/srv/srv0srv.c
@@ -1643,6 +1643,9 @@ srv_suspend_mysql_thread(
 		    start_time != -1 && finish_time != -1) {
 			srv_n_lock_max_wait_time = diff_time;
 		}
+
+		/* Record the lock wait time for this thread */
+		thd_set_lock_wait_time(trx->mysql_thd, diff_time);
 	}
 
 	if (trx->was_chosen_as_deadlock_victim) {

From 026d301f960e2413d615799d76a1f59552a5f1d8 Mon Sep 17 00:00:00 2001
From: Vasil Dimov <vasil.dimov@oracle.com>
Date: Tue, 17 Aug 2010 14:54:29 +0300
Subject: [PATCH 17/18] Make main.endspace more deterministic

Followup to vasil.dimov@oracle.com-20100817063430-inglmzgdtj95t29d
which didn't fully fix the test because the order of the returned
rows was different in embedded and non-embedded version. So the only
way to fix this is to add an ORDER BY clause.
---
 mysql-test/r/endspace.result | 14 +++++++-------
 mysql-test/t/endspace.test   |  6 +++---
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/mysql-test/r/endspace.result b/mysql-test/r/endspace.result
index 25e2238e7bb..4eca88774b4 100644
--- a/mysql-test/r/endspace.result
+++ b/mysql-test/r/endspace.result
@@ -54,8 +54,8 @@ text1 like 'teststring_%' ORDER BY text1;
 text1
 teststring	
 teststring
-select concat('|', text1, '|') from t1 where text1='teststring' or text1 like 'teststring_%';
-concat('|', text1, '|')
+select concat('|', text1, '|') as c from t1 where text1='teststring' or text1 like 'teststring_%' order by c;
+c
 |teststring	|
 |teststring|
 select concat('|', text1, '|') from t1 where text1='teststring' or text1 > 'teststring\t';
@@ -105,11 +105,11 @@ select concat('|', text1, '|') from t1 where text1 like 'teststring_%';
 concat('|', text1, '|')
 |teststring	|
 |teststring |
-select concat('|', text1, '|') from t1 where text1='teststring' or text1 like 'teststring_%';
-concat('|', text1, '|')
-|teststring|
+select concat('|', text1, '|') as c from t1 where text1='teststring' or text1 like 'teststring_%' order by c;
+c
 |teststring	|
 |teststring |
+|teststring|
 select concat('|', text1, '|') from t1 where text1='teststring' or text1 > 'teststring\t';
 concat('|', text1, '|')
 |teststring|
@@ -123,8 +123,8 @@ concat('|', text1, '|')
 drop table t1;
 create table t1 (text1 varchar(32) not NULL, KEY key1 (text1)) pack_keys=0;
 insert into t1 values ('teststring'), ('nothing'), ('teststring\t');
-select concat('|', text1, '|') from t1 where text1='teststring' or text1 like 'teststring_%';
-concat('|', text1, '|')
+select concat('|', text1, '|') as c from t1 where text1='teststring' or text1 like 'teststring_%' order by c;
+c
 |teststring	|
 |teststring|
 select concat('|', text1, '|') from t1 where text1='teststring' or text1 >= 'teststring\t';
diff --git a/mysql-test/t/endspace.test b/mysql-test/t/endspace.test
index b223c683cde..7c71b05f687 100644
--- a/mysql-test/t/endspace.test
+++ b/mysql-test/t/endspace.test
@@ -27,7 +27,7 @@ alter table t1 modify text1 char(32) binary not null;
 check table t1;
 select * from t1 ignore key (key1) where text1='teststring' or 
   text1 like 'teststring_%' ORDER BY text1;
-select concat('|', text1, '|') from t1 where text1='teststring' or text1 like 'teststring_%';
+select concat('|', text1, '|') as c from t1 where text1='teststring' or text1 like 'teststring_%' order by c;
 select concat('|', text1, '|') from t1 where text1='teststring' or text1 > 'teststring\t';
 select text1, length(text1) from t1 order by text1;
 select text1, length(text1) from t1 order by binary text1;
@@ -44,14 +44,14 @@ select concat('|', text1, '|') from t1 where text1='teststring';
 select concat('|', text1, '|') from t1 where text1='teststring ';
 explain select concat('|', text1, '|') from t1 where text1='teststring ';
 select concat('|', text1, '|') from t1 where text1 like 'teststring_%';
-select concat('|', text1, '|') from t1 where text1='teststring' or text1 like 'teststring_%';
+select concat('|', text1, '|') as c from t1 where text1='teststring' or text1 like 'teststring_%' order by c;
 select concat('|', text1, '|') from t1 where text1='teststring' or text1 > 'teststring\t';
 select concat('|', text1, '|') from t1 order by text1;
 drop table t1;
 
 create table t1 (text1 varchar(32) not NULL, KEY key1 (text1)) pack_keys=0;
 insert into t1 values ('teststring'), ('nothing'), ('teststring\t');
-select concat('|', text1, '|') from t1 where text1='teststring' or text1 like 'teststring_%';
+select concat('|', text1, '|') as c from t1 where text1='teststring' or text1 like 'teststring_%' order by c;
 select concat('|', text1, '|') from t1 where text1='teststring' or text1 >= 'teststring\t';
 drop table t1;
 

From 085bb22ab275ee0b4733764f51feb986c0cac63a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= <marko.makela@oracle.com>
Date: Tue, 17 Aug 2010 15:07:54 +0300
Subject: [PATCH 18/18] A non-functional change:

dict_load_index_low(): Rename the parameter "cached" to "allocate"
and clarify the comments.
---
 storage/innobase/dict/dict0load.c    | 26 +++++++++++++-------------
 storage/innobase/include/dict0load.h | 16 ++++++++++------
 2 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/storage/innobase/dict/dict0load.c b/storage/innobase/dict/dict0load.c
index 20a18c72a39..6bd15f0556a 100644
--- a/storage/innobase/dict/dict0load.c
+++ b/storage/innobase/dict/dict0load.c
@@ -1175,23 +1175,23 @@ static const char* dict_load_index_id_err = "SYS_INDEXES.TABLE_ID mismatch";
 
 /********************************************************************//**
 Loads an index definition from a SYS_INDEXES record to dict_index_t.
-If "cached" is set to "TRUE", we will create a dict_index_t structure
-and fill it accordingly. Otherwise, the dict_index_t will
-be supplied by the caller and filled with information read from
-the record.
-@return error message, or NULL on success */
+If allocate=TRUE, we will create a dict_index_t structure and fill it
+accordingly. If allocate=FALSE, the dict_index_t will be supplied by
+the caller and filled with information read from the record.
+@return error message, or NULL on success */
 UNIV_INTERN
 const char*
 dict_load_index_low(
 /*================*/
 	byte*		table_id,	/*!< in/out: table id (8 bytes),
-					an "in" value if cached=TRUE
-					and "out" when cached=FALSE */
+					an "in" value if allocate=TRUE
+					and "out" when allocate=FALSE */
 	const char*	table_name,	/*!< in: table name */
 	mem_heap_t*	heap,		/*!< in/out: temporary memory heap */
 	const rec_t*	rec,		/*!< in: SYS_INDEXES record */
-	ibool		cached,		/*!< in: TRUE = add to cache,
-					FALSE = do not */
+	ibool		allocate,	/*!< in: TRUE=allocate *index,
+					FALSE=fill in a pre-allocated
+					*index */
 	dict_index_t**	index)		/*!< out,own: index, or NULL */
 {
 	const byte*	field;
@@ -1203,8 +1203,8 @@ dict_load_index_low(
 	ulint		type;
 	ulint		space;
 
-	if (cached) {
-		/* If "cached" is set to TRUE, no dict_index_t will
+	if (allocate) {
+		/* If allocate=TRUE, no dict_index_t will
 		be supplied. Initialize "*index" to NULL */
 		*index = NULL;
 	}
@@ -1223,7 +1223,7 @@ err_len:
 		return("incorrect column length in SYS_INDEXES");
 	}
 
-	if (!cached) {
+	if (!allocate) {
 		/* We are reading a SYS_INDEXES record. Copy the table_id */
 		memcpy(table_id, (const char*)field, 8);
 	} else if (memcmp(field, table_id, 8)) {
@@ -1279,7 +1279,7 @@ err_len:
 		goto err_len;
 	}
 
-	if (cached) {
+	if (allocate) {
 		*index = dict_mem_index_create(table_name, name_buf,
 					       space, type, n_fields);
 	} else {
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
index 6a718a464ab..05d3532d59a 100644
--- a/storage/innobase/include/dict0load.h
+++ b/storage/innobase/include/dict0load.h
@@ -116,19 +116,23 @@ dict_load_column_low(
 	const rec_t*	rec);		/*!< in: SYS_COLUMNS record */
 /********************************************************************//**
 Loads an index definition from a SYS_INDEXES record to dict_index_t.
-@return error message, or NULL on success */
+If allocate=TRUE, we will create a dict_index_t structure and fill it
+accordingly. If allocate=FALSE, the dict_index_t will be supplied by
+the caller and filled with information read from the record.
+@return error message, or NULL on success */
 UNIV_INTERN
 const char*
 dict_load_index_low(
 /*================*/
-	byte*		table_id,	/*!< in/out: table id (8 bytes_,
-					an "in" value if cached=TRUE
-					and "out" when cached=FALSE */
+	byte*		table_id,	/*!< in/out: table id (8 bytes),
+					an "in" value if allocate=TRUE
+					and "out" when allocate=FALSE */
 	const char*	table_name,	/*!< in: table name */
 	mem_heap_t*	heap,		/*!< in/out: temporary memory heap */
 	const rec_t*	rec,		/*!< in: SYS_INDEXES record */
-	ibool		cached,		/*!< in: TRUE = add to cache
-					FALSE = do not */
+	ibool		allocate,	/*!< in: TRUE=allocate *index,
+					FALSE=fill in a pre-allocated
+					*index */
 	dict_index_t**	index);		/*!< out,own: index, or NULL */
 /********************************************************************//**
 Loads an index field definition from a SYS_FIELDS record to