Bug 12356373 - PERFORMANCE REGRESSION FROM 5.1 TO 5.5 : GROUP BY:
The title of the bug is a little confusing. The actual fix is to reintroduce random readahead inside InnoDB with a dynamic, global switch innodb_random_read_ahead [default = off]. Approved by: Sunny Bains rb://696
This commit is contained in:
parent
0f543e8e9d
commit
aa3d29681e
@ -1,3 +1,10 @@
|
||||
2011-07-19 The InnoDB Team
|
||||
|
||||
* buf/buf0buf.c, buf/buf0rea.c, handler/ha_innodb.cc,
|
||||
include/buf0buf.h, include/buf0buf.ic, include/srv0srv.h,
|
||||
srv/srv0srv.c:
|
||||
Fix bug#Bug 12356373 by reintroducing random readahead
|
||||
|
||||
2011-06-30 The InnoDB Team
|
||||
|
||||
* row/row0row.c:
|
||||
|
@ -3590,12 +3590,16 @@ buf_print_io(
|
||||
|
||||
/* Statistics about read ahead algorithm */
|
||||
fprintf(file, "Pages read ahead %.2f/s,"
|
||||
" evicted without access %.2f/s\n",
|
||||
" evicted without access %.2f/s,"
|
||||
" Random read ahead %.2f/s\n",
|
||||
(buf_pool->stat.n_ra_pages_read
|
||||
- buf_pool->old_stat.n_ra_pages_read)
|
||||
/ time_elapsed,
|
||||
(buf_pool->stat.n_ra_pages_evicted
|
||||
- buf_pool->old_stat.n_ra_pages_evicted)
|
||||
/ time_elapsed,
|
||||
(buf_pool->stat.n_ra_pages_read_rnd
|
||||
- buf_pool->old_stat.n_ra_pages_read_rnd)
|
||||
/ time_elapsed);
|
||||
|
||||
/* Print some values to help us with visualizing what is
|
||||
|
@ -38,6 +38,14 @@ Created 11/5/1995 Heikki Tuuri
|
||||
#include "srv0start.h"
|
||||
#include "srv0srv.h"
|
||||
|
||||
/** The size in blocks of the area where the random read-ahead algorithm counts
|
||||
the accessed pages when deciding whether to read-ahead */
|
||||
#define BUF_READ_AHEAD_RANDOM_AREA BUF_READ_AHEAD_AREA
|
||||
|
||||
/** There must be at least this many pages in buf_pool in the area to start
|
||||
a random read-ahead */
|
||||
#define BUF_READ_AHEAD_RANDOM_THRESHOLD (5 + BUF_READ_AHEAD_RANDOM_AREA / 8)
|
||||
|
||||
/** The linear read-ahead area size */
|
||||
#define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA
|
||||
|
||||
@ -157,6 +165,165 @@ buf_read_page_low(
|
||||
return(1);
|
||||
}
|
||||
|
||||
/********************************************************************//**
|
||||
Applies a random read-ahead in buf_pool if there are at least a threshold
|
||||
value of accessed pages from the random read-ahead area. Does not read any
|
||||
page, not even the one at the position (space, offset), if the read-ahead
|
||||
mechanism is not activated. NOTE 1: the calling thread may own latches on
|
||||
pages: to avoid deadlocks this function must be written such that it cannot
|
||||
end up waiting for these latches! NOTE 2: the calling thread must want
|
||||
access to the page given: this rule is set to prevent unintended read-aheads
|
||||
performed by ibuf routines, a situation which could result in a deadlock if
|
||||
the OS does not support asynchronous i/o.
|
||||
@return number of page read requests issued; NOTE that if we read ibuf
|
||||
pages, it may happen that the page at the given page number does not
|
||||
get read even if we return a positive value! */
|
||||
static
|
||||
ulint
|
||||
buf_read_ahead_random(
|
||||
/*==================*/
|
||||
ulint space, /*!< in: space id */
|
||||
ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
|
||||
ulint offset) /*!< in: page number of a page which the current thread
|
||||
wants to access */
|
||||
{
|
||||
ib_int64_t tablespace_version;
|
||||
ulint recent_blocks = 0;
|
||||
ulint count;
|
||||
ulint ibuf_mode;
|
||||
ulint low, high;
|
||||
ulint err;
|
||||
ulint i;
|
||||
ulint buf_read_ahead_random_area;
|
||||
|
||||
if (!srv_random_read_ahead) {
|
||||
/* Disabled by user */
|
||||
return(0);
|
||||
}
|
||||
|
||||
if (srv_startup_is_before_trx_rollback_phase) {
|
||||
/* No read-ahead to avoid thread deadlocks */
|
||||
return(0);
|
||||
}
|
||||
|
||||
if (ibuf_bitmap_page(zip_size, offset)
|
||||
|| trx_sys_hdr_page(space, offset)) {
|
||||
|
||||
/* If it is an ibuf bitmap page or trx sys hdr, we do
|
||||
no read-ahead, as that could break the ibuf page access
|
||||
order */
|
||||
|
||||
return(0);
|
||||
}
|
||||
|
||||
/* Remember the tablespace version before we ask the tablespace size
|
||||
below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
|
||||
do not try to read outside the bounds of the tablespace! */
|
||||
|
||||
tablespace_version = fil_space_get_version(space);
|
||||
|
||||
buf_read_ahead_random_area = BUF_READ_AHEAD_RANDOM_AREA;
|
||||
|
||||
low = (offset / buf_read_ahead_random_area)
|
||||
* buf_read_ahead_random_area;
|
||||
high = (offset / buf_read_ahead_random_area + 1)
|
||||
* buf_read_ahead_random_area;
|
||||
if (high > fil_space_get_size(space)) {
|
||||
|
||||
high = fil_space_get_size(space);
|
||||
}
|
||||
|
||||
buf_pool_mutex_enter();
|
||||
|
||||
if (buf_pool->n_pend_reads
|
||||
> buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
|
||||
buf_pool_mutex_exit();
|
||||
|
||||
return(0);
|
||||
}
|
||||
|
||||
/* Count how many blocks in the area have been recently accessed,
|
||||
that is, reside near the start of the LRU list. */
|
||||
|
||||
for (i = low; i < high; i++) {
|
||||
const buf_page_t* bpage = buf_page_hash_get(space, i);
|
||||
|
||||
if (bpage
|
||||
&& buf_page_is_accessed(bpage)
|
||||
&& buf_page_peek_if_young(bpage)) {
|
||||
|
||||
recent_blocks++;
|
||||
|
||||
if (recent_blocks >= BUF_READ_AHEAD_RANDOM_THRESHOLD) {
|
||||
|
||||
buf_pool_mutex_exit();
|
||||
goto read_ahead;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buf_pool_mutex_exit();
|
||||
/* Do nothing */
|
||||
return(0);
|
||||
|
||||
read_ahead:
|
||||
/* Read all the suitable blocks within the area */
|
||||
|
||||
if (ibuf_inside()) {
|
||||
ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
|
||||
} else {
|
||||
ibuf_mode = BUF_READ_ANY_PAGE;
|
||||
}
|
||||
|
||||
count = 0;
|
||||
|
||||
for (i = low; i < high; i++) {
|
||||
/* It is only sensible to do read-ahead in the non-sync aio
|
||||
mode: hence FALSE as the first parameter */
|
||||
|
||||
if (!ibuf_bitmap_page(zip_size, i)) {
|
||||
count += buf_read_page_low(
|
||||
&err, FALSE,
|
||||
ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
|
||||
space, zip_size, FALSE,
|
||||
tablespace_version, i);
|
||||
if (err == DB_TABLESPACE_DELETED) {
|
||||
ut_print_timestamp(stderr);
|
||||
fprintf(stderr,
|
||||
" InnoDB: Warning: in random"
|
||||
" readahead trying to access\n"
|
||||
"InnoDB: tablespace %lu page %lu,\n"
|
||||
"InnoDB: but the tablespace does not"
|
||||
" exist or is just being dropped.\n",
|
||||
(ulong) space, (ulong) i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* In simulated aio we wake the aio handler threads only after
|
||||
queuing all aio requests, in native aio the following call does
|
||||
nothing: */
|
||||
|
||||
os_aio_simulated_wake_handler_threads();
|
||||
|
||||
#ifdef UNIV_DEBUG
|
||||
if (buf_debug_prints && (count > 0)) {
|
||||
fprintf(stderr,
|
||||
"Random read-ahead space %lu offset %lu pages %lu\n",
|
||||
(ulong) space, (ulong) offset,
|
||||
(ulong) count);
|
||||
}
|
||||
#endif /* UNIV_DEBUG */
|
||||
|
||||
/* Read ahead is considered one I/O operation for the purpose of
|
||||
LRU policy decision. */
|
||||
buf_LRU_stat_inc_io();
|
||||
|
||||
buf_pool->stat.n_ra_pages_read_rnd += count;
|
||||
return(count);
|
||||
}
|
||||
|
||||
|
||||
/********************************************************************//**
|
||||
High-level function which reads a page asynchronously from a file to the
|
||||
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
|
||||
@ -175,6 +342,9 @@ buf_read_page(
|
||||
ulint count;
|
||||
ulint err;
|
||||
|
||||
count = buf_read_ahead_random(space, zip_size, offset);
|
||||
srv_buf_pool_reads += count;
|
||||
|
||||
tablespace_version = fil_space_get_version(space);
|
||||
|
||||
/* We do the i/o in the synchronous aio mode to save thread
|
||||
|
@ -501,6 +501,8 @@ static SHOW_VAR innodb_status_variables[]= {
|
||||
(char*) &export_vars.innodb_buffer_pool_pages_misc, SHOW_LONG},
|
||||
{"buffer_pool_pages_total",
|
||||
(char*) &export_vars.innodb_buffer_pool_pages_total, SHOW_LONG},
|
||||
{"buffer_pool_read_ahead_rnd",
|
||||
(char*) &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_LONG},
|
||||
{"buffer_pool_read_ahead",
|
||||
(char*) &export_vars.innodb_buffer_pool_read_ahead, SHOW_LONG},
|
||||
{"buffer_pool_read_ahead_evicted",
|
||||
@ -11027,6 +11029,11 @@ static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug,
|
||||
NULL, NULL, 0, 0, 1, 0);
|
||||
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
|
||||
|
||||
static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead,
|
||||
PLUGIN_VAR_NOCMDARG,
|
||||
"Whether to use read ahead for random access within an extent.",
|
||||
NULL, NULL, FALSE);
|
||||
|
||||
static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold,
|
||||
PLUGIN_VAR_RQCMDARG,
|
||||
"Number of pages that must be accessed sequentially for InnoDB to "
|
||||
@ -11091,6 +11098,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
|
||||
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
|
||||
MYSQL_SYSVAR(change_buffering_debug),
|
||||
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
|
||||
MYSQL_SYSVAR(random_read_ahead),
|
||||
MYSQL_SYSVAR(read_ahead_threshold),
|
||||
MYSQL_SYSVAR(io_capacity),
|
||||
NULL
|
||||
|
@ -427,6 +427,18 @@ buf_block_get_freed_page_clock(
|
||||
__attribute__((pure));
|
||||
|
||||
/********************************************************************//**
|
||||
Tells if a block is still close enough to the MRU end of the LRU list
|
||||
meaning that it is not in danger of getting evicted and also implying
|
||||
that it has been accessed recently.
|
||||
Note that this is for heuristics only and does not reserve buffer pool
|
||||
mutex.
|
||||
@return TRUE if block is close to MRU end of LRU */
|
||||
UNIV_INLINE
|
||||
ibool
|
||||
buf_page_peek_if_young(
|
||||
/*===================*/
|
||||
const buf_page_t* bpage); /*!< in: block */
|
||||
/********************************************************************//**
|
||||
Recommends a move of a block to the start of the LRU list if there is danger
|
||||
of dropping from the buffer pool. NOTE: does not reserve the buffer pool
|
||||
mutex.
|
||||
@ -1334,6 +1346,8 @@ struct buf_pool_stat_struct{
|
||||
ulint n_pages_written;/*!< number write operations */
|
||||
ulint n_pages_created;/*!< number of pages created
|
||||
in the pool with no read */
|
||||
ulint n_ra_pages_read_rnd;/*!< number of pages read in
|
||||
as part of random read ahead */
|
||||
ulint n_ra_pages_read;/*!< number of pages read in
|
||||
as part of read ahead */
|
||||
ulint n_ra_pages_evicted;/*!< number of read ahead
|
||||
|
@ -61,6 +61,27 @@ buf_block_get_freed_page_clock(
|
||||
return(buf_page_get_freed_page_clock(&block->page));
|
||||
}
|
||||
|
||||
/********************************************************************//**
|
||||
Tells if a block is still close enough to the MRU end of the LRU list
|
||||
meaning that it is not in danger of getting evicted and also implying
|
||||
that it has been accessed recently.
|
||||
Note that this is for heuristics only and does not reserve buffer pool
|
||||
mutex.
|
||||
@return TRUE if block is close to MRU end of LRU */
|
||||
UNIV_INLINE
|
||||
ibool
|
||||
buf_page_peek_if_young(
|
||||
/*===================*/
|
||||
const buf_page_t* bpage) /*!< in: block */
|
||||
{
|
||||
/* FIXME: bpage->freed_page_clock is 31 bits */
|
||||
return((buf_pool->freed_page_clock & ((1UL << 31) - 1))
|
||||
< ((ulint) bpage->freed_page_clock
|
||||
+ (buf_pool->curr_size
|
||||
* (BUF_LRU_OLD_RATIO_DIV - buf_LRU_old_ratio)
|
||||
/ (BUF_LRU_OLD_RATIO_DIV * 4))));
|
||||
}
|
||||
|
||||
/********************************************************************//**
|
||||
Recommends a move of a block to the start of the LRU list if there is danger
|
||||
of dropping from the buffer pool. NOTE: does not reserve the buffer pool
|
||||
@ -89,12 +110,7 @@ buf_page_peek_if_too_old(
|
||||
buf_pool->stat.n_pages_not_made_young++;
|
||||
return(FALSE);
|
||||
} else {
|
||||
/* FIXME: bpage->freed_page_clock is 31 bits */
|
||||
return((buf_pool->freed_page_clock & ((1UL << 31) - 1))
|
||||
> ((ulint) bpage->freed_page_clock
|
||||
+ (buf_pool->curr_size
|
||||
* (BUF_LRU_OLD_RATIO_DIV - buf_LRU_old_ratio)
|
||||
/ (BUF_LRU_OLD_RATIO_DIV * 4))));
|
||||
return(!buf_page_peek_if_young(bpage));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -143,6 +143,7 @@ extern ulint srv_mem_pool_size;
|
||||
extern ulint srv_lock_table_size;
|
||||
|
||||
extern ulint srv_n_file_io_threads;
|
||||
extern my_bool srv_random_read_ahead;
|
||||
extern ulong srv_read_ahead_threshold;
|
||||
extern ulint srv_n_read_io_threads;
|
||||
extern ulint srv_n_write_io_threads;
|
||||
@ -618,6 +619,7 @@ struct export_var_struct{
|
||||
ulint innodb_buffer_pool_wait_free; /*!< srv_buf_pool_wait_free */
|
||||
ulint innodb_buffer_pool_pages_flushed; /*!< srv_buf_pool_flushed */
|
||||
ulint innodb_buffer_pool_write_requests;/*!< srv_buf_pool_write_requests */
|
||||
ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */
|
||||
ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */
|
||||
ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
|
||||
ulint innodb_dblwr_pages_written; /*!< srv_dblwr_pages_written */
|
||||
|
@ -203,6 +203,8 @@ UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX;
|
||||
UNIV_INTERN ulint srv_n_read_io_threads = ULINT_MAX;
|
||||
UNIV_INTERN ulint srv_n_write_io_threads = ULINT_MAX;
|
||||
|
||||
/* Switch to enable random read ahead. */
|
||||
UNIV_INTERN my_bool srv_random_read_ahead = FALSE;
|
||||
/* User settable value of the number of pages that must be present
|
||||
in the buffer cache and accessed sequentially for InnoDB to trigger a
|
||||
readahead request. */
|
||||
@ -1906,6 +1908,8 @@ srv_export_innodb_status(void)
|
||||
export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free;
|
||||
export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed;
|
||||
export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads;
|
||||
export_vars.innodb_buffer_pool_read_ahead_rnd
|
||||
= buf_pool->stat.n_ra_pages_read_rnd;
|
||||
export_vars.innodb_buffer_pool_read_ahead
|
||||
= buf_pool->stat.n_ra_pages_read;
|
||||
export_vars.innodb_buffer_pool_read_ahead_evicted
|
||||
|
Loading…
x
Reference in New Issue
Block a user