From d53906e3872016cb06f48a80557fb8edda59b691 Mon Sep 17 00:00:00 2001 From: Mattias Jonsson Date: Wed, 16 Mar 2011 11:59:01 +0100 Subject: [PATCH 01/35] Bug#11746819: Bug#28928: UNIX_TIMESTAMP() should be considered unary monotonic by partition pruning Made UNIX_TIMESTAMP MONOTONIC_INCREASING when it have TIMESTAMP argument (only). --- mysql-test/r/partition.result | 18 + mysql-test/r/partition_datatype.result | 1000 ++++++++++++++++++++++++ mysql-test/t/partition.test | 6 + mysql-test/t/partition_datatype.test | 310 ++++++++ sql/item_timefunc.cc | 20 + sql/item_timefunc.h | 2 + 6 files changed, 1356 insertions(+) diff --git a/mysql-test/r/partition.result b/mysql-test/r/partition.result index 27ada9d1129..fb3dc648b1d 100644 --- a/mysql-test/r/partition.result +++ b/mysql-test/r/partition.result @@ -76,6 +76,15 @@ a b 2007-07-30 17:35:48 p1 2009-07-14 17:35:55 pmax 2009-09-21 17:31:42 pmax +SELECT * FROM t1 where a between '2007-01-01' and '2007-08-01'; +a b +2007-07-30 17:35:48 p1 +EXPLAIN PARTITIONS SELECT * FROM t1 where a between '2007-01-01' and '2007-08-01'; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t1 p1 system PRIMARY NULL NULL NULL 1 +EXPLAIN PARTITIONS SELECT * FROM t1 where a = '2007-07-30 17:35:48'; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t1 p1 system PRIMARY NULL NULL NULL 1 ALTER TABLE t1 REORGANIZE PARTITION pmax INTO ( PARTITION p3 VALUES LESS THAN (1247688000), PARTITION pmax VALUES LESS THAN MAXVALUE); @@ -84,6 +93,15 @@ a b 2007-07-30 17:35:48 p1 2009-07-14 17:35:55 pmax 2009-09-21 17:31:42 pmax +SELECT * FROM t1 where a between '2007-01-01' and '2007-08-01'; +a b +2007-07-30 17:35:48 p1 +EXPLAIN PARTITIONS SELECT * FROM t1 where a between '2007-01-01' and '2007-08-01'; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t1 p1 system PRIMARY NULL NULL NULL 1 +EXPLAIN PARTITIONS SELECT * FROM t1 where a = '2007-07-30 17:35:48'; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t1 p1 system PRIMARY NULL NULL NULL 1 SHOW CREATE TABLE t1; Table Create Table t1 CREATE TABLE `t1` ( diff --git a/mysql-test/r/partition_datatype.result b/mysql-test/r/partition_datatype.result index 607afb71da5..63972d6cce4 100644 --- a/mysql-test/r/partition_datatype.result +++ b/mysql-test/r/partition_datatype.result @@ -336,3 +336,1003 @@ select hex(a) from t1 where a = 7; hex(a) 7 drop table t1; +# +# Bug#28928: UNIX_TIMESTAMP() should be considered unary monotonic +# by partition pruning +SET @old_time_zone= @@session.time_zone; +SET @@session.time_zone = 'UTC'; +# Using MyISAM to get stable values on TABLE_ROWS in I_S.PARTITIONS +CREATE TABLE t1 +(a TIMESTAMP NULL, +tz varchar(16)) +ENGINE = MyISAM; +CREATE TABLE t2 LIKE t1; +ALTER TABLE t2 PARTITION BY RANGE (UNIX_TIMESTAMP(a)) +(PARTITION `p0` VALUES LESS THAN (0), +PARTITION `p-2000` VALUES LESS THAN (UNIX_TIMESTAMP('2000-01-01')), +PARTITION `p-2011-MSK` VALUES LESS THAN (UNIX_TIMESTAMP('2011-03-26 23:00:00')), +PARTITION `p-2011-MSD-1` VALUES LESS THAN (UNIX_TIMESTAMP('2011-10-29 22:00:00')), +PARTITION `p-2011-MSD-2` VALUES LESS THAN (UNIX_TIMESTAMP('2011-10-29 23:00:00')), +PARTITION `p-2012-MSK-1` VALUES LESS THAN (UNIX_TIMESTAMP('2011-10-30 00:00:00')), +PARTITION `p-2012-MSK-2` VALUES LESS THAN (UNIX_TIMESTAMP('2012-03-24 23:00:00')), +PARTITION `pEnd` VALUES LESS THAN (UNIX_TIMESTAMP('2038-01-19 03:14:07')), +PARTITION `pMax` VALUES LESS THAN MAXVALUE); +# Test 'odd' values +INSERT INTO t1 VALUES (NULL, 'UTC'); +INSERT INTO t1 VALUES ('0000-00-00 00:00:00', 'UTC'); +# Test invalid values +INSERT INTO t1 VALUES ('1901-01-01 00:00:00', 'UTCI'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +INSERT INTO t1 VALUES ('1969-12-31 23:59:59', 'UTCI'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +INSERT INTO t1 VALUES ('2038-01-19 03:14:08', 'UTCI'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +INSERT INTO t1 VALUES ('1970-01-01 00:00:00', 'UTCI'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +# Test start range +INSERT INTO t1 VALUES ('1970-01-01 00:00:01', 'UTC'); +INSERT INTO t1 VALUES ('1974-02-05 21:28:16', 'UTC'); +# Test end range +INSERT INTO t1 VALUES ('2038-01-19 03:14:06', 'UTC'); +INSERT INTO t1 VALUES ('2038-01-19 03:14:07', 'UTC'); +# Test Daylight saving shift +INSERT INTO t1 VALUES ('2011-03-26 22:59:59', 'UTC'); +INSERT INTO t1 VALUES ('2011-03-26 23:00:00', 'UTC'); +INSERT INTO t1 VALUES ('2011-03-26 23:00:01', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 21:59:59', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 22:00:00', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 22:00:01', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 22:59:59', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 23:00:00', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 23:00:01', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 23:59:59', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-30 00:00:00', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-30 00:00:01', 'UTC'); +SET @@session.time_zone = 'Europe/Moscow'; +# Test 'odd' values +INSERT INTO t1 VALUES (NULL, 'Moscow'); +INSERT INTO t1 VALUES ('0000-00-00 00:00:00', 'Moscow'); +# Test invalid values +INSERT INTO t1 VALUES ('0000-00-00 03:00:00', 'MoscowI'); +Warnings: +Warning 1265 Data truncated for column 'a' at row 1 +INSERT INTO t1 VALUES ('1901-01-01 00:00:00', 'MoscowI'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +INSERT INTO t1 VALUES ('1969-12-31 23:59:59', 'MoscowI'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +INSERT INTO t1 VALUES ('1970-01-01 02:29:29', 'MoscowI'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +INSERT INTO t1 VALUES ('2038-01-19 06:14:08', 'MoscowI'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +INSERT INTO t1 VALUES ('1970-01-01 03:00:00', 'MoscowI'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +# values truncated to 03:00:00 due to daylight saving shift +INSERT INTO t1 VALUES ('2011-03-27 02:00:00', 'MoscowI'); +Warnings: +Warning 1299 Invalid TIMESTAMP value in column 'a' at row 1 +INSERT INTO t1 VALUES ('2011-03-27 02:00:01', 'MoscowI'); +Warnings: +Warning 1299 Invalid TIMESTAMP value in column 'a' at row 1 +INSERT INTO t1 VALUES ('2011-03-27 02:59:59', 'MoscowI'); +Warnings: +Warning 1299 Invalid TIMESTAMP value in column 'a' at row 1 +# Test start range +INSERT INTO t1 VALUES ('1970-01-01 03:00:01', 'Moscow'); +INSERT INTO t1 VALUES ('1974-02-05 21:28:16', 'Moscow'); +# Test end range +INSERT INTO t1 VALUES ('2038-01-19 06:14:06', 'Moscow'); +INSERT INTO t1 VALUES ('2038-01-19 06:14:07', 'Moscow'); +# Test Daylight saving shift +INSERT INTO t1 VALUES ('2011-03-27 01:59:59', 'Moscow'); +INSERT INTO t1 VALUES ('2011-03-27 03:00:00', 'Moscow'); +INSERT INTO t1 VALUES ('2011-03-27 03:00:01', 'Moscow'); +INSERT INTO t1 VALUES ('2011-10-30 01:59:59', 'Moscow'); +# All values between 02:00 and 02:59:59 will be interpretated as DST +INSERT INTO t1 VALUES ('2011-10-30 02:00:00', 'MoscowD'); +INSERT INTO t1 VALUES ('2011-10-30 02:00:01', 'MoscowD'); +INSERT INTO t1 VALUES ('2011-10-30 02:59:59', 'MoscowD'); +INSERT INTO t1 VALUES ('2011-10-30 03:00:00', 'Moscow'); +INSERT INTO t1 VALUES ('2011-10-30 03:00:01', 'Moscow'); +SET @@session.time_zone = 'UTC'; +INSERT INTO t2 SELECT * FROM t1; +SELECT PARTITION_NAME, TABLE_ROWS FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_NAME = 't2'; +PARTITION_NAME TABLE_ROWS +p0 2 +p-2000 16 +p-2011-MSK 2 +p-2011-MSD-1 9 +p-2011-MSD-2 6 +p-2012-MSK-1 3 +p-2012-MSK-2 4 +pEnd 2 +pMax 2 +SELECT * FROM t1 ORDER BY a, tz; +a tz +NULL Moscow +NULL UTC +0000-00-00 00:00:00 Moscow +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 UTC +0000-00-00 00:00:00 UTCI +0000-00-00 00:00:00 UTCI +0000-00-00 00:00:00 UTCI +0000-00-00 00:00:00 UTCI +1970-01-01 00:00:01 Moscow +1970-01-01 00:00:01 UTC +1974-02-05 18:28:16 Moscow +1974-02-05 21:28:16 UTC +2011-03-26 22:59:59 Moscow +2011-03-26 22:59:59 UTC +2011-03-26 23:00:00 Moscow +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 UTC +2011-03-26 23:00:01 Moscow +2011-03-26 23:00:01 UTC +2011-10-29 21:59:59 Moscow +2011-10-29 21:59:59 UTC +2011-10-29 22:00:00 MoscowD +2011-10-29 22:00:00 UTC +2011-10-29 22:00:01 MoscowD +2011-10-29 22:00:01 UTC +2011-10-29 22:59:59 MoscowD +2011-10-29 22:59:59 UTC +2011-10-29 23:00:00 UTC +2011-10-29 23:00:01 UTC +2011-10-29 23:59:59 UTC +2011-10-30 00:00:00 Moscow +2011-10-30 00:00:00 UTC +2011-10-30 00:00:01 Moscow +2011-10-30 00:00:01 UTC +2038-01-19 03:14:06 Moscow +2038-01-19 03:14:06 UTC +2038-01-19 03:14:07 Moscow +2038-01-19 03:14:07 UTC +SELECT * FROM t2 ORDER BY a, tz; +a tz +NULL Moscow +NULL UTC +0000-00-00 00:00:00 Moscow +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 UTC +0000-00-00 00:00:00 UTCI +0000-00-00 00:00:00 UTCI +0000-00-00 00:00:00 UTCI +0000-00-00 00:00:00 UTCI +1970-01-01 00:00:01 Moscow +1970-01-01 00:00:01 UTC +1974-02-05 18:28:16 Moscow +1974-02-05 21:28:16 UTC +2011-03-26 22:59:59 Moscow +2011-03-26 22:59:59 UTC +2011-03-26 23:00:00 Moscow +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 UTC +2011-03-26 23:00:01 Moscow +2011-03-26 23:00:01 UTC +2011-10-29 21:59:59 Moscow +2011-10-29 21:59:59 UTC +2011-10-29 22:00:00 MoscowD +2011-10-29 22:00:00 UTC +2011-10-29 22:00:01 MoscowD +2011-10-29 22:00:01 UTC +2011-10-29 22:59:59 MoscowD +2011-10-29 22:59:59 UTC +2011-10-29 23:00:00 UTC +2011-10-29 23:00:01 UTC +2011-10-29 23:59:59 UTC +2011-10-30 00:00:00 Moscow +2011-10-30 00:00:00 UTC +2011-10-30 00:00:01 Moscow +2011-10-30 00:00:01 UTC +2038-01-19 03:14:06 Moscow +2038-01-19 03:14:06 UTC +2038-01-19 03:14:07 Moscow +2038-01-19 03:14:07 UTC +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-26 23:00:00' ORDER BY a, tz; +a tz +2011-03-26 22:59:59 Moscow +2011-03-26 22:59:59 UTC +2011-03-26 23:00:00 Moscow +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-26 23:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSK,p-2011-MSD-1 ALL NULL NULL NULL NULL 11 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-26 22:59:59' ORDER BY a, tz; +a tz +2011-03-26 22:59:59 Moscow +2011-03-26 22:59:59 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-26 22:59:59' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSK ALL NULL NULL NULL NULL 2 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 22:59:59' and '2011-03-28 00:00:00' ORDER BY a, tz; +a tz +2011-03-26 22:59:59 Moscow +2011-03-26 22:59:59 UTC +2011-03-26 23:00:00 Moscow +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 UTC +2011-03-26 23:00:01 Moscow +2011-03-26 23:00:01 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 22:59:59' and '2011-03-28 00:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSK,p-2011-MSD-1 ALL NULL NULL NULL NULL 11 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 23:00:00' and '2011-03-28 00:00:00' ORDER BY a, tz; +a tz +2011-03-26 23:00:00 Moscow +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 UTC +2011-03-26 23:00:01 Moscow +2011-03-26 23:00:01 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 23:00:00' and '2011-03-28 00:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSD-1 ALL NULL NULL NULL NULL 9 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 23:00:00' ORDER BY a, tz; +a tz +2011-10-29 21:59:59 Moscow +2011-10-29 21:59:59 UTC +2011-10-29 22:00:00 MoscowD +2011-10-29 22:00:00 UTC +2011-10-29 22:00:01 MoscowD +2011-10-29 22:00:01 UTC +2011-10-29 22:59:59 MoscowD +2011-10-29 22:59:59 UTC +2011-10-29 23:00:00 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 23:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSD-1,p-2011-MSD-2,p-2012-MSK-1 ALL NULL NULL NULL NULL 18 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 22:59:59' ORDER BY a, tz; +a tz +2011-10-29 21:59:59 Moscow +2011-10-29 21:59:59 UTC +2011-10-29 22:00:00 MoscowD +2011-10-29 22:00:00 UTC +2011-10-29 22:00:01 MoscowD +2011-10-29 22:00:01 UTC +2011-10-29 22:59:59 MoscowD +2011-10-29 22:59:59 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 22:59:59' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSD-1,p-2011-MSD-2 ALL NULL NULL NULL NULL 15 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 22:59:59' and '2011-10-31 00:00:00' ORDER BY a, tz; +a tz +2011-10-29 22:59:59 MoscowD +2011-10-29 22:59:59 UTC +2011-10-29 23:00:00 UTC +2011-10-29 23:00:01 UTC +2011-10-29 23:59:59 UTC +2011-10-30 00:00:00 Moscow +2011-10-30 00:00:00 UTC +2011-10-30 00:00:01 Moscow +2011-10-30 00:00:01 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 22:59:59' and '2011-10-31 00:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSD-2,p-2012-MSK-1,p-2012-MSK-2 ALL NULL NULL NULL NULL 13 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 23:00:00' and '2011-10-31 00:00:00' ORDER BY a, tz; +a tz +2011-10-29 23:00:00 UTC +2011-10-29 23:00:01 UTC +2011-10-29 23:59:59 UTC +2011-10-30 00:00:00 Moscow +2011-10-30 00:00:00 UTC +2011-10-30 00:00:01 Moscow +2011-10-30 00:00:01 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 23:00:00' and '2011-10-31 00:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2012-MSK-1,p-2012-MSK-2 ALL NULL NULL NULL NULL 7 Using where; Using filesort +# Test end range changes +DELETE FROM t2 WHERE a = 0; +INSERT INTO t2 VALUES ('1970-01-01 00:00:00', 'UTC'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +SELECT COUNT(*) FROM t2; +COUNT(*) +35 +SELECT COUNT(*) FROM t2 WHERE a = 0; +COUNT(*) +1 +SELECT * FROM t2 ORDER BY a, tz LIMIT 3; +a tz +NULL Moscow +NULL UTC +0000-00-00 00:00:00 UTC +SELECT * FROM t2 ORDER BY a DESC, tz LIMIT 3; +a tz +2038-01-19 03:14:07 Moscow +2038-01-19 03:14:07 UTC +2038-01-19 03:14:06 Moscow +UPDATE t2 SET a = TIMESTAMPADD(SECOND, 1, a); +Warnings: +Warning 1264 Out of range value for column 'a' at row 34 +Warning 1264 Out of range value for column 'a' at row 35 +SELECT MIN(a), MAX(a) FROM t2; +MIN(a) MAX(a) +0000-00-00 00:00:00 2038-01-19 03:14:07 +SELECT COUNT(*) FROM t2; +COUNT(*) +35 +SELECT COUNT(*) FROM t2 WHERE a = 0; +COUNT(*) +2 +SELECT PARTITION_NAME, TABLE_ROWS FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_NAME = 't2'; +PARTITION_NAME TABLE_ROWS +p0 3 +p-2000 6 +p-2011-MSK 0 +p-2011-MSD-1 9 +p-2011-MSD-2 6 +p-2012-MSK-1 4 +p-2012-MSK-2 5 +pEnd 0 +pMax 2 +SELECT * FROM t2 ORDER BY a, tz; +a tz +NULL Moscow +NULL UTC +NULL UTC +0000-00-00 00:00:00 Moscow +0000-00-00 00:00:00 UTC +1970-01-01 00:00:02 Moscow +1970-01-01 00:00:02 UTC +1974-02-05 18:28:17 Moscow +1974-02-05 21:28:17 UTC +2011-03-26 23:00:00 Moscow +2011-03-26 23:00:00 UTC +2011-03-26 23:00:01 Moscow +2011-03-26 23:00:01 MoscowI +2011-03-26 23:00:01 MoscowI +2011-03-26 23:00:01 MoscowI +2011-03-26 23:00:01 UTC +2011-03-26 23:00:02 Moscow +2011-03-26 23:00:02 UTC +2011-10-29 22:00:00 Moscow +2011-10-29 22:00:00 UTC +2011-10-29 22:00:01 MoscowD +2011-10-29 22:00:01 UTC +2011-10-29 22:00:02 MoscowD +2011-10-29 22:00:02 UTC +2011-10-29 23:00:00 MoscowD +2011-10-29 23:00:00 UTC +2011-10-29 23:00:01 UTC +2011-10-29 23:00:02 UTC +2011-10-30 00:00:00 UTC +2011-10-30 00:00:01 Moscow +2011-10-30 00:00:01 UTC +2011-10-30 00:00:02 Moscow +2011-10-30 00:00:02 UTC +2038-01-19 03:14:07 Moscow +2038-01-19 03:14:07 UTC +# Test start range changes +INSERT INTO t2 VALUES ('1970-01-01 00:00:00', 'UTC'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +SELECT COUNT(*) FROM t2; +COUNT(*) +36 +SELECT COUNT(*) FROM t2 WHERE a = 0; +COUNT(*) +3 +SELECT * FROM t2 ORDER BY a, tz LIMIT 3; +a tz +NULL Moscow +NULL UTC +NULL UTC +SELECT * FROM t2 ORDER BY a DESC, tz LIMIT 3; +a tz +2038-01-19 03:14:07 Moscow +2038-01-19 03:14:07 UTC +2011-10-30 00:00:02 Moscow +UPDATE t2 SET a = TIMESTAMPADD(SECOND, -1, a); +SELECT MIN(a), MAX(a) FROM t2; +MIN(a) MAX(a) +1970-01-01 00:00:01 2038-01-19 03:14:06 +SELECT COUNT(*) FROM t2; +COUNT(*) +36 +SELECT COUNT(*) FROM t2 WHERE a = 0; +COUNT(*) +0 +SELECT PARTITION_NAME, TABLE_ROWS FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_NAME = 't2'; +PARTITION_NAME TABLE_ROWS +p0 6 +p-2000 4 +p-2011-MSK 2 +p-2011-MSD-1 9 +p-2011-MSD-2 6 +p-2012-MSK-1 3 +p-2012-MSK-2 4 +pEnd 2 +pMax 0 +SELECT * FROM t2 ORDER BY a, tz; +a tz +NULL Moscow +NULL Moscow +NULL UTC +NULL UTC +NULL UTC +NULL UTC +1970-01-01 00:00:01 Moscow +1970-01-01 00:00:01 UTC +1974-02-05 18:28:16 Moscow +1974-02-05 21:28:16 UTC +2011-03-26 22:59:59 Moscow +2011-03-26 22:59:59 UTC +2011-03-26 23:00:00 Moscow +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 MoscowI +2011-03-26 23:00:00 UTC +2011-03-26 23:00:01 Moscow +2011-03-26 23:00:01 UTC +2011-10-29 21:59:59 Moscow +2011-10-29 21:59:59 UTC +2011-10-29 22:00:00 MoscowD +2011-10-29 22:00:00 UTC +2011-10-29 22:00:01 MoscowD +2011-10-29 22:00:01 UTC +2011-10-29 22:59:59 MoscowD +2011-10-29 22:59:59 UTC +2011-10-29 23:00:00 UTC +2011-10-29 23:00:01 UTC +2011-10-29 23:59:59 UTC +2011-10-30 00:00:00 Moscow +2011-10-30 00:00:00 UTC +2011-10-30 00:00:01 Moscow +2011-10-30 00:00:01 UTC +2038-01-19 03:14:06 Moscow +2038-01-19 03:14:06 UTC +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `a` timestamp NULL DEFAULT NULL, + `tz` varchar(16) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +/*!50100 PARTITION BY RANGE (UNIX_TIMESTAMP(a)) +(PARTITION p0 VALUES LESS THAN (0) ENGINE = MyISAM, + PARTITION `p-2000` VALUES LESS THAN (946684800) ENGINE = MyISAM, + PARTITION `p-2011-MSK` VALUES LESS THAN (1301180400) ENGINE = MyISAM, + PARTITION `p-2011-MSD-1` VALUES LESS THAN (1319925600) ENGINE = MyISAM, + PARTITION `p-2011-MSD-2` VALUES LESS THAN (1319929200) ENGINE = MyISAM, + PARTITION `p-2012-MSK-1` VALUES LESS THAN (1319932800) ENGINE = MyISAM, + PARTITION `p-2012-MSK-2` VALUES LESS THAN (1332630000) ENGINE = MyISAM, + PARTITION pEnd VALUES LESS THAN (2147483647) ENGINE = MyISAM, + PARTITION pMax VALUES LESS THAN MAXVALUE ENGINE = MyISAM) */ +TRUNCATE TABLE t2; +SET @@session.time_zone = 'Europe/Moscow'; +INSERT INTO t2 SELECT * FROM t1; +SELECT PARTITION_NAME, TABLE_ROWS FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_NAME = 't2'; +PARTITION_NAME TABLE_ROWS +p0 2 +p-2000 16 +p-2011-MSK 2 +p-2011-MSD-1 9 +p-2011-MSD-2 6 +p-2012-MSK-1 3 +p-2012-MSK-2 4 +pEnd 2 +pMax 2 +SELECT * FROM t1 ORDER BY a, tz; +a tz +NULL Moscow +NULL UTC +0000-00-00 00:00:00 Moscow +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 UTC +0000-00-00 00:00:00 UTCI +0000-00-00 00:00:00 UTCI +0000-00-00 00:00:00 UTCI +0000-00-00 00:00:00 UTCI +1970-01-01 03:00:01 Moscow +1970-01-01 03:00:01 UTC +1974-02-05 21:28:16 Moscow +1974-02-06 00:28:16 UTC +2011-03-27 01:59:59 Moscow +2011-03-27 01:59:59 UTC +2011-03-27 03:00:00 Moscow +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 UTC +2011-03-27 03:00:01 Moscow +2011-03-27 03:00:01 UTC +2011-10-30 01:59:59 Moscow +2011-10-30 01:59:59 UTC +2011-10-30 02:00:00 MoscowD +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 MoscowD +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 MoscowD +2011-10-30 02:59:59 UTC +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 UTC +2011-10-30 03:00:00 Moscow +2011-10-30 03:00:00 UTC +2011-10-30 03:00:01 Moscow +2011-10-30 03:00:01 UTC +2038-01-19 06:14:06 Moscow +2038-01-19 06:14:06 UTC +2038-01-19 06:14:07 Moscow +2038-01-19 06:14:07 UTC +SELECT * FROM t2 ORDER BY a, tz; +a tz +NULL Moscow +NULL UTC +0000-00-00 00:00:00 Moscow +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 MoscowI +0000-00-00 00:00:00 UTC +0000-00-00 00:00:00 UTCI +0000-00-00 00:00:00 UTCI +0000-00-00 00:00:00 UTCI +0000-00-00 00:00:00 UTCI +1970-01-01 03:00:01 Moscow +1970-01-01 03:00:01 UTC +1974-02-05 21:28:16 Moscow +1974-02-06 00:28:16 UTC +2011-03-27 01:59:59 Moscow +2011-03-27 01:59:59 UTC +2011-03-27 03:00:00 Moscow +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 UTC +2011-03-27 03:00:01 Moscow +2011-03-27 03:00:01 UTC +2011-10-30 01:59:59 Moscow +2011-10-30 01:59:59 UTC +2011-10-30 02:00:00 MoscowD +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 MoscowD +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 MoscowD +2011-10-30 02:59:59 UTC +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 UTC +2011-10-30 03:00:00 Moscow +2011-10-30 03:00:00 UTC +2011-10-30 03:00:01 Moscow +2011-10-30 03:00:01 UTC +2038-01-19 06:14:06 Moscow +2038-01-19 06:14:06 UTC +2038-01-19 06:14:07 Moscow +2038-01-19 06:14:07 UTC +# Testing the leap from 01:59:59 to 03:00:00 +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-27 03:00:00' ORDER BY a, tz; +a tz +2011-03-27 01:59:59 Moscow +2011-03-27 01:59:59 UTC +2011-03-27 03:00:00 Moscow +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-27 03:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSK,p-2011-MSD-1 ALL NULL NULL NULL NULL 11 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-27 01:59:59' ORDER BY a, tz; +a tz +2011-03-27 01:59:59 Moscow +2011-03-27 01:59:59 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-27 01:59:59' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSK ALL NULL NULL NULL NULL 2 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 01:59:59' and '2011-03-28 00:00:00' ORDER BY a, tz; +a tz +2011-03-27 01:59:59 Moscow +2011-03-27 01:59:59 UTC +2011-03-27 03:00:00 Moscow +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 UTC +2011-03-27 03:00:01 Moscow +2011-03-27 03:00:01 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 01:59:59' and '2011-03-28 00:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSK,p-2011-MSD-1 ALL NULL NULL NULL NULL 11 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 03:00:00' and '2011-03-28 00:00:00' ORDER BY a, tz; +a tz +2011-03-27 01:59:59 Moscow +2011-03-27 01:59:59 UTC +2011-03-27 03:00:00 Moscow +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 UTC +2011-03-27 03:00:01 Moscow +2011-03-27 03:00:01 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 03:00:00' and '2011-03-28 00:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSK,p-2011-MSD-1 ALL NULL NULL NULL NULL 11 Using where; Using filesort +# Testing the leap from 02:59:59 to 02:00:00 +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 02:00:00' ORDER BY a, tz; +a tz +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 02:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSD-1 ALL NULL NULL NULL NULL 9 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 02:59:59' ORDER BY a, tz; +a tz +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 02:59:59' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSD-1 ALL NULL NULL NULL NULL 9 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 03:00:00' ORDER BY a, tz; +a tz +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 03:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSD-1 ALL NULL NULL NULL NULL 9 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 01:59:59' ORDER BY a, tz; +a tz +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 01:59:59' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSD-1 ALL NULL NULL NULL NULL 9 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 02:00:00' and '2011-10-31 00:00:00' ORDER BY a, tz; +a tz +2011-10-30 01:59:59 Moscow +2011-10-30 01:59:59 UTC +2011-10-30 02:00:00 MoscowD +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 MoscowD +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 MoscowD +2011-10-30 02:59:59 UTC +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 UTC +2011-10-30 03:00:00 Moscow +2011-10-30 03:00:00 UTC +2011-10-30 03:00:01 Moscow +2011-10-30 03:00:01 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 02:00:00' and '2011-10-31 00:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSD-1,p-2011-MSD-2,p-2012-MSK-1,p-2012-MSK-2 ALL NULL NULL NULL NULL 22 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 02:59:59' and '2011-10-31 00:00:00' ORDER BY a, tz; +a tz +2011-10-30 01:59:59 Moscow +2011-10-30 01:59:59 UTC +2011-10-30 02:00:00 MoscowD +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 MoscowD +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 MoscowD +2011-10-30 02:59:59 UTC +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 UTC +2011-10-30 03:00:00 Moscow +2011-10-30 03:00:00 UTC +2011-10-30 03:00:01 Moscow +2011-10-30 03:00:01 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 02:59:59' and '2011-10-31 00:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSD-1,p-2011-MSD-2,p-2012-MSK-1,p-2012-MSK-2 ALL NULL NULL NULL NULL 22 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 03:00:00' and '2011-10-31 00:00:00' ORDER BY a, tz; +a tz +2011-10-30 01:59:59 Moscow +2011-10-30 01:59:59 UTC +2011-10-30 02:00:00 MoscowD +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 MoscowD +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 MoscowD +2011-10-30 02:59:59 UTC +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 UTC +2011-10-30 03:00:00 Moscow +2011-10-30 03:00:00 UTC +2011-10-30 03:00:01 Moscow +2011-10-30 03:00:01 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 03:00:00' and '2011-10-31 00:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSD-1,p-2011-MSD-2,p-2012-MSK-1,p-2012-MSK-2 ALL NULL NULL NULL NULL 22 Using where; Using filesort +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 01:59:59' and '2011-10-31 00:00:00' ORDER BY a, tz; +a tz +2011-10-30 01:59:59 Moscow +2011-10-30 01:59:59 UTC +2011-10-30 02:00:00 MoscowD +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 MoscowD +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 MoscowD +2011-10-30 02:59:59 UTC +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 UTC +2011-10-30 03:00:00 Moscow +2011-10-30 03:00:00 UTC +2011-10-30 03:00:01 Moscow +2011-10-30 03:00:01 UTC +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 01:59:59' and '2011-10-31 00:00:00' ORDER BY a, tz; +id select_type table partitions type possible_keys key key_len ref rows Extra +1 SIMPLE t2 p-2011-MSD-1,p-2011-MSD-2,p-2012-MSK-1,p-2012-MSK-2 ALL NULL NULL NULL NULL 22 Using where; Using filesort +# Test end range changes +DELETE FROM t2 WHERE a = 0; +INSERT INTO t2 VALUES ('1970-01-01 00:00:00', 'Moscow'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +SELECT COUNT(*) FROM t2; +COUNT(*) +35 +SELECT COUNT(*) FROM t2 WHERE a = 0; +COUNT(*) +1 +SELECT * FROM t2 ORDER BY a, tz LIMIT 3; +a tz +NULL Moscow +NULL UTC +0000-00-00 00:00:00 Moscow +SELECT * FROM t2 ORDER BY a DESC, tz LIMIT 3; +a tz +2038-01-19 06:14:07 Moscow +2038-01-19 06:14:07 UTC +2038-01-19 06:14:06 Moscow +UPDATE t2 SET a = TIMESTAMPADD(SECOND, 1, a); +Warnings: +Warning 1299 Invalid TIMESTAMP value in column 'a' at row 8 +Warning 1299 Invalid TIMESTAMP value in column 'a' at row 9 +Warning 1264 Out of range value for column 'a' at row 34 +Warning 1264 Out of range value for column 'a' at row 35 +SELECT MIN(a), MAX(a) FROM t2; +MIN(a) MAX(a) +0000-00-00 00:00:00 2038-01-19 06:14:07 +SELECT COUNT(*) FROM t2; +COUNT(*) +35 +SELECT COUNT(*) FROM t2 WHERE a = 0; +COUNT(*) +2 +SELECT PARTITION_NAME, TABLE_ROWS FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_NAME = 't2'; +PARTITION_NAME TABLE_ROWS +p0 3 +p-2000 6 +p-2011-MSK 0 +p-2011-MSD-1 9 +p-2011-MSD-2 8 +p-2012-MSK-1 0 +p-2012-MSK-2 7 +pEnd 0 +pMax 2 +SELECT * FROM t2 ORDER BY a, tz; +a tz +NULL Moscow +NULL Moscow +NULL UTC +0000-00-00 00:00:00 Moscow +0000-00-00 00:00:00 UTC +1970-01-01 03:00:02 Moscow +1970-01-01 03:00:02 UTC +1974-02-05 21:28:17 Moscow +1974-02-06 00:28:17 UTC +2011-03-27 03:00:00 Moscow +2011-03-27 03:00:00 UTC +2011-03-27 03:00:01 Moscow +2011-03-27 03:00:01 MoscowI +2011-03-27 03:00:01 MoscowI +2011-03-27 03:00:01 MoscowI +2011-03-27 03:00:01 UTC +2011-03-27 03:00:02 Moscow +2011-03-27 03:00:02 UTC +2011-10-30 02:00:00 Moscow +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 MoscowD +2011-10-30 02:00:01 UTC +2011-10-30 02:00:01 UTC +2011-10-30 02:00:02 MoscowD +2011-10-30 02:00:02 UTC +2011-10-30 02:00:02 UTC +2011-10-30 03:00:00 MoscowD +2011-10-30 03:00:00 UTC +2011-10-30 03:00:00 UTC +2011-10-30 03:00:01 Moscow +2011-10-30 03:00:01 UTC +2011-10-30 03:00:02 Moscow +2011-10-30 03:00:02 UTC +2038-01-19 06:14:07 Moscow +2038-01-19 06:14:07 UTC +# Test start range changes +INSERT INTO t2 VALUES ('1970-01-01 00:00:00', 'Moscow'); +Warnings: +Warning 1264 Out of range value for column 'a' at row 1 +SELECT COUNT(*) FROM t2; +COUNT(*) +36 +SELECT COUNT(*) FROM t2 WHERE a = 0; +COUNT(*) +3 +SELECT * FROM t2 ORDER BY a, tz LIMIT 3; +a tz +NULL Moscow +NULL Moscow +NULL UTC +SELECT * FROM t2 ORDER BY a DESC, tz LIMIT 3; +a tz +2038-01-19 06:14:07 Moscow +2038-01-19 06:14:07 UTC +2011-10-30 03:00:02 Moscow +UPDATE t2 SET a = TIMESTAMPADD(SECOND, -1, a); +Warnings: +Warning 1299 Invalid TIMESTAMP value in column 'a' at row 18 +Warning 1299 Invalid TIMESTAMP value in column 'a' at row 19 +SELECT MIN(a), MAX(a) FROM t2; +MIN(a) MAX(a) +1970-01-01 03:00:01 2038-01-19 06:14:06 +SELECT COUNT(*) FROM t2; +COUNT(*) +36 +SELECT COUNT(*) FROM t2 WHERE a = 0; +COUNT(*) +0 +SELECT PARTITION_NAME, TABLE_ROWS FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_NAME = 't2'; +PARTITION_NAME TABLE_ROWS +p0 6 +p-2000 4 +p-2011-MSK 0 +p-2011-MSD-1 11 +p-2011-MSD-2 9 +p-2012-MSK-1 0 +p-2012-MSK-2 4 +pEnd 2 +pMax 0 +SELECT * FROM t2 ORDER BY a, tz; +a tz +NULL Moscow +NULL Moscow +NULL Moscow +NULL Moscow +NULL UTC +NULL UTC +1970-01-01 03:00:01 Moscow +1970-01-01 03:00:01 UTC +1974-02-05 21:28:16 Moscow +1974-02-06 00:28:16 UTC +2011-03-27 03:00:00 Moscow +2011-03-27 03:00:00 Moscow +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 MoscowI +2011-03-27 03:00:00 UTC +2011-03-27 03:00:00 UTC +2011-03-27 03:00:01 Moscow +2011-03-27 03:00:01 UTC +2011-10-30 01:59:59 Moscow +2011-10-30 01:59:59 UTC +2011-10-30 02:00:00 MoscowD +2011-10-30 02:00:00 UTC +2011-10-30 02:00:00 UTC +2011-10-30 02:00:01 MoscowD +2011-10-30 02:00:01 UTC +2011-10-30 02:00:01 UTC +2011-10-30 02:59:59 MoscowD +2011-10-30 02:59:59 UTC +2011-10-30 02:59:59 UTC +2011-10-30 03:00:00 Moscow +2011-10-30 03:00:00 UTC +2011-10-30 03:00:01 Moscow +2011-10-30 03:00:01 UTC +2038-01-19 06:14:06 Moscow +2038-01-19 06:14:06 UTC +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `a` timestamp NULL DEFAULT NULL, + `tz` varchar(16) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +/*!50100 PARTITION BY RANGE (UNIX_TIMESTAMP(a)) +(PARTITION p0 VALUES LESS THAN (0) ENGINE = MyISAM, + PARTITION `p-2000` VALUES LESS THAN (946684800) ENGINE = MyISAM, + PARTITION `p-2011-MSK` VALUES LESS THAN (1301180400) ENGINE = MyISAM, + PARTITION `p-2011-MSD-1` VALUES LESS THAN (1319925600) ENGINE = MyISAM, + PARTITION `p-2011-MSD-2` VALUES LESS THAN (1319929200) ENGINE = MyISAM, + PARTITION `p-2012-MSK-1` VALUES LESS THAN (1319932800) ENGINE = MyISAM, + PARTITION `p-2012-MSK-2` VALUES LESS THAN (1332630000) ENGINE = MyISAM, + PARTITION pEnd VALUES LESS THAN (2147483647) ENGINE = MyISAM, + PARTITION pMax VALUES LESS THAN MAXVALUE ENGINE = MyISAM) */ +TRUNCATE TABLE t2; +DROP TABLE t1, t2; +SET @@session.time_zone= @old_time_zone; diff --git a/mysql-test/t/partition.test b/mysql-test/t/partition.test index 0151820cef9..0630dd31714 100644 --- a/mysql-test/t/partition.test +++ b/mysql-test/t/partition.test @@ -112,10 +112,16 @@ INSERT INTO t1 VALUES ('2009-07-14 17:35:55', 'pmax'); INSERT INTO t1 VALUES ('2009-09-21 17:31:42', 'pmax'); SELECT * FROM t1; +SELECT * FROM t1 where a between '2007-01-01' and '2007-08-01'; +EXPLAIN PARTITIONS SELECT * FROM t1 where a between '2007-01-01' and '2007-08-01'; +EXPLAIN PARTITIONS SELECT * FROM t1 where a = '2007-07-30 17:35:48'; ALTER TABLE t1 REORGANIZE PARTITION pmax INTO ( PARTITION p3 VALUES LESS THAN (1247688000), PARTITION pmax VALUES LESS THAN MAXVALUE); SELECT * FROM t1; +SELECT * FROM t1 where a between '2007-01-01' and '2007-08-01'; +EXPLAIN PARTITIONS SELECT * FROM t1 where a between '2007-01-01' and '2007-08-01'; +EXPLAIN PARTITIONS SELECT * FROM t1 where a = '2007-07-30 17:35:48'; SHOW CREATE TABLE t1; DROP TABLE t1; diff --git a/mysql-test/t/partition_datatype.test b/mysql-test/t/partition_datatype.test index 7440a9bf3a3..e5bde3edbef 100644 --- a/mysql-test/t/partition_datatype.test +++ b/mysql-test/t/partition_datatype.test @@ -6,6 +6,7 @@ # Partitions: crash if varchar length > 65530 # -- source include/have_partition.inc +-- source include/have_innodb.inc --disable_warnings drop table if exists t1; @@ -230,3 +231,312 @@ show create table t1; insert into t1 values (1),(4),(7),(10),(13),(16),(19),(22),(25),(28),(31),(34); select hex(a) from t1 where a = 7; drop table t1; + +--echo # +--echo # Bug#28928: UNIX_TIMESTAMP() should be considered unary monotonic +--echo # by partition pruning +SET @old_time_zone= @@session.time_zone; +SET @@session.time_zone = 'UTC'; +--echo # Using MyISAM to get stable values on TABLE_ROWS in I_S.PARTITIONS +CREATE TABLE t1 +(a TIMESTAMP NULL, + tz varchar(16)) +ENGINE = MyISAM; +CREATE TABLE t2 LIKE t1; +ALTER TABLE t2 PARTITION BY RANGE (UNIX_TIMESTAMP(a)) +(PARTITION `p0` VALUES LESS THAN (0), + PARTITION `p-2000` VALUES LESS THAN (UNIX_TIMESTAMP('2000-01-01')), + PARTITION `p-2011-MSK` VALUES LESS THAN (UNIX_TIMESTAMP('2011-03-26 23:00:00')), + PARTITION `p-2011-MSD-1` VALUES LESS THAN (UNIX_TIMESTAMP('2011-10-29 22:00:00')), + PARTITION `p-2011-MSD-2` VALUES LESS THAN (UNIX_TIMESTAMP('2011-10-29 23:00:00')), + PARTITION `p-2012-MSK-1` VALUES LESS THAN (UNIX_TIMESTAMP('2011-10-30 00:00:00')), + PARTITION `p-2012-MSK-2` VALUES LESS THAN (UNIX_TIMESTAMP('2012-03-24 23:00:00')), + PARTITION `pEnd` VALUES LESS THAN (UNIX_TIMESTAMP('2038-01-19 03:14:07')), + PARTITION `pMax` VALUES LESS THAN MAXVALUE); + + +--echo # Test 'odd' values +INSERT INTO t1 VALUES (NULL, 'UTC'); +INSERT INTO t1 VALUES ('0000-00-00 00:00:00', 'UTC'); +--echo # Test invalid values +INSERT INTO t1 VALUES ('1901-01-01 00:00:00', 'UTCI'); +INSERT INTO t1 VALUES ('1969-12-31 23:59:59', 'UTCI'); +INSERT INTO t1 VALUES ('2038-01-19 03:14:08', 'UTCI'); +INSERT INTO t1 VALUES ('1970-01-01 00:00:00', 'UTCI'); +--echo # Test start range +INSERT INTO t1 VALUES ('1970-01-01 00:00:01', 'UTC'); +INSERT INTO t1 VALUES ('1974-02-05 21:28:16', 'UTC'); +--echo # Test end range +INSERT INTO t1 VALUES ('2038-01-19 03:14:06', 'UTC'); +INSERT INTO t1 VALUES ('2038-01-19 03:14:07', 'UTC'); +--echo # Test Daylight saving shift +INSERT INTO t1 VALUES ('2011-03-26 22:59:59', 'UTC'); +INSERT INTO t1 VALUES ('2011-03-26 23:00:00', 'UTC'); +INSERT INTO t1 VALUES ('2011-03-26 23:00:01', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 21:59:59', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 22:00:00', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 22:00:01', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 22:59:59', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 23:00:00', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 23:00:01', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-29 23:59:59', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-30 00:00:00', 'UTC'); +INSERT INTO t1 VALUES ('2011-10-30 00:00:01', 'UTC'); + +SET @@session.time_zone = 'Europe/Moscow'; + +--echo # Test 'odd' values +INSERT INTO t1 VALUES (NULL, 'Moscow'); +INSERT INTO t1 VALUES ('0000-00-00 00:00:00', 'Moscow'); +--echo # Test invalid values +INSERT INTO t1 VALUES ('0000-00-00 03:00:00', 'MoscowI'); +INSERT INTO t1 VALUES ('1901-01-01 00:00:00', 'MoscowI'); +INSERT INTO t1 VALUES ('1969-12-31 23:59:59', 'MoscowI'); +INSERT INTO t1 VALUES ('1970-01-01 02:29:29', 'MoscowI'); +INSERT INTO t1 VALUES ('2038-01-19 06:14:08', 'MoscowI'); +INSERT INTO t1 VALUES ('1970-01-01 03:00:00', 'MoscowI'); +--echo # values truncated to 03:00:00 due to daylight saving shift +INSERT INTO t1 VALUES ('2011-03-27 02:00:00', 'MoscowI'); +INSERT INTO t1 VALUES ('2011-03-27 02:00:01', 'MoscowI'); +INSERT INTO t1 VALUES ('2011-03-27 02:59:59', 'MoscowI'); +--echo # Test start range +INSERT INTO t1 VALUES ('1970-01-01 03:00:01', 'Moscow'); +INSERT INTO t1 VALUES ('1974-02-05 21:28:16', 'Moscow'); +--echo # Test end range +INSERT INTO t1 VALUES ('2038-01-19 06:14:06', 'Moscow'); +INSERT INTO t1 VALUES ('2038-01-19 06:14:07', 'Moscow'); +--echo # Test Daylight saving shift +INSERT INTO t1 VALUES ('2011-03-27 01:59:59', 'Moscow'); +INSERT INTO t1 VALUES ('2011-03-27 03:00:00', 'Moscow'); +INSERT INTO t1 VALUES ('2011-03-27 03:00:01', 'Moscow'); +INSERT INTO t1 VALUES ('2011-10-30 01:59:59', 'Moscow'); +--echo # All values between 02:00 and 02:59:59 will be interpretated as DST +INSERT INTO t1 VALUES ('2011-10-30 02:00:00', 'MoscowD'); +INSERT INTO t1 VALUES ('2011-10-30 02:00:01', 'MoscowD'); +INSERT INTO t1 VALUES ('2011-10-30 02:59:59', 'MoscowD'); +INSERT INTO t1 VALUES ('2011-10-30 03:00:00', 'Moscow'); +INSERT INTO t1 VALUES ('2011-10-30 03:00:01', 'Moscow'); + + +SET @@session.time_zone = 'UTC'; + +INSERT INTO t2 SELECT * FROM t1; + +SELECT PARTITION_NAME, TABLE_ROWS FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_NAME = 't2'; + +SELECT * FROM t1 ORDER BY a, tz; +SELECT * FROM t2 ORDER BY a, tz; + +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-26 23:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-26 23:00:00' ORDER BY a, tz; + +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-26 22:59:59' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-26 22:59:59' ORDER BY a, tz; + + +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 22:59:59' and '2011-03-28 00:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 22:59:59' and '2011-03-28 00:00:00' ORDER BY a, tz; + +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 23:00:00' and '2011-03-28 00:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 23:00:00' and '2011-03-28 00:00:00' ORDER BY a, tz; + + + +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 23:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 23:00:00' ORDER BY a, tz; + +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 22:59:59' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 22:59:59' ORDER BY a, tz; + + +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 22:59:59' and '2011-10-31 00:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 22:59:59' and '2011-10-31 00:00:00' ORDER BY a, tz; + +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 23:00:00' and '2011-10-31 00:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 23:00:00' and '2011-10-31 00:00:00' ORDER BY a, tz; + + +--echo # Test end range changes +DELETE FROM t2 WHERE a = 0; +INSERT INTO t2 VALUES ('1970-01-01 00:00:00', 'UTC'); +SELECT COUNT(*) FROM t2; +SELECT COUNT(*) FROM t2 WHERE a = 0; +SELECT * FROM t2 ORDER BY a, tz LIMIT 3; +SELECT * FROM t2 ORDER BY a DESC, tz LIMIT 3; +UPDATE t2 SET a = TIMESTAMPADD(SECOND, 1, a); +SELECT MIN(a), MAX(a) FROM t2; +SELECT COUNT(*) FROM t2; +SELECT COUNT(*) FROM t2 WHERE a = 0; +SELECT PARTITION_NAME, TABLE_ROWS FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_NAME = 't2'; +SELECT * FROM t2 ORDER BY a, tz; + +--echo # Test start range changes +INSERT INTO t2 VALUES ('1970-01-01 00:00:00', 'UTC'); +SELECT COUNT(*) FROM t2; +SELECT COUNT(*) FROM t2 WHERE a = 0; +SELECT * FROM t2 ORDER BY a, tz LIMIT 3; +SELECT * FROM t2 ORDER BY a DESC, tz LIMIT 3; +UPDATE t2 SET a = TIMESTAMPADD(SECOND, -1, a); +SELECT MIN(a), MAX(a) FROM t2; +SELECT COUNT(*) FROM t2; +SELECT COUNT(*) FROM t2 WHERE a = 0; +SELECT PARTITION_NAME, TABLE_ROWS FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_NAME = 't2'; +SELECT * FROM t2 ORDER BY a, tz; + +SHOW CREATE TABLE t2; +TRUNCATE TABLE t2; + +SET @@session.time_zone = 'Europe/Moscow'; + +INSERT INTO t2 SELECT * FROM t1; + +SELECT PARTITION_NAME, TABLE_ROWS FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_NAME = 't2'; + +SELECT * FROM t1 ORDER BY a, tz; +SELECT * FROM t2 ORDER BY a, tz; + +--echo # Testing the leap from 01:59:59 to 03:00:00 +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-27 03:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-27 03:00:00' ORDER BY a, tz; + +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-27 01:59:59' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-01 00:00:00' and '2011-03-27 01:59:59' ORDER BY a, tz; + + +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 01:59:59' and '2011-03-28 00:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 01:59:59' and '2011-03-28 00:00:00' ORDER BY a, tz; + +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 03:00:00' and '2011-03-28 00:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-03-26 03:00:00' and '2011-03-28 00:00:00' ORDER BY a, tz; + + + +--echo # Testing the leap from 02:59:59 to 02:00:00 +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 02:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 02:00:00' ORDER BY a, tz; + +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 02:59:59' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 02:59:59' ORDER BY a, tz; + + +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 03:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 03:00:00' ORDER BY a, tz; + +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 01:59:59' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-01 00:00:00' and '2011-10-29 01:59:59' ORDER BY a, tz; + + + +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 02:00:00' and '2011-10-31 00:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 02:00:00' and '2011-10-31 00:00:00' ORDER BY a, tz; + +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 02:59:59' and '2011-10-31 00:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 02:59:59' and '2011-10-31 00:00:00' ORDER BY a, tz; + + +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 03:00:00' and '2011-10-31 00:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 03:00:00' and '2011-10-31 00:00:00' ORDER BY a, tz; + +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 01:59:59' and '2011-10-31 00:00:00' ORDER BY a, tz; +EXPLAIN PARTITIONS +SELECT * FROM t2 +WHERE a BETWEEN '2011-10-29 01:59:59' and '2011-10-31 00:00:00' ORDER BY a, tz; + + + +--echo # Test end range changes +DELETE FROM t2 WHERE a = 0; +INSERT INTO t2 VALUES ('1970-01-01 00:00:00', 'Moscow'); +SELECT COUNT(*) FROM t2; +SELECT COUNT(*) FROM t2 WHERE a = 0; +SELECT * FROM t2 ORDER BY a, tz LIMIT 3; +SELECT * FROM t2 ORDER BY a DESC, tz LIMIT 3; +UPDATE t2 SET a = TIMESTAMPADD(SECOND, 1, a); +SELECT MIN(a), MAX(a) FROM t2; +SELECT COUNT(*) FROM t2; +SELECT COUNT(*) FROM t2 WHERE a = 0; +SELECT PARTITION_NAME, TABLE_ROWS FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_NAME = 't2'; +SELECT * FROM t2 ORDER BY a, tz; + +--echo # Test start range changes +INSERT INTO t2 VALUES ('1970-01-01 00:00:00', 'Moscow'); +SELECT COUNT(*) FROM t2; +SELECT COUNT(*) FROM t2 WHERE a = 0; +SELECT * FROM t2 ORDER BY a, tz LIMIT 3; +SELECT * FROM t2 ORDER BY a DESC, tz LIMIT 3; +UPDATE t2 SET a = TIMESTAMPADD(SECOND, -1, a); +SELECT MIN(a), MAX(a) FROM t2; +SELECT COUNT(*) FROM t2; +SELECT COUNT(*) FROM t2 WHERE a = 0; +SELECT PARTITION_NAME, TABLE_ROWS FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_NAME = 't2'; +SELECT * FROM t2 ORDER BY a, tz; + +SHOW CREATE TABLE t2; +TRUNCATE TABLE t2; + +DROP TABLE t1, t2; +SET @@session.time_zone= @old_time_zone; diff --git a/sql/item_timefunc.cc b/sql/item_timefunc.cc index 6335199b8de..c986ec6c00b 100644 --- a/sql/item_timefunc.cc +++ b/sql/item_timefunc.cc @@ -1316,6 +1316,26 @@ longlong Item_func_unix_timestamp::val_int() return (longlong) TIME_to_timestamp(current_thd, <ime, ¬_used); } +enum_monotonicity_info Item_func_unix_timestamp::get_monotonicity_info() const +{ + if (args[0]->type() == Item::FIELD_ITEM && + (args[0]->field_type() == MYSQL_TYPE_TIMESTAMP)) + return MONOTONIC_INCREASING; + return NON_MONOTONIC; +} + + +longlong Item_func_unix_timestamp::val_int_endpoint(bool left_endp, bool *incl_endp) +{ + DBUG_ASSERT(fixed == 1); + DBUG_ASSERT(arg_count == 1 && + args[0]->type() == Item::FIELD_ITEM && + args[0]->field_type() == MYSQL_TYPE_TIMESTAMP); + Field *field=((Item_field*) args[0])->field; + /* Leave the incl_endp intact */ + return ((Field_timestamp*) field)->get_timestamp(&null_value); +} + longlong Item_func_time_to_sec::val_int() { diff --git a/sql/item_timefunc.h b/sql/item_timefunc.h index 9c1ac512bcb..c366c824f13 100644 --- a/sql/item_timefunc.h +++ b/sql/item_timefunc.h @@ -349,6 +349,8 @@ public: Item_func_unix_timestamp(Item *a) :Item_int_func(a) {} longlong val_int(); const char *func_name() const { return "unix_timestamp"; } + enum_monotonicity_info get_monotonicity_info() const; + longlong val_int_endpoint(bool left_endp, bool *incl_endp); bool check_partition_func_processor(uchar *int_arg) {return FALSE;} /* UNIX_TIMESTAMP() depends on the current timezone From 044bf3b6b3235cf2e87dbe2f500ccfab9612f5dd Mon Sep 17 00:00:00 2001 From: Mattias Jonsson Date: Fri, 29 Apr 2011 13:00:16 +0200 Subject: [PATCH 02/35] bug#11765667: bug#58655: ASSERTION FAILED, SERVER CRASHES WITH MYSQLD GOT SIGNAL 6 The partitioning engine checked the auto_increment column even if it was not to be written, triggering a DBUG_ASSERT. Fixed by checking if table->write_set for that column was set. --- mysql-test/r/partition.result | 29 +++++++++++++++++++++++++++++ mysql-test/t/partition.test | 31 +++++++++++++++++++++++++++++++ sql/ha_partition.cc | 11 ++++++++--- 3 files changed, 68 insertions(+), 3 deletions(-) diff --git a/mysql-test/r/partition.result b/mysql-test/r/partition.result index 138264fd4e1..379f7499e11 100644 --- a/mysql-test/r/partition.result +++ b/mysql-test/r/partition.result @@ -1,5 +1,34 @@ drop table if exists t1, t2; # +# Bug#11765667: bug#58655: ASSERTION FAILED, +# SERVER CRASHES WITH MYSQLD GOT SIGNAL 6 +# +CREATE TABLE t1 ( +id MEDIUMINT NOT NULL AUTO_INCREMENT, +dt DATE, st VARCHAR(255), uid INT, +id2nd LONGBLOB, filler VARCHAR(255), PRIMARY KEY(id, dt) +); +INSERT INTO t1 (dt, st, uid, id2nd, filler) VALUES +('1991-03-14', 'Initial Insert', 200, 1234567, 'No Data'), +('1991-02-26', 'Initial Insert', 201, 1234567, 'No Data'), +('1992-03-16', 'Initial Insert', 234, 1234567, 'No Data'), +('1992-07-02', 'Initial Insert', 287, 1234567, 'No Data'), +('1991-05-26', 'Initial Insert', 256, 1234567, 'No Data'), +('1991-04-25', 'Initial Insert', 222, 1234567, 'No Data'), +('1993-03-12', 'Initial Insert', 267, 1234567, 'No Data'), +('1993-03-14', 'Initial Insert', 291, 1234567, 'No Data'), +('1991-12-20', 'Initial Insert', 298, 1234567, 'No Data'), +('1994-10-31', 'Initial Insert', 220, 1234567, 'No Data'); +ALTER TABLE t1 PARTITION BY LIST (YEAR(dt)) ( +PARTITION d1 VALUES IN (1991, 1994), +PARTITION d2 VALUES IN (1993), +PARTITION d3 VALUES IN (1992, 1995, 1996) +); +INSERT INTO t1 (dt, st, uid, id2nd, filler) VALUES +('1991-07-14', 'After Partitioning Insert', 299, 1234567, 'Insert row'); +UPDATE t1 SET filler='Updating the row' WHERE uid=298; +DROP TABLE t1; +# # Bug#59297: Can't find record in 'tablename' on update inner join # CREATE TABLE t1 ( diff --git a/mysql-test/t/partition.test b/mysql-test/t/partition.test index 7a0a5558d32..341e8780b2b 100644 --- a/mysql-test/t/partition.test +++ b/mysql-test/t/partition.test @@ -14,6 +14,37 @@ drop table if exists t1, t2; --enable_warnings +--echo # +--echo # Bug#11765667: bug#58655: ASSERTION FAILED, +--echo # SERVER CRASHES WITH MYSQLD GOT SIGNAL 6 +--echo # +CREATE TABLE t1 ( + id MEDIUMINT NOT NULL AUTO_INCREMENT, + dt DATE, st VARCHAR(255), uid INT, + id2nd LONGBLOB, filler VARCHAR(255), PRIMARY KEY(id, dt) +); +INSERT INTO t1 (dt, st, uid, id2nd, filler) VALUES + ('1991-03-14', 'Initial Insert', 200, 1234567, 'No Data'), + ('1991-02-26', 'Initial Insert', 201, 1234567, 'No Data'), + ('1992-03-16', 'Initial Insert', 234, 1234567, 'No Data'), + ('1992-07-02', 'Initial Insert', 287, 1234567, 'No Data'), + ('1991-05-26', 'Initial Insert', 256, 1234567, 'No Data'), + ('1991-04-25', 'Initial Insert', 222, 1234567, 'No Data'), + ('1993-03-12', 'Initial Insert', 267, 1234567, 'No Data'), + ('1993-03-14', 'Initial Insert', 291, 1234567, 'No Data'), + ('1991-12-20', 'Initial Insert', 298, 1234567, 'No Data'), + ('1994-10-31', 'Initial Insert', 220, 1234567, 'No Data'); +ALTER TABLE t1 PARTITION BY LIST (YEAR(dt)) ( + PARTITION d1 VALUES IN (1991, 1994), + PARTITION d2 VALUES IN (1993), + PARTITION d3 VALUES IN (1992, 1995, 1996) +); +INSERT INTO t1 (dt, st, uid, id2nd, filler) VALUES + ('1991-07-14', 'After Partitioning Insert', 299, 1234567, 'Insert row'); +UPDATE t1 SET filler='Updating the row' WHERE uid=298; + +DROP TABLE t1; + --echo # --echo # Bug#59297: Can't find record in 'tablename' on update inner join --echo # diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc index d09181822ee..82e38be2238 100644 --- a/sql/ha_partition.cc +++ b/sql/ha_partition.cc @@ -3403,15 +3403,19 @@ int ha_partition::update_row(const uchar *old_data, uchar *new_data) exit: /* - if updating an auto_increment column, update + If updating an auto_increment column, update table_share->ha_data->next_auto_inc_val if needed. (not to be used if auto_increment on secondary field in a multi-column index) mysql_update does not set table->next_number_field, so we use table->found_next_number_field instead. + Also checking that the field is marked in the write set. */ - if (table->found_next_number_field && new_data == table->record[0] && - !table->s->next_number_keypart) + if (table->found_next_number_field && + new_data == table->record[0] && + !table->s->next_number_keypart && + bitmap_is_set(table->write_set, + table->found_next_number_field->field_index)) { HA_DATA_PARTITION *ha_data= (HA_DATA_PARTITION*) table_share->ha_data; if (!ha_data->auto_inc_initialized) @@ -3974,6 +3978,7 @@ void ha_partition::position(const uchar *record) void ha_partition::column_bitmaps_signal() { handler::column_bitmaps_signal(); + /* Must read all partition fields to make position() call possible */ bitmap_union(table->read_set, &m_part_info->full_part_field_set); } From 34142b84c69fd67e8d50157d9d3356dcef30d154 Mon Sep 17 00:00:00 2001 From: Mayank Prasad Date: Tue, 14 Jun 2011 00:11:24 +0530 Subject: [PATCH 03/35] BUG#12561297:LIBMYSQLD/EXAMPLE/MYSQL_EMBEDDED IS ABORTING. Issue: When libmysqld/example/mysql_embedded is executed, it was getting abort. Its a regression as it was working in 5.1 and failed in 5.5. Issue is there because remaining_argc/remaining_argv were not getting assigned correctly in init_embedded_server() which were being used later in init_common_variable(). Solution: Rectified code to pass correct argc/argv to be used in init_common_variable(). libmysqld/lib_sql.cc: Rectified remaining_argc/remaining_argv assignment. mysql-test/r/mysql_embedded.result: Result file for the test case added. mysql-test/t/mysql_embedded.test: Added test case to verify libmysqld/example/mysql_embedded works. --- libmysqld/lib_sql.cc | 4 ++-- mysql-test/r/mysql_embedded.result | 5 +++++ mysql-test/t/mysql_embedded.test | 6 ++++++ 3 files changed, 13 insertions(+), 2 deletions(-) create mode 100644 mysql-test/r/mysql_embedded.result create mode 100644 mysql-test/t/mysql_embedded.test diff --git a/libmysqld/lib_sql.cc b/libmysqld/lib_sql.cc index 34f8f685e45..eed5e90edcd 100644 --- a/libmysqld/lib_sql.cc +++ b/libmysqld/lib_sql.cc @@ -531,8 +531,8 @@ int init_embedded_server(int argc, char **argv, char **groups) return 1; defaults_argc= *argcp; defaults_argv= *argvp; - remaining_argc= argc; - remaining_argv= argv; + remaining_argc= *argcp; + remaining_argv= *argvp; /* Must be initialized early for comparison of options name */ system_charset_info= &my_charset_utf8_general_ci; diff --git a/mysql-test/r/mysql_embedded.result b/mysql-test/r/mysql_embedded.result new file mode 100644 index 00000000000..3ba79a01e44 --- /dev/null +++ b/mysql-test/r/mysql_embedded.result @@ -0,0 +1,5 @@ +# +# Bug#12561297 : LIBMYSQLD/EXAMPLE/MYSQL_EMBEDDED IS ABORTING. +# +1 +1 diff --git a/mysql-test/t/mysql_embedded.test b/mysql-test/t/mysql_embedded.test new file mode 100644 index 00000000000..b4c8c6be05f --- /dev/null +++ b/mysql-test/t/mysql_embedded.test @@ -0,0 +1,6 @@ +--echo # +--echo # Bug#12561297 : LIBMYSQLD/EXAMPLE/MYSQL_EMBEDDED IS ABORTING. +--echo # + +--source include/is_embedded.inc +--exec $MYSQL_TEST_DIR/../libmysqld/examples/mysql_embedded -e 'select 1' From 7eddf251bae7d8d0497c7221a8b646da5321732e Mon Sep 17 00:00:00 2001 From: Anitha Gopi Date: Wed, 15 Jun 2011 08:33:13 +0530 Subject: [PATCH 04/35] Bug#12561297 : Disable test main.mysql_embedded since it is failing on all platforms --- mysql-test/t/disabled.def | 1 + 1 file changed, 1 insertion(+) diff --git a/mysql-test/t/disabled.def b/mysql-test/t/disabled.def index ebcfa6b1845..ce4131d359e 100644 --- a/mysql-test/t/disabled.def +++ b/mysql-test/t/disabled.def @@ -16,3 +16,4 @@ alter_table-big : Bug#11748731 2010-11-15 mattiasj was not tested create-big : Bug#11748731 2010-11-15 mattiasj was not tested archive-big : Bug#11817185 2011-03-10 Anitha Disabled since this leads to timeout on Solaris Sparc log_tables-big : Bug#11756699 2010-11-15 mattiasj report already exists +mysql_embedded : Bug#12561297 2011-06-15 New test failing on all platforms From 44ed935b212d67d35c937037ff649f548737e0bc Mon Sep 17 00:00:00 2001 From: Dmitry Shulga Date: Wed, 15 Jun 2011 16:18:08 +0700 Subject: [PATCH 05/35] Fixed bug#12403662 (formerly known as bug#60987): LOAD DATA LOCAL INFILE can't parse relative paths "higher" than 3 levels up When trying to LOAD DATA LOCAL INFILE using a relative path with 3 or more levels up in the directory hierarchy, mysqld wrongly parses the path and as a consequence, can't find the file. This bug was introduced by patch for bug#58205. The reason for bug is that implementaiton of function cleanup_dirname() doesn't take into account the begin of buffer being processed during handling of path to file. mysys/mf_pack.c: function cleanup_dirname() was modified: fixed wrong comparison condition when handling substring "../" at the begining of the buffer. --- mysys/mf_pack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mysys/mf_pack.c b/mysys/mf_pack.c index 292cd8c0454..790936461a4 100644 --- a/mysys/mf_pack.c +++ b/mysys/mf_pack.c @@ -193,7 +193,7 @@ size_t cleanup_dirname(register char *to, const char *from) while (pos >= start && *pos != FN_LIBCHAR) /* remove prev dir */ pos--; if (pos[1] == FN_HOMELIB || - (pos > start && memcmp(pos, parent, length) == 0)) + (pos >= start && memcmp(pos, parent, length) == 0)) { /* Don't remove ~user/ */ pos=strmov(end_parentdir+1,parent); *pos=FN_LIBCHAR; From 4e4e09ea5083f4e7d88aa248c9424953c9162e0a Mon Sep 17 00:00:00 2001 From: Vinay Fisrekar Date: Thu, 16 Jun 2011 10:00:51 +0530 Subject: [PATCH 06/35] Skip test run with valgrind as test has restarts --- .../suite/innodb/t/innodb_prefix_index_restart_server.test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mysql-test/suite/innodb/t/innodb_prefix_index_restart_server.test b/mysql-test/suite/innodb/t/innodb_prefix_index_restart_server.test index 9d3c52206ff..587e6fe1f6b 100644 --- a/mysql-test/suite/innodb/t/innodb_prefix_index_restart_server.test +++ b/mysql-test/suite/innodb/t/innodb_prefix_index_restart_server.test @@ -9,6 +9,8 @@ # # ###################################################################### +# Don't test this under valgrind, memory leaks will occur due restart +--source include/not_valgrind.inc # Test restart the server and "shutdown_server" looks for pid file # which is not there with embedded mode --source include/not_embedded.inc From 5a0e7394a5ae0c7b6a1ea35b7ea3a8985325987a Mon Sep 17 00:00:00 2001 From: Jorgen Loland Date: Thu, 16 Jun 2011 08:24:00 +0200 Subject: [PATCH 07/35] BUG#11882110: UPDATE REPORTS ER_KEY_NOT_FOUND IF TABLE IS UPDATED TWICE For multi update it is not allowed to update a column of a table if that table is accessed through multiple aliases and either 1) the updated column is used as partitioning key 2) the updated column is part of the primary key and the primary key is clustered This check is done in unsafe_key_update(). The bug was that for case 2), it was checked whether updated_column_number == table_share->primary_key However, the primary_key variable is the index number of the primary key, not a column number. Prior to this bugfix, the first column was wrongly believed to be the primary key. The columns covered by an index is found in table->key_info[idx_number]->key_part. The bugfix is to check if any of the columns in the keyparts of the primary key are updated. The user-visible effect is that for storage engines with clustered primary key (e.g. InnoDB but not MyISAM) queries like "UPDATE t1 AS A JOIN t2 AS B SET A.primkey=..." will now error with "ERROR HY000: Primary key/partition key update is not allowed since the table is updated both as 'A' and 'B'." instead of "ERROR 1032 (HY000): Can't find record in 't1_tb'" even if primkey is not the first column in the table. This was the intended behavior of bugfix 11764529. mysql-test/r/multi_update.result: Add test for bug#11882110 mysql-test/r/multi_update_innodb.result: Add test for bug#11882110 mysql-test/t/multi_update.test: Add test for bug#11882110 mysql-test/t/multi_update_innodb.test: Add test for bug#11882110 sql/sql_update.cc: unsafe_key_update() wrongly checked if the primary key index number was the same as updated column number. Now it is checked whether any of the columns making up the primary key is updated. sql/table.h: Fix comment on TABLE_SHARE::primary_key. Incorrect comment was introduced by an earlier merge conflict (as per dlenev) --- mysql-test/r/multi_update.result | 38 +++++++++++++++++++- mysql-test/r/multi_update_innodb.result | 40 +++++++++++++++++++++ mysql-test/t/multi_update.test | 48 ++++++++++++++++++++++++- mysql-test/t/multi_update_innodb.test | 44 +++++++++++++++++++++++ sql/sql_update.cc | 30 ++++++++++------ sql/table.h | 4 +-- 6 files changed, 190 insertions(+), 14 deletions(-) diff --git a/mysql-test/r/multi_update.result b/mysql-test/r/multi_update.result index 5a3e7938121..c6ee170eef7 100644 --- a/mysql-test/r/multi_update.result +++ b/mysql-test/r/multi_update.result @@ -697,4 +697,40 @@ SELECT * FROM t1; pk a 1 2 DROP TABLE t1; -end of tests +# +# BUG#11882110: UPDATE REPORTS ER_KEY_NOT_FOUND IF TABLE IS +# UPDATED TWICE +# +CREATE TABLE t1 ( +col_int_key int, +pk int, +col_int int, +key(col_int_key), +primary key (pk) +) ENGINE=MyISAM; +INSERT INTO t1 VALUES (1,2,3); + +CREATE TABLE t2 ( +col_int_key int, +pk_1 int, +pk_2 int, +col_int int, +key(col_int_key), +primary key (pk_1,pk_2) +) ENGINE=MyISAM; +INSERT INTO t2 VALUES (1,2,3,4); + +UPDATE t1 AS A NATURAL JOIN t1 B SET A.pk=5,B.pk=7; + +SELECT * FROM t1; +col_int_key pk col_int +1 7 3 + +UPDATE t2 AS A NATURAL JOIN t2 B SET A.pk_1=5,B.pk_1=7; + +UPDATE t2 AS A NATURAL JOIN t2 B SET A.pk_2=10,B.pk_2=11; + +SELECT * FROM t2; +col_int_key pk_1 pk_2 col_int +1 7 11 4 +DROP TABLE t1,t2; diff --git a/mysql-test/r/multi_update_innodb.result b/mysql-test/r/multi_update_innodb.result index 12a94accc1f..643287c3a93 100644 --- a/mysql-test/r/multi_update_innodb.result +++ b/mysql-test/r/multi_update_innodb.result @@ -27,3 +27,43 @@ pk a b 0 1 2 DROP VIEW v1; DROP TABLE t1; +# +# BUG#11882110: UPDATE REPORTS ER_KEY_NOT_FOUND IF TABLE IS +# UPDATED TWICE +# +CREATE TABLE t1 ( +col_int_key int, +pk int, +col_int int, +key(col_int_key), +primary key (pk) +) ENGINE=InnoDB; +INSERT INTO t1 VALUES (1,2,3); + +CREATE TABLE t2 ( +col_int_key int, +pk_1 int, +pk_2 int, +col_int int, +key(col_int_key), +primary key (pk_1,pk_2) +) ENGINE=InnoDB; +INSERT INTO t2 VALUES (1,2,3,4); + +UPDATE t1 AS A NATURAL JOIN t1 B SET A.pk=5,B.pk=7; +ERROR HY000: Primary key/partition key update is not allowed since the table is updated both as 'A' and 'B'. + +SELECT * FROM t1; +col_int_key pk col_int +1 2 3 + +UPDATE t2 AS A NATURAL JOIN t2 B SET A.pk_1=5,B.pk_1=7; +ERROR HY000: Primary key/partition key update is not allowed since the table is updated both as 'A' and 'B'. + +UPDATE t2 AS A NATURAL JOIN t2 B SET A.pk_2=10,B.pk_2=11; +ERROR HY000: Primary key/partition key update is not allowed since the table is updated both as 'A' and 'B'. + +SELECT * FROM t2; +col_int_key pk_1 pk_2 col_int +1 2 3 4 +DROP TABLE t1,t2; diff --git a/mysql-test/t/multi_update.test b/mysql-test/t/multi_update.test index 496d045075a..4de574fbb0d 100644 --- a/mysql-test/t/multi_update.test +++ b/mysql-test/t/multi_update.test @@ -703,4 +703,50 @@ UPDATE t1 AS A, t1 AS B SET A.pk = 1, B.a = 2; SELECT * FROM t1; DROP TABLE t1; ---echo end of tests +--echo # +--echo # BUG#11882110: UPDATE REPORTS ER_KEY_NOT_FOUND IF TABLE IS +--echo # UPDATED TWICE +--echo # + +# Results differ between storage engines. This test is to verify that +# the bugfix did NOT change behavior for MyISAM. +# See multi_update_innodb.test for the InnoDB variant of this test +CREATE TABLE t1 ( + col_int_key int, + pk int, + col_int int, + key(col_int_key), + primary key (pk) +) ENGINE=MyISAM; +INSERT INTO t1 VALUES (1,2,3); + +--echo +CREATE TABLE t2 ( + col_int_key int, + pk_1 int, + pk_2 int, + col_int int, + key(col_int_key), + primary key (pk_1,pk_2) +) ENGINE=MyISAM; +INSERT INTO t2 VALUES (1,2,3,4); + +--echo +UPDATE t1 AS A NATURAL JOIN t1 B SET A.pk=5,B.pk=7; + +--echo +SELECT * FROM t1; + +--echo +UPDATE t2 AS A NATURAL JOIN t2 B SET A.pk_1=5,B.pk_1=7; +--echo +UPDATE t2 AS A NATURAL JOIN t2 B SET A.pk_2=10,B.pk_2=11; + +--echo +SELECT * FROM t2; + +DROP TABLE t1,t2; + + + + diff --git a/mysql-test/t/multi_update_innodb.test b/mysql-test/t/multi_update_innodb.test index 3148d009cef..51757c29553 100644 --- a/mysql-test/t/multi_update_innodb.test +++ b/mysql-test/t/multi_update_innodb.test @@ -31,3 +31,47 @@ SELECT * FROM t1; DROP VIEW v1; DROP TABLE t1; +--echo # +--echo # BUG#11882110: UPDATE REPORTS ER_KEY_NOT_FOUND IF TABLE IS +--echo # UPDATED TWICE +--echo # + +# Results differ between storage engines. +# See multi_update.test for the MyISAM variant of this test +CREATE TABLE t1 ( + col_int_key int, + pk int, + col_int int, + key(col_int_key), + primary key (pk) +) ENGINE=InnoDB; +INSERT INTO t1 VALUES (1,2,3); + +--echo +CREATE TABLE t2 ( + col_int_key int, + pk_1 int, + pk_2 int, + col_int int, + key(col_int_key), + primary key (pk_1,pk_2) +) ENGINE=InnoDB; +INSERT INTO t2 VALUES (1,2,3,4); + +--echo +--error ER_MULTI_UPDATE_KEY_CONFLICT +UPDATE t1 AS A NATURAL JOIN t1 B SET A.pk=5,B.pk=7; +--echo +SELECT * FROM t1; + +--echo +--error ER_MULTI_UPDATE_KEY_CONFLICT +UPDATE t2 AS A NATURAL JOIN t2 B SET A.pk_1=5,B.pk_1=7; +--echo +--error ER_MULTI_UPDATE_KEY_CONFLICT +UPDATE t2 AS A NATURAL JOIN t2 B SET A.pk_2=10,B.pk_2=11; + +--echo +SELECT * FROM t2; + +DROP TABLE t1,t2; diff --git a/sql/sql_update.cc b/sql/sql_update.cc index 59356540201..90c4697d663 100644 --- a/sql/sql_update.cc +++ b/sql/sql_update.cc @@ -1071,17 +1071,27 @@ bool unsafe_key_update(TABLE_LIST *leaves, table_map tables_for_update) return true; } - if (primkey_clustered && - (bitmap_is_set(table1->write_set, table1->s->primary_key) || - bitmap_is_set(table2->write_set, table2->s->primary_key))) + if (primkey_clustered) { - // Clustered primary key is updated - my_error(ER_MULTI_UPDATE_KEY_CONFLICT, MYF(0), - tl->belong_to_view ? tl->belong_to_view->alias - : tl->alias, - tl2->belong_to_view ? tl2->belong_to_view->alias - : tl2->alias); - return true; + // The primary key can cover multiple columns + KEY key_info= table1->key_info[table1->s->primary_key]; + KEY_PART_INFO *key_part= key_info.key_part; + KEY_PART_INFO *key_part_end= key_part + key_info.key_parts; + + for (;key_part != key_part_end; ++key_part) + { + if (bitmap_is_set(table1->write_set, key_part->fieldnr-1) || + bitmap_is_set(table2->write_set, key_part->fieldnr-1)) + { + // Clustered primary key is updated + my_error(ER_MULTI_UPDATE_KEY_CONFLICT, MYF(0), + tl->belong_to_view ? tl->belong_to_view->alias + : tl->alias, + tl2->belong_to_view ? tl2->belong_to_view->alias + : tl2->alias); + return true; + } + } } } } diff --git a/sql/table.h b/sql/table.h index afdf4f7baa9..81b42c5b013 100644 --- a/sql/table.h +++ b/sql/table.h @@ -627,8 +627,8 @@ struct TABLE_SHARE uint db_options_in_use; /* Options in use */ uint db_record_offset; /* if HA_REC_IN_SEQ */ uint rowid_field_offset; /* Field_nr +1 to rowid field */ - /* Index of auto-updated TIMESTAMP field in field array */ - uint primary_key; + /* Primary key index number, used in TABLE::key_info[] */ + uint primary_key; uint next_number_index; /* autoincrement key number */ uint next_number_key_offset; /* autoinc keypart offset in a key */ uint next_number_keypart; /* autoinc keypart number in a key */ From 5adeda3ae6dbf3b6b0b548b8a72cc4cb896784f9 Mon Sep 17 00:00:00 2001 From: Mayank Prasad Date: Thu, 16 Jun 2011 19:25:11 +0530 Subject: [PATCH 08/35] BUG#12561297:LIBMYSQLD/EXAMPLE/MYSQL_EMBEDDED IS ABORTING. Issue: ------ New test case mysql_embedded.test was failing on pb2. Description: ------------ To run this test case executable libmysqld/examples/mysql_embedded is required. But as per /libmysqld/examples/cmake_install.cmake this executable doesn't get copied to when mysql is installed at .That is the reason it was passing in my local branch and failed on pb2 when pushed. Solution; --------- Added code in mysql-test-run.pl, which will try to see if this file exists.If It doesn't exist, test case will be skipped with a skip message. New code in mysql-test-run.pl looks only for directory libmysqld/examples/mysql_embedded because this is the only place where this file could/does exist. mysql-test/mysql-test-run.pl: Added new variable for mysql_embedded executable. mysql-test/t/disabled.def: enabled mysql_embedded.test which was disabled earlier. mysql-test/t/mysql_embedded.test: Modified test case to first verify if mysql_embedded executable exists. If it does not, skip the test. --- mysql-test/mysql-test-run.pl | 4 ++++ mysql-test/t/disabled.def | 1 - mysql-test/t/mysql_embedded.test | 8 +++++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/mysql-test/mysql-test-run.pl b/mysql-test/mysql-test-run.pl index f2487130015..423518cd788 100755 --- a/mysql-test/mysql-test-run.pl +++ b/mysql-test/mysql-test-run.pl @@ -171,6 +171,7 @@ our $exe_mysql; our $exe_mysqladmin; our $exe_mysqltest; our $exe_libtool; +our $exe_mysql_embedded; our $opt_big_test= 0; @@ -1950,6 +1951,8 @@ sub executable_setup () { $exe_mysqladmin= mtr_exe_exists("$path_client_bindir/mysqladmin"); $exe_mysql= mtr_exe_exists("$path_client_bindir/mysql"); + $exe_mysql_embedded= mtr_exe_maybe_exists("$basedir/libmysqld/examples/mysql_embedded"); + if ( ! $opt_skip_ndbcluster ) { # Look for single threaded NDB @@ -2354,6 +2357,7 @@ sub environment_setup { $ENV{'MYSQLADMIN'}= native_path($exe_mysqladmin); $ENV{'MYSQL_CLIENT_TEST'}= mysql_client_test_arguments(); $ENV{'EXE_MYSQL'}= $exe_mysql; + $ENV{'MYSQL_EMBEDDED'}= $exe_mysql_embedded; # ---------------------------------------------------- # bug25714 executable may _not_ exist in diff --git a/mysql-test/t/disabled.def b/mysql-test/t/disabled.def index ce4131d359e..ebcfa6b1845 100644 --- a/mysql-test/t/disabled.def +++ b/mysql-test/t/disabled.def @@ -16,4 +16,3 @@ alter_table-big : Bug#11748731 2010-11-15 mattiasj was not tested create-big : Bug#11748731 2010-11-15 mattiasj was not tested archive-big : Bug#11817185 2011-03-10 Anitha Disabled since this leads to timeout on Solaris Sparc log_tables-big : Bug#11756699 2010-11-15 mattiasj report already exists -mysql_embedded : Bug#12561297 2011-06-15 New test failing on all platforms diff --git a/mysql-test/t/mysql_embedded.test b/mysql-test/t/mysql_embedded.test index b4c8c6be05f..30959b83017 100644 --- a/mysql-test/t/mysql_embedded.test +++ b/mysql-test/t/mysql_embedded.test @@ -3,4 +3,10 @@ --echo # --source include/is_embedded.inc ---exec $MYSQL_TEST_DIR/../libmysqld/examples/mysql_embedded -e 'select 1' + +# Test case require mysql_embedded to be present +if(!$MYSQL_EMBEDDED) +{ + --skip Test requires mysql_embedded executable +} +--exec $MYSQL_EMBEDDED -e 'select 1' From 5b225518ff2193834804ddb403ac6c0f9a265884 Mon Sep 17 00:00:00 2001 From: Dmitry Lenev Date: Thu, 16 Jun 2011 19:18:16 +0400 Subject: [PATCH 09/35] Fix for bug #12641342 - "61401: UPDATE PERFORMANCE DEGRADES GRADUALLY IF A TRIGGER EXISTS". This bug manifested itself in two ways: - Firstly execution of any data-changing statement which required prelocking (i.e. involved stored function or trigger) as part of transaction slowed down a bit all subsequent statements in this transaction. So performance in transaction which periodically involved such statements gradually degraded over time. - Secondly execution of any data-changing statement which required prelocking as part of transaction prevented concurrent FLUSH TABLES WITH READ LOCK from proceeding until the end of transaction instead of end of particular statement. The problem was caused by incorrect handling of metadata lock used in FTWRL implementation for statements requiring prelocked mode. Each statement which changes data acquires global IX lock with STATEMENT duration. This lock is supposed to block concurrent FTWRL from proceeding until the statement ends. When entering prelocked mode, durations of all metadata locks acquired so far were changed to EXPLICIT, to prevent substatements from releasing these locks. When prelocked mode was left, durations of metadata locks were changed to TRANSACTIONAL (with a few exceptions) so they can be properly released at the end of transaction. Unfortunately, this meant that the global IX lock blocking FTWRL with STATEMENT duration was moved to TRANSACTIONAL duration after execution of statement requiring prelocking. Since each subsequent statement that required prelocking and tried to acquire global IX lock with STATEMENT duration got a new instance of MDL_ticket, which was later moved to TRANSACTIONAL duration, this led to unwarranted growth of number of tickets with TRANSACITONAL duration in this connection's MDL_context. As result searching for other tickets in it became slow and acquisition of other metadata locks by this transaction started to hog CPU. Moreover, this also meant that after execution of statement requiring prelocking concurrent FTWRL was blocked until the end of transaction instead of end of statement. This patch solves this problem by not moving locks to EXPLICIT duration when thread enters prelocked mode (unless it is a real LOCK TABLES mode). This step turned out to be not really necessary as substatements don't try to release metadata locks. Consequently, the global IX lock blocking FTWRL keeps its STATEMENT duration and is properly released at the end of statement and the above issue goes away. mysql-test/r/flush.result: Added test for bug #12641342 - "61401: UPDATE PERFORMANCE DEGRADES GRADUALLY IF A TRIGGER EXISTS". mysql-test/t/flush.test: Added test for bug #12641342 - "61401: UPDATE PERFORMANCE DEGRADES GRADUALLY IF A TRIGGER EXISTS". sql/mdl.h: Added comment describing various types of metadata lock duration. sql/sql_class.cc: Since we no longer change duration of metadata locks to EXPLICIT when entering prelocked mode (unless it is a real LOCK TABLES) there is no need to restore proper duration of the locks when leaving prelocked mode. sql/sql_class.h: Do not change duration of metadata locks to EXPLICIT when entering prelocking mode (unless it is a real LOCK TABLES). This allows to avoid problems with restoring correct duration when leaving this mode. It is possible to do this as substatements won't release metadata locks in any case. sql/sql_parse.cc: Added assert checking that we won't release metadata locks when in substatement. --- mysql-test/r/flush.result | 23 +++++++++++++++++++++++ mysql-test/t/flush.test | 33 +++++++++++++++++++++++++++++++++ sql/mdl.h | 23 ++++++++++++++++++----- sql/sql_class.cc | 26 +++++++++++++++++--------- sql/sql_class.h | 14 +++++++++++++- sql/sql_parse.cc | 5 +++++ 6 files changed, 109 insertions(+), 15 deletions(-) diff --git a/mysql-test/r/flush.result b/mysql-test/r/flush.result index bbfea2dade8..e36cd8a478f 100644 --- a/mysql-test/r/flush.result +++ b/mysql-test/r/flush.result @@ -466,3 +466,26 @@ ALTER TABLE t1 COMMENT 'test'; ERROR HY000: Table 't1' was locked with a READ lock and can't be updated UNLOCK TABLES; DROP TABLE t1; +# +# Test for bug #12641342 - "61401: UPDATE PERFORMANCE DEGRADES +# GRADUALLY IF A TRIGGER EXISTS". +# +# One of side-effects of this bug was that a transaction which +# involved DML statements requiring prelocking blocked concurrent +# FLUSH TABLES WITH READ LOCK for the whole its duration, while +# correct behavior in this case is to block FTWRL only for duration +# of individual DML statements. +DROP TABLE IF EXISTS t1; +CREATE TABLE t1 (id INT PRIMARY KEY, value INT); +INSERT INTO t1 VALUES (1, 1); +CREATE TRIGGER t1_au AFTER UPDATE ON t1 FOR EACH ROW SET @var = "a"; +BEGIN; +UPDATE t1 SET value= value + 1 WHERE id = 1; +# Switching to connection 'con1'. +# The below FLUSH TABLES WITH READ LOCK should succeed and +# should not be blocked by the transaction in default connection. +FLUSH TABLES WITH READ LOCK; +UNLOCK TABLES; +# Switching to connection 'default'. +COMMIT; +DROP TABLE t1; diff --git a/mysql-test/t/flush.test b/mysql-test/t/flush.test index 52ee6d2cf87..a4f1f19af27 100644 --- a/mysql-test/t/flush.test +++ b/mysql-test/t/flush.test @@ -668,3 +668,36 @@ ALTER TABLE t1 COMMENT 'test'; UNLOCK TABLES; DROP TABLE t1; + + +--echo # +--echo # Test for bug #12641342 - "61401: UPDATE PERFORMANCE DEGRADES +--echo # GRADUALLY IF A TRIGGER EXISTS". +--echo # +--echo # One of side-effects of this bug was that a transaction which +--echo # involved DML statements requiring prelocking blocked concurrent +--echo # FLUSH TABLES WITH READ LOCK for the whole its duration, while +--echo # correct behavior in this case is to block FTWRL only for duration +--echo # of individual DML statements. +--disable_warnings +DROP TABLE IF EXISTS t1; +--enable_warnings +CREATE TABLE t1 (id INT PRIMARY KEY, value INT); +INSERT INTO t1 VALUES (1, 1); +CREATE TRIGGER t1_au AFTER UPDATE ON t1 FOR EACH ROW SET @var = "a"; +BEGIN; +UPDATE t1 SET value= value + 1 WHERE id = 1; + +--echo # Switching to connection 'con1'. +connect(con1, localhost, root); +--echo # The below FLUSH TABLES WITH READ LOCK should succeed and +--echo # should not be blocked by the transaction in default connection. +FLUSH TABLES WITH READ LOCK; +UNLOCK TABLES; +disconnect con1; +--source include/wait_until_disconnected.inc + +--echo # Switching to connection 'default'. +connection default; +COMMIT; +DROP TABLE t1; diff --git a/sql/mdl.h b/sql/mdl.h index e0934cb732b..39e47da61dd 100644 --- a/sql/mdl.h +++ b/sql/mdl.h @@ -152,11 +152,24 @@ enum enum_mdl_type { /** Duration of metadata lock. */ -enum enum_mdl_duration { MDL_STATEMENT= 0, - MDL_TRANSACTION, - MDL_EXPLICIT, - /* This should be the last ! */ - MDL_DURATION_END }; +enum enum_mdl_duration { + /** + Locks with statement duration are automatically released at the end + of statement or transaction. + */ + MDL_STATEMENT= 0, + /** + Locks with transaction duration are automatically released at the end + of transaction. + */ + MDL_TRANSACTION, + /** + Locks with explicit duration survive the end of statement and transaction. + They have to be released explicitly by calling MDL_context::release_lock(). + */ + MDL_EXPLICIT, + /* This should be the last ! */ + MDL_DURATION_END }; /** Maximal length of key for metadata locking subsystem. */ diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 7c2ed133f70..43575d135ee 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -3790,16 +3790,24 @@ void THD::set_mysys_var(struct st_my_thread_var *new_mysys_var) void THD::leave_locked_tables_mode() { + if (locked_tables_mode == LTM_LOCK_TABLES) + { + /* + When leaving LOCK TABLES mode we have to change the duration of most + of the metadata locks being held, except for HANDLER and GRL locks, + to transactional for them to be properly released at UNLOCK TABLES. + */ + mdl_context.set_transaction_duration_for_all_locks(); + /* + Make sure we don't release the global read lock and commit blocker + when leaving LTM. + */ + global_read_lock.set_explicit_lock_duration(this); + /* Also ensure that we don't release metadata locks for open HANDLERs. */ + if (handler_tables_hash.records) + mysql_ha_set_explicit_lock_duration(this); + } locked_tables_mode= LTM_NONE; - mdl_context.set_transaction_duration_for_all_locks(); - /* - Make sure we don't release the global read lock and commit blocker - when leaving LTM. - */ - global_read_lock.set_explicit_lock_duration(this); - /* Also ensure that we don't release metadata locks for open HANDLERs. */ - if (handler_tables_hash.records) - mysql_ha_set_explicit_lock_duration(this); } void THD::get_definer(LEX_USER *definer) diff --git a/sql/sql_class.h b/sql/sql_class.h index a2c622b2326..8a427057d83 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -2795,7 +2795,19 @@ public: { DBUG_ASSERT(locked_tables_mode == LTM_NONE); - mdl_context.set_explicit_duration_for_all_locks(); + if (mode_arg == LTM_LOCK_TABLES) + { + /* + When entering LOCK TABLES mode we should set explicit duration + for all metadata locks acquired so far in order to avoid releasing + them till UNLOCK TABLES statement. + We don't do this when entering prelocked mode since sub-statements + don't release metadata locks and restoring status-quo after leaving + prelocking mode gets complicated. + */ + mdl_context.set_explicit_duration_for_all_locks(); + } + locked_tables_mode= mode_arg; } void leave_locked_tables_mode(); diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index 44a9243d8f5..36fdf376fa6 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -2027,6 +2027,11 @@ mysql_execute_command(THD *thd) */ if (stmt_causes_implicit_commit(thd, CF_IMPLICT_COMMIT_BEGIN)) { + /* + Note that this should never happen inside of stored functions + or triggers as all such statements prohibited there. + */ + DBUG_ASSERT(! thd->in_sub_stmt); /* Commit or rollback the statement transaction. */ thd->is_error() ? trans_rollback_stmt(thd) : trans_commit_stmt(thd); /* Commit the normal transaction if one is active. */ From 699f0c067f82c07d6b495b594f6777995ceb6a6c Mon Sep 17 00:00:00 2001 From: Matthias Leich Date: Thu, 16 Jun 2011 22:35:01 +0200 Subject: [PATCH 10/35] Fix for Bug 12430414 - THE TEST PERFSCHEMA.SELECTS.TEST CAN AFFECT SUCCEEDING TESTS Bug 12430599 - THE TEST PERFSCHEMA.ONE_THREAD_PER_CON. CAN AFFECT SUCCEEDING TESTS Bug 12431153 - THE TEST PERFSCHEMA.PFS_UPGRADE CAN AFFECT SUCCEEDING TEST --- .../perfschema/include/cleanup_helper.inc | 20 +++++++++++++------ .../perfschema/include/upgrade_check.inc | 2 ++ mysql-test/suite/perfschema/r/selects.result | 2 ++ mysql-test/suite/perfschema/t/selects.test | 6 ++++-- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/mysql-test/suite/perfschema/include/cleanup_helper.inc b/mysql-test/suite/perfschema/include/cleanup_helper.inc index b7e37849f78..2c2d73f5fca 100644 --- a/mysql-test/suite/perfschema/include/cleanup_helper.inc +++ b/mysql-test/suite/perfschema/include/cleanup_helper.inc @@ -1,25 +1,33 @@ -# Copyright (C) 2009 Sun Microsystems, Inc +# Copyright (c) 2009, 2011, Oracle and/or its affiliates. All rights reserved. # -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; version 2 of the License. +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; version 2 of +# the License. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA # Tests for PERFORMANCE_SCHEMA update performance_schema.setup_instruments set enabled='YES'; +connection con1; disconnect con1; +--source include/wait_until_disconnected.inc +connection con2; disconnect con2; +--source include/wait_until_disconnected.inc +connection con3; disconnect con3; +--source include/wait_until_disconnected.inc connection default; diff --git a/mysql-test/suite/perfschema/include/upgrade_check.inc b/mysql-test/suite/perfschema/include/upgrade_check.inc index 3620e80002f..1e3ea43e689 100644 --- a/mysql-test/suite/perfschema/include/upgrade_check.inc +++ b/mysql-test/suite/perfschema/include/upgrade_check.inc @@ -17,8 +17,10 @@ # $out_file and $err_file must be set within pfs_upgrade.test. # +--source include/count_sessions.inc --error 1 --exec $MYSQL_UPGRADE --skip-verbose --force > $out_file 2> $err_file +--source include/wait_until_count_sessions.inc # Verify that mysql_upgrade complained about the performance_schema --cat_file $err_file diff --git a/mysql-test/suite/perfschema/r/selects.result b/mysql-test/suite/perfschema/r/selects.result index a3d0931cf4c..4c59cc3338e 100644 --- a/mysql-test/suite/perfschema/r/selects.result +++ b/mysql-test/suite/perfschema/r/selects.result @@ -62,6 +62,7 @@ SELECT EVENT_ID FROM performance_schema.events_waits_current WHERE 1 = 2; CREATE EVENT t_ps_event ON SCHEDULE AT CURRENT_TIMESTAMP + INTERVAL 1 SECOND +ON COMPLETION PRESERVE DO INSERT INTO t_event SELECT DISTINCT EVENT_ID FROM performance_schema.events_waits_current @@ -106,5 +107,6 @@ EVENT_ID [EVENT_ID] DROP PROCEDURE t_ps_proc; DROP FUNCTION t_ps_func; +DROP EVENT t_ps_event; DROP TABLE t1; DROP TABLE t_event; diff --git a/mysql-test/suite/perfschema/t/selects.test b/mysql-test/suite/perfschema/t/selects.test index e8fd0827c53..e4541d6c6fc 100644 --- a/mysql-test/suite/perfschema/t/selects.test +++ b/mysql-test/suite/perfschema/t/selects.test @@ -1,4 +1,4 @@ -# Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -97,6 +97,7 @@ SELECT EVENT_ID FROM performance_schema.events_waits_current WHERE 1 = 2; CREATE EVENT t_ps_event ON SCHEDULE AT CURRENT_TIMESTAMP + INTERVAL 1 SECOND +ON COMPLETION PRESERVE DO INSERT INTO t_event SELECT DISTINCT EVENT_ID FROM performance_schema.events_waits_current @@ -168,7 +169,7 @@ delimiter ;| SELECT t_ps_func(connection_id()) = @p_id; # We might reach this point too early which means the event scheduler has not -# execute our "t_ps_event". Therefore we poll till the record was inserted +# executed our "t_ps_event". Therefore we poll till the record was inserted # and run our test statement afterwards. let $wait_timeout= 20; let $wait_condition= SELECT COUNT(*) = 1 FROM t_event; @@ -179,5 +180,6 @@ SELECT * FROM t_event; # Clean up DROP PROCEDURE t_ps_proc; DROP FUNCTION t_ps_func; +DROP EVENT t_ps_event; DROP TABLE t1; DROP TABLE t_event; From 9df4c14354e73455f0e0057f92b9870369f05d43 Mon Sep 17 00:00:00 2001 From: Inaam Rana Date: Fri, 17 Jun 2011 16:29:19 -0400 Subject: [PATCH 11/35] merge from mysql-5.1 Bug 12635227 - 61188: DROP TABLE EXTREMELY SLOW --- storage/innobase/btr/btr0cur.c | 2 +- storage/innobase/buf/buf0buddy.c | 428 ++++++++------------------ storage/innobase/buf/buf0buf.c | 27 +- storage/innobase/buf/buf0lru.c | 232 +++++--------- storage/innobase/include/buf0buddy.h | 31 +- storage/innobase/include/buf0buddy.ic | 37 ++- storage/innobase/include/buf0buf.h | 16 + storage/innobase/include/buf0buf.ic | 29 ++ storage/innobase/include/buf0lru.h | 19 +- storage/innobase/include/buf0types.h | 17 +- 10 files changed, 297 insertions(+), 541 deletions(-) diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c index c50b05cc1f5..e1060af525c 100644 --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -4155,7 +4155,7 @@ btr_blob_free( && buf_block_get_space(block) == space && buf_block_get_page_no(block) == page_no) { - if (buf_LRU_free_block(&block->page, all) != BUF_LRU_FREED + if (!buf_LRU_free_block(&block->page, all) && all && block->page.zip.data) { /* Attempt to deallocate the uncompressed page if the whole block cannot be deallocted. */ diff --git a/storage/innobase/buf/buf0buddy.c b/storage/innobase/buf/buf0buddy.c index 4d3c4d98510..b7404e46996 100644 --- a/storage/innobase/buf/buf0buddy.c +++ b/storage/innobase/buf/buf0buddy.c @@ -56,6 +56,14 @@ buf_buddy_get( } } +/** Validate a given zip_free list. */ +#define BUF_BUDDY_LIST_VALIDATE(b, i) \ + UT_LIST_VALIDATE(list, buf_page_t, \ + b->zip_free[i], \ + ut_ad(buf_page_get_state( \ + ut_list_node_313) \ + == BUF_BLOCK_ZIP_FREE)) + /**********************************************************************//** Add a block to the head of the appropriate buddy free list. */ UNIV_INLINE @@ -67,21 +75,11 @@ buf_buddy_add_to_free( ulint i) /*!< in: index of buf_pool->zip_free[] */ { -#ifdef UNIV_DEBUG_VALGRIND - buf_page_t* b = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); - - if (b) UNIV_MEM_VALID(b, BUF_BUDDY_LOW << i); -#endif /* UNIV_DEBUG_VALGRIND */ - ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); ut_ad(buf_pool->zip_free[i].start != bpage); UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], bpage); -#ifdef UNIV_DEBUG_VALGRIND - if (b) UNIV_MEM_FREE(b, BUF_BUDDY_LOW << i); - UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i); -#endif /* UNIV_DEBUG_VALGRIND */ } /**********************************************************************//** @@ -95,25 +93,18 @@ buf_buddy_remove_from_free( ulint i) /*!< in: index of buf_pool->zip_free[] */ { -#ifdef UNIV_DEBUG_VALGRIND +#ifdef UNIV_DEBUG buf_page_t* prev = UT_LIST_GET_PREV(list, bpage); buf_page_t* next = UT_LIST_GET_NEXT(list, bpage); - if (prev) UNIV_MEM_VALID(prev, BUF_BUDDY_LOW << i); - if (next) UNIV_MEM_VALID(next, BUF_BUDDY_LOW << i); - ut_ad(!prev || buf_page_get_state(prev) == BUF_BLOCK_ZIP_FREE); ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE); -#endif /* UNIV_DEBUG_VALGRIND */ +#endif /* UNIV_DEBUG */ ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); UT_LIST_REMOVE(list, buf_pool->zip_free[i], bpage); -#ifdef UNIV_DEBUG_VALGRIND - if (prev) UNIV_MEM_FREE(prev, BUF_BUDDY_LOW << i); - if (next) UNIV_MEM_FREE(next, BUF_BUDDY_LOW << i); -#endif /* UNIV_DEBUG_VALGRIND */ } /**********************************************************************//** @@ -130,17 +121,13 @@ buf_buddy_alloc_zip( ut_ad(buf_pool_mutex_own(buf_pool)); ut_a(i < BUF_BUDDY_SIZES); + ut_a(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); + + ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i)); -#ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing free memory. */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i], - ut_ad(buf_page_get_state(ut_list_node_313) - == BUF_BLOCK_ZIP_FREE))); -#endif /* !UNIV_DEBUG_VALGRIND */ bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); if (bpage) { - UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); ut_a(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); buf_buddy_remove_from_free(buf_pool, bpage, i); @@ -159,13 +146,10 @@ buf_buddy_alloc_zip( } } -#ifdef UNIV_DEBUG if (bpage) { - memset(bpage, ~i, BUF_BUDDY_LOW << i); + ut_d(memset(bpage, ~i, BUF_BUDDY_LOW << i)); + UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i); } -#endif /* UNIV_DEBUG */ - - UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i); return(bpage); } @@ -253,6 +237,7 @@ buf_buddy_alloc_from( { ulint offs = BUF_BUDDY_LOW << j; ut_ad(j <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); ut_ad(j >= i); ut_ad(!ut_align_offset(buf, offs)); @@ -266,13 +251,7 @@ buf_buddy_alloc_from( bpage = (buf_page_t*) ((byte*) buf + offs); ut_d(memset(bpage, j, BUF_BUDDY_LOW << j)); bpage->state = BUF_BLOCK_ZIP_FREE; -#ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing free memory. */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i], - ut_ad(buf_page_get_state( - ut_list_node_313) - == BUF_BLOCK_ZIP_FREE))); -#endif /* !UNIV_DEBUG_VALGRIND */ + ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i)); buf_buddy_add_to_free(buf_pool, bpage, j); } @@ -282,8 +261,8 @@ buf_buddy_alloc_from( /**********************************************************************//** Allocate a block. The thread calling this function must hold buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex. -The buf_pool->mutex may only be released and reacquired if lru != NULL. -@return allocated block, possibly NULL if lru==NULL */ +The buf_pool_mutex may be released and reacquired. +@return allocated block, never NULL */ UNIV_INTERN void* buf_buddy_alloc_low( @@ -295,13 +274,14 @@ buf_buddy_alloc_low( will be assigned TRUE if storage was allocated from the LRU list and buf_pool->mutex was temporarily - released, or NULL if the LRU list - should not be used */ + released, */ { buf_block_t* block; + ut_ad(lru); ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); + ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); if (i < BUF_BUDDY_SIZES) { /* Try to allocate from the buddy system. */ @@ -320,11 +300,6 @@ buf_buddy_alloc_low( goto alloc_big; } - if (!lru) { - - return(NULL); - } - /* Try replacing an uncompressed page in the buffer pool. */ buf_pool_mutex_exit(buf_pool); block = buf_LRU_get_free_block(buf_pool); @@ -342,66 +317,6 @@ func_exit: return(block); } -/**********************************************************************//** -Try to relocate the control block of a compressed page. -@return TRUE if relocated */ -static -ibool -buf_buddy_relocate_block( -/*=====================*/ - buf_page_t* bpage, /*!< in: block to relocate */ - buf_page_t* dpage) /*!< in: free block to relocate to */ -{ -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - buf_page_t* b; -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - - ut_ad(buf_pool_mutex_own(buf_pool)); - - switch (buf_page_get_state(bpage)) { - case BUF_BLOCK_ZIP_FREE: - case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: - case BUF_BLOCK_FILE_PAGE: - case BUF_BLOCK_MEMORY: - case BUF_BLOCK_REMOVE_HASH: - ut_error; - case BUF_BLOCK_ZIP_DIRTY: - /* Cannot relocate dirty pages. */ - return(FALSE); - - case BUF_BLOCK_ZIP_PAGE: - break; - } - - mutex_enter(&buf_pool->zip_mutex); - - if (!buf_page_can_relocate(bpage)) { - mutex_exit(&buf_pool->zip_mutex); - return(FALSE); - } - - buf_relocate(bpage, dpage); - ut_d(bpage->state = BUF_BLOCK_ZIP_FREE); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - /* relocate buf_pool->zip_clean */ - b = UT_LIST_GET_PREV(list, dpage); - UT_LIST_REMOVE(list, buf_pool->zip_clean, dpage); - - if (b) { - UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, dpage); - } else { - UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, dpage); - } -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - - UNIV_MEM_INVALID(bpage, sizeof *bpage); - - mutex_exit(&buf_pool->zip_mutex); - return(TRUE); -} - /**********************************************************************//** Try to relocate a block. @return TRUE if relocated */ @@ -418,108 +333,91 @@ buf_buddy_relocate( buf_page_t* bpage; const ulint size = BUF_BUDDY_LOW << i; ullint usec = ut_time_us(NULL); + mutex_t* mutex; + ulint space; + ulint page_no; ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(!ut_align_offset(src, size)); ut_ad(!ut_align_offset(dst, size)); + ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); UNIV_MEM_ASSERT_W(dst, size); /* We assume that all memory from buf_buddy_alloc() - is used for either compressed pages or buf_page_t - objects covering compressed pages. */ + is used for compressed page frames. */ /* We look inside the allocated objects returned by - buf_buddy_alloc() and assume that anything of - PAGE_ZIP_MIN_SIZE or larger is a compressed page that contains - a valid space_id and page_no in the page header. Should the - fields be invalid, we will be unable to relocate the block. - We also assume that anything that fits sizeof(buf_page_t) - actually is a properly initialized buf_page_t object. */ + buf_buddy_alloc() and assume that each block is a compressed + page that contains a valid space_id and page_no in the page + header. Should the fields be invalid, we will be unable to + relocate the block. */ - if (size >= PAGE_ZIP_MIN_SIZE) { - /* This is a compressed page. */ - mutex_t* mutex; - /* The src block may be split into smaller blocks, - some of which may be free. Thus, the - mach_read_from_4() calls below may attempt to read - from free memory. The memory is "owned" by the buddy - allocator (and it has been allocated from the buffer - pool), so there is nothing wrong about this. The - mach_read_from_4() calls here will only trigger bogus - Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */ - ulint space = mach_read_from_4( - (const byte*) src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); - ulint page_no = mach_read_from_4( - (const byte*) src + FIL_PAGE_OFFSET); - /* Suppress Valgrind warnings about conditional jump - on uninitialized value. */ - UNIV_MEM_VALID(&space, sizeof space); - UNIV_MEM_VALID(&page_no, sizeof page_no); - bpage = buf_page_hash_get(buf_pool, space, page_no); + /* The src block may be split into smaller blocks, + some of which may be free. Thus, the + mach_read_from_4() calls below may attempt to read + from free memory. The memory is "owned" by the buddy + allocator (and it has been allocated from the buffer + pool), so there is nothing wrong about this. The + mach_read_from_4() calls here will only trigger bogus + Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */ + space = mach_read_from_4((const byte*) src + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + page_no = mach_read_from_4((const byte*) src + + FIL_PAGE_OFFSET); + /* Suppress Valgrind warnings about conditional jump + on uninitialized value. */ + UNIV_MEM_VALID(&space, sizeof space); + UNIV_MEM_VALID(&page_no, sizeof page_no); + bpage = buf_page_hash_get(buf_pool, space, page_no); - if (!bpage || bpage->zip.data != src) { - /* The block has probably been freshly - allocated by buf_LRU_get_free_block() but not - added to buf_pool->page_hash yet. Obviously, - it cannot be relocated. */ + if (!bpage || bpage->zip.data != src) { + /* The block has probably been freshly + allocated by buf_LRU_get_free_block() but not + added to buf_pool->page_hash yet. Obviously, + it cannot be relocated. */ - return(FALSE); - } - - ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage)); - - if (page_zip_get_size(&bpage->zip) != size) { - /* The block is of different size. We would - have to relocate all blocks covered by src. - For the sake of simplicity, give up. */ - ut_ad(page_zip_get_size(&bpage->zip) < size); - - return(FALSE); - } - - /* The block must have been allocated, but it may - contain uninitialized data. */ - UNIV_MEM_ASSERT_W(src, size); - - mutex = buf_page_get_mutex(bpage); - - mutex_enter(mutex); - - if (buf_page_can_relocate(bpage)) { - /* Relocate the compressed page. */ - ut_a(bpage->zip.data == src); - memcpy(dst, src, size); - bpage->zip.data = dst; - mutex_exit(mutex); -success: - UNIV_MEM_INVALID(src, size); - { - buf_buddy_stat_t* buddy_stat - = &buf_pool->buddy_stat[i]; - buddy_stat->relocated++; - buddy_stat->relocated_usec - += ut_time_us(NULL) - usec; - } - return(TRUE); - } - - mutex_exit(mutex); - } else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) { - /* This must be a buf_page_t object. */ -#if UNIV_WORD_SIZE == 4 - /* On 32-bit systems, there is no padding in - buf_page_t. On other systems, Valgrind could complain - about uninitialized pad bytes. */ - UNIV_MEM_ASSERT_RW(src, size); -#endif - if (buf_buddy_relocate_block(src, dst)) { - - goto success; - } + return(FALSE); } + ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage)); + + if (page_zip_get_size(&bpage->zip) != size) { + /* The block is of different size. We would + have to relocate all blocks covered by src. + For the sake of simplicity, give up. */ + ut_ad(page_zip_get_size(&bpage->zip) < size); + + return(FALSE); + } + + /* The block must have been allocated, but it may + contain uninitialized data. */ + UNIV_MEM_ASSERT_W(src, size); + + mutex = buf_page_get_mutex(bpage); + + mutex_enter(mutex); + + if (buf_page_can_relocate(bpage)) { + /* Relocate the compressed page. */ + ut_a(bpage->zip.data == src); + memcpy(dst, src, size); + bpage->zip.data = dst; + mutex_exit(mutex); + UNIV_MEM_INVALID(src, size); + { + buf_buddy_stat_t* buddy_stat + = &buf_pool->buddy_stat[i]; + buddy_stat->relocated++; + buddy_stat->relocated_usec + += ut_time_us(NULL) - usec; + } + return(TRUE); + } + + mutex_exit(mutex); return(FALSE); } @@ -541,12 +439,13 @@ buf_buddy_free_low( ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); ut_ad(buf_pool->buddy_stat[i].used > 0); buf_pool->buddy_stat[i].used--; recombine: UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i); - ut_d(((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE); + ((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE; if (i == BUF_BUDDY_SIZES) { buf_buddy_block_free(buf_pool, buf); @@ -557,32 +456,36 @@ recombine: ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i)); ut_ad(!buf_pool_contains_zip(buf_pool, buf)); - /* Try to combine adjacent blocks. */ - - buddy = (buf_page_t*) buf_buddy_get(((byte*) buf), BUF_BUDDY_LOW << i); - -#ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing free memory. */ - - if (buddy->state != BUF_BLOCK_ZIP_FREE) { - - goto buddy_nonfree; + /* Do not recombine blocks if there are few free blocks. + We may waste up to 15360*max_len bytes to free blocks + (1024 + 2048 + 4096 + 8192 = 15360) */ + if (UT_LIST_GET_LEN(buf_pool->zip_free[i]) < 16) { + goto func_exit; } - - /* The field buddy->state can only be trusted for free blocks. - If buddy->state == BUF_BLOCK_ZIP_FREE, the block is free if - it is in the free list. */ + + /* Try to combine adjacent blocks. */ + buddy = (buf_page_t*) buf_buddy_get(((byte*) buf), BUF_BUDDY_LOW << i); + + #ifndef UNIV_DEBUG_VALGRIND + /* When Valgrind instrumentation is not enabled, we can read + buddy->state to quickly determine that a block is not free. + When the block is not free, buddy->state belongs to a compressed + page frame that may be flagged uninitialized in our Valgrind + instrumentation. */ + + if (buddy->state != BUF_BLOCK_ZIP_FREE) { + + goto buddy_nonfree; + } #endif /* !UNIV_DEBUG_VALGRIND */ for (bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); bpage; ) { - UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); if (bpage == buddy) { -buddy_free: /* The buddy is free: recombine */ buf_buddy_remove_from_free(buf_pool, bpage, i); -buddy_free2: +buddy_is_free: ut_ad(buf_page_get_state(buddy) == BUF_BLOCK_ZIP_FREE); ut_ad(!buf_pool_contains_zip(buf_pool, buddy)); i++; @@ -592,21 +495,15 @@ buddy_free2: } ut_a(bpage != buf); - - { - buf_page_t* next = UT_LIST_GET_NEXT(list, bpage); - UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i); - bpage = next; - } + UNIV_MEM_ASSERT_W(bpage, BUF_BUDDY_LOW << i); + bpage = UT_LIST_GET_NEXT(list, bpage); } #ifndef UNIV_DEBUG_VALGRIND buddy_nonfree: - /* Valgrind would complain about accessing free memory. */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i], - ut_ad(buf_page_get_state(ut_list_node_313) - == BUF_BLOCK_ZIP_FREE))); -#endif /* UNIV_DEBUG_VALGRIND */ +#endif /* !UNIV_DEBUG_VALGRIND */ + + ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i)); /* The buddy is not free. Is there a free block of this size? */ bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); @@ -614,100 +511,25 @@ buddy_nonfree: if (bpage) { /* Remove the block from the free list, because a successful buf_buddy_relocate() will overwrite bpage->list. */ - - UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); buf_buddy_remove_from_free(buf_pool, bpage, i); /* Try to relocate the buddy of buf to the free block. */ if (buf_buddy_relocate(buf_pool, buddy, bpage, i)) { - ut_d(buddy->state = BUF_BLOCK_ZIP_FREE); - goto buddy_free2; + buddy->state = BUF_BLOCK_ZIP_FREE; + goto buddy_is_free; } buf_buddy_add_to_free(buf_pool, bpage, i); - - /* Try to relocate the buddy of the free block to buf. */ - buddy = (buf_page_t*) buf_buddy_get(((byte*) bpage), - BUF_BUDDY_LOW << i); - -#ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing free memory. */ - - /* The buddy must not be (completely) free, because we - always recombine adjacent free blocks. - - (Parts of the buddy can be free in - buf_pool->zip_free[j] with j < i.) */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i], - ut_ad(buf_page_get_state( - ut_list_node_313) - == BUF_BLOCK_ZIP_FREE - && ut_list_node_313 != buddy))); -#endif /* !UNIV_DEBUG_VALGRIND */ - - if (buf_buddy_relocate(buf_pool, buddy, buf, i)) { - - buf = bpage; - UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); - ut_d(buddy->state = BUF_BLOCK_ZIP_FREE); - goto buddy_free; - } } +func_exit: /* Free the block to the buddy list. */ bpage = buf; -#ifdef UNIV_DEBUG - if (i < buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)) { - /* This area has most likely been allocated for at - least one compressed-only block descriptor. Check - that there are no live objects in the area. This is - not a complete check: it may yield false positives as - well as false negatives. Also, due to buddy blocks - being recombined, it is possible (although unlikely) - that this branch is never reached. */ - char* c; - -# ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing - uninitialized memory. Besides, Valgrind performs a - more exhaustive check, at every memory access. */ - const buf_page_t* b = buf; - const buf_page_t* const b_end = (buf_page_t*) - ((char*) b + (BUF_BUDDY_LOW << i)); - - for (; b < b_end; b++) { - /* Avoid false positives (and cause false - negatives) by checking for b->space < 1000. */ - - if ((b->state == BUF_BLOCK_ZIP_PAGE - || b->state == BUF_BLOCK_ZIP_DIRTY) - && b->space > 0 && b->space < 1000) { - fprintf(stderr, - "buddy dirty %p %u (%u,%u) %p,%lu\n", - (void*) b, - b->state, b->space, b->offset, - buf, i); - } - } -# endif /* !UNIV_DEBUG_VALGRIND */ - - /* Scramble the block. This should make any pointers - invalid and trigger a segmentation violation. Because - the scrambling can be reversed, it may be possible to - track down the object pointing to the freed data by - dereferencing the unscrambled bpage->LRU or - bpage->list pointers. */ - for (c = (char*) buf + (BUF_BUDDY_LOW << i); - c-- > (char*) buf; ) { - *c = ~*c ^ i; - } - } else { - /* Fill large blocks with a constant pattern. */ - memset(bpage, i, BUF_BUDDY_LOW << i); - } -#endif /* UNIV_DEBUG */ + /* Fill large blocks with a constant pattern. */ + ut_d(memset(bpage, i, BUF_BUDDY_LOW << i)); + UNIV_MEM_INVALID(bpage, BUF_BUDDY_LOW << i); bpage->state = BUF_BLOCK_ZIP_FREE; buf_buddy_add_to_free(buf_pool, bpage, i); } diff --git a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c index 821bfb7386e..b493529bb7f 100644 --- a/storage/innobase/buf/buf0buf.c +++ b/storage/innobase/buf/buf0buf.c @@ -1907,7 +1907,7 @@ err_exit: mutex_enter(block_mutex); /* Discard the uncompressed page frame if possible. */ - if (buf_LRU_free_block(bpage, FALSE) == BUF_LRU_FREED) { + if (buf_LRU_free_block(bpage, FALSE)) { mutex_exit(block_mutex); goto lookup; @@ -2310,12 +2310,8 @@ loop: if (block) { /* If the guess is a compressed page descriptor that - has been allocated by buf_buddy_alloc(), it may have - been invalidated by buf_buddy_relocate(). In that - case, block could point to something that happens to - contain the expected bits in block->page. Similarly, - the guess may be pointing to a buffer pool chunk that - has been released when resizing the buffer pool. */ + has been allocated by buf_page_alloc_descriptor(), + it may have been freed by buf_relocate(). */ if (!buf_block_is_uncompressed(buf_pool, block) || offset != block->page.offset @@ -2522,11 +2518,10 @@ wait_until_unfixed: mutex_exit(&buf_pool->zip_mutex); buf_pool->n_pend_unzip++; - bpage->state = BUF_BLOCK_ZIP_FREE; - buf_buddy_free(buf_pool, bpage, sizeof *bpage); - buf_pool_mutex_exit(buf_pool); + buf_page_free_descriptor(bpage); + /* Decompress the page and apply buffered operations while not holding buf_pool->mutex or block->mutex. */ success = buf_zip_decompress(block, srv_use_checksums); @@ -2572,7 +2567,7 @@ wait_until_unfixed: /* Try to evict the block from the buffer pool, to use the insert buffer (change buffer) as much as possible. */ - if (buf_LRU_free_block(&block->page, TRUE) == BUF_LRU_FREED) { + if (buf_LRU_free_block(&block->page, TRUE)) { mutex_exit(&block->mutex); if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { /* Set the watch, as it would have @@ -3231,17 +3226,11 @@ err_exit: mutex_exit(&block->mutex); } else { - /* Defer buf_buddy_alloc() until after the block has - been found not to exist. The buf_buddy_alloc() and - buf_buddy_free() calls may be expensive because of - buf_buddy_relocate(). */ - /* The compressed page must be allocated before the control block (bpage), in order to avoid the invocation of buf_buddy_relocate_block() on uninitialized data. */ data = buf_buddy_alloc(buf_pool, zip_size, &lru); - bpage = buf_buddy_alloc(buf_pool, sizeof *bpage, &lru); /* Initialize the buf_pool pointer. */ bpage->buf_pool_index = buf_pool_index(buf_pool); @@ -3260,8 +3249,6 @@ err_exit: /* The block was added by some other thread. */ watch_page = NULL; - bpage->state = BUF_BLOCK_ZIP_FREE; - buf_buddy_free(buf_pool, bpage, sizeof *bpage); buf_buddy_free(buf_pool, data, zip_size); bpage = NULL; @@ -3269,6 +3256,8 @@ err_exit: } } + bpage = buf_page_alloc_descriptor(); + page_zip_des_init(&bpage->zip); page_zip_set_size(&bpage->zip, zip_size); bpage->zip.data = data; diff --git a/storage/innobase/buf/buf0lru.c b/storage/innobase/buf/buf0lru.c index a7768374f86..de3a55737dc 100644 --- a/storage/innobase/buf/buf0lru.c +++ b/storage/innobase/buf/buf0lru.c @@ -353,31 +353,34 @@ scan_again: while (bpage != NULL) { buf_page_t* prev_bpage; - ibool prev_bpage_buf_fix = FALSE; + mutex_t* block_mutex = NULL; ut_a(buf_page_in_file(bpage)); prev_bpage = UT_LIST_GET_PREV(LRU, bpage); /* bpage->space and bpage->io_fix are protected by - buf_pool->mutex and block_mutex. It is safe to check - them while holding buf_pool->mutex only. */ + buf_pool_mutex and block_mutex. It is safe to check + them while holding buf_pool_mutex only. */ if (buf_page_get_space(bpage) != id) { /* Skip this block, as it does not belong to the space that is being invalidated. */ + goto next_page; } else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) { /* We cannot remove this page during this scan yet; maybe the system is currently reading it in, or flushing the modifications to the file */ all_freed = FALSE; + goto next_page; } else { - mutex_t* block_mutex = buf_page_get_mutex(bpage); + block_mutex = buf_page_get_mutex(bpage); mutex_enter(block_mutex); if (bpage->buf_fix_count > 0) { + mutex_exit(block_mutex); /* We cannot remove this page during this scan yet; maybe the system is currently reading it in, or flushing @@ -387,106 +390,59 @@ scan_again: goto next_page; } - -#ifdef UNIV_DEBUG - if (buf_debug_prints) { - fprintf(stderr, - "Dropping space %lu page %lu\n", - (ulong) buf_page_get_space(bpage), - (ulong) buf_page_get_page_no(bpage)); - } -#endif - if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { - /* This is a compressed-only block - descriptor. Ensure that prev_bpage - cannot be relocated when bpage is freed. */ - if (UNIV_LIKELY(prev_bpage != NULL)) { - switch (buf_page_get_state( - prev_bpage)) { - case BUF_BLOCK_FILE_PAGE: - /* Descriptors of uncompressed - blocks will not be relocated, - because we are holding the - buf_pool->mutex. */ - break; - case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: - /* Descriptors of compressed- - only blocks can be relocated, - unless they are buffer-fixed. - Because both bpage and - prev_bpage are protected by - buf_pool_zip_mutex, it is - not necessary to acquire - further mutexes. */ - ut_ad(&buf_pool->zip_mutex - == block_mutex); - ut_ad(mutex_own(block_mutex)); - prev_bpage_buf_fix = TRUE; - prev_bpage->buf_fix_count++; - break; - default: - ut_error; - } - } - } else if (((buf_block_t*) bpage)->is_hashed) { - ulint page_no; - ulint zip_size; - - buf_pool_mutex_exit(buf_pool); - - zip_size = buf_page_get_zip_size(bpage); - page_no = buf_page_get_page_no(bpage); - - mutex_exit(block_mutex); - - /* Note that the following call will acquire - an S-latch on the page */ - - btr_search_drop_page_hash_when_freed( - id, zip_size, page_no); - goto scan_again; - } - - if (bpage->oldest_modification != 0) { - - buf_flush_remove(bpage); - } - - /* Remove from the LRU list. */ - - if (buf_LRU_block_remove_hashed_page(bpage, TRUE) - != BUF_BLOCK_ZIP_FREE) { - buf_LRU_block_free_hashed_page((buf_block_t*) - bpage); - } else { - /* The block_mutex should have been - released by buf_LRU_block_remove_hashed_page() - when it returns BUF_BLOCK_ZIP_FREE. */ - ut_ad(block_mutex == &buf_pool->zip_mutex); - ut_ad(!mutex_own(block_mutex)); - - if (prev_bpage_buf_fix) { - /* We temporarily buffer-fixed - prev_bpage, so that - buf_buddy_free() could not - relocate it, in case it was a - compressed-only block - descriptor. */ - - mutex_enter(block_mutex); - ut_ad(prev_bpage->buf_fix_count > 0); - prev_bpage->buf_fix_count--; - mutex_exit(block_mutex); - } - - goto next_page_no_mutex; - } -next_page: - mutex_exit(block_mutex); } -next_page_no_mutex: + ut_ad(mutex_own(block_mutex)); + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, + "Dropping space %lu page %lu\n", + (ulong) buf_page_get_space(bpage), + (ulong) buf_page_get_page_no(bpage)); + } +#endif + if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { + /* This is a compressed-only block + descriptor. Do nothing. */ + } else if (((buf_block_t*) bpage)->is_hashed) { + ulint page_no; + ulint zip_size; + + buf_pool_mutex_exit(buf_pool); + + zip_size = buf_page_get_zip_size(bpage); + page_no = buf_page_get_page_no(bpage); + + mutex_exit(block_mutex); + + /* Note that the following call will acquire + an S-latch on the page */ + + btr_search_drop_page_hash_when_freed( + id, zip_size, page_no); + goto scan_again; + } + + if (bpage->oldest_modification != 0) { + + buf_flush_remove(bpage); + } + + /* Remove from the LRU list. */ + + if (buf_LRU_block_remove_hashed_page(bpage, TRUE) + != BUF_BLOCK_ZIP_FREE) { + buf_LRU_block_free_hashed_page((buf_block_t*) bpage); + mutex_exit(block_mutex); + } else { + /* The block_mutex should have been released + by buf_LRU_block_remove_hashed_page() when it + returns BUF_BLOCK_ZIP_FREE. */ + ut_ad(block_mutex == &buf_pool->zip_mutex); + ut_ad(!mutex_own(block_mutex)); + } +next_page: bpage = prev_bpage; } @@ -602,7 +558,7 @@ buf_LRU_free_from_unzip_LRU_list( UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0); block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) { - enum buf_lru_free_block_status freed; + ibool freed; ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->in_unzip_LRU_list); @@ -612,24 +568,9 @@ buf_LRU_free_from_unzip_LRU_list( freed = buf_LRU_free_block(&block->page, FALSE); mutex_exit(&block->mutex); - switch (freed) { - case BUF_LRU_FREED: + if (freed) { return(TRUE); - - case BUF_LRU_CANNOT_RELOCATE: - /* If we failed to relocate, try - regular LRU eviction. */ - return(FALSE); - - case BUF_LRU_NOT_FREED: - /* The block was buffer-fixed or I/O-fixed. - Keep looking. */ - continue; } - - /* inappropriate return value from - buf_LRU_free_block() */ - ut_error; } return(FALSE); @@ -662,10 +603,9 @@ buf_LRU_free_from_common_LRU_list( UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0); bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) { - enum buf_lru_free_block_status freed; - unsigned accessed; - mutex_t* block_mutex - = buf_page_get_mutex(bpage); + ibool freed; + unsigned accessed; + mutex_t* block_mutex = buf_page_get_mutex(bpage); ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->in_LRU_list); @@ -675,8 +615,7 @@ buf_LRU_free_from_common_LRU_list( freed = buf_LRU_free_block(bpage, TRUE); mutex_exit(block_mutex); - switch (freed) { - case BUF_LRU_FREED: + if (freed) { /* Keep track of pages that are evicted without ever being accessed. This gives us a measure of the effectiveness of readahead */ @@ -684,21 +623,7 @@ buf_LRU_free_from_common_LRU_list( ++buf_pool->stat.n_ra_pages_evicted; } return(TRUE); - - case BUF_LRU_NOT_FREED: - /* The block was dirty, buffer-fixed, or I/O-fixed. - Keep looking. */ - continue; - - case BUF_LRU_CANNOT_RELOCATE: - /* This should never occur, because we - want to discard the compressed page too. */ - break; } - - /* inappropriate return value from - buf_LRU_free_block() */ - ut_error; } return(FALSE); @@ -1424,17 +1349,16 @@ buf_LRU_make_block_old( Try to free a block. If bpage is a descriptor of a compressed-only page, the descriptor object will be freed as well. -NOTE: If this function returns BUF_LRU_FREED, it will temporarily +NOTE: If this function returns TRUE, it will temporarily release buf_pool->mutex. Furthermore, the page frame will no longer be accessible via bpage. The caller must hold buf_pool->mutex and buf_page_get_mutex(bpage) and release these two mutexes after the call. No other buf_page_get_mutex() may be held when calling this function. -@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or -BUF_LRU_NOT_FREED otherwise. */ +@return TRUE if freed, FALSE otherwise. */ UNIV_INTERN -enum buf_lru_free_block_status +ibool buf_LRU_free_block( /*===============*/ buf_page_t* bpage, /*!< in: block to be freed */ @@ -1460,7 +1384,7 @@ buf_LRU_free_block( if (!buf_page_can_relocate(bpage)) { /* Do not free buffer-fixed or I/O-fixed blocks. */ - return(BUF_LRU_NOT_FREED); + return(FALSE); } #ifdef UNIV_IBUF_COUNT_DEBUG @@ -1472,7 +1396,7 @@ buf_LRU_free_block( /* Do not completely free dirty blocks. */ if (bpage->oldest_modification) { - return(BUF_LRU_NOT_FREED); + return(FALSE); } } else if (bpage->oldest_modification) { /* Do not completely free dirty blocks. */ @@ -1480,7 +1404,7 @@ buf_LRU_free_block( if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY); - return(BUF_LRU_NOT_FREED); + return(FALSE); } goto alloc; @@ -1489,14 +1413,8 @@ buf_LRU_free_block( If it cannot be allocated (without freeing a block from the LRU list), refuse to free bpage. */ alloc: - buf_pool_mutex_exit_forbid(buf_pool); - b = buf_buddy_alloc(buf_pool, sizeof *b, NULL); - buf_pool_mutex_exit_allow(buf_pool); - - if (UNIV_UNLIKELY(!b)) { - return(BUF_LRU_CANNOT_RELOCATE); - } - + b = buf_page_alloc_descriptor(); + ut_a(b); memcpy(b, bpage, sizeof *b); } @@ -1669,7 +1587,7 @@ alloc: mutex_enter(block_mutex); } - return(BUF_LRU_FREED); + return(TRUE); } /******************************************************************//** @@ -1899,11 +1817,9 @@ buf_LRU_block_remove_hashed_page( buf_pool, bpage->zip.data, page_zip_get_size(&bpage->zip)); - bpage->state = BUF_BLOCK_ZIP_FREE; - buf_buddy_free(buf_pool, bpage, sizeof(*bpage)); buf_pool_mutex_exit_allow(buf_pool); + buf_page_free_descriptor(bpage); - UNIV_MEM_UNDESC(bpage); return(BUF_BLOCK_ZIP_FREE); case BUF_BLOCK_FILE_PAGE: diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h index b255d8c9351..c626ea52af9 100644 --- a/storage/innobase/include/buf0buddy.h +++ b/storage/innobase/include/buf0buddy.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -37,25 +37,24 @@ Created December 2006 by Marko Makela /**********************************************************************//** Allocate a block. The thread calling this function must hold buf_pool->mutex and must not hold buf_pool->zip_mutex or any -block->mutex. The buf_pool->mutex may only be released and reacquired -if lru != NULL. This function should only be used for allocating -compressed page frames or control blocks (buf_page_t). Allocated -control blocks must be properly initialized immediately after -buf_buddy_alloc() has returned the memory, before releasing -buf_pool->mutex. -@return allocated block, possibly NULL if lru == NULL */ +block->mutex. The buf_pool_mutex may be released and reacquired. +This function should only be used for allocating compressed page frames. +@return allocated block, never NULL */ UNIV_INLINE void* buf_buddy_alloc( /*============*/ - buf_pool_t* buf_pool, - /*!< buffer pool in which the block resides */ - ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ - ibool* lru) /*!< in: pointer to a variable that will be assigned - TRUE if storage was allocated from the LRU list - and buf_pool->mutex was temporarily released, - or NULL if the LRU list should not be used */ - __attribute__((malloc)); + buf_pool_t* buf_pool, /*!< in: buffer pool in which + the page resides */ + ulint size, /*!< in: compressed page size + (between PAGE_ZIP_MIN_SIZE and + UNIV_PAGE_SIZE) */ + ibool* lru) /*!< in: pointer to a variable + that will be assigned TRUE if + storage was allocated from the + LRU list and buf_pool->mutex was + temporarily released */ + __attribute__((malloc, nonnull)); /**********************************************************************//** Release a block. */ diff --git a/storage/innobase/include/buf0buddy.ic b/storage/innobase/include/buf0buddy.ic index e50c33ea15a..e65b7ae2936 100644 --- a/storage/innobase/include/buf0buddy.ic +++ b/storage/innobase/include/buf0buddy.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -36,8 +36,8 @@ Created December 2006 by Marko Makela /**********************************************************************//** Allocate a block. The thread calling this function must hold buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex. -The buf_pool->mutex may only be released and reacquired if lru != NULL. -@return allocated block, possibly NULL if lru==NULL */ +The buf_pool_mutex may be released and reacquired. +@return allocated block, never NULL */ UNIV_INTERN void* buf_buddy_alloc_low( @@ -48,9 +48,8 @@ buf_buddy_alloc_low( or BUF_BUDDY_SIZES */ ibool* lru) /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list - and buf_pool->mutex was temporarily released, - or NULL if the LRU list should not be used */ - __attribute__((malloc)); + and buf_pool_mutex was temporarily released */ + __attribute__((malloc, nonnull)); /**********************************************************************//** Deallocate a block. */ @@ -77,6 +76,8 @@ buf_buddy_get_slot( ulint i; ulint s; + ut_ad(size >= PAGE_ZIP_MIN_SIZE); + for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) { } @@ -87,29 +88,28 @@ buf_buddy_get_slot( /**********************************************************************//** Allocate a block. The thread calling this function must hold buf_pool->mutex and must not hold buf_pool->zip_mutex or any -block->mutex. The buf_pool->mutex may only be released and reacquired -if lru != NULL. This function should only be used for allocating -compressed page frames or control blocks (buf_page_t). Allocated -control blocks must be properly initialized immediately after -buf_buddy_alloc() has returned the memory, before releasing -buf_pool->mutex. -@return allocated block, possibly NULL if lru == NULL */ +block->mutex. The buf_pool_mutex may be released and reacquired. +This function should only be used for allocating compressed page frames. +@return allocated block, never NULL */ UNIV_INLINE void* buf_buddy_alloc( /*============*/ buf_pool_t* buf_pool, /*!< in: buffer pool in which the page resides */ - ulint size, /*!< in: block size, up to - UNIV_PAGE_SIZE */ + ulint size, /*!< in: compressed page size + (between PAGE_ZIP_MIN_SIZE and + UNIV_PAGE_SIZE) */ ibool* lru) /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool->mutex was - temporarily released, or NULL if - the LRU list should not be used */ + temporarily released */ { ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(ut_is_2pow(size)); + ut_ad(size >= PAGE_ZIP_MIN_SIZE); + ut_ad(size <= UNIV_PAGE_SIZE); return(buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size), lru)); } @@ -127,6 +127,9 @@ buf_buddy_free( UNIV_PAGE_SIZE */ { ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(ut_is_2pow(size)); + ut_ad(size >= PAGE_ZIP_MIN_SIZE); + ut_ad(size <= UNIV_PAGE_SIZE); buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size)); } diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 4b986736104..1826d78269f 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -269,6 +269,22 @@ ib_uint64_t buf_pool_get_oldest_modification(void); /*==================================*/ /********************************************************************//** +Allocates a buf_page_t descriptor. This function must succeed. In case +of failure we assert in this function. */ +UNIV_INLINE +buf_page_t* +buf_page_alloc_descriptor(void) +/*===========================*/ + __attribute__((malloc)); +/********************************************************************//** +Free a buf_page_t descriptor. */ +UNIV_INLINE +void +buf_page_free_descriptor( +/*=====================*/ + buf_page_t* bpage) /*!< in: bpage descriptor to free. */ + __attribute__((nonnull)); +/********************************************************************//** Allocates a buffer block. @return own: the allocated block, in state BUF_BLOCK_MEMORY */ UNIV_INTERN diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic index 52772d351a1..4fdaf6ff43e 100644 --- a/storage/innobase/include/buf0buf.ic +++ b/storage/innobase/include/buf0buf.ic @@ -753,6 +753,35 @@ buf_block_get_lock_hash_val( return(block->lock_hash_val); } +/********************************************************************//** +Allocates a buf_page_t descriptor. This function must succeed. In case +of failure we assert in this function. +@return: the allocated descriptor. */ +UNIV_INLINE +buf_page_t* +buf_page_alloc_descriptor(void) +/*===========================*/ +{ + buf_page_t* bpage; + + bpage = (buf_page_t*) ut_malloc(sizeof *bpage); + ut_d(memset(bpage, 0, sizeof *bpage)); + UNIV_MEM_ALLOC(bpage, sizeof *bpage); + + return(bpage); +} + +/********************************************************************//** +Free a buf_page_t descriptor. */ +UNIV_INLINE +void +buf_page_free_descriptor( +/*=====================*/ + buf_page_t* bpage) /*!< in: bpage descriptor to free. */ +{ + ut_free(bpage); +} + /********************************************************************//** Frees a buffer block which does not contain a file page. */ UNIV_INLINE diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index 2f03d1219e3..f42894a138c 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -30,18 +30,6 @@ Created 11/5/1995 Heikki Tuuri #include "ut0byte.h" #include "buf0types.h" -/** The return type of buf_LRU_free_block() */ -enum buf_lru_free_block_status { - /** freed */ - BUF_LRU_FREED = 0, - /** not freed because the caller asked to remove the - uncompressed frame but the control block cannot be - relocated */ - BUF_LRU_CANNOT_RELOCATE, - /** not freed because of some other reason */ - BUF_LRU_NOT_FREED -}; - /******************************************************************//** Tries to remove LRU flushed blocks from the end of the LRU list and put them to the free list. This is beneficial for the efficiency of the insert buffer @@ -99,17 +87,16 @@ buf_LRU_insert_zip_clean( Try to free a block. If bpage is a descriptor of a compressed-only page, the descriptor object will be freed as well. -NOTE: If this function returns BUF_LRU_FREED, it will temporarily +NOTE: If this function returns TRUE, it will temporarily release buf_pool->mutex. Furthermore, the page frame will no longer be accessible via bpage. The caller must hold buf_pool->mutex and buf_page_get_mutex(bpage) and release these two mutexes after the call. No other buf_page_get_mutex() may be held when calling this function. -@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or -BUF_LRU_NOT_FREED otherwise. */ +@return TRUE if freed, FALSE otherwise. */ UNIV_INTERN -enum buf_lru_free_block_status +ibool buf_LRU_free_block( /*===============*/ buf_page_t* bpage, /*!< in: block to be freed */ diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h index a2175098704..0cc2defb3ff 100644 --- a/storage/innobase/include/buf0types.h +++ b/storage/innobase/include/buf0types.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,6 +26,8 @@ Created 11/17/1995 Heikki Tuuri #ifndef buf0types_h #define buf0types_h +#include "page0types.h" + /** Buffer page (uncompressed or compressed) */ typedef struct buf_page_struct buf_page_t; /** Buffer block for which an uncompressed page exists */ @@ -60,17 +62,10 @@ enum buf_io_fix { /** Parameters of binary buddy system for compressed pages (buf0buddy.h) */ /* @{ */ -#if UNIV_WORD_SIZE <= 4 /* 32-bit system */ -/** Base-2 logarithm of the smallest buddy block size */ -# define BUF_BUDDY_LOW_SHIFT 6 -#else /* 64-bit system */ -/** Base-2 logarithm of the smallest buddy block size */ -# define BUF_BUDDY_LOW_SHIFT 7 -#endif +#define BUF_BUDDY_LOW_SHIFT PAGE_ZIP_MIN_SIZE_SHIFT + #define BUF_BUDDY_LOW (1 << BUF_BUDDY_LOW_SHIFT) - /*!< minimum block size in the binary - buddy system; must be at least - sizeof(buf_page_t) */ + #define BUF_BUDDY_SIZES (UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT) /*!< number of buddy sizes */ From c75e8aa6e1ff904c75ace8e9ddb213b412c7216c Mon Sep 17 00:00:00 2001 From: Inaam Rana Date: Mon, 20 Jun 2011 00:37:36 -0400 Subject: [PATCH 12/35] Fix a merge error introduced in: revision-id: inaam.rana@oracle.com-20110617202919-b7p0u0ekj5a9u9nu --- storage/innobase/buf/buf0buf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/storage/innobase/buf/buf0buf.c b/storage/innobase/buf/buf0buf.c index b493529bb7f..a282954e8e8 100644 --- a/storage/innobase/buf/buf0buf.c +++ b/storage/innobase/buf/buf0buf.c @@ -3232,9 +3232,6 @@ err_exit: uninitialized data. */ data = buf_buddy_alloc(buf_pool, zip_size, &lru); - /* Initialize the buf_pool pointer. */ - bpage->buf_pool_index = buf_pool_index(buf_pool); - /* If buf_buddy_alloc() allocated storage from the LRU list, it released and reacquired buf_pool->mutex. Thus, we must check the page_hash again, as it may have been modified. */ @@ -3258,6 +3255,9 @@ err_exit: bpage = buf_page_alloc_descriptor(); + /* Initialize the buf_pool pointer. */ + bpage->buf_pool_index = buf_pool_index(buf_pool); + page_zip_des_init(&bpage->zip); page_zip_set_size(&bpage->zip, zip_size); bpage->zip.data = data; From e1aa1f874ec6ac67ee14cabd6ba3c5f28bc3a79f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 22 Jun 2011 11:20:19 +0300 Subject: [PATCH 13/35] Bug#11753728 45225: Locking: hang if drop table with no timeout Re-enable a test that was disabled as collateral damage. Starting with MySQL 5.5, queries will acquire and hold a shared meta-data lock (MDL) on tables they process, until the transaction is committed or rolled back. This will prevent DDL operations on the tables, such as creating an index. innodb-index.test: Use a second table for creating the index. The index will still be "too new" for the transaction that was started before the index creation was started. --- mysql-test/suite/innodb/r/innodb-index.result | 40 +++++++++++++ mysql-test/suite/innodb/t/innodb-index.test | 56 +++++++++---------- 2 files changed, 68 insertions(+), 28 deletions(-) diff --git a/mysql-test/suite/innodb/r/innodb-index.result b/mysql-test/suite/innodb/r/innodb-index.result index 7d7062c9161..3f3a14c72f2 100644 --- a/mysql-test/suite/innodb/r/innodb-index.result +++ b/mysql-test/suite/innodb/r/innodb-index.result @@ -1085,3 +1085,43 @@ t2 CREATE TABLE `t2` ( ) ENGINE=InnoDB DEFAULT CHARSET=latin1 DROP TABLE t2; DROP TABLE t1; +CREATE TABLE t1 (a INT, b CHAR(1)) ENGINE=InnoDB; +INSERT INTO t1 VALUES (3,'a'),(3,'b'),(1,'c'),(0,'d'),(1,'e'); +CREATE TABLE t2 SELECT * FROM t1; +BEGIN; +SELECT * FROM t1; +a b +3 a +3 b +1 c +0 d +1 e +SET lock_wait_timeout=1; +CREATE INDEX t1a ON t1(a); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction +CREATE INDEX t2a ON t2(a); +SELECT * FROM t2; +a b +3 a +3 b +1 c +0 d +1 e +SELECT * FROM t2 FORCE INDEX(t2a) ORDER BY a; +ERROR HY000: Table definition has changed, please retry transaction +SELECT * FROM t2; +a b +3 a +3 b +1 c +0 d +1 e +COMMIT; +SELECT * FROM t2 FORCE INDEX(t2a) ORDER BY a; +a b +0 d +1 c +1 e +3 a +3 b +DROP TABLE t1,t2; diff --git a/mysql-test/suite/innodb/t/innodb-index.test b/mysql-test/suite/innodb/t/innodb-index.test index baeeb240c06..7f5b041202c 100644 --- a/mysql-test/suite/innodb/t/innodb-index.test +++ b/mysql-test/suite/innodb/t/innodb-index.test @@ -516,34 +516,34 @@ SHOW CREATE TABLE t2; DROP TABLE t2; DROP TABLE t1; -# The following tests are disabled because of the introduced timeouts for -# metadata locks at the MySQL level as part of the fix for -# Bug#45225 Locking: hang if drop table with no timeout -# The following CREATE INDEX t1a ON t1(a); causes a lock wait timeout -# start disabled45225_2 -#connect (a,localhost,root,,); -#connect (b,localhost,root,,); -#connection a; -#CREATE TABLE t1 (a INT, b CHAR(1)) ENGINE=InnoDB; -#INSERT INTO t1 VALUES (3,'a'),(3,'b'),(1,'c'),(0,'d'),(1,'e'); -#connection b; -#BEGIN; -#SELECT * FROM t1; -#connection a; -#CREATE INDEX t1a ON t1(a); -#connection b; -#SELECT * FROM t1; -#--error ER_TABLE_DEF_CHANGED -#SELECT * FROM t1 FORCE INDEX(t1a) ORDER BY a; -#SELECT * FROM t1; -#COMMIT; -#SELECT * FROM t1 FORCE INDEX(t1a) ORDER BY a; -#connection default; -#disconnect a; -#disconnect b; -# -#DROP TABLE t1; -# end disabled45225_2 +connect (a,localhost,root,,); +connect (b,localhost,root,,); +connection a; +CREATE TABLE t1 (a INT, b CHAR(1)) ENGINE=InnoDB; +INSERT INTO t1 VALUES (3,'a'),(3,'b'),(1,'c'),(0,'d'),(1,'e'); +CREATE TABLE t2 SELECT * FROM t1; +connection b; +BEGIN; +# This acquires a MDL lock on t1 until commit. +SELECT * FROM t1; +connection a; +# This times out before of the MDL lock held by connection b. +SET lock_wait_timeout=1; +--error ER_LOCK_WAIT_TIMEOUT +CREATE INDEX t1a ON t1(a); +CREATE INDEX t2a ON t2(a); +connection b; +SELECT * FROM t2; +--error ER_TABLE_DEF_CHANGED +SELECT * FROM t2 FORCE INDEX(t2a) ORDER BY a; +SELECT * FROM t2; +COMMIT; +SELECT * FROM t2 FORCE INDEX(t2a) ORDER BY a; +connection default; +disconnect a; +disconnect b; + +DROP TABLE t1,t2; # # restore environment to the state it was before this test execution From af5c95c2991508b8a59a0ad9ac28d6461f77609d Mon Sep 17 00:00:00 2001 From: Guilhem Bichot Date: Tue, 7 Jun 2011 09:53:37 +0200 Subject: [PATCH 14/35] This patch is for 5.5. WL#5914 remove option "--all" of the "perror" program. In 5.5 we just give a deprecation warning; in trunk we remove the option. --- extra/perror.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/extra/perror.c b/extra/perror.c index 3c00b9b11d7..e9d73a665e6 100644 --- a/extra/perror.c +++ b/extra/perror.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2000 MySQL AB +/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -11,7 +11,7 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* Return error-text for system error messages and handler messages */ @@ -64,7 +64,8 @@ static struct my_option my_long_options[] = &ndb_code, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, #endif #ifdef HAVE_SYS_ERRLIST - {"all", 'a', "Print all the error messages and the number.", + {"all", 'a', "Print all the error messages and the number. Deprecated," + " will be removed in a future release.", &print_all_codes, &print_all_codes, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, #endif @@ -295,6 +296,8 @@ int main(int argc,char *argv[]) if (print_all_codes) { HA_ERRORS *ha_err_ptr; + printf("WARNING: option '-a/--all' is deprecated and will be removed in a" + " future release.\n"); for (code=1 ; code < sys_nerr ; code++) { if (sys_errlist[code] && sys_errlist[code][0]) From dd4957965411c1b67ebfb1ba8650a3090d305f63 Mon Sep 17 00:00:00 2001 From: Georgi Kodinov Date: Tue, 7 Jun 2011 15:43:16 +0300 Subject: [PATCH 15/35] Bug #12589928: MEMORY LEAK WHEN RUNNING SYS_VARS.SECURE_FILE_PRIV This is the 5.1 version of the fix. Need to free the memory allocated by the option parsing code for empty strings when resetting the pointer to NULL. No test case needed, as the existing ones already cover this path. --- sql/mysqld.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 36f195e6232..13395cd9e07 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -9063,6 +9063,7 @@ static int fix_paths(void) { if (*opt_secure_file_priv == 0) { + my_free(opt_secure_file_priv, MYF(0)); opt_secure_file_priv= 0; } else From 98d527d3cb89b5e7f8e14d4e5ccf43b03b6f6e25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 14 Jun 2011 08:40:32 +0300 Subject: [PATCH 16/35] Merge a fix from mysql-5.5 to mysql-5.1: revno 2995.37.209 revision id marko.makela@oracle.com-20110518120508-qhn7vz814vn77v5k parent marko.makela@oracle.com-20110517121555-lmple24qzxqkzep4 timestamp: Wed 2011-05-18 15:05:08 +0300 message: Fix a bogus UNIV_SYNC_DEBUG failure in the fix of Bug #59641 or Oracle Bug #11766513. trx_undo_free_prepared(): Do not acquire or release trx->rseg->mutex. This code is invoked in the single-threaded part of shutdown, therefore a mutex is not needed. --- storage/innodb_plugin/trx/trx0undo.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/storage/innodb_plugin/trx/trx0undo.c b/storage/innodb_plugin/trx/trx0undo.c index 68ff82f618c..7f03b68fb55 100644 --- a/storage/innodb_plugin/trx/trx0undo.c +++ b/storage/innodb_plugin/trx/trx0undo.c @@ -1986,8 +1986,6 @@ trx_undo_free_prepared( /*===================*/ trx_t* trx) /*!< in/out: PREPARED transaction */ { - mutex_enter(&trx->rseg->mutex); - ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); if (trx->update_undo) { @@ -2002,6 +2000,5 @@ trx_undo_free_prepared( trx->insert_undo); trx_undo_mem_free(trx->insert_undo); } - mutex_exit(&trx->rseg->mutex); } #endif /* !UNIV_HOTBACKUP */ From a8629376994c1365923ac9144354d7cd99d8288a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 15 Jun 2011 10:16:59 +0300 Subject: [PATCH 17/35] Introduce UNIV_BLOB_NULL_DEBUG for temporarily hiding Bug#12650861. Some ut_a(!rec_offs_any_null_extern()) assertion failures are indicating genuine BLOB bugs, others are bogus failures when rolling back incomplete transactions at crash recovery. This needs more work, and until I get a chance to work on it, other testing must not be disrupted by this. --- storage/innobase/btr/btr0cur.c | 4 ++-- storage/innobase/include/rem0rec.h | 4 ++-- storage/innobase/include/rem0rec.ic | 4 ++-- storage/innobase/include/univ.i | 4 ++-- storage/innobase/row/row0row.c | 8 ++++---- storage/innobase/row/row0vers.c | 16 ++++++++-------- storage/innobase/trx/trx0rec.c | 4 ++-- storage/innodb_plugin/btr/btr0cur.c | 4 ++-- storage/innodb_plugin/include/rem0rec.h | 4 ++-- storage/innodb_plugin/include/rem0rec.ic | 4 ++-- storage/innodb_plugin/include/univ.i | 2 ++ storage/innodb_plugin/row/row0row.c | 8 ++++---- storage/innodb_plugin/row/row0vers.c | 16 ++++++++-------- storage/innodb_plugin/trx/trx0rec.c | 4 ++-- 14 files changed, 44 insertions(+), 42 deletions(-) diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c index 1d17aa998f6..d5bed3bec99 100644 --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -1592,9 +1592,9 @@ btr_cur_optimistic_update( heap = mem_heap_create(1024); offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ #ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h index 67baeb7d8d2..a1a206c3281 100644 --- a/storage/innobase/include/rem0rec.h +++ b/storage/innobase/include/rem0rec.h @@ -339,7 +339,7 @@ rec_offs_any_extern( /*================*/ /* out: TRUE if a field is stored externally */ const ulint* offsets);/* in: array returned by rec_get_offsets() */ -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG /******************************************************** Determine if the offsets are for a record containing null BLOB pointers. */ UNIV_INLINE @@ -351,7 +351,7 @@ rec_offs_any_null_extern( or NULL if none found */ rec_t* rec, /*!< in: record */ const ulint* offsets); /*!< in: rec_get_offsets(rec) */ -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ /*************************************************************** Sets the value of the ith field extern storage bit. */ UNIV_INLINE diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic index 566c62e30f2..9e659f12881 100644 --- a/storage/innobase/include/rem0rec.ic +++ b/storage/innobase/include/rem0rec.ic @@ -1021,7 +1021,7 @@ rec_offs_any_extern( return(FALSE); } -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG /******************************************************** Determine if the offsets are for a record containing null BLOB pointers. */ UNIV_INLINE @@ -1055,7 +1055,7 @@ rec_offs_any_null_extern( return(NULL); } -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ /*************************************************************** Sets the value of the ith field extern storage bit. */ diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index a67b1b3895e..8eb78491b04 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -88,8 +88,8 @@ memory is read outside the allocated blocks. */ #if 0 #define UNIV_DEBUG_VALGRIND /* Enable extra Valgrind instrumentation */ -#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column - debugging without UNIV_DEBUG */ +#define UNIV_BLOB_NULL_DEBUG /* Enable deep off-page + column debugging */ #define UNIV_DEBUG /* Enable ut_ad() assertions */ #define UNIV_LIST_DEBUG /* debug UT_LIST_ macros */ #define UNIV_MEM_DEBUG /* detect memory leaks etc */ diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c index 247f19e097c..b9efdcfbfdd 100644 --- a/storage/innobase/row/row0row.c +++ b/storage/innobase/row/row0row.c @@ -210,13 +210,13 @@ row_build( ut_ad(rec_offs_validate(rec, index, offsets)); } -#if 0/* defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG*/ +#if 0 && defined UNIV_BLOB_NULL_DEBUG /* This one can fail in trx_rollback_or_clean_all_without_sess() if the server crashed during an insert before the btr_store_big_rec_extern_fields() did mtr_commit() all BLOB pointers to the clustered index record. */ ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* 0 && UNIV_BLOB_NULL_DEBUG */ if (type != ROW_COPY_POINTERS) { /* Take a copy of rec to heap */ @@ -310,10 +310,10 @@ row_rec_to_index_entry( rec = rec_copy(buf, rec, offsets); /* Avoid a debug assertion in rec_offs_validate(). */ rec_offs_make_valid(rec, index, offsets); -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG } else { ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ } rec_len = rec_offs_n_fields(offsets); diff --git a/storage/innobase/row/row0vers.c b/storage/innobase/row/row0vers.c index 906b46fb51b..a52ef3cc083 100644 --- a/storage/innobase/row/row0vers.c +++ b/storage/innobase/row/row0vers.c @@ -473,10 +473,10 @@ row_vers_build_for_consistent_read( /* The view already sees this version: we can copy it to in_heap and return */ -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#if defined UNIV_BLOB_NULL_DEBUG ut_a(!rec_offs_any_null_extern( version, *offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets)); @@ -511,9 +511,9 @@ row_vers_build_for_consistent_read( *offsets = rec_get_offsets(prev_version, index, *offsets, ULINT_UNDEFINED, offset_heap); -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG ut_a(!rec_offs_any_null_extern(prev_version, *offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ trx_id = row_get_rec_trx_id(prev_version, index, *offsets); @@ -615,9 +615,9 @@ row_vers_build_for_semi_consistent_read( /* We found a version that belongs to a committed transaction: return it. */ -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG ut_a(!rec_offs_any_null_extern(version, *offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ if (rec == version) { *old_vers = rec; @@ -676,9 +676,9 @@ row_vers_build_for_semi_consistent_read( version = prev_version; *offsets = rec_get_offsets(version, index, *offsets, ULINT_UNDEFINED, offset_heap); -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG ut_a(!rec_offs_any_null_extern(version, *offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ }/* for (;;) */ if (heap) { diff --git a/storage/innobase/trx/trx0rec.c b/storage/innobase/trx/trx0rec.c index 730ac6a6f60..2a9224b0a72 100644 --- a/storage/innobase/trx/trx0rec.c +++ b/storage/innobase/trx/trx0rec.c @@ -1397,9 +1397,9 @@ trx_undo_prev_version_build( return(DB_ERROR); } -# if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +# ifdef UNIV_BLOB_NULL_DEBUG ut_a(!rec_offs_any_null_extern(rec, offsets)); -# endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +# endif /* UNIV_BLOB_NULL_DEBUG */ if (row_upd_changes_field_size_or_external(index, offsets, update)) { ulint* ext_vect; diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c index ea59fe9d025..84ba0b99e58 100644 --- a/storage/innodb_plugin/btr/btr0cur.c +++ b/storage/innodb_plugin/btr/btr0cur.c @@ -1871,9 +1871,9 @@ btr_cur_optimistic_update( heap = mem_heap_create(1024); offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ #ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { diff --git a/storage/innodb_plugin/include/rem0rec.h b/storage/innodb_plugin/include/rem0rec.h index 06de23be757..fff44eecb00 100644 --- a/storage/innodb_plugin/include/rem0rec.h +++ b/storage/innodb_plugin/include/rem0rec.h @@ -480,7 +480,7 @@ ulint rec_offs_any_extern( /*================*/ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG /******************************************************//** Determine if the offsets are for a record containing null BLOB pointers. @return first field containing a null BLOB pointer, or NULL if none found */ @@ -491,7 +491,7 @@ rec_offs_any_null_extern( const rec_t* rec, /*!< in: record */ const ulint* offsets) /*!< in: rec_get_offsets(rec) */ __attribute__((nonnull, warn_unused_result)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ /******************************************************//** Returns nonzero if the extern bit is set in nth field of rec. @return nonzero if externally stored */ diff --git a/storage/innodb_plugin/include/rem0rec.ic b/storage/innodb_plugin/include/rem0rec.ic index 7cff36fee6c..252484a7433 100644 --- a/storage/innodb_plugin/include/rem0rec.ic +++ b/storage/innodb_plugin/include/rem0rec.ic @@ -1088,7 +1088,7 @@ rec_offs_any_extern( return(UNIV_UNLIKELY(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL)); } -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG /******************************************************//** Determine if the offsets are for a record containing null BLOB pointers. @return first field containing a null BLOB pointer, or NULL if none found */ @@ -1124,7 +1124,7 @@ rec_offs_any_null_extern( return(NULL); } -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ /******************************************************//** Returns nonzero if the extern bit is set in nth field of rec. diff --git a/storage/innodb_plugin/include/univ.i b/storage/innodb_plugin/include/univ.i index 6ac227a59a6..63d8e7140b5 100644 --- a/storage/innodb_plugin/include/univ.i +++ b/storage/innodb_plugin/include/univ.i @@ -179,6 +179,8 @@ command. Not tested on Windows. */ debugging without UNIV_DEBUG */ #define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column debugging without UNIV_DEBUG */ +#define UNIV_BLOB_NULL_DEBUG /* Enable deep off-page + column debugging */ #define UNIV_DEBUG /* Enable ut_ad() assertions and disable UNIV_INLINE */ #define UNIV_DEBUG_LOCK_VALIDATE /* Enable diff --git a/storage/innodb_plugin/row/row0row.c b/storage/innodb_plugin/row/row0row.c index 80b449a416b..8aba375d046 100644 --- a/storage/innodb_plugin/row/row0row.c +++ b/storage/innodb_plugin/row/row0row.c @@ -231,13 +231,13 @@ row_build( ut_ad(rec_offs_validate(rec, index, offsets)); } -#if 0 /* defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG */ +#if 0 && defined UNIV_BLOB_NULL_DEBUG /* This one can fail in trx_rollback_active() if the server crashed during an insert before the btr_store_big_rec_extern_fields() did mtr_commit() all BLOB pointers to the clustered index record. */ ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* 0 && UNIV_BLOB_NULL_DEBUG */ if (type != ROW_COPY_POINTERS) { /* Take a copy of rec to heap */ @@ -423,10 +423,10 @@ row_rec_to_index_entry( rec = rec_copy(buf, rec, offsets); /* Avoid a debug assertion in rec_offs_validate(). */ rec_offs_make_valid(rec, index, offsets); -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG } else { ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ } entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap); diff --git a/storage/innodb_plugin/row/row0vers.c b/storage/innodb_plugin/row/row0vers.c index 8a7bb842293..2d39f92d18f 100644 --- a/storage/innodb_plugin/row/row0vers.c +++ b/storage/innodb_plugin/row/row0vers.c @@ -550,10 +550,10 @@ row_vers_build_for_consistent_read( /* The view already sees this version: we can copy it to in_heap and return */ -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG ut_a(!rec_offs_any_null_extern( version, *offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets)); @@ -588,9 +588,9 @@ row_vers_build_for_consistent_read( *offsets = rec_get_offsets(prev_version, index, *offsets, ULINT_UNDEFINED, offset_heap); -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG ut_a(!rec_offs_any_null_extern(prev_version, *offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ trx_id = row_get_rec_trx_id(prev_version, index, *offsets); @@ -691,9 +691,9 @@ row_vers_build_for_semi_consistent_read( /* We found a version that belongs to a committed transaction: return it. */ -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG ut_a(!rec_offs_any_null_extern(version, *offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ if (rec == version) { *old_vers = rec; @@ -752,9 +752,9 @@ row_vers_build_for_semi_consistent_read( version = prev_version; *offsets = rec_get_offsets(version, index, *offsets, ULINT_UNDEFINED, offset_heap); -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +#ifdef UNIV_BLOB_NULL_DEBUG ut_a(!rec_offs_any_null_extern(version, *offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +#endif /* UNIV_BLOB_NULL_DEBUG */ }/* for (;;) */ if (heap) { diff --git a/storage/innodb_plugin/trx/trx0rec.c b/storage/innodb_plugin/trx/trx0rec.c index 9f2fd59d82b..297838365d5 100644 --- a/storage/innodb_plugin/trx/trx0rec.c +++ b/storage/innodb_plugin/trx/trx0rec.c @@ -1577,9 +1577,9 @@ trx_undo_prev_version_build( return(DB_ERROR); } -# if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +# ifdef UNIV_BLOB_NULL_DEBUG ut_a(!rec_offs_any_null_extern(rec, offsets)); -# endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ +# endif /* UNIV_BLOB_NULL_DEBUG */ if (row_upd_changes_field_size_or_external(index, offsets, update)) { ulint n_ext; From 5b4ceba58d1c9c35e0cba1f126290009bd7643ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 16 Jun 2011 10:27:21 +0300 Subject: [PATCH 18/35] Bug#12612184 Race condition after btr_cur_pessimistic_update() btr_cur_compress_if_useful(), btr_compress(): Add the parameter ibool adjust. If adjust=TRUE, adjust the cursor position after compressing the page. btr_lift_page_up(): Return a pointer to the father page. BTR_KEEP_POS_FLAG: A new flag for btr_cur_pessimistic_update(). btr_cur_pessimistic_update(): If *big_rec != NULL and flags & BTR_KEEP_POS_FLAG, keep the cursor positioned on the updated record. Also, do not release the index tree x-lock if *big_rec != NULL. btr_cur_mtr_commit_and_start(): Commits and restarts a mini-transaction so that it will retain an x-lock on index->lock and the page of the cursor. This is invoked when btr_cur_pessimistic_update() returns *big_rec != NULL. In all callers of btr_cur_pessimistic_update() that do not pass BTR_KEEP_POS_FLAG, assert that *big_rec == NULL. btr_cur_compress(): Unused function [in the built-in MySQL 5.1], remove. page_rec_get_nth(): Return the nth record on the page (an inverse function of page_rec_get_n_recs_before()). Refactored from page_get_middle_rec(). page_get_middle_rec(): Invoke page_rec_get_nth(). page_cur_insert_rec_zip_reorg(): Make use of the page directory shortcuts in page_rec_get_nth() instead of scanning the whole list of records. row_ins_clust_index_entry_by_modify(): Pass BTR_KEEP_POS_FLAG to btr_cur_pessimistic_update(). row_ins_index_entry_low(): If row_ins_clust_index_entry_by_modify() returns a big_rec, invoke btr_cur_mtr_commit_and_start() in order to commit and start the mini-transaction without releasing the x-locks on index->lock and the cursor page, and write the big_rec. Releasing the page latch in mtr_commit() caused a race condition. row_upd_clust_rec(): Pass BTR_KEEP_POS_FLAG to btr_cur_pessimistic_update(). If it returns a big_rec, invoke btr_cur_mtr_commit_and_start() in order to commit and start the mini-transaction without releasing the x-locks on index->lock and the cursor page, and write the big_rec. Releasing the page latch in mtr_commit() caused a race condition. sync_thread_add_level(): Add the parameter ibool relock. When TRUE, bypass the latching order rules. rw_lock_add_debug_info(): For nested X-lock requests, pass relock=TRUE to sync_thread_add_level(). rb:678 approved by Jimmy Yang --- storage/innobase/btr/btr0btr.c | 40 ++++++-- storage/innobase/btr/btr0cur.c | 106 ++++++++++++++------- storage/innobase/include/btr0btr.h | 12 ++- storage/innobase/include/btr0cur.h | 39 ++++---- storage/innobase/include/buf0buf.h | 19 ++++ storage/innobase/include/buf0buf.ic | 2 +- storage/innobase/include/page0page.h | 20 +++- storage/innobase/include/page0page.ic | 16 ++++ storage/innobase/include/sync0sync.h | 3 +- storage/innobase/page/page0page.c | 31 +++--- storage/innobase/row/row0ins.c | 44 ++++++++- storage/innobase/row/row0umod.c | 2 + storage/innobase/row/row0upd.c | 34 +++++-- storage/innobase/sync/sync0rw.c | 4 +- storage/innobase/sync/sync0sync.c | 10 +- storage/innodb_plugin/ChangeLog | 10 ++ storage/innodb_plugin/btr/btr0btr.c | 52 +++++++--- storage/innodb_plugin/btr/btr0cur.c | 80 ++++++++++++++-- storage/innodb_plugin/include/btr0btr.h | 15 +-- storage/innodb_plugin/include/btr0cur.h | 26 ++++- storage/innodb_plugin/include/btr0cur.ic | 4 +- storage/innodb_plugin/include/buf0buf.h | 27 +++++- storage/innodb_plugin/include/buf0buf.ic | 17 +--- storage/innodb_plugin/include/page0cur.ic | 5 +- storage/innodb_plugin/include/page0page.h | 39 ++++++-- storage/innodb_plugin/include/page0page.ic | 32 ++++++- storage/innodb_plugin/include/sync0rw.ic | 10 +- storage/innodb_plugin/include/sync0sync.h | 6 +- storage/innodb_plugin/page/page0cur.c | 17 +++- storage/innodb_plugin/page/page0page.c | 52 +++++----- storage/innodb_plugin/row/row0ins.c | 48 ++++++++-- storage/innodb_plugin/row/row0upd.c | 35 +++++-- storage/innodb_plugin/sync/sync0rw.c | 4 +- storage/innodb_plugin/sync/sync0sync.c | 12 ++- 34 files changed, 650 insertions(+), 223 deletions(-) diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.c index 9438277050d..41e0bf6e067 100644 --- a/storage/innobase/btr/btr0btr.c +++ b/storage/innobase/btr/btr0btr.c @@ -1937,7 +1937,7 @@ btr_node_ptr_delete( ut_a(err == DB_SUCCESS); if (!compressed) { - btr_cur_compress_if_useful(&cursor, mtr); + btr_cur_compress_if_useful(&cursor, FALSE, mtr); } } @@ -1945,9 +1945,10 @@ btr_node_ptr_delete( If page is the only on its level, this function moves its records to the father page, thus reducing the tree height. */ static -void +page_t* btr_lift_page_up( /*=============*/ + /* out: father page */ dict_index_t* index, /* in: index tree */ page_t* page, /* in: page which is the only on its level; must not be empty: use @@ -2023,6 +2024,8 @@ btr_lift_page_up( ibuf_reset_free_bits(index, father_page); ut_ad(page_validate(father_page, index)); ut_ad(btr_check_node_ptr(index, father_page, mtr)); + + return(father_page); } /***************************************************************** @@ -2039,11 +2042,13 @@ enough free extents so that the compression will always succeed if done! */ void btr_compress( /*=========*/ - btr_cur_t* cursor, /* in: cursor on the page to merge or lift; - the page must not be empty: in record delete - use btr_discard_page if the page would become - empty */ - mtr_t* mtr) /* in: mtr */ + btr_cur_t* cursor, /* in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /* in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /* in/out: mini-transaction */ { dict_index_t* index; ulint space; @@ -2058,6 +2063,7 @@ btr_compress( rec_t* node_ptr; ulint data_size; ulint n_recs; + ulint nth_rec; ulint max_ins_size; ulint max_ins_size_reorg; ulint comp; @@ -2065,6 +2071,7 @@ btr_compress( page = btr_cur_get_page(cursor); index = btr_cur_get_index(cursor); comp = page_is_comp(page); + ut_a((ibool)!!comp == dict_table_is_comp(index->table)); ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), @@ -2086,6 +2093,10 @@ btr_compress( father_page = buf_frame_align(node_ptr); ut_a(comp == page_is_comp(father_page)); + if (adjust) { + nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); + } + /* Decide the page to which we try to merge and which will inherit the locks */ @@ -2110,9 +2121,8 @@ btr_compress( } else { /* The page is the only one on the level, lift the records to the father */ - btr_lift_page_up(index, page, mtr); - - return; + merge_page = btr_lift_page_up(index, page, mtr); + goto func_exit; } n_recs = page_get_n_recs(page); @@ -2188,6 +2198,10 @@ btr_compress( index, mtr); lock_update_merge_left(merge_page, orig_pred, page); + + if (adjust) { + nth_rec += page_rec_get_n_recs_before(orig_pred); + } } else { orig_succ = page_rec_get_next( page_get_infimum_rec(merge_page)); @@ -2208,6 +2222,12 @@ btr_compress( btr_page_free(index, page, mtr); ut_ad(btr_check_node_ptr(index, merge_page, mtr)); + +func_exit: + if (adjust) { + btr_cur_position(index, page_rec_get_nth(merge_page, nth_rec), + cursor); + } } /***************************************************************** diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c index d5bed3bec99..7bdf87f2793 100644 --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -1791,7 +1791,9 @@ btr_cur_pessimistic_update( /* out: DB_SUCCESS or error code */ ulint flags, /* in: undo logging, locking, and rollback flags */ - btr_cur_t* cursor, /* in: cursor on the record to update */ + btr_cur_t* cursor, /* in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ big_rec_t** big_rec,/* out: big rec vector whose fields have to be stored externally by the caller, or NULL */ upd_t* update, /* in: update vector; this is allowed also @@ -1926,6 +1928,10 @@ btr_cur_pessimistic_update( err = DB_TOO_BIG_RECORD; goto return_after_reservations; } + + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(btr_page_get_level(page, mtr) == 0); + ut_ad(flags & BTR_KEEP_POS_FLAG); } page_cursor = btr_cur_get_page_cur(cursor); @@ -1952,6 +1958,8 @@ btr_cur_pessimistic_update( ut_a(rec || optim_err != DB_UNDERFLOW); if (rec) { + page_cursor->rec = rec; + lock_rec_restore_from_page_infimum(rec, page); rec_set_field_extern_bits(rec, index, ext_vect, n_ext_vect, mtr); @@ -1965,12 +1973,30 @@ btr_cur_pessimistic_update( btr_cur_unmark_extern_fields(rec, mtr, offsets); } - btr_cur_compress_if_useful(cursor, mtr); + btr_cur_compress_if_useful( + cursor, + big_rec_vec != NULL && (flags & BTR_KEEP_POS_FLAG), + mtr); err = DB_SUCCESS; goto return_after_reservations; } + if (big_rec_vec) { + ut_ad(index->type & DICT_CLUSTERED); + ut_ad(btr_page_get_level(page, mtr) == 0); + ut_ad(flags & BTR_KEEP_POS_FLAG); + + /* btr_page_split_and_insert() in + btr_cur_pessimistic_insert() invokes + mtr_memo_release(mtr, index->lock, MTR_MEMO_X_LOCK). + We must keep the index->lock when we created a + big_rec, so that row_upd_clust_rec() can store the + big_rec in the same mini-transaction. */ + + mtr_x_lock(dict_index_get_lock(index), mtr); + } + if (page_cur_is_before_first(page_cursor)) { /* The record to be updated was positioned as the first user record on its page */ @@ -1991,6 +2017,7 @@ btr_cur_pessimistic_update( ut_a(rec); ut_a(err == DB_SUCCESS); ut_a(dummy_big_rec == NULL); + page_cursor->rec = rec; rec_set_field_extern_bits(rec, index, ext_vect, n_ext_vect, mtr); offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); @@ -2025,6 +2052,43 @@ return_after_reservations: return(err); } +/***************************************************************** +Commits and restarts a mini-transaction so that it will retain an +x-lock on index->lock and the cursor page. */ + +void +btr_cur_mtr_commit_and_start( +/*=========================*/ + btr_cur_t* cursor, /* in: cursor */ + mtr_t* mtr) /* in/out: mini-transaction */ +{ + buf_block_t* block; + + block = buf_block_align(btr_cur_get_rec(cursor)); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* Keep the locks across the mtr_commit(mtr). */ + rw_lock_x_lock(dict_index_get_lock(cursor->index)); + rw_lock_x_lock(&block->lock); + mutex_enter(&block->mutex); +#ifdef UNIV_SYNC_DEBUG + buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__); +#else + buf_block_buf_fix_inc(block); +#endif + mutex_exit(&block->mutex); + /* Write out the redo log. */ + mtr_commit(mtr); + mtr_start(mtr); + /* Reassociate the locks with the mini-transaction. + They will be released on mtr_commit(mtr). */ + mtr_memo_push(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK); + mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX); +} + /*==================== B-TREE DELETE MARK AND UNMARK ===============*/ /******************************************************************** @@ -2392,30 +2456,6 @@ btr_cur_del_unmark_for_ibuf( /*==================== B-TREE RECORD REMOVE =========================*/ -/***************************************************************** -Tries to compress a page of the tree on the leaf level. It is assumed -that mtr holds an x-latch on the tree and on the cursor page. To avoid -deadlocks, mtr must also own x-latches to brothers of page, if those -brothers exist. NOTE: it is assumed that the caller has reserved enough -free extents so that the compression will always succeed if done! */ - -void -btr_cur_compress( -/*=============*/ - btr_cur_t* cursor, /* in: cursor on the page to compress; - cursor does not stay valid */ - mtr_t* mtr) /* in: mtr */ -{ - ut_ad(mtr_memo_contains(mtr, - dict_index_get_lock(btr_cur_get_index(cursor)), - MTR_MEMO_X_LOCK)); - ut_ad(mtr_memo_contains(mtr, buf_block_align(btr_cur_get_rec(cursor)), - MTR_MEMO_PAGE_X_FIX)); - ut_ad(btr_page_get_level(btr_cur_get_page(cursor), mtr) == 0); - - btr_compress(cursor, mtr); -} - /***************************************************************** Tries to compress a page of the tree if it seems useful. It is assumed that mtr holds an x-latch on the tree and on the cursor page. To avoid @@ -2427,10 +2467,12 @@ ibool btr_cur_compress_if_useful( /*=======================*/ /* out: TRUE if compression occurred */ - btr_cur_t* cursor, /* in: cursor on the page to compress; - cursor does not stay valid if compression - occurs */ - mtr_t* mtr) /* in: mtr */ + btr_cur_t* cursor, /* in/out: cursor on the page to compress; + cursor does not stay valid if !adjust and + compression occurs */ + ibool adjust, /* in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /* in/out: mini-transaction */ { ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(btr_cur_get_index(cursor)), @@ -2440,7 +2482,7 @@ btr_cur_compress_if_useful( if (btr_cur_compress_recommendation(cursor, mtr)) { - btr_compress(cursor, mtr); + btr_compress(cursor, adjust, mtr); return(TRUE); } @@ -2653,7 +2695,7 @@ return_after_reservations: mem_heap_free(heap); if (ret == FALSE) { - ret = btr_cur_compress_if_useful(cursor, mtr); + ret = btr_cur_compress_if_useful(cursor, FALSE, mtr); } if (n_extents > 0) { diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 1573de7e818..269fa355558 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -312,11 +312,13 @@ enough free extents so that the compression will always succeed if done! */ void btr_compress( /*=========*/ - btr_cur_t* cursor, /* in: cursor on the page to merge or lift; - the page must not be empty: in record delete - use btr_discard_page if the page would become - empty */ - mtr_t* mtr); /* in: mtr */ + btr_cur_t* cursor, /* in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /* in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr); /* in/out: mini-transaction */ /***************************************************************** Discards a page from a B-tree. This is used to remove the last record from a B-tree page: the whole page must be removed at the same time. This cannot diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index 20235c55f22..c068d8d3318 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -23,6 +23,9 @@ Created 10/16/1994 Heikki Tuuri #define BTR_NO_LOCKING_FLAG 2 /* do no record lock checking */ #define BTR_KEEP_SYS_FLAG 4 /* sys fields will be found from the update vector or inserted entry */ +#define BTR_KEEP_POS_FLAG 8 /* btr_cur_pessimistic_update() + must keep cursor position when + moving columns to big_rec */ #define BTR_CUR_ADAPT #define BTR_CUR_HASH_ADAPT @@ -237,7 +240,9 @@ btr_cur_pessimistic_update( /* out: DB_SUCCESS or error code */ ulint flags, /* in: undo logging, locking, and rollback flags */ - btr_cur_t* cursor, /* in: cursor on the record to update */ + btr_cur_t* cursor, /* in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ big_rec_t** big_rec,/* out: big rec vector whose fields have to be stored externally by the caller, or NULL */ upd_t* update, /* in: update vector; this is allowed also @@ -247,6 +252,15 @@ btr_cur_pessimistic_update( updates */ que_thr_t* thr, /* in: query thread */ mtr_t* mtr); /* in: mtr */ +/***************************************************************** +Commits and restarts a mini-transaction so that it will retain an +x-lock on index->lock and the cursor page. */ + +void +btr_cur_mtr_commit_and_start( +/*=========================*/ + btr_cur_t* cursor, /* in: cursor */ + mtr_t* mtr); /* in/out: mini-transaction */ /*************************************************************** Marks a clustered index record deleted. Writes an undo log record to undo log on this delete marking. Writes in the trx id field the id @@ -286,19 +300,6 @@ btr_cur_del_unmark_for_ibuf( rec_t* rec, /* in: record to delete unmark */ mtr_t* mtr); /* in: mtr */ /***************************************************************** -Tries to compress a page of the tree on the leaf level. It is assumed -that mtr holds an x-latch on the tree and on the cursor page. To avoid -deadlocks, mtr must also own x-latches to brothers of page, if those -brothers exist. NOTE: it is assumed that the caller has reserved enough -free extents so that the compression will always succeed if done! */ - -void -btr_cur_compress( -/*=============*/ - btr_cur_t* cursor, /* in: cursor on the page to compress; - cursor does not stay valid */ - mtr_t* mtr); /* in: mtr */ -/***************************************************************** Tries to compress a page of the tree if it seems useful. It is assumed that mtr holds an x-latch on the tree and on the cursor page. To avoid deadlocks, mtr must also own x-latches to brothers of page, if those @@ -309,10 +310,12 @@ ibool btr_cur_compress_if_useful( /*=======================*/ /* out: TRUE if compression occurred */ - btr_cur_t* cursor, /* in: cursor on the page to compress; - cursor does not stay valid if compression - occurs */ - mtr_t* mtr); /* in: mtr */ + btr_cur_t* cursor, /* in/out: cursor on the page to compress; + cursor does not stay valid if !adjust and + compression occurs */ + ibool adjust, /* in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr); /* in/out: mini-transaction */ /*********************************************************** Removes the record on which the tree cursor is positioned. It is assumed that the mtr has an x-latch on the page where the cursor is positioned, diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 3e8972d9182..7479ce9cbf0 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -682,6 +682,25 @@ buf_page_address_fold( /* out: the folded value */ ulint space, /* in: space id */ ulint offset);/* in: offset of the page within space */ +#ifdef UNIV_SYNC_DEBUG +/*********************************************************************** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_inc_debug( +/*========================*/ + buf_block_t* block, /* in: block to bufferfix */ + const char* file __attribute__ ((unused)), /* in: file name */ + ulint line __attribute__ ((unused))); /* in: line */ +#else /* UNIV_SYNC_DEBUG */ +/*********************************************************************** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_inc( +/*==================*/ + buf_block_t* block); /* in: block to bufferfix */ +#endif /* UNIV_SYNC_DEBUG */ /********************************************************************** Returns the control block of a file page, NULL if not found. */ UNIV_INLINE diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic index 58c5fd9ef3d..f4d3619f73f 100644 --- a/storage/innobase/include/buf0buf.ic +++ b/storage/innobase/include/buf0buf.ic @@ -660,6 +660,6 @@ buf_page_dbg_add_level( ulint level __attribute__((unused))) /* in: latching order level */ { - sync_thread_add_level(&(buf_block_align(frame)->lock), level); + sync_thread_add_level(&(buf_block_align(frame)->lock), level, FALSE); } #endif /* UNIV_SYNC_DEBUG */ diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h index 273007c2778..24698557e77 100644 --- a/storage/innobase/include/page0page.h +++ b/storage/innobase/include/page0page.h @@ -234,10 +234,21 @@ page_get_supremum_rec( /*==================*/ /* out: the last record in record list */ page_t* page); /* in: page which must have record(s) */ -/**************************************************************** -Returns the middle record of record list. If there are an even number -of records in the list, returns the first record of upper half-list. */ +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). */ +rec_t* +page_rec_get_nth( +/*=============*/ + /* out: nth record */ + page_t* page, /* in: page */ + ulint nth); /* in: nth record */ +/***************************************************************** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. */ +UNIV_INLINE rec_t* page_get_middle_rec( /*================*/ @@ -280,7 +291,8 @@ page_get_n_recs( page_t* page); /* in: index page */ /******************************************************************* Returns the number of records before the given record in chain. -The number includes infimum and supremum records. */ +The number includes infimum and supremum records. +This is the inverse function of page_rec_get_nth(). */ ulint page_rec_get_n_recs_before( diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic index d9e67f3eeeb..a019aa28515 100644 --- a/storage/innobase/include/page0page.ic +++ b/storage/innobase/include/page0page.ic @@ -340,6 +340,22 @@ page_rec_is_infimum( return(page_rec_is_infimum_low(page_offset(rec))); } +/***************************************************************** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. */ +UNIV_INLINE +rec_t* +page_get_middle_rec( +/*================*/ + /* out: middle record */ + page_t* page) /* in: page */ +{ + ulint middle = (page_get_n_recs(page) + 2) / 2; + + return(page_rec_get_nth(page, middle)); +} + /***************************************************************** Compares a data tuple to a physical record. Differs from the function cmp_dtuple_rec_with_match in the way that the record must reside on an diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index 9430d4cb723..595dca0da6d 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -198,8 +198,9 @@ void sync_thread_add_level( /*==================*/ void* latch, /* in: pointer to a mutex or an rw-lock */ - ulint level); /* in: level in the latching order; if + ulint level, /* in: level in the latching order; if SYNC_LEVEL_VARYING, nothing is done */ + ibool relock);/* in: TRUE if re-entering an x-lock */ /********************************************************************** Removes a latch from the thread level array if it is found there. */ diff --git a/storage/innobase/page/page0page.c b/storage/innobase/page/page0page.c index 543cf9e34eb..6a89df7de22 100644 --- a/storage/innobase/page/page0page.c +++ b/storage/innobase/page/page0page.c @@ -1194,49 +1194,42 @@ page_dir_balance_slot( } /**************************************************************** -Returns the middle record of the record list. If there are an even number -of records in the list, returns the first record of the upper half-list. */ +Returns the nth record of the record list. */ rec_t* -page_get_middle_rec( -/*================*/ - /* out: middle record */ - page_t* page) /* in: page */ +page_rec_get_nth( +/*=============*/ + /* out: nth record */ + page_t* page, /* in: page */ + ulint nth) /* in: nth record */ { page_dir_slot_t* slot; - ulint middle; ulint i; ulint n_owned; - ulint count; rec_t* rec; - /* This many records we must leave behind */ - middle = (page_get_n_recs(page) + 2) / 2; - - count = 0; + ut_ad(nth < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); for (i = 0;; i++) { slot = page_dir_get_nth_slot(page, i); n_owned = page_dir_slot_get_n_owned(slot); - if (count + n_owned > middle) { + if (n_owned > nth) { break; } else { - count += n_owned; + nth -= n_owned; } } ut_ad(i > 0); slot = page_dir_get_nth_slot(page, i - 1); rec = page_dir_slot_get_rec(slot); - rec = page_rec_get_next(rec); - /* There are now count records behind rec */ - - for (i = 0; i < middle - count; i++) { + do { rec = page_rec_get_next(rec); - } + ut_ad(rec); + } while (nth--); return(rec); } diff --git a/storage/innobase/row/row0ins.c b/storage/innobase/row/row0ins.c index 9786f90fd39..7ff443a11ad 100644 --- a/storage/innobase/row/row0ins.c +++ b/storage/innobase/row/row0ins.c @@ -259,6 +259,7 @@ row_ins_sec_index_entry_by_modify( err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, cursor, &dummy_big_rec, update, 0, thr, mtr); + ut_a(!dummy_big_rec); } func_exit: mem_heap_free(heap); @@ -329,8 +330,9 @@ row_ins_clust_index_entry_by_modify( goto func_exit; } - err = btr_cur_pessimistic_update(0, cursor, big_rec, update, - 0, thr, mtr); + err = btr_cur_pessimistic_update( + BTR_KEEP_POS_FLAG, cursor, big_rec, update, + 0, thr, mtr); } func_exit: mem_heap_free(heap); @@ -2083,6 +2085,41 @@ row_ins_index_entry_low( err = row_ins_clust_index_entry_by_modify( mode, &cursor, &big_rec, entry, ext_vec, n_ext_vec, thr, &mtr); + + if (big_rec) { + ut_a(err == DB_SUCCESS); + /* Write out the externally stored + columns while still x-latching + index->lock and block->lock. We have + to mtr_commit(mtr) first, so that the + redo log will be written in the + correct order. Otherwise, we would run + into trouble on crash recovery if mtr + freed B-tree pages on which some of + the big_rec fields will be written. */ + btr_cur_mtr_commit_and_start(&cursor, &mtr); + + rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, + &heap); + + err = btr_store_big_rec_extern_fields( + index, rec, offsets, big_rec, &mtr); + /* If writing big_rec fails (for + example, because of DB_OUT_OF_FILE_SPACE), + the record will be corrupted. Even if + we did not update any externally + stored columns, our update could cause + the record to grow so that a + non-updated column was selected for + external storage. This non-update + would not have been written to the + undo log, and thus the record cannot + be rolled back. */ + ut_a(err == DB_SUCCESS); + goto stored_big_rec; + } } else { err = row_ins_sec_index_entry_by_modify( mode, &cursor, entry, thr, &mtr); @@ -2119,7 +2156,6 @@ function_exit: mtr_commit(&mtr); if (big_rec) { - rec_t* rec; mtr_start(&mtr); btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, @@ -2130,7 +2166,7 @@ function_exit: err = btr_store_big_rec_extern_fields(index, rec, offsets, big_rec, &mtr); - +stored_big_rec: if (modify) { dtuple_big_rec_free(big_rec); } else { diff --git a/storage/innobase/row/row0umod.c b/storage/innobase/row/row0umod.c index a3333fcc536..0b00aa2411a 100644 --- a/storage/innobase/row/row0umod.c +++ b/storage/innobase/row/row0umod.c @@ -119,6 +119,7 @@ row_undo_mod_clust_low( | BTR_KEEP_SYS_FLAG, btr_cur, &dummy_big_rec, node->update, node->cmpl_info, thr, mtr); + ut_ad(!dummy_big_rec); } return(err); @@ -471,6 +472,7 @@ row_undo_mod_del_unmark_sec_and_undo_update( BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG, btr_cur, &dummy_big_rec, update, 0, thr, &mtr); + ut_ad(!dummy_big_rec); } mem_heap_free(heap); diff --git a/storage/innobase/row/row0upd.c b/storage/innobase/row/row0upd.c index 0790cfe02e2..694b00ea265 100644 --- a/storage/innobase/row/row0upd.c +++ b/storage/innobase/row/row0upd.c @@ -1580,32 +1580,48 @@ row_upd_clust_rec( ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), dict_table_is_comp(index->table))); - err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, - &big_rec, node->update, - node->cmpl_info, thr, mtr); - mtr_commit(mtr); + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur, + &big_rec, node->update, node->cmpl_info, thr, mtr); - if (err == DB_SUCCESS && big_rec) { + if (big_rec) { mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; rec_t* rec; *offsets_ = (sizeof offsets_) / sizeof *offsets_; - mtr_start(mtr); + ut_a(err == DB_SUCCESS); + /* Write out the externally stored columns while still + x-latching index->lock and block->lock. We have to + mtr_commit(mtr) first, so that the redo log will be + written in the correct order. Otherwise, we would run + into trouble on crash recovery if mtr freed B-tree + pages on which some of the big_rec fields will be + written. */ + btr_cur_mtr_commit_and_start(btr_cur, mtr); - ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); rec = btr_cur_get_rec(btr_cur); err = btr_store_big_rec_extern_fields( index, rec, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), - big_rec, mtr); + big_rec, mtr); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } - mtr_commit(mtr); + /* If writing big_rec fails (for example, because of + DB_OUT_OF_FILE_SPACE), the record will be corrupted. + Even if we did not update any externally stored + columns, our update could cause the record to grow so + that a non-updated column was selected for external + storage. This non-update would not have been written + to the undo log, and thus the record cannot be rolled + back. */ + ut_a(err == DB_SUCCESS); } + mtr_commit(mtr); + if (big_rec) { dtuple_big_rec_free(big_rec); } diff --git a/storage/innobase/sync/sync0rw.c b/storage/innobase/sync/sync0rw.c index ef4c07e8c26..089e87a8a5c 100644 --- a/storage/innobase/sync/sync0rw.c +++ b/storage/innobase/sync/sync0rw.c @@ -663,7 +663,9 @@ rw_lock_add_debug_info( rw_lock_debug_mutex_exit(); if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) { - sync_thread_add_level(lock, lock->level); + sync_thread_add_level(lock, lock->level, + lock_type == RW_LOCK_EX + && lock->writer_count > 1); } } diff --git a/storage/innobase/sync/sync0sync.c b/storage/innobase/sync/sync0sync.c index 944fd2a97fc..1099dff798e 100644 --- a/storage/innobase/sync/sync0sync.c +++ b/storage/innobase/sync/sync0sync.c @@ -641,7 +641,7 @@ mutex_set_debug_info( ut_ad(mutex); ut_ad(file_name); - sync_thread_add_level(mutex, mutex->level); + sync_thread_add_level(mutex, mutex->level, FALSE); mutex->file_name = file_name; mutex->line = line; @@ -1011,8 +1011,9 @@ void sync_thread_add_level( /*==================*/ void* latch, /* in: pointer to a mutex or an rw-lock */ - ulint level) /* in: level in the latching order; if + ulint level, /* in: level in the latching order; if SYNC_LEVEL_VARYING, nothing is done */ + ibool relock) /* in: TRUE if re-entering an x-lock */ { sync_level_t* array; sync_level_t* slot; @@ -1060,6 +1061,10 @@ sync_thread_add_level( array = thread_slot->levels; + if (relock) { + goto levels_ok; + } + /* NOTE that there is a problem with _NODE and _LEAF levels: if the B-tree height changes, then a leaf can change to an internal node or the other way around. We do not know at present if this can cause @@ -1209,6 +1214,7 @@ sync_thread_add_level( ut_error; } +levels_ok: for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { slot = sync_thread_levels_get_nth(array, i); diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index fa8b4727cb1..ef5f87172c6 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,13 @@ +2011-06-16 The InnoDB Team + + * btr/btr0btr.c, btr/btr0cur.c, include/btr0btr.h, include/btr0cur.h, + include/btr0cur.ic, include/buf0buf.h, include/buf0buf.ic, + include/page0cur.ic, include/page0page.h, include/page0page.ic, + include/sync0rw.ic, include/sync0sync.h, page/page0cur.c, + page/page0page.c, row/row0ins.c, row/row0upd.c, + sync/sync0rw.c, sync/sync0sync.c: + Fix Bug#12612184 Race condition after btr_cur_pessimistic_update() + 2011-06-09 The InnoDB Team * btr/btr0cur.c, include/rem0rec.h, include/rem0rec.ic, * row/row0row.c, row/row0vers.c, trx/trx0rec.c: diff --git a/storage/innodb_plugin/btr/btr0btr.c b/storage/innodb_plugin/btr/btr0btr.c index 46810c011c4..fb472062fe6 100644 --- a/storage/innodb_plugin/btr/btr0btr.c +++ b/storage/innodb_plugin/btr/btr0btr.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -2272,7 +2272,7 @@ btr_attach_half_pages( /*==================*/ dict_index_t* index, /*!< in: the index tree */ buf_block_t* block, /*!< in/out: page to be split */ - rec_t* split_rec, /*!< in: first record on upper + const rec_t* split_rec, /*!< in: first record on upper half page */ buf_block_t* new_block, /*!< in/out: the new half page */ ulint direction, /*!< in: FSP_UP or FSP_DOWN */ @@ -2964,15 +2964,16 @@ btr_node_ptr_delete( ut_a(err == DB_SUCCESS); if (!compressed) { - btr_cur_compress_if_useful(&cursor, mtr); + btr_cur_compress_if_useful(&cursor, FALSE, mtr); } } /*************************************************************//** If page is the only on its level, this function moves its records to the -father page, thus reducing the tree height. */ +father page, thus reducing the tree height. +@return father block */ static -void +buf_block_t* btr_lift_page_up( /*=============*/ dict_index_t* index, /*!< in: index tree */ @@ -3089,6 +3090,8 @@ btr_lift_page_up( } ut_ad(page_validate(father_page, index)); ut_ad(btr_check_node_ptr(index, father_block, mtr)); + + return(father_block); } /*************************************************************//** @@ -3105,11 +3108,13 @@ UNIV_INTERN ibool btr_compress( /*=========*/ - btr_cur_t* cursor, /*!< in: cursor on the page to merge or lift; - the page must not be empty: in record delete - use btr_discard_page if the page would become - empty */ - mtr_t* mtr) /*!< in: mtr */ + btr_cur_t* cursor, /*!< in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { dict_index_t* index; ulint space; @@ -3127,12 +3132,14 @@ btr_compress( ulint* offsets; ulint data_size; ulint n_recs; + ulint nth_rec; ulint max_ins_size; ulint max_ins_size_reorg; block = btr_cur_get_block(cursor); page = btr_cur_get_page(cursor); index = btr_cur_get_index(cursor); + ut_a((ibool) !!page_is_comp(page) == dict_table_is_comp(index->table)); ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), @@ -3153,6 +3160,10 @@ btr_compress( offsets = btr_page_get_father_block(NULL, heap, index, block, mtr, &father_cursor); + if (adjust) { + nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); + } + /* Decide the page to which we try to merge and which will inherit the locks */ @@ -3179,9 +3190,9 @@ btr_compress( } else { /* The page is the only one on the level, lift the records to the father */ - btr_lift_page_up(index, block, mtr); - mem_heap_free(heap); - return(TRUE); + + merge_block = btr_lift_page_up(index, block, mtr); + goto func_exit; } n_recs = page_get_n_recs(page); @@ -3263,6 +3274,10 @@ err_exit: btr_node_ptr_delete(index, block, mtr); lock_update_merge_left(merge_block, orig_pred, block); + + if (adjust) { + nth_rec += page_rec_get_n_recs_before(orig_pred); + } } else { rec_t* orig_succ; #ifdef UNIV_BTR_DEBUG @@ -3327,7 +3342,6 @@ err_exit: } btr_blob_dbg_remove(page, index, "btr_compress"); - mem_heap_free(heap); if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) { /* Update the free bits of the B-tree page in the @@ -3379,6 +3393,16 @@ err_exit: btr_page_free(index, block, mtr); ut_ad(btr_check_node_ptr(index, merge_block, mtr)); +func_exit: + mem_heap_free(heap); + + if (adjust) { + btr_cur_position( + index, + page_rec_get_nth(merge_block->frame, nth_rec), + merge_block, cursor); + } + return(TRUE); } diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c index 84ba0b99e58..a551b4dfcb9 100644 --- a/storage/innodb_plugin/btr/btr0cur.c +++ b/storage/innodb_plugin/btr/btr0cur.c @@ -2088,7 +2088,9 @@ btr_cur_pessimistic_update( /*=======================*/ ulint flags, /*!< in: undo logging, locking, and rollback flags */ - btr_cur_t* cursor, /*!< in: cursor on the record to update */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to be stored externally by the caller, or NULL */ @@ -2227,7 +2229,7 @@ btr_cur_pessimistic_update( record to be inserted: we have to remember which fields were such */ ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec)); - offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, heap); + ut_ad(rec_offs_validate(rec, index, offsets)); n_ext += btr_push_update_extern_fields(new_entry, update, *heap); if (UNIV_LIKELY_NULL(page_zip)) { @@ -2250,6 +2252,10 @@ make_external: err = DB_TOO_BIG_RECORD; goto return_after_reservations; } + + ut_ad(page_is_leaf(page)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); } /* Store state of explicit locks on rec on the page infimum record, @@ -2277,6 +2283,8 @@ make_external: rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr); if (rec) { + page_cursor->rec = rec; + lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor), rec, block); @@ -2290,7 +2298,10 @@ make_external: rec, index, offsets, mtr); } - btr_cur_compress_if_useful(cursor, mtr); + btr_cur_compress_if_useful( + cursor, + big_rec_vec != NULL && (flags & BTR_KEEP_POS_FLAG), + mtr); if (page_zip && !dict_index_is_clust(index) && page_is_leaf(page)) { @@ -2310,6 +2321,21 @@ make_external: } } + if (big_rec_vec) { + ut_ad(page_is_leaf(page)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); + + /* btr_page_split_and_insert() in + btr_cur_pessimistic_insert() invokes + mtr_memo_release(mtr, index->lock, MTR_MEMO_X_LOCK). + We must keep the index->lock when we created a + big_rec, so that row_upd_clust_rec() can store the + big_rec in the same mini-transaction. */ + + mtr_x_lock(dict_index_get_lock(index), mtr); + } + /* Was the record to be updated positioned as the first user record on its page? */ was_first = page_cur_is_before_first(page_cursor); @@ -2325,6 +2351,7 @@ make_external: ut_a(rec); ut_a(err == DB_SUCCESS); ut_a(dummy_big_rec == NULL); + page_cursor->rec = rec; if (dict_index_is_sec_or_ibuf(index)) { /* Update PAGE_MAX_TRX_ID in the index page header. @@ -2383,6 +2410,39 @@ return_after_reservations: return(err); } +/**************************************************************//** +Commits and restarts a mini-transaction so that it will retain an +x-lock on index->lock and the cursor page. */ +UNIV_INTERN +void +btr_cur_mtr_commit_and_start( +/*=========================*/ + btr_cur_t* cursor, /*!< in: cursor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + buf_block_t* block; + + block = btr_cur_get_block(cursor); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK)); + ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); + /* Keep the locks across the mtr_commit(mtr). */ + rw_lock_x_lock(dict_index_get_lock(cursor->index)); + rw_lock_x_lock(&block->lock); + mutex_enter(&block->mutex); + buf_block_buf_fix_inc(block, __FILE__, __LINE__); + mutex_exit(&block->mutex); + /* Write out the redo log. */ + mtr_commit(mtr); + mtr_start(mtr); + /* Reassociate the locks with the mini-transaction. + They will be released on mtr_commit(mtr). */ + mtr_memo_push(mtr, dict_index_get_lock(cursor->index), + MTR_MEMO_X_LOCK); + mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX); +} + /*==================== B-TREE DELETE MARK AND UNMARK ===============*/ /****************************************************************//** @@ -2762,10 +2822,12 @@ UNIV_INTERN ibool btr_cur_compress_if_useful( /*=======================*/ - btr_cur_t* cursor, /*!< in: cursor on the page to compress; - cursor does not stay valid if compression - occurs */ - mtr_t* mtr) /*!< in: mtr */ + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; + cursor does not stay valid if !adjust and + compression occurs */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ { ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(btr_cur_get_index(cursor)), @@ -2774,7 +2836,7 @@ btr_cur_compress_if_useful( MTR_MEMO_PAGE_X_FIX)); return(btr_cur_compress_recommendation(cursor, mtr) - && btr_compress(cursor, mtr)); + && btr_compress(cursor, adjust, mtr)); } /*******************************************************//** @@ -3016,7 +3078,7 @@ return_after_reservations: mem_heap_free(heap); if (ret == FALSE) { - ret = btr_cur_compress_if_useful(cursor, mtr); + ret = btr_cur_compress_if_useful(cursor, FALSE, mtr); } if (n_extents > 0) { diff --git a/storage/innodb_plugin/include/btr0btr.h b/storage/innodb_plugin/include/btr0btr.h index 5aa02694e0e..987de03b349 100644 --- a/storage/innodb_plugin/include/btr0btr.h +++ b/storage/innodb_plugin/include/btr0btr.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -470,11 +470,14 @@ UNIV_INTERN ibool btr_compress( /*=========*/ - btr_cur_t* cursor, /*!< in: cursor on the page to merge or lift; - the page must not be empty: in record delete - use btr_discard_page if the page would become - empty */ - mtr_t* mtr); /*!< in: mtr */ + btr_cur_t* cursor, /*!< in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); /*************************************************************//** Discards a page from a B-tree. This is used to remove the last record from a B-tree page: the whole page must be removed at the same time. This cannot diff --git a/storage/innodb_plugin/include/btr0cur.h b/storage/innodb_plugin/include/btr0cur.h index ece3621fa97..6094a2a6c7a 100644 --- a/storage/innodb_plugin/include/btr0cur.h +++ b/storage/innodb_plugin/include/btr0cur.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -36,6 +36,9 @@ Created 10/16/1994 Heikki Tuuri #define BTR_NO_LOCKING_FLAG 2 /* do no record lock checking */ #define BTR_KEEP_SYS_FLAG 4 /* sys fields will be found from the update vector or inserted entry */ +#define BTR_KEEP_POS_FLAG 8 /* btr_cur_pessimistic_update() + must keep cursor position when + moving columns to big_rec */ #ifndef UNIV_HOTBACKUP #include "que0types.h" @@ -309,7 +312,9 @@ btr_cur_pessimistic_update( /*=======================*/ ulint flags, /*!< in: undo logging, locking, and rollback flags */ - btr_cur_t* cursor, /*!< in: cursor on the record to update */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to be stored externally by the caller, or NULL */ @@ -321,6 +326,16 @@ btr_cur_pessimistic_update( que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr); /*!< in: mtr; must be committed before latching any further pages */ +/***************************************************************** +Commits and restarts a mini-transaction so that it will retain an +x-lock on index->lock and the cursor page. */ +UNIV_INTERN +void +btr_cur_mtr_commit_and_start( +/*=========================*/ + btr_cur_t* cursor, /*!< in: cursor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); /***********************************************************//** Marks a clustered index record deleted. Writes an undo log record to undo log on this delete marking. Writes in the trx id field the id @@ -376,10 +391,13 @@ UNIV_INTERN ibool btr_cur_compress_if_useful( /*=======================*/ - btr_cur_t* cursor, /*!< in: cursor on the page to compress; + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; cursor does not stay valid if compression occurs */ - mtr_t* mtr); /*!< in: mtr */ + ibool adjust, /*!< in: TRUE if should adjust the + cursor position even if compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); /*******************************************************//** Removes the record on which the tree cursor is positioned. It is assumed that the mtr has an x-latch on the page where the cursor is positioned, diff --git a/storage/innodb_plugin/include/btr0cur.ic b/storage/innodb_plugin/include/btr0cur.ic index 280583f6ccf..c833b3e8572 100644 --- a/storage/innodb_plugin/include/btr0cur.ic +++ b/storage/innodb_plugin/include/btr0cur.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -139,7 +139,7 @@ btr_cur_compress_recommendation( btr_cur_t* cursor, /*!< in: btr cursor */ mtr_t* mtr) /*!< in: mtr */ { - page_t* page; + const page_t* page; ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor), MTR_MEMO_PAGE_X_FIX)); diff --git a/storage/innodb_plugin/include/buf0buf.h b/storage/innodb_plugin/include/buf0buf.h index 05dead5ac9e..775ddc758d0 100644 --- a/storage/innodb_plugin/include/buf0buf.h +++ b/storage/innodb_plugin/include/buf0buf.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -467,6 +467,31 @@ buf_block_get_modify_clock( #else /* !UNIV_HOTBACKUP */ # define buf_block_modify_clock_inc(block) ((void) 0) #endif /* !UNIV_HOTBACKUP */ +/*******************************************************************//** +Increments the bufferfix count. */ +UNIV_INLINE +void +buf_block_buf_fix_inc_func( +/*=======================*/ +#ifdef UNIV_SYNC_DEBUG + const char* file, /*!< in: file name */ + ulint line, /*!< in: line */ +#endif /* UNIV_SYNC_DEBUG */ + buf_block_t* block) /*!< in/out: block to bufferfix */ + __attribute__((nonnull)); +#ifdef UNIV_SYNC_DEBUG +/** Increments the bufferfix count. +@param b in/out: block to bufferfix +@param f in: file name where requested +@param l in: line number where requested */ +# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b) +#else /* UNIV_SYNC_DEBUG */ +/** Increments the bufferfix count. +@param b in/out: block to bufferfix +@param f in: file name where requested +@param l in: line number where requested */ +# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b) +#endif /* UNIV_SYNC_DEBUG */ /********************************************************************//** Calculates a page checksum which is stored to the page when it is written to a file. Note that we must be careful to calculate the same value diff --git a/storage/innodb_plugin/include/buf0buf.ic b/storage/innodb_plugin/include/buf0buf.ic index 0025bef5aac..d4ca07a4cd8 100644 --- a/storage/innodb_plugin/include/buf0buf.ic +++ b/storage/innodb_plugin/include/buf0buf.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -871,19 +871,6 @@ buf_block_buf_fix_inc_func( block->page.buf_fix_count++; } -#ifdef UNIV_SYNC_DEBUG -/** Increments the bufferfix count. -@param b in/out: block to bufferfix -@param f in: file name where requested -@param l in: line number where requested */ -# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b) -#else /* UNIV_SYNC_DEBUG */ -/** Increments the bufferfix count. -@param b in/out: block to bufferfix -@param f in: file name where requested -@param l in: line number where requested */ -# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b) -#endif /* UNIV_SYNC_DEBUG */ /*******************************************************************//** Decrements the bufferfix count. */ @@ -1071,7 +1058,7 @@ buf_block_dbg_add_level( where we have acquired latch */ ulint level) /*!< in: latching order level */ { - sync_thread_add_level(&block->lock, level); + sync_thread_add_level(&block->lock, level, FALSE); } #endif /* UNIV_SYNC_DEBUG */ #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innodb_plugin/include/page0cur.ic b/storage/innodb_plugin/include/page0cur.ic index 3520677dfb3..81474fa35f5 100644 --- a/storage/innodb_plugin/include/page0cur.ic +++ b/storage/innodb_plugin/include/page0cur.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,6 +27,8 @@ Created 10/4/1994 Heikki Tuuri #include "buf0types.h" #ifdef UNIV_DEBUG +# include "rem0cmp.h" + /*********************************************************//** Gets pointer to the page frame where the cursor is positioned. @return page */ @@ -268,6 +270,7 @@ page_cur_tuple_insert( index, rec, offsets, mtr); } + ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, offsets)); mem_heap_free(heap); return(rec); } diff --git a/storage/innodb_plugin/include/page0page.h b/storage/innodb_plugin/include/page0page.h index 3899499fb6a..9099fd7b65d 100644 --- a/storage/innodb_plugin/include/page0page.h +++ b/storage/innodb_plugin/include/page0page.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -284,16 +284,42 @@ page_get_supremum_offset( const page_t* page); /*!< in: page which must have record(s) */ #define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page)) #define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page)) + /************************************************************//** -Returns the middle record of record list. If there are an even number -of records in the list, returns the first record of upper half-list. -@return middle record */ +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ UNIV_INTERN +const rec_t* +page_rec_get_nth_const( +/*===================*/ + const page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ + __attribute__((nonnull, warn_unused_result)); +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INLINE +rec_t* +page_rec_get_nth( +/*=============*/ + page_t* page, /*< in: page */ + ulint nth) /*!< in: nth record */ + __attribute__((nonnull, warn_unused_result)); + +#ifndef UNIV_HOTBACKUP +/************************************************************//** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. +@return middle record */ +UNIV_INLINE rec_t* page_get_middle_rec( /*================*/ - page_t* page); /*!< in: page */ -#ifndef UNIV_HOTBACKUP + page_t* page) /*!< in: page */ + __attribute__((nonnull, warn_unused_result)); /*************************************************************//** Compares a data tuple to a physical record. Differs from the function cmp_dtuple_rec_with_match in the way that the record must reside on an @@ -348,6 +374,7 @@ page_get_n_recs( /***************************************************************//** Returns the number of records before the given record in chain. The number includes infimum and supremum records. +This is the inverse function of page_rec_get_nth(). @return number of records */ UNIV_INTERN ulint diff --git a/storage/innodb_plugin/include/page0page.ic b/storage/innodb_plugin/include/page0page.ic index 8f794410f20..1450b0892b3 100644 --- a/storage/innodb_plugin/include/page0page.ic +++ b/storage/innodb_plugin/include/page0page.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -420,7 +420,37 @@ page_rec_is_infimum( return(page_rec_is_infimum_low(page_offset(rec))); } +/************************************************************//** +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ +UNIV_INLINE +rec_t* +page_rec_get_nth( +/*=============*/ + page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ +{ + return((rec_t*) page_rec_get_nth_const(page, nth)); +} + #ifndef UNIV_HOTBACKUP +/************************************************************//** +Returns the middle record of the records on the page. If there is an +even number of records in the list, returns the first record of the +upper half-list. +@return middle record */ +UNIV_INLINE +rec_t* +page_get_middle_rec( +/*================*/ + page_t* page) /*!< in: page */ +{ + ulint middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2; + + return(page_rec_get_nth(page, middle)); +} + /*************************************************************//** Compares a data tuple to a physical record. Differs from the function cmp_dtuple_rec_with_match in the way that the record must reside on an diff --git a/storage/innodb_plugin/include/sync0rw.ic b/storage/innodb_plugin/include/sync0rw.ic index 7116f1b7c9b..485a63a1b18 100644 --- a/storage/innodb_plugin/include/sync0rw.ic +++ b/storage/innodb_plugin/include/sync0rw.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -603,16 +603,16 @@ rw_lock_x_unlock_direct( ut_ad((lock->lock_word % X_LOCK_DECR) == 0); -#ifdef UNIV_SYNC_DEBUG - rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX); -#endif - if (lock->lock_word == 0) { lock->recursive = FALSE; UNIV_MEM_INVALID(&lock->writer_thread, sizeof lock->writer_thread); } +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX); +#endif + lock->lock_word += X_LOCK_DECR; ut_ad(!lock->waiters); diff --git a/storage/innodb_plugin/include/sync0sync.h b/storage/innodb_plugin/include/sync0sync.h index 71c9920a10b..d9dcf91accb 100644 --- a/storage/innodb_plugin/include/sync0sync.h +++ b/storage/innodb_plugin/include/sync0sync.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -219,8 +219,10 @@ void sync_thread_add_level( /*==================*/ void* latch, /*!< in: pointer to a mutex or an rw-lock */ - ulint level); /*!< in: level in the latching order; if + ulint level, /*!< in: level in the latching order; if SYNC_LEVEL_VARYING, nothing is done */ + ibool relock) /*!< in: TRUE if re-entering an x-lock */ + __attribute__((nonnull)); /******************************************************************//** Removes a latch from the thread level array if it is found there. @return TRUE if found in the array; it is no error if the latch is diff --git a/storage/innodb_plugin/page/page0cur.c b/storage/innodb_plugin/page/page0cur.c index 936762b986a..b8c492328e8 100644 --- a/storage/innodb_plugin/page/page0cur.c +++ b/storage/innodb_plugin/page/page0cur.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1180,14 +1180,15 @@ page_cur_insert_rec_zip_reorg( /* Before trying to reorganize the page, store the number of preceding records on the page. */ pos = page_rec_get_n_recs_before(rec); + ut_ad(pos > 0); if (page_zip_reorganize(block, index, mtr)) { /* The page was reorganized: Find rec by seeking to pos, and update *current_rec. */ - rec = page + PAGE_NEW_INFIMUM; - - while (--pos) { - rec = page + rec_get_next_offs(rec, TRUE); + if (pos > 1) { + rec = page_rec_get_nth(page, pos - 1); + } else { + rec = page + PAGE_NEW_INFIMUM; } *current_rec = rec; @@ -1283,6 +1284,12 @@ page_cur_insert_rec_zip( insert_rec = page_cur_insert_rec_zip_reorg( current_rec, block, index, insert_rec, page, page_zip, mtr); +#ifdef UNIV_DEBUG + if (insert_rec) { + rec_offs_make_valid( + insert_rec, index, offsets); + } +#endif /* UNIV_DEBUG */ } return(insert_rec); diff --git a/storage/innodb_plugin/page/page0page.c b/storage/innodb_plugin/page/page0page.c index 6cae03e8829..a284b1480a3 100644 --- a/storage/innodb_plugin/page/page0page.c +++ b/storage/innodb_plugin/page/page0page.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1487,55 +1487,54 @@ page_dir_balance_slot( } } -#ifndef UNIV_HOTBACKUP /************************************************************//** -Returns the middle record of the record list. If there are an even number -of records in the list, returns the first record of the upper half-list. -@return middle record */ +Returns the nth record of the record list. +This is the inverse function of page_rec_get_n_recs_before(). +@return nth record */ UNIV_INTERN -rec_t* -page_get_middle_rec( -/*================*/ - page_t* page) /*!< in: page */ +const rec_t* +page_rec_get_nth_const( +/*===================*/ + const page_t* page, /*!< in: page */ + ulint nth) /*!< in: nth record */ { - page_dir_slot_t* slot; - ulint middle; + const page_dir_slot_t* slot; ulint i; ulint n_owned; - ulint count; - rec_t* rec; + const rec_t* rec; - /* This many records we must leave behind */ - middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2; - - count = 0; + ut_ad(nth < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); for (i = 0;; i++) { slot = page_dir_get_nth_slot(page, i); n_owned = page_dir_slot_get_n_owned(slot); - if (count + n_owned > middle) { + if (n_owned > nth) { break; } else { - count += n_owned; + nth -= n_owned; } } ut_ad(i > 0); slot = page_dir_get_nth_slot(page, i - 1); - rec = (rec_t*) page_dir_slot_get_rec(slot); - rec = page_rec_get_next(rec); + rec = page_dir_slot_get_rec(slot); - /* There are now count records behind rec */ - - for (i = 0; i < middle - count; i++) { - rec = page_rec_get_next(rec); + if (page_is_comp(page)) { + do { + rec = page_rec_get_next_low(rec, TRUE); + ut_ad(rec); + } while (nth--); + } else { + do { + rec = page_rec_get_next_low(rec, FALSE); + ut_ad(rec); + } while (nth--); } return(rec); } -#endif /* !UNIV_HOTBACKUP */ /***************************************************************//** Returns the number of records before the given record in chain. @@ -1597,6 +1596,7 @@ page_rec_get_n_recs_before( n--; ut_ad(n >= 0); + ut_ad(n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1)); return((ulint) n); } diff --git a/storage/innodb_plugin/row/row0ins.c b/storage/innodb_plugin/row/row0ins.c index 8050c099751..ea43cbfb5f1 100644 --- a/storage/innodb_plugin/row/row0ins.c +++ b/storage/innodb_plugin/row/row0ins.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -345,9 +345,9 @@ row_ins_clust_index_entry_by_modify( return(DB_LOCK_TABLE_FULL); } - err = btr_cur_pessimistic_update(0, cursor, - heap, big_rec, update, - 0, thr, mtr); + err = btr_cur_pessimistic_update( + BTR_KEEP_POS_FLAG, cursor, heap, big_rec, update, + 0, thr, mtr); } return(err); @@ -1986,6 +1986,7 @@ row_ins_index_entry_low( ulint modify = 0; /* remove warning */ rec_t* insert_rec; rec_t* rec; + ulint* offsets; ulint err; ulint n_unique; big_rec_t* big_rec = NULL; @@ -2089,6 +2090,42 @@ row_ins_index_entry_low( err = row_ins_clust_index_entry_by_modify( mode, &cursor, &heap, &big_rec, entry, thr, &mtr); + + if (big_rec) { + ut_a(err == DB_SUCCESS); + /* Write out the externally stored + columns while still x-latching + index->lock and block->lock. We have + to mtr_commit(mtr) first, so that the + redo log will be written in the + correct order. Otherwise, we would run + into trouble on crash recovery if mtr + freed B-tree pages on which some of + the big_rec fields will be written. */ + btr_cur_mtr_commit_and_start(&cursor, &mtr); + + rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets( + rec, index, NULL, + ULINT_UNDEFINED, &heap); + + err = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(&cursor), + rec, offsets, &mtr, FALSE, big_rec); + /* If writing big_rec fails (for + example, because of DB_OUT_OF_FILE_SPACE), + the record will be corrupted. Even if + we did not update any externally + stored columns, our update could cause + the record to grow so that a + non-updated column was selected for + external storage. This non-update + would not have been written to the + undo log, and thus the record cannot + be rolled back. */ + ut_a(err == DB_SUCCESS); + goto stored_big_rec; + } } else { ut_ad(!n_ext); err = row_ins_sec_index_entry_by_modify( @@ -2117,8 +2154,6 @@ function_exit: mtr_commit(&mtr); if (UNIV_LIKELY_NULL(big_rec)) { - rec_t* rec; - ulint* offsets; mtr_start(&mtr); btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, @@ -2132,6 +2167,7 @@ function_exit: index, btr_cur_get_block(&cursor), rec, offsets, &mtr, FALSE, big_rec); +stored_big_rec: if (modify) { dtuple_big_rec_free(big_rec); } else { diff --git a/storage/innodb_plugin/row/row0upd.c b/storage/innodb_plugin/row/row0upd.c index 3a6de4b94a7..b5952ff0a78 100644 --- a/storage/innodb_plugin/row/row0upd.c +++ b/storage/innodb_plugin/row/row0upd.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1969,28 +1969,43 @@ row_upd_clust_rec( ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), dict_table_is_comp(index->table))); - err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, - &heap, &big_rec, node->update, - node->cmpl_info, thr, mtr); - mtr_commit(mtr); - - if (err == DB_SUCCESS && big_rec) { + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur, + &heap, &big_rec, node->update, node->cmpl_info, thr, mtr); + if (big_rec) { ulint offsets_[REC_OFFS_NORMAL_SIZE]; rec_t* rec; rec_offs_init(offsets_); - mtr_start(mtr); + ut_a(err == DB_SUCCESS); + /* Write out the externally stored columns while still + x-latching index->lock and block->lock. We have to + mtr_commit(mtr) first, so that the redo log will be + written in the correct order. Otherwise, we would run + into trouble on crash recovery if mtr freed B-tree + pages on which some of the big_rec fields will be + written. */ + btr_cur_mtr_commit_and_start(btr_cur, mtr); - ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); rec = btr_cur_get_rec(btr_cur); err = btr_store_big_rec_extern_fields( index, btr_cur_get_block(btr_cur), rec, rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED, &heap), mtr, TRUE, big_rec); - mtr_commit(mtr); + /* If writing big_rec fails (for example, because of + DB_OUT_OF_FILE_SPACE), the record will be corrupted. + Even if we did not update any externally stored + columns, our update could cause the record to grow so + that a non-updated column was selected for external + storage. This non-update would not have been written + to the undo log, and thus the record cannot be rolled + back. */ + ut_a(err == DB_SUCCESS); } + mtr_commit(mtr); + if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } diff --git a/storage/innodb_plugin/sync/sync0rw.c b/storage/innodb_plugin/sync/sync0rw.c index a5da606ad80..3df2b4e9bbd 100644 --- a/storage/innodb_plugin/sync/sync0rw.c +++ b/storage/innodb_plugin/sync/sync0rw.c @@ -766,7 +766,9 @@ rw_lock_add_debug_info( rw_lock_debug_mutex_exit(); if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) { - sync_thread_add_level(lock, lock->level); + sync_thread_add_level(lock, lock->level, + lock_type == RW_LOCK_EX + && lock->lock_word < 0); } } diff --git a/storage/innodb_plugin/sync/sync0sync.c b/storage/innodb_plugin/sync/sync0sync.c index 2be9d667705..59498386fda 100644 --- a/storage/innodb_plugin/sync/sync0sync.c +++ b/storage/innodb_plugin/sync/sync0sync.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -656,7 +656,7 @@ mutex_set_debug_info( ut_ad(mutex); ut_ad(file_name); - sync_thread_add_level(mutex, mutex->level); + sync_thread_add_level(mutex, mutex->level, FALSE); mutex->file_name = file_name; mutex->line = line; @@ -1083,8 +1083,9 @@ void sync_thread_add_level( /*==================*/ void* latch, /*!< in: pointer to a mutex or an rw-lock */ - ulint level) /*!< in: level in the latching order; if + ulint level, /*!< in: level in the latching order; if SYNC_LEVEL_VARYING, nothing is done */ + ibool relock) /*!< in: TRUE if re-entering an x-lock */ { sync_level_t* array; sync_level_t* slot; @@ -1132,6 +1133,10 @@ sync_thread_add_level( array = thread_slot->levels; + if (relock) { + goto levels_ok; + } + /* NOTE that there is a problem with _NODE and _LEAF levels: if the B-tree height changes, then a leaf can change to an internal node or the other way around. We do not know at present if this can cause @@ -1269,6 +1274,7 @@ sync_thread_add_level( ut_error; } +levels_ok: for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { slot = sync_thread_levels_get_nth(array, i); From 417a267927b9249868e8ca7bd3cb7b6e09485f43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 16 Jun 2011 11:51:04 +0300 Subject: [PATCH 19/35] Re-enable the debug assertions for Bug#12650861. Replace UNIV_BLOB_NULL_DEBUG with UNIV_DEBUG||UNIV_BLOB_LIGHT_DEBUG. Fix known bogus failures. btr_cur_optimistic_update(): If rec_offs_any_null_extern(), assert that the current transaction is an incomplete transaction that is being rolled back in crash recovery. row_build(): If rec_offs_any_null_extern(), assert that the transaction that last updated the record was recovered during crash recovery (and will soon be rolled back). --- storage/innobase/btr/btr0cur.c | 13 ++++--- storage/innobase/include/rem0rec.h | 4 +-- storage/innobase/include/rem0rec.ic | 4 +-- storage/innobase/include/trx0roll.h | 3 ++ storage/innobase/include/univ.i | 4 +-- storage/innobase/row/row0row.c | 43 +++++++++++++++++++----- storage/innobase/row/row0vers.c | 16 ++++----- storage/innobase/trx/trx0rec.c | 4 +-- storage/innodb_plugin/btr/btr0cur.c | 12 +++---- storage/innodb_plugin/include/rem0rec.h | 4 +-- storage/innodb_plugin/include/rem0rec.ic | 4 +-- storage/innodb_plugin/include/univ.i | 5 +-- storage/innodb_plugin/row/row0row.c | 41 +++++++++++++++++----- storage/innodb_plugin/row/row0vers.c | 16 ++++----- storage/innodb_plugin/trx/trx0rec.c | 4 +-- 15 files changed, 111 insertions(+), 66 deletions(-) diff --git a/storage/innobase/btr/btr0cur.c b/storage/innobase/btr/btr0cur.c index 7bdf87f2793..9ce09929f9a 100644 --- a/storage/innobase/btr/btr0cur.c +++ b/storage/innobase/btr/btr0cur.c @@ -31,6 +31,7 @@ Created 10/16/1994 Heikki Tuuri #include "btr0sea.h" #include "row0upd.h" #include "trx0rec.h" +#include "trx0roll.h" /* trx_roll_crash_recv_trx */ #include "que0que.h" #include "row0row.h" #include "srv0srv.h" @@ -1579,7 +1580,6 @@ btr_cur_optimistic_update( ulint old_rec_size; dtuple_t* new_entry; dulint roll_ptr; - trx_t* trx; mem_heap_t* heap; ibool reorganized = FALSE; ulint i; @@ -1592,9 +1592,10 @@ btr_cur_optimistic_update( heap = mem_heap_create(1024); offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); -#ifdef UNIV_BLOB_NULL_DEBUG - ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, offsets) + || thr_get_trx(thr) == trx_roll_crash_recv_trx); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ #ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { @@ -1701,13 +1702,11 @@ btr_cur_optimistic_update( page_cur_move_to_prev(page_cursor); - trx = thr_get_trx(thr); - if (!(flags & BTR_KEEP_SYS_FLAG)) { row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, roll_ptr); row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, - trx->id); + thr_get_trx(thr)->id); } rec = btr_cur_insert_if_possible(cursor, new_entry, &reorganized, mtr); diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h index a1a206c3281..67baeb7d8d2 100644 --- a/storage/innobase/include/rem0rec.h +++ b/storage/innobase/include/rem0rec.h @@ -339,7 +339,7 @@ rec_offs_any_extern( /*================*/ /* out: TRUE if a field is stored externally */ const ulint* offsets);/* in: array returned by rec_get_offsets() */ -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG /******************************************************** Determine if the offsets are for a record containing null BLOB pointers. */ UNIV_INLINE @@ -351,7 +351,7 @@ rec_offs_any_null_extern( or NULL if none found */ rec_t* rec, /*!< in: record */ const ulint* offsets); /*!< in: rec_get_offsets(rec) */ -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /*************************************************************** Sets the value of the ith field extern storage bit. */ UNIV_INLINE diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic index 9e659f12881..566c62e30f2 100644 --- a/storage/innobase/include/rem0rec.ic +++ b/storage/innobase/include/rem0rec.ic @@ -1021,7 +1021,7 @@ rec_offs_any_extern( return(FALSE); } -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG /******************************************************** Determine if the offsets are for a record containing null BLOB pointers. */ UNIV_INLINE @@ -1055,7 +1055,7 @@ rec_offs_any_null_extern( return(NULL); } -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /*************************************************************** Sets the value of the ith field extern storage bit. */ diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h index c1eca3d5753..4fabb83b025 100644 --- a/storage/innobase/include/trx0roll.h +++ b/storage/innobase/include/trx0roll.h @@ -15,6 +15,9 @@ Created 3/26/1996 Heikki Tuuri #include "mtr0mtr.h" #include "trx0sys.h" +/* In crash recovery, the current trx to be rolled back */ +extern trx_t* trx_roll_crash_recv_trx; + #define trx_roll_free_all_savepoints(s) trx_roll_savepoints_free((s), NULL) /*********************************************************************** diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 8eb78491b04..a67b1b3895e 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -88,8 +88,8 @@ memory is read outside the allocated blocks. */ #if 0 #define UNIV_DEBUG_VALGRIND /* Enable extra Valgrind instrumentation */ -#define UNIV_BLOB_NULL_DEBUG /* Enable deep off-page - column debugging */ +#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column + debugging without UNIV_DEBUG */ #define UNIV_DEBUG /* Enable ut_ad() assertions */ #define UNIV_LIST_DEBUG /* debug UT_LIST_ macros */ #define UNIV_MEM_DEBUG /* detect memory leaks etc */ diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c index b9efdcfbfdd..751de98deba 100644 --- a/storage/innobase/row/row0row.c +++ b/storage/innobase/row/row0row.c @@ -202,6 +202,7 @@ row_build( ut_ad(index && rec && heap); ut_ad(index->type & DICT_CLUSTERED); + ut_ad(!mutex_own(&kernel_mutex)); if (!offsets) { offsets = rec_get_offsets(rec, index, offsets_, @@ -210,13 +211,37 @@ row_build( ut_ad(rec_offs_validate(rec, index, offsets)); } -#if 0 && defined UNIV_BLOB_NULL_DEBUG - /* This one can fail in trx_rollback_or_clean_all_without_sess() - if the server crashed during an insert before the - btr_store_big_rec_extern_fields() did mtr_commit() - all BLOB pointers to the clustered index record. */ - ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* 0 && UNIV_BLOB_NULL_DEBUG */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + if (UNIV_LIKELY_NULL(rec_offs_any_null_extern(rec, offsets))) { + /* This condition can occur during crash recovery before + trx_rollback_or_clean_all_without_sess() has completed + execution. + + This condition is possible if the server crashed + during an insert or update before + btr_store_big_rec_extern_fields() did mtr_commit() all + BLOB pointers to the clustered index record. + + Look up the transaction that holds the implicit lock + on this record, and assert that it was recovered (and + will soon be rolled back). */ + + ulint trx_id_pos = dict_index_get_sys_col_pos( + index, DATA_TRX_ID); + ulint len; + dulint trx_id = trx_read_trx_id( + rec_get_nth_field(rec, offsets, trx_id_pos, &len)); + trx_t* trx; + ut_a(len == 6); + + mutex_enter(&kernel_mutex); + trx = trx_get_on_id(trx_id); + ut_a(trx); + /* This field does not exist in this version of InnoDB. */ + /* ut_a(trx->is_recovered); */ + mutex_exit(&kernel_mutex); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (type != ROW_COPY_POINTERS) { /* Take a copy of rec to heap */ @@ -310,10 +335,10 @@ row_rec_to_index_entry( rec = rec_copy(buf, rec, offsets); /* Avoid a debug assertion in rec_offs_validate(). */ rec_offs_make_valid(rec, index, offsets); -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG } else { ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ } rec_len = rec_offs_n_fields(offsets); diff --git a/storage/innobase/row/row0vers.c b/storage/innobase/row/row0vers.c index a52ef3cc083..906b46fb51b 100644 --- a/storage/innobase/row/row0vers.c +++ b/storage/innobase/row/row0vers.c @@ -473,10 +473,10 @@ row_vers_build_for_consistent_read( /* The view already sees this version: we can copy it to in_heap and return */ -#if defined UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern( version, *offsets)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets)); @@ -511,9 +511,9 @@ row_vers_build_for_consistent_read( *offsets = rec_get_offsets(prev_version, index, *offsets, ULINT_UNDEFINED, offset_heap); -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(prev_version, *offsets)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ trx_id = row_get_rec_trx_id(prev_version, index, *offsets); @@ -615,9 +615,9 @@ row_vers_build_for_semi_consistent_read( /* We found a version that belongs to a committed transaction: return it. */ -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(version, *offsets)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (rec == version) { *old_vers = rec; @@ -676,9 +676,9 @@ row_vers_build_for_semi_consistent_read( version = prev_version; *offsets = rec_get_offsets(version, index, *offsets, ULINT_UNDEFINED, offset_heap); -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(version, *offsets)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ }/* for (;;) */ if (heap) { diff --git a/storage/innobase/trx/trx0rec.c b/storage/innobase/trx/trx0rec.c index 2a9224b0a72..730ac6a6f60 100644 --- a/storage/innobase/trx/trx0rec.c +++ b/storage/innobase/trx/trx0rec.c @@ -1397,9 +1397,9 @@ trx_undo_prev_version_build( return(DB_ERROR); } -# ifdef UNIV_BLOB_NULL_DEBUG +# if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(rec, offsets)); -# endif /* UNIV_BLOB_NULL_DEBUG */ +# endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (row_upd_changes_field_size_or_external(index, offsets, update)) { ulint* ext_vect; diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c index a551b4dfcb9..651c2c342a5 100644 --- a/storage/innodb_plugin/btr/btr0cur.c +++ b/storage/innodb_plugin/btr/btr0cur.c @@ -1854,7 +1854,6 @@ btr_cur_optimistic_update( ulint old_rec_size; dtuple_t* new_entry; roll_ptr_t roll_ptr; - trx_t* trx; mem_heap_t* heap; ulint i; ulint n_ext; @@ -1871,9 +1870,10 @@ btr_cur_optimistic_update( heap = mem_heap_create(1024); offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); -#ifdef UNIV_BLOB_NULL_DEBUG - ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, offsets) + || trx_is_recv(thr_get_trx(thr))); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ #ifdef UNIV_DEBUG if (btr_cur_print_record_ops && thr) { @@ -1996,13 +1996,11 @@ any_extern: page_cur_move_to_prev(page_cursor); - trx = thr_get_trx(thr); - if (!(flags & BTR_KEEP_SYS_FLAG)) { row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR, roll_ptr); row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID, - trx->id); + thr_get_trx(thr)->id); } /* There are no externally stored columns in new_entry */ diff --git a/storage/innodb_plugin/include/rem0rec.h b/storage/innodb_plugin/include/rem0rec.h index fff44eecb00..06de23be757 100644 --- a/storage/innodb_plugin/include/rem0rec.h +++ b/storage/innodb_plugin/include/rem0rec.h @@ -480,7 +480,7 @@ ulint rec_offs_any_extern( /*================*/ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */ -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG /******************************************************//** Determine if the offsets are for a record containing null BLOB pointers. @return first field containing a null BLOB pointer, or NULL if none found */ @@ -491,7 +491,7 @@ rec_offs_any_null_extern( const rec_t* rec, /*!< in: record */ const ulint* offsets) /*!< in: rec_get_offsets(rec) */ __attribute__((nonnull, warn_unused_result)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /******************************************************//** Returns nonzero if the extern bit is set in nth field of rec. @return nonzero if externally stored */ diff --git a/storage/innodb_plugin/include/rem0rec.ic b/storage/innodb_plugin/include/rem0rec.ic index 252484a7433..7cff36fee6c 100644 --- a/storage/innodb_plugin/include/rem0rec.ic +++ b/storage/innodb_plugin/include/rem0rec.ic @@ -1088,7 +1088,7 @@ rec_offs_any_extern( return(UNIV_UNLIKELY(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL)); } -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG /******************************************************//** Determine if the offsets are for a record containing null BLOB pointers. @return first field containing a null BLOB pointer, or NULL if none found */ @@ -1124,7 +1124,7 @@ rec_offs_any_null_extern( return(NULL); } -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /******************************************************//** Returns nonzero if the extern bit is set in nth field of rec. diff --git a/storage/innodb_plugin/include/univ.i b/storage/innodb_plugin/include/univ.i index 63d8e7140b5..5e36867f05a 100644 --- a/storage/innodb_plugin/include/univ.i +++ b/storage/innodb_plugin/include/univ.i @@ -1,8 +1,7 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. -Copyright (c) 2009, Sun Microsystems, Inc. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -179,8 +178,6 @@ command. Not tested on Windows. */ debugging without UNIV_DEBUG */ #define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column debugging without UNIV_DEBUG */ -#define UNIV_BLOB_NULL_DEBUG /* Enable deep off-page - column debugging */ #define UNIV_DEBUG /* Enable ut_ad() assertions and disable UNIV_INLINE */ #define UNIV_DEBUG_LOCK_VALIDATE /* Enable diff --git a/storage/innodb_plugin/row/row0row.c b/storage/innodb_plugin/row/row0row.c index 8aba375d046..3b610d735b0 100644 --- a/storage/innodb_plugin/row/row0row.c +++ b/storage/innodb_plugin/row/row0row.c @@ -223,6 +223,7 @@ row_build( ut_ad(index && rec && heap); ut_ad(dict_index_is_clust(index)); + ut_ad(!mutex_own(&kernel_mutex)); if (!offsets) { offsets = rec_get_offsets(rec, index, offsets_, @@ -231,13 +232,35 @@ row_build( ut_ad(rec_offs_validate(rec, index, offsets)); } -#if 0 && defined UNIV_BLOB_NULL_DEBUG - /* This one can fail in trx_rollback_active() if - the server crashed during an insert before the - btr_store_big_rec_extern_fields() did mtr_commit() - all BLOB pointers to the clustered index record. */ - ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* 0 && UNIV_BLOB_NULL_DEBUG */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + if (UNIV_LIKELY_NULL(rec_offs_any_null_extern(rec, offsets))) { + /* This condition can occur during crash recovery before + trx_rollback_active() has completed execution. + + This condition is possible if the server crashed + during an insert or update before + btr_store_big_rec_extern_fields() did mtr_commit() all + BLOB pointers to the clustered index record. + + Look up the transaction that holds the implicit lock + on this record, and assert that it was recovered (and + will soon be rolled back). */ + + ulint trx_id_pos = dict_index_get_sys_col_pos( + index, DATA_TRX_ID); + ulint len; + trx_id_t trx_id = trx_read_trx_id( + rec_get_nth_field(rec, offsets, trx_id_pos, &len)); + trx_t* trx; + ut_a(len == 6); + + mutex_enter(&kernel_mutex); + trx = trx_get_on_id(trx_id); + ut_a(trx); + ut_a(trx->is_recovered); + mutex_exit(&kernel_mutex); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (type != ROW_COPY_POINTERS) { /* Take a copy of rec to heap */ @@ -423,10 +446,10 @@ row_rec_to_index_entry( rec = rec_copy(buf, rec, offsets); /* Avoid a debug assertion in rec_offs_validate(). */ rec_offs_make_valid(rec, index, offsets); -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG } else { ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ } entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap); diff --git a/storage/innodb_plugin/row/row0vers.c b/storage/innodb_plugin/row/row0vers.c index 2d39f92d18f..8a7bb842293 100644 --- a/storage/innodb_plugin/row/row0vers.c +++ b/storage/innodb_plugin/row/row0vers.c @@ -550,10 +550,10 @@ row_vers_build_for_consistent_read( /* The view already sees this version: we can copy it to in_heap and return */ -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern( version, *offsets)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets)); @@ -588,9 +588,9 @@ row_vers_build_for_consistent_read( *offsets = rec_get_offsets(prev_version, index, *offsets, ULINT_UNDEFINED, offset_heap); -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(prev_version, *offsets)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ trx_id = row_get_rec_trx_id(prev_version, index, *offsets); @@ -691,9 +691,9 @@ row_vers_build_for_semi_consistent_read( /* We found a version that belongs to a committed transaction: return it. */ -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(version, *offsets)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (rec == version) { *old_vers = rec; @@ -752,9 +752,9 @@ row_vers_build_for_semi_consistent_read( version = prev_version; *offsets = rec_get_offsets(version, index, *offsets, ULINT_UNDEFINED, offset_heap); -#ifdef UNIV_BLOB_NULL_DEBUG +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(version, *offsets)); -#endif /* UNIV_BLOB_NULL_DEBUG */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ }/* for (;;) */ if (heap) { diff --git a/storage/innodb_plugin/trx/trx0rec.c b/storage/innodb_plugin/trx/trx0rec.c index 297838365d5..9f2fd59d82b 100644 --- a/storage/innodb_plugin/trx/trx0rec.c +++ b/storage/innodb_plugin/trx/trx0rec.c @@ -1577,9 +1577,9 @@ trx_undo_prev_version_build( return(DB_ERROR); } -# ifdef UNIV_BLOB_NULL_DEBUG +# if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG ut_a(!rec_offs_any_null_extern(rec, offsets)); -# endif /* UNIV_BLOB_NULL_DEBUG */ +# endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (row_upd_changes_field_size_or_external(index, offsets, update)) { ulint n_ext; From e4aa6667a1da2f769ed3af819e6eb4c519431af5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 16 Jun 2011 14:22:12 +0300 Subject: [PATCH 20/35] Bug#12595087 - 61191: Question about page_zip_available There is an apparent problem with page_zip_clear_rec(). In btr_cur_optimistic_update() we do this: page_cur_delete_rec(page_cursor, index, offsets, mtr); ... rec = btr_cur_insert_if_possible(cursor, new_entry, 0/*n_ext*/, mtr); ut_a(rec); /* <- We calculated above the insert would fit */ The problem is that page_cur_delete_rec() could fill the modification log while doing page_zip_clear_rec(), requiring recompression for the btr_cur_insert_if_possible(). In a pathological case, the data could fail to recompress. page_zip_clear_rec(): Leave the page modification log alone. Only clear the necessary fields. rb:673 approved by Jimmy Yang --- storage/innodb_plugin/ChangeLog | 5 ++ storage/innodb_plugin/page/page0zip.c | 114 +++++++++++--------------- storage/innodb_plugin/rem/rem0rec.c | 10 +-- 3 files changed, 59 insertions(+), 70 deletions(-) diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index ef5f87172c6..d747a45d434 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,8 @@ +2011-06-16 The InnoDB Team + + * page/page0zip.c, rem/rem0rec.c: + Fix Bug#61191 question about page_zip_available() + 2011-06-16 The InnoDB Team * btr/btr0btr.c, btr/btr0cur.c, include/btr0btr.h, include/btr0cur.h, diff --git a/storage/innodb_plugin/page/page0zip.c b/storage/innodb_plugin/page/page0zip.c index 6e866b3f016..2d97b9a0d64 100644 --- a/storage/innodb_plugin/page/page0zip.c +++ b/storage/innodb_plugin/page/page0zip.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2005, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -3912,17 +3912,9 @@ page_zip_write_trx_id_and_roll_ptr( UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); } -#ifdef UNIV_ZIP_DEBUG -/** Set this variable in a debugger to disable page_zip_clear_rec(). -The only observable effect should be the compression ratio due to -deleted records not being zeroed out. In rare cases, there can be -page_zip_validate() failures on the node_ptr, trx_id and roll_ptr -columns if the space is reallocated for a smaller record. */ -UNIV_INTERN ibool page_zip_clear_rec_disable; -#endif /* UNIV_ZIP_DEBUG */ - /**********************************************************************//** -Clear an area on the uncompressed and compressed page, if possible. */ +Clear an area on the uncompressed and compressed page. +Do not clear the data payload, as that would grow the modification log. */ static void page_zip_clear_rec( @@ -3934,6 +3926,9 @@ page_zip_clear_rec( { ulint heap_no; page_t* page = page_align(rec); + byte* storage; + byte* field; + ulint len; /* page_zip_validate() would fail here if a record containing externally stored columns is being deleted. */ ut_ad(rec_offs_validate(rec, index, offsets)); @@ -3949,60 +3944,46 @@ page_zip_clear_rec( UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets), rec_offs_extra_size(offsets)); - if ( -#ifdef UNIV_ZIP_DEBUG - !page_zip_clear_rec_disable && -#endif /* UNIV_ZIP_DEBUG */ - page_zip->m_end - + 1 + ((heap_no - 1) >= 64)/* size of the log entry */ - + page_zip_get_trailer_len(page_zip, - dict_index_is_clust(index), NULL) - < page_zip_get_size(page_zip)) { - byte* data; + if (!page_is_leaf(page)) { + /* Clear node_ptr. On the compressed page, + there is an array of node_ptr immediately before the + dense page directory, at the very end of the page. */ + storage = page_zip->data + + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) + - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE; + ut_ad(dict_index_get_n_unique_in_tree(index) == + rec_offs_n_fields(offsets) - 1); + field = rec_get_nth_field(rec, offsets, + rec_offs_n_fields(offsets) - 1, + &len); + ut_ad(len == REC_NODE_PTR_SIZE); - /* Clear only the data bytes, because the allocator and - the decompressor depend on the extra bytes. */ - memset(rec, 0, rec_offs_data_size(offsets)); + ut_ad(!rec_offs_any_extern(offsets)); + memset(field, 0, REC_NODE_PTR_SIZE); + memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE, + 0, REC_NODE_PTR_SIZE); + } else if (dict_index_is_clust(index)) { + /* Clear trx_id and roll_ptr. On the compressed page, + there is an array of these fields immediately before the + dense page directory, at the very end of the page. */ + const ulint trx_id_pos + = dict_col_get_clust_pos( + dict_table_get_sys_col( + index->table, DATA_TRX_ID), index); + storage = page_zip->data + + page_zip_get_size(page_zip) + - (page_dir_get_n_heap(page) + - PAGE_HEAP_NO_USER_LOW) + * PAGE_ZIP_DIR_SLOT_SIZE; + field = rec_get_nth_field(rec, offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); - if (!page_is_leaf(page)) { - /* Clear node_ptr on the compressed page. */ - byte* storage = page_zip->data - + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE; - - memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE, - 0, REC_NODE_PTR_SIZE); - } else if (dict_index_is_clust(index)) { - /* Clear trx_id and roll_ptr on the compressed page. */ - byte* storage = page_zip->data - + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE; - - memset(storage - (heap_no - 1) - * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), - 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); - } - - /* Log that the data was zeroed out. */ - data = page_zip->data + page_zip->m_end; - ut_ad(!*data); - if (UNIV_UNLIKELY(heap_no - 1 >= 64)) { - *data++ = (byte) (0x80 | (heap_no - 1) >> 7); - ut_ad(!*data); - } - *data++ = (byte) ((heap_no - 1) << 1 | 1); - ut_ad(!*data); - ut_ad((ulint) (data - page_zip->data) - < page_zip_get_size(page_zip)); - page_zip->m_end = data - page_zip->data; - page_zip->m_nonempty = TRUE; - } else if (page_is_leaf(page) && dict_index_is_clust(index)) { - /* Do not clear the record, because there is not enough space - to log the operation. */ + memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + memset(storage - (heap_no - 1) + * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN), + 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); if (rec_offs_any_extern(offsets)) { ulint i; @@ -4011,15 +3992,18 @@ page_zip_clear_rec( /* Clear all BLOB pointers in order to make page_zip_validate() pass. */ if (rec_offs_nth_extern(offsets, i)) { - ulint len; - byte* field = rec_get_nth_field( + field = rec_get_nth_field( rec, offsets, i, &len); + ut_ad(len + == BTR_EXTERN_FIELD_REF_SIZE); memset(field + len - BTR_EXTERN_FIELD_REF_SIZE, 0, BTR_EXTERN_FIELD_REF_SIZE); } } } + } else { + ut_ad(!rec_offs_any_extern(offsets)); } #ifdef UNIV_ZIP_DEBUG diff --git a/storage/innodb_plugin/rem/rem0rec.c b/storage/innodb_plugin/rem/rem0rec.c index 37ba8ca2ffe..9f90d2940dd 100644 --- a/storage/innodb_plugin/rem/rem0rec.c +++ b/storage/innodb_plugin/rem/rem0rec.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -408,7 +408,7 @@ rec_init_offsets( do { ulint len; if (UNIV_UNLIKELY(i == n_node_ptr_field)) { - len = offs += 4; + len = offs += REC_NODE_PTR_SIZE; goto resolved; } @@ -640,7 +640,7 @@ rec_get_offsets_reverse( do { ulint len; if (UNIV_UNLIKELY(i == n_node_ptr_field)) { - len = offs += 4; + len = offs += REC_NODE_PTR_SIZE; goto resolved; } @@ -1131,9 +1131,9 @@ rec_convert_dtuple_to_rec_comp( if (UNIV_UNLIKELY(i == n_node_ptr_field)) { ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL); - ut_ad(len == 4); + ut_ad(len == REC_NODE_PTR_SIZE); memcpy(end, dfield_get_data(field), len); - end += 4; + end += REC_NODE_PTR_SIZE; break; } From b0fc27dc0a9a0f0a48a1f0cbbdcc0015bbad24bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 16 Jun 2011 14:55:46 +0300 Subject: [PATCH 21/35] Bug #61341 buf_LRU_insert_zip_clean can be O(N) on LRU length The buf_pool->zip_clean list is only needed for debugging, or for recomputing buf_pool->page_hash when resizing the buffer pool. Buffer pool resizing was never fully implemented. Remove the resizing code, and define buf_pool->zip_clean only in debug builds. buf_pool->zip_clean, buf_LRU_insert_zip_clean(): Enclose in #if defined UNIV_DEBUG || UNIV_BUF_DEBUG. buf_chunk_free(), buf_chunk_all_free(), buf_pool_shrink(), buf_pool_page_hash_rebuild(), buf_pool_resize(): Remove (unreachable code). rb:671 approved by Inaam Rana --- storage/innodb_plugin/ChangeLog | 6 + storage/innodb_plugin/buf/buf0buddy.c | 7 +- storage/innodb_plugin/buf/buf0buf.c | 379 +----------------------- storage/innodb_plugin/buf/buf0flu.c | 4 +- storage/innodb_plugin/buf/buf0lru.c | 8 +- storage/innodb_plugin/include/buf0buf.h | 8 +- storage/innodb_plugin/include/buf0lru.h | 4 +- 7 files changed, 31 insertions(+), 385 deletions(-) diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index d747a45d434..cf060ed3d84 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,9 @@ +2011-06-16 The InnoDB Team + + * buf/buf0buddy.c, buf/buf0buf.c, buf/buf0flu.c, buf/buf0lru.c, + include/buf0buf.h, include/buf0lru.h: + Fix Bug#61341 buf_LRU_insert_zip_clean can be O(N) on LRU length + 2011-06-16 The InnoDB Team * page/page0zip.c, rem/rem0rec.c: diff --git a/storage/innodb_plugin/buf/buf0buddy.c b/storage/innodb_plugin/buf/buf0buddy.c index 63c99571510..9a95f2c639b 100644 --- a/storage/innodb_plugin/buf/buf0buddy.c +++ b/storage/innodb_plugin/buf/buf0buddy.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -351,7 +351,9 @@ buf_buddy_relocate_block( buf_page_t* bpage, /*!< in: block to relocate */ buf_page_t* dpage) /*!< in: free block to relocate to */ { +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG buf_page_t* b; +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ ut_ad(buf_pool_mutex_own()); @@ -380,7 +382,7 @@ buf_buddy_relocate_block( buf_relocate(bpage, dpage); ut_d(bpage->state = BUF_BLOCK_ZIP_FREE); - +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /* relocate buf_pool->zip_clean */ b = UT_LIST_GET_PREV(list, dpage); UT_LIST_REMOVE(list, buf_pool->zip_clean, dpage); @@ -390,6 +392,7 @@ buf_buddy_relocate_block( } else { UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, dpage); } +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ UNIV_MEM_INVALID(bpage, sizeof *bpage); diff --git a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c index 14ec7b75911..0426d5ec872 100644 --- a/storage/innodb_plugin/buf/buf0buf.c +++ b/storage/innodb_plugin/buf/buf0buf.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -873,72 +873,6 @@ buf_chunk_not_freed( return(NULL); } -/*********************************************************************//** -Checks that all blocks in the buffer chunk are in BUF_BLOCK_NOT_USED state. -@return TRUE if all freed */ -static -ibool -buf_chunk_all_free( -/*===============*/ - const buf_chunk_t* chunk) /*!< in: chunk being checked */ -{ - const buf_block_t* block; - ulint i; - - ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own()); - - block = chunk->blocks; - - for (i = chunk->size; i--; block++) { - - if (buf_block_get_state(block) != BUF_BLOCK_NOT_USED) { - - return(FALSE); - } - } - - return(TRUE); -} - -/********************************************************************//** -Frees a chunk of buffer frames. */ -static -void -buf_chunk_free( -/*===========*/ - buf_chunk_t* chunk) /*!< out: chunk of buffers */ -{ - buf_block_t* block; - const buf_block_t* block_end; - - ut_ad(buf_pool_mutex_own()); - - block_end = chunk->blocks + chunk->size; - - for (block = chunk->blocks; block < block_end; block++) { - ut_a(buf_block_get_state(block) == BUF_BLOCK_NOT_USED); - ut_a(!block->page.zip.data); - - ut_ad(!block->page.in_LRU_list); - ut_ad(!block->in_unzip_LRU_list); - ut_ad(!block->page.in_flush_list); - /* Remove the block from the free list. */ - ut_ad(block->page.in_free_list); - UT_LIST_REMOVE(list, buf_pool->free, (&block->page)); - - /* Free the latches. */ - mutex_free(&block->mutex); - rw_lock_free(&block->lock); -#ifdef UNIV_SYNC_DEBUG - rw_lock_free(&block->debug_latch); -#endif /* UNIV_SYNC_DEBUG */ - UNIV_MEM_UNDESC(block); - } - - os_mem_free_large(chunk->mem, chunk->mem_size); -} - /********************************************************************//** Creates the buffer pool. @return own: buf_pool object, NULL if not enough memory or error */ @@ -1017,8 +951,6 @@ buf_pool_free(void) chunk = chunks + buf_pool->n_chunks; while (--chunk >= chunks) { - /* Bypass the checks of buf_chunk_free(), since they - would fail at shutdown. */ os_mem_free_large(chunk->mem, chunk->mem_size); } @@ -1193,311 +1125,6 @@ buf_relocate( HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage); } -/********************************************************************//** -Shrinks the buffer pool. */ -static -void -buf_pool_shrink( -/*============*/ - ulint chunk_size) /*!< in: number of pages to remove */ -{ - buf_chunk_t* chunks; - buf_chunk_t* chunk; - ulint max_size; - ulint max_free_size; - buf_chunk_t* max_chunk; - buf_chunk_t* max_free_chunk; - - ut_ad(!buf_pool_mutex_own()); - -try_again: - btr_search_disable(); /* Empty the adaptive hash index again */ - buf_pool_mutex_enter(); - -shrink_again: - if (buf_pool->n_chunks <= 1) { - - /* Cannot shrink if there is only one chunk */ - goto func_done; - } - - /* Search for the largest free chunk - not larger than the size difference */ - chunks = buf_pool->chunks; - chunk = chunks + buf_pool->n_chunks; - max_size = max_free_size = 0; - max_chunk = max_free_chunk = NULL; - - while (--chunk >= chunks) { - if (chunk->size <= chunk_size - && chunk->size > max_free_size) { - if (chunk->size > max_size) { - max_size = chunk->size; - max_chunk = chunk; - } - - if (buf_chunk_all_free(chunk)) { - max_free_size = chunk->size; - max_free_chunk = chunk; - } - } - } - - if (!max_free_size) { - - ulint dirty = 0; - ulint nonfree = 0; - buf_block_t* block; - buf_block_t* bend; - - /* Cannot shrink: try again later - (do not assign srv_buf_pool_old_size) */ - if (!max_chunk) { - - goto func_exit; - } - - block = max_chunk->blocks; - bend = block + max_chunk->size; - - /* Move the blocks of chunk to the end of the - LRU list and try to flush them. */ - for (; block < bend; block++) { - switch (buf_block_get_state(block)) { - case BUF_BLOCK_NOT_USED: - continue; - case BUF_BLOCK_FILE_PAGE: - break; - default: - nonfree++; - continue; - } - - mutex_enter(&block->mutex); - /* The following calls will temporarily - release block->mutex and buf_pool_mutex. - Therefore, we have to always retry, - even if !dirty && !nonfree. */ - - if (!buf_flush_ready_for_replace(&block->page)) { - - buf_LRU_make_block_old(&block->page); - dirty++; - } else if (buf_LRU_free_block(&block->page, TRUE) - != BUF_LRU_FREED) { - nonfree++; - } - - mutex_exit(&block->mutex); - } - - buf_pool_mutex_exit(); - - /* Request for a flush of the chunk if it helps. - Do not flush if there are non-free blocks, since - flushing will not make the chunk freeable. */ - if (nonfree) { - /* Avoid busy-waiting. */ - os_thread_sleep(100000); - } else if (dirty - && buf_flush_batch(BUF_FLUSH_LRU, dirty, 0) - == ULINT_UNDEFINED) { - - buf_flush_wait_batch_end(BUF_FLUSH_LRU); - } - - goto try_again; - } - - max_size = max_free_size; - max_chunk = max_free_chunk; - - srv_buf_pool_old_size = srv_buf_pool_size; - - /* Rewrite buf_pool->chunks. Copy everything but max_chunk. */ - chunks = mem_alloc((buf_pool->n_chunks - 1) * sizeof *chunks); - memcpy(chunks, buf_pool->chunks, - (max_chunk - buf_pool->chunks) * sizeof *chunks); - memcpy(chunks + (max_chunk - buf_pool->chunks), - max_chunk + 1, - buf_pool->chunks + buf_pool->n_chunks - - (max_chunk + 1)); - ut_a(buf_pool->curr_size > max_chunk->size); - buf_pool->curr_size -= max_chunk->size; - srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE; - chunk_size -= max_chunk->size; - buf_chunk_free(max_chunk); - mem_free(buf_pool->chunks); - buf_pool->chunks = chunks; - buf_pool->n_chunks--; - - /* Allow a slack of one megabyte. */ - if (chunk_size > 1048576 / UNIV_PAGE_SIZE) { - - goto shrink_again; - } - -func_done: - srv_buf_pool_old_size = srv_buf_pool_size; -func_exit: - buf_pool_mutex_exit(); - btr_search_enable(); -} - -/********************************************************************//** -Rebuild buf_pool->page_hash. */ -static -void -buf_pool_page_hash_rebuild(void) -/*============================*/ -{ - ulint i; - ulint n_chunks; - buf_chunk_t* chunk; - hash_table_t* page_hash; - hash_table_t* zip_hash; - buf_page_t* b; - - buf_pool_mutex_enter(); - - /* Free, create, and populate the hash table. */ - hash_table_free(buf_pool->page_hash); - buf_pool->page_hash = page_hash = hash_create(2 * buf_pool->curr_size); - zip_hash = hash_create(2 * buf_pool->curr_size); - - HASH_MIGRATE(buf_pool->zip_hash, zip_hash, buf_page_t, hash, - BUF_POOL_ZIP_FOLD_BPAGE); - - hash_table_free(buf_pool->zip_hash); - buf_pool->zip_hash = zip_hash; - - /* Insert the uncompressed file pages to buf_pool->page_hash. */ - - chunk = buf_pool->chunks; - n_chunks = buf_pool->n_chunks; - - for (i = 0; i < n_chunks; i++, chunk++) { - ulint j; - buf_block_t* block = chunk->blocks; - - for (j = 0; j < chunk->size; j++, block++) { - if (buf_block_get_state(block) - == BUF_BLOCK_FILE_PAGE) { - ut_ad(!block->page.in_zip_hash); - ut_ad(block->page.in_page_hash); - - HASH_INSERT(buf_page_t, hash, page_hash, - buf_page_address_fold( - block->page.space, - block->page.offset), - &block->page); - } - } - } - - /* Insert the compressed-only pages to buf_pool->page_hash. - All such blocks are either in buf_pool->zip_clean or - in buf_pool->flush_list. */ - - for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; - b = UT_LIST_GET_NEXT(list, b)) { - ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); - ut_ad(!b->in_flush_list); - ut_ad(b->in_LRU_list); - ut_ad(b->in_page_hash); - ut_ad(!b->in_zip_hash); - - HASH_INSERT(buf_page_t, hash, page_hash, - buf_page_address_fold(b->space, b->offset), b); - } - - for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; - b = UT_LIST_GET_NEXT(list, b)) { - ut_ad(b->in_flush_list); - ut_ad(b->in_LRU_list); - ut_ad(b->in_page_hash); - ut_ad(!b->in_zip_hash); - - switch (buf_page_get_state(b)) { - case BUF_BLOCK_ZIP_DIRTY: - HASH_INSERT(buf_page_t, hash, page_hash, - buf_page_address_fold(b->space, - b->offset), b); - break; - case BUF_BLOCK_FILE_PAGE: - /* uncompressed page */ - break; - case BUF_BLOCK_ZIP_FREE: - case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: - case BUF_BLOCK_MEMORY: - case BUF_BLOCK_REMOVE_HASH: - ut_error; - break; - } - } - - buf_pool_mutex_exit(); -} - -/********************************************************************//** -Resizes the buffer pool. */ -UNIV_INTERN -void -buf_pool_resize(void) -/*=================*/ -{ - buf_pool_mutex_enter(); - - if (srv_buf_pool_old_size == srv_buf_pool_size) { - - buf_pool_mutex_exit(); - return; - } - - if (srv_buf_pool_curr_size + 1048576 > srv_buf_pool_size) { - - buf_pool_mutex_exit(); - - /* Disable adaptive hash indexes and empty the index - in order to free up memory in the buffer pool chunks. */ - buf_pool_shrink((srv_buf_pool_curr_size - srv_buf_pool_size) - / UNIV_PAGE_SIZE); - } else if (srv_buf_pool_curr_size + 1048576 < srv_buf_pool_size) { - - /* Enlarge the buffer pool by at least one megabyte */ - - ulint mem_size - = srv_buf_pool_size - srv_buf_pool_curr_size; - buf_chunk_t* chunks; - buf_chunk_t* chunk; - - chunks = mem_alloc((buf_pool->n_chunks + 1) * sizeof *chunks); - - memcpy(chunks, buf_pool->chunks, buf_pool->n_chunks - * sizeof *chunks); - - chunk = &chunks[buf_pool->n_chunks]; - - if (!buf_chunk_init(chunk, mem_size)) { - mem_free(chunks); - } else { - buf_pool->curr_size += chunk->size; - srv_buf_pool_curr_size = buf_pool->curr_size - * UNIV_PAGE_SIZE; - mem_free(buf_pool->chunks); - buf_pool->chunks = chunks; - buf_pool->n_chunks++; - } - - srv_buf_pool_old_size = srv_buf_pool_size; - buf_pool_mutex_exit(); - } - - buf_pool_page_hash_rebuild(); -} - /********************************************************************//** Moves a page to the start of the buffer pool LRU list. This high-level function can be used to prevent an important page from slipping out of @@ -2233,8 +1860,10 @@ wait_until_unfixed: if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) { +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG UT_LIST_REMOVE(list, buf_pool->zip_clean, &block->page); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ ut_ad(!block->page.in_flush_list); } else { /* Relocate buf_pool->flush_list. */ @@ -2975,7 +2604,9 @@ err_exit: /* The block must be put to the LRU list, to the old blocks */ buf_LRU_add_block(bpage, TRUE/* to old blocks */); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG buf_LRU_insert_zip_clean(bpage); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ buf_page_set_io_fix(bpage, BUF_IO_READ); diff --git a/storage/innodb_plugin/buf/buf0flu.c b/storage/innodb_plugin/buf/buf0flu.c index 3a9975ce4b7..60cf8d58bc7 100644 --- a/storage/innodb_plugin/buf/buf0flu.c +++ b/storage/innodb_plugin/buf/buf0flu.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -435,7 +435,9 @@ buf_flush_remove( case BUF_BLOCK_ZIP_DIRTY: buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE); UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG buf_LRU_insert_zip_clean(bpage); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ break; case BUF_BLOCK_FILE_PAGE: UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); diff --git a/storage/innodb_plugin/buf/buf0lru.c b/storage/innodb_plugin/buf/buf0lru.c index a69b2658c51..01e7e9a5f69 100644 --- a/storage/innodb_plugin/buf/buf0lru.c +++ b/storage/innodb_plugin/buf/buf0lru.c @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -501,6 +501,7 @@ next_page_no_mutex: } } +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /********************************************************************//** Insert a compressed block into buf_pool->zip_clean in the LRU order. */ UNIV_INTERN @@ -532,6 +533,7 @@ buf_LRU_insert_zip_clean( UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, bpage); } } +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ /******************************************************************//** Try to free an uncompressed page of a compressed block from the unzip @@ -1518,7 +1520,9 @@ alloc: } if (b->state == BUF_BLOCK_ZIP_PAGE) { +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG buf_LRU_insert_zip_clean(b); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ } else { /* Relocate on buf_pool->flush_list. */ buf_flush_relocate_on_flush_list(bpage, b); @@ -1797,7 +1801,9 @@ buf_LRU_block_remove_hashed_page( ut_a(bpage->zip.data); ut_a(buf_page_get_zip_size(bpage)); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ mutex_exit(&buf_pool_zip_mutex); buf_pool_mutex_exit_forbid(); diff --git a/storage/innodb_plugin/include/buf0buf.h b/storage/innodb_plugin/include/buf0buf.h index 775ddc758d0..2dfb821e199 100644 --- a/storage/innodb_plugin/include/buf0buf.h +++ b/storage/innodb_plugin/include/buf0buf.h @@ -141,12 +141,6 @@ buf_relocate( BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */ buf_page_t* dpage) /*!< in/out: destination control block */ __attribute__((nonnull)); -/********************************************************************//** -Resizes the buffer pool. */ -UNIV_INTERN -void -buf_pool_resize(void); -/*=================*/ /*********************************************************************//** Gets the current size of buffer buf_pool in bytes. @return size in bytes */ @@ -1446,8 +1440,10 @@ struct buf_pool_struct{ frames and buf_page_t descriptors of blocks that exist in the buffer pool only in compressed form. */ /* @{ */ +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG UT_LIST_BASE_NODE_T(buf_page_t) zip_clean; /*!< unmodified compressed pages */ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES]; /*!< buddy free lists */ #if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE diff --git a/storage/innodb_plugin/include/buf0lru.h b/storage/innodb_plugin/include/buf0lru.h index d543bce53cd..bea1f7d5b1e 100644 --- a/storage/innodb_plugin/include/buf0lru.h +++ b/storage/innodb_plugin/include/buf0lru.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -84,6 +84,7 @@ void buf_LRU_invalidate_tablespace( /*==========================*/ ulint id); /*!< in: space id */ +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /********************************************************************//** Insert a compressed block into buf_pool->zip_clean in the LRU order. */ UNIV_INTERN @@ -91,6 +92,7 @@ void buf_LRU_insert_zip_clean( /*=====================*/ buf_page_t* bpage); /*!< in: pointer to the block in question */ +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ /******************************************************************//** Try to free a block. If bpage is a descriptor of a compressed-only From 0dfe86f53f1a54f0d260ac8bd3c1bd712331e34d Mon Sep 17 00:00:00 2001 From: Vasil Dimov Date: Thu, 16 Jun 2011 16:11:43 +0300 Subject: [PATCH 22/35] Silence bogus compiler warning introduced in marko.makela@oracle.com-20110616072721-8bo92ctixq6eqavr --- storage/innobase/btr/btr0btr.c | 2 +- storage/innodb_plugin/btr/btr0btr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/innobase/btr/btr0btr.c b/storage/innobase/btr/btr0btr.c index 41e0bf6e067..790582815a3 100644 --- a/storage/innobase/btr/btr0btr.c +++ b/storage/innobase/btr/btr0btr.c @@ -2063,7 +2063,7 @@ btr_compress( rec_t* node_ptr; ulint data_size; ulint n_recs; - ulint nth_rec; + ulint nth_rec = 0; /* remove bogus warning */ ulint max_ins_size; ulint max_ins_size_reorg; ulint comp; diff --git a/storage/innodb_plugin/btr/btr0btr.c b/storage/innodb_plugin/btr/btr0btr.c index fb472062fe6..ac188c8c2fd 100644 --- a/storage/innodb_plugin/btr/btr0btr.c +++ b/storage/innodb_plugin/btr/btr0btr.c @@ -3132,7 +3132,7 @@ btr_compress( ulint* offsets; ulint data_size; ulint n_recs; - ulint nth_rec; + ulint nth_rec = 0; /* remove bogus warning */ ulint max_ins_size; ulint max_ins_size_reorg; From edfd31a06a223620a32ea996f974d4dd3decfb80 Mon Sep 17 00:00:00 2001 From: Dmitry Lenev Date: Fri, 17 Jun 2011 02:02:52 +0400 Subject: [PATCH 23/35] Fix for bug #12652385 - "61493: REORDERING COLUMNS TO POSITION FIRST CAN CAUSE DATA TO BE CORRUPTED". ALTER TABLE MODIFY/CHANGE ... FIRST did nothing except renaming columns if new version of the table had exactly the same structure as the old one (i.e. as result of such statement, names of columns changed their order as specified but data in columns didn't). The same thing happened for ALTER TABLE DROP COLUMN/ADD COLUMN statements which were supposed to produce new version of table with exactly the same structure as the old version of table. I.e. in the latter case the result was the same as if old column was renamed instead of being dropped and new column with default as value being created. Both these problems were caused by the fact that ALTER TABLE implementation incorrectly interpreted both these situations as simple renaming of columns and assumed that in-place ALTER TABLE algorithm could have been used for them. This patch fixes this problem by ensuring that in cases when some column is moved to the first position or some column is dropped the default ALTER TABLE algorithm involving table copying is always used. This is achieved by detecting such situations in mysql_prepare_alter_table() and setting Alter_info::change_level to ALTER_TABLE_DATA_CHANGED for them. mysql-test/r/alter_table.result: Added test for bug #12652385 - "61493: REORDERING COLUMNS TO POSITION FIRST CAN CAUSE DATA TO BE CORRUPTED". mysql-test/t/alter_table.test: Added test for bug #12652385 - "61493: REORDERING COLUMNS TO POSITION FIRST CAN CAUSE DATA TO BE CORRUPTED". sql/sql_table.cc: Changed mysql_prepare_alter_table() to detect situations in which we some column moved to the first position or some column is dropped and ensure that such ALTER TABLE statements won't be carried out using in-place algorithm. The latter could have happened before this patch if new version of table had the same structure as the old one (except the column names). --- mysql-test/r/alter_table.result | 29 +++++++++++++++++++++++++++++ mysql-test/t/alter_table.test | 27 +++++++++++++++++++++++++++ sql/sql_table.cc | 17 +++++++++++++++++ 3 files changed, 73 insertions(+) diff --git a/mysql-test/r/alter_table.result b/mysql-test/r/alter_table.result index 004e2031fb1..674238b14cd 100644 --- a/mysql-test/r/alter_table.result +++ b/mysql-test/r/alter_table.result @@ -1345,4 +1345,33 @@ DROP TABLE t1; CREATE TABLE t1 (a TEXT, id INT, b INT); ALTER TABLE t1 DROP COLUMN a, ADD COLUMN c TEXT FIRST; DROP TABLE t1; +# +# Test for bug #12652385 - "61493: REORDERING COLUMNS TO POSITION +# FIRST CAN CAUSE DATA TO BE CORRUPTED". +# +drop table if exists t1; +# Use MyISAM engine as the fact that InnoDB doesn't support +# in-place ALTER TABLE in cases when columns are being renamed +# hides some bugs. +create table t1 (i int, j int) engine=myisam; +insert into t1 value (1, 2); +# First, test for original problem described in the bug report. +select * from t1; +i j +1 2 +# Change of column order by the below ALTER TABLE statement should +# affect both column names and column contents. +alter table t1 modify column j int first; +select * from t1; +j i +2 1 +# Now test for similar problem with the same root. +# The below ALTER TABLE should change not only the name but +# also the value for the last column of the table. +alter table t1 drop column i, add column k int default 0; +select * from t1; +j k +2 0 +# Clean-up. +drop table t1; End of 5.1 tests diff --git a/mysql-test/t/alter_table.test b/mysql-test/t/alter_table.test index 4989a6c380c..2a8ac2a00d6 100644 --- a/mysql-test/t/alter_table.test +++ b/mysql-test/t/alter_table.test @@ -1073,4 +1073,31 @@ ALTER TABLE t1 DROP COLUMN a, ADD COLUMN c TEXT FIRST; DROP TABLE t1; +--echo # +--echo # Test for bug #12652385 - "61493: REORDERING COLUMNS TO POSITION +--echo # FIRST CAN CAUSE DATA TO BE CORRUPTED". +--echo # +--disable_warnings +drop table if exists t1; +--enable_warnings +--echo # Use MyISAM engine as the fact that InnoDB doesn't support +--echo # in-place ALTER TABLE in cases when columns are being renamed +--echo # hides some bugs. +create table t1 (i int, j int) engine=myisam; +insert into t1 value (1, 2); +--echo # First, test for original problem described in the bug report. +select * from t1; +--echo # Change of column order by the below ALTER TABLE statement should +--echo # affect both column names and column contents. +alter table t1 modify column j int first; +select * from t1; +--echo # Now test for similar problem with the same root. +--echo # The below ALTER TABLE should change not only the name but +--echo # also the value for the last column of the table. +alter table t1 drop column i, add column k int default 0; +select * from t1; +--echo # Clean-up. +drop table t1; + + --echo End of 5.1 tests diff --git a/sql/sql_table.cc b/sql/sql_table.cc index 58e2684e5b7..077f00f9ea8 100644 --- a/sql/sql_table.cc +++ b/sql/sql_table.cc @@ -6170,6 +6170,12 @@ mysql_prepare_alter_table(THD *thd, TABLE *table, if (drop) { drop_it.remove(); + /* + ALTER TABLE DROP COLUMN always changes table data even in cases + when new version of the table has the same structure as the old + one. + */ + alter_info->change_level= ALTER_TABLE_DATA_CHANGED; continue; } /* Check if field is changed */ @@ -6247,7 +6253,14 @@ mysql_prepare_alter_table(THD *thd, TABLE *table, if (!def->after) new_create_list.push_back(def); else if (def->after == first_keyword) + { new_create_list.push_front(def); + /* + Re-ordering columns in table can't be done using in-place algorithm + as it always changes table data. + */ + alter_info->change_level= ALTER_TABLE_DATA_CHANGED; + } else { Create_field *find; @@ -6263,6 +6276,10 @@ mysql_prepare_alter_table(THD *thd, TABLE *table, goto err; } find_it.after(def); // Put element after this + /* + Re-ordering columns in table can't be done using in-place algorithm + as it always changes table data. + */ alter_info->change_level= ALTER_TABLE_DATA_CHANGED; } } From 7de029da9a1ef8841c2f7d21299583a5e5671e46 Mon Sep 17 00:00:00 2001 From: Jon Olav Hauglid Date: Fri, 17 Jun 2011 09:51:34 +0200 Subject: [PATCH 24/35] Bug#12657095 YASSL ERROR MESSAGE CONTAINS TYPO This patch fixes a typo in a YaSSL error message. --- extra/yassl/src/yassl_error.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extra/yassl/src/yassl_error.cpp b/extra/yassl/src/yassl_error.cpp index dd30348cd93..862cd82fb3b 100644 --- a/extra/yassl/src/yassl_error.cpp +++ b/extra/yassl/src/yassl_error.cpp @@ -128,7 +128,7 @@ void SetErrorString(unsigned long error, char* buffer) break; case badVersion_error : - strncpy(buffer, "protocl version mismatch", max); + strncpy(buffer, "protocol version mismatch", max); break; case compress_error : From e60e65052f4a1310a2a725652c195ef74dee046e Mon Sep 17 00:00:00 2001 From: Inaam Rana Date: Fri, 17 Jun 2011 16:20:20 -0400 Subject: [PATCH 25/35] Bug 12635227 - 61188: DROP TABLE EXTREMELY SLOW approved by: Marko rb://681 Coalescing of free buf_page_t descriptors can prove to be one severe bottleneck in performance of compression. One such workload where it hurts badly is DROP TABLE. This patch removes buf_page_t allocations from buf_buddy and uses ut_malloc instead. In order to further reduce overhead of colaescing we no longer attempt to coalesce a block if the corresponding free_list is less than 16 in size. --- storage/innodb_plugin/ChangeLog | 7 + storage/innodb_plugin/btr/btr0cur.c | 2 +- storage/innodb_plugin/buf/buf0buddy.c | 412 ++++++--------------- storage/innodb_plugin/buf/buf0buf.c | 29 +- storage/innodb_plugin/buf/buf0lru.c | 228 ++++-------- storage/innodb_plugin/include/buf0buddy.h | 21 +- storage/innodb_plugin/include/buf0buddy.ic | 37 +- storage/innodb_plugin/include/buf0buf.h | 17 + storage/innodb_plugin/include/buf0buf.ic | 29 ++ storage/innodb_plugin/include/buf0lru.h | 19 +- storage/innodb_plugin/include/buf0types.h | 17 +- 11 files changed, 289 insertions(+), 529 deletions(-) diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index cf060ed3d84..6d9e074d202 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,10 @@ +2011-06-16 The InnoDB Team + + * btr/btr0cur.c, buf/buf0buddy.c, buf/buf0buf.c, buf/buf0lru.c, + include/buf0buddy.h, include/buf0buddy.ic, include/buf0buf.h, + include/buf0buf.ic, include/buf0lru.h, include/buf0types.h: + Fix Bug#61188 DROP TABLE extremely slow + 2011-06-16 The InnoDB Team * buf/buf0buddy.c, buf/buf0buf.c, buf/buf0flu.c, buf/buf0lru.c, diff --git a/storage/innodb_plugin/btr/btr0cur.c b/storage/innodb_plugin/btr/btr0cur.c index 651c2c342a5..f378c477537 100644 --- a/storage/innodb_plugin/btr/btr0cur.c +++ b/storage/innodb_plugin/btr/btr0cur.c @@ -3864,7 +3864,7 @@ btr_blob_free( && buf_block_get_space(block) == space && buf_block_get_page_no(block) == page_no) { - if (buf_LRU_free_block(&block->page, all) != BUF_LRU_FREED + if (!buf_LRU_free_block(&block->page, all) && all && block->page.zip.data) { /* Attempt to deallocate the uncompressed page if the whole block cannot be deallocted. */ diff --git a/storage/innodb_plugin/buf/buf0buddy.c b/storage/innodb_plugin/buf/buf0buddy.c index 9a95f2c639b..de571743361 100644 --- a/storage/innodb_plugin/buf/buf0buddy.c +++ b/storage/innodb_plugin/buf/buf0buddy.c @@ -45,6 +45,14 @@ static ulint buf_buddy_n_frames; Protected by buf_pool_mutex. */ UNIV_INTERN buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES + 1]; +/** Validate a given zip_free list. */ +#define BUF_BUDDY_LIST_VALIDATE(i) \ + UT_LIST_VALIDATE(list, buf_page_t, \ + buf_pool->zip_free[i], \ + ut_ad(buf_page_get_state( \ + ut_list_node_313) \ + == BUF_BLOCK_ZIP_FREE)) + /**********************************************************************//** Get the offset of the buddy of a compressed page frame. @return the buddy relative of page */ @@ -76,21 +84,10 @@ buf_buddy_add_to_free( buf_page_t* bpage, /*!< in,own: block to be freed */ ulint i) /*!< in: index of buf_pool->zip_free[] */ { -#ifdef UNIV_DEBUG_VALGRIND - buf_page_t* b = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); - - if (b) UNIV_MEM_VALID(b, BUF_BUDDY_LOW << i); -#endif /* UNIV_DEBUG_VALGRIND */ - ut_ad(buf_pool_mutex_own()); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); ut_ad(buf_pool->zip_free[i].start != bpage); UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], bpage); - -#ifdef UNIV_DEBUG_VALGRIND - if (b) UNIV_MEM_FREE(b, BUF_BUDDY_LOW << i); - UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i); -#endif /* UNIV_DEBUG_VALGRIND */ } /**********************************************************************//** @@ -102,25 +99,17 @@ buf_buddy_remove_from_free( buf_page_t* bpage, /*!< in: block to be removed */ ulint i) /*!< in: index of buf_pool->zip_free[] */ { -#ifdef UNIV_DEBUG_VALGRIND +#ifdef UNIV_DEBUG buf_page_t* prev = UT_LIST_GET_PREV(list, bpage); buf_page_t* next = UT_LIST_GET_NEXT(list, bpage); - if (prev) UNIV_MEM_VALID(prev, BUF_BUDDY_LOW << i); - if (next) UNIV_MEM_VALID(next, BUF_BUDDY_LOW << i); - ut_ad(!prev || buf_page_get_state(prev) == BUF_BLOCK_ZIP_FREE); ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE); -#endif /* UNIV_DEBUG_VALGRIND */ +#endif /* UNIV_DEBUG */ ut_ad(buf_pool_mutex_own()); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); UT_LIST_REMOVE(list, buf_pool->zip_free[i], bpage); - -#ifdef UNIV_DEBUG_VALGRIND - if (prev) UNIV_MEM_FREE(prev, BUF_BUDDY_LOW << i); - if (next) UNIV_MEM_FREE(next, BUF_BUDDY_LOW << i); -#endif /* UNIV_DEBUG_VALGRIND */ } /**********************************************************************//** @@ -136,17 +125,13 @@ buf_buddy_alloc_zip( ut_ad(buf_pool_mutex_own()); ut_a(i < BUF_BUDDY_SIZES); + ut_a(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); + + ut_d(BUF_BUDDY_LIST_VALIDATE(i)); -#ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing free memory. */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i], - ut_ad(buf_page_get_state(ut_list_node_313) - == BUF_BLOCK_ZIP_FREE))); -#endif /* !UNIV_DEBUG_VALGRIND */ bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); if (bpage) { - UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); ut_a(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); buf_buddy_remove_from_free(bpage, i); @@ -165,13 +150,10 @@ buf_buddy_alloc_zip( } } -#ifdef UNIV_DEBUG if (bpage) { - memset(bpage, ~i, BUF_BUDDY_LOW << i); + ut_d(memset(bpage, ~i, BUF_BUDDY_LOW << i)); + UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i); } -#endif /* UNIV_DEBUG */ - - UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i); return(bpage); } @@ -255,6 +237,7 @@ buf_buddy_alloc_from( { ulint offs = BUF_BUDDY_LOW << j; ut_ad(j <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); ut_ad(j >= i); ut_ad(!ut_align_offset(buf, offs)); @@ -268,13 +251,7 @@ buf_buddy_alloc_from( bpage = (buf_page_t*) ((byte*) buf + offs); ut_d(memset(bpage, j, BUF_BUDDY_LOW << j)); bpage->state = BUF_BLOCK_ZIP_FREE; -#ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing free memory. */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i], - ut_ad(buf_page_get_state( - ut_list_node_313) - == BUF_BLOCK_ZIP_FREE))); -#endif /* !UNIV_DEBUG_VALGRIND */ + ut_d(BUF_BUDDY_LIST_VALIDATE(i)); buf_buddy_add_to_free(bpage, j); } @@ -284,8 +261,8 @@ buf_buddy_alloc_from( /**********************************************************************//** Allocate a block. The thread calling this function must hold buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex. -The buf_pool_mutex may only be released and reacquired if lru != NULL. -@return allocated block, possibly NULL if lru==NULL */ +The buf_pool_mutex may be released and reacquired. +@return allocated block, never NULL */ UNIV_INTERN void* buf_buddy_alloc_low( @@ -294,13 +271,14 @@ buf_buddy_alloc_low( or BUF_BUDDY_SIZES */ ibool* lru) /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list - and buf_pool_mutex was temporarily released, - or NULL if the LRU list should not be used */ + and buf_pool_mutex was temporarily released */ { buf_block_t* block; + ut_ad(lru); ut_ad(buf_pool_mutex_own()); ut_ad(!mutex_own(&buf_pool_zip_mutex)); + ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); if (i < BUF_BUDDY_SIZES) { /* Try to allocate from the buddy system. */ @@ -320,11 +298,6 @@ buf_buddy_alloc_low( goto alloc_big; } - if (!lru) { - - return(NULL); - } - /* Try replacing an uncompressed page in the buffer pool. */ buf_pool_mutex_exit(); block = buf_LRU_get_free_block(); @@ -341,65 +314,6 @@ func_exit: return(block); } -/**********************************************************************//** -Try to relocate the control block of a compressed page. -@return TRUE if relocated */ -static -ibool -buf_buddy_relocate_block( -/*=====================*/ - buf_page_t* bpage, /*!< in: block to relocate */ - buf_page_t* dpage) /*!< in: free block to relocate to */ -{ -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - buf_page_t* b; -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - - ut_ad(buf_pool_mutex_own()); - - switch (buf_page_get_state(bpage)) { - case BUF_BLOCK_ZIP_FREE: - case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: - case BUF_BLOCK_FILE_PAGE: - case BUF_BLOCK_MEMORY: - case BUF_BLOCK_REMOVE_HASH: - ut_error; - case BUF_BLOCK_ZIP_DIRTY: - /* Cannot relocate dirty pages. */ - return(FALSE); - - case BUF_BLOCK_ZIP_PAGE: - break; - } - - mutex_enter(&buf_pool_zip_mutex); - - if (!buf_page_can_relocate(bpage)) { - mutex_exit(&buf_pool_zip_mutex); - return(FALSE); - } - - buf_relocate(bpage, dpage); - ut_d(bpage->state = BUF_BLOCK_ZIP_FREE); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - /* relocate buf_pool->zip_clean */ - b = UT_LIST_GET_PREV(list, dpage); - UT_LIST_REMOVE(list, buf_pool->zip_clean, dpage); - - if (b) { - UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, dpage); - } else { - UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, dpage); - } -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - - UNIV_MEM_INVALID(bpage, sizeof *bpage); - - mutex_exit(&buf_pool_zip_mutex); - return(TRUE); -} - /**********************************************************************//** Try to relocate a block. @return TRUE if relocated */ @@ -414,106 +328,89 @@ buf_buddy_relocate( buf_page_t* bpage; const ulint size = BUF_BUDDY_LOW << i; ullint usec = ut_time_us(NULL); + mutex_t* mutex; + ulint space; + ulint page_no; ut_ad(buf_pool_mutex_own()); ut_ad(!mutex_own(&buf_pool_zip_mutex)); ut_ad(!ut_align_offset(src, size)); ut_ad(!ut_align_offset(dst, size)); + ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); UNIV_MEM_ASSERT_W(dst, size); /* We assume that all memory from buf_buddy_alloc() - is used for either compressed pages or buf_page_t - objects covering compressed pages. */ + is used for compressed page frames. */ /* We look inside the allocated objects returned by - buf_buddy_alloc() and assume that anything of - PAGE_ZIP_MIN_SIZE or larger is a compressed page that contains - a valid space_id and page_no in the page header. Should the - fields be invalid, we will be unable to relocate the block. - We also assume that anything that fits sizeof(buf_page_t) - actually is a properly initialized buf_page_t object. */ + buf_buddy_alloc() and assume that each block is a compressed + page that contains a valid space_id and page_no in the page + header. Should the fields be invalid, we will be unable to + relocate the block. */ - if (size >= PAGE_ZIP_MIN_SIZE) { - /* This is a compressed page. */ - mutex_t* mutex; + /* The src block may be split into smaller blocks, + some of which may be free. Thus, the + mach_read_from_4() calls below may attempt to read + from free memory. The memory is "owned" by the buddy + allocator (and it has been allocated from the buffer + pool), so there is nothing wrong about this. The + mach_read_from_4() calls here will only trigger bogus + Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */ + space = mach_read_from_4((const byte *) src + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + page_no = mach_read_from_4((const byte *) src + + FIL_PAGE_OFFSET); + /* Suppress Valgrind warnings about conditional jump + on uninitialized value. */ + UNIV_MEM_VALID(&space, sizeof space); + UNIV_MEM_VALID(&page_no, sizeof page_no); + bpage = buf_page_hash_get(space, page_no); - /* The src block may be split into smaller blocks, - some of which may be free. Thus, the - mach_read_from_4() calls below may attempt to read - from free memory. The memory is "owned" by the buddy - allocator (and it has been allocated from the buffer - pool), so there is nothing wrong about this. The - mach_read_from_4() calls here will only trigger bogus - Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */ - ulint space = mach_read_from_4( - (const byte*) src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); - ulint page_no = mach_read_from_4( - (const byte*) src + FIL_PAGE_OFFSET); - /* Suppress Valgrind warnings about conditional jump - on uninitialized value. */ - UNIV_MEM_VALID(&space, sizeof space); - UNIV_MEM_VALID(&page_no, sizeof page_no); - bpage = buf_page_hash_get(space, page_no); + if (!bpage || bpage->zip.data != src) { + /* The block has probably been freshly + allocated by buf_LRU_get_free_block() but not + added to buf_pool->page_hash yet. Obviously, + it cannot be relocated. */ - if (!bpage || bpage->zip.data != src) { - /* The block has probably been freshly - allocated by buf_LRU_get_free_block() but not - added to buf_pool->page_hash yet. Obviously, - it cannot be relocated. */ - - return(FALSE); - } - - if (page_zip_get_size(&bpage->zip) != size) { - /* The block is of different size. We would - have to relocate all blocks covered by src. - For the sake of simplicity, give up. */ - ut_ad(page_zip_get_size(&bpage->zip) < size); - - return(FALSE); - } - - /* The block must have been allocated, but it may - contain uninitialized data. */ - UNIV_MEM_ASSERT_W(src, size); - - mutex = buf_page_get_mutex(bpage); - - mutex_enter(mutex); - - if (buf_page_can_relocate(bpage)) { - /* Relocate the compressed page. */ - ut_a(bpage->zip.data == src); - memcpy(dst, src, size); - bpage->zip.data = dst; - mutex_exit(mutex); -success: - UNIV_MEM_INVALID(src, size); - { - buf_buddy_stat_t* buddy_stat - = &buf_buddy_stat[i]; - buddy_stat->relocated++; - buddy_stat->relocated_usec - += ut_time_us(NULL) - usec; - } - return(TRUE); - } - - mutex_exit(mutex); - } else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) { - /* This must be a buf_page_t object. */ -#if UNIV_WORD_SIZE == 4 - /* On 32-bit systems, there is no padding in - buf_page_t. On other systems, Valgrind could complain - about uninitialized pad bytes. */ - UNIV_MEM_ASSERT_RW(src, size); -#endif - if (buf_buddy_relocate_block(src, dst)) { - - goto success; - } + return(FALSE); } + if (page_zip_get_size(&bpage->zip) != size) { + /* The block is of different size. We would + have to relocate all blocks covered by src. + For the sake of simplicity, give up. */ + ut_ad(page_zip_get_size(&bpage->zip) < size); + + return(FALSE); + } + + /* The block must have been allocated, but it may + contain uninitialized data. */ + UNIV_MEM_ASSERT_W(src, size); + + mutex = buf_page_get_mutex(bpage); + + mutex_enter(mutex); + + if (buf_page_can_relocate(bpage)) { + /* Relocate the compressed page. */ + ut_a(bpage->zip.data == src); + memcpy(dst, src, size); + bpage->zip.data = dst; + mutex_exit(mutex); + UNIV_MEM_INVALID(src, size); + { + buf_buddy_stat_t* buddy_stat + = &buf_buddy_stat[i]; + buddy_stat->relocated++; + buddy_stat->relocated_usec + += ut_time_us(NULL) - usec; + } + return(TRUE); + } + + mutex_exit(mutex); + return(FALSE); } @@ -534,12 +431,14 @@ buf_buddy_free_low( ut_ad(buf_pool_mutex_own()); ut_ad(!mutex_own(&buf_pool_zip_mutex)); ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)); ut_ad(buf_buddy_stat[i].used > 0); buf_buddy_stat[i].used--; + recombine: UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i); - ut_d(((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE); + ((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE; if (i == BUF_BUDDY_SIZES) { buf_buddy_block_free(buf); @@ -550,32 +449,36 @@ recombine: ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i)); ut_ad(!buf_pool_contains_zip(buf)); - /* Try to combine adjacent blocks. */ + /* Do not recombine blocks if there are few free blocks. + We may waste up to 15360*max_len bytes to free blocks + (1024 + 2048 + 4096 + 8192 = 15360) */ + if (UT_LIST_GET_LEN(buf_pool->zip_free[i]) < 16) { + goto func_exit; + } + /* Try to combine adjacent blocks. */ buddy = (buf_page_t*) buf_buddy_get(((byte*) buf), BUF_BUDDY_LOW << i); #ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing free memory. */ + /* When Valgrind instrumentation is not enabled, we can read + buddy->state to quickly determine that a block is not free. + When the block is not free, buddy->state belongs to a compressed + page frame that may be flagged uninitialized in our Valgrind + instrumentation. */ if (buddy->state != BUF_BLOCK_ZIP_FREE) { goto buddy_nonfree; } - - /* The field buddy->state can only be trusted for free blocks. - If buddy->state == BUF_BLOCK_ZIP_FREE, the block is free if - it is in the free list. */ #endif /* !UNIV_DEBUG_VALGRIND */ for (bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); bpage; ) { - UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); if (bpage == buddy) { -buddy_free: /* The buddy is free: recombine */ buf_buddy_remove_from_free(bpage, i); -buddy_free2: +buddy_is_free: ut_ad(buf_page_get_state(buddy) == BUF_BLOCK_ZIP_FREE); ut_ad(!buf_pool_contains_zip(buddy)); i++; @@ -585,122 +488,43 @@ buddy_free2: } ut_a(bpage != buf); - - { - buf_page_t* next = UT_LIST_GET_NEXT(list, bpage); - UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i); - bpage = next; - } + UNIV_MEM_ASSERT_W(bpage, BUF_BUDDY_LOW << i); + bpage = UT_LIST_GET_NEXT(list, bpage); } #ifndef UNIV_DEBUG_VALGRIND buddy_nonfree: - /* Valgrind would complain about accessing free memory. */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i], - ut_ad(buf_page_get_state(ut_list_node_313) - == BUF_BLOCK_ZIP_FREE))); -#endif /* UNIV_DEBUG_VALGRIND */ +#endif /* !UNIV_DEBUG_VALGRIND */ + + ut_d(BUF_BUDDY_LIST_VALIDATE(i)); /* The buddy is not free. Is there a free block of this size? */ bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); if (bpage) { + /* Remove the block from the free list, because a successful buf_buddy_relocate() will overwrite bpage->list. */ - - UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); buf_buddy_remove_from_free(bpage, i); /* Try to relocate the buddy of buf to the free block. */ if (buf_buddy_relocate(buddy, bpage, i)) { - ut_d(buddy->state = BUF_BLOCK_ZIP_FREE); - goto buddy_free2; + buddy->state = BUF_BLOCK_ZIP_FREE; + goto buddy_is_free; } buf_buddy_add_to_free(bpage, i); - - /* Try to relocate the buddy of the free block to buf. */ - buddy = (buf_page_t*) buf_buddy_get(((byte*) bpage), - BUF_BUDDY_LOW << i); - -#ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing free memory. */ - - /* The buddy must not be (completely) free, because we - always recombine adjacent free blocks. - - (Parts of the buddy can be free in - buf_pool->zip_free[j] with j < i.) */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i], - ut_ad(buf_page_get_state( - ut_list_node_313) - == BUF_BLOCK_ZIP_FREE - && ut_list_node_313 != buddy))); -#endif /* !UNIV_DEBUG_VALGRIND */ - - if (buf_buddy_relocate(buddy, buf, i)) { - - buf = bpage; - UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); - ut_d(buddy->state = BUF_BLOCK_ZIP_FREE); - goto buddy_free; - } } +func_exit: /* Free the block to the buddy list. */ bpage = buf; -#ifdef UNIV_DEBUG - if (i < buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)) { - /* This area has most likely been allocated for at - least one compressed-only block descriptor. Check - that there are no live objects in the area. This is - not a complete check: it may yield false positives as - well as false negatives. Also, due to buddy blocks - being recombined, it is possible (although unlikely) - that this branch is never reached. */ - char* c; + /* Fill large blocks with a constant pattern. */ + ut_d(memset(bpage, i, BUF_BUDDY_LOW << i)); + UNIV_MEM_INVALID(bpage, BUF_BUDDY_LOW << i); -# ifndef UNIV_DEBUG_VALGRIND - /* Valgrind would complain about accessing - uninitialized memory. Besides, Valgrind performs a - more exhaustive check, at every memory access. */ - const buf_page_t* b = buf; - const buf_page_t* const b_end = (buf_page_t*) - ((char*) b + (BUF_BUDDY_LOW << i)); - - for (; b < b_end; b++) { - /* Avoid false positives (and cause false - negatives) by checking for b->space < 1000. */ - - if ((b->state == BUF_BLOCK_ZIP_PAGE - || b->state == BUF_BLOCK_ZIP_DIRTY) - && b->space > 0 && b->space < 1000) { - fprintf(stderr, - "buddy dirty %p %u (%u,%u) %p,%lu\n", - (void*) b, - b->state, b->space, b->offset, - buf, i); - } - } -# endif /* !UNIV_DEBUG_VALGRIND */ - - /* Scramble the block. This should make any pointers - invalid and trigger a segmentation violation. Because - the scrambling can be reversed, it may be possible to - track down the object pointing to the freed data by - dereferencing the unscrambled bpage->LRU or - bpage->list pointers. */ - for (c = (char*) buf + (BUF_BUDDY_LOW << i); - c-- > (char*) buf; ) { - *c = ~*c ^ i; - } - } else { - /* Fill large blocks with a constant pattern. */ - memset(bpage, i, BUF_BUDDY_LOW << i); - } -#endif /* UNIV_DEBUG */ bpage->state = BUF_BLOCK_ZIP_FREE; buf_buddy_add_to_free(bpage, i); } diff --git a/storage/innodb_plugin/buf/buf0buf.c b/storage/innodb_plugin/buf/buf0buf.c index 0426d5ec872..c0da7dbdbe5 100644 --- a/storage/innodb_plugin/buf/buf0buf.c +++ b/storage/innodb_plugin/buf/buf0buf.c @@ -1358,7 +1358,7 @@ err_exit: mutex_enter(block_mutex); /* Discard the uncompressed page frame if possible. */ - if (buf_LRU_free_block(bpage, FALSE) == BUF_LRU_FREED) { + if (buf_LRU_free_block(bpage, FALSE)) { mutex_exit(block_mutex); goto lookup; @@ -1699,13 +1699,8 @@ loop: if (block) { /* If the guess is a compressed page descriptor that - has been allocated by buf_buddy_alloc(), it may have - been invalidated by buf_buddy_relocate(). In that - case, block could point to something that happens to - contain the expected bits in block->page. Similarly, - the guess may be pointing to a buffer pool chunk that - has been released when resizing the buffer pool. */ - + has been allocated by buf_page_alloc_descriptor(), + it may have been freed by buf_relocate(). */ if (!buf_block_is_uncompressed(block) || offset != block->page.offset || space != block->page.space @@ -1889,11 +1884,10 @@ wait_until_unfixed: mutex_exit(&buf_pool_zip_mutex); buf_pool->n_pend_unzip++; - bpage->state = BUF_BLOCK_ZIP_FREE; - buf_buddy_free(bpage, sizeof *bpage); - buf_pool_mutex_exit(); + buf_page_free_descriptor(bpage); + /* Decompress the page and apply buffered operations while not holding buf_pool_mutex or block->mutex. */ success = buf_zip_decompress(block, srv_use_checksums); @@ -1937,7 +1931,7 @@ wait_until_unfixed: /* Try to evict the block from the buffer pool, to use the insert buffer as much as possible. */ - if (buf_LRU_free_block(&block->page, TRUE) == BUF_LRU_FREED) { + if (buf_LRU_free_block(&block->page, TRUE)) { buf_pool_mutex_exit(); mutex_exit(&block->mutex); fprintf(stderr, @@ -2551,17 +2545,12 @@ err_exit: mutex_exit(&block->mutex); } else { - /* Defer buf_buddy_alloc() until after the block has - been found not to exist. The buf_buddy_alloc() and - buf_buddy_free() calls may be expensive because of - buf_buddy_relocate(). */ /* The compressed page must be allocated before the control block (bpage), in order to avoid the invocation of buf_buddy_relocate_block() on uninitialized data. */ data = buf_buddy_alloc(zip_size, &lru); - bpage = buf_buddy_alloc(sizeof *bpage, &lru); /* If buf_buddy_alloc() allocated storage from the LRU list, it released and reacquired buf_pool_mutex. Thus, we must @@ -2569,15 +2558,13 @@ err_exit: if (UNIV_UNLIKELY(lru) && UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) { - /* The block was added by some other thread. */ - bpage->state = BUF_BLOCK_ZIP_FREE; - buf_buddy_free(bpage, sizeof *bpage); buf_buddy_free(data, zip_size); - bpage = NULL; goto func_exit; } + bpage = buf_page_alloc_descriptor(); + page_zip_des_init(&bpage->zip); page_zip_set_size(&bpage->zip, zip_size); bpage->zip.data = data; diff --git a/storage/innodb_plugin/buf/buf0lru.c b/storage/innodb_plugin/buf/buf0lru.c index 01e7e9a5f69..ad6feef5f2f 100644 --- a/storage/innodb_plugin/buf/buf0lru.c +++ b/storage/innodb_plugin/buf/buf0lru.c @@ -355,7 +355,7 @@ scan_again: while (bpage != NULL) { buf_page_t* prev_bpage; - ibool prev_bpage_buf_fix = FALSE; + mutex_t* block_mutex = NULL; ut_a(buf_page_in_file(bpage)); @@ -368,18 +368,21 @@ scan_again: if (buf_page_get_space(bpage) != id) { /* Skip this block, as it does not belong to the space that is being invalidated. */ + goto next_page; } else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) { /* We cannot remove this page during this scan yet; maybe the system is currently reading it in, or flushing the modifications to the file */ all_freed = FALSE; + goto next_page; } else { - mutex_t* block_mutex = buf_page_get_mutex(bpage); + block_mutex = buf_page_get_mutex(bpage); mutex_enter(block_mutex); if (bpage->buf_fix_count > 0) { + mutex_exit(block_mutex); /* We cannot remove this page during this scan yet; maybe the system is currently reading it in, or flushing @@ -389,106 +392,59 @@ scan_again: goto next_page; } - -#ifdef UNIV_DEBUG - if (buf_debug_prints) { - fprintf(stderr, - "Dropping space %lu page %lu\n", - (ulong) buf_page_get_space(bpage), - (ulong) buf_page_get_page_no(bpage)); - } -#endif - if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { - /* This is a compressed-only block - descriptor. Ensure that prev_bpage - cannot be relocated when bpage is freed. */ - if (UNIV_LIKELY(prev_bpage != NULL)) { - switch (buf_page_get_state( - prev_bpage)) { - case BUF_BLOCK_FILE_PAGE: - /* Descriptors of uncompressed - blocks will not be relocated, - because we are holding the - buf_pool_mutex. */ - break; - case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: - /* Descriptors of compressed- - only blocks can be relocated, - unless they are buffer-fixed. - Because both bpage and - prev_bpage are protected by - buf_pool_zip_mutex, it is - not necessary to acquire - further mutexes. */ - ut_ad(&buf_pool_zip_mutex - == block_mutex); - ut_ad(mutex_own(block_mutex)); - prev_bpage_buf_fix = TRUE; - prev_bpage->buf_fix_count++; - break; - default: - ut_error; - } - } - } else if (((buf_block_t*) bpage)->is_hashed) { - ulint page_no; - ulint zip_size; - - buf_pool_mutex_exit(); - - zip_size = buf_page_get_zip_size(bpage); - page_no = buf_page_get_page_no(bpage); - - mutex_exit(block_mutex); - - /* Note that the following call will acquire - an S-latch on the page */ - - btr_search_drop_page_hash_when_freed( - id, zip_size, page_no); - goto scan_again; - } - - if (bpage->oldest_modification != 0) { - - buf_flush_remove(bpage); - } - - /* Remove from the LRU list. */ - - if (buf_LRU_block_remove_hashed_page(bpage, TRUE) - != BUF_BLOCK_ZIP_FREE) { - buf_LRU_block_free_hashed_page((buf_block_t*) - bpage); - } else { - /* The block_mutex should have been - released by buf_LRU_block_remove_hashed_page() - when it returns BUF_BLOCK_ZIP_FREE. */ - ut_ad(block_mutex == &buf_pool_zip_mutex); - ut_ad(!mutex_own(block_mutex)); - - if (prev_bpage_buf_fix) { - /* We temporarily buffer-fixed - prev_bpage, so that - buf_buddy_free() could not - relocate it, in case it was a - compressed-only block - descriptor. */ - - mutex_enter(block_mutex); - ut_ad(prev_bpage->buf_fix_count > 0); - prev_bpage->buf_fix_count--; - mutex_exit(block_mutex); - } - - goto next_page_no_mutex; - } -next_page: - mutex_exit(block_mutex); } -next_page_no_mutex: + ut_ad(mutex_own(block_mutex)); + +#ifdef UNIV_DEBUG + if (buf_debug_prints) { + fprintf(stderr, + "Dropping space %lu page %lu\n", + (ulong) buf_page_get_space(bpage), + (ulong) buf_page_get_page_no(bpage)); + } +#endif + if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { + /* This is a compressed-only block + descriptor. Do nothing. */ + } else if (((buf_block_t*) bpage)->is_hashed) { + ulint page_no; + ulint zip_size; + + buf_pool_mutex_exit(); + + zip_size = buf_page_get_zip_size(bpage); + page_no = buf_page_get_page_no(bpage); + + mutex_exit(block_mutex); + + /* Note that the following call will acquire + an S-latch on the page */ + + btr_search_drop_page_hash_when_freed( + id, zip_size, page_no); + goto scan_again; + } + + if (bpage->oldest_modification != 0) { + + buf_flush_remove(bpage); + } + + /* Remove from the LRU list. */ + + if (buf_LRU_block_remove_hashed_page(bpage, TRUE) + != BUF_BLOCK_ZIP_FREE) { + buf_LRU_block_free_hashed_page((buf_block_t*) bpage); + mutex_exit(block_mutex); + } else { + /* The block_mutex should have been released + by buf_LRU_block_remove_hashed_page() when it + returns BUF_BLOCK_ZIP_FREE. */ + ut_ad(block_mutex == &buf_pool_zip_mutex); + ut_ad(!mutex_own(block_mutex)); + } +next_page: bpage = prev_bpage; } @@ -574,7 +530,7 @@ buf_LRU_free_from_unzip_LRU_list( UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0); block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) { - enum buf_lru_free_block_status freed; + ibool freed; ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->in_unzip_LRU_list); @@ -584,24 +540,9 @@ buf_LRU_free_from_unzip_LRU_list( freed = buf_LRU_free_block(&block->page, FALSE); mutex_exit(&block->mutex); - switch (freed) { - case BUF_LRU_FREED: + if (freed) { return(TRUE); - - case BUF_LRU_CANNOT_RELOCATE: - /* If we failed to relocate, try - regular LRU eviction. */ - return(FALSE); - - case BUF_LRU_NOT_FREED: - /* The block was buffer-fixed or I/O-fixed. - Keep looking. */ - continue; } - - /* inappropriate return value from - buf_LRU_free_block() */ - ut_error; } return(FALSE); @@ -632,10 +573,9 @@ buf_LRU_free_from_common_LRU_list( UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0); bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) { - enum buf_lru_free_block_status freed; - unsigned accessed; - mutex_t* block_mutex - = buf_page_get_mutex(bpage); + ibool freed; + unsigned accessed; + mutex_t* block_mutex = buf_page_get_mutex(bpage); ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->in_LRU_list); @@ -645,8 +585,7 @@ buf_LRU_free_from_common_LRU_list( freed = buf_LRU_free_block(bpage, TRUE); mutex_exit(block_mutex); - switch (freed) { - case BUF_LRU_FREED: + if (freed) { /* Keep track of pages that are evicted without ever being accessed. This gives us a measure of the effectiveness of readahead */ @@ -654,21 +593,7 @@ buf_LRU_free_from_common_LRU_list( ++buf_pool->stat.n_ra_pages_evicted; } return(TRUE); - - case BUF_LRU_NOT_FREED: - /* The block was dirty, buffer-fixed, or I/O-fixed. - Keep looking. */ - continue; - - case BUF_LRU_CANNOT_RELOCATE: - /* This should never occur, because we - want to discard the compressed page too. */ - break; } - - /* inappropriate return value from - buf_LRU_free_block() */ - ut_error; } return(FALSE); @@ -1350,17 +1275,16 @@ buf_LRU_make_block_old( Try to free a block. If bpage is a descriptor of a compressed-only page, the descriptor object will be freed as well. -NOTE: If this function returns BUF_LRU_FREED, it will temporarily +NOTE: If this function returns TRUE, it will temporarily release buf_pool_mutex. Furthermore, the page frame will no longer be accessible via bpage. The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and release these two mutexes after the call. No other buf_page_get_mutex() may be held when calling this function. -@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or -BUF_LRU_NOT_FREED otherwise. */ +@return TRUE if freed, FALSE otherwise. */ UNIV_INTERN -enum buf_lru_free_block_status +ibool buf_LRU_free_block( /*===============*/ buf_page_t* bpage, /*!< in: block to be freed */ @@ -1385,7 +1309,7 @@ buf_LRU_free_block( if (!buf_page_can_relocate(bpage)) { /* Do not free buffer-fixed or I/O-fixed blocks. */ - return(BUF_LRU_NOT_FREED); + return(FALSE); } #ifdef UNIV_IBUF_COUNT_DEBUG @@ -1397,7 +1321,7 @@ buf_LRU_free_block( /* Do not completely free dirty blocks. */ if (bpage->oldest_modification) { - return(BUF_LRU_NOT_FREED); + return(FALSE); } } else if (bpage->oldest_modification) { /* Do not completely free dirty blocks. */ @@ -1405,7 +1329,7 @@ buf_LRU_free_block( if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY); - return(BUF_LRU_NOT_FREED); + return(FALSE); } goto alloc; @@ -1414,14 +1338,8 @@ buf_LRU_free_block( If it cannot be allocated (without freeing a block from the LRU list), refuse to free bpage. */ alloc: - buf_pool_mutex_exit_forbid(); - b = buf_buddy_alloc(sizeof *b, NULL); - buf_pool_mutex_exit_allow(); - - if (UNIV_UNLIKELY(!b)) { - return(BUF_LRU_CANNOT_RELOCATE); - } - + b = buf_page_alloc_descriptor(); + ut_a(b); memcpy(b, bpage, sizeof *b); } @@ -1589,7 +1507,7 @@ alloc: mutex_enter(block_mutex); } - return(BUF_LRU_FREED); + return(TRUE); } /******************************************************************//** @@ -1809,10 +1727,8 @@ buf_LRU_block_remove_hashed_page( buf_pool_mutex_exit_forbid(); buf_buddy_free(bpage->zip.data, page_zip_get_size(&bpage->zip)); - bpage->state = BUF_BLOCK_ZIP_FREE; - buf_buddy_free(bpage, sizeof(*bpage)); buf_pool_mutex_exit_allow(); - UNIV_MEM_UNDESC(bpage); + buf_page_free_descriptor(bpage); return(BUF_BLOCK_ZIP_FREE); case BUF_BLOCK_FILE_PAGE: diff --git a/storage/innodb_plugin/include/buf0buddy.h b/storage/innodb_plugin/include/buf0buddy.h index 7648950d5d1..d218a7112f1 100644 --- a/storage/innodb_plugin/include/buf0buddy.h +++ b/storage/innodb_plugin/include/buf0buddy.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -37,24 +37,19 @@ Created December 2006 by Marko Makela /**********************************************************************//** Allocate a block. The thread calling this function must hold buf_pool_mutex and must not hold buf_pool_zip_mutex or any -block->mutex. The buf_pool_mutex may only be released and reacquired -if lru != NULL. This function should only be used for allocating -compressed page frames or control blocks (buf_page_t). Allocated -control blocks must be properly initialized immediately after -buf_buddy_alloc() has returned the memory, before releasing -buf_pool_mutex. -@return allocated block, possibly NULL if lru == NULL */ +block->mutex. The buf_pool_mutex may be released and reacquired. +This function should only be used for allocating compressed page frames. +@return allocated block, never NULL */ UNIV_INLINE void* buf_buddy_alloc( /*============*/ - ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ + ulint size, /*!< in: compressed page size + (between PAGE_ZIP_MIN_SIZE and UNIV_PAGE_SIZE) */ ibool* lru) /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list - and buf_pool_mutex was temporarily released, - or NULL if the LRU list should not be used */ - __attribute__((malloc)); - + and buf_pool_mutex was temporarily released */ + __attribute__((malloc, nonnull)); /**********************************************************************//** Release a block. */ UNIV_INLINE diff --git a/storage/innodb_plugin/include/buf0buddy.ic b/storage/innodb_plugin/include/buf0buddy.ic index c419a2374d9..1087b45ee61 100644 --- a/storage/innodb_plugin/include/buf0buddy.ic +++ b/storage/innodb_plugin/include/buf0buddy.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -36,8 +36,8 @@ Created December 2006 by Marko Makela /**********************************************************************//** Allocate a block. The thread calling this function must hold buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex. -The buf_pool_mutex may only be released and reacquired if lru != NULL. -@return allocated block, possibly NULL if lru==NULL */ +The buf_pool_mutex may be released and reacquired. +@return allocated block, never NULL */ UNIV_INTERN void* buf_buddy_alloc_low( @@ -46,9 +46,8 @@ buf_buddy_alloc_low( or BUF_BUDDY_SIZES */ ibool* lru) /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list - and buf_pool_mutex was temporarily released, - or NULL if the LRU list should not be used */ - __attribute__((malloc)); + and buf_pool_mutex was temporarily released */ + __attribute__((malloc, nonnull)); /**********************************************************************//** Deallocate a block. */ @@ -74,6 +73,8 @@ buf_buddy_get_slot( ulint i; ulint s; + ut_ad(size >= PAGE_ZIP_MIN_SIZE); + for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) { } @@ -84,26 +85,25 @@ buf_buddy_get_slot( /**********************************************************************//** Allocate a block. The thread calling this function must hold buf_pool_mutex and must not hold buf_pool_zip_mutex or any -block->mutex. The buf_pool_mutex may only be released and reacquired -if lru != NULL. This function should only be used for allocating -compressed page frames or control blocks (buf_page_t). Allocated -control blocks must be properly initialized immediately after -buf_buddy_alloc() has returned the memory, before releasing -buf_pool_mutex. -@return allocated block, possibly NULL if lru == NULL */ +block->mutex. The buf_pool_mutex may be released and reacquired. +This function should only be used for allocating compressed page frames. +@return allocated block, never NULL */ UNIV_INLINE void* buf_buddy_alloc( /*============*/ - ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */ + ulint size, /*!< in: compressed page size + (between PAGE_ZIP_MIN_SIZE and UNIV_PAGE_SIZE) */ ibool* lru) /*!< in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list - and buf_pool_mutex was temporarily released, - or NULL if the LRU list should not be used */ + and buf_pool_mutex was temporarily released */ { ut_ad(buf_pool_mutex_own()); + ut_ad(ut_is_2pow(size)); + ut_ad(size >= PAGE_ZIP_MIN_SIZE); + ut_ad(size <= UNIV_PAGE_SIZE); - return(buf_buddy_alloc_low(buf_buddy_get_slot(size), lru)); + return((byte*) buf_buddy_alloc_low(buf_buddy_get_slot(size), lru)); } /**********************************************************************//** @@ -117,6 +117,9 @@ buf_buddy_free( ulint size) /*!< in: block size, up to UNIV_PAGE_SIZE */ { ut_ad(buf_pool_mutex_own()); + ut_ad(ut_is_2pow(size)); + ut_ad(size >= PAGE_ZIP_MIN_SIZE); + ut_ad(size <= UNIV_PAGE_SIZE); buf_buddy_free_low(buf, buf_buddy_get_slot(size)); } diff --git a/storage/innodb_plugin/include/buf0buf.h b/storage/innodb_plugin/include/buf0buf.h index 2dfb821e199..86c47a6edba 100644 --- a/storage/innodb_plugin/include/buf0buf.h +++ b/storage/innodb_plugin/include/buf0buf.h @@ -156,6 +156,23 @@ UNIV_INLINE ib_uint64_t buf_pool_get_oldest_modification(void); /*==================================*/ +/********************************************************************//** +Allocates a buf_page_t descriptor. This function must succeed. In case +of failure we assert in this function. */ +UNIV_INLINE +buf_page_t* +buf_page_alloc_descriptor(void) +/*===========================*/ + __attribute__((malloc)); +/********************************************************************//** +Free a buf_page_t descriptor. */ +UNIV_INLINE +void +buf_page_free_descriptor( +/*=====================*/ + buf_page_t* bpage) /*!< in: bpage descriptor to free. */ + __attribute__((nonnull)); + /********************************************************************//** Allocates a buffer block. @return own: the allocated block, in state BUF_BLOCK_MEMORY */ diff --git a/storage/innodb_plugin/include/buf0buf.ic b/storage/innodb_plugin/include/buf0buf.ic index d4ca07a4cd8..35c63f034f5 100644 --- a/storage/innodb_plugin/include/buf0buf.ic +++ b/storage/innodb_plugin/include/buf0buf.ic @@ -714,6 +714,35 @@ buf_block_get_lock_hash_val( return(block->lock_hash_val); } +/********************************************************************//** +Allocates a buf_page_t descriptor. This function must succeed. In case +of failure we assert in this function. +@return: the allocated descriptor. */ +UNIV_INLINE +buf_page_t* +buf_page_alloc_descriptor(void) +/*===========================*/ +{ + buf_page_t* bpage; + + bpage = (buf_page_t*) ut_malloc(sizeof *bpage); + ut_d(memset(bpage, 0, sizeof *bpage)); + UNIV_MEM_ALLOC(bpage, sizeof *bpage); + + return(bpage); +} + +/********************************************************************//** +Free a buf_page_t descriptor. */ +UNIV_INLINE +void +buf_page_free_descriptor( +/*=====================*/ + buf_page_t* bpage) /*!< in: bpage descriptor to free. */ +{ + ut_free(bpage); +} + /********************************************************************//** Allocates a buffer block. @return own: the allocated block, in state BUF_BLOCK_MEMORY */ diff --git a/storage/innodb_plugin/include/buf0lru.h b/storage/innodb_plugin/include/buf0lru.h index bea1f7d5b1e..0c2102b9549 100644 --- a/storage/innodb_plugin/include/buf0lru.h +++ b/storage/innodb_plugin/include/buf0lru.h @@ -30,18 +30,6 @@ Created 11/5/1995 Heikki Tuuri #include "ut0byte.h" #include "buf0types.h" -/** The return type of buf_LRU_free_block() */ -enum buf_lru_free_block_status { - /** freed */ - BUF_LRU_FREED = 0, - /** not freed because the caller asked to remove the - uncompressed frame but the control block cannot be - relocated */ - BUF_LRU_CANNOT_RELOCATE, - /** not freed because of some other reason */ - BUF_LRU_NOT_FREED -}; - /******************************************************************//** Tries to remove LRU flushed blocks from the end of the LRU list and put them to the free list. This is beneficial for the efficiency of the insert buffer @@ -98,17 +86,16 @@ buf_LRU_insert_zip_clean( Try to free a block. If bpage is a descriptor of a compressed-only page, the descriptor object will be freed as well. -NOTE: If this function returns BUF_LRU_FREED, it will temporarily +NOTE: If this function returns TRUE, it will temporarily release buf_pool_mutex. Furthermore, the page frame will no longer be accessible via bpage. The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and release these two mutexes after the call. No other buf_page_get_mutex() may be held when calling this function. -@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or -BUF_LRU_NOT_FREED otherwise. */ +@return TRUE if freed, FALSE otherwise. */ UNIV_INTERN -enum buf_lru_free_block_status +ibool buf_LRU_free_block( /*===============*/ buf_page_t* bpage, /*!< in: block to be freed */ diff --git a/storage/innodb_plugin/include/buf0types.h b/storage/innodb_plugin/include/buf0types.h index bfae6477135..4fe0b4483c8 100644 --- a/storage/innodb_plugin/include/buf0types.h +++ b/storage/innodb_plugin/include/buf0types.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,6 +26,8 @@ Created 11/17/1995 Heikki Tuuri #ifndef buf0types_h #define buf0types_h +#include "page0types.h" + /** Buffer page (uncompressed or compressed) */ typedef struct buf_page_struct buf_page_t; /** Buffer block for which an uncompressed page exists */ @@ -58,17 +60,10 @@ enum buf_io_fix { /** Parameters of binary buddy system for compressed pages (buf0buddy.h) */ /* @{ */ -#if UNIV_WORD_SIZE <= 4 /* 32-bit system */ -/** Base-2 logarithm of the smallest buddy block size */ -# define BUF_BUDDY_LOW_SHIFT 6 -#else /* 64-bit system */ -/** Base-2 logarithm of the smallest buddy block size */ -# define BUF_BUDDY_LOW_SHIFT 7 -#endif +#define BUF_BUDDY_LOW_SHIFT PAGE_ZIP_MIN_SIZE_SHIFT + #define BUF_BUDDY_LOW (1 << BUF_BUDDY_LOW_SHIFT) - /*!< minimum block size in the binary - buddy system; must be at least - sizeof(buf_page_t) */ + #define BUF_BUDDY_SIZES (UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT) /*!< number of buddy sizes */ From d09b999fb383f25c7f8c9ac84eb4227c8624e310 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 23 Jun 2011 12:40:19 +0300 Subject: [PATCH 26/35] Bug#11753728 45225: Locking: hang if drop table with no timeout Fix a failure of the re-enabled innodb-index.test in the embedded server. Apparently, the embedded server does not default to ENGINE=InnoDB when copying an InnoDB table by CREATE TABLE t2 SELECT * FROM t1; --- mysql-test/suite/innodb/r/innodb-index.result | 3 ++- mysql-test/suite/innodb/t/innodb-index.test | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/mysql-test/suite/innodb/r/innodb-index.result b/mysql-test/suite/innodb/r/innodb-index.result index 3f3a14c72f2..e35c40ac4be 100644 --- a/mysql-test/suite/innodb/r/innodb-index.result +++ b/mysql-test/suite/innodb/r/innodb-index.result @@ -1087,7 +1087,8 @@ DROP TABLE t2; DROP TABLE t1; CREATE TABLE t1 (a INT, b CHAR(1)) ENGINE=InnoDB; INSERT INTO t1 VALUES (3,'a'),(3,'b'),(1,'c'),(0,'d'),(1,'e'); -CREATE TABLE t2 SELECT * FROM t1; +CREATE TABLE t2 (a INT, b CHAR(1)) ENGINE=InnoDB; +INSERT INTO t2 SELECT * FROM t1; BEGIN; SELECT * FROM t1; a b diff --git a/mysql-test/suite/innodb/t/innodb-index.test b/mysql-test/suite/innodb/t/innodb-index.test index 7f5b041202c..a7b75eff377 100644 --- a/mysql-test/suite/innodb/t/innodb-index.test +++ b/mysql-test/suite/innodb/t/innodb-index.test @@ -521,7 +521,8 @@ connect (b,localhost,root,,); connection a; CREATE TABLE t1 (a INT, b CHAR(1)) ENGINE=InnoDB; INSERT INTO t1 VALUES (3,'a'),(3,'b'),(1,'c'),(0,'d'),(1,'e'); -CREATE TABLE t2 SELECT * FROM t1; +CREATE TABLE t2 (a INT, b CHAR(1)) ENGINE=InnoDB; +INSERT INTO t2 SELECT * FROM t1; connection b; BEGIN; # This acquires a MDL lock on t1 until commit. From bc7af1757939f1ec3c389da73176c7c68a8bafd5 Mon Sep 17 00:00:00 2001 From: Dmitry Shulga Date: Thu, 23 Jun 2011 20:41:04 +0700 Subject: [PATCH 27/35] Fixed Bug#11756013 (formerly known as bug#47870): BOGUS "THE TABLE MYSQL.PROC IS MISSING,..." There was a race condition between loading a stored routine (function/procedure/trigger) specified by fully qualified name SCHEMA_NAME.PROC_NAME and dropping the stored routine database. The problem was that there is a window for race condition when one server thread tries to load a stored routine being executed and the other thread tries to drop the stored routine schema. This condition race window exists in implementation of function mysql_change_db() called by db_load_routine() during loading of stored routine to cache. Function mysql_change_db() calls check_db_dir_existence() that might failed because specified database was dropped during concurrent execution of DROP SCHEMA statement. db_load_routine() calls mysql_change_db() with flag 'force_switch' set to 'true' value so when referenced db is not found then my_error() is not called and function mysql_change_db() returns ok. This shadows information about schema opening error in db_load_routine(). Then db_load_routine() makes attempt to parse stored routine that is failed. This makes to return error to sp_cache_routines_and_add_tables_aux() but since during error generation a call to my_error wasn't made and hence THD::main_da wasn't set we set the generic "mysql.proc table corrupt" error when running sp_cache_routines_and_add_tables_aux(). The fix is to install an error handler inside db_load_routine() for the mysql_op_change_db() call, and check later if the ER_BAD_DB_ERROR was caught. sql/sql_db.cc: Added synchronization point "before_db_dir_check" to emulate a race condition during processing of CALL/DROP SCHEMA. --- mysql-test/r/sp_sync.result | 14 ++++++++++++- mysql-test/t/sp_sync.test | 31 ++++++++++++++++++++++++++- sql/sp.cc | 42 ++++++++++++++++++++++++++++++++++++- sql/sql_db.cc | 3 +++ 4 files changed, 87 insertions(+), 3 deletions(-) diff --git a/mysql-test/r/sp_sync.result b/mysql-test/r/sp_sync.result index afa37e70531..17292f4c1a8 100644 --- a/mysql-test/r/sp_sync.result +++ b/mysql-test/r/sp_sync.result @@ -1,4 +1,4 @@ -Tests of syncronization of stored procedure execution. +Tests of synchronization of stored procedure execution. # # Bug#48157: crash in Item_field::used_tables # @@ -20,4 +20,16 @@ SET DEBUG_SYNC = 'now SIGNAL go'; # code, this test statement will hang. DROP TABLE t1, t2; DROP PROCEDURE p1; +# +# test for bug#11756013 +# +DROP SCHEMA IF EXISTS s1; +CREATE SCHEMA s1; +CREATE PROCEDURE s1.p1() BEGIN END; +SET DEBUG_SYNC='before_db_dir_check SIGNAL check_db WAIT_FOR dropped_schema'; +CALL s1.p1; +SET DEBUG_SYNC='now WAIT_FOR check_db'; +DROP SCHEMA s1; +SET DEBUG_SYNC='now SIGNAL dropped_schema'; +ERROR 42000: Unknown database 's1' SET DEBUG_SYNC = 'RESET'; diff --git a/mysql-test/t/sp_sync.test b/mysql-test/t/sp_sync.test index f9dae17b039..d8458f69eef 100644 --- a/mysql-test/t/sp_sync.test +++ b/mysql-test/t/sp_sync.test @@ -1,7 +1,10 @@ # This test should work in embedded server after mysqltest is fixed -- source include/not_embedded.inc ---echo Tests of syncronization of stored procedure execution. +# Save the initial number of concurrent sessions +--source include/count_sessions.inc + +--echo Tests of synchronization of stored procedure execution. --source include/have_debug_sync.inc @@ -54,5 +57,31 @@ connection default; DROP TABLE t1, t2; DROP PROCEDURE p1; +--echo # +--echo # test for bug#11756013 +--echo # +--disable_warnings +DROP SCHEMA IF EXISTS s1; +--enable_warnings +CREATE SCHEMA s1; +CREATE PROCEDURE s1.p1() BEGIN END; + +connect (con3, localhost, root); +SET DEBUG_SYNC='before_db_dir_check SIGNAL check_db WAIT_FOR dropped_schema'; +--send CALL s1.p1 + +connection default; +SET DEBUG_SYNC='now WAIT_FOR check_db'; +DROP SCHEMA s1; +SET DEBUG_SYNC='now SIGNAL dropped_schema'; + +connection con3; +--error ER_BAD_DB_ERROR +--reap +connection default; +disconnect con3; + SET DEBUG_SYNC = 'RESET'; +# Wait till we reached the initial number of concurrent sessions +--source include/wait_until_count_sessions.inc diff --git a/sql/sp.cc b/sql/sp.cc index ddddaee2e10..78d1dc7e22c 100644 --- a/sql/sp.cc +++ b/sql/sp.cc @@ -708,6 +708,37 @@ Silence_deprecated_warning::handle_error(uint sql_errno, const char *message, } +class Bad_db_error_handler : public Internal_error_handler +{ +public: + Bad_db_error_handler() + :m_error_caught(false) + {} + + virtual bool handle_error(uint sql_errno, const char *message, + MYSQL_ERROR::enum_warning_level level, + THD *thd); + + bool error_caught() const { return m_error_caught; } + +private: + bool m_error_caught; +}; + +bool +Bad_db_error_handler::handle_error(uint sql_errno, const char *message, + MYSQL_ERROR::enum_warning_level level, + THD *thd) +{ + if (sql_errno == ER_BAD_DB_ERROR) + { + m_error_caught= true; + return true; + } + return false; +} + + static int db_load_routine(THD *thd, int type, sp_name *name, sp_head **sphp, ulong sql_mode, const char *params, const char *returns, @@ -725,7 +756,7 @@ db_load_routine(THD *thd, int type, sp_name *name, sp_head **sphp, ha_rows old_select_limit= thd->variables.select_limit; sp_rcontext *old_spcont= thd->spcont; Silence_deprecated_warning warning_handler; - + Bad_db_error_handler db_not_exists_handler; char definer_user_name_holder[USERNAME_LENGTH + 1]; LEX_STRING definer_user_name= { definer_user_name_holder, USERNAME_LENGTH }; @@ -766,6 +797,7 @@ db_load_routine(THD *thd, int type, sp_name *name, sp_head **sphp, goto end; } + thd->push_internal_handler(&db_not_exists_handler); /* Change the current database (if needed). @@ -776,9 +808,17 @@ db_load_routine(THD *thd, int type, sp_name *name, sp_head **sphp, &cur_db_changed)) { ret= SP_INTERNAL_ERROR; + thd->pop_internal_handler(); goto end; } + thd->pop_internal_handler(); + if (db_not_exists_handler.error_caught()) + { + ret= SP_INTERNAL_ERROR; + my_error(ER_BAD_DB_ERROR, MYF(0), name->m_db.str); + goto end; + } thd->spcont= NULL; { diff --git a/sql/sql_db.cc b/sql/sql_db.cc index d7d7f43a7aa..e0d848321a7 100644 --- a/sql/sql_db.cc +++ b/sql/sql_db.cc @@ -26,6 +26,7 @@ #ifdef __WIN__ #include #endif +#include "debug_sync.h" #define MAX_DROP_TABLE_Q_LEN 1024 @@ -1702,6 +1703,8 @@ bool mysql_change_db(THD *thd, const LEX_STRING *new_db_name, bool force_switch) } #endif + DEBUG_SYNC(thd, "before_db_dir_check"); + if (check_db_dir_existence(new_db_file_name.str)) { if (force_switch) From 0c54d44fef3306143dd2133617dede11828c78e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 28 Jun 2011 11:57:09 +0300 Subject: [PATCH 28/35] Bug#12595087 - 61191: Question about page_zip_available (clean up page0zip.c) page_zip_dir_elems(): New function, refactored from page_zip_dir_size(). page_zip_dir_size(): Use page_zip_dir_elems() page_zip_dir_start_offs(): New function: Gets an offset to the compressed page trailer (the dense page directory), including deleted records (the free list) page_zip_dir_start_low(page_zip, n_dense): Constness-preserving wrapper macro for page_zip_dir_start_offs(). page_zip_dir_start(page_zip): Constness-preserving wrapper macro for page_zip_dir_start_offs(). page_zip_decompress_node_ptrs(), page_zip_decompress_clust(): Replace a formula with a fully equivalent page_zip_dir_start_low() call. page_zip_write_rec(), page_zip_parse_write_node_ptr(), page_zip_write_node_ptr(), page_zip_write_trx_id_and_roll_ptr(), page_zip_clear_rec(): Replace a formula with an almost equivalent page_zip_dir_start() call. It is OK to replace page_dir_get_n_heap(page) with page_dir_get_n_heap(page_zip->data), because ut_ad(page_zip_header_cmp(page_zip, page)) or page_zip_validate(page_zip, page) asserts that the page headers are identical. rb:687 approved by Jimmy Yang --- storage/innodb_plugin/page/page0zip.c | 90 ++++++++++++++++++--------- 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/storage/innodb_plugin/page/page0zip.c b/storage/innodb_plugin/page/page0zip.c index 2d97b9a0d64..9f00fb4d1e0 100644 --- a/storage/innodb_plugin/page/page0zip.c +++ b/storage/innodb_plugin/page/page0zip.c @@ -150,6 +150,20 @@ page_zip_empty_size( } #endif /* !UNIV_HOTBACKUP */ +/*************************************************************//** +Gets the number of elements in the dense page directory, +including deleted records (the free list). +@return number of elements in the dense page directory */ +UNIV_INLINE +ulint +page_zip_dir_elems( +/*===============*/ + const page_zip_des_t* page_zip) /*!< in: compressed page */ +{ + /* Exclude the page infimum and supremum from the record count. */ + return(page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW); +} + /*************************************************************//** Gets the size of the compressed page trailer (the dense page directory), including deleted records (the free list). @@ -160,13 +174,41 @@ page_zip_dir_size( /*==============*/ const page_zip_des_t* page_zip) /*!< in: compressed page */ { - /* Exclude the page infimum and supremum from the record count. */ - ulint size = PAGE_ZIP_DIR_SLOT_SIZE - * (page_dir_get_n_heap(page_zip->data) - - PAGE_HEAP_NO_USER_LOW); - return(size); + return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip)); } +/*************************************************************//** +Gets an offset to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@return offset of the dense page directory */ +UNIV_INLINE +ulint +page_zip_dir_start_offs( +/*====================*/ + const page_zip_des_t* page_zip, /*!< in: compressed page */ + ulint n_dense) /*!< in: directory size */ +{ + ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip)); + + return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE); +} + +/*************************************************************//** +Gets a pointer to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@param[in] page_zip compressed page +@param[in] n_dense number of entries in the directory +@return pointer to the dense page directory */ +#define page_zip_dir_start_low(page_zip, n_dense) \ + ((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense)) +/*************************************************************//** +Gets a pointer to the compressed page trailer (the dense page directory), +including deleted records (the free list). +@param[in] page_zip compressed page +@return pointer to the dense page directory */ +#define page_zip_dir_start(page_zip) \ + page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip)) + /*************************************************************//** Gets the size of the compressed page trailer (the dense page directory), only including user records (excluding the free list). @@ -2242,8 +2284,7 @@ zlib_done: } /* Restore the uncompressed columns in heap_no order. */ - storage = page_zip->data + page_zip_get_size(page_zip) - - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + storage = page_zip_dir_start_low(page_zip, n_dense); for (slot = 0; slot < n_dense; slot++) { rec_t* rec = recs[slot]; @@ -2728,8 +2769,7 @@ zlib_done: return(FALSE); } - storage = page_zip->data + page_zip_get_size(page_zip) - - n_dense * PAGE_ZIP_DIR_SLOT_SIZE; + storage = page_zip_dir_start_low(page_zip, n_dense); externs = storage - n_dense * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); @@ -3457,9 +3497,7 @@ page_zip_write_rec( } /* Write the data bytes. Store the uncompressed bytes separately. */ - storage = page_zip->data + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE; + storage = page_zip_dir_start(page_zip); if (page_is_leaf(page)) { ulint len; @@ -3755,9 +3793,7 @@ corrupt: field = page + offset; storage = page_zip->data + z_offset; - storage_end = page_zip->data + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE; + storage_end = page_zip_dir_start(page_zip); heap_no = 1 + (storage_end - storage) / REC_NODE_PTR_SIZE; @@ -3793,7 +3829,9 @@ page_zip_write_node_ptr( { byte* field; byte* storage; +#ifdef UNIV_DEBUG page_t* page = page_align(rec); +#endif /* UNIV_DEBUG */ ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); ut_ad(page_simple_validate_new(page)); @@ -3810,9 +3848,7 @@ page_zip_write_node_ptr( UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); UNIV_MEM_ASSERT_RW(rec, size); - storage = page_zip->data + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE + storage = page_zip_dir_start(page_zip) - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE; field = rec + size - REC_NODE_PTR_SIZE; @@ -3861,7 +3897,9 @@ page_zip_write_trx_id_and_roll_ptr( { byte* field; byte* storage; +#ifdef UNIV_DEBUG page_t* page = page_align(rec); +#endif /* UNIV_DEBUG */ ulint len; ut_ad(PAGE_ZIP_MATCH(rec, page_zip)); @@ -3879,9 +3917,7 @@ page_zip_write_trx_id_and_roll_ptr( UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip)); - storage = page_zip->data + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE + storage = page_zip_dir_start(page_zip) - (rec_get_heap_no_new(rec) - 1) * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); @@ -3948,11 +3984,7 @@ page_zip_clear_rec( /* Clear node_ptr. On the compressed page, there is an array of node_ptr immediately before the dense page directory, at the very end of the page. */ - storage = page_zip->data - + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE; + storage = page_zip_dir_start(page_zip); ut_ad(dict_index_get_n_unique_in_tree(index) == rec_offs_n_fields(offsets) - 1); field = rec_get_nth_field(rec, offsets, @@ -3972,11 +4004,7 @@ page_zip_clear_rec( = dict_col_get_clust_pos( dict_table_get_sys_col( index->table, DATA_TRX_ID), index); - storage = page_zip->data - + page_zip_get_size(page_zip) - - (page_dir_get_n_heap(page) - - PAGE_HEAP_NO_USER_LOW) - * PAGE_ZIP_DIR_SLOT_SIZE; + storage = page_zip_dir_start(page_zip); field = rec_get_nth_field(rec, offsets, trx_id_pos, &len); ut_ad(len == DATA_TRX_ID_LEN); From 8c988dc61b92e0dffcf1158175acd7326b9df08a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Tue, 28 Jun 2011 15:28:21 +0300 Subject: [PATCH 29/35] Bug#12699505 Memory leak in row_create_index_for_mysql() DB_COL_APPEARS_TWICE_IN_INDEX: Remove. This condition is already checked and reported by MySQL before passing the index definition to the storage engine. row_create_index_for_mysql(): Remove the redundant check for DB_COL_APPEARS_TWICE_IN_INDEX. When enforcing the column prefix index limit, invoke dict_mem_index_free(index) to plug the memory leak. In the loop, use index->n_def instead of dict_index_get_n_fields(index), because the latter would be 0 for indexes that have not been copied to the data dictionary cache. innodb-use-sys-malloc.test: Add test cases for attempting to trigger the error checks in row_create_index_for_mysql(). Before MySQL 5.5 and WL#5743, the leak is only reproducible if ha_innobase::max_supported_key_part_length() returned a higher limit than the one used in row_create_index_for_mysql(). In MySQL 5.5 and later, the leak is reproducible with innodb_large_prefix=true. rb:688 approved by Jimmy Yang --- .../innodb/r/innodb-use-sys-malloc.result | 75 ++++++++++--------- .../innodb/t/innodb-use-sys-malloc-master.opt | 5 +- .../suite/innodb/t/innodb-use-sys-malloc.test | 59 +++++++-------- storage/innobase/handler/ha_innodb.cc | 1 - storage/innobase/include/db0err.h | 2 - storage/innobase/row/row0mysql.c | 37 ++------- storage/innobase/ut/ut0ut.c | 2 - 7 files changed, 75 insertions(+), 106 deletions(-) diff --git a/mysql-test/suite/innodb/r/innodb-use-sys-malloc.result b/mysql-test/suite/innodb/r/innodb-use-sys-malloc.result index 2ec4c7c8130..d6ca690be7e 100644 --- a/mysql-test/suite/innodb/r/innodb-use-sys-malloc.result +++ b/mysql-test/suite/innodb/r/innodb-use-sys-malloc.result @@ -9,40 +9,45 @@ SELECT @@GLOBAL.innodb_use_sys_malloc; @@GLOBAL.innodb_use_sys_malloc 1 1 Expected -drop table if exists t1; -create table t1(a int not null) engine=innodb DEFAULT CHARSET=latin1; -insert into t1 values (1),(2),(3),(4),(5),(6),(7); +create table t1(a int not null,key(a,a)) engine=innodb DEFAULT CHARSET=latin1; +ERROR 42S21: Duplicate column name 'a' +create table t1(a int,b text,key(b(768))) engine=innodb DEFAULT CHARSET=latin1; +ERROR HY000: Index column size too large. The maximum column size is 767 bytes. +create table t1(a int not null,b text) engine=innodb DEFAULT CHARSET=latin1; +insert into t1 values (1,''),(2,''),(3,''),(4,''),(5,''),(6,''),(7,''); +create index t1aa on t1(a,a); +ERROR 42S21: Duplicate column name 'a' +create index t1b on t1(b(768)); +ERROR HY000: Index column size too large. The maximum column size is 767 bytes. +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) NOT NULL, + `b` text +) ENGINE=InnoDB DEFAULT CHARSET=latin1 select * from t1; -a -1 -2 -3 -4 -5 -6 -7 -drop table t1; -SELECT @@GLOBAL.innodb_use_sys_malloc; -@@GLOBAL.innodb_use_sys_malloc -1 -1 Expected -SET @@GLOBAL.innodb_use_sys_malloc=0; -ERROR HY000: Variable 'innodb_use_sys_malloc' is a read only variable -Expected error 'Read only variable' -SELECT @@GLOBAL.innodb_use_sys_malloc; -@@GLOBAL.innodb_use_sys_malloc -1 -1 Expected -drop table if exists t1; -create table t1(a int not null) engine=innodb DEFAULT CHARSET=latin1; -insert into t1 values (1),(2),(3),(4),(5),(6),(7); -select * from t1; -a -1 -2 -3 -4 -5 -6 -7 +a b +1 +2 +3 +4 +5 +6 +7 drop table t1; +CREATE TABLE t2(a int primary key, b text) ENGINE=InnoDB DEFAULT CHARSET=latin1; +INSERT INTO t2 VALUES (1,''),(2,''),(3,''),(4,''),(5,''),(6,''),(7,''); +CREATE INDEX t2aa on t2(a,a); +ERROR 42S21: Duplicate column name 'a' +CREATE INDEX t2b on t2(b(768)); +ERROR HY000: Index column size too large. The maximum column size is 767 bytes. +SELECT * FROM t2; +a b +1 +2 +3 +4 +5 +6 +7 +DROP TABLE t2; diff --git a/mysql-test/suite/innodb/t/innodb-use-sys-malloc-master.opt b/mysql-test/suite/innodb/t/innodb-use-sys-malloc-master.opt index 041b063645b..8071b4f7282 100644 --- a/mysql-test/suite/innodb/t/innodb-use-sys-malloc-master.opt +++ b/mysql-test/suite/innodb/t/innodb-use-sys-malloc-master.opt @@ -1,3 +1,2 @@ ---default-storage-engine=MyISAM ---loose-innodb-use-sys-malloc=true ---loose-innodb-use-sys-malloc=true +--innodb-use-sys-malloc=true +--innodb-large-prefix=true diff --git a/mysql-test/suite/innodb/t/innodb-use-sys-malloc.test b/mysql-test/suite/innodb/t/innodb-use-sys-malloc.test index 325dd19d086..7acd3f5fe81 100644 --- a/mysql-test/suite/innodb/t/innodb-use-sys-malloc.test +++ b/mysql-test/suite/innodb/t/innodb-use-sys-malloc.test @@ -1,4 +1,4 @@ ---source include/have_innodb.inc +-- source include/have_innodb.inc #display current value of innodb_use_sys_malloc SELECT @@GLOBAL.innodb_use_sys_malloc; @@ -13,36 +13,33 @@ SELECT @@GLOBAL.innodb_use_sys_malloc; --echo 1 Expected -#do some stuff to see if it works. ---disable_warnings -drop table if exists t1; ---enable_warnings +# Do some stuff to see if it works. +# Also, test the code paths of +# Bug #12699505 MEMORY LEAK IN ROW_CREATE_INDEX_FOR_MYSQL() +# (the leak would only be triggered if +# ha_innobase::max_supported_key_part_length() were set +# higher than the limit used in row_create_index_for_mysql()) -create table t1(a int not null) engine=innodb DEFAULT CHARSET=latin1; -insert into t1 values (1),(2),(3),(4),(5),(6),(7); -select * from t1; -drop table t1; ---source include/have_innodb.inc - -#display current value of innodb_use_sys_malloc -SELECT @@GLOBAL.innodb_use_sys_malloc; ---echo 1 Expected - -#try changing it. Should fail. ---error ER_INCORRECT_GLOBAL_LOCAL_VAR -SET @@GLOBAL.innodb_use_sys_malloc=0; ---echo Expected error 'Read only variable' - -SELECT @@GLOBAL.innodb_use_sys_malloc; ---echo 1 Expected - - -#do some stuff to see if it works. ---disable_warnings -drop table if exists t1; ---enable_warnings - -create table t1(a int not null) engine=innodb DEFAULT CHARSET=latin1; -insert into t1 values (1),(2),(3),(4),(5),(6),(7); +--error ER_DUP_FIELDNAME +create table t1(a int not null,key(a,a)) engine=innodb DEFAULT CHARSET=latin1; +# thanks to --innodb-large-prefix=1 this will not be truncated to b(767) +-- error ER_INDEX_COLUMN_TOO_LONG +create table t1(a int,b text,key(b(768))) engine=innodb DEFAULT CHARSET=latin1; +create table t1(a int not null,b text) engine=innodb DEFAULT CHARSET=latin1; +insert into t1 values (1,''),(2,''),(3,''),(4,''),(5,''),(6,''),(7,''); +--error ER_DUP_FIELDNAME +create index t1aa on t1(a,a); +-- error ER_INDEX_COLUMN_TOO_LONG +create index t1b on t1(b(768)); +SHOW CREATE TABLE t1; select * from t1; + drop table t1; +CREATE TABLE t2(a int primary key, b text) ENGINE=InnoDB DEFAULT CHARSET=latin1; +INSERT INTO t2 VALUES (1,''),(2,''),(3,''),(4,''),(5,''),(6,''),(7,''); +--error ER_DUP_FIELDNAME +CREATE INDEX t2aa on t2(a,a); +-- error ER_INDEX_COLUMN_TOO_LONG +CREATE INDEX t2b on t2(b(768)); +SELECT * FROM t2; +DROP TABLE t2; diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 11640acbce4..c4d2226227e 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -990,7 +990,6 @@ convert_error_code_to_mysql( misleading, a new MySQL error code should be introduced */ - case DB_COL_APPEARS_TWICE_IN_INDEX: case DB_CORRUPTION: return(HA_ERR_CRASHED); diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h index 74a2354bce3..28ef64500cc 100644 --- a/storage/innobase/include/db0err.h +++ b/storage/innobase/include/db0err.h @@ -64,8 +64,6 @@ enum db_err { DB_CANNOT_ADD_CONSTRAINT, /* adding a foreign key constraint to a table failed */ DB_CORRUPTION, /* data structure corruption noticed */ - DB_COL_APPEARS_TWICE_IN_INDEX, /* InnoDB cannot handle an index - where same column appears twice */ DB_CANNOT_DROP_CONSTRAINT, /* dropping a foreign key constraint from a table failed */ DB_NO_SAVEPOINT, /* no savepoint exists with the given diff --git a/storage/innobase/row/row0mysql.c b/storage/innobase/row/row0mysql.c index e1ada387729..a56d419d1f0 100644 --- a/storage/innobase/row/row0mysql.c +++ b/storage/innobase/row/row0mysql.c @@ -2015,41 +2015,13 @@ row_create_index_for_mysql( trx_start_if_not_started(trx); - /* Check that the same column does not appear twice in the index. - Starting from 4.0.14, InnoDB should be able to cope with that, but - safer not to allow them. */ - - for (i = 0; i < dict_index_get_n_fields(index); i++) { - ulint j; - - for (j = 0; j < i; j++) { - if (0 == ut_strcmp( - dict_index_get_nth_field(index, j)->name, - dict_index_get_nth_field(index, i)->name)) { - ut_print_timestamp(stderr); - - fputs(" InnoDB: Error: column ", stderr); - ut_print_name(stderr, trx, FALSE, - dict_index_get_nth_field( - index, i)->name); - fputs(" appears twice in ", stderr); - dict_index_name_print(stderr, trx, index); - fputs("\n" - "InnoDB: This is not allowed" - " in InnoDB.\n", stderr); - - err = DB_COL_APPEARS_TWICE_IN_INDEX; - - goto error_handling; - } - } - - /* Check also that prefix_len and actual length - is less than that from DICT_MAX_FIELD_LEN_BY_FORMAT() */ + for (i = 0; i < index->n_def; i++) { + /* Check that prefix_len and actual length + < DICT_MAX_INDEX_COL_LEN */ len = dict_index_get_nth_field(index, i)->prefix_len; - if (field_lengths) { + if (field_lengths && field_lengths[i]) { len = ut_max(len, field_lengths[i]); } @@ -2057,6 +2029,7 @@ row_create_index_for_mysql( if (len > (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table)) { err = DB_TOO_BIG_INDEX_COL; + dict_mem_index_free(index); goto error_handling; } } diff --git a/storage/innobase/ut/ut0ut.c b/storage/innobase/ut/ut0ut.c index a9c0d381e16..1ef1a082bb2 100644 --- a/storage/innobase/ut/ut0ut.c +++ b/storage/innobase/ut/ut0ut.c @@ -674,8 +674,6 @@ ut_strerr( return("Cannot add constraint"); case DB_CORRUPTION: return("Data structure corruption"); - case DB_COL_APPEARS_TWICE_IN_INDEX: - return("Column appears twice in index"); case DB_CANNOT_DROP_CONSTRAINT: return("Cannot drop constraint"); case DB_NO_SAVEPOINT: From 0f37ccb30f9ca0855b4494a7c7c10d293e316c47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 29 Jun 2011 09:57:15 +0300 Subject: [PATCH 30/35] Bug #12612184 BLOB debug code cleanup: Refactor the !rec_offs_any_extern relaxation in row_build(). trx_assert_active(trx_id): Assert that the given transaction is active. (In the 5.1 built-in InnoDB, there is no trx->is_recovered field.) trx_assert_recovered(trx_id): Assert that the given transaction is active and has been recovered after a crash. row_build(): Replace a bunch of code with an assertion that invokes trx_assert_active() or trx_assert_recovered() and row_get_rec_trx_id(). row_get_trx_id_offset(): Make the function inlined. Remove the unused parameter rec, and make all parameters const. row_get_rec_trx_id(), row_get_rec_roll_ptr(): Make all parameters const. rb:691 approved by Jimmy Yang --- storage/innobase/include/trx0sys.h | 10 ++++ storage/innobase/include/trx0sys.ic | 21 ++++++++ storage/innobase/row/row0row.c | 40 +++++---------- storage/innodb_plugin/include/row0row.h | 26 +++++----- storage/innodb_plugin/include/row0row.ic | 45 ++++++++++++---- storage/innodb_plugin/include/row0upd.ic | 4 +- storage/innodb_plugin/include/trx0sys.h | 11 +++- storage/innodb_plugin/include/trx0sys.ic | 24 ++++++++- storage/innodb_plugin/row/row0row.c | 65 ++++-------------------- 9 files changed, 141 insertions(+), 105 deletions(-) diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index bad3c9d570c..7ea981eb85c 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -256,6 +256,16 @@ trx_in_trx_list( /*============*/ /* out: TRUE if is in */ trx_t* in_trx);/* in: trx */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/******************************************************** +Assert that a transaction is active. */ +UNIV_INLINE +ibool +trx_assert_active( +/*==============*/ + /* out: TRUE */ + dulint trx_id); /* in: transaction identifier */ +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /********************************************************************* Updates the offset information about the end of the MySQL binlog entry which corresponds to the transaction just being committed. In a MySQL diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic index 1142fb60398..f5033c5778a 100644 --- a/storage/innobase/include/trx0sys.ic +++ b/storage/innobase/include/trx0sys.ic @@ -257,6 +257,27 @@ trx_get_on_id( return(NULL); } +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/******************************************************** +Assert that a transaction is active. */ +UNIV_INLINE +ibool +trx_assert_active( +/*==============*/ + /* out: TRUE */ + dulint trx_id) /* in: transaction identifier */ +{ + trx_t* trx; + + mutex_enter(&kernel_mutex); + trx = trx_get_on_id(trx_id); + ut_a(trx); + mutex_exit(&kernel_mutex); + + return(TRUE); +} +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + /******************************************************************** Returns the minumum trx id in trx list. This is the smallest id for which the trx can possibly be active. (But, you must look at the trx->conc_state to diff --git a/storage/innobase/row/row0row.c b/storage/innobase/row/row0row.c index 751de98deba..171039e34ac 100644 --- a/storage/innobase/row/row0row.c +++ b/storage/innobase/row/row0row.c @@ -212,35 +212,23 @@ row_build( } #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG - if (UNIV_LIKELY_NULL(rec_offs_any_null_extern(rec, offsets))) { - /* This condition can occur during crash recovery before - trx_rollback_or_clean_all_without_sess() has completed - execution. + /* This condition can occur during crash recovery before + trx_rollback_or_clean_all_without_sess() has completed + execution. - This condition is possible if the server crashed - during an insert or update before - btr_store_big_rec_extern_fields() did mtr_commit() all - BLOB pointers to the clustered index record. + This condition is possible if the server crashed + during an insert or update before + btr_store_big_rec_extern_fields() did mtr_commit() all + BLOB pointers to the clustered index record. - Look up the transaction that holds the implicit lock - on this record, and assert that it was recovered (and - will soon be rolled back). */ + If the record contains a null BLOB pointer, look up the + transaction that holds the implicit lock on this record, and + assert that it is active. (In this version of InnoDB, we + cannot assert that it was recovered, because there is no + trx->is_recovered field.) */ - ulint trx_id_pos = dict_index_get_sys_col_pos( - index, DATA_TRX_ID); - ulint len; - dulint trx_id = trx_read_trx_id( - rec_get_nth_field(rec, offsets, trx_id_pos, &len)); - trx_t* trx; - ut_a(len == 6); - - mutex_enter(&kernel_mutex); - trx = trx_get_on_id(trx_id); - ut_a(trx); - /* This field does not exist in this version of InnoDB. */ - /* ut_a(trx->is_recovered); */ - mutex_exit(&kernel_mutex); - } + ut_a(!rec_offs_any_null_extern(rec, offsets) + || trx_assert_active(row_get_rec_trx_id(rec, index, offsets))); #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (type != ROW_COPY_POINTERS) { diff --git a/storage/innodb_plugin/include/row0row.h b/storage/innodb_plugin/include/row0row.h index 723b7b53395..36fb26482ce 100644 --- a/storage/innodb_plugin/include/row0row.h +++ b/storage/innodb_plugin/include/row0row.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -38,16 +38,16 @@ Created 4/20/1996 Heikki Tuuri #include "btr0types.h" /*********************************************************************//** -Gets the offset of the trx id field, in bytes relative to the origin of +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of a clustered index record. @return offset of DATA_TRX_ID */ -UNIV_INTERN +UNIV_INLINE ulint row_get_trx_id_offset( /*==================*/ - const rec_t* rec, /*!< in: record */ - dict_index_t* index, /*!< in: clustered index */ - const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: record offsets */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Reads the trx id field from a clustered index record. @return value of the field */ @@ -55,9 +55,10 @@ UNIV_INLINE trx_id_t row_get_rec_trx_id( /*===============*/ - const rec_t* rec, /*!< in: record */ - dict_index_t* index, /*!< in: clustered index */ - const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + __attribute__((nonnull, warn_unused_result)); /*********************************************************************//** Reads the roll pointer field from a clustered index record. @return value of the field */ @@ -65,9 +66,10 @@ UNIV_INLINE roll_ptr_t row_get_rec_roll_ptr( /*=================*/ - const rec_t* rec, /*!< in: record */ - dict_index_t* index, /*!< in: clustered index */ - const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + __attribute__((nonnull, warn_unused_result)); /*****************************************************************//** When an insert or purge to a table is performed, this function builds the entry to be inserted into or purged from an index on the table. diff --git a/storage/innodb_plugin/include/row0row.ic b/storage/innodb_plugin/include/row0row.ic index 05c007641af..0b9ca982af8 100644 --- a/storage/innodb_plugin/include/row0row.ic +++ b/storage/innodb_plugin/include/row0row.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,6 +27,33 @@ Created 4/20/1996 Heikki Tuuri #include "rem0rec.h" #include "trx0undo.h" +/*********************************************************************//** +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of +a clustered index record. +@return offset of DATA_TRX_ID */ +UNIV_INLINE +ulint +row_get_trx_id_offset( +/*==================*/ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: record offsets */ +{ + ulint pos; + ulint offset; + ulint len; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(NULL, index, offsets)); + + pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + + offset = rec_get_nth_field_offs(offsets, pos, &len); + + ut_ad(len == DATA_TRX_ID_LEN); + + return(offset); +} + /*********************************************************************//** Reads the trx id field from a clustered index record. @return value of the field */ @@ -34,9 +61,9 @@ UNIV_INLINE trx_id_t row_get_rec_trx_id( /*===============*/ - const rec_t* rec, /*!< in: record */ - dict_index_t* index, /*!< in: clustered index */ - const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ { ulint offset; @@ -46,7 +73,7 @@ row_get_rec_trx_id( offset = index->trx_id_offset; if (!offset) { - offset = row_get_trx_id_offset(rec, index, offsets); + offset = row_get_trx_id_offset(index, offsets); } return(trx_read_trx_id(rec + offset)); @@ -59,9 +86,9 @@ UNIV_INLINE roll_ptr_t row_get_rec_roll_ptr( /*=================*/ - const rec_t* rec, /*!< in: record */ - dict_index_t* index, /*!< in: clustered index */ - const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ { ulint offset; @@ -71,7 +98,7 @@ row_get_rec_roll_ptr( offset = index->trx_id_offset; if (!offset) { - offset = row_get_trx_id_offset(rec, index, offsets); + offset = row_get_trx_id_offset(index, offsets); } return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); diff --git a/storage/innodb_plugin/include/row0upd.ic b/storage/innodb_plugin/include/row0upd.ic index 18e22f1eca9..0894ed373b0 100644 --- a/storage/innodb_plugin/include/row0upd.ic +++ b/storage/innodb_plugin/include/row0upd.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -171,7 +171,7 @@ row_upd_rec_sys_fields( ulint offset = index->trx_id_offset; if (!offset) { - offset = row_get_trx_id_offset(rec, index, offsets); + offset = row_get_trx_id_offset(index, offsets); } #if DATA_TRX_ID + 1 != DATA_ROLL_PTR diff --git a/storage/innodb_plugin/include/trx0sys.h b/storage/innodb_plugin/include/trx0sys.h index cbb89689748..ed62375797a 100644 --- a/storage/innodb_plugin/include/trx0sys.h +++ b/storage/innodb_plugin/include/trx0sys.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -284,6 +284,15 @@ ibool trx_in_trx_list( /*============*/ trx_t* in_trx);/*!< in: trx */ +/***********************************************************//** +Assert that a transaction has been recovered. +@return TRUE */ +UNIV_INLINE +ibool +trx_assert_recovered( +/*=================*/ + trx_id_t trx_id) /*!< in: transaction identifier */ + __attribute__((warn_unused_result)); /*****************************************************************//** Updates the offset information about the end of the MySQL binlog entry which corresponds to the transaction just being committed. In a MySQL diff --git a/storage/innodb_plugin/include/trx0sys.ic b/storage/innodb_plugin/include/trx0sys.ic index 820d31d0692..6246debac0a 100644 --- a/storage/innodb_plugin/include/trx0sys.ic +++ b/storage/innodb_plugin/include/trx0sys.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -277,6 +277,28 @@ trx_get_on_id( return(NULL); } +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG +/***********************************************************//** +Assert that a transaction has been recovered. +@return TRUE */ +UNIV_INLINE +ibool +trx_assert_recovered( +/*=================*/ + trx_id_t trx_id) /*!< in: transaction identifier */ +{ + trx_t* trx; + + mutex_enter(&kernel_mutex); + trx = trx_get_on_id(trx_id); + ut_a(trx); + ut_a(trx->is_recovered); + mutex_exit(&kernel_mutex); + + return(TRUE); +} +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + /****************************************************************//** Returns the minumum trx id in trx list. This is the smallest id for which the trx can possibly be active. (But, you must look at the trx->conc_state to diff --git a/storage/innodb_plugin/row/row0row.c b/storage/innodb_plugin/row/row0row.c index 3b610d735b0..dd94132eebc 100644 --- a/storage/innodb_plugin/row/row0row.c +++ b/storage/innodb_plugin/row/row0row.c @@ -47,35 +47,6 @@ Created 4/20/1996 Heikki Tuuri #include "read0read.h" #include "ut0mem.h" -/*********************************************************************//** -Gets the offset of trx id field, in bytes relative to the origin of -a clustered index record. -@return offset of DATA_TRX_ID */ -UNIV_INTERN -ulint -row_get_trx_id_offset( -/*==================*/ - const rec_t* rec __attribute__((unused)), - /*!< in: record */ - dict_index_t* index, /*!< in: clustered index */ - const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */ -{ - ulint pos; - ulint offset; - ulint len; - - ut_ad(dict_index_is_clust(index)); - ut_ad(rec_offs_validate(rec, index, offsets)); - - pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); - - offset = rec_get_nth_field_offs(offsets, pos, &len); - - ut_ad(len == DATA_TRX_ID_LEN); - - return(offset); -} - /*****************************************************************//** When an insert or purge to a table is performed, this function builds the entry to be inserted into or purged from an index on the table. @@ -233,33 +204,19 @@ row_build( } #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG - if (UNIV_LIKELY_NULL(rec_offs_any_null_extern(rec, offsets))) { - /* This condition can occur during crash recovery before - trx_rollback_active() has completed execution. + /* This condition can occur during crash recovery before + trx_rollback_active() has completed execution. - This condition is possible if the server crashed - during an insert or update before - btr_store_big_rec_extern_fields() did mtr_commit() all - BLOB pointers to the clustered index record. + This condition is possible if the server crashed + during an insert or update before + btr_store_big_rec_extern_fields() did mtr_commit() all + BLOB pointers to the clustered index record. - Look up the transaction that holds the implicit lock - on this record, and assert that it was recovered (and - will soon be rolled back). */ - - ulint trx_id_pos = dict_index_get_sys_col_pos( - index, DATA_TRX_ID); - ulint len; - trx_id_t trx_id = trx_read_trx_id( - rec_get_nth_field(rec, offsets, trx_id_pos, &len)); - trx_t* trx; - ut_a(len == 6); - - mutex_enter(&kernel_mutex); - trx = trx_get_on_id(trx_id); - ut_a(trx); - ut_a(trx->is_recovered); - mutex_exit(&kernel_mutex); - } + If the record contains a null BLOB pointer, look up the + transaction that holds the implicit lock on this record, and + assert that it was recovered (and will soon be rolled back). */ + ut_a(!rec_offs_any_null_extern(rec, offsets) + || trx_assert_recovered(row_get_rec_trx_id(rec, index, offsets))); #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ if (type != ROW_COPY_POINTERS) { From 6f1decbf7736a42b8eb08ac6c05bd300cc9df1d3 Mon Sep 17 00:00:00 2001 From: Marc Alff Date: Wed, 29 Jun 2011 10:15:05 +0200 Subject: [PATCH 31/35] Bug#12565712 - 61205: "QUERIES PER SECOND AVG" VALUE IN STATUS OUTPUT IS INCORRECT Before this fix, a server executing 1009 queries in 1000 seconds would give a status of: Queries per second avg: 1.9 The printf format used to print the decimal part, computed separately, is incorrect. With this fix, the correct result is printed: Queries per second avg: 1.009 Tested manually, no test case provided. --- sql/sql_parse.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index 36fdf376fa6..885db292699 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -1308,7 +1308,7 @@ bool dispatch_command(enum enum_server_command command, THD *thd, length= my_snprintf(buff, buff_len - 1, "Uptime: %lu Threads: %d Questions: %lu " "Slow queries: %lu Opens: %lu Flush tables: %lu " - "Open tables: %u Queries per second avg: %u.%u", + "Open tables: %u Queries per second avg: %u.%03u", uptime, (int) thread_count, (ulong) thd->query_id, current_global_status_var.long_query_count, From 67ea0a59e5b3c4e990c946c3df5af46ecbc4182b Mon Sep 17 00:00:00 2001 From: Vasil Dimov Date: Wed, 29 Jun 2011 14:28:30 +0300 Subject: [PATCH 32/35] Bug #12696083 FIX OUTDATED COPYRIGHT NOTICES IN INNODB RELATED CLIENT TOOLS Update copyright comment in innochecksum. --- extra/innochecksum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extra/innochecksum.c b/extra/innochecksum.c index 33f925a4cad..ecdf84d5181 100644 --- a/extra/innochecksum.c +++ b/extra/innochecksum.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2000-2005 MySQL AB & Innobase Oy +/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by From e25fb73be22b496a61a28b82de746f990822ae50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 29 Jun 2011 16:48:41 +0300 Subject: [PATCH 33/35] Bug #12612184 BLOB debug code cleanup: Forgot an #if around the declaration of trx_assert_recovered(). --- storage/innodb_plugin/include/trx0sys.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/storage/innodb_plugin/include/trx0sys.h b/storage/innodb_plugin/include/trx0sys.h index ed62375797a..78bb6fc349b 100644 --- a/storage/innodb_plugin/include/trx0sys.h +++ b/storage/innodb_plugin/include/trx0sys.h @@ -284,6 +284,7 @@ ibool trx_in_trx_list( /*============*/ trx_t* in_trx);/*!< in: trx */ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG /***********************************************************//** Assert that a transaction has been recovered. @return TRUE */ @@ -293,6 +294,7 @@ trx_assert_recovered( /*=================*/ trx_id_t trx_id) /*!< in: transaction identifier */ __attribute__((warn_unused_result)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ /*****************************************************************//** Updates the offset information about the end of the MySQL binlog entry which corresponds to the transaction just being committed. In a MySQL From 88eb572c25eb81492d0861d266a85a2d1f571abb Mon Sep 17 00:00:00 2001 From: Marc Alff Date: Wed, 29 Jun 2011 22:18:07 +0200 Subject: [PATCH 34/35] Bug#12603341 - PERFSCHEMA.RELAYLOG FAILS SPORADICALLY IN PB2 Before this fix, the test performance_schema.relaylog would fail with sporadic failures related to statistics on update_cond. The reason for these failures is that thread scheduling makes impossible to predict if instrumented conditions will be used on not. The fix is to relax the test case, to not collect statistics about: - wait/synch/cond/sql/MYSQL_BIN_LOG::update_cond - wait/synch/cond/sql/MYSQL_RELAY_LOG::update_cond --- mysql-test/suite/perfschema/r/relaylog.result | 20 +++++++++++-------- mysql-test/suite/perfschema/t/relaylog.test | 18 +++++++++++++---- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/mysql-test/suite/perfschema/r/relaylog.result b/mysql-test/suite/perfschema/r/relaylog.result index e2f8270b1f0..0a7d0a5b2be 100644 --- a/mysql-test/suite/perfschema/r/relaylog.result +++ b/mysql-test/suite/perfschema/r/relaylog.result @@ -52,10 +52,11 @@ select EVENT_NAME, if (count_star > 0, "MANY", "NONE") as COUNT_STAR from performance_schema.events_waits_summary_global_by_event_name -where event_name like "%MYSQL_BIN_LOG%" order by event_name; +where event_name like "%MYSQL_BIN_LOG%" + and event_name not like "%MYSQL_BIN_LOG::update_cond" + order by event_name; EVENT_NAME COUNT_STAR wait/synch/cond/sql/MYSQL_BIN_LOG::COND_prep_xids NONE -wait/synch/cond/sql/MYSQL_BIN_LOG::update_cond MANY wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index MANY wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_prep_xids NONE "Expect no slave relay log" @@ -68,9 +69,10 @@ EVENT_NAME COUNT_READ COUNT_WRITE SUM_NUMBER_OF_BYTES_READ SUM_NUMBER_OF_BYTES_W wait/io/file/sql/relaylog 0 0 0 0 wait/io/file/sql/relaylog_index 0 0 0 0 select * from performance_schema.events_waits_summary_global_by_event_name -where event_name like "%MYSQL_RELAY_LOG%" order by event_name; +where event_name like "%MYSQL_RELAY_LOG%" + and event_name not like "%MYSQL_RELAY_LOG::update_cond" + order by event_name; EVENT_NAME COUNT_STAR SUM_TIMER_WAIT MIN_TIMER_WAIT AVG_TIMER_WAIT MAX_TIMER_WAIT -wait/synch/cond/sql/MYSQL_RELAY_LOG::update_cond 0 0 0 0 0 wait/synch/mutex/sql/MYSQL_RELAY_LOG::LOCK_index 0 0 0 0 0 "============ Performance schema on slave ============" select * from performance_schema.file_summary_by_instance @@ -123,10 +125,11 @@ select EVENT_NAME, if (count_star > 0, "MANY", "NONE") as COUNT_STAR from performance_schema.events_waits_summary_global_by_event_name -where event_name like "%MYSQL_BIN_LOG%" order by event_name; +where event_name like "%MYSQL_BIN_LOG%" + and event_name not like "%MYSQL_BIN_LOG::update_cond" + order by event_name; EVENT_NAME COUNT_STAR wait/synch/cond/sql/MYSQL_BIN_LOG::COND_prep_xids NONE -wait/synch/cond/sql/MYSQL_BIN_LOG::update_cond NONE wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index MANY wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_prep_xids NONE "Expect a slave relay log" @@ -162,8 +165,9 @@ select EVENT_NAME, if (count_star > 0, "MANY", "NONE") as COUNT_STAR from performance_schema.events_waits_summary_global_by_event_name -where event_name like "%MYSQL_RELAY_LOG%" order by event_name; +where event_name like "%MYSQL_RELAY_LOG%" + and event_name not like "%MYSQL_RELAY_LOG::update_cond" + order by event_name; EVENT_NAME COUNT_STAR -wait/synch/cond/sql/MYSQL_RELAY_LOG::update_cond MANY wait/synch/mutex/sql/MYSQL_RELAY_LOG::LOCK_index MANY include/stop_slave.inc diff --git a/mysql-test/suite/perfschema/t/relaylog.test b/mysql-test/suite/perfschema/t/relaylog.test index 68ba57dd502..e2798e9b707 100644 --- a/mysql-test/suite/perfschema/t/relaylog.test +++ b/mysql-test/suite/perfschema/t/relaylog.test @@ -41,6 +41,8 @@ drop table test.t1; # To ensure robustness: # - log file rotation is limited to file .000001 and .000002 # - statistics are normalized to "NONE" or "MANY" +# - statistics on ::update_cond conditions are not collected, +# since this is too much dependent on execution. # connection master; @@ -84,7 +86,9 @@ select EVENT_NAME, if (count_star > 0, "MANY", "NONE") as COUNT_STAR from performance_schema.events_waits_summary_global_by_event_name - where event_name like "%MYSQL_BIN_LOG%" order by event_name; + where event_name like "%MYSQL_BIN_LOG%" + and event_name not like "%MYSQL_BIN_LOG::update_cond" + order by event_name; -- echo "Expect no slave relay log" @@ -95,7 +99,9 @@ select * from performance_schema.file_summary_by_event_name where event_name like "%relaylog%" order by event_name; select * from performance_schema.events_waits_summary_global_by_event_name - where event_name like "%MYSQL_RELAY_LOG%" order by event_name; + where event_name like "%MYSQL_RELAY_LOG%" + and event_name not like "%MYSQL_RELAY_LOG::update_cond" + order by event_name; sync_slave_with_master; -- echo "============ Performance schema on slave ============" @@ -142,7 +148,9 @@ select EVENT_NAME, if (count_star > 0, "MANY", "NONE") as COUNT_STAR from performance_schema.events_waits_summary_global_by_event_name - where event_name like "%MYSQL_BIN_LOG%" order by event_name; + where event_name like "%MYSQL_BIN_LOG%" + and event_name not like "%MYSQL_BIN_LOG::update_cond" + order by event_name; -- echo "Expect a slave relay log" @@ -173,7 +181,9 @@ select EVENT_NAME, if (count_star > 0, "MANY", "NONE") as COUNT_STAR from performance_schema.events_waits_summary_global_by_event_name - where event_name like "%MYSQL_RELAY_LOG%" order by event_name; + where event_name like "%MYSQL_RELAY_LOG%" + and event_name not like "%MYSQL_RELAY_LOG::update_cond" + order by event_name; --source include/stop_slave.inc From eeb028bbc1695b8727ee9e58c173c85f1abcb29c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 30 Jun 2011 13:18:54 +0300 Subject: [PATCH 35/35] Bug#12637786 Wrong secondary index entries on CHAR and VARCHAR columns row_build_index_entry(): In innodb_file_format=Barracuda (ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED), a secondary index on a full column can refer to a field that is stored off-page in the clustered index record. Take that into account. rb:692 approved by Jimmy Yang --- .../suite/innodb_plugin/r/innodb-index.result | 13 ++++++ .../suite/innodb_plugin/t/innodb-index.test | 16 +++++++ storage/innodb_plugin/ChangeLog | 6 +++ storage/innodb_plugin/row/row0row.c | 42 +++++++++++++++++-- 4 files changed, 74 insertions(+), 3 deletions(-) diff --git a/mysql-test/suite/innodb_plugin/r/innodb-index.result b/mysql-test/suite/innodb_plugin/r/innodb-index.result index f86fcd4a8ef..547d253911a 100644 --- a/mysql-test/suite/innodb_plugin/r/innodb-index.result +++ b/mysql-test/suite/innodb_plugin/r/innodb-index.result @@ -967,6 +967,19 @@ ERROR HY000: Too big row alter table t1 row_format=compact; create index t1u on t1 (u(1)); drop table t1; +SET @r=REPEAT('a',500); +CREATE TABLE t1(a INT, +v1 VARCHAR(500), v2 VARCHAR(500), v3 VARCHAR(500), +v4 VARCHAR(500), v5 VARCHAR(500), v6 VARCHAR(500), +v7 VARCHAR(500), v8 VARCHAR(500), v9 VARCHAR(500), +v10 VARCHAR(500), v11 VARCHAR(500), v12 VARCHAR(500), +v13 VARCHAR(500), v14 VARCHAR(500), v15 VARCHAR(500), +v16 VARCHAR(500), v17 VARCHAR(500), v18 VARCHAR(500) +) ENGINE=InnoDB ROW_FORMAT=DYNAMIC; +CREATE INDEX idx1 ON t1(a,v1); +INSERT INTO t1 VALUES(9,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r); +UPDATE t1 SET a=1000; +DROP TABLE t1; set global innodb_file_per_table=0; set global innodb_file_format=Antelope; set global innodb_file_format_check=Antelope; diff --git a/mysql-test/suite/innodb_plugin/t/innodb-index.test b/mysql-test/suite/innodb_plugin/t/innodb-index.test index 717c7d4e032..d8a4a13edd7 100644 --- a/mysql-test/suite/innodb_plugin/t/innodb-index.test +++ b/mysql-test/suite/innodb_plugin/t/innodb-index.test @@ -404,6 +404,22 @@ alter table t1 row_format=compact; create index t1u on t1 (u(1)); drop table t1; + +# Bug#12637786 +SET @r=REPEAT('a',500); +CREATE TABLE t1(a INT, + v1 VARCHAR(500), v2 VARCHAR(500), v3 VARCHAR(500), + v4 VARCHAR(500), v5 VARCHAR(500), v6 VARCHAR(500), + v7 VARCHAR(500), v8 VARCHAR(500), v9 VARCHAR(500), + v10 VARCHAR(500), v11 VARCHAR(500), v12 VARCHAR(500), + v13 VARCHAR(500), v14 VARCHAR(500), v15 VARCHAR(500), + v16 VARCHAR(500), v17 VARCHAR(500), v18 VARCHAR(500) +) ENGINE=InnoDB ROW_FORMAT=DYNAMIC; +CREATE INDEX idx1 ON t1(a,v1); +INSERT INTO t1 VALUES(9,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r); +UPDATE t1 SET a=1000; +DROP TABLE t1; + eval set global innodb_file_per_table=$per_table; eval set global innodb_file_format=$format; eval set global innodb_file_format_check=$format; diff --git a/storage/innodb_plugin/ChangeLog b/storage/innodb_plugin/ChangeLog index 6d9e074d202..b348e19033b 100644 --- a/storage/innodb_plugin/ChangeLog +++ b/storage/innodb_plugin/ChangeLog @@ -1,3 +1,9 @@ +2011-06-30 The InnoDB Team + + * row/row0row.c: + Fix Bug#12637786 Wrong secondary index entries on CHAR and VARCHAR + columns in ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED + 2011-06-16 The InnoDB Team * btr/btr0cur.c, buf/buf0buddy.c, buf/buf0buf.c, buf/buf0lru.c, diff --git a/storage/innodb_plugin/row/row0row.c b/storage/innodb_plugin/row/row0row.c index dd94132eebc..4bad2cb9144 100644 --- a/storage/innodb_plugin/row/row0row.c +++ b/storage/innodb_plugin/row/row0row.c @@ -101,12 +101,27 @@ row_build_index_entry( dfield_copy(dfield, dfield2); - if (dfield_is_null(dfield) || ind_field->prefix_len == 0) { + if (dfield_is_null(dfield)) { continue; } - /* If a column prefix index, take only the prefix. - Prefix-indexed columns may be externally stored. */ + if (ind_field->prefix_len == 0 + && (!dfield_is_ext(dfield) + || dict_index_is_clust(index))) { + /* The dfield_copy() above suffices for + columns that are stored in-page, or for + clustered index record columns that are not + part of a column prefix in the PRIMARY KEY. */ + continue; + } + + /* If the column is stored externally (off-page) in + the clustered index, it must be an ordering field in + the secondary index. In the Antelope format, only + prefix-indexed columns may be stored off-page in the + clustered index record. In the Barracuda format, also + fully indexed long CHAR or VARCHAR columns may be + stored off-page. */ ut_ad(col->ord_part); if (UNIV_LIKELY_NULL(ext)) { @@ -119,13 +134,34 @@ row_build_index_entry( } dfield_set_data(dfield, buf, len); } + + if (ind_field->prefix_len == 0) { + /* In the Barracuda format + (ROW_FORMAT=DYNAMIC or + ROW_FORMAT=COMPRESSED), we can have a + secondary index on an entire column + that is stored off-page in the + clustered index. As this is not a + prefix index (prefix_len == 0), + include the entire off-page column in + the secondary index record. */ + continue; + } } else if (dfield_is_ext(dfield)) { + /* This table should be in Antelope format + (ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT). + In that format, the maximum column prefix + index length is 767 bytes, and the clustered + index record contains a 768-byte prefix of + each off-page column. */ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); len -= BTR_EXTERN_FIELD_REF_SIZE; ut_a(ind_field->prefix_len <= len || dict_index_is_clust(index)); } + /* If a column prefix index, take only the prefix. */ + ut_ad(ind_field->prefix_len); len = dtype_get_at_most_n_mbchars( col->prtype, col->mbminlen, col->mbmaxlen, ind_field->prefix_len, len, dfield_get_data(dfield));