From 7afa819f727144e8a107e28444e07d54045ab38e Mon Sep 17 00:00:00 2001 From: Monty Date: Sun, 25 Sep 2022 18:44:48 +0300 Subject: [PATCH] Fix cost calculation for get_best_group_min_max() If the final range restrictions (SEL_ARG tree) over GROUP BY columns are single-point, we can compute the number of GROUP BY groups. Example: in the query: SELECT ... FROM tbl WHERE keypart1 IN (1,2,3) and keypart2 IN ('foo','bar') Other things: - Fixed cost calculation to more correctly count the number of blocks that may be read. The old code could use the total blocks in the file even if a range was available. --- mysql-test/main/group_by.result | 67 ++++++++--- mysql-test/main/group_by.test | 24 +++- sql/opt_range.cc | 189 ++++++++++++++++++++++---------- sql/opt_range.h | 2 + 4 files changed, 207 insertions(+), 75 deletions(-) diff --git a/mysql-test/main/group_by.result b/mysql-test/main/group_by.result index f60f9ca106c..460a3cd9357 100644 --- a/mysql-test/main/group_by.result +++ b/mysql-test/main/group_by.result @@ -552,12 +552,12 @@ a b 3 1 explain select t1.a,t2.b from t1,t2 where t1.a=t2.a group by t1.a,t2.b; id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using temporary; Using filesort -1 SIMPLE t2 ALL a NULL NULL NULL 4 Using where; Using join buffer (flat, BNL join) +1 SIMPLE t2 ALL a NULL NULL NULL 4 Using temporary; Using filesort +1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using where; Using join buffer (flat, BNL join) explain select t1.a,t2.b from t1,t2 where t1.a=t2.a group by t1.a,t2.b ORDER BY NULL; id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using temporary -1 SIMPLE t2 ALL a NULL NULL NULL 4 Using where; Using join buffer (flat, BNL join) +1 SIMPLE t2 ALL a NULL NULL NULL 4 Using temporary +1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using where; Using join buffer (flat, BNL join) drop table t1,t2; SET @@optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity,@@optimizer_switch=@save_optimizer_switch; create table t1 (a int, b int); @@ -890,7 +890,8 @@ Level Code Message drop table t1, t2; CREATE TABLE t1 (a tinyint(3), b varchar(255), PRIMARY KEY (a)); INSERT INTO t1 VALUES (1,'-----'), (6,'Allemagne'), (17,'Autriche'), -(25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France'); +(25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France'), +(100,"No land"), (101,"No land"); CREATE TABLE t2 (a tinyint(3), b tinyint(3), PRIMARY KEY (a), KEY b (b)); INSERT INTO t2 VALUES (1,1), (2,1), (6,6), (18,17), (15,25), (16,25), (17,25), (10,54), (5,62),(3,68); @@ -1352,7 +1353,7 @@ id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1 range PRIMARY,i2 PRIMARY 4 NULL 1 Using where; Using index EXPLAIN SELECT a FROM t1 WHERE a < 2 GROUP BY a; id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE t1 range PRIMARY,i2 PRIMARY 4 NULL 1 Using where; Using index +1 SIMPLE t1 range PRIMARY,i2 i2 4 NULL 1 Using where; Using index for group-by EXPLAIN SELECT a FROM t1 IGNORE INDEX (PRIMARY,i2); id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1 ALL NULL NULL NULL NULL 144 @@ -1701,7 +1702,7 @@ NULL 1 1 2 EXPLAIN SELECT a from t2 GROUP BY a; id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE t2 index NULL a 10 NULL 6 Using index +1 SIMPLE t2 range NULL a 5 NULL 6 Using index for group-by SELECT a from t2 GROUP BY a; a NULL @@ -1715,10 +1716,17 @@ NULL 1 2 insert into t2 SELECT NULL, NULL from seq_1_to_10; -# Expect: Using index for group-by EXPLAIN SELECT b from t2 GROUP BY a; id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE t2 range NULL a 5 NULL 9 Using index for group-by +1 SIMPLE t2 index NULL a 10 NULL 16 Using index +# Expect: Using index for group-by +analyze table t2; +Table Op Msg_type Msg_text +test.t2 analyze status Engine-independent statistics collected +test.t2 analyze status OK +EXPLAIN SELECT b from t2 GROUP BY a; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t2 range NULL a 5 NULL 6 Using index for group-by DROP TABLE t1; DROP TABLE t2; CREATE TABLE t1 ( a INT, b INT ); @@ -2255,11 +2263,11 @@ INSERT INTO t2(col1, col2) VALUES explain select col1 f1, col2 f2, col1 f3 from t2 group by f1; id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE t2 range NULL idx 5 NULL 7 Using index for group-by +1 SIMPLE t2 index NULL idx 10 NULL 20 Using index explain select SQL_BIG_RESULT col1 f1, col2 f2, col1 f3 from t2 group by f1; id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE t2 range NULL idx 5 NULL 7 Using index for group-by +1 SIMPLE t2 index NULL idx 10 NULL 20 Using index; Using filesort explain select col1 f1, col2 f2, col1 f3 from t2 group by f1, f2; id select_type table type possible_keys key key_len ref rows Extra @@ -2267,7 +2275,7 @@ id select_type table type possible_keys key key_len ref rows Extra explain select col1 f1, col1 f2 from t2 group by f1, 1+1; id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE t2 range NULL idx 5 NULL 7 Using index for group-by +1 SIMPLE t2 index NULL idx 10 NULL 20 Using index explain select col1 f1, col2 f2, col1 f3 from t2 group by f1, f2, f3+0; id select_type table type possible_keys key key_len ref rows Extra @@ -2457,7 +2465,7 @@ test.t1 analyze status OK EXPLAIN SELECT SQL_BUFFER_RESULT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b; id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE t1 range b b 9 NULL 2 Using where; Using index for group-by; Using temporary +1 SIMPLE t1 range b b 9 NULL 1 Using where; Using index for group-by; Using temporary SELECT SQL_BUFFER_RESULT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b; MIN(a) b @@ -2465,7 +2473,7 @@ MIN(a) b EXPLAIN SELECT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b; id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE t1 range b b 9 NULL 2 Using where; Using index for group-by +1 SIMPLE t1 range b b 9 NULL 1 Using where; Using index for group-by SELECT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b; MIN(a) b @@ -2994,3 +3002,34 @@ drop table t20, t21, t22; # # End of 10.3 tests # +# +# Test new group_min_max optimization +# +create table t1 (a int, b int, c int, key(a,b,c)); +insert into t1 select mod(seq,23),mod(seq,13), mod(seq,5) from seq_1_to_10000; +explain select a from t1 where a in (1,2,3) group by a; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 range a a 5 NULL 3 Using where; Using index for group-by +explain select a from t1 where a in (1,2,3) or a = 22 group by a; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 range a a 5 NULL 4 Using where; Using index for group-by +explain select a from t1 where a in (1,2,3) and a < 3 group by a; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 range a a 5 NULL 2 Using where; Using index for group-by +explain select a,b from t1 where (a) in (1,2,3) and b in (5,6,7) group by a,b; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 range a a 10 NULL 9 Using where; Using index for group-by +explain select a,b from t1 where (a,b) in ((1,1),(2,2),(3,3)) group by a,b; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 range a a 10 NULL 3 Using where; Using index for group-by +explain select a,b,c from t1 where (a,b) in ((1,1),(2,2),(3,3)) and c=3 group by a,b,c; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 range a a 15 NULL 3 Using where; Using index for group-by +# Will not use index for group-by +explain select a from t1 where a in (1,2,3) and b>1 group by a; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 range a a 10 NULL 983 Using where; Using index +explain select a from t1 where a in (1,2,3) and c=1 group by a; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 range a a 5 NULL 1161 Using where; Using index +drop table t1; diff --git a/mysql-test/main/group_by.test b/mysql-test/main/group_by.test index 87f82fde5ce..3aa7f1cd0d3 100644 --- a/mysql-test/main/group_by.test +++ b/mysql-test/main/group_by.test @@ -718,7 +718,8 @@ drop table t1, t2; CREATE TABLE t1 (a tinyint(3), b varchar(255), PRIMARY KEY (a)); INSERT INTO t1 VALUES (1,'-----'), (6,'Allemagne'), (17,'Autriche'), - (25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France'); + (25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France'), + (100,"No land"), (101,"No land"); CREATE TABLE t2 (a tinyint(3), b tinyint(3), PRIMARY KEY (a), KEY b (b)); @@ -1168,7 +1169,9 @@ SELECT b from t2 GROUP BY b; # Show that we are using 'range' when there is more NULL rows in the table insert into t2 SELECT NULL, NULL from seq_1_to_10; +EXPLAIN SELECT b from t2 GROUP BY a; --echo # Expect: Using index for group-by +analyze table t2; EXPLAIN SELECT b from t2 GROUP BY a; DROP TABLE t1; @@ -2131,3 +2134,22 @@ drop table t20, t21, t22; --echo # --echo # End of 10.3 tests --echo # + +--echo # +--echo # Test new group_min_max optimization +--echo # + +create table t1 (a int, b int, c int, key(a,b,c)); +insert into t1 select mod(seq,23),mod(seq,13), mod(seq,5) from seq_1_to_10000; + +explain select a from t1 where a in (1,2,3) group by a; +explain select a from t1 where a in (1,2,3) or a = 22 group by a; +explain select a from t1 where a in (1,2,3) and a < 3 group by a; +explain select a,b from t1 where (a) in (1,2,3) and b in (5,6,7) group by a,b; +explain select a,b from t1 where (a,b) in ((1,1),(2,2),(3,3)) group by a,b; +explain select a,b,c from t1 where (a,b) in ((1,1),(2,2),(3,3)) and c=3 group by a,b,c; + +--echo # Will not use index for group-by +explain select a from t1 where a in (1,2,3) and b>1 group by a; +explain select a from t1 where a in (1,2,3) and c=1 group by a; +drop table t1; diff --git a/sql/opt_range.cc b/sql/opt_range.cc index 78e577c31f2..b2e109a5a72 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -2174,6 +2174,27 @@ int SEL_ARG::sel_cmp(Field *field, uchar *a, uchar *b, uint8 a_flag, } +/* + Check if min and values are equal + + @return 1 if equal +*/ + +bool SEL_ARG::min_max_are_equal() const +{ + uint offset= 0; + if (field->real_maybe_null()) // If null is part of key + { + if (*min_value != *max_value) + return 0; + if (*min_value) + return 1; // NULL where equal + offset= 1; // Skip NULL marker + } + return field->key_cmp(min_value+offset, max_value+offset) == 0; +} + + SEL_ARG *SEL_ARG::clone_tree(RANGE_OPT_PARAM *param) { SEL_ARG tmp_link,*next_arg,*root; @@ -11040,6 +11061,50 @@ uint SEL_ARG::get_max_key_part() const } +/** + Compute the number of eq_ranges top elements in the tree + + This is used by the cost_group_min_max() to calculate the number of + groups in SEL_TREE + + @param group_key_parts number of key parts that must be equal + + @return < 0 Not known + @return >= 0 Number of groups +*/ + +int SEL_ARG::number_of_eq_groups(uint group_key_parts) const +{ + int elements= 0; + SEL_ARG const *cur; + + if (part > group_key_parts-1 || type != KEY_RANGE) + return -1; + + cur= first(); + do + { + if ((cur->min_flag | cur->min_flag) & + (NO_MIN_RANGE | NO_MAX_RANGE | NEAR_MIN | NEAR_MAX | GEOM_FLAG)) + return -1; + if (min_value != max_value && !min_max_are_equal()) + return -1; + if (part != group_key_parts -1) + { + int tmp; + if (!next_key_part) + return -1; + if ((tmp= next_key_part->number_of_eq_groups(group_key_parts)) < 0) + return -1; + elements+= tmp; + } + else + elements++; + } while ((cur= cur->next)); + return elements; +} + + /* Remove the SEL_ARG graph elements which have part > max_part. @@ -11092,8 +11157,8 @@ void prune_sel_arg_graph(SEL_ARG *sel_arg, uint max_part) @return tree pointer The tree after processing, - NULL If it was not possible to reduce the weight of the tree below the - limit. + NULL If it was not possible to reduce the weight of the tree below + the limit. */ SEL_ARG *enforce_sel_arg_weight_limit(RANGE_OPT_PARAM *param, uint keyno, @@ -14950,7 +15015,7 @@ get_field_keypart(KEY *index, Field *field) have_min [in] True if there is a MIN function have_max [in] True if there is a MAX function read_cost [out] The cost to retrieve rows via this quick select - records [out] The number of rows retrieved + out_records [out] The number of rows retrieved DESCRIPTION This method computes the access cost of a TRP_GROUP_MIN_MAX instance and @@ -14968,6 +15033,20 @@ get_field_keypart(KEY *index, Field *field) either scan the index for the next value or do a new index dive with 'find next bigger key'. + When using MIN() and MAX() in the query, the calls to the storage engine + are as follows for each group: + Assuming kp1 in ('abc','def','ghi)' and kp2 between 1000 and 2000 + + read_key('abc', HA_READ_KEY_OR_NEXT) + In case of MIN() we do: + read_key('abc,:'1000', HA_READ_KEY_OR_NEXT) + In case of MAX() we do + read_key('abc,:'2000', HA_READ_PREFIX_LAST_OR_PREV) + In the following code we will assume that the MIN key will be in + the same block as the first key read. + (We should try to optimize away the extra call for MAX() at some + point). + NOTES See get_best_group_min_max() for which kind of queries this function will be called. @@ -15013,36 +15092,27 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts, uint group_key_parts, SEL_TREE *range_tree, SEL_ARG *index_tree, ha_rows quick_prefix_records, bool have_min, bool have_max, - double *read_cost, ha_rows *records) + double *read_cost, ha_rows *out_records) { - uint keys_per_block, key_length; - ha_rows table_records; + uint key_length; + ha_rows records; ha_rows num_groups; ha_rows num_blocks; ha_rows keys_per_group; - ha_rows keys_per_subgroup; /* Average number of keys in sub-groups */ - /* formed by a key infix. */ - double p_overlap; /* Probability that a sub-group overlaps two blocks. */ double quick_prefix_selectivity; - double io_cost; + ulonglong io_cost; handler *file= table->file; DBUG_ENTER("cost_group_min_max"); /* Same code as in handler::key_read_time() */ - table_records= table->stat_records(); + records= table->stat_records(); key_length= (index_info->key_length + file->ref_length); - num_blocks= (table_records * key_length / INDEX_BLOCK_FILL_FACTOR_DIV * - INDEX_BLOCK_FILL_FACTOR_MUL) / file->stats.block_size + 1; - keys_per_block= (file->stats.block_size / - (key_length * INDEX_BLOCK_FILL_FACTOR_MUL / - INDEX_BLOCK_FILL_FACTOR_DIV) + - 1); /* Compute the number of keys in a group. */ if (!group_key_parts) { /* Summary over the whole table */ - keys_per_group= MY_MAX(table_records,1); + keys_per_group= MY_MAX(records,1); } else { @@ -15050,62 +15120,61 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts, 1); if (keys_per_group == 0) /* If there is no statistics try to guess */ { - /* each group contains 10% of all records */ - keys_per_group= (table_records / 10) + 1; + /* each group contains 1% of all records */ + keys_per_group= (records / 100) + 1; } } if (keys_per_group > 1) - num_groups= (table_records / keys_per_group) + 1; + num_groups= (records / keys_per_group) + 1; else - num_groups= table_records; + num_groups= records; /* Apply the selectivity of the quick select for group prefixes. */ if (range_tree && (quick_prefix_records != HA_POS_ERROR)) { + int groups; quick_prefix_selectivity= (double) quick_prefix_records / - (double) table_records; + (double) records; num_groups= (ha_rows) rint(num_groups * quick_prefix_selectivity); + records= quick_prefix_records; + /* - Expect at least as many groups as there is ranges in the index + Try to handle cases like WHERE a in (1,2,3) GROUP BY a - This is mostly relevant for queries with few records, which is - something we have a lot of in our test suites. - In theory it is possible to scan the index_tree and for cases - where all ranges are eq ranges, we could calculate the exact number - of groups. This is probably an overkill so for now we estimate - the lower level of number of groups by the range elements in the - tree. + If all ranges are eq_ranges for the group_key_parts we can use + this as the number of groups. */ - set_if_bigger(num_groups, MY_MAX(index_tree->elements, 1)); + groups= index_tree->number_of_eq_groups(group_key_parts); + if (groups > 0) + num_groups= groups; + else + { + /* + Expect at least as many groups as there is ranges in the index + + This is mostly relevant for queries with few records, which is + something we have a lot of in our test suites. + In theory it is possible to scan the index_tree and for cases + where all ranges are eq ranges, we could calculate the exact number + of groups. This is probably an overkill so for now we estimate + the lower level of number of groups by the range elements in the + tree. + */ + set_if_bigger(num_groups, MY_MAX(index_tree->elements, 1)); + } /* There cannot be more groups than matched records */ set_if_smaller(num_groups, quick_prefix_records); } - DBUG_ASSERT(num_groups <= table_records); + DBUG_ASSERT(num_groups <= records); - if (used_key_parts > group_key_parts) - { - /* - Compute the probability that two ends of a subgroup are inside - different blocks. - */ - keys_per_subgroup= (ha_rows) index_info->actual_rec_per_key(used_key_parts - - 1); - if (keys_per_subgroup >= keys_per_block) /* If a subgroup is bigger than */ - p_overlap= 1.0; /* a block, it will overlap at least two blocks. */ - else - { - double blocks_per_group= (double) num_blocks / (double) num_groups; - p_overlap= (blocks_per_group * (keys_per_subgroup - 1)) / keys_per_group; - p_overlap= MY_MIN(p_overlap, 1.0); - } - io_cost= (double) MY_MIN(num_groups * (1 + p_overlap), num_blocks); - } - else - io_cost= ((keys_per_group > keys_per_block) ? - (have_min && have_max) ? (double) (num_groups + 1) : - (double) num_groups : - (double) num_blocks); + + /* Calculate the number of blocks we will touch for the table or range scan */ + num_blocks= (records * key_length / INDEX_BLOCK_FILL_FACTOR_DIV * + INDEX_BLOCK_FILL_FACTOR_MUL) / file->stats.block_size + 1; + + io_cost= (have_max) ? num_groups*2 : num_groups; + set_if_smaller(io_cost, num_blocks); /* CPU cost must be comparable to that of an index scan as computed @@ -15118,13 +15187,13 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts, *read_cost= file->ha_keyread_and_compare_time(keyno, (ulong) num_groups, num_groups, io_cost); - *records= num_groups; + *out_records= num_groups; DBUG_PRINT("info", - ("table rows: %lu keys/block: %u keys/group: %lu " + ("rows: %lu keys/group: %lu " "result rows: %lu blocks: %lu", - (ulong) table_records, keys_per_block, (ulong) keys_per_group, - (ulong) *records, (ulong) num_blocks)); + (ulong) records, (ulong) keys_per_group, + (ulong) *out_records, (ulong) num_blocks)); DBUG_VOID_RETURN; } diff --git a/sql/opt_range.h b/sql/opt_range.h index 0973a9a944e..91cbfb3412b 100644 --- a/sql/opt_range.h +++ b/sql/opt_range.h @@ -302,6 +302,7 @@ class SEL_ARG :public Sql_alloc { static int sel_cmp(Field *field, uchar *a, uchar *b, uint8 a_flag, uint8 b_flag); + bool min_max_are_equal() const; public: uint8 min_flag,max_flag,maybe_flag; uint8 part; // Which key part @@ -401,6 +402,7 @@ public: return false; return true; } + int number_of_eq_groups(uint group_key_parts) const; inline void merge_flags(SEL_ARG *arg) { maybe_flag|=arg->maybe_flag; } inline void maybe_smaller() { maybe_flag=1; } /* Return true iff it's a single-point null interval */