Fix cost calculation for get_best_group_min_max()

If the final range restrictions (SEL_ARG tree) over GROUP BY columns are single-point, we can compute the number of GROUP BY groups. Example: in the query: SELECT ... FROM tbl WHERE keypart1 IN (1,2,3) and keypart2 IN ('foo','bar') Other things: - Fixed cost calculation to more correctly count the number of blocks that may be read. The old code could use the total blocks in the file even if a range was available.
2022-09-25 18:44:48 +03:00 · 2022-09-25 18:44:48 +03:00 · 7afa819f72
commit 7afa819f72
parent 009db2288b
4 changed files with 207 additions and 75 deletions
--- a/mysql-test/main/group_by.result
+++ b/mysql-test/main/group_by.result
@ -552,12 +552,12 @@ a	b
 3	1
 explain select t1.a,t2.b from t1,t2 where t1.a=t2.a group by t1.a,t2.b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	6	Using temporary; Using filesort
+1	SIMPLE	t2	ALL	a	NULL	NULL	NULL	4	Using temporary; Using filesort
-1	SIMPLE	t2	ALL	a	NULL	NULL	NULL	4	Using where; Using join buffer (flat, BNL join)
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	6	Using where; Using join buffer (flat, BNL join)
 explain select t1.a,t2.b from t1,t2 where t1.a=t2.a group by t1.a,t2.b ORDER BY NULL;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	6	Using temporary
+1	SIMPLE	t2	ALL	a	NULL	NULL	NULL	4	Using temporary
-1	SIMPLE	t2	ALL	a	NULL	NULL	NULL	4	Using where; Using join buffer (flat, BNL join)
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	6	Using where; Using join buffer (flat, BNL join)
 drop table t1,t2;
 SET @@optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity,@@optimizer_switch=@save_optimizer_switch;
 create table t1 (a int, b int);
@ -890,7 +890,8 @@ Level	Code	Message
 drop table t1, t2;
 CREATE TABLE t1 (a tinyint(3), b varchar(255), PRIMARY KEY  (a));
 INSERT INTO t1 VALUES (1,'-----'), (6,'Allemagne'), (17,'Autriche'), 
-(25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France');
+(25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France'),
 (100,"No land"), (101,"No land");
 CREATE TABLE t2 (a tinyint(3), b tinyint(3), PRIMARY KEY  (a), KEY b (b));
 INSERT INTO t2 VALUES (1,1), (2,1), (6,6), (18,17), (15,25), (16,25),
 (17,25), (10,54), (5,62),(3,68);
@ -1352,7 +1353,7 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	PRIMARY,i2	PRIMARY	4	NULL	1	Using where; Using index
 EXPLAIN SELECT a FROM t1 WHERE a < 2 GROUP BY a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	PRIMARY,i2	PRIMARY	4	NULL	1	Using where; Using index
+1	SIMPLE	t1	range	PRIMARY,i2	i2	4	NULL	1	Using where; Using index for group-by
 EXPLAIN SELECT a FROM t1 IGNORE INDEX (PRIMARY,i2);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	144	
@ -1701,7 +1702,7 @@ NULL	1
 1	2
 EXPLAIN SELECT a from t2 GROUP BY a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	index	NULL	a	10	NULL	6	Using index
+1	SIMPLE	t2	range	NULL	a	5	NULL	6	Using index for group-by
 SELECT a from t2 GROUP BY a;
 a
 NULL
@ -1715,10 +1716,17 @@ NULL
 1
 2
 insert into t2 SELECT NULL, NULL from seq_1_to_10;
 # Expect: Using index for group-by
 EXPLAIN SELECT b from t2 GROUP BY a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	a	5	NULL	9	Using index for group-by
+1	SIMPLE	t2	index	NULL	a	10	NULL	16	Using index
 # Expect: Using index for group-by
 analyze table t2;
 Table	Op	Msg_type	Msg_text
 test.t2	analyze	status	Engine-independent statistics collected
 test.t2	analyze	status	OK
 EXPLAIN SELECT b from t2 GROUP BY a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t2	range	NULL	a	5	NULL	6	Using index for group-by
 DROP TABLE t1;
 DROP TABLE t2;
 CREATE TABLE t1 ( a INT, b INT );
@ -2255,11 +2263,11 @@ INSERT INTO t2(col1, col2) VALUES
 explain
 select col1 f1, col2 f2, col1 f3 from t2 group by f1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx	5	NULL	7	Using index for group-by
+1	SIMPLE	t2	index	NULL	idx	10	NULL	20	Using index
 explain
 select SQL_BIG_RESULT col1 f1, col2 f2, col1 f3 from t2 group by f1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx	5	NULL	7	Using index for group-by
+1	SIMPLE	t2	index	NULL	idx	10	NULL	20	Using index; Using filesort
 explain
 select col1 f1, col2 f2, col1 f3 from t2 group by f1, f2;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
@ -2267,7 +2275,7 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 explain
 select col1 f1, col1 f2 from t2 group by f1, 1+1;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t2	range	NULL	idx	5	NULL	7	Using index for group-by
+1	SIMPLE	t2	index	NULL	idx	10	NULL	20	Using index
 explain
 select col1 f1, col2 f2, col1 f3 from t2 group by f1, f2, f3+0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
@ -2457,7 +2465,7 @@ test.t1	analyze	status	OK
 EXPLAIN SELECT SQL_BUFFER_RESULT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	b	b	9	NULL	2	Using where; Using index for group-by; Using temporary
+1	SIMPLE	t1	range	b	b	9	NULL	1	Using where; Using index for group-by; Using temporary
 SELECT SQL_BUFFER_RESULT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b;
 MIN(a)	b
@ -2465,7 +2473,7 @@ MIN(a)	b
 EXPLAIN SELECT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
-1	SIMPLE	t1	range	b	b	9	NULL	2	Using where; Using index for group-by
+1	SIMPLE	t1	range	b	b	9	NULL	1	Using where; Using index for group-by
 SELECT MIN(a), b FROM t1 WHERE t1.b = 'a' GROUP BY b;
 MIN(a)	b
@ -2994,3 +3002,34 @@ drop table t20, t21, t22;
 #
 # End of 10.3 tests
 #
 #
 # Test new group_min_max optimization
 #
 create table t1 (a int, b int, c int, key(a,b,c));
 insert into t1 select mod(seq,23),mod(seq,13), mod(seq,5) from seq_1_to_10000;
 explain select a from t1 where a in (1,2,3) group by a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	a	a	5	NULL	3	Using where; Using index for group-by
 explain select a from t1 where a in (1,2,3) or a = 22 group by a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	a	a	5	NULL	4	Using where; Using index for group-by
 explain select a from t1 where a in (1,2,3) and a < 3 group by a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	a	a	5	NULL	2	Using where; Using index for group-by
 explain select a,b from t1 where (a) in (1,2,3) and b in (5,6,7) group by a,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	a	a	10	NULL	9	Using where; Using index for group-by
 explain select a,b from t1 where (a,b) in ((1,1),(2,2),(3,3)) group by a,b;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	a	a	10	NULL	3	Using where; Using index for group-by
 explain select a,b,c from t1 where (a,b) in ((1,1),(2,2),(3,3)) and c=3 group by a,b,c;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	a	a	15	NULL	3	Using where; Using index for group-by
 # Will not use index for group-by
 explain select a from t1 where a in (1,2,3) and b>1 group by a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	a	a	10	NULL	983	Using where; Using index
 explain select a from t1 where a in (1,2,3) and c=1 group by a;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t1	range	a	a	5	NULL	1161	Using where; Using index
 drop table t1;
--- a/mysql-test/main/group_by.test
+++ b/mysql-test/main/group_by.test
@ -718,7 +718,8 @@ drop table t1, t2;
 CREATE TABLE t1 (a tinyint(3), b varchar(255), PRIMARY KEY  (a));
 INSERT INTO t1 VALUES (1,'-----'), (6,'Allemagne'), (17,'Autriche'), 
-    (25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France');
+    (25,'Belgique'), (54,'Danemark'), (62,'Espagne'), (68,'France'),
    (100,"No land"), (101,"No land");
 CREATE TABLE t2 (a tinyint(3), b tinyint(3), PRIMARY KEY  (a), KEY b (b));
@ -1168,7 +1169,9 @@ SELECT b from t2 GROUP BY b;
 # Show that we are using 'range' when there is more NULL rows in the table
 insert into t2 SELECT NULL, NULL from seq_1_to_10;
 EXPLAIN SELECT b from t2 GROUP BY a;
 --echo # Expect: Using index for group-by
 analyze table t2;
 EXPLAIN SELECT b from t2 GROUP BY a;
 DROP TABLE t1;
@ -2131,3 +2134,22 @@ drop table t20, t21, t22;
 --echo #
 --echo # End of 10.3 tests
 --echo #
 --echo #
 --echo # Test new group_min_max optimization
 --echo #
 create table t1 (a int, b int, c int, key(a,b,c));
 insert into t1 select mod(seq,23),mod(seq,13), mod(seq,5) from seq_1_to_10000;
 explain select a from t1 where a in (1,2,3) group by a;
 explain select a from t1 where a in (1,2,3) or a = 22 group by a;
 explain select a from t1 where a in (1,2,3) and a < 3 group by a;
 explain select a,b from t1 where (a) in (1,2,3) and b in (5,6,7) group by a,b;
 explain select a,b from t1 where (a,b) in ((1,1),(2,2),(3,3)) group by a,b;
 explain select a,b,c from t1 where (a,b) in ((1,1),(2,2),(3,3)) and c=3 group by a,b,c;
 --echo # Will not use index for group-by
 explain select a from t1 where a in (1,2,3) and b>1 group by a;
 explain select a from t1 where a in (1,2,3) and c=1 group by a;
 drop table t1;
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@ -2174,6 +2174,27 @@ int SEL_ARG::sel_cmp(Field *field, uchar *a, uchar *b, uint8 a_flag,
 }
 /*
  Check if min and values are equal
  @return 1 if equal
 */
 bool SEL_ARG::min_max_are_equal() const
 {
  uint offset= 0;
  if (field->real_maybe_null())			// If null is part of key
  {
    if (*min_value != *max_value)
      return 0;
    if (*min_value)
      return 1;					// NULL where equal
    offset= 1;                                  // Skip NULL marker
  }
  return field->key_cmp(min_value+offset, max_value+offset) == 0;
 }
 SEL_ARG *SEL_ARG::clone_tree(RANGE_OPT_PARAM *param)
 {
  SEL_ARG tmp_link,*next_arg,*root;
@ -11040,6 +11061,50 @@ uint SEL_ARG::get_max_key_part() const
 }
 /**
  Compute the number of eq_ranges top elements in the tree
  This is used by the cost_group_min_max() to calculate the number of
  groups in SEL_TREE
  @param group_key_parts number of key parts that must be equal
  @return < 0  Not known
  @return >= 0  Number of groups
 */
 int SEL_ARG::number_of_eq_groups(uint group_key_parts) const
 {
  int elements= 0;
  SEL_ARG const *cur;
  if (part > group_key_parts-1 || type != KEY_RANGE)
    return -1;
  cur= first();
  do
  {
    if ((cur->min_flag | cur->min_flag) &
        (NO_MIN_RANGE | NO_MAX_RANGE | NEAR_MIN | NEAR_MAX | GEOM_FLAG))
      return -1;
    if (min_value != max_value && !min_max_are_equal())
      return -1;
    if (part != group_key_parts -1)
    {
      int tmp;
      if (!next_key_part)
        return -1;
      if ((tmp= next_key_part->number_of_eq_groups(group_key_parts)) < 0)
        return -1;
      elements+= tmp;
    }
    else
      elements++;
  } while ((cur= cur->next));
  return elements;
 }
 /*
  Remove the SEL_ARG graph elements which have part > max_part.
@ -11092,8 +11157,8 @@ void prune_sel_arg_graph(SEL_ARG *sel_arg, uint max_part)
  @return
    tree pointer  The tree after processing,
-    NULL          If it was not possible to reduce the weight of the tree below the
+    NULL          If it was not possible to reduce the weight of the tree below
-                  limit.
+                  the limit.
 */
 SEL_ARG *enforce_sel_arg_weight_limit(RANGE_OPT_PARAM *param, uint keyno,
@ -14950,7 +15015,7 @@ get_field_keypart(KEY *index, Field *field)
    have_min             [in] True if there is a MIN function
    have_max             [in] True if there is a MAX function
    read_cost           [out] The cost to retrieve rows via this quick select
-    records             [out] The number of rows retrieved
+    out_records         [out] The number of rows retrieved
  DESCRIPTION
    This method computes the access cost of a TRP_GROUP_MIN_MAX instance and
@ -14968,6 +15033,20 @@ get_field_keypart(KEY *index, Field *field)
        either scan the index for the next value or do a new index dive
        with 'find next bigger key'.
    When using MIN() and MAX() in the query, the calls to the storage engine
    are as follows for each group:
      Assuming kp1 in ('abc','def','ghi)' and kp2 between 1000 and 2000
      read_key('abc', HA_READ_KEY_OR_NEXT)
      In case of MIN() we do:
        read_key('abc,:'1000', HA_READ_KEY_OR_NEXT)
      In case of MAX() we do
        read_key('abc,:'2000', HA_READ_PREFIX_LAST_OR_PREV)
      In the following code we will assume that the MIN key will be in
      the same block as the first key read.
      (We should try to optimize away the extra call for MAX() at some
       point).
  NOTES
    See get_best_group_min_max() for which kind of queries this function
    will be called.
@ -15013,36 +15092,27 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
                        uint group_key_parts, SEL_TREE *range_tree,
                        SEL_ARG *index_tree, ha_rows quick_prefix_records,
                        bool have_min, bool have_max,
-                        double *read_cost, ha_rows *records)
+                        double *read_cost, ha_rows *out_records)
 {
-  uint    keys_per_block, key_length;
+  uint    key_length;
-  ha_rows table_records;
+  ha_rows records;
  ha_rows num_groups;
  ha_rows num_blocks;
  ha_rows keys_per_group;
  ha_rows keys_per_subgroup; /* Average number of keys in sub-groups */
                          /* formed by a key infix. */
  double p_overlap; /* Probability that a sub-group overlaps two blocks. */
  double quick_prefix_selectivity;
-  double io_cost;
+  ulonglong io_cost;
  handler *file= table->file;
  DBUG_ENTER("cost_group_min_max");
  /* Same code as in handler::key_read_time() */
-  table_records= table->stat_records();
+  records= table->stat_records();
  key_length= (index_info->key_length + file->ref_length);
  num_blocks= (table_records * key_length / INDEX_BLOCK_FILL_FACTOR_DIV *
               INDEX_BLOCK_FILL_FACTOR_MUL) / file->stats.block_size + 1;
  keys_per_block= (file->stats.block_size /
                   (key_length * INDEX_BLOCK_FILL_FACTOR_MUL /
                    INDEX_BLOCK_FILL_FACTOR_DIV) +
                   1);
  /* Compute the number of keys in a group. */
  if (!group_key_parts)
  {
    /* Summary over the whole table */
-    keys_per_group= MY_MAX(table_records,1);
+    keys_per_group= MY_MAX(records,1);
  }
  else
  {
@ -15050,24 +15120,38 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
                                                             1);
    if (keys_per_group == 0) /* If there is no statistics try to guess */
    {
-      /* each group contains 10% of all records */
+      /* each group contains 1% of all records */
-      keys_per_group= (table_records / 10) + 1;
+      keys_per_group= (records / 100) + 1;
    }
  }
  if (keys_per_group > 1)
-    num_groups= (table_records / keys_per_group) + 1;
+    num_groups= (records / keys_per_group) + 1;
  else
-    num_groups= table_records;
+    num_groups= records;
  /* Apply the selectivity of the quick select for group prefixes. */
  if (range_tree && (quick_prefix_records != HA_POS_ERROR))
  {
    int groups;
    quick_prefix_selectivity= (double) quick_prefix_records /
-                              (double) table_records;
+                              (double) records;
    num_groups= (ha_rows) rint(num_groups * quick_prefix_selectivity);
    records= quick_prefix_records;
    /*
      Try to handle cases like
      WHERE a in (1,2,3) GROUP BY a
      If all ranges are eq_ranges for the group_key_parts we can use
      this as the number of groups.
    */
    groups= index_tree->number_of_eq_groups(group_key_parts);
    if (groups > 0)
      num_groups= groups;
    else
    {
      /*
        Expect at least as many groups as there is ranges in the index
      WHERE a in (1,2,3) GROUP BY a
        This is mostly relevant for queries with few records, which is
        something we have a lot of in our test suites.
@ -15078,34 +15162,19 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
        tree.
      */
      set_if_bigger(num_groups, MY_MAX(index_tree->elements, 1));
    }
    /* There cannot be more groups than matched records */
    set_if_smaller(num_groups, quick_prefix_records);
  }
-  DBUG_ASSERT(num_groups <= table_records);
+  DBUG_ASSERT(num_groups <= records);
-  if (used_key_parts > group_key_parts)
+
-  {
+  /* Calculate the number of blocks we will touch for the table or range scan */
-    /*
+  num_blocks= (records * key_length / INDEX_BLOCK_FILL_FACTOR_DIV *
-      Compute the probability that two ends of a subgroup are inside
+               INDEX_BLOCK_FILL_FACTOR_MUL) / file->stats.block_size + 1;
-      different blocks.
+
-    */
+  io_cost= (have_max) ? num_groups*2 : num_groups;
-    keys_per_subgroup= (ha_rows) index_info->actual_rec_per_key(used_key_parts -
+  set_if_smaller(io_cost, num_blocks);
                                                                1);
    if (keys_per_subgroup >= keys_per_block) /* If a subgroup is bigger than */
      p_overlap= 1.0;       /* a block, it will overlap at least two blocks. */
    else
    {
      double blocks_per_group= (double) num_blocks / (double) num_groups;
      p_overlap= (blocks_per_group * (keys_per_subgroup - 1)) / keys_per_group;
      p_overlap= MY_MIN(p_overlap, 1.0);
    }
    io_cost= (double) MY_MIN(num_groups * (1 + p_overlap), num_blocks);
  }
  else
    io_cost= ((keys_per_group > keys_per_block) ?
              (have_min && have_max) ? (double) (num_groups + 1) :
              (double) num_groups :
              (double) num_blocks);
  /*
    CPU cost must be comparable to that of an index scan as computed
@ -15118,13 +15187,13 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts,
  *read_cost= file->ha_keyread_and_compare_time(keyno, (ulong) num_groups,
                                                num_groups,
                                                io_cost);
-  *records= num_groups;
+  *out_records= num_groups;
  DBUG_PRINT("info",
-             ("table rows: %lu  keys/block: %u  keys/group: %lu  "
+             ("rows: %lu  keys/group: %lu  "
              "result rows: %lu  blocks: %lu",
-              (ulong) table_records, keys_per_block, (ulong) keys_per_group, 
+              (ulong) records, (ulong) keys_per_group,
-              (ulong) *records, (ulong) num_blocks));
+              (ulong) *out_records, (ulong) num_blocks));
  DBUG_VOID_RETURN;
 }
--- a/sql/opt_range.h
+++ b/sql/opt_range.h
@ -302,6 +302,7 @@ class SEL_ARG :public Sql_alloc
 {
  static int sel_cmp(Field *field, uchar *a, uchar *b, uint8 a_flag,
                     uint8 b_flag);
  bool min_max_are_equal() const;
 public:
  uint8 min_flag,max_flag,maybe_flag;
  uint8 part;					// Which key part
@ -401,6 +402,7 @@ public:
      return false;
    return true;
  }
  int number_of_eq_groups(uint group_key_parts) const;
  inline void merge_flags(SEL_ARG *arg) { maybe_flag|=arg->maybe_flag; }
  inline void maybe_smaller() { maybe_flag=1; }
  /* Return true iff it's a single-point null interval */