MDEV-26886: Estimation for filtered rows less precise with JSON histogram

- Make Histogram_json_hb::range_selectivity handle singleton buckets specially when computing the selectivity of the max. endpoint bound (for the min. endpoint, we already do that).
- Also, fix comments for Histogram_json_hb::find_bucket.
2021-11-26 20:03:08 +03:00 · 2021-11-26 20:03:08 +03:00 · eb6a9ad705
commit eb6a9ad705
parent 106c785e2d
4 changed files with 84 additions and 29 deletions
--- a/mysql-test/main/statistics_json.result
+++ b/mysql-test/main/statistics_json.result
@ -7658,3 +7658,28 @@ test.t1	analyze	status	OK
 ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
 INSERT INTO t1 (f) VALUES ('bar');
 DROP TABLE t1;
 #
 # MDEV-26886: Estimation for filtered rows less precise with JSON histogram
 #
 create table t1 (a tinyint) as select if(seq%3,seq,0) as a from seq_1_to_100;
 select count(*) from t1 where a <= 0;
 count(*)
 33
 set histogram_type = JSON_HB, histogram_size=default;
 analyze table t1 persistent for all;
 Table	Op	Msg_type	Msg_text
 test.t1	analyze	status	Engine-independent statistics collected
 test.t1	analyze	status	OK
 analyze select * from t1 where a <= 0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	100.00	33.00	33.00	Using where
 analyze select * from t1 where a < 0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	100.00	1.47	0.00	Using where
 analyze select * from t1 where a > 0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	100.00	67.00	67.00	Using where
 analyze select * from t1 where a >= 0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	100.00	100.00	100.00	Using where
 drop table t1;
--- a/mysql-test/main/statistics_json.test
+++ b/mysql-test/main/statistics_json.test
@ -340,3 +340,17 @@ ANALYZE TABLE t1 PERSISTENT FOR ALL;
 ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
 INSERT INTO t1 (f) VALUES ('bar');
 DROP TABLE t1;
 --echo #
 --echo # MDEV-26886: Estimation for filtered rows less precise with JSON histogram
 --echo #
 create table t1 (a tinyint) as select if(seq%3,seq,0) as a from seq_1_to_100;
 select count(*) from t1 where a <= 0;
 set histogram_type = JSON_HB, histogram_size=default;
 analyze table t1 persistent for all;
 analyze select * from t1 where a <= 0;
 analyze select * from t1 where a < 0;
 analyze select * from t1 where a > 0;
 analyze select * from t1 where a >= 0;
 drop table t1;
--- a/sql/opt_histogram_json.cc
+++ b/sql/opt_histogram_json.cc
@ -743,9 +743,22 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
      idx--;
    }
    double left_fract= get_left_fract(idx);
-    double sel= position_in_interval(field, max_key, max_key_len,
+
-                                     buckets[idx].start_value,
+    double sel;
-                                     get_end_value(idx));
+    /* Special handling for singleton buckets */
    if (buckets[idx].ndv == 1 && equal)
    {
      if (inclusive_endp)
        sel= 1.0;
      else
        sel= 0.0;
    }
    else
    {
      sel= position_in_interval(field, max_key, max_key_len,
                                buckets[idx].start_value,
                                get_end_value(idx));
    }
    max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
  }
  else
@ -763,26 +776,18 @@ void Histogram_json_hb::serialize(Field *field)
 /*
-  Find the rightmost histogram bucket such that "lookup_val $GT start_value".
+  @brief
   Find the rightmost histogram bucket such that "lookup_val >= start_value".
-  $GT is either '>' or '>=' depending on equal_is_less parameter.
+  @param field        Field object (used to do value comparisons)
  @param lookup_val   The lookup value in KeyTupleFormat.
  @param equal  OUT   TRUE<=> the found bucket has left_bound=lookup_val
-  @param equal_is_less Controls what to do if a histogram bound is equal to the
+  @return
-                       lookup_val.
+     The bucket index
  @detail
    Possible cases:
    1. The regular case: the value falls into some bucket.
    2. The value is less than the minimum of the first bucket
    3. The value is greater than the maximum of the last bucket
      In these cases we "clip" to the first/last bucket.
    4. The value hits the bucket boundary. Then, we need to know whether the
       point of interest is to the left of the constant, or to the right of it.
 */
-int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
+int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
                                   bool *equal)
 {
  int res;
@ -797,7 +802,8 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
    if (!res)
    {
      *equal= true;
-      return middle;
+      low= middle;
      goto end;
    }
    else if (res < 0)
      low= middle;
@ -806,25 +812,25 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
  }
  /*
-    If low and high were assigned a value in the above loop, then they are not
+    If low and high were assigned a value in the above loop and we got here,
-    equal to the lookup value:
+    then the following holds:
-      bucket[low] < lookup_val < bucket[high]
+      bucket[low].start_value < lookup_val < bucket[high].start_value
-    But there are two special cases: low=0 and high=last_bucket. Handle them
+    Besides that, there are two special cases: low=0 and high=last_bucket.
-    below.
+    Handle them below.
  */
  if (low == 0)
  {
    res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
    if (!res)
      *equal= true;
-    else if (res < 0)
+    else if (res < 0) //  buckets[0] < lookup_val
    {
      res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
      if (!res)
        *equal= true;
-      if (res >= 0)
+      if (res <= 0) // buckets[high] <= lookup_val
        low= high;
    }
  }
@ -833,9 +839,19 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
    res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
    if (!res)
      *equal= true;
-    if (res >= 0)
+    if (res <= 0)
      low= high;
  }
 end:
  // Verification: *equal==TRUE <=> lookup value is equal to the found bucket.
  DBUG_ASSERT(*equal == !(field->key_cmp((uchar*)buckets[low].start_value.data(),
                                         lookup_val)));
  // buckets[low] <= lookup_val, with one exception of the first bucket.
  DBUG_ASSERT(low == 0 ||
              field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0);
  // buckets[low+1] > lookup_val, with one exception of the last bucket
  DBUG_ASSERT(low == (int)buckets.size()-1 ||
              field->key_cmp((uchar*)buckets[low+1].start_value.data(), lookup_val)> 0);
  return low;
 }
--- a/sql/opt_histogram_json.h
+++ b/sql/opt_histogram_json.h
@ -124,6 +124,6 @@ public:
 private:
  double get_left_fract(int idx);
  std::string& get_end_value(int idx);
-  int find_bucket(Field *field, const uchar *lookup_val, bool *equal);
+  int find_bucket(const Field *field, const uchar *lookup_val, bool *equal);
 };