Fix off-by-one error in Histogram_json_hb::find_bucket

2021-09-14 14:29:41 +03:00 · 2021-09-14 14:29:41 +03:00 · 28ad128585
commit 28ad128585
parent b179640219
4 changed files with 70 additions and 13 deletions
--- a/mysql-test/main/statistics_json.result
+++ b/mysql-test/main/statistics_json.result
@@ -4093,12 +4093,12 @@ test.t2	analyze	status	Engine-independent statistics collected
 test.t2	analyze	status	OK
 explain extended select * from t2 where city = 'Moscow';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	50.00	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	98.02	Using where
 Warnings:
 Note	1003	select `test`.`t2`.`city` AS `city` from `test`.`t2` where `test`.`t2`.`city` = 'Moscow'
 analyze select * from t2 where city = 'Moscow';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	101.00	50.00	98.02	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	101.00	98.02	98.02	Using where
 explain extended select * from t2 where city = 'Helsinki';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	1.98	Using where
--- a/mysql-test/main/statistics_json.test
+++ b/mysql-test/main/statistics_json.test
@@ -182,4 +182,3 @@ SET histogram_type= JSON_HB;
 ANALYZE TABLE t1 PERSISTENT FOR ALL;
 SELECT * FROM t1;
 drop table t1;
--- a/sql/opt_histogram_json.cc
+++ b/sql/opt_histogram_json.cc
@@ -483,12 +483,12 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
  // If the value is outside of the histogram's range, this will "clip" it to
  // first or last bucket.
-  int idx= find_bucket(field, key, false);
+  bool equal;
  int idx= find_bucket(field, key, &equal);
  double sel;
-  if (buckets[idx].ndv == 1 &&
+  if (buckets[idx].ndv == 1 && !equal)
-      field->key_cmp((uchar*)buckets[idx].start_value.data(), key))
  {
    // The bucket has a single value and it doesn't match! Use the global
    // average.
@@ -550,7 +550,18 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
    // Find the leftmost bucket that contains the lookup value.
    // (If the lookup value is to the left of all buckets, find bucket #0)
-    int idx= find_bucket(field, min_key, exclusive_endp);
+    bool equal;
    int idx= find_bucket(field, min_key, &equal);
    if (equal && exclusive_endp && buckets[idx].ndv==1 &&
        idx < (int)buckets.size()-1)
    {
      /*
        The range is "col > $CONST" and we've found a bucket that contains
        only the value $CONST. Move to the next bucket.
        TODO: what if the last value in the histogram is a popular one?
      */
      idx++;
    }
    double left_fract= get_left_fract(idx);
    double sel= position_in_interval(field, min_key, min_key_len,
                                     buckets[idx].start_value,
@@ -573,8 +584,18 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
      max_key++;
      max_key_len--;
    }
    bool equal;
    int idx= find_bucket(field, max_key, &equal);
-    int idx= find_bucket(field, max_key, inclusive_endp);
+    if (equal && !inclusive_endp && idx > 0)
    {
      /*
        The range is "col < $CONST" and we've found a bucket starting with
        $CONST. Move to the previous bucket.
        TODO: what if the first value is the popular one?
      */
      idx--;
    }
    double left_fract= get_left_fract(idx);
    double sel= position_in_interval(field, max_key, max_key_len,
                                     buckets[idx].start_value,
@@ -616,22 +637,59 @@ void Histogram_json_hb::serialize(Field *field)
 */
 int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
-                                   bool equal_is_less)
+                                   bool *equal)
 {
  // Holds the latest key_cmp(bucket start value, lookup_val) result. The
  // search below treats res < 0 as "bucket start sorts before lookup_val".
  int res;
  int low= 0;
  int high= (int)buckets.size() - 1;
  // NOTE(review): buckets[0] is read unconditionally further down, so this
  // assumes the histogram has at least one bucket -- confirm with callers.
  *equal= false;
  // Binary search over bucket start values. Returns early when a bucket
  // starts exactly at lookup_val.
  while (low + 1 < high)
  {
    int middle= (low + high) / 2;
-    int res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
+    res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
    if (!res)
-      res= equal_is_less? -1: 1;
+    {
-    if (res < 0)
+      *equal= true;
      return middle;
    }
    else if (res < 0)
      low= middle;
    else //res > 0
      high= middle;
  }
  /*
    If low and high were assigned a value in the above loop, then they are not
    equal to the lookup value:
      bucket[low] < lookup_val < bucket[high]
    But there are two special cases: low=0 and high=last_bucket. Those
    endpoints may never have been compared against lookup_val. Handle them
    below.
  */
  if (low == 0)
  {
    res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
    if (!res)
      *equal= true;
    else if (res < 0)
    {
      // buckets[0] < lookup_val. Move on to buckets[high] only if it starts
      // at or before lookup_val; if it starts after lookup_val, the value
      // lies inside bucket 0 and low must stay where it is.
      res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
      if (!res)
        *equal= true;
      if (res <= 0)
        low= high;
    }
    // else: lookup_val sorts before the first bucket; clip to bucket 0.
  }
  else if (high == (int)buckets.size() - 1)
  {
    // bucket[low] < lookup_val is already known from the loop. Pick the last
    // bucket only when it starts at or before lookup_val (this also clips
    // values beyond the histogram's range to the last bucket).
    res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
    if (!res)
      *equal= true;
    if (res <= 0)
      low= high;
  }
  return low;
 }
--- a/sql/opt_histogram_json.h
+++ b/sql/opt_histogram_json.h
@@ -123,6 +123,6 @@ public:
 private:
  double get_left_fract(int idx);
  std::string& get_end_value(int idx);
-  int find_bucket(Field *field, const uchar *lookup_val, bool equal_is_less);
+  int find_bucket(Field *field, const uchar *lookup_val, bool *equal);
 };