Fix off-by-one error in Histogram_json_hb::find_bucket

2021-09-14 14:29:41 +03:00 · 2021-09-14 14:29:41 +03:00 · 28ad128585
commit 28ad128585
parent b179640219
4 changed files with 70 additions and 13 deletions
--- a/mysql-test/main/statistics_json.result
+++ b/mysql-test/main/statistics_json.result
@ -4093,12 +4093,12 @@ test.t2	analyze	status	Engine-independent statistics collected
 test.t2	analyze	status	OK
 explain extended select * from t2 where city = 'Moscow';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	50.00	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	98.02	Using where
 Warnings:
 Note	1003	select `test`.`t2`.`city` AS `city` from `test`.`t2` where `test`.`t2`.`city` = 'Moscow'
 analyze select * from t2 where city = 'Moscow';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	101.00	50.00	98.02	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	101.00	98.02	98.02	Using where
 explain extended select * from t2 where city = 'Helsinki';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	1.98	Using where
--- a/mysql-test/main/statistics_json.test
+++ b/mysql-test/main/statistics_json.test
@ -182,4 +182,3 @@ SET histogram_type= JSON_HB;
 ANALYZE TABLE t1 PERSISTENT FOR ALL;
 SELECT * FROM t1;
 drop table t1;
-
--- a/sql/opt_histogram_json.cc
+++ b/sql/opt_histogram_json.cc
@ -483,12 +483,12 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,

  // If the value is outside of the histogram's range, this will "clip" it to
  // first or last bucket.
-  int idx= find_bucket(field, key, false);
+  bool equal;
+  int idx= find_bucket(field, key, &equal);

  double sel;

-  if (buckets[idx].ndv == 1 &&
-      field->key_cmp((uchar*)buckets[idx].start_value.data(), key))
+  if (buckets[idx].ndv == 1 && !equal)
  {
    // The bucket has a single value and it doesn't match! Use the global
    // average.
@ -550,7 +550,18 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,

    // Find the leftmost bucket that contains the lookup value.
    // (If the lookup value is to the left of all buckets, find bucket #0)
-    int idx= find_bucket(field, min_key, exclusive_endp);
+    bool equal;
+    int idx= find_bucket(field, min_key, &equal);
+    if (equal && exclusive_endp && buckets[idx].ndv==1 &&
+        idx < (int)buckets.size()-1)
+    {
+      /*
+        The range is "col > $CONST" and we've found a bucket that contains
+        only the value $CONST. Move to the next bucket.
+        TODO: what if the last value in the histogram is a popular one?
+      */
+      idx++;
+    }
    double left_fract= get_left_fract(idx);
    double sel= position_in_interval(field, min_key, min_key_len,
                                     buckets[idx].start_value,
@ -573,8 +584,18 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
      max_key++;
      max_key_len--;
    }
+    bool equal;
+    int idx= find_bucket(field, max_key, &equal);

-    int idx= find_bucket(field, max_key, inclusive_endp);
+    if (equal && !inclusive_endp && idx > 0)
+    {
+      /*
+        The range is "col < $CONST" and we've found a bucket starting with
+        $CONST. Move to the previous bucket.
+        TODO: what if the first value is the popular one?
+      */
+      idx--;
+    }
    double left_fract= get_left_fract(idx);
    double sel= position_in_interval(field, max_key, max_key_len,
                                     buckets[idx].start_value,
@ -616,22 +637,59 @@ void Histogram_json_hb::serialize(Field *field)
 */

 int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
-                                   bool equal_is_less)
+                                   bool *equal)
 {
+  int res;
  int low= 0;
  int high= (int)buckets.size() - 1;
+  *equal= false;  // set to true only if some bucket starts at lookup_val

  while (low + 1 < high)
  {
    int middle= (low + high) / 2;
-    int res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
+    res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
    if (!res)
-      res= equal_is_less? -1: 1;
-    if (res < 0)
+    {
+      *equal= true;
+      return middle;
+    }
+    else if (res < 0)
      low= middle;
    else //res > 0
      high= middle;
  }

+  /*
+    If low and high were both assigned in the loop above, then neither of
+    their start values is equal to the lookup value:
+
+      bucket[low] < lookup_val < bucket[high]
+
+    But low=0 and high=last_bucket may still hold their initial values, which
+    were never compared. Handle them below (res <= 0: start <= lookup_val).
+  */
+  if (low == 0)
+  {
+    res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
+    if (!res)
+      *equal= true;
+    else if (res < 0)
+    {
+      res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
+      if (!res)
+        *equal= true;
+      if (res <= 0)  // bucket[high] starts at or before lookup_val
+        low= high;
+    }
+  }
+  else if (high == (int)buckets.size() - 1)
+  {
+    res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
+    if (!res)
+      *equal= true;
+    if (res <= 0)  // bucket[high] starts at or before lookup_val
+      low= high;
+  }
+
  return low;
 }
--- a/sql/opt_histogram_json.h
+++ b/sql/opt_histogram_json.h
@ -123,6 +123,6 @@ public:
 private:
  double get_left_fract(int idx);
  std::string& get_end_value(int idx);
-  int find_bucket(Field *field, const uchar *lookup_val, bool equal_is_less);
+  int find_bucket(Field *field, const uchar *lookup_val, bool *equal);
 };