Use binary search to compute range selectivity

* It also adds an "explain select" statement to the test so that the fprintf calls can print the computed intervals to mysqld.1.err. Signed-off-by: Michael Okoko <okokomichaels@outlook.com>
2021-08-16 10:09:56 +01:00 · 2021-08-16 10:09:56 +01:00 · c129689ddc
commit c129689ddc
parent c605285bb8
4 changed files with 89 additions and 99 deletions
--- a/mysql-test/main/statistics_json.result
+++ b/mysql-test/main/statistics_json.result
@ -67,33 +67,11 @@ test	t1	d	1	25	0.0000	8.0000	1.0000	10	JSON	[
  "21",
  "23"
 ]
-SELECT * FROM t1;
+explain extended select * from t1 where b between '20' and '70';
-a	b	c	d
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	1	1	1
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	25	10.00	Using where
-2	2	2	2
+Warnings:
-3	3	3	3
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c`,`test`.`t1`.`d` AS `d` from `test`.`t1` where `test`.`t1`.`b` between '20' and '70'
 4	4	4	4
 5	5	5	5
 6	6	6	6
 7	7	7	7
 8	8	8	8
 9	9	9	9
 10	10	10	10
 11	11	11	11
 12	12	12	12
 13	13	13	13
 14	14	14	14
 15	15	15	15
 16	16	16	16
 17	17	17	17
 18	18	18	18
 19	19	19	19
 20	20	20	20
 21	21	21	21
 22	22	22	22
 23	23	23	23
 24	24	24	24
 25	25	25	25
 UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';
 FLUSH TABLES;
 SELECT * FROM t1;
--- a/mysql-test/main/statistics_json.test
+++ b/mysql-test/main/statistics_json.test
@ -28,7 +28,7 @@ set histogram_size=10;
 ANALYZE TABLE t1 PERSISTENT FOR ALL;
 SELECT * FROM mysql.column_stats WHERE table_name='t1';
-SELECT * FROM t1;
+explain extended select * from t1 where b between '20' and '70';
 # We then test different valid JSON strings that are invalid histograms.
 UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@ -1466,45 +1466,7 @@ double Histogram_json::range_selectivity_new(Field *field, key_range *min_endp,
                                             key_range *max_endp)
 {
  fprintf(stderr, "Histogram_json::range_selectivity_new\n");
-
+  double min_sel, max_sel;
  /*
    GSOC-TODO: 
    The code below is NOT what this function should have.
    == WHAT THIS CODE DOES ==
    At the moment it does a linear walk through histogram_bounds and compares 
    min_endp to each of histogram bucket's min and max. 
    ATTENTION:  This is a demo of how key_cmp() is used to compare the values.
    When it finds the bucket such that BUCKET_START < min_endp < BUCKET_END, 
    it computes a position of min_endp within the bucket.
    ATTENTION: calls to pos_in_interval_.... are a demo of how to compute 
    position of a value within a [min,max] range.
    == WHAT THIS CODE SHOULD DO ==
    * Use binary search to locate the range  [MIN_BUCKET; MAX_BUCKET] - the
      set of buckets that overlaps with the search interval {min_endp, max_endp}.
    * If the search interval covers MIN_BUCKET only partially, compute a
      position of min_endp within the bucket.
    * The same for max_endp.
    * Compute the final selectivity and return it.
  */
  std::string prev_s;
  bool have_prev_s=false;
  for (auto &s : histogram_bounds)
  {
    if (!have_prev_s)
    {
      prev_s = s;
      have_prev_s= true;
      continue;
    }
    // It's a test code, so we only process min_endp.
  if (min_endp)
  {
    const uchar *min_key= min_endp->key;
@ -1513,43 +1475,86 @@ double Histogram_json::range_selectivity_new(Field *field, key_range *min_endp,
    if (field->real_maybe_null())
      min_key++;
-      int res1= field->key_cmp((uchar*)prev_s.data(), min_key);
+    min_sel= selection_in_interval(field, min_key);
-      const char *str1="<";
+    fprintf(stderr, "min pos_in_interval(min_endp)=%g\n", min_sel);
-      if (res1>0) str1=">";
+  }
-      if (res1==0) str1="=";
+  if (max_endp)
      int res2= field->key_cmp(min_key, (uchar*)s.data());
      const char *str2="<";
      if (res2>0) str2=">";
      if (res2==0) str2="=";
      fprintf(stderr, "prev_bound %s min_key %s bound\n", str1, str2);
      if (res1<0 && res2 < 0)
  {
-        double sel;
+    const uchar *max_key= max_endp->key;
-        if (field->pos_through_val_str())
+    if (field->real_maybe_null())
-          sel= pos_in_interval_through_strxfrm(field, (uchar*)prev_s.data(), 
+      max_key++;
                                               (uchar*)s.data(), (uchar*)min_key);
        else
          sel= pos_in_interval_through_val_real(field, (uchar*)prev_s.data(), 
                                                (uchar*)s.data(), (uchar*)min_key);
-        fprintf(stderr, "  pos_in_interval=%g\n", sel);
+    max_sel= selection_in_interval(field, max_key);
    fprintf(stderr, "max pos_in_interval(min_endp)=%g\n", max_sel);
  }
      prev_s= s;
    }
  }
  fprintf(stderr, "Histogram_json::range_selectivity_new ends\n");
  return 0.5;
 }
 /*
   Compute the relative position of an endpoint inside the histogram bucket
   it falls into.

   @param field     Field whose comparison/interpolation routines are used.
   @param endpoint  Key image of the value to locate (any NULL-indicator
                    byte must already be skipped by the caller).

   @return
     - 0 when find_bucket() reports -1 (no bucket found for the endpoint);
     - 1.0 when the bucket found starts at the last histogram bound, so there
       is no upper bound to interpolate against;
     - otherwise whatever pos_in_interval_through_strxfrm() or
       pos_in_interval_through_val_real() computes for the endpoint between
       the bucket's lower and upper bound.
 */
 double Histogram_json::selection_in_interval(Field *field, const uchar* endpoint)
 {
  const int min_bucket_idx= find_bucket(field, endpoint);
  if (min_bucket_idx == -1)
    return 0;

  /*
    The bucket is delimited by two consecutive bounds. If the lower bound is
    the last entry there is no upper bound; interpolating against a made-up
    empty value would hand the helpers a buffer that is not a valid key
    image, so treat the endpoint as lying at the far end instead.
    NOTE(review): 1.0 is chosen as the counterpart of the 0 returned above;
    confirm once range_selectivity_new() actually consumes this value.
  */
  const int max_bucket_idx= min_bucket_idx + 1;
  if (max_bucket_idx >= (int) histogram_bounds.size())
    return 1.0;

  // Bind by reference: the bounds are only read, no need to copy them.
  const std::string &min_bucket= histogram_bounds[min_bucket_idx];
  const std::string &max_bucket= histogram_bounds[max_bucket_idx];

  if (field->pos_through_val_str())
    return pos_in_interval_through_strxfrm(field, (uchar *) min_bucket.data(),
                                           (uchar *) max_bucket.data(),
                                           (uchar *) endpoint);
  return pos_in_interval_through_val_real(field, (uchar *) min_bucket.data(),
                                          (uchar *) max_bucket.data(),
                                          (uchar *) endpoint);
 }
 /*
   Write the histogram into the given field.

   The buffer returned by get_values() is handled as a NUL-terminated string:
   its length is taken with strlen() and the bytes are stored using
   my_charset_bin.
 */
 void Histogram_json::serialize(Field *field)
 {
  char *json_text= (char*) get_values();
  const size_t json_length= strlen(json_text);
  field->store(json_text, json_length, &my_charset_bin);
 }
 /*
   Binary-search histogram_bounds for the bucket an endpoint belongs to.

   @param field     Field providing key_cmp() to compare a stored bound with
                    the endpoint (a negative result means bound < endpoint).
   @param endpoint  Key image of the value to locate.

   @return
     Index of the biggest bound that is smaller than the endpoint. If a bound
     compares equal to the endpoint, the index of that bound. -1 when no bound
     is smaller than or equal to the endpoint (this includes an empty
     histogram).

   Assumes histogram_bounds is sorted in ascending key_cmp() order -- TODO
   confirm against the code that fills it.
 */
 int Histogram_json::find_bucket(Field *field, const uchar *endpoint)
 {
  int low= 0;
  int high= (int) histogram_bounds.size() - 1;
  // Best candidate so far: the last probed bound known to be < endpoint.
  int min_bucket_index= -1;

  while (low <= high)
  {
    // Midpoint rounded up, same probe as ceil((low + high) / 2).
    const int mid= low + (high - low + 1) / 2;
    const std::string &mid_val= histogram_bounds[mid];
    const int res= field->key_cmp((uchar*) mid_val.data(), endpoint);

    if (res < 0)
    {
      /*
        This bound is below the endpoint: remember it and keep looking to
        the right for a bigger one that still qualifies.
      */
      min_bucket_index= mid;
      low= mid + 1;
    }
    else if (res > 0)
    {
      // Bound is above the endpoint: it must not become the answer.
      high= mid - 1;
    }
    else
    {
      // todo: endpoint is on a bucket boundary
      return mid;
    }
  }
  return min_bucket_index;
 }
 /*
  An object of the class Index_stat is created to read statistical
  data on tables from the statistical table table_stat, to update
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@ -419,6 +419,13 @@ public:
  */
  double range_selectivity_new(Field *field, key_range *min_endp,
                                       key_range *max_endp) override;
  /*
   * Binary-searches histogram_bounds, comparing each probed bound with
   * endpoint through field->key_cmp(), and returns an index into
   * histogram_bounds.
   * Intended result: the index of the biggest histogram value that is smaller
   * than endpoint. If a bound compares equal to endpoint, the index of that
   * bound is returned. Returns -1 when histogram_bounds is empty.
   */
  int find_bucket(Field *field, const uchar *endpoint);
  /*
   * Locates the bucket for endpoint with find_bucket() and returns the value
   * computed by pos_in_interval_through_strxfrm() or
   * pos_in_interval_through_val_real() (selected by
   * field->pos_through_val_str()) for endpoint between that bucket's bound
   * and the following one. Returns 0 when find_bucket() yields -1.
   */
  double selection_in_interval(Field *field, const uchar* endpoint);
 };
 class Columns_statistics;