Code cleanup part#2: do not copy key values in xxx_selectivity() functions

2021-08-29 19:32:25 +03:00 · 2021-08-29 19:32:25 +03:00 · fcf58a5e0f
commit fcf58a5e0f
parent 2a1cdbabec
3 changed files with 84 additions and 121 deletions
--- a/mysql-test/main/statistics_json.result
+++ b/mysql-test/main/statistics_json.result
@@ -2444,15 +2444,15 @@ test	t1_json	a	a-0	a-9	0.0000	3.0000	1.0000	100	JSON_HB	{
 }
 explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1_json	ALL	NULL	NULL	NULL	NULL	10	60.87	Using where
+1	SIMPLE	t1_json	ALL	NULL	NULL	NULL	NULL	10	59.87	Using where
 Warnings:
 Note	1003	select `test`.`t1_json`.`a` AS `a` from `test`.`t1_json` where `test`.`t1_json`.`a` between 'a-3a' and 'zzzzzzzzz'
 analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
-1	SIMPLE	t1_json	ALL	NULL	NULL	NULL	NULL	10	10.00	60.87	60.00	Using where
+1	SIMPLE	t1_json	ALL	NULL	NULL	NULL	NULL	10	10.00	59.87	60.00	Using where
 explain extended select * from t1_json where a < 'b-1a';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1_json	ALL	NULL	NULL	NULL	NULL	10	100.00	Using where
+1	SIMPLE	t1_json	ALL	NULL	NULL	NULL	NULL	10	99.00	Using where
 Warnings:
 Note	1003	select `test`.`t1_json`.`a` AS `a` from `test`.`t1_json` where `test`.`t1_json`.`a` < 'b-1a'
 analyze select * from t1_json where a > 'zzzzzzzzz';
@@ -2476,12 +2476,12 @@ test.t2	analyze	status	Engine-independent statistics collected
 test.t2	analyze	status	OK
 explain extended select * from t2 where city = 'Moscow';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	98.04	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	96.08	Using where
 Warnings:
 Note	1003	select `test`.`t2`.`city` AS `city` from `test`.`t2` where `test`.`t2`.`city` = 'Moscow'
 analyze select * from t2 where city = 'Moscow';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	101.00	98.04	98.02	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	101.00	96.08	98.02	Using where
 explain extended select * from t2 where city = 'Helsinki';
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	101	2.00	Using where
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -63,16 +63,7 @@
  equal to "never".
 */

-/*
-   * json_get_array_items expects a JSON array as argument,
-   * and pushes the elements of the array into the `container` vector.
-   * It only works if all the elements in the original JSON array
-   * are scalar values (i.e., strings, numbers, true or false),
-   * else, the JSON type encountered is stored in value_type and the function returns false.
-   */
-bool json_get_array_items(const char *json, const char *json_end, int *value_type, std::vector<std::string> &container);
-
-Histogram_base *create_histogram(Histogram_type hist_type);
+Histogram_base *create_histogram(MEM_ROOT *mem_root, Histogram_type hist_type);

 /* Currently there are only 3 persistent statistical tables */
 static const uint STATISTICS_TABLES= 3;
@@ -1235,18 +1226,9 @@ public:
      Field *stat_field= stat_table->field[fldno];
      table_field->read_stats->set_not_null(fldno);
      stat_field->val_str(&val);
-      switch (table_field->read_stats->histogram_type_on_disk)
-      {
-      case SINGLE_PREC_HB:
-      case DOUBLE_PREC_HB:
-        hist = new (mem_root) Histogram_binary();
-        break;
-      case JSON_HB:
-        hist = new (mem_root) Histogram_json();
-        break;
-      default:
+      hist= create_histogram(mem_root, table_field->read_stats->histogram_type_on_disk);
+      if (!hist)
        return NULL;
-      }
      if (!hist->parse(mem_root, table_field,
                       table_field->read_stats->histogram_type_on_disk, 
                       val.ptr(), val.length()))
@@ -1415,7 +1397,6 @@ double pos_in_interval_through_val_real(Field *field,
                                      uchar *max_val, 
                                      uchar *midpoint_val)
 {
-
  // For each passed value: unpack it into Field's current value. Then, we can
  // get the value as double.

@@ -1526,114 +1507,105 @@ double Histogram_json::point_selectivity(Field *field, key_range *endpoint, doub
  const uchar *min_key = endpoint->key;
  if (field->real_maybe_null())
    min_key++;
-  uint min_idx= find_bucket(field, min_key);
-
-  uint max_idx= min_idx;
+  uint min_idx= find_bucket(field, min_key, false);

+  uint max_idx= find_bucket(field, min_key, true);
+#if 0
  // find how many buckets this value occupies
  while ((max_idx + 1 < get_width() ) &&
         (field->key_cmp((uchar *)histogram_bounds[max_idx + 1].data(), min_key) == 0)) {
    max_idx++;
  }
-
+#endif
  if (max_idx > min_idx)
  {
    // value spans multiple buckets
    double bucket_sel= 1.0/(get_width() + 1);
    sel= bucket_sel * (max_idx - min_idx + 1);
-  } else
+  }
+  else
  {
    // the value fits within a single bucket
-    sel = MY_MIN(avg_sel, (1.0/get_width()));
+    sel = MY_MIN(avg_sel, 1.0/get_width());
  }
  return sel;
 }


 /*
-  @param field  The table field histogram is for.  We don't care about the
-                 field's current value, we only need its virtual functions to
-                 perform various operations
+  @param field    The table field histogram is for.  We don't care about the
+                  field's current value, we only need its virtual functions to
+                  perform various operations

-  @param min_endp, max_endp - this specifies the range.
+  @param min_endp Left endpoint, or NULL if there is none
+  @param max_endp Right endpoint, or NULL if there is none
 */
 double Histogram_json::range_selectivity(Field *field, key_range *min_endp,
-                                             key_range *max_endp)
+                                         key_range *max_endp)
 {
-  double min = 0.0, max = 1.0;
-  double width = 1.0/(int)histogram_bounds.size();
-  if (min_endp)
+  double min, max;
+  double width= 1.0 / histogram_bounds.size();
+
+  if (min_endp && !(field->null_ptr && min_endp->key[0]))
  {
-    double min_sel = 0.0;
+    bool exclusive_endp= (min_endp->flag == HA_READ_AFTER_KEY)? true: false;
    const uchar *min_key= min_endp->key;
-    // GSOC-TODO: properly handle SQL NULLs.
-    // in this test patch, we just assume the values are not SQL NULLs.
    if (field->real_maybe_null())
      min_key++;

-    int min_bucket_idx, max_bucket_idx;
-    min_bucket_idx= find_bucket(field, min_key);
-    std::string min_bucket, max_bucket;
-
-    max_bucket_idx= min_bucket_idx + 1;
-    if (min_bucket_idx != -1)
+    // Find the leftmost bucket that contains the lookup value.
+    // (If the lookup value is to the left of all buckets, find bucket #0)
+    int idx= find_bucket(field, min_key, exclusive_endp);
+    double min_sel;
    {
-      min_bucket= histogram_bounds[min_bucket_idx];
-      max_bucket= (max_bucket_idx < (int) histogram_bounds.size())
-                      ? histogram_bounds[max_bucket_idx]
-                      : "";
-
+      std::string &left= histogram_bounds[idx];
+      std::string &right= histogram_bounds[idx+1];
      if (field->pos_through_val_str())
        min_sel= pos_in_interval_through_strxfrm(
-            field, (uchar *) min_bucket.data(), (uchar *) max_bucket.data(),
-            (uchar *) min_key);
+            field, (uchar*) left.data(), (uchar*) right.data(),
+            (uchar*) min_key);
      else
        min_sel= pos_in_interval_through_val_real(
-            field, (uchar *) min_bucket.data(), (uchar *) max_bucket.data(),
-            (uchar *) min_key);
+            field, (uchar *) left.data(), (uchar*) right.data(),
+            (uchar*) min_key);
    }

-    min = min_bucket_idx * (width) + min_sel * (width);
-    //fprintf(stderr, "min pos_in_interval =%g\n", min_sel);
-    //fprintf(stderr, "min =%g\n", min);
+    min= idx*width + min_sel*width;
  }
+  else
+    min= 0.0;
+
  if (max_endp)
  {
-    double max_sel = 1.0;
+    // The right endpoint cannot be NULL
+    DBUG_ASSERT(!(field->null_ptr && max_endp->key[0]));
+    bool inclusive_endp= (max_endp->flag == HA_READ_AFTER_KEY)? true: false;
    const uchar *max_key= max_endp->key;
    if (field->real_maybe_null())
      max_key++;

-    int min_bucket_idx, max_bucket_idx;
-    min_bucket_idx= find_bucket(field, max_key);
-    std::string min_bucket, max_bucket;
-
-    max_bucket_idx= min_bucket_idx + 1;
-    if (min_bucket_idx != -1)
+    int idx= find_bucket(field, max_key, inclusive_endp);
+    double max_sel;
    {
-      min_bucket= histogram_bounds[min_bucket_idx];
-      max_bucket= (max_bucket_idx < (int) histogram_bounds.size())
-                  ? histogram_bounds[max_bucket_idx]
-                  : "";
+      std::string &left= histogram_bounds[idx];
+      std::string &right= histogram_bounds[idx+1];

      if (field->pos_through_val_str())
        max_sel= pos_in_interval_through_strxfrm(
-            field, (uchar *) min_bucket.data(), (uchar *) max_bucket.data(),
+            field, (uchar *) left.data(), (uchar *) right.data(),
            (uchar *) max_key);
      else
        max_sel= pos_in_interval_through_val_real(
-            field, (uchar *) min_bucket.data(), (uchar *) max_bucket.data(),
+            field, (uchar *) left.data(), (uchar *) right.data(),
            (uchar *) max_key);
    }

-    max = min_bucket_idx * (width) + max_sel * (width);
-    //fprintf(stderr, "max pos_in_interval =%g\n", max_sel);
-    //fprintf(stderr, "max =%g\n", max);
+    max= idx*width + max_sel*width;
  }
+  else
+    max= 1.0;

  double sel = max - min;
-  //fprintf(stderr, "final selection = %g\n", sel);
-  //fprintf(stderr, "Histogram_json::range_selectivity ends\n");
  return sel;
 }

@@ -1644,34 +1616,33 @@ void Histogram_json::serialize(Field *field)
 }


-int Histogram_json::find_bucket(Field *field, const uchar *endpoint)
+/*
+  Find the histogram bucket that contains the value.
+
+  @param equal_is_less Controls what to do if a histogram bound is equal to the
+                       lookup_val.
+*/
+
+int Histogram_json::find_bucket(Field *field, const uchar *lookup_val,
+                                bool equal_is_less)
 {
-  int low = 0;
-  int high = (int)histogram_bounds.size()-1;
-  int mid;
-  int min_bucket_index = -1;
-  std::string mid_val; // GSOC-todo: don't copy strings
+  int low= 0;
+  int high= histogram_bounds.size() - 1;
+  int middle;

-  while(low <= high) {
-    // c++ gives us the floor of integer divisions by default, below we get the ceiling (round-up).
-    // it works but it doesn't feel so readable, maybe we could make improvements?
-    int sum = (low+high);
-    mid = sum/2 + (sum % 2 != 0);
-
-    mid_val = histogram_bounds[mid];
-
-    int res = field->key_cmp((uchar*) mid_val.data(), endpoint);
-    if (res < 0) {
-      low = mid + 1;
-      min_bucket_index = mid;
-    } else if (res >= 0) {
-      high = mid - 1;
-    }
+  while (low + 1 < high)
+  {
+    middle= (low + high) / 2;
+    int res= field->key_cmp((uchar*)histogram_bounds[middle].data(), lookup_val);
+    if (!res)
+      res= equal_is_less? -1: 1;
+    if (res < 0)
+      low= middle;
+    else //res > 0
+      high= middle;
  }

-  if (min_bucket_index == -1)
-    min_bucket_index = high;
-  return min_bucket_index;
+  return low;
 }

 /*
@@ -2114,14 +2085,14 @@ public:
 };


-Histogram_base *create_histogram(Histogram_type hist_type)
+Histogram_base *create_histogram(MEM_ROOT *mem_root, Histogram_type hist_type)
 {
  switch (hist_type) {
  case SINGLE_PREC_HB:
  case DOUBLE_PREC_HB:
-    return new Histogram_binary();
+    return new (mem_root) Histogram_binary();
  case JSON_HB:
-    return new Histogram_json();
+    return new (mem_root) Histogram_json();
  default:
    DBUG_ASSERT(0);
  }
@@ -2963,7 +2934,7 @@ void Column_statistics_collected::finish(MEM_ROOT *mem_root, ha_rows rows, doubl
    if (hist_size != 0 && hist_type != INVALID_HISTOGRAM)
    {
      have_histogram= true;
-      histogram_= create_histogram(hist_type);
+      histogram_= create_histogram(mem_root, hist_type);
      histogram_->init_for_collection(mem_root, hist_type, hist_size);
    }

@@ -4530,9 +4501,10 @@ double Histogram_binary::point_selectivity(Field *field, key_range *min_endp, do
  return sel;
 }

+
 double Histogram_binary::range_selectivity(Field *field,
-                                               key_range *min_endp,
-                                               key_range *max_endp)
+                                           key_range *min_endp,
+                                           key_range *max_endp)
 {
  double sel, min_mp_pos, max_mp_pos;
  Column_statistics *col_stats= field->read_stats;
@@ -4561,13 +4533,6 @@ double Histogram_binary::range_selectivity(Field *field,
  uint max= find_bucket(max_mp_pos, FALSE);
  sel= bucket_sel * (max - min + 1);

-  /*fprintf(stderr, "bucket_sel =%g\n", bucket_sel);
-  fprintf(stderr, "min pos_in_interval =%g\n", min_mp_pos);
-  fprintf(stderr, "max pos_in_interval =%g\n", max_mp_pos);
-  fprintf(stderr, "min =%d\n", min);
-  fprintf(stderr, "max =%d\n", max);*/
-  /*fprintf(stderr, "final sel =%g\n", sel);
-  fprintf(stderr, "Histogram_binary::range_selectivity ends\n");*/
  return sel;
 }

--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -400,10 +400,8 @@ public:
                           double avg_selection) override;
  double range_selectivity(Field *field, key_range *min_endp,
                           key_range *max_endp) override;
-  /*
-   * Returns the index of the biggest histogram value that is smaller than endpoint
-   */
-  int find_bucket(Field *field, const uchar *endpoint);
+private:
+  int find_bucket(Field *field, const uchar *lookup_val, bool equal_is_less);
 };

 class Columns_statistics;