MDEV-26519: JSON Histograms: improve histogram collection

Basic ideas:
1. Store "popular" values in their own buckets.
2. Also store ndv (Number of Distinct Values) in each bucket.

Because of #1, the buckets are now variable-size, so store the size in
each bucket.

Adjust selectivity estimation functions accordingly.
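
For illustration, with hypothetical data: a "popular" value holding 30% of a
table's rows now gets a single-point bucket (ndv=1) instead of being averaged
into fixed-width buckets, e.g.

  { "start": "18", "size": 0.25, "ndv": 40 },
  { "start": "19", "size": 0.30, "ndv": 1 },
  { "start": "20", "size": 0.45, "ndv": 55, "end": "99" }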
Sergei Petrunia 2021-09-10 10:45:04 +03:00
parent d64e104810
commit f460272054
3 changed files with 5722 additions and 1318 deletions

File diff suppressed because one or more lines are too long


@@ -23,12 +23,35 @@
class Histogram_json_builder : public Histogram_builder
{
Histogram_json_hb *histogram;
uint hist_width; /* the number of points in the histogram */
double bucket_capacity; /* number of rows in a bucket of the histogram */
uint curr_bucket; /* number of the current bucket to be built */
/* Number of buckets in the histogram */
uint hist_width;
std::vector<std::string> bucket_bounds;
bool first_value= true;
/*
Number of rows that we intend to have in the bucket. That is, this is
n_rows_in_table / hist_width
Actual number of rows in the buckets we produce may vary because of
"popular values" and rounding.
*/
longlong bucket_capacity;
/* Number of the buckets already collected */
uint n_buckets_collected;
/* Data about the bucket we are filling now */
struct CurBucket
{
/* Number of values in the bucket so far. */
longlong size;
/* Number of distinct values in the bucket */
int ndv;
};
CurBucket bucket;
/* Used to create the JSON representation of the histogram. */
Json_writer writer;
public:
Histogram_json_builder(Histogram_json_hb *hist, Field *col, uint col_len,
@@ -37,57 +60,159 @@ public:
{
bucket_capacity= (double)records / histogram->get_width();
hist_width= histogram->get_width();
curr_bucket= 0;
n_buckets_collected= 0;
bucket.ndv= 0;
bucket.size= 0;
writer.start_object();
writer.add_member(Histogram_json_hb::JSON_NAME).start_array();
}
~Histogram_json_builder() override = default;
bool bucket_is_empty() { return bucket.ndv == 0; }
/*
Flush the current bucket out (to JSON output), and set it to be empty.
*/
void finalize_bucket()
{
double fract= (double) bucket.size / records;
writer.add_member("size").add_double(fract);
writer.add_member("ndv").add_ll(bucket.ndv);
writer.end_object();
n_buckets_collected++;
bucket.ndv= 0;
bucket.size= 0;
}
/*
Same as finalize_bucket() but also provide the bucket's end value.
*/
void finalize_bucket_with_end_value(void *elem)
{
column->store_field_value((uchar*) elem, col_length);
StringBuffer<MAX_FIELD_WIDTH> val;
column->val_str(&val);
writer.add_member("end").add_str(val.c_ptr());
finalize_bucket();
}
/*
Write the first value group to the bucket.
@param elem The value we are writing
@param cnt The number of such values.
*/
void start_bucket(void *elem, element_count cnt)
{
DBUG_ASSERT(bucket.size == 0);
column->store_field_value((uchar*) elem, col_length);
StringBuffer<MAX_FIELD_WIDTH> val;
column->val_str(&val);
writer.start_object();
writer.add_member("start").add_str(val.c_ptr());
bucket.ndv= 1;
bucket.size= cnt;
}
/*
Append a value group of cnt values.
*/
void append_to_bucket(element_count cnt)
{
bucket.ndv++;
bucket.size += cnt;
}
/*
@brief
Add data to the histogram. This call adds elem_cnt rows, each
of which has value of *elem.
Add data to the histogram.
@detail
Subsequent next() calls will add values that are greater than *elem.
The call signals to add a "value group" of elem_cnt rows, each of which
has the same value that is provided in *elem.
Subsequent next() calls will add values that are greater than the
current one.
@return
0 - OK
*/
int next(void *elem, element_count elem_cnt) override
{
counters.next(elem, elem_cnt);
ulonglong count= counters.get_count();
if (curr_bucket == hist_width)
return 0;
if (first_value)
/*
Ok, we've got a "value group" of elem_cnt identical values.
If we take the values from the value group and put them into
the current bucket, how many values will be left after we've
filled the bucket?
*/
longlong overflow= bucket.size + elem_cnt - bucket_capacity;
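// Hypothetical numbers: with bucket_capacity=100 and bucket.size=30,
// elem_cnt=250 gives overflow=180 (Case #1: the group gets its own bucket),
// elem_cnt=90 gives overflow=20 (Case #2: fill up, carry the rest), and
// elem_cnt=50 gives overflow=-20 (Case #3: keep filling).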
/*
Case #1: This value group should be put into a separate bucket, if
A. It fills the current bucket and also fills the next bucket, OR
B. It fills the current bucket, which was empty.
*/
if (overflow >= bucket_capacity || (bucket_is_empty() && overflow >= 0))
{
first_value= false;
column->store_field_value((uchar*) elem, col_length);
StringBuffer<MAX_FIELD_WIDTH> val;
column->val_str(&val);
bucket_bounds.push_back(std::string(val.ptr(), val.length()));
// Finalize the current bucket
if (!bucket_is_empty())
finalize_bucket();
// Start/end the separate bucket for this value group.
start_bucket(elem, elem_cnt);
if (records == count)
finalize_bucket_with_end_value(elem);
else
finalize_bucket();
}
if (count > bucket_capacity * (curr_bucket + 1))
else if (overflow >= 0)
{
column->store_field_value((uchar*) elem, col_length);
StringBuffer<MAX_FIELD_WIDTH> val;
column->val_str(&val);
bucket_bounds.emplace_back(val.ptr(), val.length());
/*
Case #2: Case #1 doesn't hold, but this value group still fills the
current bucket.
*/
curr_bucket++;
while (curr_bucket != hist_width &&
count > bucket_capacity * (curr_bucket + 1))
// If the bucket was empty, it would have been case #1.
DBUG_ASSERT(!bucket_is_empty());
/*
Top up the current bucket to bucket_capacity values with part of this
value group, then finalize it.
*/
append_to_bucket(bucket_capacity - bucket.size);
if (records == count && !overflow)
finalize_bucket_with_end_value(elem);
else
finalize_bucket();
if (overflow > 0)
{
bucket_bounds.push_back(std::string(val.ptr(), val.length()));
curr_bucket++;
// Then, start the new bucket with the remaining values.
start_bucket(elem, overflow);
}
}
if (records == count && bucket_bounds.size() == hist_width)
else
{
column->store_field_value((uchar*) elem, col_length);
StringBuffer<MAX_FIELD_WIDTH> val;
column->val_str(&val);
bucket_bounds.push_back(std::string(val.ptr(), val.length()));
// Case #3: there are not enough values to fill the current bucket.
if (bucket_is_empty())
start_bucket(elem, elem_cnt);
else
append_to_bucket(elem_cnt);
}
if (records == count)
{
// This is the final value group.
if (!bucket_is_empty())
finalize_bucket_with_end_value(elem);
}
return 0;
}
@@ -98,17 +223,10 @@ public:
*/
void finalize() override
{
Json_writer writer;
writer.start_object();
writer.add_member(Histogram_json_hb::JSON_NAME).start_array();
for(auto& value: bucket_bounds) {
writer.add_str(value.c_str());
}
writer.end_array();
writer.end_object();
Binary_string *json_string= (Binary_string *) writer.output.get_string();
histogram->set_json_text(bucket_bounds.size()-1,
histogram->set_json_text(n_buckets_collected,
(uchar *) json_string->c_ptr());
}
};
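
The three cases in next() above amount to a small splitting policy over a
sorted stream of (value, count) groups. Below is a minimal standalone sketch
of just that policy; the types, names and printf output are hypothetical
stand-ins for the server's value groups and JSON writer:

  #include <cstdio>
  #include <vector>

  struct Group { long long value; long long count; };

  // Sketch: walk sorted value groups, emitting buckets of roughly
  // 'capacity' rows, giving oversized ("popular") groups their own bucket.
  static void build_buckets(const std::vector<Group> &groups,
                            long long capacity)
  {
    long long size= 0, ndv= 0;           // the bucket being filled
    for (const Group &g : groups)
    {
      long long overflow= size + g.count - capacity;
      if (overflow >= capacity || (ndv == 0 && overflow >= 0))
      {
        // Case #1: flush the current bucket (if any), then give this
        // value group a bucket of its own.
        if (ndv)
          printf("bucket: size=%lld ndv=%lld\n", size, ndv);
        printf("bucket: size=%lld ndv=1 (popular value %lld)\n",
               g.count, g.value);
        size= 0; ndv= 0;
      }
      else if (overflow >= 0)
      {
        // Case #2: top the bucket up to capacity, carry the rest over
        // into a new bucket that starts with the same value.
        printf("bucket: size=%lld ndv=%lld\n", capacity, ndv + 1);
        size= overflow;
        ndv= overflow ? 1 : 0;
      }
      else
      {
        // Case #3: the whole group fits; keep filling.
        size+= g.count;
        ndv++;
      }
    }
    if (ndv)                             // final, possibly short bucket
      printf("bucket: size=%lld ndv=%lld\n", size, ndv);
  }

  int main()
  {
    // 430 rows, capacity 100: the group (3, 250) becomes its own bucket.
    build_buckets({{1, 40}, {2, 30}, {3, 250}, {4, 60}, {5, 50}}, 100);
    return 0;
  }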
@@ -143,78 +261,132 @@ bool Histogram_json_hb::parse(MEM_ROOT *mem_root, Field *field,
Histogram_type type_arg, const char *hist_data,
size_t hist_data_len)
{
const char *err;
DBUG_ENTER("Histogram_json_hb::parse");
DBUG_ASSERT(type_arg == JSON_HB);
const char *err;
json_engine_t je;
json_string_t key_name;
const char *obj1;
int obj1_len;
double cumulative_size= 0.0;
json_scan_start(&je, &my_charset_utf8mb4_bin,
(const uchar*)hist_data,
(const uchar*)hist_data+hist_data_len);
if (json_read_value(&je) || je.value_type != JSON_VALUE_OBJECT)
if (JSV_OBJECT != json_type(hist_data, hist_data + hist_data_len,
&obj1, &obj1_len))
{
err= "Root JSON element must be a JSON object";
goto error;
}
json_string_set_str(&key_name, (const uchar*)JSON_NAME,
(const uchar*)JSON_NAME + strlen(JSON_NAME));
json_string_set_cs(&key_name, system_charset_info);
if (json_scan_next(&je) || je.state != JST_KEY ||
!json_key_matches(&je, &key_name))
{
err= "The first key in the object must be histogram_hb_v1";
goto error;
}
// The value must be a JSON array
if (json_read_value(&je) || (je.value_type != JSON_VALUE_ARRAY))
const char *hist_array;
int hist_array_len;
if (JSV_ARRAY != json_get_object_key(obj1, obj1 + obj1_len,
"histogram_hb_v2", &hist_array,
&hist_array_len))
{
err= "A JSON array expected";
goto error;
}
// Read the array
while (!json_scan_next(&je))
for (int i= 0;; i++)
{
switch(je.state)
const char *bucket_info;
int bucket_info_len;
enum json_types ret= json_get_array_item(hist_array, hist_array+hist_array_len,
i, &bucket_info,
&bucket_info_len);
if (ret == JSV_NOTHING)
break;
if (ret == JSV_BAD_JSON)
{
case JST_VALUE:
{
const char *val;
int val_len;
json_smart_read_value(&je, &val, &val_len);
if (je.value_type != JSON_VALUE_STRING &&
je.value_type != JSON_VALUE_NUMBER &&
je.value_type != JSON_VALUE_TRUE &&
je.value_type != JSON_VALUE_FALSE)
{
err= "Scalar value expected";
err= "JSON parse error";
goto error;
}
uchar buf[MAX_KEY_LENGTH];
uint len_to_copy= field->key_length();
field->store_text(val, val_len, &my_charset_bin);
uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
histogram_bounds.push_back(std::string((char*)buf, bytes));
// TODO: Should we also compare this endpoint with the previous
// to verify that the ordering is right?
break;
if (ret != JSV_OBJECT)
{
err= "Object expected";
goto error;
}
case JST_ARRAY_END:
break;
}
}
// n_buckets = n_bounds - 1 :
size= histogram_bounds.size()-1;
DBUG_RETURN(false);
// Ok, now we are parsing the JSON object describing the bucket
// Read the "start" field.
const char *val;
int val_len;
ret= json_get_object_key(bucket_info, bucket_info+bucket_info_len,
"start", &val, &val_len);
if (ret != JSV_STRING && ret != JSV_NUMBER)
{
err= ".start member must be present and be a scalar";
goto error;
}
// Read the "size" field.
const char *size;
int size_len;
ret= json_get_object_key(bucket_info, bucket_info+bucket_info_len,
"size", &size, &size_len);
if (ret != JSV_NUMBER)
{
err= ".size member must be present and be a scalar";
goto error;
}
int conv_err;
char *size_end= (char*)size + size_len;
double size_d= my_strtod(size, &size_end, &conv_err);
if (conv_err)
{
err= ".size member must be a floating-point value";
goto error;
}
cumulative_size += size_d;
// Read the "ndv" field
const char *ndv;
int ndv_len;
ret= json_get_object_key(bucket_info, bucket_info+bucket_info_len,
"ndv", &ndv, &ndv_len);
if (ret != JSV_NUMBER)
{
err= ".ndv member must be present and be a scalar";
goto error;
}
char *ndv_end= (char*)ndv + ndv_len;
longlong ndv_ll= my_strtoll10(ndv, &ndv_end, &conv_err);
if (conv_err)
{
err= ".ndv member must be an integer value";
goto error;
}
const char *end_val;
int end_val_len;
ret= json_get_object_key(bucket_info, bucket_info+bucket_info_len,
"end", &end_val, &end_val_len);
if (ret != JSV_NOTHING && ret != JSV_STRING && ret != JSV_NUMBER)
{
err= ".end member must be a scalar";
goto error;
}
if (ret != JSV_NOTHING)
last_bucket_end_endp.assign(end_val, end_val_len);
buckets.push_back({std::string(val, val_len), NULL, cumulative_size,
ndv_ll});
if (buckets.size() > 1)
{
// Set the previous bucket's end: its own start for a single-point
// bucket, otherwise the start of the bucket we've just added.
auto& prev_bucket= buckets[buckets.size()-2];
if (prev_bucket.ndv == 1)
prev_bucket.end_value= &prev_bucket.start_value;
else
prev_bucket.end_value= &buckets.back().start_value;
}
}
buckets.back().end_value= &last_bucket_end_endp;
size= buckets.size();
DBUG_RETURN(false);
error:
my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), err,
je.s.c_str - (const uchar*)hist_data);
12345);
DBUG_RETURN(true);
}
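
A note on the accumulation above: each bucket's "size" member in the JSON
text is a per-bucket fraction, while the in-memory Bucket stores the running
total cum_fract. With hypothetical sizes 0.25, 0.25 and 0.50, the stored
cum_fract values are 0.25, 0.50 and 1.00, so the fraction covered by bucket i
alone is its cum_fract minus the preceding bucket's cum_fract (see
get_left_fract() below).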
@@ -273,37 +445,44 @@ double position_in_interval(Field *field, const uchar *key, uint key_len,
double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
double avg_sel)
{
double sel;
store_key_image_to_rec(field, (uchar *) endpoint->key,
field->key_length());
const uchar *min_key = endpoint->key;
const uchar *key = endpoint->key;
if (field->real_maybe_null())
min_key++;
uint min_idx= find_bucket(field, min_key, false);
key++;
uint max_idx= find_bucket(field, min_key, true);
#if 0
// find how many buckets this value occupies
while ((max_idx + 1 < get_width() ) &&
(field->key_cmp((uchar *)histogram_bounds[max_idx + 1].data(), min_key) == 0)) {
max_idx++;
}
#endif
if (max_idx > min_idx)
// If the value is outside of the histogram's range, this will "clip" it to
// the first or last bucket.
int idx= find_bucket(field, key, false);
double sel;
if (buckets[idx].ndv == 1 &&
field->key_cmp((uchar*)buckets[idx].start_value.data(), key))
{
// value spans multiple buckets
double bucket_sel= 1.0/(get_width() + 1);
sel= bucket_sel * (max_idx - min_idx + 1);
// The bucket has a single value and it doesn't match! Use the global
// average.
sel= avg_sel;
}
else
{
// the value fits within a single bucket
sel = MY_MIN(avg_sel, 1.0/get_width());
/*
We get here when:
* The bucket has one value and this is the value we are looking for.
* The bucket has multiple values. Then, assume the lookup value is one
of the bucket's ndv distinct values, each occurring equally often.
*/
sel= (buckets[idx].cum_fract - get_left_fract(idx)) / buckets[idx].ndv;
}
return sel;
}
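
Worked example with hypothetical numbers: for a bucket with cum_fract=0.40
whose predecessor has cum_fract=0.25, and ndv=3, the estimate is
(0.40 - 0.25) / 3 = 0.05: the bucket holds 15% of the rows, assumed to be
spread evenly over its 3 distinct values.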
double Histogram_json_hb::get_left_fract(int idx)
{
if (!idx)
return 0.0;
else
return buckets[idx-1].cum_fract;
}
/*
@param field The table field histogram is for. We don't care about the
field's current value, we only need its virtual functions to
@@ -317,7 +496,6 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
key_range *max_endp)
{
double min, max;
double width= 1.0 / histogram_bounds.size();
if (min_endp && !(field->null_ptr && min_endp->key[0]))
{
@@ -333,10 +511,12 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
// Find the leftmost bucket that contains the lookup value.
// (If the lookup value is to the left of all buckets, find bucket #0)
int idx= find_bucket(field, min_key, exclusive_endp);
double min_sel= position_in_interval(field, min_key, min_key_len,
histogram_bounds[idx],
histogram_bounds[idx+1]);
min= idx*width + min_sel*width;
double left_fract= get_left_fract(idx);
double sel= position_in_interval(field, min_key, min_key_len,
buckets[idx].start_value,
*buckets[idx].end_value);
min= left_fract + sel * (buckets[idx].cum_fract - left_fract);
}
else
min= 0.0;
@@ -355,10 +535,11 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
}
int idx= find_bucket(field, max_key, inclusive_endp);
double max_sel= position_in_interval(field, max_key, max_key_len,
histogram_bounds[idx],
histogram_bounds[idx+1]);
max= idx*width + max_sel*width;
double left_fract= get_left_fract(idx);
double sel= position_in_interval(field, max_key, max_key_len,
buckets[idx].start_value,
*buckets[idx].end_value);
max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
}
else
max= 1.0;
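
Worked example with hypothetical numbers: if the min endpoint falls 30% of
the way into a bucket spanning cumulative fractions 0.25 to 0.40, then
min = 0.25 + 0.3 * (0.40 - 0.25) = 0.295, and the selectivity is then taken
as max - min.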
@@ -375,23 +556,35 @@ void Histogram_json_hb::serialize(Field *field)
/*
Find the histogram bucket that contains the value.
Find the rightmost histogram bucket such that "lookup_val $GT start_value".
$GT is either '>' or '>=' depending on equal_is_less parameter.
@param equal_is_less Controls what to do if a histogram bound is equal to the
lookup_val.
@detail
Possible cases:
1. The regular case: the value falls into some bucket.
2. The value is less than the minimum of the first bucket
3. The value is greater than the maximum of the last bucket
In these cases we "clip" to the first/last bucket.
4. The value hits the bucket boundary. Then, we need to know whether the
point of interest is to the left of the constant, or to the right of it.
*/
int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
bool equal_is_less)
{
int low= 0;
int high= (int)histogram_bounds.size() - 1;
int middle;
int high= (int)buckets.size() - 1;
while (low + 1 < high)
{
middle= (low + high) / 2;
int res= field->key_cmp((uchar*)histogram_bounds[middle].data(), lookup_val);
int middle= (low + high) / 2;
int res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
if (!res)
res= equal_is_less? -1: 1;
if (res < 0)


@@ -20,6 +20,32 @@
An equi-height histogram which stores real values for bucket bounds.
Handles @@histogram_type=JSON_HB
Histogram format in JSON:
{
"histogram_hb_v2": [
{ "start": "value", "size":nnn.nn, "ndv": nnn },
...
{ "start": "value", "size":nnn.nn, "ndv": nnn, "end": "value"}
]
}
The histogram is an object with a single member named "histogram_hb_v2".
The value of that member is an array of buckets.
Each bucket is an object with these members:
"start" - the first value in the bucket.
"size" - fraction of table rows that is contained in the bucket.
"ndv" - Number of Distinct Values in the bucket.
"end" - Optionally, the last value in the bucket.
A bucket is a single-point bucket if it has ndv=1.
Most buckets have no "end" member: the bucket is assumed to contain all
values up to the "start" of the next bucket.
The exception is single-point buckets, where the last value is the same as
the first value.
*/
class Histogram_json_hb : public Histogram_base
@@ -29,11 +55,29 @@ class Histogram_json_hb : public Histogram_base
/* Collection-time only: collected histogram in the JSON form. */
std::string json_text;
// Array of histogram bucket endpoints in KeyTupleFormat.
std::vector<std::string> histogram_bounds;
struct Bucket
{
// The left endpoint in KeyTupleFormat. The endpoint is inclusive: this
// value is in this bucket.
std::string start_value;
// The right endpoint. It is non-inclusive, except for the last bucket.
std::string *end_value;
// Cumulative fraction: The fraction of table rows that fall into this
// and preceding buckets.
double cum_fract;
// Number of distinct values in the bucket.
longlong ndv;
};
std::vector<Bucket> buckets;
std::string last_bucket_end_endp;
public:
static constexpr const char* JSON_NAME="histogram_hb_v1";
static constexpr const char* JSON_NAME="histogram_hb_v2";
bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
const char *hist_data, size_t hist_data_len) override;
@@ -80,6 +124,7 @@ public:
}
private:
double get_left_fract(int idx);
int find_bucket(Field *field, const uchar *lookup_val, bool equal_is_less);
};