Histogram code cleanup and fixes

Factor the code that updates count, count_distinct,
count_distinct_single_occurrence into class Basic_stats_collector

Change from Histogram_builder and its descendant Histogram_builder_json
to Histogram_builder (the interface) and its two implementations,
Histogram_binary_builder and Histogram_json_builder.

In Histogram_json_builder, do not forget to collect the right bound
of the right-most bucket.
This commit is contained in:
Sergei Petrunia 2021-09-04 16:28:10 +03:00
parent a9c1feea60
commit 4ab2b78b65
3 changed files with 1003 additions and 921 deletions

File diff suppressed because one or more lines are too long

View File

@ -1363,7 +1363,8 @@ bool Histogram_json_hb::parse(MEM_ROOT *mem_root, Field *field,
break; break;
} }
} }
size= histogram_bounds.size(); // n_buckets = n_bounds - 1 :
size= histogram_bounds.size()-1;
DBUG_RETURN(false); DBUG_RETURN(false);
error: error:
@ -1852,6 +1853,45 @@ public:
} }
}; };
/*
This is used to collect the basic statistics from a Unique object:
- count of values
- count of distinct values
- count of distinct values that have occurred only once
*/
class Basic_stats_collector
{
ulonglong count; /* number of values retrieved */
ulonglong count_distinct; /* number of distinct values retrieved */
/* number of distinct values that occured only once */
ulonglong count_distinct_single_occurence;
public:
Basic_stats_collector()
{
count= 0;
count_distinct= 0;
count_distinct_single_occurence= 0;
}
ulonglong get_count_distinct() const { return count_distinct; }
ulonglong get_count_single_occurence() const
{
return count_distinct_single_occurence;
}
ulonglong get_count() const { return count; }
void next(void *elem, element_count elem_cnt)
{
count_distinct++;
if (elem_cnt == 1)
count_distinct_single_occurence++;
count+= elem_cnt;
}
};
/* /*
Histogram_builder is a helper class that is used to build histograms Histogram_builder is a helper class that is used to build histograms
for columns. for columns.
@ -1865,87 +1905,95 @@ protected:
Field *column; /* table field for which the histogram is built */ Field *column; /* table field for which the histogram is built */
uint col_length; /* size of this field */ uint col_length; /* size of this field */
ha_rows records; /* number of records the histogram is built for */ ha_rows records; /* number of records the histogram is built for */
Histogram_builder(Field *col, uint col_len, ha_rows rows) :
column(col), col_length(col_len), records(rows)
{}
public:
// A histogram builder will also collect the counters
Basic_stats_collector counters;
virtual int next(void *elem, element_count elem_cnt)=0;
virtual void finalize()=0;
virtual ~Histogram_builder(){}
};
class Histogram_binary_builder : public Histogram_builder
{
Field *min_value; /* pointer to the minimal value for the field */ Field *min_value; /* pointer to the minimal value for the field */
Field *max_value; /* pointer to the maximal value for the field */ Field *max_value; /* pointer to the maximal value for the field */
Histogram_base *histogram; /* the histogram location */ Histogram_binary *histogram; /* the histogram location */
uint hist_width; /* the number of points in the histogram */ uint hist_width; /* the number of points in the histogram */
double bucket_capacity; /* number of rows in a bucket of the histogram */ double bucket_capacity; /* number of rows in a bucket of the histogram */
uint curr_bucket; /* number of the current bucket to be built */ uint curr_bucket; /* number of the current bucket to be built */
ulonglong count; /* number of values retrieved */
ulonglong count_distinct; /* number of distinct values retrieved */
/* number of distinct values that occured only once */
ulonglong count_distinct_single_occurence;
public: public:
Histogram_builder(Field *col, uint col_len, ha_rows rows) Histogram_binary_builder(Field *col, uint col_len, ha_rows rows)
: column(col), col_length(col_len), records(rows) : Histogram_builder(col, col_len, rows)
{ {
Column_statistics *col_stats= col->collected_stats; Column_statistics *col_stats= col->collected_stats;
min_value= col_stats->min_value; min_value= col_stats->min_value;
max_value= col_stats->max_value; max_value= col_stats->max_value;
histogram= col_stats->histogram; histogram= (Histogram_binary*)col_stats->histogram;
hist_width= histogram->get_width(); hist_width= histogram->get_width();
bucket_capacity= (double) records / (hist_width + 1); bucket_capacity= (double) records / (hist_width + 1);
curr_bucket= 0; curr_bucket= 0;
count= 0;
count_distinct= 0;
count_distinct_single_occurence= 0;
} }
Histogram_builder() = default; int next(void *elem, element_count elem_cnt) override
virtual ~Histogram_builder() = default;
ulonglong get_count_distinct() const { return count_distinct; }
ulonglong get_count_single_occurence() const
{ {
return count_distinct_single_occurence; counters.next(elem, elem_cnt);
} ulonglong count= counters.get_count();
virtual int next(void *elem, element_count elem_cnt)
{
count_distinct++;
if (elem_cnt == 1)
count_distinct_single_occurence++;
count+= elem_cnt;
if (curr_bucket == hist_width) if (curr_bucket == hist_width)
return 0; return 0;
if (count > bucket_capacity * (curr_bucket + 1)) if (count > bucket_capacity * (curr_bucket + 1))
{ {
column->store_field_value((uchar *) elem, col_length); column->store_field_value((uchar *) elem, col_length);
((Histogram_binary *)histogram)->set_value(curr_bucket, histogram->set_value(curr_bucket,
column->pos_in_interval(min_value, max_value)); column->pos_in_interval(min_value, max_value));
curr_bucket++; curr_bucket++;
while (curr_bucket != hist_width && while (curr_bucket != hist_width &&
count > bucket_capacity * (curr_bucket + 1)) count > bucket_capacity * (curr_bucket + 1))
{ {
((Histogram_binary *)histogram)->set_prev_value(curr_bucket); histogram->set_prev_value(curr_bucket);
curr_bucket++; curr_bucket++;
} }
} }
return 0; return 0;
} }
virtual void finalize(){} void finalize() override {}
}; };
Histogram_builder *Histogram_binary::create_builder(Field *col, uint col_len, Histogram_builder *Histogram_binary::create_builder(Field *col, uint col_len,
ha_rows rows) ha_rows rows)
{ {
return new Histogram_builder(col, col_len, rows); return new Histogram_binary_builder(col, col_len, rows);
} }
class Histogram_builder_json : public Histogram_builder class Histogram_json_builder : public Histogram_builder
{ {
Histogram_json_hb *histogram;
uint hist_width; /* the number of points in the histogram */
double bucket_capacity; /* number of rows in a bucket of the histogram */
uint curr_bucket; /* number of the current bucket to be built */
std::vector<std::string> bucket_bounds; std::vector<std::string> bucket_bounds;
bool got_first_value = false; bool first_value= true;
public: public:
Histogram_builder_json(Field *col, uint col_len, ha_rows rows) Histogram_json_builder(Field *col, uint col_len, ha_rows rows)
: Histogram_builder(col, col_len, rows) {} : Histogram_builder(col, col_len, rows)
{
histogram= (Histogram_json_hb*)col->collected_stats->histogram;
bucket_capacity= (double)records / histogram->get_width();
hist_width= histogram->get_width();
curr_bucket= 0;
}
~Histogram_builder_json() override = default; ~Histogram_json_builder() override = default;
/* /*
Add data to the histogram. Adding Element elem which encountered elem_cnt Add data to the histogram. Adding Element elem which encountered elem_cnt
@ -1953,18 +2001,27 @@ public:
*/ */
int next(void *elem, element_count elem_cnt) override int next(void *elem, element_count elem_cnt) override
{ {
count_distinct++; counters.next(elem, elem_cnt);
if (elem_cnt == 1) ulonglong count= counters.get_count();
count_distinct_single_occurence++;
count+= elem_cnt;
if (curr_bucket == hist_width) if (curr_bucket == hist_width)
return 0; return 0;
if (first_value)
{
first_value= false;
column->store_field_value((uchar*) elem, col_length);
StringBuffer<MAX_FIELD_WIDTH> val;
column->val_str(&val);
bucket_bounds.push_back(std::string(val.ptr(), val.length()));
}
if (count > bucket_capacity * (curr_bucket + 1)) if (count > bucket_capacity * (curr_bucket + 1))
{ {
column->store_field_value((uchar*) elem, col_length); column->store_field_value((uchar*) elem, col_length);
StringBuffer<MAX_FIELD_WIDTH> val; StringBuffer<MAX_FIELD_WIDTH> val;
column->val_str(&val); column->val_str(&val);
bucket_bounds.push_back(std::string(val.ptr(), val.length())); bucket_bounds.emplace_back(val.ptr(), val.length());
curr_bucket++; curr_bucket++;
while (curr_bucket != hist_width && while (curr_bucket != hist_width &&
count > bucket_capacity * (curr_bucket + 1)) count > bucket_capacity * (curr_bucket + 1))
@ -1973,6 +2030,14 @@ public:
curr_bucket++; curr_bucket++;
} }
} }
if (records == count && bucket_bounds.size() == hist_width)
{
column->store_field_value((uchar*) elem, col_length);
StringBuffer<MAX_FIELD_WIDTH> val;
column->val_str(&val);
bucket_bounds.push_back(std::string(val.ptr(), val.length()));
}
return 0; return 0;
} }
@ -1991,8 +2056,8 @@ public:
writer.end_array(); writer.end_array();
writer.end_object(); writer.end_object();
Binary_string *json_string = (Binary_string *) writer.output.get_string(); Binary_string *json_string = (Binary_string *) writer.output.get_string();
Histogram_json_hb *hist= (Histogram_json_hb*)histogram; histogram->set_json_text(bucket_bounds.size()-1,
hist->set_json_text(bucket_bounds.size(), (uchar *) json_string->c_ptr()); (uchar *) json_string->c_ptr());
} }
}; };
@ -2000,12 +2065,10 @@ public:
Histogram_builder *Histogram_json_hb::create_builder(Field *col, uint col_len, Histogram_builder *Histogram_json_hb::create_builder(Field *col, uint col_len,
ha_rows rows) ha_rows rows)
{ {
return new Histogram_builder_json(col, col_len, rows); return new Histogram_json_builder(col, col_len, rows);
} }
Histogram_base *create_histogram(MEM_ROOT *mem_root, Histogram_type hist_type, Histogram_base *create_histogram(MEM_ROOT *mem_root, Histogram_type hist_type,
THD *owner) THD *owner)
{ {
@ -2036,13 +2099,10 @@ static int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
return hist_builder->next(elem, elem_cnt); return hist_builder->next(elem, elem_cnt);
} }
int basic_stats_collector_walk(void *elem, element_count count,
static int count_distinct_single_occurence_walk(void *elem, void *arg)
element_count count, void *arg)
{ {
((ulonglong*)arg)[0]+= 1; ((Basic_stats_collector*)arg)->next(elem, count);
if (count == 1)
((ulonglong*)arg)[1]+= 1;
return 0; return 0;
} }
@ -2127,11 +2187,11 @@ public:
*/ */
void walk_tree() void walk_tree()
{ {
ulonglong counts[2] = {0, 0}; Basic_stats_collector stats_collector;
tree->walk(table_field->table, tree->walk(table_field->table, basic_stats_collector_walk,
count_distinct_single_occurence_walk, counts); (void*)&stats_collector );
distincts= counts[0]; distincts= stats_collector.get_count_distinct();
distincts_single_occurence= counts[1]; distincts_single_occurence= stats_collector.get_count_single_occurence();
} }
/* /*
@ -2147,8 +2207,9 @@ public:
tree->walk(table_field->table, histogram_build_walk, tree->walk(table_field->table, histogram_build_walk,
(void *) hist_builder); (void *) hist_builder);
hist_builder->finalize(); hist_builder->finalize();
distincts= hist_builder->get_count_distinct(); distincts= hist_builder->counters.get_count_distinct();
distincts_single_occurence= hist_builder->get_count_single_occurence(); distincts_single_occurence= hist_builder->counters.
get_count_single_occurence();
delete hist_builder; delete hist_builder;
} }

View File

@ -403,7 +403,8 @@ public:
return size; return size;
} }
void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg, ulonglong size) override; void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
ulonglong size) override;
bool is_available() override {return true; } bool is_available() override {return true; }