diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index e4fee5ae7cd..024ef606c68 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -76,6 +76,8 @@ std::vector parse_histogram_from_json(const char *json); void test_parse_histogram_from_json(); +Histogram_base *create_histogram(Histogram_type hist_type); + /* Currently there are only 3 persistent statistical tables */ static const uint STATISTICS_TABLES= 3; @@ -1217,9 +1219,9 @@ public: The method assumes that the value of histogram size and the pointer to the histogram location has been already set in the fields size and values of read_stats->histogram. - */ + */ - Histogram * load_histogram(MEM_ROOT *mem_root) + Histogram_binary * load_histogram(MEM_ROOT *mem_root) { if (find_stat()) { @@ -1230,7 +1232,7 @@ public: table_field->read_stats->set_not_null(fldno); stat_field->val_str(&val); // histogram-todo: here, create the histogram of appropriate type. - Histogram *hist= new (mem_root) Histogram(); + Histogram_binary *hist= new (mem_root) Histogram_binary(); if (!hist->parse(mem_root, table_field->read_stats->histogram_type_on_disk, (const uchar*)val.ptr(), val.length())) { @@ -1244,7 +1246,7 @@ public: } }; -bool Histogram::parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr_arg, uint size_arg) +bool Histogram_binary::parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr_arg, uint size_arg) { // Just copy the data size = (uint8) size_arg; @@ -1258,7 +1260,7 @@ bool Histogram::parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar * /* Save the histogram data info a table field. 
*/ -void Histogram::serialize(Field *field) +void Histogram_binary::serialize(Field *field) { if (get_type() == JSON) { @@ -1269,7 +1271,7 @@ void Histogram::serialize(Field *field) field->store((char*)get_values(), get_size(), &my_charset_bin); } -void Histogram::init_for_collection(MEM_ROOT *mem_root, +void Histogram_binary::init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg, ulonglong size_arg) { @@ -1278,6 +1280,13 @@ void Histogram::init_for_collection(MEM_ROOT *mem_root, size= (uint8) size_arg; } + +void Histogram_json::init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg, ulonglong size_arg) +{ + type= htype_arg; + values = (uchar*)alloc_root(mem_root, size_arg); + size = (uint8) size_arg; /* NOTE(review): size_arg is narrowed to uint8 here while the buffer above is allocated with the full size_arg; confirm callers never pass more than 255 */ +} /* An object of the class Index_stat is created to read statistical data on tables from the statistical table table_stat, to update @@ -1595,13 +1604,13 @@ public: class Histogram_builder { -protected: +private: Field *column; /* table field for which the histogram is built */ uint col_length; /* size of this field */ ha_rows records; /* number of records the histogram is built for */ Field *min_value; /* pointer to the minimal value for the field */ Field *max_value; /* pointer to the maximal value for the field */ - Histogram *histogram; /* the histogram location */ + Histogram_binary *histogram; /* the histogram location */ uint hist_width; /* the number of points in the histogram */ double bucket_capacity; /* number of rows in a bucket of the histogram */ uint curr_bucket; /* number of the current bucket to be built */ @@ -1617,7 +1626,7 @@ public: Column_statistics *col_stats= col->collected_stats; min_value= col_stats->min_value; max_value= col_stats->max_value; - histogram= col_stats->histogram_; + histogram= dynamic_cast(col_stats->histogram_); /* NOTE(review): the cast result is dereferenced by the next statement without a null check; confirm histogram_ always has the expected dynamic type here */ hist_width= histogram->get_width(); bucket_capacity= (double) records / (hist_width + 1); curr_bucket= 0; @@ -1626,6 +1635,8 @@ public: count_distinct_single_occurence= 0; } + Histogram_builder() 
= default; + virtual ~Histogram_builder() = default; ulonglong get_count_distinct() const { return count_distinct; } @@ -1661,16 +1672,29 @@ public: class Histogram_builder_json : public Histogram_builder { -std::vector bucket_bounds = {}; + /* NOTE(review): the fields below repeat the private members of Histogram_builder under the same names; the constructor of this class fills only these copies and the base is default-constructed, so the base-class copies are left unset. Confirm that no inherited member function reads them. */ Field *column; /* table field for which the histogram is built */ + uint col_length; /* size of this field */ + ha_rows records; /* number of records the histogram is built for */ + Field *min_value; /* pointer to the minimal value for the field */ + Field *max_value; /* pointer to the maximal value for the field */ + Histogram_json *histogram; /* the histogram location */ + uint hist_width; /* the number of points in the histogram */ + double bucket_capacity; /* number of rows in a bucket of the histogram */ + uint curr_bucket; /* number of the current bucket to be built */ + ulonglong count; /* number of values retrieved */ + ulonglong count_distinct; /* number of distinct values retrieved */ + /* number of distinct values that occurred only once */ + ulonglong count_distinct_single_occurence; + std::vector bucket_bounds = {}; public: Histogram_builder_json(Field *col, uint col_len, ha_rows rows) - : Histogram_builder(col, col_len, rows) + : column(col), col_length(col_len), records(rows) { Column_statistics *col_stats= col->collected_stats; min_value= col_stats->min_value; max_value= col_stats->max_value; - histogram= col_stats->histogram_; + histogram= dynamic_cast(col_stats->histogram_); /* NOTE(review): the cast result is dereferenced by the next statement without a null check; confirm histogram_ always has the expected dynamic type here */ hist_width= histogram->get_width(); bucket_capacity= (double) records / (hist_width + 1); curr_bucket= 0; @@ -1718,6 +1742,15 @@ public: } }; +Histogram_base *create_histogram(Histogram_type hist_type) +{ + // assumes the caller already checked for invalid histograms + if (hist_type == JSON) + return new Histogram_json; + else + return new Histogram_binary; +} + void test_parse_histogram_from_json() { std::vector bucket = {}; @@ -1954,9 +1987,9 @@ public: @brief Get the pointer to the histogram built for table_field */ - Histogram 
*get_histogram() + Histogram_binary *get_histogram() { - return table_field->collected_stats->histogram_; + return dynamic_cast(table_field->collected_stats->histogram_); } }; @@ -2608,18 +2641,18 @@ bool Column_statistics_collected::add() /* - Create an empty Histogram object from histogram_type. + Create an empty Histogram_binary object from histogram_type. Note: it is not yet clear whether collection-time histogram should be the same as lookup-time histogram. At the moment, they are. */ -Histogram* get_histogram_by_type(MEM_ROOT *mem_root, Histogram_type hist_type) { +Histogram_binary * get_histogram_by_type(MEM_ROOT *mem_root, Histogram_type hist_type) { switch (hist_type) { case SINGLE_PREC_HB: case DOUBLE_PREC_HB: case JSON: - return new Histogram(); + return new Histogram_binary(); default: DBUG_ASSERT(0); } @@ -2660,7 +2693,7 @@ void Column_statistics_collected::finish(MEM_ROOT *mem_root, ha_rows rows, doubl if (hist_size != 0 && hist_type != INVALID_HISTOGRAM) { have_histogram= true; - histogram_= new Histogram; + histogram_= create_histogram(hist_type); histogram_->init_for_collection(mem_root, hist_type, hist_size); } @@ -4048,7 +4081,8 @@ double get_column_range_cardinality(Field *field, if (avg_frequency > 1.0 + 0.000001 && col_stats->min_max_values_are_provided()) { - Histogram *hist= col_stats->histogram_; + Histogram_binary *hist= + dynamic_cast(col_stats->histogram_); if (hist && hist->is_usable(thd)) { store_key_image_to_rec(field, (uchar *) min_endp->key, @@ -4092,7 +4126,8 @@ double get_column_range_cardinality(Field *field, else max_mp_pos= 1.0; - Histogram *hist= col_stats->histogram_; + Histogram_binary *hist= + dynamic_cast(col_stats->histogram_); if (hist && hist->is_usable(thd)) sel= hist->range_selectivity(min_mp_pos, max_mp_pos); else @@ -4143,7 +4178,7 @@ double get_column_range_cardinality(Field *field, value. 
*/ -double Histogram::point_selectivity(double pos, double avg_sel) +double Histogram_binary::point_selectivity(double pos, double avg_sel) { double sel; /* Find the bucket that contains the value 'pos'. */ @@ -4179,7 +4214,7 @@ double Histogram::point_selectivity(double pos, double avg_sel) /* The value 'pos' fits within one single histogram bucket. - Histogram buckets have the same numbers of rows, but they cover + Histogram_binary buckets have the same numbers of rows, but they cover different ranges of values. We assume that values are uniformly distributed across the [0..1] value diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index 178bc11a278..3524f7acc05 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -153,6 +153,24 @@ public: virtual void serialize(Field *to_field)= 0; virtual Histogram_type get_type()=0; + + virtual uint get_width()=0; + + virtual void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg, ulonglong size)=0; + + virtual bool is_available()=0; + + virtual bool is_usable(THD *thd)=0; + + virtual void set_values(uchar * values)=0; + + virtual uchar *get_values()=0; + + virtual void set_size(ulonglong sz)=0; + + virtual double range_selectivity(double min_pos, double max_pos)=0; + + virtual double point_selectivity(double pos, double avg_selection)=0; // Legacy: return the size of the histogram on disk. // This will be stored in mysql.column_stats.hist_size column. 
@@ -162,22 +180,21 @@ public: virtual ~Histogram_base(){} }; -class Histogram : public Histogram_base +class Histogram_binary : public Histogram_base { public: bool parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr_arg, uint size_arg) override; void serialize(Field *to_field) override; + Histogram_type get_type() override { return type; } uint get_size() override { return (uint) size; } - // returns number of buckets in the histogram - uint get_width() + uint get_width() override { switch (type) { case SINGLE_PREC_HB: - case JSON: return size; case DOUBLE_PREC_HB: return size / 2; @@ -196,7 +213,6 @@ private: { switch (type) { case SINGLE_PREC_HB: - case JSON: return ((uint) (1 << 8) - 1); case DOUBLE_PREC_HB: return ((uint) (1 << 16) - 1); @@ -211,7 +227,6 @@ private: DBUG_ASSERT(i < get_width()); switch (type) { case SINGLE_PREC_HB: - case JSON: return (uint) (((uint8 *) values)[i]); case DOUBLE_PREC_HB: return (uint) uint2korr(values + i * 2); @@ -260,22 +275,22 @@ private: return i; } - uchar *get_values() { return (uchar *) values; } + uchar *get_values() override { return (uchar *) values; } public: - void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg, ulonglong size); + void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg, ulonglong size) override; // Note: these two are used only for saving the JSON text: - void set_values (uchar *vals) { values= (uchar *) vals; } - void set_size (ulonglong sz) { size= (uint8) sz; } + void set_values (uchar *vals) override { values= (uchar *) vals; } + void set_size (ulonglong sz) override { size= (uint8) sz; } - bool is_available() { return get_size() > 0 && get_values(); } + bool is_available() override { return get_size() > 0 && get_values(); } /* This function checks that histograms should be usable only when 1) the level of optimizer_use_condition_selectivity > 3 2) histograms have been collected */ - bool is_usable(THD *thd) + bool is_usable(THD *thd) override { 
return thd->variables.optimizer_use_condition_selectivity > 3 && is_available(); @@ -285,7 +300,6 @@ public: { switch (type) { case SINGLE_PREC_HB: - case JSON: ((uint8 *) values)[i]= (uint8) (val * prec_factor()); return; case DOUBLE_PREC_HB: @@ -301,7 +315,6 @@ public: { switch (type) { case SINGLE_PREC_HB: - case JSON: ((uint8 *) values)[i]= ((uint8 *) values)[i-1]; return; case DOUBLE_PREC_HB: @@ -313,7 +326,7 @@ public: } } - double range_selectivity(double min_pos, double max_pos) + double range_selectivity(double min_pos, double max_pos) override { double sel; double bucket_sel= 1.0/(get_width() + 1); @@ -326,9 +339,54 @@ public: /* Estimate selectivity of "col=const" using a histogram */ - double point_selectivity(double pos, double avg_sel); + double point_selectivity(double pos, double avg_sel) override; }; +class Histogram_json : public Histogram_base +{ +private: + Histogram_type type; + uint8 size; /* Number of elements in the histogram*/ + uchar *values; + +public: + bool parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr, uint size) override {return false;} + + void serialize(Field *to_field) override{} + + uint get_size() override {return (uint) size;} + + // returns number of buckets in the histogram + uint get_width() override + { + return size; + }; + + Histogram_type get_type() override + { + return JSON; + } + + void set_size (ulonglong sz) override {size = (uint8) sz; } + + void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg, ulonglong size) override; + + bool is_available() override {return get_size() > 0 && get_values(); } + + bool is_usable(THD *thd) override + { + return thd->variables.optimizer_use_condition_selectivity > 3 && + is_available(); + } + + void set_values (uchar *vals) override { values= (uchar *) vals; } + + uchar *get_values() override { return (uchar *) values; } + + double range_selectivity(double min_pos, double max_pos) override {return 0.1;} + + double point_selectivity(double pos, 
double avg_selection) override {return 0.5;} +}; class Columns_statistics; class Index_statistics; @@ -411,7 +469,7 @@ private: public: Histogram_type histogram_type_on_disk; - Histogram *histogram_; + Histogram_base *histogram_; uint32 no_values_provided_bitmap() {