From c129689ddc7dc4c8fd2a36915ae4d9d50fa6d591 Mon Sep 17 00:00:00 2001 From: Michael Okoko Date: Mon, 16 Aug 2021 10:09:56 +0100 Subject: [PATCH] Use binary search to compute range selectivity * it also adds an "explain select" statement to the test so that the fprintf calls can print the computed intervals to mysqld.1.err Signed-off-by: Michael Okoko --- mysql-test/main/statistics_json.result | 32 +----- mysql-test/main/statistics_json.test | 2 +- sql/sql_statistics.cc | 147 +++++++++++++------------ sql/sql_statistics.h | 7 ++ 4 files changed, 89 insertions(+), 99 deletions(-) diff --git a/mysql-test/main/statistics_json.result b/mysql-test/main/statistics_json.result index b4da524637b..0f68398650d 100644 --- a/mysql-test/main/statistics_json.result +++ b/mysql-test/main/statistics_json.result @@ -67,33 +67,11 @@ test t1 d 1 25 0.0000 8.0000 1.0000 10 JSON [ "21", "23" ] -SELECT * FROM t1; -a b c d -1 1 1 1 -2 2 2 2 -3 3 3 3 -4 4 4 4 -5 5 5 5 -6 6 6 6 -7 7 7 7 -8 8 8 8 -9 9 9 9 -10 10 10 10 -11 11 11 11 -12 12 12 12 -13 13 13 13 -14 14 14 14 -15 15 15 15 -16 16 16 16 -17 17 17 17 -18 18 18 18 -19 19 19 19 -20 20 20 20 -21 21 21 21 -22 22 22 22 -23 23 23 23 -24 24 24 24 -25 25 25 25 +explain extended select * from t1 where b between '20' and '70'; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 25 10.00 Using where +Warnings: +Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c`,`test`.`t1`.`d` AS `d` from `test`.`t1` where `test`.`t1`.`b` between '20' and '70' UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1'; FLUSH TABLES; SELECT * FROM t1; diff --git a/mysql-test/main/statistics_json.test b/mysql-test/main/statistics_json.test index 64b3b83e5f7..d5ece058440 100644 --- a/mysql-test/main/statistics_json.test +++ b/mysql-test/main/statistics_json.test @@ -28,7 +28,7 @@ set histogram_size=10; ANALYZE TABLE t1 PERSISTENT FOR ALL; 
SELECT * FROM mysql.column_stats WHERE table_name='t1'; -SELECT * FROM t1; +explain extended select * from t1 where b between '20' and '70'; # We then test different valid JSON strings that are invalid histograms. UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1'; diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index f0e8098aa85..81a04fc29f5 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -1466,90 +1466,95 @@ double Histogram_json::range_selectivity_new(Field *field, key_range *min_endp, key_range *max_endp) { fprintf(stderr, "Histogram_json::range_selectivity_new\n"); - - - /* - GSOC-TODO: - The code below is NOT what this function have. - - == WHAT THIS CODE DOES == - At the moment it does a linear walk through histogram_bounds and compares - min_endp to each of histogram bucket's min and max. - ATTENTION: This is a demo of how key_cmp() is used to compare the values. - - When it finds the bucket such that BUCKET_START < min_endp < BUCKET_END, - it computes a position of min_endp within the bucket. - ATTENTION: calls to pos_in_interval_.... are a demo of how to compute - position of a value within a [min,max] range. - - == WHAT THIS CODE SHOULD DO == - * Use binary search to locate the range [MIN_BUCKET; MAX_BUCKET] - the - set of buckets that overlaps with the search interval {min_endp, max_endp}. - - * If the search interval covers MIN_BUCKET only partially, compute a - position of min_endp within the bucket. - - * The same for max_endp. - - * Compute the final selectivity and return it. - */ - std::string prev_s; - bool have_prev_s=false; - for (auto &s : histogram_bounds) + double min_sel, max_sel; + if (min_endp) { - if (!have_prev_s) - { - prev_s = s; - have_prev_s= true; - continue; - } + const uchar *min_key= min_endp->key; + // TODO: also, properly handle SQL NULLs. + // in this test patch, we just assume the values are not SQL NULLs. 
+ if (field->real_maybe_null()) + min_key++; - // It's a test code, so we only process min_endp. - if (min_endp) - { - const uchar *min_key= min_endp->key; - // TODO: also, properly handle SQL NULLs. - // in this test patch, we just assume the values are not SQL NULLs. - if (field->real_maybe_null()) - min_key++; - - int res1= field->key_cmp((uchar*)prev_s.data(), min_key); - const char *str1="<"; - if (res1>0) str1=">"; - if (res1==0) str1="="; - - int res2= field->key_cmp(min_key, (uchar*)s.data()); - const char *str2="<"; - if (res2>0) str2=">"; - if (res2==0) str2="="; - fprintf(stderr, "prev_bound %s min_key %s bound\n", str1, str2); - - if (res1<0 && res2 < 0) - { - double sel; - if (field->pos_through_val_str()) - sel= pos_in_interval_through_strxfrm(field, (uchar*)prev_s.data(), - (uchar*)s.data(), (uchar*)min_key); - else - sel= pos_in_interval_through_val_real(field, (uchar*)prev_s.data(), - (uchar*)s.data(), (uchar*)min_key); - - fprintf(stderr, " pos_in_interval=%g\n", sel); - } - - prev_s= s; - } + min_sel= selection_in_interval(field, min_key); + fprintf(stderr, "min pos_in_interval(min_endp)=%g\n", min_sel); } + if (max_endp) + { + const uchar *max_key= max_endp->key; + if (field->real_maybe_null()) + max_key++; + + max_sel= selection_in_interval(field, max_key); + fprintf(stderr, "max pos_in_interval(min_endp)=%g\n", max_sel); + } + fprintf(stderr, "Histogram_json::range_selectivity_new ends\n"); return 0.5; } +double Histogram_json::selection_in_interval(Field *field, const uchar* endpoint) +{ + int min_bucket_idx, max_bucket_idx; + min_bucket_idx= find_bucket(field, endpoint); + std::string min_bucket, max_bucket; + + // todo: + // this will probably trip up for cases where min_endp > the last histogram value i.e. min_bucket_idx = -1, but max_bucket_idx = 0 doesn't make sense. 
+ max_bucket_idx= min_bucket_idx + 1; + double selection = 0; + if (min_bucket_idx != -1) + { + min_bucket= histogram_bounds[min_bucket_idx]; + max_bucket= (max_bucket_idx < (int)histogram_bounds.size()) ? histogram_bounds[max_bucket_idx] : ""; + + if (field->pos_through_val_str()) + selection = pos_in_interval_through_strxfrm(field, (uchar *) min_bucket.data(), + (uchar *) max_bucket.data(), + (uchar *) endpoint); + else + selection = pos_in_interval_through_val_real(field, (uchar *) min_bucket.data(), + (uchar *) max_bucket.data(), + (uchar *) endpoint); + } + return selection; +} + void Histogram_json::serialize(Field *field) { field->store((char*)get_values(), strlen((char*)get_values()), &my_charset_bin); } +int Histogram_json::find_bucket(Field *field, const uchar *endpoint) +{ + int low = 0; + int high = (int)histogram_bounds.size()-1; + int mid; + int min_bucket_index = -1; + std::string mid_val; + + while(low <= high) { + // c++ gives us the floor of integer divisions by default, below we get the ceiling (round-up). + // it works but it doesn't feel so readable, maybe we could make improvements? 
+ int sum = (low+high); + mid = sum/2 + (sum % 2 != 0); + + mid_val = histogram_bounds[mid]; + + int res = field->key_cmp((uchar*) mid_val.data(), endpoint); + min_bucket_index = mid; + if (res < 0) { + low = mid + 1; + } else if (res > 0) { + high = mid - 1; + } else { + //todo: endpoint is on a bucket boundary + break; + } + } + return min_bucket_index; +} + /* An object of the class Index_stat is created to read statistical data on tables from the statistical table table_stat, to update diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index a8d5e338698..d91f55368c9 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -419,6 +419,13 @@ public: */ double range_selectivity_new(Field *field, key_range *min_endp, key_range *max_endp) override; + + /* + * Returns the index of the biggest histogram value that is smaller than endpoint + */ + int find_bucket(Field *field, const uchar *endpoint); + + double selection_in_interval(Field *field, const uchar* endpoint); }; class Columns_statistics;