diff --git a/mysql-test/main/statistics_json.result b/mysql-test/main/statistics_json.result index a54d5fd4151..0a58faa3143 100644 --- a/mysql-test/main/statistics_json.result +++ b/mysql-test/main/statistics_json.result @@ -4631,12 +4631,12 @@ test t1_json a a-0 a-9 0.0000 3.0000 1.0000 10 JSON_HB { } explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz'; id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 68.71 Using where +1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 60.00 Using where Warnings: Note 1003 select `test`.`t1_json`.`a` AS `a` from `test`.`t1_json` where `test`.`t1_json`.`a` between 'a-3a' and 'zzzzzzzzz' analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz'; id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra -1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 68.71 60.00 Using where +1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 60.00 60.00 Using where explain extended select * from t1_json where a < 'b-1a'; id select_type table type possible_keys key key_len ref rows filtered Extra 1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 100.00 Using where @@ -8014,7 +8014,7 @@ test.t1 analyze status OK analyze select c from t1 where c > '1'; id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra -1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 80.47 75.00 Using where +1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 75.00 75.00 Using where drop table t1; # # MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values @@ -8211,3 +8211,33 @@ analyze select COUNT(*) FROM t1 WHERE a < 'a'; id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra 1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 50.00 50.00 Using where drop table t1; +# +# MDEV-27229: Estimation for filtered rows less precise ... 
#5 +# +create table t1 (id int, a varchar(8)); +insert into t1 select seq, 'bar' from seq_1_to_100; +insert into t1 select id, 'qux' from t1; +set histogram_type=JSON_HB; +analyze table t1 persistent for all; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK +analyze select COUNT(*) FROM t1 WHERE a > 'foo'; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where +analyze select COUNT(*) FROM t1 WHERE a > 'aaa'; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where +analyze select COUNT(*) FROM t1 WHERE a >='aaa'; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where +analyze select COUNT(*) FROM t1 WHERE a > 'bar'; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where +analyze select COUNT(*) FROM t1 WHERE a >='bar'; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where +analyze select COUNT(*) FROM t1 WHERE a <='bar'; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where +drop table t1; diff --git a/mysql-test/main/statistics_json.test b/mysql-test/main/statistics_json.test index 024cb55e540..2b19ff14a2f 100644 --- a/mysql-test/main/statistics_json.test +++ b/mysql-test/main/statistics_json.test @@ -390,3 +390,29 @@ analyze table t1 persistent for all; analyze select COUNT(*) FROM t1 WHERE a <> 'a'; analyze select COUNT(*) FROM 
t1 WHERE a < 'a'; drop table t1; + +--echo # +--echo # MDEV-27229: Estimation for filtered rows less precise ... #5 +--echo # +create table t1 (id int, a varchar(8)); +insert into t1 select seq, 'bar' from seq_1_to_100; +insert into t1 select id, 'qux' from t1; + +set histogram_type=JSON_HB; +analyze table t1 persistent for all; +analyze select COUNT(*) FROM t1 WHERE a > 'foo'; + +analyze select COUNT(*) FROM t1 WHERE a > 'aaa'; +analyze select COUNT(*) FROM t1 WHERE a >='aaa'; + +analyze select COUNT(*) FROM t1 WHERE a > 'bar'; +analyze select COUNT(*) FROM t1 WHERE a >='bar'; + +# Can enable these after get_avg_frequency issue is resolved: +# analyze select COUNT(*) FROM t1 WHERE a < 'aaa'; +# analyze select COUNT(*) FROM t1 WHERE a <='aaa'; +# analyze select COUNT(*) FROM t1 WHERE a < 'bar'; + +analyze select COUNT(*) FROM t1 WHERE a <='bar'; + +drop table t1; diff --git a/sql/opt_histogram_json.cc b/sql/opt_histogram_json.cc index 2ee6cd73dbe..7c037183f41 100644 --- a/sql/opt_histogram_json.cc +++ b/sql/opt_histogram_json.cc @@ -910,12 +910,12 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint, // If the value is outside of the histogram's range, this will "clip" it to // first or last bucket. - bool equal; - int idx= find_bucket(field, key, &equal); + int endp_cmp; + int idx= find_bucket(field, key, &endp_cmp); double sel; - if (buckets[idx].ndv == 1 && !equal) + if (buckets[idx].ndv == 1 && (endp_cmp!=0)) { /* The bucket has a single value and it doesn't match! Return a very @@ -979,22 +979,27 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp, // Find the leftmost bucket that contains the lookup value. 
// (If the lookup value is to the left of all buckets, find bucket #0) - bool equal; - int idx= find_bucket(field, min_key, &equal); - if (equal && exclusive_endp && buckets[idx].ndv==1 && - idx < (int)buckets.size()-1) + int endp_cmp; + int idx= find_bucket(field, min_key, &endp_cmp); + + double sel; + // Special handling for buckets with ndv=1: + if (buckets[idx].ndv == 1) { - /* - The range is "col > $CONST" and we've found a bucket that contains - only the value $CONST. Move to the next bucket. - */ - idx++; + if (endp_cmp < 0) + sel= 0.0; + else if (endp_cmp > 0) + sel= 1.0; + else // endp_cmp == 0 + sel= (exclusive_endp)? 1.0 : 0.0; + } + else + { + sel= position_in_interval(field, min_key, min_key_len, + buckets[idx].start_value, + get_end_value(idx)); } double left_fract= get_left_fract(idx); - double sel= position_in_interval(field, min_key, min_key_len, - buckets[idx].start_value, - get_end_value(idx)); - min= left_fract + sel * (buckets[idx].cum_fract - left_fract); } else @@ -1012,28 +1017,35 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp, max_key++; max_key_len--; } - bool equal; - int idx= find_bucket(field, max_key, &equal); + int endp_cmp; + int idx= find_bucket(field, max_key, &endp_cmp); - if (equal && !inclusive_endp && idx > 0) + if ((endp_cmp == 0) && !inclusive_endp) { /* The range is "col < $CONST" and we've found a bucket starting with - $CONST. Move to the previous bucket. + $CONST. */ - idx--; - equal= false; - } - double left_fract= get_left_fract(idx); - - double sel; - /* Special handling for singleton buckets */ - if (buckets[idx].ndv == 1 && equal) - { - if (inclusive_endp) - sel= 1.0; + if (idx > 0) + { + // Move to the previous bucket + endp_cmp= 1; + idx--; + } else + endp_cmp= -1; + } + double sel; + + // Special handling for buckets with ndv=1: + if (buckets[idx].ndv == 1) + { + if (endp_cmp < 0) sel= 0.0; + else if (endp_cmp > 0) + sel= 1.0; + else // endp_cmp == 0 + sel= inclusive_endp? 
1.0 : 0.0; } else { @@ -1041,13 +1053,13 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp, buckets[idx].start_value, get_end_value(idx)); } + double left_fract= get_left_fract(idx); max= left_fract + sel * (buckets[idx].cum_fract - left_fract); } else max= 1.0; - double sel = max - min; - return sel; + return max - min; } @@ -1057,25 +1069,37 @@ void Histogram_json_hb::serialize(Field *field) } +static int SGN(int x) +{ + if (!x) + return 0; + return (x < 0)? -1 : 1; +} + + /* @brief Find the leftmost histogram bucket such that "lookup_val >= start_value". @param field Field object (used to do value comparisons) @param lookup_val The lookup value in KeyTupleFormat. - @param equal OUT TRUE<=> the found bucket has left_bound=lookup_val - + @param cmp OUT How the lookup_val compares to found_bucket.left_bound: + 0 - lookup_val == bucket.left_bound + >0 - lookup_val > bucket.left_bound (the most typical) + <0 - lookup_val < bucket.left_bound. This can only happen + for the first bucket, for all other buckets we would just + pick the previous bucket and have cmp>=0. 
@return The bucket index */ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val, - bool *equal) + int *cmp) { int res; int low= 0; int high= (int)buckets.size() - 1; - *equal= false; + *cmp= 1; // By default, (bucket[retval].start_value < *lookup_val) while (low + 1 < high) { @@ -1083,7 +1107,7 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val, res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val); if (!res) { - *equal= true; + *cmp= res; low= middle; goto end; } @@ -1104,31 +1128,44 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val, */ if (low == 0) { - res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val); - if (!res) - *equal= true; - else if (res < 0) // buckets[0] < lookup_val + res= field->key_cmp(lookup_val, (uchar*)buckets[0].start_value.data()); + if (res <= 0) + *cmp= res; + else // res>0, lookup_val > buckets[0].start_value { - res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val); - if (!res) - *equal= true; - if (res <= 0) // buckets[high] <= lookup_val + res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data()); + if (res >= 0) // lookup_val >= buckets[high].start_value + { + // Move to that bucket low= high; + *cmp= res; + } + else + *cmp= 1; } } else if (high == (int)buckets.size() - 1) { - res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val); - if (!res) - *equal= true; - if (res <= 0) + res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data()); + if (res >= 0) + { + // Ok the value is in the last bucket. + *cmp= res; low= high; + } + else + { + // The value is in the 'low' bucket. + res= field->key_cmp(lookup_val, (uchar*)buckets[low].start_value.data()); + *cmp= res; + } } end: - // Verification: *equal==TRUE <=> lookup value is equal to the found bucket. 
- DBUG_ASSERT(*equal == !(field->key_cmp((uchar*)buckets[low].start_value.data(), - lookup_val))); + // Verification: *cmp has correct value + DBUG_ASSERT(SGN(*cmp) == + SGN(field->key_cmp(lookup_val, + (uchar*)buckets[low].start_value.data()))); // buckets[low] <= lookup_val, with one exception of the first bucket. DBUG_ASSERT(low == 0 || field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0); diff --git a/sql/opt_histogram_json.h b/sql/opt_histogram_json.h index 327c852db98..9e32e70f7fc 100644 --- a/sql/opt_histogram_json.h +++ b/sql/opt_histogram_json.h @@ -144,6 +144,6 @@ private: double get_left_fract(int idx); std::string& get_end_value(int idx); - int find_bucket(const Field *field, const uchar *lookup_val, bool *equal); + int find_bucket(const Field *field, const uchar *lookup_val, int *cmp); };