MDEV-26886: Estimation for filtered rows less precise with JSON histogram

- Make Histogram_json_hb::range_selectivity handle singleton buckets
  specially when computing selectivity of the max. endpoint bound.
  (for min. endpoint, we already do that).

- Also, fixed comments for Histogram_json_hb::find_bucket
This commit is contained in:
Sergei Petrunia 2021-11-26 20:03:08 +03:00
parent 106c785e2d
commit eb6a9ad705
4 changed files with 84 additions and 29 deletions

View File

@ -7658,3 +7658,28 @@ test.t1 analyze status OK
ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
INSERT INTO t1 (f) VALUES ('bar');
DROP TABLE t1;
#
# MDEV-26886: Estimation for filtered rows less precise with JSON histogram
#
create table t1 (a tinyint) as select if(seq%3,seq,0) as a from seq_1_to_100;
select count(*) from t1 where a <= 0;
count(*)
33
set histogram_type = JSON_HB, histogram_size=default;
analyze table t1 persistent for all;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
analyze select * from t1 where a <= 0;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 33.00 33.00 Using where
analyze select * from t1 where a < 0;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 1.47 0.00 Using where
analyze select * from t1 where a > 0;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 67.00 67.00 Using where
analyze select * from t1 where a >= 0;
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 100.00 100.00 Using where
drop table t1;

View File

@ -340,3 +340,17 @@ ANALYZE TABLE t1 PERSISTENT FOR ALL;
ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
INSERT INTO t1 (f) VALUES ('bar');
DROP TABLE t1;
--echo #
--echo # MDEV-26886: Estimation for filtered rows less precise with JSON histogram
--echo #
create table t1 (a tinyint) as select if(seq%3,seq,0) as a from seq_1_to_100;
select count(*) from t1 where a <= 0;
set histogram_type = JSON_HB, histogram_size=default;
analyze table t1 persistent for all;
analyze select * from t1 where a <= 0;
analyze select * from t1 where a < 0;
analyze select * from t1 where a > 0;
analyze select * from t1 where a >= 0;
drop table t1;

View File

@ -743,9 +743,22 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
idx--;
}
double left_fract= get_left_fract(idx);
double sel= position_in_interval(field, max_key, max_key_len,
buckets[idx].start_value,
get_end_value(idx));
double sel;
/* Special handling for singleton buckets */
if (buckets[idx].ndv == 1 && equal)
{
if (inclusive_endp)
sel= 1.0;
else
sel= 0.0;
}
else
{
sel= position_in_interval(field, max_key, max_key_len,
buckets[idx].start_value,
get_end_value(idx));
}
max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
}
else
@ -763,26 +776,18 @@ void Histogram_json_hb::serialize(Field *field)
/*
Find the rightmost histogram bucket such that "lookup_val $GT start_value".
@brief
Find the leftmost histogram bucket such that "lookup_val >= start_value".
$GT is either '>' or '>=' depending on equal_is_less parameter.
@param field Field object (used to do value comparisons)
@param lookup_val The lookup value in KeyTupleFormat.
@param equal OUT TRUE<=> the found bucket has left_bound=lookup_val
@param equal_is_less Controls what to do if a histogram bound is equal to the
lookup_val.
@detail
Possible cases:
1. The regular case: the value falls into some bucket.
2. The value is less than the minimum of the first bucket
3. The value is greater than the maximum of the last bucket
In these cases we "clip" to the first/last bucket.
4. The value hits the bucket boundary. Then, we need to know whether the
point of interest is to the left the constant, or to the right of it.
@return
The bucket index
*/
int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
bool *equal)
{
int res;
@ -797,7 +802,8 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
if (!res)
{
*equal= true;
return middle;
low= middle;
goto end;
}
else if (res < 0)
low= middle;
@ -806,25 +812,25 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
}
/*
If low and high were assigned a value in the above loop, then they are not
equal to the lookup value:
If low and high were assigned a value in the above loop and we got here,
then the following holds:
bucket[low] < lookup_val < bucket[high]
bucket[low].start_value < lookup_val < bucket[high].start_value
But there are two special cases: low=0 and high=last_bucket. Handle them
below.
Besides that, there are two special cases: low=0 and high=last_bucket.
Handle them below.
*/
if (low == 0)
{
res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
if (!res)
*equal= true;
else if (res < 0)
else if (res < 0) // buckets[0] < lookup_val
{
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
if (!res)
*equal= true;
if (res >= 0)
if (res <= 0) // buckets[high] <= lookup_val
low= high;
}
}
@ -833,9 +839,19 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
if (!res)
*equal= true;
if (res >= 0)
if (res <= 0)
low= high;
}
end:
// Verification: *equal==TRUE <=> lookup value is equal to the found bucket.
DBUG_ASSERT(*equal == !(field->key_cmp((uchar*)buckets[low].start_value.data(),
lookup_val)));
// buckets[low] <= lookup_val, with one exception of the first bucket.
DBUG_ASSERT(low == 0 ||
field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0);
// buckets[low+1] > lookup_val, with one exception of the last bucket
DBUG_ASSERT(low == (int)buckets.size()-1 ||
field->key_cmp((uchar*)buckets[low+1].start_value.data(), lookup_val)> 0);
return low;
}

View File

@ -124,6 +124,6 @@ public:
private:
double get_left_fract(int idx);
std::string& get_end_value(int idx);
int find_bucket(Field *field, const uchar *lookup_val, bool *equal);
int find_bucket(const Field *field, const uchar *lookup_val, bool *equal);
};