MDEV-26886: Estimation for filtered rows less precise with JSON histogram
- Make Histogram_json_hb::range_selectivity handle singleton buckets specially when computing the selectivity of the max. endpoint bound (for the min. endpoint, we already do that).
- Also, fix the comments for Histogram_json_hb::find_bucket.
This commit is contained in:
parent
106c785e2d
commit
eb6a9ad705
@ -7658,3 +7658,28 @@ test.t1 analyze status OK
|
|||||||
ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
|
ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
|
||||||
INSERT INTO t1 (f) VALUES ('bar');
|
INSERT INTO t1 (f) VALUES ('bar');
|
||||||
DROP TABLE t1;
|
DROP TABLE t1;
|
||||||
|
#
|
||||||
|
# MDEV-26886: Estimation for filtered rows less precise with JSON histogram
|
||||||
|
#
|
||||||
|
create table t1 (a tinyint) as select if(seq%3,seq,0) as a from seq_1_to_100;
|
||||||
|
select count(*) from t1 where a <= 0;
|
||||||
|
count(*)
|
||||||
|
33
|
||||||
|
set histogram_type = JSON_HB, histogram_size=default;
|
||||||
|
analyze table t1 persistent for all;
|
||||||
|
Table Op Msg_type Msg_text
|
||||||
|
test.t1 analyze status Engine-independent statistics collected
|
||||||
|
test.t1 analyze status OK
|
||||||
|
analyze select * from t1 where a <= 0;
|
||||||
|
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
|
||||||
|
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 33.00 33.00 Using where
|
||||||
|
analyze select * from t1 where a < 0;
|
||||||
|
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
|
||||||
|
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 1.47 0.00 Using where
|
||||||
|
analyze select * from t1 where a > 0;
|
||||||
|
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
|
||||||
|
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 67.00 67.00 Using where
|
||||||
|
analyze select * from t1 where a >= 0;
|
||||||
|
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
|
||||||
|
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 100.00 100.00 Using where
|
||||||
|
drop table t1;
|
||||||
|
@ -340,3 +340,17 @@ ANALYZE TABLE t1 PERSISTENT FOR ALL;
|
|||||||
ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
|
ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
|
||||||
INSERT INTO t1 (f) VALUES ('bar');
|
INSERT INTO t1 (f) VALUES ('bar');
|
||||||
DROP TABLE t1;
|
DROP TABLE t1;
|
||||||
|
|
||||||
|
--echo #
|
||||||
|
--echo # MDEV-26886: Estimation for filtered rows less precise with JSON histogram
|
||||||
|
--echo #
|
||||||
|
create table t1 (a tinyint) as select if(seq%3,seq,0) as a from seq_1_to_100;
|
||||||
|
select count(*) from t1 where a <= 0;
|
||||||
|
|
||||||
|
set histogram_type = JSON_HB, histogram_size=default;
|
||||||
|
analyze table t1 persistent for all;
|
||||||
|
analyze select * from t1 where a <= 0;
|
||||||
|
analyze select * from t1 where a < 0;
|
||||||
|
analyze select * from t1 where a > 0;
|
||||||
|
analyze select * from t1 where a >= 0;
|
||||||
|
drop table t1;
|
||||||
|
@ -743,9 +743,22 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
|
|||||||
idx--;
|
idx--;
|
||||||
}
|
}
|
||||||
double left_fract= get_left_fract(idx);
|
double left_fract= get_left_fract(idx);
|
||||||
double sel= position_in_interval(field, max_key, max_key_len,
|
|
||||||
buckets[idx].start_value,
|
double sel;
|
||||||
get_end_value(idx));
|
/* Special handling for singleton buckets */
|
||||||
|
if (buckets[idx].ndv == 1 && equal)
|
||||||
|
{
|
||||||
|
if (inclusive_endp)
|
||||||
|
sel= 1.0;
|
||||||
|
else
|
||||||
|
sel= 0.0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sel= position_in_interval(field, max_key, max_key_len,
|
||||||
|
buckets[idx].start_value,
|
||||||
|
get_end_value(idx));
|
||||||
|
}
|
||||||
max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
|
max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -763,26 +776,18 @@ void Histogram_json_hb::serialize(Field *field)
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Find the rightmost histogram bucket such that "lookup_val $GT start_value".
|
@brief
|
||||||
|
Find the rightmost histogram bucket such that "lookup_val >= start_value".
|
||||||
|
|
||||||
$GT is either '>' or '>=' depending on equal_is_less parameter.
|
@param field Field object (used to do value comparisons)
|
||||||
|
@param lookup_val The lookup value in KeyTupleFormat.
|
||||||
|
@param equal OUT TRUE <=> the found bucket has start_value = lookup_val
|
||||||
|
|
||||||
@param equal_is_less Controls what to do if a histogram bound is equal to the
|
@return
|
||||||
lookup_val.
|
The bucket index
|
||||||
|
|
||||||
@detail
|
|
||||||
Possible cases:
|
|
||||||
1. The regular case: the value falls into some bucket.
|
|
||||||
|
|
||||||
2. The value is less than the minimum of the first bucket
|
|
||||||
3. The value is greater than the maximum of the last bucket
|
|
||||||
In these cases we "clip" to the first/last bucket.
|
|
||||||
|
|
||||||
4. The value hits the bucket boundary. Then, we need to know whether the
|
|
||||||
point of interest is to the left of the constant, or to the right of it.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
|
int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
|
||||||
bool *equal)
|
bool *equal)
|
||||||
{
|
{
|
||||||
int res;
|
int res;
|
||||||
@ -797,7 +802,8 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
|
|||||||
if (!res)
|
if (!res)
|
||||||
{
|
{
|
||||||
*equal= true;
|
*equal= true;
|
||||||
return middle;
|
low= middle;
|
||||||
|
goto end;
|
||||||
}
|
}
|
||||||
else if (res < 0)
|
else if (res < 0)
|
||||||
low= middle;
|
low= middle;
|
||||||
@ -806,25 +812,25 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
If low and high were assigned a value in the above loop, then they are not
|
If low and high were assigned a value in the above loop and we got here,
|
||||||
equal to the lookup value:
|
then the following holds:
|
||||||
|
|
||||||
bucket[low] < lookup_val < bucket[high]
|
bucket[low].start_value < lookup_val < bucket[high].start_value
|
||||||
|
|
||||||
But there are two special cases: low=0 and high=last_bucket. Handle them
|
Besides that, there are two special cases: low=0 and high=last_bucket.
|
||||||
below.
|
Handle them below.
|
||||||
*/
|
*/
|
||||||
if (low == 0)
|
if (low == 0)
|
||||||
{
|
{
|
||||||
res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
|
res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
|
||||||
if (!res)
|
if (!res)
|
||||||
*equal= true;
|
*equal= true;
|
||||||
else if (res < 0)
|
else if (res < 0) // buckets[0] < lookup_val
|
||||||
{
|
{
|
||||||
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
|
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
|
||||||
if (!res)
|
if (!res)
|
||||||
*equal= true;
|
*equal= true;
|
||||||
if (res >= 0)
|
if (res <= 0) // buckets[high] <= lookup_val
|
||||||
low= high;
|
low= high;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -833,9 +839,19 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
|
|||||||
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
|
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
|
||||||
if (!res)
|
if (!res)
|
||||||
*equal= true;
|
*equal= true;
|
||||||
if (res >= 0)
|
if (res <= 0)
|
||||||
low= high;
|
low= high;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
end:
|
||||||
|
// Verification: *equal==TRUE <=> lookup value is equal to the found bucket.
|
||||||
|
DBUG_ASSERT(*equal == !(field->key_cmp((uchar*)buckets[low].start_value.data(),
|
||||||
|
lookup_val)));
|
||||||
|
// buckets[low] <= lookup_val, with one exception of the first bucket.
|
||||||
|
DBUG_ASSERT(low == 0 ||
|
||||||
|
field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0);
|
||||||
|
// buckets[low+1] > lookup_val, with one exception of the last bucket
|
||||||
|
DBUG_ASSERT(low == (int)buckets.size()-1 ||
|
||||||
|
field->key_cmp((uchar*)buckets[low+1].start_value.data(), lookup_val)> 0);
|
||||||
return low;
|
return low;
|
||||||
}
|
}
|
||||||
|
@ -124,6 +124,6 @@ public:
|
|||||||
private:
|
private:
|
||||||
double get_left_fract(int idx);
|
double get_left_fract(int idx);
|
||||||
std::string& get_end_value(int idx);
|
std::string& get_end_value(int idx);
|
||||||
int find_bucket(Field *field, const uchar *lookup_val, bool *equal);
|
int find_bucket(const Field *field, const uchar *lookup_val, bool *equal);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user