Use binary search to compute range selectivity
* It also adds an "explain select" statement to the test so that the fprintf calls can print the computed intervals to mysqld.1.err.
Signed-off-by: Michael Okoko <okokomichaels@outlook.com>
This commit is contained in:
parent
c605285bb8
commit
c129689ddc
@ -67,33 +67,11 @@ test t1 d 1 25 0.0000 8.0000 1.0000 10 JSON [
|
|||||||
"21",
|
"21",
|
||||||
"23"
|
"23"
|
||||||
]
|
]
|
||||||
SELECT * FROM t1;
|
explain extended select * from t1 where b between '20' and '70';
|
||||||
a b c d
|
id select_type table type possible_keys key key_len ref rows filtered Extra
|
||||||
1 1 1 1
|
1 SIMPLE t1 ALL NULL NULL NULL NULL 25 10.00 Using where
|
||||||
2 2 2 2
|
Warnings:
|
||||||
3 3 3 3
|
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c`,`test`.`t1`.`d` AS `d` from `test`.`t1` where `test`.`t1`.`b` between '20' and '70'
|
||||||
4 4 4 4
|
|
||||||
5 5 5 5
|
|
||||||
6 6 6 6
|
|
||||||
7 7 7 7
|
|
||||||
8 8 8 8
|
|
||||||
9 9 9 9
|
|
||||||
10 10 10 10
|
|
||||||
11 11 11 11
|
|
||||||
12 12 12 12
|
|
||||||
13 13 13 13
|
|
||||||
14 14 14 14
|
|
||||||
15 15 15 15
|
|
||||||
16 16 16 16
|
|
||||||
17 17 17 17
|
|
||||||
18 18 18 18
|
|
||||||
19 19 19 19
|
|
||||||
20 20 20 20
|
|
||||||
21 21 21 21
|
|
||||||
22 22 22 22
|
|
||||||
23 23 23 23
|
|
||||||
24 24 24 24
|
|
||||||
25 25 25 25
|
|
||||||
UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';
|
UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';
|
||||||
FLUSH TABLES;
|
FLUSH TABLES;
|
||||||
SELECT * FROM t1;
|
SELECT * FROM t1;
|
||||||
|
@ -28,7 +28,7 @@ set histogram_size=10;
|
|||||||
|
|
||||||
ANALYZE TABLE t1 PERSISTENT FOR ALL;
|
ANALYZE TABLE t1 PERSISTENT FOR ALL;
|
||||||
SELECT * FROM mysql.column_stats WHERE table_name='t1';
|
SELECT * FROM mysql.column_stats WHERE table_name='t1';
|
||||||
SELECT * FROM t1;
|
explain extended select * from t1 where b between '20' and '70';
|
||||||
|
|
||||||
# We then test different valid JSON strings that are invalid histograms.
|
# We then test different valid JSON strings that are invalid histograms.
|
||||||
UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';
|
UPDATE mysql.column_stats SET histogram='["1", {"a": "b"}, "2"]' WHERE table_name='t1';
|
||||||
|
@ -1466,45 +1466,7 @@ double Histogram_json::range_selectivity_new(Field *field, key_range *min_endp,
|
|||||||
key_range *max_endp)
|
key_range *max_endp)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Histogram_json::range_selectivity_new\n");
|
fprintf(stderr, "Histogram_json::range_selectivity_new\n");
|
||||||
|
double min_sel, max_sel;
|
||||||
|
|
||||||
/*
|
|
||||||
GSOC-TODO:
|
|
||||||
The code below is NOT what this function have.
|
|
||||||
|
|
||||||
== WHAT THIS CODE DOES ==
|
|
||||||
At the moment it does a linear walk through histogram_bounds and compares
|
|
||||||
min_endp to each of histogram bucket's min and max.
|
|
||||||
ATTENTION: This is a demo of how key_cmp() is used to compare the values.
|
|
||||||
|
|
||||||
When it finds the bucket such that BUCKET_START < min_endp < BUCKET_END,
|
|
||||||
it computes a position of min_endp within the bucket.
|
|
||||||
ATTENTION: calls to pos_in_interval_.... are a demo of how to compute
|
|
||||||
position of a value within a [min,max] range.
|
|
||||||
|
|
||||||
== WHAT THIS CODE SHOULD DO ==
|
|
||||||
* Use binary search to locate the range [MIN_BUCKET; MAX_BUCKET] - the
|
|
||||||
set of buckets that overlaps with the search interval {min_endp, max_endp}.
|
|
||||||
|
|
||||||
* If the search interval covers MIN_BUCKET only partially, compute a
|
|
||||||
position of min_endp within the bucket.
|
|
||||||
|
|
||||||
* The same for max_endp.
|
|
||||||
|
|
||||||
* Compute the final selectivity and return it.
|
|
||||||
*/
|
|
||||||
std::string prev_s;
|
|
||||||
bool have_prev_s=false;
|
|
||||||
for (auto &s : histogram_bounds)
|
|
||||||
{
|
|
||||||
if (!have_prev_s)
|
|
||||||
{
|
|
||||||
prev_s = s;
|
|
||||||
have_prev_s= true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// It's a test code, so we only process min_endp.
|
|
||||||
if (min_endp)
|
if (min_endp)
|
||||||
{
|
{
|
||||||
const uchar *min_key= min_endp->key;
|
const uchar *min_key= min_endp->key;
|
||||||
@ -1513,43 +1475,86 @@ double Histogram_json::range_selectivity_new(Field *field, key_range *min_endp,
|
|||||||
if (field->real_maybe_null())
|
if (field->real_maybe_null())
|
||||||
min_key++;
|
min_key++;
|
||||||
|
|
||||||
int res1= field->key_cmp((uchar*)prev_s.data(), min_key);
|
min_sel= selection_in_interval(field, min_key);
|
||||||
const char *str1="<";
|
fprintf(stderr, "min pos_in_interval(min_endp)=%g\n", min_sel);
|
||||||
if (res1>0) str1=">";
|
}
|
||||||
if (res1==0) str1="=";
|
if (max_endp)
|
||||||
|
|
||||||
int res2= field->key_cmp(min_key, (uchar*)s.data());
|
|
||||||
const char *str2="<";
|
|
||||||
if (res2>0) str2=">";
|
|
||||||
if (res2==0) str2="=";
|
|
||||||
fprintf(stderr, "prev_bound %s min_key %s bound\n", str1, str2);
|
|
||||||
|
|
||||||
if (res1<0 && res2 < 0)
|
|
||||||
{
|
{
|
||||||
double sel;
|
const uchar *max_key= max_endp->key;
|
||||||
if (field->pos_through_val_str())
|
if (field->real_maybe_null())
|
||||||
sel= pos_in_interval_through_strxfrm(field, (uchar*)prev_s.data(),
|
max_key++;
|
||||||
(uchar*)s.data(), (uchar*)min_key);
|
|
||||||
else
|
|
||||||
sel= pos_in_interval_through_val_real(field, (uchar*)prev_s.data(),
|
|
||||||
(uchar*)s.data(), (uchar*)min_key);
|
|
||||||
|
|
||||||
fprintf(stderr, " pos_in_interval=%g\n", sel);
|
max_sel= selection_in_interval(field, max_key);
|
||||||
|
fprintf(stderr, "max pos_in_interval(min_endp)=%g\n", max_sel);
|
||||||
}
|
}
|
||||||
|
|
||||||
prev_s= s;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fprintf(stderr, "Histogram_json::range_selectivity_new ends\n");
|
fprintf(stderr, "Histogram_json::range_selectivity_new ends\n");
|
||||||
return 0.5;
|
return 0.5;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
  Compute the relative position of an endpoint inside the histogram bucket
  that find_bucket() selects for it.

  @param field     Field whose comparison/position helpers are used
  @param endpoint  Key image of the value to position

  @return
    0    if find_bucket() reports no bucket (-1);
    1.0  if the selected bound is the last element of histogram_bounds, i.e.
         there is no upper bound to interpolate against;
    otherwise the value computed by pos_in_interval_through_strxfrm() or
    pos_in_interval_through_val_real() for the bucket
    [histogram_bounds[idx], histogram_bounds[idx + 1]].
*/
double Histogram_json::selection_in_interval(Field *field, const uchar* endpoint)
{
  const int lower_idx= find_bucket(field, endpoint);
  if (lower_idx < 0)
    return 0;

  const int upper_idx= lower_idx + 1;
  /*
    No bound follows the selected one. Passing the data() of an empty string
    as a key image would make the pos_in_interval_* helpers read past a
    one-byte buffer, so do not interpolate at all in this case.
  */
  if (upper_idx >= (int) histogram_bounds.size())
    return 1.0;

  // Bind by reference: the bounds are only read, no need to copy them.
  const std::string &lower_bound= histogram_bounds[lower_idx];
  const std::string &upper_bound= histogram_bounds[upper_idx];

  if (field->pos_through_val_str())
    return pos_in_interval_through_strxfrm(field,
                                           (uchar *) lower_bound.data(),
                                           (uchar *) upper_bound.data(),
                                           (uchar *) endpoint);

  return pos_in_interval_through_val_real(field,
                                          (uchar *) lower_bound.data(),
                                          (uchar *) upper_bound.data(),
                                          (uchar *) endpoint);
}
|
||||||
|
|
||||||
void Histogram_json::serialize(Field *field)
|
void Histogram_json::serialize(Field *field)
|
||||||
{
|
{
|
||||||
field->store((char*)get_values(), strlen((char*)get_values()),
|
field->store((char*)get_values(), strlen((char*)get_values()),
|
||||||
&my_charset_bin);
|
&my_charset_bin);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
  Binary-search histogram_bounds for the bound an endpoint falls on or
  directly after.

  @param field     Field used to compare key images through key_cmp()
  @param endpoint  Key image of the value being looked up

  @return
    Index of a bound that compares equal to endpoint if the search hits one;
    otherwise the index of the last bound that compares less than endpoint;
    -1 if histogram_bounds is empty or every bound compares greater than
    endpoint.
*/
int Histogram_json::find_bucket(Field *field, const uchar *endpoint)
{
  int low= 0;
  int high= (int) histogram_bounds.size() - 1;
  int found= -1;   // best candidate so far: last bound known to be < endpoint

  while (low <= high)
  {
    const int mid= low + (high - low) / 2;
    const int res= field->key_cmp((uchar*) histogram_bounds[mid].data(),
                                  endpoint);
    if (res < 0)
    {
      // Bound is below the endpoint: remember it and look further right.
      found= mid;
      low= mid + 1;
    }
    else if (res > 0)
    {
      // Bound is above the endpoint: it can never be the answer.
      high= mid - 1;
    }
    else
    {
      // Endpoint lies exactly on a bucket boundary.
      return mid;
    }
  }
  return found;
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
An object of the class Index_stat is created to read statistical
|
An object of the class Index_stat is created to read statistical
|
||||||
data on tables from the statistical table table_stat, to update
|
data on tables from the statistical table table_stat, to update
|
||||||
|
@ -419,6 +419,13 @@ public:
|
|||||||
*/
|
*/
|
||||||
double range_selectivity_new(Field *field, key_range *min_endp,
|
double range_selectivity_new(Field *field, key_range *min_endp,
|
||||||
key_range *max_endp) override;
|
key_range *max_endp) override;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns the index of the biggest histogram value that is smaller than endpoint
|
||||||
|
*/
|
||||||
|
int find_bucket(Field *field, const uchar *endpoint);
|
||||||
|
|
||||||
|
double selection_in_interval(Field *field, const uchar* endpoint);
|
||||||
};
|
};
|
||||||
|
|
||||||
class Columns_statistics;
|
class Columns_statistics;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user