Code cleanup part#2: do not copy key values in xxx_selectivity() functions

This commit is contained in:
Sergei Petrunia 2021-08-29 19:32:25 +03:00
parent 2a1cdbabec
commit fcf58a5e0f
3 changed files with 84 additions and 121 deletions

View File

@ -2444,15 +2444,15 @@ test t1_json a a-0 a-9 0.0000 3.0000 1.0000 100 JSON_HB {
}
explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 60.87 Using where
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 59.87 Using where
Warnings:
Note 1003 select `test`.`t1_json`.`a` AS `a` from `test`.`t1_json` where `test`.`t1_json`.`a` between 'a-3a' and 'zzzzzzzzz'
analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 60.87 60.00 Using where
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 59.87 60.00 Using where
explain extended select * from t1_json where a < 'b-1a';
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 100.00 Using where
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 99.00 Using where
Warnings:
Note 1003 select `test`.`t1_json`.`a` AS `a` from `test`.`t1_json` where `test`.`t1_json`.`a` < 'b-1a'
analyze select * from t1_json where a > 'zzzzzzzzz';
@ -2476,12 +2476,12 @@ test.t2 analyze status Engine-independent statistics collected
test.t2 analyze status OK
explain extended select * from t2 where city = 'Moscow';
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t2 ALL NULL NULL NULL NULL 101 98.04 Using where
1 SIMPLE t2 ALL NULL NULL NULL NULL 101 96.08 Using where
Warnings:
Note 1003 select `test`.`t2`.`city` AS `city` from `test`.`t2` where `test`.`t2`.`city` = 'Moscow'
analyze select * from t2 where city = 'Moscow';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t2 ALL NULL NULL NULL NULL 101 101.00 98.04 98.02 Using where
1 SIMPLE t2 ALL NULL NULL NULL NULL 101 101.00 96.08 98.02 Using where
explain extended select * from t2 where city = 'Helsinki';
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t2 ALL NULL NULL NULL NULL 101 2.00 Using where

View File

@ -63,16 +63,7 @@
equal to "never".
*/
/*
* json_get_array_items expects a JSON array as argument,
* and pushes the elements of the array into the `container` vector.
* It only works if all the elements in the original JSON array
* are scalar values (i.e., strings, numbers, true or false),
* else, the JSON type encountered is stored in value_type and the function returns false.
*/
bool json_get_array_items(const char *json, const char *json_end, int *value_type, std::vector<std::string> &container);
Histogram_base *create_histogram(Histogram_type hist_type);
Histogram_base *create_histogram(MEM_ROOT *mem_root, Histogram_type hist_type);
/* Currently there are only 3 persistent statistical tables */
static const uint STATISTICS_TABLES= 3;
@ -1235,18 +1226,9 @@ public:
Field *stat_field= stat_table->field[fldno];
table_field->read_stats->set_not_null(fldno);
stat_field->val_str(&val);
switch (table_field->read_stats->histogram_type_on_disk)
{
case SINGLE_PREC_HB:
case DOUBLE_PREC_HB:
hist = new (mem_root) Histogram_binary();
break;
case JSON_HB:
hist = new (mem_root) Histogram_json();
break;
default:
hist= create_histogram(mem_root, table_field->read_stats->histogram_type_on_disk);
if (!hist)
return NULL;
}
if (!hist->parse(mem_root, table_field,
table_field->read_stats->histogram_type_on_disk,
val.ptr(), val.length()))
@ -1415,7 +1397,6 @@ double pos_in_interval_through_val_real(Field *field,
uchar *max_val,
uchar *midpoint_val)
{
// For each passed value: unpack it into Field's current value. Then, we can
// get the value as double.
@ -1526,114 +1507,105 @@ double Histogram_json::point_selectivity(Field *field, key_range *endpoint, doub
const uchar *min_key = endpoint->key;
if (field->real_maybe_null())
min_key++;
uint min_idx= find_bucket(field, min_key);
uint max_idx= min_idx;
uint min_idx= find_bucket(field, min_key, false);
uint max_idx= find_bucket(field, min_key, true);
#if 0
// find how many buckets this value occupies
while ((max_idx + 1 < get_width() ) &&
(field->key_cmp((uchar *)histogram_bounds[max_idx + 1].data(), min_key) == 0)) {
max_idx++;
}
#endif
if (max_idx > min_idx)
{
// value spans multiple buckets
double bucket_sel= 1.0/(get_width() + 1);
sel= bucket_sel * (max_idx - min_idx + 1);
} else
}
else
{
// the value fits within a single bucket
sel = MY_MIN(avg_sel, (1.0/get_width()));
sel = MY_MIN(avg_sel, 1.0/get_width());
}
return sel;
}
/*
  Estimate the fraction of the value domain covered by a range, using the
  histogram bounds.

  @param field     The table field histogram is for. We don't care about the
                   field's current value, we only need its virtual functions to
                   perform various operations
  @param min_endp  Left endpoint, or NULL if there is none
  @param max_endp  Right endpoint, or NULL if there is none

  @return  (position of the right endpoint) - (position of the left endpoint),
           where a position is bucket_index*width plus the relative position
           of the endpoint inside that bucket, scaled by width.
           A missing left endpoint, or a left endpoint that is SQL NULL,
           counts as position 0.0; a missing right endpoint counts as 1.0.
*/
double Histogram_json::range_selectivity(Field *field, key_range *min_endp,
                                         key_range *max_endp)
{
  // A bucket is delimited by two adjacent bounds. With fewer than two bounds
  // there is no bucket to interpolate in (and histogram_bounds[idx+1] below
  // would be out of range), so report "no filtering".
  if (histogram_bounds.size() < 2)
    return 1.0;

  double min, max;
  // NOTE(review): N bounds delimit N-1 buckets, but the width is 1/N, so the
  // right edge of the last bucket maps to (N-1)/N rather than 1.0. Kept as-is
  // because recorded test results depend on it -- confirm whether intended.
  double width= 1.0 / histogram_bounds.size();

  if (min_endp && !(field->null_ptr && min_endp->key[0]))
  {
    bool exclusive_endp= (min_endp->flag == HA_READ_AFTER_KEY);
    const uchar *min_key= min_endp->key;
    // Nullable fields carry a leading NULL-indicator byte in the key image.
    if (field->real_maybe_null())
      min_key++;

    // Find the leftmost bucket that contains the lookup value.
    // (If the lookup value is to the left of all buckets, find bucket #0)
    int idx= find_bucket(field, min_key, exclusive_endp);
    double min_sel;
    {
      // Reference the stored bounds directly; do not copy the key values.
      std::string &left= histogram_bounds[idx];
      std::string &right= histogram_bounds[idx+1];
      if (field->pos_through_val_str())
        min_sel= pos_in_interval_through_strxfrm(
            field, (uchar*) left.data(), (uchar*) right.data(),
            (uchar*) min_key);
      else
        min_sel= pos_in_interval_through_val_real(
            field, (uchar*) left.data(), (uchar*) right.data(),
            (uchar*) min_key);
    }
    min= idx*width + min_sel*width;
  }
  else
    min= 0.0;

  if (max_endp)
  {
    // The right endpoint cannot be NULL
    DBUG_ASSERT(!(field->null_ptr && max_endp->key[0]));
    bool inclusive_endp= (max_endp->flag == HA_READ_AFTER_KEY);
    const uchar *max_key= max_endp->key;
    if (field->real_maybe_null())
      max_key++;

    int idx= find_bucket(field, max_key, inclusive_endp);
    double max_sel;
    {
      std::string &left= histogram_bounds[idx];
      std::string &right= histogram_bounds[idx+1];
      if (field->pos_through_val_str())
        max_sel= pos_in_interval_through_strxfrm(
            field, (uchar*) left.data(), (uchar*) right.data(),
            (uchar*) max_key);
      else
        max_sel= pos_in_interval_through_val_real(
            field, (uchar*) left.data(), (uchar*) right.data(),
            (uchar*) max_key);
    }
    max= idx*width + max_sel*width;
  }
  else
    max= 1.0;

  double sel= max - min;
  return sel;
}
@ -1644,34 +1616,33 @@ void Histogram_json::serialize(Field *field)
}
int Histogram_json::find_bucket(Field *field, const uchar *endpoint)
/*
Find the histogram bucket that contains the value.
@param equal_is_less Controls what to do if a histogram bound is equal to the
lookup_val.
*/
int Histogram_json::find_bucket(Field *field, const uchar *lookup_val,
bool equal_is_less)
{
int low = 0;
int high = (int)histogram_bounds.size()-1;
int mid;
int min_bucket_index = -1;
std::string mid_val; // GSOC-todo: don't copy strings
int low= 0;
int high= histogram_bounds.size() - 1;
int middle;
while(low <= high) {
// c++ gives us the floor of integer divisions by default, below we get the ceiling (round-up).
// it works but it doesn't feel so readable, maybe we could make improvements?
int sum = (low+high);
mid = sum/2 + (sum % 2 != 0);
mid_val = histogram_bounds[mid];
int res = field->key_cmp((uchar*) mid_val.data(), endpoint);
if (res < 0) {
low = mid + 1;
min_bucket_index = mid;
} else if (res >= 0) {
high = mid - 1;
}
while (low + 1 < high)
{
middle= (low + high) / 2;
int res= field->key_cmp((uchar*)histogram_bounds[middle].data(), lookup_val);
if (!res)
res= equal_is_less? -1: 1;
if (res < 0)
low= middle;
else //res > 0
high= middle;
}
if (min_bucket_index == -1)
min_bucket_index = high;
return min_bucket_index;
return low;
}
/*
@ -2114,14 +2085,14 @@ public:
};
Histogram_base *create_histogram(Histogram_type hist_type)
Histogram_base *create_histogram(MEM_ROOT *mem_root, Histogram_type hist_type)
{
switch (hist_type) {
case SINGLE_PREC_HB:
case DOUBLE_PREC_HB:
return new Histogram_binary();
return new (mem_root) Histogram_binary();
case JSON_HB:
return new Histogram_json();
return new (mem_root) Histogram_json();
default:
DBUG_ASSERT(0);
}
@ -2963,7 +2934,7 @@ void Column_statistics_collected::finish(MEM_ROOT *mem_root, ha_rows rows, doubl
if (hist_size != 0 && hist_type != INVALID_HISTOGRAM)
{
have_histogram= true;
histogram_= create_histogram(hist_type);
histogram_= create_histogram(mem_root, hist_type);
histogram_->init_for_collection(mem_root, hist_type, hist_size);
}
@ -4530,9 +4501,10 @@ double Histogram_binary::point_selectivity(Field *field, key_range *min_endp, do
return sel;
}
double Histogram_binary::range_selectivity(Field *field,
key_range *min_endp,
key_range *max_endp)
key_range *min_endp,
key_range *max_endp)
{
double sel, min_mp_pos, max_mp_pos;
Column_statistics *col_stats= field->read_stats;
@ -4561,13 +4533,6 @@ double Histogram_binary::range_selectivity(Field *field,
uint max= find_bucket(max_mp_pos, FALSE);
sel= bucket_sel * (max - min + 1);
/*fprintf(stderr, "bucket_sel =%g\n", bucket_sel);
fprintf(stderr, "min pos_in_interval =%g\n", min_mp_pos);
fprintf(stderr, "max pos_in_interval =%g\n", max_mp_pos);
fprintf(stderr, "min =%d\n", min);
fprintf(stderr, "max =%d\n", max);*/
/*fprintf(stderr, "final sel =%g\n", sel);
fprintf(stderr, "Histogram_binary::range_selectivity ends\n");*/
return sel;
}

View File

@ -400,10 +400,8 @@ public:
double avg_selection) override;
double range_selectivity(Field *field, key_range *min_endp,
key_range *max_endp) override;
/*
* Returns the index of the biggest histogram value that is smaller than endpoint
*/
int find_bucket(Field *field, const uchar *endpoint);
private:
int find_bucket(Field *field, const uchar *lookup_val, bool equal_is_less);
};
class Columns_statistics;