Fix JSON parsing: future-proof data representation in JSON, code cleanup
parent: a0b4a86822
commit: 2a1cdbabec
@@ -283,12 +283,13 @@ int json_key_matches(json_engine_t *je, json_string_t *k);
 
 int json_read_value(json_engine_t *j);
 
 /*
- * json_smart_read_value() reads parses a scalar value and value length from the json engine,
- * and copies them into `value` and `value_length` respectively.
- * It should only be called when the json_engine state is JST_VALUE.
- * If it encounters a non-scalar value (say object or array) before getting to value_len,
- * such value is also read and copied into value.
- */
+  json_smart_read_value() reads a JSON value. Pointer to value is stored in
+  *value and its length in *value_len.
+
+  If the value is not a scalar, it returns pointers to its JSON
+  representation.
+  The function should only be called when je->state==JST_VALUE.
+*/
 enum json_types json_smart_read_value(json_engine_t *je, const char **value, int *value_len);
 
 /*
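Not part of the patch: a minimal sketch of how the new json_smart_read_value() contract is meant to be used. The engine setup mirrors Histogram_json::parse() further down; the sample document and the function wrapper are invented for illustration.

    #include <string.h>
    #include "json_lib.h"   /* assumed location of json_engine_t and friends */

    static void walk_array_example()
    {
      const char *doc= "[\"a-1\", 10, true]";   /* invented sample document */
      json_engine_t je;
      json_scan_start(&je, &my_charset_utf8mb4_bin,
                      (const uchar*)doc, (const uchar*)doc + strlen(doc));

      /* json_read_value() returns non-zero on error */
      if (json_read_value(&je) || je.value_type != JSON_VALUE_ARRAY)
        return;

      while (!json_scan_next(&je) && je.state != JST_ARRAY_END)
      {
        if (je.state == JST_VALUE)
        {
          const char *val;
          int val_len;
          /* After the call, je.value_type tells what was read; for a
             non-scalar, val/val_len cover its whole JSON representation. */
          json_smart_read_value(&je, &val, &val_len);
        }
      }
    }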
File diff suppressed because one or more lines are too long
@@ -37,6 +37,8 @@ analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
 explain extended select * from t1_json where a < 'b-1a';
 analyze select * from t1_json where a > 'zzzzzzzzz';
 
 drop table ten;
 
+# test different valid JSON strings that are invalid histograms.
+UPDATE mysql.column_stats SET histogram='["a-1", "a-2", {"a": "b"}, "a-3"]' WHERE table_name='t1_json';
 FLUSH TABLES;
@@ -45,23 +47,23 @@ explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
 
 
 --source include/have_sequence.inc
-create table users (
+create table t2 (
   city varchar(100)
 );
 set histogram_size=50;
-insert into users select 'Moscow' from seq_1_to_99;
-insert into users select 'Helsinki' from seq_1_to_2;
+insert into t2 select 'Moscow' from seq_1_to_99;
+insert into t2 select 'Helsinki' from seq_1_to_2;
 set histogram_type=json_hb;
-analyze table users persistent for all;
-explain extended select * from users where city = 'Moscow';
-analyze select * from users where city = 'Moscow';
-explain extended select * from users where city = 'Helsinki';
-analyze select * from users where city = 'helsinki';
-explain extended select * from users where city < 'Lagos';
+analyze table t2 persistent for all;
+explain extended select * from t2 where city = 'Moscow';
+analyze select * from t2 where city = 'Moscow';
+explain extended select * from t2 where city = 'Helsinki';
+analyze select * from t2 where city = 'helsinki';
+explain extended select * from t2 where city < 'Lagos';
 
 drop table t1_bin;
 drop table t1_json;
-drop table users;
+drop table t2;
 
 DELETE FROM mysql.column_stats;
@@ -8914,4 +8914,4 @@ ER_PARTITION_CONVERT_SUBPARTITIONED
 ER_PROVIDER_NOT_LOADED
         eng "MariaDB tried to use the %s, but its provider plugin is not loaded"
 ER_JSON_HISTOGRAM_PARSE_FAILED
-        eng "Failed to parse histogram, encountered JSON_TYPE '%d'."
+        eng "Failed to parse histogram: %s at offset %d."
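The reworked message carries a human-readable reason plus the byte offset where parsing stopped, instead of a bare numeric JSON type. The reason strings come from Histogram_json::parse() below; with an invented offset, a failure would read along the lines of: "Failed to parse histogram: A JSON array expected at offset 42."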
@@ -1123,6 +1123,7 @@ public:
   void get_stat_values()
   {
     table_field->read_stats->set_all_nulls();
+    // default: hist_type=NULL means there's no histogram
     table_field->read_stats->histogram_type_on_disk= INVALID_HISTOGRAM;
 
     if (table_field->read_stats->min_value)
@@ -1196,7 +1197,10 @@ public:
         break;
       }
       case COLUMN_STAT_HISTOGRAM:
-        //TODO: if stat_field->length() == 0 then histogram_type_on_disk is set to INVALID_HISTOGRAM
+        /*
+          Do nothing here: we take the histogram length from the 'histogram'
+          column itself
+        */
         break;
     }
   }
@@ -1245,7 +1249,7 @@ public:
     }
     if (!hist->parse(mem_root, table_field,
                      table_field->read_stats->histogram_type_on_disk,
-                     (const uchar*)val.ptr(), val.length()))
+                     val.ptr(), val.length()))
     {
       table_field->read_stats->histogram_= hist;
       return hist;
@@ -1255,19 +1259,19 @@ public:
   }
 };
 
-bool Histogram_binary::parse(MEM_ROOT *mem_root, Field *,
-                             Histogram_type type_arg,
-                             const uchar *ptr_arg, uint size_arg)
+bool Histogram_binary::parse(MEM_ROOT *mem_root, Field*,
+                             Histogram_type type_arg, const char *hist_data,
+                             size_t hist_data_len)
 {
-  // Just copy the data
-  size = (uint8) size_arg;
-  type = type_arg;
-  if ((values = (uchar*)alloc_root(mem_root, size_arg)))
-  {
-    memcpy(values, ptr_arg, size_arg);
-    return false;
-  }
-  return true;
+  /* On-disk and in-memory formats are the same. Just copy the data. */
+  type= type_arg;
+  size= (uint8) hist_data_len; // 'size' holds the size of histogram in bytes
+  if (!(values= (uchar*)alloc_root(mem_root, hist_data_len)))
+    return true;
+
+  memcpy(values, hist_data, hist_data_len);
+  return false;
 }
 
 /*
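Editor's note: the rewrite also makes the server-wide return convention explicit — parse() returns false on success and true on failure (here, only when alloc_root() cannot allocate the copy).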
@@ -1307,39 +1311,81 @@ void Histogram_json::init_for_collection(MEM_ROOT *mem_root,
 */
 
 bool Histogram_json::parse(MEM_ROOT *mem_root, Field *field,
-                           Histogram_type type_arg, const uchar *ptr,
-                           uint size_arg)
+                           Histogram_type type_arg, const char *hist_data,
+                           size_t hist_data_len)
 {
   DBUG_ENTER("Histogram_json::parse");
   DBUG_ASSERT(type_arg == JSON_HB);
-  size = (uint8) size_arg;
-  const char *json = (char *)ptr;
-  int vt;
-  std::vector<std::string> hist_buckets_text;
-  bool result = json_get_array_items(json, json + strlen(json), &vt, hist_buckets_text);
-  if (!result)
-  {
-    my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), vt);
-    DBUG_RETURN(true);
-  }
-  size= hist_buckets_text.size();
+  const char *err;
+  json_engine_t je;
+  json_string_t key_name;
 
-  /*
-    Convert the text based array into a data structure that allows lookups and
-    estimates
-  */
-  for (auto &s : hist_buckets_text)
-  {
-    field->store_text(s.data(), s.size(), &my_charset_bin);
-
-    // Get the value in "truncated key tuple format" here:
-    uchar buf[MAX_KEY_LENGTH];
-    uint len_to_copy= field->key_length();
-    uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
-    histogram_bounds.push_back(std::string((char*)buf, bytes));
-  }
+  json_scan_start(&je, &my_charset_utf8mb4_bin,
+                  (const uchar*)hist_data,
+                  (const uchar*)hist_data+hist_data_len);
+
+  if (json_read_value(&je) || je.value_type != JSON_VALUE_OBJECT)
+  {
+    err= "Root JSON element must be a JSON object";
+    goto error;
+  }
+
+  json_string_set_str(&key_name, (const uchar*)JSON_NAME,
+                      (const uchar*)JSON_NAME + strlen(JSON_NAME));
+  json_string_set_cs(&key_name, system_charset_info);
+
+  if (json_scan_next(&je) || je.state != JST_KEY ||
+      !json_key_matches(&je, &key_name))
+  {
+    err= "The first key in the object must be histogram_hb_v1";
+    goto error;
+  }
+
+  // The value must be a JSON array
+  if (json_read_value(&je) || (je.value_type != JSON_VALUE_ARRAY))
+  {
+    err= "A JSON array expected";
+    goto error;
+  }
+
+  // Read the array
+  while (!json_scan_next(&je))
+  {
+    switch(je.state)
+    {
+      case JST_VALUE:
+      {
+        const char *val;
+        int val_len;
+        json_smart_read_value(&je, &val, &val_len);
+        if (je.value_type != JSON_VALUE_STRING &&
+            je.value_type != JSON_VALUE_NUMBER &&
+            je.value_type != JSON_VALUE_TRUE &&
+            je.value_type != JSON_VALUE_FALSE)
+        {
+          err= "Scalar value expected";
+          goto error;
+        }
+        uchar buf[MAX_KEY_LENGTH];
+        uint len_to_copy= field->key_length();
+        field->store_text(val, val_len, &my_charset_bin);
+        uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
+        histogram_bounds.push_back(std::string((char*)buf, bytes));
+        // TODO: Should we also compare this endpoint with the previous
+        // to verify that the ordering is right?
+        break;
+      }
+      case JST_ARRAY_END:
+        break;
+    }
+  }
+  size= histogram_bounds.size();
   DBUG_RETURN(false);
+
+error:
+  my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), err,
+           je.s.c_str - (const uchar*)hist_data);
+  DBUG_RETURN(true);
 }
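For reference, this is the document shape the new parser accepts. The bucket endpoints are invented, but the root object and the histogram_hb_v1 key with an array of scalars are exactly what the checks above enforce:

    /* Illustrative histogram text accepted by Histogram_json::parse() */
    const char *hist_data= "{\"histogram_hb_v1\": [\"a-1\", \"a-2\", \"a-3\"]}";

A bare array such as '["a-1", "a-2"]' — the pre-patch representation, which the old code handed to json_get_array_items() — now fails the first check with "Root JSON element must be a JSON object".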
@@ -1347,7 +1393,7 @@ static
 void store_key_image_to_rec_no_null(Field *field, uchar *ptr) {
   MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
                                                &field->table->write_set);
-  field->set_key_image(ptr, field->key_length());  
+  field->set_key_image(ptr, field->key_length());
   dbug_tmp_restore_column_map(&field->table->write_set, old_map);
 }
@@ -1506,9 +1552,9 @@ double Histogram_json::point_selectivity(Field *field, key_range *endpoint, dou
 
 /*
   @param field  The table field histogram is for. We don't care about the
-                field's current value, we only need its virtual functions to 
+                field's current value, we only need its virtual functions to
                 perform various operations
-
+
   @param min_endp, max_endp - this specifies the range.
 */
 double Histogram_json::range_selectivity(Field *field, key_range *min_endp,
@@ -1594,7 +1640,7 @@ double Histogram_json::range_selectivity(Field *field, key_range *min_endp,
 
 void Histogram_json::serialize(Field *field)
 {
-  field->store((char*)json_text, strlen((char*)json_text), &my_charset_bin);
+  field->store(json_text.data(), json_text.size(), &my_charset_bin);
 }
@@ -2052,13 +2098,16 @@ public:
   }
 
   void build_json_from_histogram() {
-    Json_writer *writer = new Json_writer();
-    writer->start_array();
+    Json_writer writer;
+    writer.start_object();
+    writer.add_member(Histogram_json::JSON_NAME).start_array();
 
     for(auto& value: bucket_bounds) {
-      writer->add_str(value.c_str());
+      writer.add_str(value.c_str());
     }
-    writer->end_array();
-    Binary_string *json_string = (Binary_string *) writer->output.get_string();
+    writer.end_array();
+    writer.end_object();
+    Binary_string *json_string = (Binary_string *) writer.output.get_string();
     Histogram_json *hist= (Histogram_json*)histogram;
     hist->set_json_text(bucket_bounds.size(), (uchar *) json_string->c_ptr());
   }
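The rewritten builder wraps the bucket array in the versioned object, so collection now emits the same shape Histogram_json::parse() consumes. For two invented buckets the output would be:

    {"histogram_hb_v1": ["a-1", "a-2"]}

Switching from a heap-allocated Json_writer to a stack object also fixes the leak the old `new Json_writer()` with no matching delete implied.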
@@ -2080,42 +2129,6 @@ Histogram_base *create_histogram(Histogram_type hist_type)
 }
 
 
-bool json_get_array_items(const char *json, const char *json_end, int *value_type, std::vector<std::string> &container) {
-  json_engine_t je;
-  int vl;
-  const char *v;
-
-  json_scan_start(&je, &my_charset_utf8mb4_bin, (const uchar *)json, (const uchar *)json_end);
-
-  if (json_read_value(&je) || (*value_type = je.value_type) != JSON_VALUE_ARRAY)
-  {
-    return false;
-  }
-
-  std::string val;
-  while(!json_scan_next(&je))
-  {
-    switch(je.state)
-    {
-      case JST_VALUE:
-        *value_type = json_smart_read_value(&je, &v, &vl);
-        if (je.value_type != JSON_VALUE_STRING &&
-            je.value_type != JSON_VALUE_NUMBER &&
-            je.value_type != JSON_VALUE_TRUE &&
-            je.value_type != JSON_VALUE_FALSE)
-        {
-          return false;
-        }
-        val = std::string(v, vl);
-        container.emplace_back(val);
-        break;
-      case JST_ARRAY_END:
-        break;
-    }
-  }
-  return true;
-}
-
 C_MODE_START
 
 int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
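Editor's note: json_get_array_items() is removed because Histogram_json::parse() now drives the json_engine_t scan itself, validating the object/key/array structure in one pass and reporting the failing byte offset — the intermediate std::vector of bucket texts is no longer needed.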
@@ -152,7 +152,7 @@ class Histogram_base : public Sql_alloc
 {
 public:
   virtual bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
-                     const uchar *ptr, uint size)= 0;
+                     const char *hist_data, size_t hist_data_len)= 0;
   virtual void serialize(Field *to_field)= 0;
 
   virtual Histogram_type get_type()=0;
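The (const char *, size_t) pair matches String::ptr()/String::length(), which is why the cast could be dropped at the call site in the @@ -1245 hunk above:

    // before: hist->parse(..., (const uchar*)val.ptr(), val.length())
    // after:  hist->parse(..., val.ptr(), val.length())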
@@ -187,7 +187,7 @@ class Histogram_binary : public Histogram_base
 {
 public:
   bool parse(MEM_ROOT *mem_root, Field *, Histogram_type type_arg,
-             const uchar *ptr_arg, uint size_arg) override;
+             const char *hist_data, size_t hist_data_len) override;
   void serialize(Field *to_field) override;
 
   Histogram_type get_type() override { return type; }
@@ -350,14 +350,16 @@ private:
   uint8 size; /* Number of elements in the histogram */
 
   /* Collection-time only: collected histogram in the JSON form. */
-  uchar *json_text;
+  std::string json_text;
 
   // Array of histogram bucket endpoints in KeyTupleFormat.
   std::vector<std::string> histogram_bounds;
 
 public:
+  static constexpr const char* JSON_NAME="histogram_hb_v1";
+
   bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
-             const uchar *ptr, uint size) override;
+             const char *hist_data, size_t hist_data_len) override;
 
   void serialize(Field *field) override;
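Editor's note: the JSON_NAME constant ("histogram_hb_v1") doubles as a version tag for the on-disk format — presumably the "future-proof data representation" of the commit title, since a later format revision can introduce a new key while old parsers fail cleanly with a clear error.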
@@ -375,7 +377,8 @@ public:
   void set_json_text(ulonglong sz, uchar *json_text_arg)
   {
     size = (uint8) sz;
-    json_text= json_text_arg;
+    json_text.assign((const char*)json_text_arg,
+                     strlen((const char*)json_text_arg));
   }
 
   uint get_size() override
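Editor's note: with std::string, json_text now owns a copy of the serialized histogram, where the old uchar* merely aliased the Json_writer's output buffer.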
@@ -481,8 +484,9 @@ private:
   ulonglong avg_frequency;
 
 public:
 
+  /* Histogram type as specified in mysql.column_stats.hist_type */
   Histogram_type histogram_type_on_disk;
 
   Histogram_base *histogram_;
 
   uint32 no_values_provided_bitmap()