MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters
Encode such characters in hex.
This commit is contained in:
parent
748b293c14
commit
d8d57d2c27
@ -7896,16 +7896,41 @@ a
|
|||||||
drop table t1;
|
drop table t1;
|
||||||
#
|
#
|
||||||
# Another testcase: use a character that cannot be represented in utf8:
|
# Another testcase: use a character that cannot be represented in utf8:
|
||||||
|
# Also, now it's testcase for:
|
||||||
|
# MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters
|
||||||
#
|
#
|
||||||
create table t1 ( a varchar(100) character set cp1251);
|
create table t1 ( a varchar(100) character set cp1251);
|
||||||
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'98');
|
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'88'), ( _cp1251 x'88');
|
||||||
|
insert into t1 values ( _cp1251 x'98'),( _cp1251 x'98');
|
||||||
analyze table t1 persistent for all;
|
analyze table t1 persistent for all;
|
||||||
Table Op Msg_type Msg_text
|
Table Op Msg_type Msg_text
|
||||||
test.t1 analyze status Operation failed
|
test.t1 analyze status Engine-independent statistics collected
|
||||||
|
test.t1 analyze status OK
|
||||||
select hist_type, histogram
|
select hist_type, histogram
|
||||||
from mysql.column_stats
|
from mysql.column_stats
|
||||||
where db_name=database() and table_name='t1';
|
where db_name=database() and table_name='t1';
|
||||||
hist_type histogram
|
hist_type histogram
|
||||||
|
JSON_HB {
|
||||||
|
"target_histogram_size": 10,
|
||||||
|
"collected_at": "REPLACED",
|
||||||
|
"collected_by": "REPLACED",
|
||||||
|
"histogram_hb": [
|
||||||
|
{
|
||||||
|
"start": "€",
|
||||||
|
"size": 0.6,
|
||||||
|
"ndv": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"start_hex": "98",
|
||||||
|
"end_hex": "98",
|
||||||
|
"size": 0.4,
|
||||||
|
"ndv": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
analyze select * from t1 where a=_cp1251 x'88';
|
||||||
|
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
|
||||||
|
1 SIMPLE t1 ALL NULL NULL NULL NULL 5 5.00 60.00 60.00 Using where
|
||||||
drop table t1;
|
drop table t1;
|
||||||
#
|
#
|
||||||
# ASAN use-after-poison my_strnxfrm_simple_internal / Histogram_json_hb::range_selectivity ...
|
# ASAN use-after-poison my_strnxfrm_simple_internal / Histogram_json_hb::range_selectivity ...
|
||||||
@ -8102,7 +8127,8 @@ set histogram_type= JSON_HB, histogram_size= 1;
|
|||||||
insert into t1 values ('foo'),(unhex('9C'));
|
insert into t1 values ('foo'),(unhex('9C'));
|
||||||
analyze table t1 persistent for all;
|
analyze table t1 persistent for all;
|
||||||
Table Op Msg_type Msg_text
|
Table Op Msg_type Msg_text
|
||||||
test.t1 analyze status Operation failed
|
test.t1 analyze status Engine-independent statistics collected
|
||||||
|
test.t1 analyze status OK
|
||||||
select * from t1;
|
select * from t1;
|
||||||
a
|
a
|
||||||
foo
|
foo
|
||||||
|
@ -227,9 +227,12 @@ drop table t1;
|
|||||||
|
|
||||||
--echo #
|
--echo #
|
||||||
--echo # Another testcase: use a character that cannot be represented in utf8:
|
--echo # Another testcase: use a character that cannot be represented in utf8:
|
||||||
|
--echo # Also, now it's testcase for:
|
||||||
|
--echo # MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters
|
||||||
--echo #
|
--echo #
|
||||||
create table t1 ( a varchar(100) character set cp1251);
|
create table t1 ( a varchar(100) character set cp1251);
|
||||||
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'98');
|
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'88'), ( _cp1251 x'88');
|
||||||
|
insert into t1 values ( _cp1251 x'98'),( _cp1251 x'98');
|
||||||
analyze table t1 persistent for all;
|
analyze table t1 persistent for all;
|
||||||
|
|
||||||
--source include/histogram_replaces.inc
|
--source include/histogram_replaces.inc
|
||||||
@ -237,6 +240,8 @@ select hist_type, histogram
|
|||||||
from mysql.column_stats
|
from mysql.column_stats
|
||||||
where db_name=database() and table_name='t1';
|
where db_name=database() and table_name='t1';
|
||||||
|
|
||||||
|
analyze select * from t1 where a=_cp1251 x'88';
|
||||||
|
|
||||||
drop table t1;
|
drop table t1;
|
||||||
|
|
||||||
--echo #
|
--echo #
|
||||||
|
@ -70,11 +70,11 @@ static bool json_unescape_to_string(const char *val, int val_len, String* out)
|
|||||||
succeeds.
|
succeeds.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static bool json_escape_to_string(const String *str, String* out)
|
static int json_escape_to_string(const String *str, String* out)
|
||||||
{
|
{
|
||||||
// Make sure 'out' has some memory allocated.
|
// Make sure 'out' has some memory allocated.
|
||||||
if (!out->alloced_length() && out->alloc(128))
|
if (!out->alloced_length() && out->alloc(128))
|
||||||
return true;
|
return JSON_ERROR_OUT_OF_SPACE;
|
||||||
|
|
||||||
while (1)
|
while (1)
|
||||||
{
|
{
|
||||||
@ -90,15 +90,15 @@ static bool json_escape_to_string(const String *str, String* out)
|
|||||||
if (res >= 0)
|
if (res >= 0)
|
||||||
{
|
{
|
||||||
out->length(res);
|
out->length(res);
|
||||||
return false; // Ok
|
return 0; // Ok
|
||||||
}
|
}
|
||||||
|
|
||||||
if (res != JSON_ERROR_OUT_OF_SPACE)
|
if (res != JSON_ERROR_OUT_OF_SPACE)
|
||||||
return true; // Some conversion error
|
return res; // Some conversion error
|
||||||
|
|
||||||
// Out of space error. Try with a bigger buffer
|
// Out of space error. Try with a bigger buffer
|
||||||
if (out->alloc(out->alloced_length()*2))
|
if (out->alloc(out->alloced_length()*2))
|
||||||
return true;
|
return JSON_ERROR_OUT_OF_SPACE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -208,8 +208,7 @@ private:
|
|||||||
*/
|
*/
|
||||||
bool finalize_bucket_with_end_value(void *elem)
|
bool finalize_bucket_with_end_value(void *elem)
|
||||||
{
|
{
|
||||||
writer.add_member("end");
|
if (append_column_value(elem, false))
|
||||||
if (append_column_value(elem))
|
|
||||||
return true;
|
return true;
|
||||||
finalize_bucket();
|
finalize_bucket();
|
||||||
return false;
|
return false;
|
||||||
@ -224,19 +223,18 @@ private:
|
|||||||
{
|
{
|
||||||
DBUG_ASSERT(bucket.size == 0);
|
DBUG_ASSERT(bucket.size == 0);
|
||||||
writer.start_object();
|
writer.start_object();
|
||||||
writer.add_member("start");
|
if (append_column_value(elem, true))
|
||||||
if (append_column_value(elem))
|
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
bucket.ndv= 1;
|
bucket.ndv= 1;
|
||||||
bucket.size= cnt;
|
bucket.size= cnt;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Append the passed value into the JSON writer as string value
|
Append the passed value into the JSON writer as string value
|
||||||
*/
|
*/
|
||||||
bool append_column_value(void *elem)
|
bool append_column_value(void *elem, bool is_start)
|
||||||
{
|
{
|
||||||
StringBuffer<MAX_FIELD_WIDTH> val;
|
StringBuffer<MAX_FIELD_WIDTH> val;
|
||||||
|
|
||||||
@ -246,12 +244,21 @@ private:
|
|||||||
|
|
||||||
// Escape the value for JSON
|
// Escape the value for JSON
|
||||||
StringBuffer<MAX_FIELD_WIDTH> escaped_val;
|
StringBuffer<MAX_FIELD_WIDTH> escaped_val;
|
||||||
if (json_escape_to_string(str, &escaped_val))
|
int rc= json_escape_to_string(str, &escaped_val);
|
||||||
return true;
|
if (!rc)
|
||||||
|
{
|
||||||
// Note: The Json_writer does NOT do escapes (perhaps this should change?)
|
writer.add_member(is_start? "start": "end");
|
||||||
writer.add_str(escaped_val.c_ptr_safe());
|
writer.add_str(escaped_val.c_ptr_safe());
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
|
if (rc == JSON_ERROR_ILLEGAL_SYMBOL)
|
||||||
|
{
|
||||||
|
escaped_val.set_hex(val.ptr(), val.length());
|
||||||
|
writer.add_member(is_start? "start_hex": "end_hex");
|
||||||
|
writer.add_str(escaped_val.c_ptr_safe());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -496,6 +503,41 @@ bool read_bucket_endpoint(json_engine_t *je, Field *field, String *out,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool read_hex_bucket_endpoint(json_engine_t *je, Field *field, String *out,
|
||||||
|
const char **err)
|
||||||
|
{
|
||||||
|
if (json_read_value(je))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (je->value_type != JSON_VALUE_STRING || je->value_escaped ||
|
||||||
|
(je->value_len & 1))
|
||||||
|
{
|
||||||
|
*err= "Expected a hex string";
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
StringBuffer<128> buf;
|
||||||
|
|
||||||
|
for (auto pc= je->value; pc < je->value + je->value_len; pc+=2)
|
||||||
|
{
|
||||||
|
int hex_char1= hexchar_to_int(pc[0]);
|
||||||
|
int hex_char2= hexchar_to_int(pc[1]);
|
||||||
|
if (hex_char1 == -1 || hex_char2 == -1)
|
||||||
|
{
|
||||||
|
*err= "Expected a hex string";
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
buf.append((hex_char1 << 4) | hex_char2);
|
||||||
|
}
|
||||||
|
|
||||||
|
field->store_text(buf.ptr(), buf.length(), field->charset());
|
||||||
|
out->alloc(field->pack_length());
|
||||||
|
uint bytes= field->get_key_image((uchar*)out->ptr(),
|
||||||
|
field->key_length(), Field::itRAW);
|
||||||
|
out->length(bytes);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@brief Parse a JSON representation for one histogram bucket
|
@brief Parse a JSON representation for one histogram bucket
|
||||||
|
|
||||||
@ -619,6 +661,30 @@ int Histogram_json_hb::parse_bucket(json_engine_t *je, Field *field,
|
|||||||
}
|
}
|
||||||
save1.restore_to(je);
|
save1.restore_to(je);
|
||||||
|
|
||||||
|
// Less common endpoints:
|
||||||
|
Json_string start_hex_str("start_hex");
|
||||||
|
if (json_key_matches(je, start_hex_str.get()))
|
||||||
|
{
|
||||||
|
if (read_hex_bucket_endpoint(je, field, &value_buf, err))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
have_start= true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
save1.restore_to(je);
|
||||||
|
|
||||||
|
Json_string end_hex_str("end_hex");
|
||||||
|
if (json_key_matches(je, end_hex_str.get()))
|
||||||
|
{
|
||||||
|
if (read_hex_bucket_endpoint(je, field, &value_buf, err))
|
||||||
|
return 1;
|
||||||
|
last_bucket_end_endp.assign(value_buf.ptr(), value_buf.length());
|
||||||
|
*assigned_last_end= true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
save1.restore_to(je);
|
||||||
|
|
||||||
|
|
||||||
// Some unknown member. Skip it.
|
// Some unknown member. Skip it.
|
||||||
if (json_skip_key(je))
|
if (json_skip_key(je))
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -32,12 +32,18 @@
|
|||||||
"histogram_hb": [
|
"histogram_hb": [
|
||||||
{ "start": "value", "size":nnn.nn, "ndv": nnn },
|
{ "start": "value", "size":nnn.nn, "ndv": nnn },
|
||||||
...
|
...
|
||||||
{ "start": "value", "size":nnn.nn, "ndv": nnn, "end": "value"}
|
|
||||||
|
// Optionally, start and/or end can be replaced with _hex variant
|
||||||
|
{ "start_hex": "value", "size":nnn.nn, "ndv":nnn},
|
||||||
|
|
||||||
|
...
|
||||||
|
{ "start": "value", "size":nnn.nn, "ndv": nnn, "end": "value"}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
The histogram is an object with single member named Histogram_json_hb::
|
The histogram is an object with single member named Histogram_json_hb::
|
||||||
JSON_NAME. The value of that member is an array of buckets.
|
JSON_NAME. The value of that member is an array of buckets.
|
||||||
|
|
||||||
Each bucket is an object with these members:
|
Each bucket is an object with these members:
|
||||||
"start" - the first value in the bucket.
|
"start" - the first value in the bucket.
|
||||||
"size" - fraction of table rows that is contained in the bucket.
|
"size" - fraction of table rows that is contained in the bucket.
|
||||||
@ -51,6 +57,11 @@
|
|||||||
|
|
||||||
The exception is single-point buckets where last value is the same as the
|
The exception is single-point buckets where last value is the same as the
|
||||||
first value.
|
first value.
|
||||||
|
|
||||||
|
start/end can be replaced with start_hex/end_hex. In the _hex variant, the
|
||||||
|
constant is encoded in hex. This encoding is used to handle so-called
|
||||||
|
"unassigned characters": some non-UTF8 charsets have byte combinations that
|
||||||
|
are not mapped to any UTF8 character.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class Histogram_json_hb : public Histogram_base
|
class Histogram_json_hb : public Histogram_base
|
||||||
|
Loading…
x
Reference in New Issue
Block a user