MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters
Encode such characters in hex.
This commit is contained in:
parent
748b293c14
commit
d8d57d2c27
@ -7896,16 +7896,41 @@ a
|
||||
drop table t1;
|
||||
#
|
||||
# Another testcase: use a character that cannot be represented in utf8:
|
||||
# Also, now it's testcase for:
|
||||
# MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters
|
||||
#
|
||||
create table t1 ( a varchar(100) character set cp1251);
|
||||
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'98');
|
||||
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'88'), ( _cp1251 x'88');
|
||||
insert into t1 values ( _cp1251 x'98'),( _cp1251 x'98');
|
||||
analyze table t1 persistent for all;
|
||||
Table Op Msg_type Msg_text
|
||||
test.t1 analyze status Operation failed
|
||||
test.t1 analyze status Engine-independent statistics collected
|
||||
test.t1 analyze status OK
|
||||
select hist_type, histogram
|
||||
from mysql.column_stats
|
||||
where db_name=database() and table_name='t1';
|
||||
hist_type histogram
|
||||
JSON_HB {
|
||||
"target_histogram_size": 10,
|
||||
"collected_at": "REPLACED",
|
||||
"collected_by": "REPLACED",
|
||||
"histogram_hb": [
|
||||
{
|
||||
"start": "€",
|
||||
"size": 0.6,
|
||||
"ndv": 1
|
||||
},
|
||||
{
|
||||
"start_hex": "98",
|
||||
"end_hex": "98",
|
||||
"size": 0.4,
|
||||
"ndv": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
analyze select * from t1 where a=_cp1251 x'88';
|
||||
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
|
||||
1 SIMPLE t1 ALL NULL NULL NULL NULL 5 5.00 60.00 60.00 Using where
|
||||
drop table t1;
|
||||
#
|
||||
# ASAN use-after-poison my_strnxfrm_simple_internal / Histogram_json_hb::range_selectivity ...
|
||||
@ -8102,7 +8127,8 @@ set histogram_type= JSON_HB, histogram_size= 1;
|
||||
insert into t1 values ('foo'),(unhex('9C'));
|
||||
analyze table t1 persistent for all;
|
||||
Table Op Msg_type Msg_text
|
||||
test.t1 analyze status Operation failed
|
||||
test.t1 analyze status Engine-independent statistics collected
|
||||
test.t1 analyze status OK
|
||||
select * from t1;
|
||||
a
|
||||
foo
|
||||
|
@ -227,9 +227,12 @@ drop table t1;
|
||||
|
||||
--echo #
|
||||
--echo # Another testcase: use a character that cannot be represented in utf8:
|
||||
--echo # Also, now it's testcase for:
|
||||
--echo # MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters
|
||||
--echo #
|
||||
create table t1 ( a varchar(100) character set cp1251);
|
||||
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'98');
|
||||
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'88'), ( _cp1251 x'88');
|
||||
insert into t1 values ( _cp1251 x'98'),( _cp1251 x'98');
|
||||
analyze table t1 persistent for all;
|
||||
|
||||
--source include/histogram_replaces.inc
|
||||
@ -237,6 +240,8 @@ select hist_type, histogram
|
||||
from mysql.column_stats
|
||||
where db_name=database() and table_name='t1';
|
||||
|
||||
analyze select * from t1 where a=_cp1251 x'88';
|
||||
|
||||
drop table t1;
|
||||
|
||||
--echo #
|
||||
|
@ -70,11 +70,11 @@ static bool json_unescape_to_string(const char *val, int val_len, String* out)
|
||||
succeeds.
|
||||
*/
|
||||
|
||||
static bool json_escape_to_string(const String *str, String* out)
|
||||
static int json_escape_to_string(const String *str, String* out)
|
||||
{
|
||||
// Make sure 'out' has some memory allocated.
|
||||
if (!out->alloced_length() && out->alloc(128))
|
||||
return true;
|
||||
return JSON_ERROR_OUT_OF_SPACE;
|
||||
|
||||
while (1)
|
||||
{
|
||||
@ -90,15 +90,15 @@ static bool json_escape_to_string(const String *str, String* out)
|
||||
if (res >= 0)
|
||||
{
|
||||
out->length(res);
|
||||
return false; // Ok
|
||||
return 0; // Ok
|
||||
}
|
||||
|
||||
if (res != JSON_ERROR_OUT_OF_SPACE)
|
||||
return true; // Some conversion error
|
||||
return res; // Some conversion error
|
||||
|
||||
// Out of space error. Try with a bigger buffer
|
||||
if (out->alloc(out->alloced_length()*2))
|
||||
return true;
|
||||
return JSON_ERROR_OUT_OF_SPACE;
|
||||
}
|
||||
}
|
||||
|
||||
@ -208,8 +208,7 @@ private:
|
||||
*/
|
||||
bool finalize_bucket_with_end_value(void *elem)
|
||||
{
|
||||
writer.add_member("end");
|
||||
if (append_column_value(elem))
|
||||
if (append_column_value(elem, false))
|
||||
return true;
|
||||
finalize_bucket();
|
||||
return false;
|
||||
@ -224,19 +223,18 @@ private:
|
||||
{
|
||||
DBUG_ASSERT(bucket.size == 0);
|
||||
writer.start_object();
|
||||
writer.add_member("start");
|
||||
if (append_column_value(elem))
|
||||
if (append_column_value(elem, true))
|
||||
return true;
|
||||
|
||||
bucket.ndv= 1;
|
||||
bucket.size= cnt;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Append the passed value into the JSON writer as string value
|
||||
*/
|
||||
bool append_column_value(void *elem)
|
||||
bool append_column_value(void *elem, bool is_start)
|
||||
{
|
||||
StringBuffer<MAX_FIELD_WIDTH> val;
|
||||
|
||||
@ -246,12 +244,21 @@ private:
|
||||
|
||||
// Escape the value for JSON
|
||||
StringBuffer<MAX_FIELD_WIDTH> escaped_val;
|
||||
if (json_escape_to_string(str, &escaped_val))
|
||||
return true;
|
||||
|
||||
// Note: The Json_writer does NOT do escapes (perhaps this should change?)
|
||||
writer.add_str(escaped_val.c_ptr_safe());
|
||||
return false;
|
||||
int rc= json_escape_to_string(str, &escaped_val);
|
||||
if (!rc)
|
||||
{
|
||||
writer.add_member(is_start? "start": "end");
|
||||
writer.add_str(escaped_val.c_ptr_safe());
|
||||
return false;
|
||||
}
|
||||
if (rc == JSON_ERROR_ILLEGAL_SYMBOL)
|
||||
{
|
||||
escaped_val.set_hex(val.ptr(), val.length());
|
||||
writer.add_member(is_start? "start_hex": "end_hex");
|
||||
writer.add_str(escaped_val.c_ptr_safe());
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -496,6 +503,41 @@ bool read_bucket_endpoint(json_engine_t *je, Field *field, String *out,
|
||||
}
|
||||
|
||||
|
||||
/*
  @brief
    Read a hex-encoded bucket endpoint (the value of a "start_hex" or
    "end_hex" member) and convert it into a key image for the given field.

  @param je     JSON parser state. The next value is read from it with
                json_read_value().
  @param field  Field used to turn the decoded bytes into a key image.
                The decoded bytes are stored into it as a side effect.
  @param out    OUT  Receives the key image produced by
                     field->get_key_image().
  @param err    OUT  Set to an error text when the value is not a
                     non-escaped JSON string holding an even number of
                     hex digits. Left untouched on other failures.

  @return
    false  OK
    true   Error: JSON read failure, malformed hex string, or failure to
           allocate the output buffer.
*/
bool read_hex_bucket_endpoint(json_engine_t *je, Field *field, String *out,
                              const char **err)
{
  if (json_read_value(je))
    return true;

  // Two hex digits encode one byte, so an odd length can never be valid.
  if (je->value_type != JSON_VALUE_STRING || je->value_escaped ||
      (je->value_len & 1))
  {
    *err= "Expected a hex string";
    return true;
  }
  StringBuffer<128> buf;

  for (auto pc= je->value; pc < je->value + je->value_len; pc+=2)
  {
    int hex_char1= hexchar_to_int(pc[0]);
    int hex_char2= hexchar_to_int(pc[1]);
    // hexchar_to_int() reports a non-hex character as -1
    if (hex_char1 == -1 || hex_char2 == -1)
    {
      *err= "Expected a hex string";
      return true;
    }
    buf.append((hex_char1 << 4) | hex_char2);
  }

  field->store_text(buf.ptr(), buf.length(), field->charset());

  // alloc() returns true on failure; do not write through out->ptr() then.
  if (out->alloc(field->pack_length()))
    return true;

  uint bytes= field->get_key_image((uchar*)out->ptr(),
                                   field->key_length(), Field::itRAW);
  out->length(bytes);
  return false;
}
|
||||
|
||||
|
||||
/*
|
||||
@brief Parse a JSON representation for one histogram bucket
|
||||
|
||||
@ -619,6 +661,30 @@ int Histogram_json_hb::parse_bucket(json_engine_t *je, Field *field,
|
||||
}
|
||||
save1.restore_to(je);
|
||||
|
||||
// Less common endpoints:
|
||||
Json_string start_hex_str("start_hex");
|
||||
if (json_key_matches(je, start_hex_str.get()))
|
||||
{
|
||||
if (read_hex_bucket_endpoint(je, field, &value_buf, err))
|
||||
return 1;
|
||||
|
||||
have_start= true;
|
||||
continue;
|
||||
}
|
||||
save1.restore_to(je);
|
||||
|
||||
Json_string end_hex_str("end_hex");
|
||||
if (json_key_matches(je, end_hex_str.get()))
|
||||
{
|
||||
if (read_hex_bucket_endpoint(je, field, &value_buf, err))
|
||||
return 1;
|
||||
last_bucket_end_endp.assign(value_buf.ptr(), value_buf.length());
|
||||
*assigned_last_end= true;
|
||||
continue;
|
||||
}
|
||||
save1.restore_to(je);
|
||||
|
||||
|
||||
// Some unknown member. Skip it.
|
||||
if (json_skip_key(je))
|
||||
return 1;
|
||||
|
@ -32,12 +32,18 @@
|
||||
"histogram_hb": [
|
||||
{ "start": "value", "size":nnn.nn, "ndv": nnn },
|
||||
...
|
||||
{ "start": "value", "size":nnn.nn, "ndv": nnn, "end": "value"}
|
||||
|
||||
// Optionally, start and/or end can be replaced with _hex variant
|
||||
{ "start_hex": "value", "size":nnn.nn, "ndv":nnn},
|
||||
|
||||
...
|
||||
{ "start": "value", "size":nnn.nn, "ndv": nnn, "end": "value"},
|
||||
]
|
||||
}
|
||||
|
||||
The histogram is an object with single member named Histogram_json_hb::
|
||||
JSON_NAME. The value of that member is an array of buckets.
|
||||
|
||||
Each bucket is an object with these members:
|
||||
"start" - the first value in the bucket.
|
||||
"size" - fraction of table rows that is contained in the bucket.
|
||||
@ -51,6 +57,11 @@
|
||||
|
||||
The exception is single-point buckets where last value is the same as the
|
||||
first value.
|
||||
|
||||
start/end can be replaced with start_hex/end_hex. In _hex variant, the
|
||||
constant is encoded in hex. This encoding is used to handle so called
|
||||
"unassigned characters": some non-UTF8 charsets have byte combinations that
|
||||
are not mapped to any UTF8 character.
|
||||
*/
|
||||
|
||||
class Histogram_json_hb : public Histogram_base
|
||||
|
Loading…
x
Reference in New Issue
Block a user