MDEV-22043 Special character leads to assertion in my_wc_to_printable_generic on 10.5.2 (debug)

The code did not take into account that:
- U+005C (backslash) can occupy more than mbminlen bytes (e.g. in sjis)
- Some character sets do not have a code for U+005C (e.g. swe7)

Adding a new function pointer wc_to_printable to MY_CHARSET_HANDLER to
make it easier to cover all special cases.
This commit is contained in:
Alexander Barkov 2020-05-07 19:20:17 +04:00
parent c675886dcd
commit cfe5ee90c8
25 changed files with 242 additions and 18 deletions

View File

@ -541,6 +541,7 @@ struct my_charset_handler_st
my_ci_native_to_mb() rather than my_ci_wc_mb().
*/
my_charset_conv_wc_mb native_to_mb;
my_charset_conv_wc_mb wc_to_printable;
};
extern MY_CHARSET_HANDLER my_charset_8bit_handler;
@ -660,6 +661,11 @@ struct charset_info_st
return (cset->native_to_mb)(this, wc, s, e);
}
/*
  Forward to the wc_to_printable function of this character set's handler,
  passing this character set together with the character "wc" and the
  destination range [s, e). The handler's return value is returned as is.
*/
int wc_to_printable(my_wc_t wc, uchar *s, uchar *e) const
{
return (cset->wc_to_printable)(this, wc, s, e);
}
int ctype(int *to, const uchar *s, const uchar *e) const
{
return (cset->ctype)(this, to, s, e);
@ -1249,9 +1255,6 @@ int my_wc_mb_bin(CHARSET_INFO *cs,my_wc_t wc, uchar *s, uchar *e);
int my_mb_ctype_8bit(CHARSET_INFO *,int *, const uchar *,const uchar *);
int my_mb_ctype_mb(CHARSET_INFO *,int *, const uchar *,const uchar *);
int my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc,
uchar *s, uchar *e);
size_t my_scan_8bit(CHARSET_INFO *cs, const char *b, const char *e, int sq);
size_t my_snprintf_8bit(CHARSET_INFO *, char *to, size_t n,

View File

@ -21,3 +21,16 @@ SET NAMES utf8;
SELECT @a:=CONVERT('aя' USING filename) AS `@a`, BINARY @a, REVERSE(@a), HEX(@a), HEX(REVERSE(@a));
@a BINARY @a REVERSE(@a) HEX(@a) HEX(REVERSE(@a))
aя a@r1 яa 61407231 40723161
#
# Start of 10.5 tests
#
#
# MDEV-22043 Special character leads to assertion in my_wc_to_printable_generic on 10.5.2 (debug)
#
SET NAMES filename;
EXECUTE IMMEDIATE _latin1 0x01;
ERROR 42000: You@0020have@0020an@0020error@0020in@0020your@0020SQL@0020syntax@003b@0020check@0020the@0020manual@0020that@0020corresponds@0020to@0020your@0020MariaDB@0020server@0020version@0020for@0020the@0020right@0020syntax@0020to@0020use@0020near@0020@0027@005c0001@0027@0020at@0020line@00201
SET NAMES utf8;
#
# End of 10.5 tests
#

View File

@ -27,3 +27,22 @@ select convert(convert(',' using filename) using binary);
--echo #
SET NAMES utf8;
SELECT @a:=CONVERT('aя' USING filename) AS `@a`, BINARY @a, REVERSE(@a), HEX(@a), HEX(REVERSE(@a));
--echo #
--echo # Start of 10.5 tests
--echo #
--echo #
--echo # MDEV-22043 Special character leads to assertion in my_wc_to_printable_generic on 10.5.2 (debug)
--echo #
SET NAMES filename;
--error ER_PARSE_ERROR
EXECUTE IMMEDIATE _latin1 0x01;
SET NAMES utf8;
--echo #
--echo # End of 10.5 tests
--echo #

View File

@ -19296,3 +19296,25 @@ SET DEFAULT_STORAGE_ENGINE=Default;
#
# End of 10.2 tests
#
#
# Start of 10.5 tests
#
#
# MDEV-22043 Special character leads to assertion in my_wc_to_printable_generic on 10.5.2 (debug)
#
SET NAMES sjis;
SET @@CHARACTER_SET_CLIENT='cp1257';
(a(b 'т'));
ERROR 42000: You have an error in your SQL syntax; check the manual that corresponds to your MariaDB server version for the right syntax to use near 'a(b '<27>_0143<34>_201A'))' at line 1
SET NAMES sjis;
SET @@CHARACTER_SET_CLIENT='cp1257';
'т';
ERROR 42000: You have an error in your SQL syntax; check the manual that corresponds to your MariaDB server version for the right syntax to use near ''<27>_0143<34>_201A'' at line 1
SET NAMES sjis;
SET @@CHARACTER_SET_CLIENT='cp1257';
EXECUTE IMMEDIATE _cp1257 0xD182;
ERROR 42000: You have an error in your SQL syntax; check the manual that corresponds to your MariaDB server version for the right syntax to use near '<27>_0143<34>_201A' at line 1
SET NAMES sjis;
#
# End of 10.5 tests
#

View File

@ -260,3 +260,33 @@ let $coll_pad='sjis_bin';
--echo #
--echo # End of 10.2 tests
--echo #
--echo #
--echo # Start of 10.5 tests
--echo #
--echo #
--echo # MDEV-22043 Special character leads to assertion in my_wc_to_printable_generic on 10.5.2 (debug)
--echo #
SET NAMES sjis;
SET @@CHARACTER_SET_CLIENT='cp1257';
--error ER_PARSE_ERROR
(a(b 'т'));
SET NAMES sjis;
SET @@CHARACTER_SET_CLIENT='cp1257';
--error ER_PARSE_ERROR
'т';
SET NAMES sjis;
SET @@CHARACTER_SET_CLIENT='cp1257';
--error ER_PARSE_ERROR
EXECUTE IMMEDIATE _cp1257 0xD182;
SET NAMES sjis;
--echo #
--echo # End of 10.5 tests
--echo #

View File

@ -3635,3 +3635,21 @@ SET DEFAULT_STORAGE_ENGINE=Default;
#
# End of 10.2 tests
#
#
# Start of 10.5 tests
#
#
# MDEV-22043 Special character leads to assertion in my_wc_to_printable_generic on 10.5.2 (debug)
#
SET NAMES swe7;
SELECT `Ｔ`;
ERROR HY000: Invalid swe7 character string: '.xEF.xBC.xB4'
SET NAMES swe7;
SELECT `龔`;
ERROR HY000: Invalid swe7 character string: '.xE9.xBE.x94'
SET NAMES swe7;
EXECUTE IMMEDIATE _swe7 0x01;
ERROR 42000: You have an error in your SQL syntax; check the manual that corresponds to your MariaDB server version for the right syntax to use near '.0001' at line 1
#
# End of 10.5 tests
#

View File

@ -38,3 +38,29 @@ let $coll_pad='swe7_bin';
--echo #
--echo # End of 10.2 tests
--echo #
--echo #
--echo # Start of 10.5 tests
--echo #
--echo #
--echo # MDEV-22043 Special character leads to assertion in my_wc_to_printable_generic on 10.5.2 (debug)
--echo #
SET NAMES swe7;
--error ER_INVALID_CHARACTER_STRING
SELECT `Ｔ`;
SET NAMES swe7;
--error ER_INVALID_CHARACTER_STRING
SELECT `龔`;
SET NAMES swe7;
--error ER_PARSE_ERROR
EXECUTE IMMEDIATE _swe7 0x01;
--echo #
--echo # End of 10.5 tests
--echo #

View File

@ -850,7 +850,7 @@ extern "C" int my_wc_mb_utf8_null_terminated(CHARSET_INFO *cs,
my_wc_t wc, uchar *r, uchar *e)
{
return wc == '\0' ?
my_wc_to_printable_generic(cs, wc, r, e) :
cs->wc_to_printable(wc, r, e) :
my_charset_utf8mb3_handler.wc_mb(cs, wc, r, e);
}
@ -951,7 +951,7 @@ size_t convert_error_message(char *to, size_t to_length, CHARSET_INFO *to_cs,
to_cs= system_charset_info;
uint32 cnv_length= my_convert_using_func(to, to_length,
to_cs,
my_wc_to_printable_generic,
to_cs->cset->wc_to_printable,
from, from_length,
from_cs, from_cs->cset->mb_wc,
errors);

View File

@ -791,7 +791,7 @@ bool Binary_string::copy_printable_hhhh(CHARSET_INFO *to_cs,
if (bytes_needed >= UINT_MAX32 || alloc((size_t) bytes_needed))
return true;
str_length= my_convert_using_func(Ptr, Alloced_length, to_cs,
my_wc_to_printable_generic,
to_cs->cset->wc_to_printable,
from, from_length,
from_cs,
from_cs->cset->mb_wc,

View File

@ -6800,6 +6800,7 @@ static MY_CHARSET_HANDLER my_charset_big5_handler=
my_well_formed_char_length_big5,
my_copy_fix_mb,
my_native_to_mb_big5,
my_wc_to_printable_generic
};
struct charset_info_st my_charset_big5_chinese_ci=

View File

@ -560,6 +560,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_8bit,
my_copy_8bit,
my_wc_mb_bin,
my_wc_to_printable_generic
};

View File

@ -34756,6 +34756,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_cp932,
my_copy_fix_mb,
my_native_to_mb_cp932,
my_wc_to_printable_generic
};

View File

@ -10046,6 +10046,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_euckr,
my_copy_fix_mb,
my_native_to_mb_euckr,
my_wc_to_printable_generic
};

View File

@ -67584,6 +67584,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_eucjpms,
my_copy_fix_mb,
my_native_to_mb_eucjpms,
my_wc_to_printable_generic
};

View File

@ -6451,6 +6451,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_gb2312,
my_copy_fix_mb,
my_native_to_mb_gb2312,
my_wc_to_printable_generic
};

View File

@ -10733,6 +10733,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_gbk,
my_copy_fix_mb,
my_native_to_mb_gbk,
my_wc_to_printable_generic
};

View File

@ -423,6 +423,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_8bit,
my_copy_8bit,
my_wc_mb_bin, /* native_to_mb */
my_wc_to_printable_generic
};

View File

@ -2088,6 +2088,7 @@ MY_CHARSET_HANDLER my_charset_8bit_handler=
my_well_formed_char_length_8bit,
my_copy_8bit,
my_wc_mb_bin, /* native_to_mb */
my_wc_to_printable_8bit
};
MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler =

View File

@ -34004,6 +34004,15 @@ size_t my_numcells_sjis(CHARSET_INFO *cs __attribute__((unused)),
}
/*
  wc_to_printable implementation for sjis.
  The escape character is a backslash that is expected to take 2 bytes
  in this character set, while every hex digit is expected to take 1 byte.
*/
static int
my_wc_to_printable_sjis(CHARSET_INFO *cs, my_wc_t wc,
                        uchar *str, uchar *end)
{
  const uint escape_char= '\\';
  const uint escape_bytes= 2;
  const uint digit_bytes= 1;

  return my_wc_to_printable_ex(cs, wc, str, end,
                               escape_char, escape_bytes, digit_bytes);
}
/*
sjis_chinese_ci and sjis_bin sort character blocks in this order:
1. [00..7F] - 7BIT characters (ASCII)
@ -34135,6 +34144,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_sjis,
my_copy_fix_mb,
my_native_to_mb_sjis,
my_wc_to_printable_sjis
};

View File

@ -905,6 +905,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_8bit,
my_copy_8bit,
my_wc_mb_bin, /* native_to_mb */
my_wc_to_printable_generic
};

View File

@ -1591,6 +1591,7 @@ MY_CHARSET_HANDLER my_charset_utf16_handler=
my_well_formed_char_length_utf16,
my_copy_fix_mb2_or_mb4,
my_uni_utf16,
my_wc_to_printable_generic
};
@ -1931,6 +1932,7 @@ static MY_CHARSET_HANDLER my_charset_utf16le_handler=
my_well_formed_char_length_utf16,
my_copy_fix_mb2_or_mb4,
my_uni_utf16le,
my_wc_to_printable_generic
};
@ -2753,6 +2755,7 @@ MY_CHARSET_HANDLER my_charset_utf32_handler=
my_well_formed_char_length_utf32,
my_copy_fix_mb2_or_mb4,
my_uni_utf32,
my_wc_to_printable_generic
};
@ -3343,6 +3346,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
my_well_formed_char_length_ucs2,
my_copy_fix_mb2_or_mb4,
my_uni_ucs2,
my_wc_to_printable_generic
};

View File

@ -67328,6 +67328,7 @@ static MY_CHARSET_HANDLER my_charset_handler=
my_well_formed_char_length_ujis,
my_copy_fix_mb,
my_native_to_mb_ujis,
my_wc_to_printable_generic
};

View File

@ -5466,6 +5466,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb3_handler=
my_well_formed_char_length_utf8mb3,
my_copy_fix_mb,
my_uni_utf8mb3,
my_wc_to_printable_generic
};
@ -7030,6 +7031,16 @@ my_charlen_filename(CHARSET_INFO *cs, const uchar *str, const uchar *end)
}
/*
  wc_to_printable implementation for the "filename" character set.
  The escape character is a backslash that is expected to take 5 bytes
  in this character set, while every hex digit is expected to take 1 byte.
*/
static int
my_wc_to_printable_filename(CHARSET_INFO *cs, my_wc_t wc,
                            uchar *str, uchar *end)
{
  const uint escape_char= '\\';
  const uint escape_bytes= 5;
  const uint digit_bytes= 1;

  return my_wc_to_printable_ex(cs, wc, str, end,
                               escape_char, escape_bytes, digit_bytes);
}
#define MY_FUNCTION_NAME(x) my_ ## x ## _filename
#define CHARLEN(cs,str,end) my_charlen_filename(cs,str,end)
#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
@ -7102,6 +7113,7 @@ static MY_CHARSET_HANDLER my_charset_filename_handler=
my_well_formed_char_length_filename,
my_copy_fix_mb,
my_wc_mb_filename,
my_wc_to_printable_filename
};
@ -7792,6 +7804,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
my_well_formed_char_length_utf8mb4,
my_copy_fix_mb,
my_wc_mb_utf8mb4,
my_wc_to_printable_generic
};

View File

@ -1020,7 +1020,7 @@ my_is_printable(my_wc_t wc)
}
static uint to_printable_8bit(uchar *dst, my_wc_t wc)
static uint to_printable_8bit(uchar *dst, my_wc_t wc, uint bs)
{
/*
This function is used only in context of error messages for now.
@ -1028,7 +1028,7 @@ static uint to_printable_8bit(uchar *dst, my_wc_t wc)
when a message is put into diagnostics area.
*/
DBUG_ASSERT(wc < 0x10000);
*dst++= '\\';
*dst++= (char) bs;
*dst++= _dig_vec_upper[(wc >> 12) & 0x0F];
*dst++= _dig_vec_upper[(wc >> 8) & 0x0F];
*dst++= _dig_vec_upper[(wc >> 4) & 0x0F];
@ -1037,18 +1037,25 @@ static uint to_printable_8bit(uchar *dst, my_wc_t wc)
}
/*
  Byte length of one escaped character: the escape character ("bslen" bytes)
  followed by MY_CS_PRINTABLE_CHAR_LENGTH - 1 digits of "diglen" bytes each.
*/
static uint my_printable_length(uint bslen, uint diglen)
{
  uint digit_count= MY_CS_PRINTABLE_CHAR_LENGTH - 1;
  return bslen + digit_count * diglen;
}
/**
Encode an Unicode character "wc" into a printable string.
This function is suitable for any character set, including
ASCII-incompatible multi-byte character sets, e.g. ucs2, utf16, utf32.
*/
int
my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc,
uchar *str, uchar *end)
my_wc_to_printable_ex(CHARSET_INFO *cs, my_wc_t wc,
uchar *str, uchar *end,
uint bs, uint bslen, uint diglen)
{
uchar *str0;
uint i, length;
uchar tmp[MY_CS_PRINTABLE_CHAR_LENGTH];
uchar tmp[MY_CS_PRINTABLE_CHAR_LENGTH * MY_CS_MBMAXLEN];
if (my_is_printable(wc))
{
@ -1057,27 +1064,62 @@ my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc,
return mblen;
}
if (str + MY_CS_PRINTABLE_CHAR_LENGTH * cs->mbminlen > end)
return MY_CS_TOOSMALLN(MY_CS_PRINTABLE_CHAR_LENGTH * cs->mbminlen);
if (str + my_printable_length(bslen, diglen) > end)
return MY_CS_TOOSMALLN(my_printable_length(bslen, diglen));
if ((cs->state & MY_CS_NONASCII) == 0)
return to_printable_8bit(str, wc);
return to_printable_8bit(str, wc, bs);
length= to_printable_8bit(tmp, wc);
length= to_printable_8bit(tmp, wc, bs);
str0= str;
for (i= 0; i < length; i++)
{
if (my_ci_wc_mb(cs, tmp[i], str, end) != (int) cs->mbminlen)
uint expected_length= i == 0 ? bslen : diglen;
if (my_ci_wc_mb(cs, tmp[i], str, end) != (int) expected_length)
{
DBUG_ASSERT(0);
return MY_CS_ILSEQ;
}
str+= cs->mbminlen;
str+= expected_length;
}
return (int) (str - str0);
}
/*
  wc_to_printable implementation for 8bit character sets.
  Both the escape character and every hex digit take exactly one byte.
*/
int
my_wc_to_printable_8bit(CHARSET_INFO *cs, my_wc_t wc,
                        uchar *str, uchar *end)
{
  uint escape= '\\';
  DBUG_ASSERT(cs->mbminlen == 1);

  if (cs->tab_to_uni && cs->tab_to_uni['\\'] != '\\')
  {
    /*
      Special case: the character set (e.g. swe7) does not have the
      backslash character. Use dot instead of backslash for escaping.
    */
    escape= '.';
    /*
      Backslashes coming from the original string are replaced with dots
      as well, so this error message:
        Invalid swe7 character string: '\xEF\xBC\xB4'
      is displayed as:
        Invalid swe7 character string: '.xEF.xBC.xB4'
      which is more readable than what the message would look like
      without the '\'-to-dot mapping:
        Invalid swe7 character string: '.005CxEF.005CxBC.005CxB4'
    */
    if (wc == '\\')
      wc= '.';
  }

  return my_wc_to_printable_ex(cs, wc, str, end, escape, 1, 1);
}
/*
  Generic wc_to_printable implementation: the escape character is a
  backslash, and both the backslash and every hex digit are expected
  to take cs->mbminlen bytes.
*/
int
my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc,
                           uchar *str, uchar *end)
{
  uint unit_bytes= cs->mbminlen;

  return my_wc_to_printable_ex(cs, wc, str, end, '\\',
                               unit_bytes, unit_bytes);
}
/*
Convert a string between two character sets.
'to' must be large enough to store (form_length * to_cs->mbmaxlen) bytes.

View File

@ -117,4 +117,17 @@ uint my_8bit_collation_flags_from_data(CHARSET_INFO *cs);
#define MY_HASH_ADD_16(A, B, value) \
do { MY_HASH_ADD(A, B, ((value) & 0xFF)) ; MY_HASH_ADD(A, B, ((value >>8 ))); } while(0)
#endif
#define my_wc_t ulong
/*
  Write the character "wc" into [s, e) in the character set "cs".
  A character that is not printable is written in escaped form: the escape
  character "bs" followed by hex digits.
    bs      - code of the escape character
    bslen   - number of bytes the escape character must take in "cs"
    diglen  - number of bytes every hex digit must take in "cs"
  For the escaped form: returns the number of bytes written,
  MY_CS_TOOSMALLN(..) when [s, e) is too short, or MY_CS_ILSEQ when the
  escape character or a digit does not encode to the expected length.
*/
int my_wc_to_printable_ex(CHARSET_INFO *cs, my_wc_t wc,
uchar *s, uchar *e,
uint bs, uint bslen, uint diglen);
/*
  Calls my_wc_to_printable_ex() with backslash as the escape character
  and cs->mbminlen as both "bslen" and "diglen".
*/
int my_wc_to_printable_generic(CHARSET_INFO *cs, my_wc_t wc,
uchar *s, uchar *e);
/*
  Variant for 8bit character sets: "bslen" and "diglen" are 1. A dot is used
  as the escape character instead of backslash when the character set's
  tab_to_uni table does not map the byte '\\' to U+005C (e.g. swe7).
*/
int my_wc_to_printable_8bit(CHARSET_INFO *cs, my_wc_t wc,
uchar *s, uchar *e);
#endif /*STRINGS_DEF_INCLUDED */