diff --git a/include/m_ctype.h b/include/m_ctype.h index b92fb676a2e..78657bd1084 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -44,6 +44,7 @@ typedef struct unicase_info_st uint16 sort; } MY_UNICASE_INFO; + extern MY_UNICASE_INFO *my_unicase_default[256]; extern MY_UNICASE_INFO *my_unicase_turkish[256]; @@ -52,6 +53,19 @@ extern MY_UNICASE_INFO *my_unicase_turkish[256]; #define MY_CS_TOOSMALL -1 #define MY_CS_TOOFEW(n) (-1-(n)) +/* mb_wc and wc_mb return codes */ +#define MY_CS_ILSEQ 0 /* Wrong byte sequence: mb_wc */ +#define MY_CS_ILUNI 0 /* Cannot encode Unicode to charset: wc_mb */ +#define MY_CS_TOOSMALL -101 /* Need at least one byte: wc_mb and mb_wc */ +#define MY_CS_TOOSMALL2 -102 /* Need at least two bytes: wc_mb and mb_wc */ +#define MY_CS_TOOSMALL3 -103 /* Need at least three bytes: wc_mb and mb_wc */ +/* The following three are currently not really used */ +#define MY_CS_TOOSMALL4 -104 /* Need at least 4 bytes: wc_mb and mb_wc */ +#define MY_CS_TOOSMALL5 -105 /* Need at least 5 bytes: wc_mb and mb_wc */ +#define MY_CS_TOOSMALL6 -106 /* Need at least 6 bytes: wc_mb and mb_wc */ +/* A helper macro for "need at least n bytes" */ +#define MY_CS_TOOSMALLN(n) (-100-(n)) + #define MY_SEQ_INTTAIL 1 #define MY_SEQ_SPACES 2 diff --git a/mysql-test/r/ctype_big5.result b/mysql-test/r/ctype_big5.result index 4c5832a57e9..6574908101c 100644 --- a/mysql-test/r/ctype_big5.result +++ b/mysql-test/r/ctype_big5.result @@ -189,3 +189,6 @@ select hex(a) from t1 where a = _big5 0xF9DC; hex(a) E5ABBA drop table t1; +select hex(convert(_big5 0xC84041 using ucs2)); +hex(convert(_big5 0xC84041 using ucs2)) +003F0041 diff --git a/mysql-test/r/ctype_gbk.result b/mysql-test/r/ctype_gbk.result index aaffe692126..241539ecf42 100644 --- a/mysql-test/r/ctype_gbk.result +++ b/mysql-test/r/ctype_gbk.result @@ -165,3 +165,6 @@ hex(a) A1A1 A3A0 DROP TABLE t1; +select hex(convert(_gbk 0xA14041 using ucs2)); +hex(convert(_gbk 0xA14041 using ucs2)) +003F0041 diff --git 
a/mysql-test/r/ctype_ujis.result b/mysql-test/r/ctype_ujis.result index 2e14fe34430..e8ec1b2cbfb 100644 --- a/mysql-test/r/ctype_ujis.result +++ b/mysql-test/r/ctype_ujis.result @@ -2337,3 +2337,9 @@ DROP TABLE t2; set names default; set character_set_database=default; set character_set_server=default; +select hex(convert(_ujis 0xA5FE41 using ucs2)); +hex(convert(_ujis 0xA5FE41 using ucs2)) +003F0041 +select hex(convert(_ujis 0x8FABF841 using ucs2)); +hex(convert(_ujis 0x8FABF841 using ucs2)) +003F0041 diff --git a/mysql-test/t/ctype_big5.test b/mysql-test/t/ctype_big5.test index ffe2a12234e..200002cd235 100644 --- a/mysql-test/t/ctype_big5.test +++ b/mysql-test/t/ctype_big5.test @@ -53,4 +53,14 @@ alter table t1 convert to character set utf8; select hex(a) from t1 where a = _big5 0xF9DC; drop table t1; +# +# Bugs#15375: Unassigned multibyte codes are broken +# into parts when converting to Unicode. +# This query should return 0x003F0041. I.e. it should +# scan unassigned double-byte character 0xC840, convert +# it as QUESTION MARK 0x003F and then scan the next +# character, which is a single byte character 0x41. +# +select hex(convert(_big5 0xC84041 using ucs2)); + # End of 4.1 tests diff --git a/mysql-test/t/ctype_gbk.test b/mysql-test/t/ctype_gbk.test index 5eeade96186..7aec48586d8 100644 --- a/mysql-test/t/ctype_gbk.test +++ b/mysql-test/t/ctype_gbk.test @@ -31,4 +31,14 @@ INSERT INTO t1 VALUES (0xA3A0),(0xA1A1); SELECT hex(a) FROM t1 ORDER BY a; DROP TABLE t1; +# +# Bugs#15375: Unassigned multibyte codes are broken +# into parts when converting to Unicode. +# This query should return 0x003F0041. I.e. it should +# scan unassigned double-byte character 0xA140, convert +# it as QUESTION MARK 0x003F and then scan the next +# character, which is a single byte character 0x41. 
+# +select hex(convert(_gbk 0xA14041 using ucs2)); + # End of 4.1 tests diff --git a/mysql-test/t/ctype_ujis.test b/mysql-test/t/ctype_ujis.test index 77d250b5c45..14b37569b11 100644 --- a/mysql-test/t/ctype_ujis.test +++ b/mysql-test/t/ctype_ujis.test @@ -1152,6 +1152,21 @@ SET collation_connection='ujis_bin'; -- source include/ctype_innodb_like.inc -- source include/ctype_like_escape.inc +# +# Bugs#15375: Unassigned multibyte codes are broken +# into parts when converting to Unicode. +# This query should return 0x003F0041. I.e. it should +# scan unassigned double-byte character 0xA5FE, convert +# it as QUESTION MARK 0x003F and then scan the next +# character, which is a single byte character 0x41. +# +select hex(convert(_ujis 0xA5FE41 using ucs2)); +# This one should return 0x003F0041: +# scan unassigned three-byte character 0x8FABF8, +# convert it as QUESTION MARK 0x003F and then scan +# the next character, which is a single byte character 0x41. +select hex(convert(_ujis 0x8FABF841 using ucs2)); + # End of 4.1 tests --disable_warnings DROP TABLE IF EXISTS t1, t2; diff --git a/sql/sql_string.cc b/sql/sql_string.cc index fd7bca7ec21..79228be8a76 100644 --- a/sql/sql_string.cc +++ b/sql/sql_string.cc @@ -819,8 +819,18 @@ copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, from++; wc= '?'; } + else if (cnvres > MY_CS_TOOSMALL) + { + /* + A correct multibyte sequence detected + But it doesn't have Unicode mapping. + */ + error_count++; + from+= (-cnvres); + wc= '?'; + } else - break; // Impossible char. 
+ break; // Not enough characters outp: if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0) diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index 460215418f8..0ca1cf21129 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -6275,7 +6275,7 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)), int hi=s[0]; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; if (hi<0x80) { @@ -6284,10 +6284,10 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)), } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!(pwc[0]=func_big5_uni_onechar((hi<<8)+s[1]))) - return MY_CS_ILSEQ; + return -2; return 2; } diff --git a/strings/ctype-bin.c b/strings/ctype-bin.c index 9708257c1b5..54c35c82652 100644 --- a/strings/ctype-bin.c +++ b/strings/ctype-bin.c @@ -246,7 +246,7 @@ static int my_mb_wc_bin(CHARSET_INFO *cs __attribute__((unused)), const unsigned char *end __attribute__((unused))) { if (str >= end) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; *wc=str[0]; return 1; diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index 5e357e0b65c..d6df46f7e05 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -8601,7 +8601,7 @@ my_wc_mb_euc_kr(CHARSET_INFO *cs __attribute__((unused)), return MY_CS_ILUNI; if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; s[0]=code>>8; s[1]=code&0xFF; @@ -8617,7 +8617,7 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)), int hi=s[0]; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; if (hi<0x80) { @@ -8626,10 +8626,10 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)), } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!(pwc[0]=func_ksc5601_uni_onechar((hi<<8)+s[1]))) - return MY_CS_ILSEQ; + return -2; return 2; } diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index f3938cc27ba..29ecafc3527 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -5651,7 +5651,7 @@ 
my_wc_mb_gb2312(CHARSET_INFO *cs __attribute__((unused)), return MY_CS_ILUNI; if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; code|=0x8080; s[0]=code>>8; @@ -5668,7 +5668,7 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)), hi=(int) s[0]; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; if (hi<0x80) { @@ -5677,10 +5677,10 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)), } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!(pwc[0]=func_gb2312_uni_onechar(((hi<<8)+s[1])&0x7F7F))) - return MY_CS_ILSEQ; + return -2; return 2; } diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index edc595875d7..ef1b33fd82c 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -9902,7 +9902,7 @@ my_wc_mb_gbk(CHARSET_INFO *cs __attribute__((unused)), return MY_CS_ILUNI; if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; s[0]=code>>8; s[1]=code&0xFF; @@ -9916,7 +9916,7 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)), int hi; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; hi=s[0]; @@ -9927,10 +9927,10 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)), } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!(pwc[0]=func_gbk_uni_onechar( (hi<<8) + s[1]))) - return MY_CS_ILSEQ; + return -2; return 2; diff --git a/strings/ctype-latin1.c b/strings/ctype-latin1.c index 746cb5a4003..a3f5aec9605 100644 --- a/strings/ctype-latin1.c +++ b/strings/ctype-latin1.c @@ -363,10 +363,10 @@ int my_mb_wc_latin1(CHARSET_INFO *cs __attribute__((unused)), const unsigned char *end __attribute__((unused))) { if (str >= end) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; *wc=cs_to_uni[*str]; - return (!wc[0] && str[0]) ? MY_CS_ILSEQ : 1; + return (!wc[0] && str[0]) ? 
-1 : 1; } static diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index d6f2f0e5fe5..12ef77c59b1 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -239,10 +239,10 @@ int my_mb_wc_8bit(CHARSET_INFO *cs,my_wc_t *wc, const unsigned char *end __attribute__((unused))) { if (str >= end) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; *wc=cs->tab_to_uni[*str]; - return (!wc[0] && str[0]) ? MY_CS_ILSEQ : 1; + return (!wc[0] && str[0]) ? -1 : 1; } int my_wc_mb_8bit(CHARSET_INFO *cs,my_wc_t wc, diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index 398aea08b05..57d6d8bae2b 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -4516,7 +4516,7 @@ my_wc_mb_sjis(CHARSET_INFO *cs __attribute__((unused)), mb: if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; s[0]=code>>8; s[1]=code&0xFF; @@ -4530,7 +4530,7 @@ my_mb_wc_sjis(CHARSET_INFO *cs __attribute__((unused)), int hi=s[0]; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; if (hi < 0x80) { @@ -4545,10 +4545,10 @@ my_mb_wc_sjis(CHARSET_INFO *cs __attribute__((unused)), } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!(pwc[0]=func_sjis_uni_onechar((hi<<8)+s[1]))) - return MY_CS_ILSEQ; + return -2; return 2; } diff --git a/strings/ctype-tis620.c b/strings/ctype-tis620.c index 1200644de3c..dc4f18b516b 100644 --- a/strings/ctype-tis620.c +++ b/strings/ctype-tis620.c @@ -827,10 +827,10 @@ int my_mb_wc_tis620(CHARSET_INFO *cs __attribute__((unused)), const unsigned char *end __attribute__((unused))) { if (str >= end) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; *wc=cs_to_uni[*str]; - return (!wc[0] && str[0]) ? MY_CS_ILSEQ : 1; + return (!wc[0] && str[0]) ? 
-1 : 1; } static diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index e2629f445cb..97dca79e84b 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -94,7 +94,7 @@ static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)), my_wc_t * pwc, const uchar *s, const uchar *e) { if (s+2 > e) /* Need 2 characters */ - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; *pwc= ((unsigned char)s[0]) * 256 + ((unsigned char)s[1]); return 2; @@ -104,7 +104,7 @@ static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) , my_wc_t wc, uchar *r, uchar *e) { if ( r+2 > e ) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; r[0]= (uchar) (wc >> 8); r[1]= (uchar) (wc & 0xFF); diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index 696eecaa794..ca27b4bef6b 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -242,7 +242,7 @@ my_mb_wc_jisx0201(CHARSET_INFO *cs __attribute__((unused)), const uchar *e __attribute__((unused))) { wc[0]=tab_jisx0201_uni[*s]; - return (!wc[0] && s[0]) ? MY_CS_ILSEQ : 1; + return (!wc[0] && s[0]) ? 
-1 : 1; } @@ -8341,7 +8341,7 @@ my_mb_wc_euc_jp(CHARSET_INFO *cs,my_wc_t *pwc, const uchar *s, const uchar *e) int c1,c2,c3; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; c1=s[0]; @@ -8353,7 +8353,7 @@ my_mb_wc_euc_jp(CHARSET_INFO *cs,my_wc_t *pwc, const uchar *s, const uchar *e) } if (s+2>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; c2=s[1]; @@ -8368,7 +8368,7 @@ my_mb_wc_euc_jp(CHARSET_INFO *cs,my_wc_t *pwc, const uchar *s, const uchar *e) { pwc[0]=my_jisx0208_uni_onechar( ((c1-0x80) << 8) + (c2-0x80)); if (!pwc[0]) - return MY_CS_ILSEQ; + return -2; } else { @@ -8388,7 +8388,7 @@ my_mb_wc_euc_jp(CHARSET_INFO *cs,my_wc_t *pwc, const uchar *s, const uchar *e) ret = my_mb_wc_jisx0201(cs,pwc,s+1,e); if (ret!=1) - return ret; + return -2; return 2; } @@ -8399,7 +8399,7 @@ my_mb_wc_euc_jp(CHARSET_INFO *cs,my_wc_t *pwc, const uchar *s, const uchar *e) return MY_CS_ILSEQ; if (s+3>e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL3; c3=s[2]; if (c3 < 0xA1 || c3>=0xFF) @@ -8408,8 +8408,8 @@ my_mb_wc_euc_jp(CHARSET_INFO *cs,my_wc_t *pwc, const uchar *s, const uchar *e) if (c2<0xF5) { pwc[0]=my_jisx0212_uni_onechar((c2-0x80)*256 + (c3-0x80)); - if (!pwc) - return MY_CS_ILSEQ; + if (!pwc[0]) + return -3; } else { @@ -8440,7 +8440,7 @@ my_wc_mb_euc_jp(CHARSET_INFO *c,my_wc_t wc, unsigned char *s, unsigned char *e) if ((jp=my_uni_jisx0208_onechar(wc))) { if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; jp+=0x8080; s[0]=jp>>8; @@ -8452,7 +8452,7 @@ my_wc_mb_euc_jp(CHARSET_INFO *c,my_wc_t wc, unsigned char *s, unsigned char *e) if (my_wc_mb_jisx0201(c,wc,s,e) == 1) { if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; s[1]= s[0]; s[0]= 0x8E; return 2; @@ -8462,7 +8462,7 @@ my_wc_mb_euc_jp(CHARSET_INFO *c,my_wc_t wc, unsigned char *s, unsigned char *e) if ((jp=my_uni_jisx0212_onechar(wc))) { if (s+3>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL3; jp+=0x8080; s[0]=0x8F; @@ -8476,7 +8476,7 @@ my_wc_mb_euc_jp(CHARSET_INFO 
*c,my_wc_t wc, unsigned char *s, unsigned char *e) if (wc>=0xE000 && wc<0xE3AC) { if (s+2>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL2; c1=((unsigned)(wc-0xE000)/94)+0xF5; s[0]=c1; @@ -8490,7 +8490,7 @@ my_wc_mb_euc_jp(CHARSET_INFO *c,my_wc_t wc, unsigned char *s, unsigned char *e) if (wc>=0xE3AC && wc<0xE758) { if (s+3>e) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALL3; s[0]=0x8F; c1=((unsigned)(wc-0xE3AC)/94)+0xF5; diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index e1bb746fd9a..3594ab954c6 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -1947,7 +1947,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), unsigned char c; if (s >= e) - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL; c= s[0]; if (c < 0x80) @@ -1960,7 +1960,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), else if (c < 0xe0) { if (s+2 > e) /* We need 2 characters */ - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL2; if (!((s[1] ^ 0x80) < 0x40)) return MY_CS_ILSEQ; @@ -1971,7 +1971,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), else if (c < 0xf0) { if (s+3 > e) /* We need 3 characters */ - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL3; if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (c >= 0xe1 || s[1] >= 0xa0))) return MY_CS_ILSEQ; @@ -1986,7 +1986,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), else if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32) { if (s+4 > e) /* We need 4 characters */ - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL4; if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && @@ -2004,7 +2004,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), else if (c < 0xfc && sizeof(my_wc_t)*8 >= 32) { if (s+5 >e) /* We need 5 characters */ - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL5; if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && @@ -2023,7 +2023,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)), else if (c < 0xfe && 
sizeof(my_wc_t)*8 >= 32) { if ( s+6 >e ) /* We need 6 characters */ - return MY_CS_TOOFEW(0); + return MY_CS_TOOSMALL6; if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && @@ -2074,7 +2074,7 @@ static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)) , Because of it (r+count > e), not (r+count-1 >e ) */ if ( r+count > e ) - return MY_CS_TOOSMALL; + return MY_CS_TOOSMALLN(count); switch (count) { /* Fall through all cases!!! */