Moving the conversion code from String::well_formed_copy()
to my_convert_fix() - a new function in /strings.
This commit is contained in:
parent
c4b268add0
commit
f48dc5ccc7
@ -382,6 +382,16 @@ typedef struct
|
|||||||
} MY_STRCOPY_STATUS;
|
} MY_STRCOPY_STATUS;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
A structure to return the statistics of a Unicode string conversion.
|
||||||
|
*/
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
MY_STRCOPY_STATUS m_native_copy_status;
|
||||||
|
const char *m_cannot_convert_error_pos;
|
||||||
|
} MY_STRCONV_STATUS;
|
||||||
|
|
||||||
|
|
||||||
/* See strings/CHARSET_INFO.txt about information on this structure */
|
/* See strings/CHARSET_INFO.txt about information on this structure */
|
||||||
struct my_charset_handler_st
|
struct my_charset_handler_st
|
||||||
{
|
{
|
||||||
@ -852,10 +862,38 @@ const MY_CONTRACTIONS *my_charset_get_contractions(CHARSET_INFO *cs,
|
|||||||
extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n,
|
extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n,
|
||||||
const char* fmt, va_list ap);
|
const char* fmt, va_list ap);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Convert a string between two character sets.
|
||||||
|
Bad byte sequences as well as characters that cannot be
|
||||||
|
encoded in the destination character set are replaced with '?'.
|
||||||
|
*/
|
||||||
uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
||||||
const char *from, uint32 from_length,
|
const char *from, uint32 from_length,
|
||||||
CHARSET_INFO *from_cs, uint *errors);
|
CHARSET_INFO *from_cs, uint *errors);
|
||||||
|
|
||||||
|
/*
|
||||||
|
Convert a string between two character sets.
|
||||||
|
Bad byte sequences as well as characters that cannot be
|
||||||
|
encoded in the destination character set are replaced with '?'.
|
||||||
|
Not more than "nchars" characters are copied.
|
||||||
|
Conversion statistics are returned in "status" and are set as follows:
|
||||||
|
- status->m_native_copy_status.m_source_end_pos - to the position
|
||||||
|
between (src) and (src+src_length), where the function stopped reading
|
||||||
|
the source string.
|
||||||
|
- status->m_native_copy_status.m_well_formed_error_pos - to the position
|
||||||
|
between (src) and (src+src_length), where the first badly formed byte
|
||||||
|
sequence was found, or to NULL if the string was well formed in the
|
||||||
|
given range.
|
||||||
|
- status->m_cannot_convert_error_pos - to the position
|
||||||
|
between (src) and (src+src_length), where the first character that
|
||||||
|
cannot be represented in the destination character set was found,
|
||||||
|
or to NULL if all characters in the given range were successfully
|
||||||
|
converted.
|
||||||
|
*/
|
||||||
|
size_t my_convert_fix(CHARSET_INFO *dstcs, char *dst, size_t dst_length,
|
||||||
|
CHARSET_INFO *srccs, const char *src, size_t src_length,
|
||||||
|
size_t nchars, MY_STRCONV_STATUS *status);
|
||||||
|
|
||||||
#define _MY_U 01 /* Upper case */
|
#define _MY_U 01 /* Upper case */
|
||||||
#define _MY_L 02 /* Lower case */
|
#define _MY_L 02 /* Lower case */
|
||||||
#define _MY_NMR 04 /* Numeral (digit) */
|
#define _MY_NMR 04 /* Numeral (digit) */
|
||||||
|
@ -914,8 +914,6 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs,
|
|||||||
const char *from, uint from_length,
|
const char *from, uint from_length,
|
||||||
uint nchars)
|
uint nchars)
|
||||||
{
|
{
|
||||||
uint res;
|
|
||||||
|
|
||||||
if ((to_cs == &my_charset_bin) ||
|
if ((to_cs == &my_charset_bin) ||
|
||||||
(from_cs == &my_charset_bin) ||
|
(from_cs == &my_charset_bin) ||
|
||||||
(to_cs == from_cs) ||
|
(to_cs == from_cs) ||
|
||||||
@ -923,73 +921,10 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs,
|
|||||||
{
|
{
|
||||||
m_cannot_convert_error_pos= NULL;
|
m_cannot_convert_error_pos= NULL;
|
||||||
return to_cs->cset->copy_fix(to_cs, to, to_length, from, from_length,
|
return to_cs->cset->copy_fix(to_cs, to, to_length, from, from_length,
|
||||||
nchars, this);
|
nchars, &m_native_copy_status);
|
||||||
}
|
}
|
||||||
else
|
return my_convert_fix(to_cs, to, to_length, from_cs, from, from_length,
|
||||||
{
|
nchars, this);
|
||||||
int cnvres;
|
|
||||||
my_wc_t wc;
|
|
||||||
my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
|
|
||||||
my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
|
|
||||||
const uchar *from_end= (const uchar*) from + from_length;
|
|
||||||
uchar *to_end= (uchar*) to + to_length;
|
|
||||||
char *to_start= to;
|
|
||||||
m_well_formed_error_pos= NULL;
|
|
||||||
m_cannot_convert_error_pos= NULL;
|
|
||||||
|
|
||||||
for ( ; nchars; nchars--)
|
|
||||||
{
|
|
||||||
const char *from_prev= from;
|
|
||||||
if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
|
|
||||||
from+= cnvres;
|
|
||||||
else if (cnvres == MY_CS_ILSEQ)
|
|
||||||
{
|
|
||||||
if (!m_well_formed_error_pos)
|
|
||||||
m_well_formed_error_pos= from;
|
|
||||||
from++;
|
|
||||||
wc= '?';
|
|
||||||
}
|
|
||||||
else if (cnvres > MY_CS_TOOSMALL)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
A correct multibyte sequence detected
|
|
||||||
But it doesn't have Unicode mapping.
|
|
||||||
*/
|
|
||||||
if (!m_cannot_convert_error_pos)
|
|
||||||
m_cannot_convert_error_pos= from;
|
|
||||||
from+= (-cnvres);
|
|
||||||
wc= '?';
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if ((uchar *) from >= from_end)
|
|
||||||
break; // End of line
|
|
||||||
// Incomplete byte sequence
|
|
||||||
if (!m_well_formed_error_pos)
|
|
||||||
m_well_formed_error_pos= from;
|
|
||||||
from++;
|
|
||||||
wc= '?';
|
|
||||||
}
|
|
||||||
outp:
|
|
||||||
if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
|
|
||||||
to+= cnvres;
|
|
||||||
else if (cnvres == MY_CS_ILUNI && wc != '?')
|
|
||||||
{
|
|
||||||
if (!m_cannot_convert_error_pos)
|
|
||||||
m_cannot_convert_error_pos= from_prev;
|
|
||||||
wc= '?';
|
|
||||||
goto outp;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
from= from_prev;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
m_source_end_pos= from;
|
|
||||||
res= (uint) (to - to_start);
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -43,14 +43,13 @@ inline uint32 copy_and_convert(char *to, uint32 to_length,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class String_copier: private MY_STRCOPY_STATUS
|
class String_copier: private MY_STRCONV_STATUS
|
||||||
{
|
{
|
||||||
const char *m_cannot_convert_error_pos;
|
|
||||||
public:
|
public:
|
||||||
const char *source_end_pos() const
|
const char *source_end_pos() const
|
||||||
{ return m_source_end_pos; }
|
{ return m_native_copy_status.m_source_end_pos; }
|
||||||
const char *well_formed_error_pos() const
|
const char *well_formed_error_pos() const
|
||||||
{ return m_well_formed_error_pos; }
|
{ return m_native_copy_status.m_well_formed_error_pos; }
|
||||||
const char *cannot_convert_error_pos() const
|
const char *cannot_convert_error_pos() const
|
||||||
{ return m_cannot_convert_error_pos; }
|
{ return m_cannot_convert_error_pos; }
|
||||||
const char *most_important_error_pos() const
|
const char *most_important_error_pos() const
|
||||||
|
@ -1161,3 +1161,76 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
|
|||||||
DBUG_ASSERT(FALSE); // Should never get to here
|
DBUG_ASSERT(FALSE); // Should never get to here
|
||||||
return 0; // Make compiler happy
|
return 0; // Make compiler happy
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
size_t
|
||||||
|
my_convert_fix(CHARSET_INFO *to_cs, char *to, size_t to_length,
|
||||||
|
CHARSET_INFO *from_cs, const char *from, size_t from_length,
|
||||||
|
size_t nchars, MY_STRCONV_STATUS *status)
|
||||||
|
{
|
||||||
|
int cnvres;
|
||||||
|
my_wc_t wc;
|
||||||
|
my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
|
||||||
|
my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
|
||||||
|
const uchar *from_end= (const uchar*) from + from_length;
|
||||||
|
uchar *to_end= (uchar*) to + to_length;
|
||||||
|
char *to_start= to;
|
||||||
|
|
||||||
|
DBUG_ASSERT(to_cs != &my_charset_bin);
|
||||||
|
DBUG_ASSERT(from_cs != &my_charset_bin);
|
||||||
|
|
||||||
|
status->m_native_copy_status.m_well_formed_error_pos= NULL;
|
||||||
|
status->m_cannot_convert_error_pos= NULL;
|
||||||
|
|
||||||
|
for ( ; nchars; nchars--)
|
||||||
|
{
|
||||||
|
const char *from_prev= from;
|
||||||
|
if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
|
||||||
|
from+= cnvres;
|
||||||
|
else if (cnvres == MY_CS_ILSEQ)
|
||||||
|
{
|
||||||
|
if (!status->m_native_copy_status.m_well_formed_error_pos)
|
||||||
|
status->m_native_copy_status.m_well_formed_error_pos= from;
|
||||||
|
from++;
|
||||||
|
wc= '?';
|
||||||
|
}
|
||||||
|
else if (cnvres > MY_CS_TOOSMALL)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
A correct multibyte sequence detected
|
||||||
|
But it doesn't have Unicode mapping.
|
||||||
|
*/
|
||||||
|
if (!status->m_cannot_convert_error_pos)
|
||||||
|
status->m_cannot_convert_error_pos= from;
|
||||||
|
from+= (-cnvres);
|
||||||
|
wc= '?';
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if ((uchar *) from >= from_end)
|
||||||
|
break; // End of line
|
||||||
|
// Incomplete byte sequence
|
||||||
|
if (!status->m_native_copy_status.m_well_formed_error_pos)
|
||||||
|
status->m_native_copy_status.m_well_formed_error_pos= from;
|
||||||
|
from++;
|
||||||
|
wc= '?';
|
||||||
|
}
|
||||||
|
outp:
|
||||||
|
if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
|
||||||
|
to+= cnvres;
|
||||||
|
else if (cnvres == MY_CS_ILUNI && wc != '?')
|
||||||
|
{
|
||||||
|
if (!status->m_cannot_convert_error_pos)
|
||||||
|
status->m_cannot_convert_error_pos= from_prev;
|
||||||
|
wc= '?';
|
||||||
|
goto outp;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
from= from_prev;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
status->m_native_copy_status.m_source_end_pos= from;
|
||||||
|
return to - to_start;
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user