Moving the conversion code from String::well_formed_copy()

to my_convert_fix() - a new function in /strings.
2015-03-16 12:14:31 +04:00 · 2015-03-16 12:14:31 +04:00 · f48dc5ccc7
commit f48dc5ccc7
parent c4b268add0
4 changed files with 117 additions and 72 deletions
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@ -382,6 +382,16 @@ typedef struct
 } MY_STRCOPY_STATUS;
 /*
  A structure to return the statistics of a Unicode string conversion.
  It is filled in by my_convert_fix().
 */
 typedef struct
 {
  /*
    Holds m_source_end_pos (where reading of the source stopped) and
    m_well_formed_error_pos (the first badly formed byte sequence, or NULL).
  */
  MY_STRCOPY_STATUS m_native_copy_status;
  /*
    Position in the source of the first character that cannot be
    represented in the destination character set, or NULL if every
    character was converted successfully.
  */
  const char *m_cannot_convert_error_pos;
 } MY_STRCONV_STATUS;
 /* See strings/CHARSET_INFO.txt about information on this structure  */
 struct my_charset_handler_st
 {
@ -852,10 +862,38 @@ const MY_CONTRACTIONS *my_charset_get_contractions(CHARSET_INFO *cs,
 extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n,
                              const char* fmt, va_list ap);
 /*
  Convert a string between two character sets.
  Bad byte sequences as well as characters that cannot be
  encoded in the destination character set are replaced with '?'.
 */
 uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
                  const char *from, uint32 from_length,
                  CHARSET_INFO *from_cs, uint *errors);
 /*
  Convert a string between two character sets.
  Bad byte sequences as well as characters that cannot be
  encoded in the destination character set are replaced with '?'.
  Not more than "nchars" characters are copied.
  Conversion statistics are returned in "status" and are set as follows:
  - status->m_native_copy_status.m_source_end_pos - to the position
    between (src) and (src+src_length), where the function stopped reading
    the source string.
  - status->m_native_copy_status.m_well_formed_error_pos - to the position
    between (src) and (src+src_length), where the first badly formed byte
    sequence was found, or to NULL if the string was well formed in the
    given range.
  - status->m_cannot_convert_error_pos - to the position 
    between (src) and (src+src_length), where the first character that
    cannot be represented in the destination character set was found,
    or to NULL if all characters in the given range were successfully
    converted.
 */
 size_t my_convert_fix(CHARSET_INFO *dstcs, char *dst, size_t dst_length,
                      CHARSET_INFO *srccs, const char *src, size_t src_length,
                      size_t nchars, MY_STRCONV_STATUS *status);
 #define	_MY_U	01	/* Upper case */
 #define	_MY_L	02	/* Lower case */
 #define	_MY_NMR	04	/* Numeral (digit) */
--- a/sql/sql_string.cc
+++ b/sql/sql_string.cc
@ -914,8 +914,6 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs,
                                const char *from, uint from_length,
                                uint nchars)
 {
  uint res;
  if ((to_cs == &my_charset_bin) || 
      (from_cs == &my_charset_bin) ||
      (to_cs == from_cs) ||
@ -923,73 +921,10 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs,
  {
    m_cannot_convert_error_pos= NULL;
    return to_cs->cset->copy_fix(to_cs, to, to_length, from, from_length,
-                                 nchars, this);
+                                 nchars, &m_native_copy_status);
  }
-  else
+  return my_convert_fix(to_cs, to, to_length, from_cs, from, from_length,
-  {
+                        nchars, this);
    int cnvres;
    my_wc_t wc;
    my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
    my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
    const uchar *from_end= (const uchar*) from + from_length;
    uchar *to_end= (uchar*) to + to_length;
    char *to_start= to;
    m_well_formed_error_pos= NULL;
    m_cannot_convert_error_pos= NULL;
    for ( ; nchars; nchars--)
    {
      const char *from_prev= from;
      if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
        from+= cnvres;
      else if (cnvres == MY_CS_ILSEQ)
      {
        if (!m_well_formed_error_pos)
          m_well_formed_error_pos= from;
        from++;
        wc= '?';
      }
      else if (cnvres > MY_CS_TOOSMALL)
      {
        /*
          A correct multibyte sequence detected
          But it doesn't have Unicode mapping.
        */
        if (!m_cannot_convert_error_pos)
          m_cannot_convert_error_pos= from;
        from+= (-cnvres);
        wc= '?';
      }
      else
      {
        if ((uchar *) from >= from_end)
          break; // End of line
        // Incomplete byte sequence
        if (!m_well_formed_error_pos)
          m_well_formed_error_pos= from;
        from++;
        wc= '?';
      }
 outp:
      if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
        to+= cnvres;
      else if (cnvres == MY_CS_ILUNI && wc != '?')
      {
        if (!m_cannot_convert_error_pos)
          m_cannot_convert_error_pos= from_prev;
        wc= '?';
        goto outp;
      }
      else
      {
        from= from_prev;
        break;
      }
    }
    m_source_end_pos= from;
    res= (uint) (to - to_start);
  }
  return res;
 }
--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@ -43,14 +43,13 @@ inline uint32 copy_and_convert(char *to, uint32 to_length,
 }
-class String_copier: private MY_STRCOPY_STATUS
+class String_copier: private MY_STRCONV_STATUS
 {
  const char *m_cannot_convert_error_pos;
 public:
  const char *source_end_pos() const
-  { return m_source_end_pos; }
+  { return m_native_copy_status.m_source_end_pos; }
  const char *well_formed_error_pos() const
-  { return m_well_formed_error_pos; }
+  { return m_native_copy_status.m_well_formed_error_pos; }
  const char *cannot_convert_error_pos() const
  { return m_cannot_convert_error_pos; }
  const char *most_important_error_pos() const
--- a/strings/ctype.c
+++ b/strings/ctype.c
@ -1161,3 +1161,76 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
  DBUG_ASSERT(FALSE); // Should never get to here
  return 0;           // Make compiler happy
 }
 /*
  Convert at most "nchars" characters from "from" (encoded in from_cs)
  into "to" (encoded in to_cs). Every badly formed byte sequence and every
  character that cannot be converted is written as '?'.

  Positions reported through "status":
  - m_native_copy_status.m_source_end_pos: where reading of the source
    stopped.
  - m_native_copy_status.m_well_formed_error_pos: the first badly formed
    or incomplete byte sequence, or NULL.
  - m_cannot_convert_error_pos: the first character that could not be
    converted, or NULL.

  Returns the number of bytes written to "to".
 */
 size_t
 my_convert_fix(CHARSET_INFO *to_cs, char *to, size_t to_length,
                CHARSET_INFO *from_cs, const char *from, size_t from_length,
                size_t nchars, MY_STRCONV_STATUS *status)
 {
  my_charset_conv_mb_wc decode= from_cs->cset->mb_wc;
  my_charset_conv_wc_mb encode= to_cs->cset->wc_mb;
  const uchar *src_end= (const uchar*) from + from_length;
  uchar *dst_end= (uchar*) to + to_length;
  char *dst_begin= to;
  size_t remaining;

  DBUG_ASSERT(to_cs != &my_charset_bin);
  DBUG_ASSERT(from_cs != &my_charset_bin);

  status->m_native_copy_status.m_well_formed_error_pos= NULL;
  status->m_cannot_convert_error_pos= NULL;

  for (remaining= nchars; remaining != 0; remaining--)
  {
    const char *char_start= from;
    my_wc_t wc;
    int rc= (*decode)(from_cs, &wc, (uchar*) from, src_end);

    if (rc > 0)
    {
      /* A character was decoded: step over its bytes */
      from+= rc;
    }
    else if (rc == MY_CS_ILSEQ)
    {
      /* Badly formed byte: remember the first one, skip it, emit '?' */
      if (status->m_native_copy_status.m_well_formed_error_pos == NULL)
        status->m_native_copy_status.m_well_formed_error_pos= from;
      from++;
      wc= '?';
    }
    else if (rc > MY_CS_TOOSMALL)
    {
      /*
        The sequence is correct and is (-rc) bytes long,
        but it has no Unicode mapping.
      */
      if (status->m_cannot_convert_error_pos == NULL)
        status->m_cannot_convert_error_pos= from;
      from+= (-rc);
      wc= '?';
    }
    else
    {
      /* Nothing left to read: the source is exhausted */
      if ((uchar *) from >= src_end)
        break;
      /* Otherwise the trailing byte sequence is incomplete */
      if (status->m_native_copy_status.m_well_formed_error_pos == NULL)
        status->m_native_copy_status.m_well_formed_error_pos= from;
      from++;
      wc= '?';
    }

    rc= (*encode)(to_cs, wc, (uchar*) to, dst_end);
    if (rc <= 0)
    {
      if (rc == MY_CS_ILUNI && wc != '?')
      {
        /*
          The character does not exist in the destination character set:
          remember the first such position and try once more with '?'.
        */
        if (status->m_cannot_convert_error_pos == NULL)
          status->m_cannot_convert_error_pos= char_start;
        wc= '?';
        rc= (*encode)(to_cs, wc, (uchar*) to, dst_end);
      }
      if (rc <= 0)
      {
        /* Could not store the character: un-read it and stop */
        from= char_start;
        break;
      }
    }
    to+= rc;
  }

  status->m_native_copy_status.m_source_end_pos= from;
  return to - dst_begin;
 }