Moving the conversion code from String::well_formed_copy()

to my_convert_fix() - a new function in /strings.
2015-03-16 12:14:31 +04:00 · 2015-03-16 12:14:31 +04:00 · f48dc5ccc7
commit f48dc5ccc7
parent c4b268add0
4 changed files with 117 additions and 72 deletions
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@ -382,6 +382,16 @@ typedef struct
 } MY_STRCOPY_STATUS;


+/*
+  A structure to return the status of a Unicode string conversion.
+*/
+typedef struct
+{
+  MY_STRCOPY_STATUS m_native_copy_status;  /* Source end and first bad sequence */
+  const char *m_cannot_convert_error_pos;  /* First unconvertible char, or NULL */
+} MY_STRCONV_STATUS;
+
+
 /* See strings/CHARSET_INFO.txt about information on this structure  */
 struct my_charset_handler_st
 {
@ -852,10 +862,38 @@ const MY_CONTRACTIONS *my_charset_get_contractions(CHARSET_INFO *cs,
 extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n,
                              const char* fmt, va_list ap);

+/*
+  Convert a string between two character sets.
+  Bad byte sequences as well as characters that cannot be
+  encoded in the destination character set are replaced with '?'.
+*/
 uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
                   const char *from, uint32 from_length,
                   CHARSET_INFO *from_cs, uint *errors);

+/*
+  Convert a string between two character sets.
+  Bad byte sequences as well as characters that cannot be
+  encoded in the destination character set are replaced with '?'.
+  Not more than "nchars" characters are copied; the result length in bytes
+  is returned. Conversion status is returned in "status", set as follows:
+  - status->m_native_copy_status.m_source_end_pos - to the position
+    between (src) and (src+src_length), where the function stopped reading
+    the source string.
+  - status->m_native_copy_status.m_well_formed_error_pos - to the position
+    between (src) and (src+src_length), where the first badly formed byte
+    sequence was found, or to NULL if the string was well formed in the
+    given range.
+  - status->m_cannot_convert_error_pos - to the position
+    between (src) and (src+src_length), where the first character that
+    cannot be represented in the destination character set was found,
+    or to NULL if all characters in the given range were successfully
+    converted.
+*/
+size_t my_convert_fix(CHARSET_INFO *dstcs, char *dst, size_t dst_length,
+                      CHARSET_INFO *srccs, const char *src, size_t src_length,
+                      size_t nchars, MY_STRCONV_STATUS *status);
+
 #define	_MY_U	01	/* Upper case */
 #define	_MY_L	02	/* Lower case */
 #define	_MY_NMR	04	/* Numeral (digit) */
--- a/sql/sql_string.cc
+++ b/sql/sql_string.cc
@ -914,8 +914,6 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs,
                                const char *from, uint from_length,
                                uint nchars)
 {
-  uint res;
-
  if ((to_cs == &my_charset_bin) || 
      (from_cs == &my_charset_bin) ||
      (to_cs == from_cs) ||
@ -923,73 +921,10 @@ String_copier::well_formed_copy(CHARSET_INFO *to_cs,
  {
    m_cannot_convert_error_pos= NULL;
    return to_cs->cset->copy_fix(to_cs, to, to_length, from, from_length,
-                                 nchars, this);
+                                 nchars, &m_native_copy_status);
  }
-  else
-  {
-    int cnvres;
-    my_wc_t wc;
-    my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
-    my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
-    const uchar *from_end= (const uchar*) from + from_length;
-    uchar *to_end= (uchar*) to + to_length;
-    char *to_start= to;
-    m_well_formed_error_pos= NULL;
-    m_cannot_convert_error_pos= NULL;
-
-    for ( ; nchars; nchars--)
-    {
-      const char *from_prev= from;
-      if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
-        from+= cnvres;
-      else if (cnvres == MY_CS_ILSEQ)
-      {
-        if (!m_well_formed_error_pos)
-          m_well_formed_error_pos= from;
-        from++;
-        wc= '?';
-      }
-      else if (cnvres > MY_CS_TOOSMALL)
-      {
-        /*
-          A correct multibyte sequence detected
-          But it doesn't have Unicode mapping.
-        */
-        if (!m_cannot_convert_error_pos)
-          m_cannot_convert_error_pos= from;
-        from+= (-cnvres);
-        wc= '?';
-      }
-      else
-      {
-        if ((uchar *) from >= from_end)
-          break; // End of line
-        // Incomplete byte sequence
-        if (!m_well_formed_error_pos)
-          m_well_formed_error_pos= from;
-        from++;
-        wc= '?';
-      }
-outp:
-      if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
-        to+= cnvres;
-      else if (cnvres == MY_CS_ILUNI && wc != '?')
-      {
-        if (!m_cannot_convert_error_pos)
-          m_cannot_convert_error_pos= from_prev;
-        wc= '?';
-        goto outp;
-      }
-      else
-      {
-        from= from_prev;
-        break;
-      }
-    }
-    m_source_end_pos= from;
-    res= (uint) (to - to_start);
-  }
-  return res;
+  return my_convert_fix(to_cs, to, to_length, from_cs, from, from_length,
+                        nchars, this);
 }


--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@ -43,14 +43,13 @@ inline uint32 copy_and_convert(char *to, uint32 to_length,
 }


-class String_copier: private MY_STRCOPY_STATUS
+class String_copier: private MY_STRCONV_STATUS
 {
-  const char *m_cannot_convert_error_pos;
 public:
  const char *source_end_pos() const
-  { return m_source_end_pos; }
+  { return m_native_copy_status.m_source_end_pos; }
  const char *well_formed_error_pos() const
-  { return m_well_formed_error_pos; }
+  { return m_native_copy_status.m_well_formed_error_pos; }
  const char *cannot_convert_error_pos() const
  { return m_cannot_convert_error_pos; }
  const char *most_important_error_pos() const
--- a/strings/ctype.c
+++ b/strings/ctype.c
@ -1161,3 +1161,76 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
  DBUG_ASSERT(FALSE); // Should never get to here
  return 0;           // Make compiler happy
 }
+
+/* Convert "from" into "to" via mb_wc/wc_mb; bad or unconvertible input becomes '?'. */
+size_t
+my_convert_fix(CHARSET_INFO *to_cs, char *to, size_t to_length,
+               CHARSET_INFO *from_cs, const char *from, size_t from_length,
+               size_t nchars, MY_STRCONV_STATUS *status)
+{
+  int cnvres;  // Result of the last mb_wc / wc_mb call
+  my_wc_t wc;  // Current character between decoding and encoding
+  my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;  // Source decoder
+  my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;  // Destination encoder
+  const uchar *from_end= (const uchar*) from + from_length;
+  uchar *to_end= (uchar*) to + to_length;
+  char *to_start= to;  // Kept to compute the returned length
+
+  DBUG_ASSERT(to_cs != &my_charset_bin);  // my_charset_bin is not supported here
+  DBUG_ASSERT(from_cs != &my_charset_bin);
+
+  status->m_native_copy_status.m_well_formed_error_pos= NULL;
+  status->m_cannot_convert_error_pos= NULL;
+
+  for ( ; nchars; nchars--)  // Convert at most "nchars" characters
+  {
+    const char *from_prev= from;  // Start of the character being converted
+    if ((cnvres= (*mb_wc)(from_cs, &wc, (uchar*) from, from_end)) > 0)
+      from+= cnvres;  // Decoded: a positive result is the number of bytes read
+    else if (cnvres == MY_CS_ILSEQ)
+    {
+      if (!status->m_native_copy_status.m_well_formed_error_pos)
+        status->m_native_copy_status.m_well_formed_error_pos= from;  // First one only
+      from++;  // Skip one byte of the bad sequence
+      wc= '?';
+    }
+    else if (cnvres > MY_CS_TOOSMALL)
+    {
+      /*
+        A well-formed multi-byte sequence was detected,
+        but it has no Unicode mapping.
+      */
+      if (!status->m_cannot_convert_error_pos)
+        status->m_cannot_convert_error_pos= from;  // First one only
+      from+= (-cnvres);  // Skip -cnvres bytes of the source
+      wc= '?';
+    }
+    else
+    {
+      if ((uchar *) from >= from_end)
+        break; // End of the source string
+      // Incomplete byte sequence: skip one byte and emit '?'
+      if (!status->m_native_copy_status.m_well_formed_error_pos)
+        status->m_native_copy_status.m_well_formed_error_pos= from;
+      from++;
+      wc= '?';
+    }
+outp:  // Encode "wc" into the destination; re-entered once with '?' on MY_CS_ILUNI
+    if ((cnvres= (*wc_mb)(to_cs, wc, (uchar*) to, to_end)) > 0)
+      to+= cnvres;  // Encoded: a positive result is the number of bytes written
+    else if (cnvres == MY_CS_ILUNI && wc != '?')  // wc != '?' avoids an endless retry
+    {
+      if (!status->m_cannot_convert_error_pos)
+        status->m_cannot_convert_error_pos= from_prev;  // Points at the character start
+      wc= '?';
+      goto outp;
+    }
+    else
+    {
+      from= from_prev;  // Nothing was written: rewind (presumably no room left in "to")
+      break;
+    }
+  }
+  status->m_native_copy_status.m_source_end_pos= from;  // Where reading stopped
+  return to - to_start;  // Number of bytes written to "to"
+}