Fixes for German sorting order.

2001-09-11 01:40:52 +03:00 · 2001-09-11 01:40:52 +03:00 · fa23b20789
commit fa23b20789
parent c526f5d2ac
9 changed files with 328 additions and 155 deletions
--- a/Docs/manual.texi
+++ b/Docs/manual.texi
@ -748,7 +748,7 @@ is also available through the SQL interface as well.
@item
 Full support for several different character sets, including
-ISO-8859-1 (Latin1), big5, ujis, and more.  For example, the
+ISO-8859-1 (Latin1), german, big5, ujis, and more.  For example, the
 Scandinavian characters `@ringaccent{a}', `@"a' and `@"o' are allowed
 in table and column names.
@ -20442,6 +20442,35 @@ default-character-set=character-set-name
 but normally this is never needed.
@menu
 * German character set::        
@end menu
@node German character set,  , Character sets, Character sets
@subsubsection German character set
 To get German sorting order, you should start @code{mysqld} with
@code{--default-character-set=latin1_de}.  This will give you the following
 characteristics.
 When sorting and comparing strings, the following mapping is done on the
 strings before doing the comparison:
@example
 ä  ->  ae
 ö  ->  oe
 ü  ->  ue
 ß  ->  ss
@end example
 All accented characters, except @code{'é'} and @code{É} are converted to
 their un-accented counterpart.  All letters are converted to uppercase.
 When comparing strings with @code{LIKE} the one -> two character mapping
 is not done. All letters are converted to uppercase. Accents are removed
 from all letters except: @code{Ü}, @code{ü}, @code{É}, @code{é}, @code{Ö},
@code{ö}, @code{Ä} and @code{ä}.
@node Languages, Adding character set, Character sets, Localization
@subsection Non-English Error Messages
@ -46753,6 +46782,8 @@ Our TODO section contains what we plan to have in 4.0. @xref{TODO MySQL 4.0}.
@itemize @bullet
@item
 New character set @code{latin1_de} which provides correct German sorting.
@item
@code{TRUNCATE TABLE} and @code{DELETE FROM table_name} are now separate
 functions. One bonus is that @code{DELETE FROM table_name} now returns
 the number of deleted rows.
--- a/configure.in
+++ b/configure.in
@ -1826,7 +1826,7 @@ CHARSETS_AVAILABLE="big5 cp1251 cp1257
                          latin1 latin1_de latin2 latin5 sjis swe7 tis620 ujis
                          usa7 win1250 win1251ukr"
 CHARSETS_DEPRECATED="win1251"
-DEFAULT_CHARSET=latin1_de
+DEFAULT_CHARSET=latin1
 AC_DIVERT_POP
 AC_ARG_WITH(charset,
--- a/myisam/mi_delete_all.c
+++ b/myisam/mi_delete_all.c
@ -15,7 +15,7 @@
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 /* Remove all rows from a MyISAM table */
-/* This only clears the status information;  The files are not truncated */
+/* This clears the status information and truncates the data file */
 #include "myisamdef.h"
@ -50,6 +50,8 @@ int mi_delete_all_rows(MI_INFO *info)
  myisam_log_command(MI_LOG_DELETE_ALL,info,(byte*) 0,0,0);
  VOID(_mi_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
  if (my_chsize(info->dfile, 0, MYF(MY_WME)))
    goto err;
  allow_break();			/* Allow SIGHUP & SIGINT */
  DBUG_RETURN(0);
--- a/myisam/mi_search.c
+++ b/myisam/mi_search.c
@ -657,19 +657,19 @@ void _mi_dpointer(MI_INFO *info, uchar *buff, my_off_t pos)
 int _mi_compare_text(CHARSET_INFO *charset_info, uchar *a, uint a_length,
                     uchar *b, uint b_length, my_bool part_key)
 {
  uint length= min(a_length,b_length);
  uchar *end= a+ length;
  int flag;
 #ifdef USE_STRCOLL
  if (use_strcoll(charset_info))
  {
-    if ((flag = my_strnncoll(charset_info, a, a_length, b, b_length)))
+    /* QQ: This needs to work with part keys at some point */
-      return flag;
+    return my_strnncoll(charset_info, a, a_length, b, b_length);
  }
  else
 #endif
  {
    uint length= min(a_length,b_length);
    uchar *end= a+ length;
    uchar *sort_order=charset_info->sort_order;
    while (a < end)
      if ((flag= (int) sort_order[*a++] - (int) sort_order[*b++]))
@ -768,8 +768,15 @@ int _mi_key_cmp(register MI_KEYSEG *keyseg, register uchar *a,
      }
      else
      {
-        uint length=(uint) (end-a);
+	uint length=(uint) (end-a), a_length=length, b_length=length;
-        if ((flag=_mi_compare_text(keyseg->charset,a,length,b,length,
+	if (!(nextflag & SEARCH_PREFIX))
 	{
 	  while (a_length && a[a_length-1] == ' ')
 	    a_length--;
 	  while (b_length && b[b_length-1] == ' ')
 	    b_length--;
 	}
        if ((flag=_mi_compare_text(keyseg->charset,a,a_length,b,b_length,
                                   (my_bool) ((nextflag & SEARCH_PREFIX) &&
                                              next_key_length <= 0))))
          return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
--- a/mysql-test/r/ctype_latin1_de.result
+++ b/mysql-test/r/ctype_latin1_de.result
@ -0,0 +1,168 @@
 a	b
 a	35
 ac	2
 ad	4
 ä	1
 ae	3
 ää	31
 aeae	33
 ääa	32
 aeb	6
 Äc	5
 eä	28
 o	37
 oc	15
 od	18
 ö	14
 oe	17
 Öa	16
 oeb	20
 Öc	19
 öo	30
 q	34
 s	21
 ss	22
 ß	23
 ssa	25
 ßa	27
 ßb	24
 ssc	26
 u	36
 uc	8
 ud	10
 ue	9
 Ü	11
 ueb	12
 üc	7
 uf	13
 uü	29
 é	38
 É	39
 a	b
 a	35
 ac	2
 ad	4
 ä	1
 ae	3
 ää	31
 aeae	33
 ääa	32
 aeb	6
 Äc	5
 eä	28
 o	37
 oc	15
 od	18
 ö	14
 oe	17
 Öa	16
 oeb	20
 Öc	19
 öo	30
 q	34
 s	21
 ss	22
 ß	23
 ssa	25
 ßa	27
 ßb	24
 ssc	26
 u	36
 uc	8
 ud	10
 ue	9
 Ü	11
 ueb	12
 üc	7
 uf	13
 uü	29
 é	38
 É	39
 a
 É
 é
 uü
 uf
 üc
 ueb
 Ü
 ue
 ud
 uc
 u
 ssc
 ßb
 ßa
 ssa
 ß
 ss
 s
 q
 öo
 Öc
 oeb
 Öa
 oe
 ö
 od
 oc
 o
 eä
 Äc
 aeb
 ääa
 aeae
 ää
 ae
 ä
 ad
 ac
 a
 Table	Op	Msg_type	Msg_text
 test.t1	check	status	OK
 a	b
 Öa	16
 Öc	19
 öo	30
 a	b
 é	38
 É	39
 a	b
 a	35
 ac	2
 ad	4
 ae	3
 aeae	33
 ääa	32
 aeb	6
 Öa	16
 ssa	25
 ßa	27
 a	b
 u	36
 uc	8
 ud	10
 ue	9
 ueb	12
 uf	13
 uü	29
 a	b
 ss	22
 ssa	25
 ssc	26
 strcmp('ä','ae')	strcmp('ae','ä')	strcmp('aeq','äq')	strcmp('äq','aeq')
 0	0	0	0
 strcmp('ss','ß')	strcmp('ß','ss')	strcmp('ßs','sss')	strcmp('ßq','ssq')
 0	0	0	0
 strcmp('ä','af')	strcmp('a','ä')	strcmp('ää','aeq')	strcmp('ää','aeaeq')
 -1	-1	-1	-1
 strcmp('ss','ßa')	strcmp('ß','ssa')	strcmp('sßa','sssb')	strcmp('s','ß')
 -1	-1	-1	-1
 strcmp('ö','oö')	strcmp('Ü','uü')	strcmp('ö','oeb')
 -1	-1	-1
 strcmp('af','ä')	strcmp('ä','a')	strcmp('aeq','ää')	strcmp('aeaeq','ää')
 1	1	1	1
 strcmp('ßa','ss')	strcmp('ssa','ß')	strcmp('sssb','sßa')	strcmp('ß','s')
 1	1	1	1
 strcmp('u','öa')	strcmp('u','ö')
 1	1
--- a/mysql-test/t/ctype_latin1_de-master.opt
+++ b/mysql-test/t/ctype_latin1_de-master.opt
@ -0,0 +1 @@
 --default-character-set=latin1_de
--- a/mysql-test/t/ctype_latin1_de.test
+++ b/mysql-test/t/ctype_latin1_de.test
@ -0,0 +1,36 @@
 #
 # Test latin1_de character set
 #
 drop table if exists t1;
 create table t1 (a char (20) not null, b int not null auto_increment, index (a,b),index(b));
 insert into t1 (a) values ('ä'),('ac'),('ae'),('ad'),('Äc'),('aeb');
 insert into t1 (a) values ('üc'),('uc'),('ue'),('ud'),('Ü'),('ueb'),('uf');
 insert into t1 (a) values ('ö'),('oc'),('Öa'),('oe'),('od'),('Öc'),('oeb');
 insert into t1 (a) values ('s'),('ss'),('ß'),('ßb'),('ssa'),('ssc'),('ßa');
 insert into t1 (a) values ('eä'),('uü'),('öo'),('ää'),('ääa'),('aeae');
 insert into t1 (a) values ('q'),('a'),('u'),('o'),('é'),('É');
 select a,b from t1 order by a,b;
 select a,b from t1 order by upper(a),b;
 select a from t1 order by a desc;
 check table t1;
 select * from t1 where a like "ö%";
 select * from t1 where a like "%É%";
 select * from t1 where a like "%Á%";
 select * from t1 where a like "%U%";
 select * from t1 where a like "%ss%";
 drop table t1;
 # The following should all return 0 (the strings compare as equal)
 select strcmp('ä','ae'),strcmp('ae','ä'),strcmp('aeq','äq'),strcmp('äq','aeq');
 select strcmp('ss','ß'),strcmp('ß','ss'),strcmp('ßs','sss'),strcmp('ßq','ssq');
 # The following should all return -1
 select strcmp('ä','af'),strcmp('a','ä'),strcmp('ää','aeq'),strcmp('ää','aeaeq');
 select strcmp('ss','ßa'),strcmp('ß','ssa'),strcmp('sßa','sssb'),strcmp('s','ß');
 select strcmp('ö','oö'),strcmp('Ü','uü'),strcmp('ö','oeb');
 # The following should all return 1
 select strcmp('af','ä'),strcmp('ä','a'),strcmp('aeq','ää'),strcmp('aeaeq','ää');
 select strcmp('ßa','ss'),strcmp('ssa','ß'),strcmp('sssb','sßa'),strcmp('ß','s');
 select strcmp('u','öa'),strcmp('u','ö');
--- a/sql/item_cmpfunc.cc
+++ b/sql/item_cmpfunc.cc
@ -254,7 +254,7 @@ longlong Item_func_strcmp::val_int()
    null_value=1;
    return 0;
  }
-  int value=stringcmp(a,b);
+  int value= binary ? stringcmp(a,b) : sortcmp(a,b);
  null_value=0;
  return !value ? 0 : (value < 0 ? (longlong) -1 : (longlong) 1);
 }
--- a/strings/ctype-latin1_de.c
+++ b/strings/ctype-latin1_de.c
@ -99,12 +99,10 @@ uchar to_upper_latin1_de[] = {
 * This is a simple latin1 mapping table, which maps all accented
 * characters to their non-accented equivalents.  Note: in this
 * table, 'ä' is mapped to 'A', 'ÿ' is mapped to 'Y', etc. - all
- * accented characters are treated the same way.
+ * accented characters except the following are treated the same way.
- *
+ * Ü, ü, É, é, Ö, ö, Ä, ä
 * SPECIAL NOTE: 'ß' (the sz ligature), which isn't really an
 * accented 's', is mapped to 'S', to simplify the sorting
 * functions.
 */
 uchar sort_order_latin1_de[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
@ -118,10 +116,10 @@ uchar sort_order_latin1_de[] = {
  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
-   65, 65, 65, 65, 65, 65, 92, 67, 69, 69, 69, 69, 73, 73, 73, 73,
+   65, 65, 65, 65,196, 65, 92, 67, 69,201, 69, 69, 73, 73, 73, 73,
-   68, 78, 79, 79, 79, 79, 79,215,216, 85, 85, 85, 85, 89,222, 83,
+   68, 78, 79, 79, 79, 79,214,215,216, 85, 85, 85,220, 89,222,223,
-   65, 65, 65, 65, 65, 65, 92, 67, 69, 69, 69, 69, 73, 73, 73, 73,
+   65, 65, 65, 65,196, 65, 92, 67, 69,201, 69, 69, 73, 73, 73, 73,
-   68, 78, 79, 79, 79, 79, 79,247,216, 85, 85, 85, 85, 89,222, 89
+   68, 78, 79, 79, 79, 79,214,247,216, 85, 85, 85,220, 89,222, 89
 };
 #define L1_AE 196
@ -132,6 +130,39 @@ uchar sort_order_latin1_de[] = {
 #define L1_ue 252
 #define L1_ss 223
 /*
  Some notes about the following comparison rules:
  By definition, my_strnncoll_latin1_de() must work exactly as if we had
  called my_strnxfrm_latin1_de() on both strings and compared the resulting
  strings.
  This means that:
  Ä must also match ÁE and Aè, because my_strnxfrm_latin1_de() will convert
  both to AE.
  The other option would be to not do any accent removal in
  sort_order_latin1_de[] at all.
 */
 /*
   CHECK_S1_COMBO: statement macro used when two already-fetched characters
   (ch1, ch2) differ and one side may hold the two-letter form "fst snd"
   while the other side holds the single character 'accent'.

   Parameters:
     ch1, ch2             The two differing characters; ch1 is overwritten
                          with fst when the combination cannot apply.
     str1, str1_end       Read pointer and end of the string whose next
                          character must equal snd; str1 is advanced by one
                          when it does.
     res_if_str1_smaller  Value returned when str1 is already at str1_end.
                          The sign of a second-letter difference is
                          multiplied by its negation before being returned.
     str2                 Not referenced by the expansion.
     fst, snd, accent     First letter, second letter and the single
                          character that the pair is compared against.

   Control flow (the macro is not an expression):
     - ch2 != accent: sets ch1= fst and jumps to a label named 'normal',
       which the enclosing function must provide.
     - str1 exhausted: returns res_if_str1_smaller from the enclosing
       function.
     - second letter differs from snd: returns the (sign-adjusted)
       difference from the enclosing function.
     - otherwise: consumes the second letter and falls through to the code
       following the macro invocation.

   The expansion reads sort_order_latin1_de[], so that table must be in
   scope at the point of use.
 */
 #define CHECK_S1_COMBO(ch1, ch2, str1, str1_end, res_if_str1_smaller, str2, fst, snd, accent)   \
  /* Invariant: ch1 == fst == sort_order_latin1_de[accent] && ch1 != ch2 */ \
  if (ch2 != accent)							\
  {									\
    ch1= fst;								\
    goto normal;							\
  }									\
  if (str1 == str1_end)							\
    return res_if_str1_smaller;						\
  {									\
     int diff = (int) sort_order_latin1_de[*str1] - snd;		\
     if (diff)								\
        return diff*(-(res_if_str1_smaller));				\
      /* They are equal (e.g., "Ae" == 'ä') */				\
     str1++;								\
  }
 int my_strnncoll_latin1_de(const uchar * s1, int len1,
                           const uchar * s2, int len2)
 {
@ -140,172 +171,71 @@ int my_strnncoll_latin1_de(const uchar * s1, int len1,
  while (s1 < e1 && s2 < e2)
  {
-    /* to_upper is used instead of sort_order, because we don't want
+    /*
-     * 'Ä' to match "ÁE", only "AE".  This couples the to_upper and
+      Because sort_order_latin1_de doesn't convert 'Ä', Ü or ß we
-     * sort_order tables together, but that is acceptable. */
+      can use it here.
-    uchar c1 = to_upper_latin1_de[*s1];
+    */
-    uchar c2 = to_upper_latin1_de[*s2];
+    uchar c1 = sort_order_latin1_de[*s1++];
    uchar c2 = sort_order_latin1_de[*s2++];
    if (c1 != c2)
    {
-      switch (c1)
+      switch (c1) {
      {
 #define CHECK_S1_COMBO(fst, snd, accent)                                  \
  /* Invariant: c1 == fst == sort_order_latin1_de[accent] && c1 != c2 */  \
  if (c2 == accent)                                                       \
  {                                                                       \
    if (s1 + 1 < e1)                                                      \
    {                                                                     \
      if (to_upper_latin1_de[*(s1 + 1)] == snd)                           \
      {                                                                   \
 	/* They are equal (e.g., "Ae" == 'ä') */                          \
 	s1 += 2;                                                          \
 	s2 += 1;                                                          \
      }                                                                   \
      else                                                                \
      {                                                                   \
 	int diff = sort_order_latin1_de[*(s1 + 1)] - snd;                 \
 	if (diff)                                                         \
 	  return diff;                                                    \
 	else                                                              \
 	  /* Comparison between, e.g., "AÉ" and 'Ä' */                    \
 	  return 1;                                                       \
      }                                                                   \
    }                                                                     \
    else                                                                  \
      return -1;                                                          \
  }                                                                       \
  else                                                                    \
    /* The following should work even if c2 is [ÄÖÜß] */                  \
    return fst - sort_order_latin1_de[c2]
      case 'A':
-	CHECK_S1_COMBO('A', 'E', L1_AE);
+	CHECK_S1_COMBO(c1, c2, s1, e1, -1, s2, 'A', 'E', L1_AE);
 	break;
      case 'O':
-	CHECK_S1_COMBO('O', 'E', L1_OE);
+	CHECK_S1_COMBO(c1, c2, s1, e1, -1, s2, 'O', 'E', L1_OE);
 	break;
      case 'U':
-	CHECK_S1_COMBO('U', 'E', L1_UE);
+	CHECK_S1_COMBO(c1, c2, s1, e1, -1, s2, 'U', 'E', L1_UE);
 	break;
      case 'S':
-	CHECK_S1_COMBO('S', 'S', L1_ss);
+	CHECK_S1_COMBO(c1, c2, s1, e1, -1, s2, 'S', 'S', L1_ss);
 	break;
 #define CHECK_S2_COMBO(fst, snd)                                          \
  /* Invariant: sort_order_latin1_de[c1] == fst && c1 != c2 */            \
  if (c2 == fst)                                                          \
  {                                                                       \
    if (s2 + 1 < e2)                                                      \
    {                                                                     \
      if (to_upper_latin1_de[*(s2 + 1)] == snd)                           \
      {                                                                   \
 	/* They are equal (e.g., 'ä' == "Ae") */                          \
 	s1 += 1;                                                          \
 	s2 += 2;                                                          \
      }                                                                   \
      else                                                                \
      {                                                                   \
 	int diff = sort_order_latin1_de[*(s1 + 1)] - snd;                 \
 	if (diff)                                                         \
 	  return diff;                                                    \
 	else                                                              \
 	  /* Comparison between, e.g., 'Ä' and "AÉ" */                    \
 	  return -1;                                                      \
      }                                                                   \
    }                                                                     \
    else                                                                  \
      return 1;                                                           \
  }                                                                       \
  else                                                                    \
    /* The following should work even if c2 is [ÄÖÜß] */                  \
    return fst - sort_order_latin1_de[c2]
      case L1_AE:
-	CHECK_S2_COMBO('A', 'E');
+	CHECK_S1_COMBO(c1, c2, s2, e2, 1, s1, 'A', 'E', 'A');
 	break;
      case L1_OE:
-	CHECK_S2_COMBO('O', 'E');
+	CHECK_S1_COMBO(c1, c2, s2, e2, 1, s1, 'O', 'E', 'O');
 	break;
      case L1_UE:
-	CHECK_S2_COMBO('U', 'E');
+	CHECK_S1_COMBO(c1, c2, s2, e2, 1, s1, 'U', 'E', 'U');
 	break;
      case L1_ss:
-	CHECK_S2_COMBO('S', 'S');
+	CHECK_S1_COMBO(c1, c2, s2, e2, 1, s1, 'S', 'S', 'S');
 	break;
      default:
 	/*
 	  Handle the case where 'c2' is a special character
 	  If this is true, we know that c1 can't match this character.
 	*/
    normal:
 	switch (c2) {
 	case L1_AE:
 	  return  (int) c1 - (int) 'A';
 	case L1_OE:
 	  return  (int) c1 - (int) 'O';
 	case L1_UE:
 	  return  (int) c1 - (int) 'U';
 	case L1_ss:
-	  /* Make sure these do not match (e.g., "Ä" != "Á") */
+	  return  (int) c1 - (int) 'S';
 	  return sort_order_latin1_de[c1] - sort_order_latin1_de[c2];
 	  break;
 	default:
 	  if (sort_order_latin1_de[*s1] != sort_order_latin1_de[*s2])
 	    return sort_order_latin1_de[*s1] - sort_order_latin1_de[*s2];
 	  ++s1;
 	  ++s2;
 	  break;
 	}
 	break;
 #undef CHECK_S1_COMBO
 #undef CHECK_S2_COMBO
      }
    }
    else
 	{
-      /* In order to consistently treat "ae" == 'ä', but to NOT allow
+	  int diff= (int) c1 - (int) c2;
-       * "aé" == 'ä', we must look ahead here to ensure that the second
+	  if (diff)
-       * letter in a combo really is the unaccented 'e' (or 's' for
+	    return diff;
-       * "ss") and is not an accented character with the same sort_order. */
+	}
      ++s1;
      ++s2;
      if (s1 < e1 && s2 < e2)
      {
 	switch (c1)
 	{
 	case 'A':
 	case 'O':
 	case 'U':
 	  if (sort_order_latin1_de[*s1] == 'E' &&
 	      to_upper_latin1_de[*s1] != 'E' &&
 	      to_upper_latin1_de[*s2] == 'E')
 	    /* Comparison between, e.g., "AÉ" and "AE" */
 	    return 1;
 	  if (sort_order_latin1_de[*s2] == 'E' &&
 	      to_upper_latin1_de[*s2] != 'E' &&
 	      to_upper_latin1_de[*s1] == 'E')
 	    /* Comparison between, e.g., "AE" and "AÉ" */
 	    return -1;
 	  break;
 	case 'S':
 	  if (sort_order_latin1_de[*s1] == 'S' &&
 	      to_upper_latin1_de[*s1] != 'S' &&
 	      to_upper_latin1_de[*s2] == 'S')
 	    /* Comparison between, e.g., "Sß" and "SS" */
 	    return 1;
 	  if (sort_order_latin1_de[*s2] == 'S' &&
 	      to_upper_latin1_de[*s2] != 'S' &&
 	      to_upper_latin1_de[*s1] == 'S')
 	    /* Comparison between, e.g., "SS" and "Sß" */
 	    return -1;
 	  break;
 	default:
 	break;
 	}
      }
    }
  }
  /* A simple test of string lengths won't work -- we test to see
   * which string ran out first */
  return s1 < e1 ? 1 : s2 < e2 ? -1 : 0;
 }
 int my_strnxfrm_latin1_de(uchar * dest, const uchar * src, int len, int srclen)
 {
  const uchar *dest_orig = dest;
@ -313,22 +243,19 @@ int my_strnxfrm_latin1_de(uchar * dest, const uchar * src, int len, int srclen)
  const uchar *se = src + srclen;
  while (src < se && dest < de)
  {
-    switch (*src)
+    uchar chr=sort_order_latin1_de[*src];
-    {
+    switch (chr) {
    case L1_AE:
    case L1_ae:
      *dest++ = 'A';
      if (dest < de)
 	*dest++ = 'E';
      break;
    case L1_OE:
    case L1_oe:
      *dest++ = 'O';
      if (dest < de)
 	*dest++ = 'E';
      break;
    case L1_UE:
    case L1_ue:
      *dest++ = 'U';
      if (dest < de)
 	*dest++ = 'E';
@ -339,7 +266,7 @@ int my_strnxfrm_latin1_de(uchar * dest, const uchar * src, int len, int srclen)
 	*dest++ = 'S';
      break;
    default:
-      *dest++ = sort_order_latin1_de[*src];
+      *dest++= chr;
      break;
    }
    ++src;
@ -347,6 +274,7 @@ int my_strnxfrm_latin1_de(uchar * dest, const uchar * src, int len, int srclen)
  return dest - dest_orig;
 }
 int my_strcoll_latin1_de(const uchar * s1, const uchar * s2)
 {
  /* XXX QQ: This should be fixed to not call strlen */