Allocate memory when a character set is requested:
- For simple character sets: from_uni convertion table. - For UCA: alternative weight arrays. Use mbminlen instead of MY_CS_NONTEXT
This commit is contained in:
parent
4047b5ade3
commit
5275eb21c2
@ -53,7 +53,6 @@ typedef struct unicase_info_st
|
||||
#define MY_SEQ_SPACES 2
|
||||
|
||||
/* My charsets_list flags */
|
||||
#define MY_NO_SETS 0
|
||||
#define MY_CS_COMPILED 1 /* compiled-in sets */
|
||||
#define MY_CS_CONFIG 2 /* sets that have a *.conf file */
|
||||
#define MY_CS_INDEX 4 /* sets listed in the Index file */
|
||||
@ -62,7 +61,7 @@ typedef struct unicase_info_st
|
||||
#define MY_CS_PRIMARY 32 /* if primary collation */
|
||||
#define MY_CS_STRNXFRM 64 /* if strnxfrm is used for sort */
|
||||
#define MY_CS_UNICODE 128 /* is a charset is full unicode */
|
||||
#define MY_CS_NONTEXT 256 /* if a charset is not sprintf() compatible */
|
||||
#define MY_CS_READY 256 /* if a charset is initialized */
|
||||
#define MY_CS_AVAILABLE 512 /* If either compiled-in or loaded*/
|
||||
|
||||
#define MY_CHARSET_UNDEFINED 0
|
||||
@ -102,6 +101,7 @@ struct charset_info_st;
|
||||
|
||||
typedef struct my_collation_handler_st
|
||||
{
|
||||
my_bool (*init)(struct charset_info_st *, void *(*alloc)(uint));
|
||||
/* Collation routines */
|
||||
int (*strnncoll)(struct charset_info_st *,
|
||||
const uchar *, uint, const uchar *, uint);
|
||||
@ -140,6 +140,7 @@ extern MY_COLLATION_HANDLER my_collation_ucs2_uca_handler;
|
||||
|
||||
typedef struct my_charset_handler_st
|
||||
{
|
||||
my_bool (*init)(struct charset_info_st *, void *(*alloc)(uint));
|
||||
/* Multibyte routines */
|
||||
int (*ismbchar)(struct charset_info_st *, const char *, const char *);
|
||||
int (*mbcharlen)(struct charset_info_st *, uint);
|
||||
|
562
mysys/charset.c
562
mysys/charset.c
@ -22,354 +22,6 @@
|
||||
#include <my_xml.h>
|
||||
|
||||
|
||||
/*
|
||||
Collation language is implemented according to
|
||||
subset of ICU Collation Customization (tailorings):
|
||||
http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
|
||||
|
||||
Collation language elements:
|
||||
Delimiters:
|
||||
space - skipped
|
||||
|
||||
<char> := A-Z | a-z | \uXXXX
|
||||
|
||||
Shift command:
|
||||
<shift> := & - reset at this letter.
|
||||
|
||||
Diff command:
|
||||
<d1> := < - Identifies a primary difference.
|
||||
<d2> := << - Identifies a secondary difference.
|
||||
<d3> := <<< - Idenfifies a tertiary difference.
|
||||
|
||||
|
||||
Collation rules:
|
||||
<ruleset> := <rule> { <ruleset> }
|
||||
|
||||
<rule> := <d1> <string>
|
||||
| <d2> <string>
|
||||
| <d3> <string>
|
||||
| <shift> <char>
|
||||
|
||||
<string> := <char> [ <string> ]
|
||||
|
||||
An example, Polish collation:
|
||||
|
||||
&A < \u0105 <<< \u0104
|
||||
&C < \u0107 <<< \u0106
|
||||
&E < \u0119 <<< \u0118
|
||||
&L < \u0142 <<< \u0141
|
||||
&N < \u0144 <<< \u0143
|
||||
&O < \u00F3 <<< \u00D3
|
||||
&S < \u015B <<< \u015A
|
||||
&Z < \u017A <<< \u017B
|
||||
*/
|
||||
|
||||
|
||||
typedef enum my_coll_lexem_num_en
|
||||
{
|
||||
MY_COLL_LEXEM_EOF = 0,
|
||||
MY_COLL_LEXEM_DIFF = 1,
|
||||
MY_COLL_LEXEM_SHIFT = 4,
|
||||
MY_COLL_LEXEM_CHAR = 5,
|
||||
MY_COLL_LEXEM_ERROR = 6
|
||||
} my_coll_lexem_num;
|
||||
|
||||
|
||||
typedef struct my_coll_lexem_st
|
||||
{
|
||||
const char *beg;
|
||||
const char *end;
|
||||
const char *prev;
|
||||
int diff;
|
||||
int code;
|
||||
} MY_COLL_LEXEM;
|
||||
|
||||
|
||||
/*
|
||||
Initialize collation rule lexical anilizer
|
||||
|
||||
SYNOPSIS
|
||||
my_coll_lexem_init
|
||||
lexem Lex analizer to init
|
||||
str Const string to parse
|
||||
strend End of the string
|
||||
USAGE
|
||||
|
||||
RETURN VALUES
|
||||
N/A
|
||||
*/
|
||||
|
||||
static void my_coll_lexem_init(MY_COLL_LEXEM *lexem,
|
||||
const char *str, const char *strend)
|
||||
{
|
||||
lexem->beg= str;
|
||||
lexem->prev= str;
|
||||
lexem->end= strend;
|
||||
lexem->diff= 0;
|
||||
lexem->code= 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Print collation customization expression parse error, with context.
|
||||
|
||||
SYNOPSIS
|
||||
my_coll_lexem_print_error
|
||||
lexem Lex analizer to take context from
|
||||
errstr sting to write error to
|
||||
errsize errstr size
|
||||
txt error message
|
||||
USAGE
|
||||
|
||||
RETURN VALUES
|
||||
N/A
|
||||
*/
|
||||
|
||||
static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem,
|
||||
char *errstr, size_t errsize,
|
||||
const char *txt)
|
||||
{
|
||||
char tail[30];
|
||||
size_t len= lexem->end - lexem->prev;
|
||||
strmake (tail, lexem->prev, min(len, sizeof(tail)-1));
|
||||
errstr[errsize-1]= '\0';
|
||||
my_snprintf(errstr,errsize-1,"%s at '%s'", txt, tail);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Convert a hex digit into its numeric value
|
||||
|
||||
SYNOPSIS
|
||||
ch2x
|
||||
ch hex digit to convert
|
||||
USAGE
|
||||
|
||||
RETURN VALUES
|
||||
an integer value in the range 0..15
|
||||
-1 on error
|
||||
*/
|
||||
|
||||
static int ch2x(int ch)
|
||||
{
|
||||
if (ch >= '0' && ch <= '9')
|
||||
return ch - '0';
|
||||
|
||||
if (ch >= 'a' && ch <= 'f')
|
||||
return 10 + ch - 'a';
|
||||
|
||||
if (ch >= 'A' && ch <= 'F')
|
||||
return 10 + ch - 'A';
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Collation language lexical parser:
|
||||
Scans the next lexem.
|
||||
|
||||
SYNOPSIS
|
||||
my_coll_lexem_next
|
||||
lexem Lex analizer, previously initialized by
|
||||
my_coll_lexem_init.
|
||||
USAGE
|
||||
Call this function in a loop
|
||||
|
||||
RETURN VALUES
|
||||
Lexem number: eof, diff, shift, char or error.
|
||||
*/
|
||||
|
||||
static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem)
|
||||
{
|
||||
for ( ;lexem->beg < lexem->end ; lexem->beg++)
|
||||
{
|
||||
lexem->prev= lexem->beg;
|
||||
if (lexem->beg[0] == ' ' || lexem->beg[0] == '\t' ||
|
||||
lexem->beg[0] == '\r' || lexem->beg[0] == '\n')
|
||||
continue;
|
||||
|
||||
if (lexem->beg[0] == '&')
|
||||
{
|
||||
lexem->beg++;
|
||||
return MY_COLL_LEXEM_SHIFT;
|
||||
}
|
||||
|
||||
if (lexem->beg[0] == '<')
|
||||
{
|
||||
for (lexem->beg++, lexem->diff=1;
|
||||
(lexem->beg < lexem->end) &&
|
||||
(lexem->beg[0] == '<') && (lexem->diff<3);
|
||||
lexem->beg++, lexem->diff++);
|
||||
return MY_COLL_LEXEM_DIFF;
|
||||
}
|
||||
|
||||
if ((lexem->beg[0] >= 'a' && lexem->beg[0] <= 'z') ||
|
||||
(lexem->beg[0] >= 'A' && lexem->beg[0] <= 'Z'))
|
||||
{
|
||||
lexem->code= lexem->beg[0];
|
||||
lexem->beg++;
|
||||
return MY_COLL_LEXEM_CHAR;
|
||||
}
|
||||
|
||||
if ((lexem->beg[0] == '\\') &&
|
||||
(lexem->beg+2 < lexem->end) &&
|
||||
(lexem->beg[1] == 'u'))
|
||||
{
|
||||
int ch;
|
||||
|
||||
lexem->code= 0;
|
||||
for (lexem->beg+=2;
|
||||
(lexem->beg < lexem->end) && ((ch= ch2x(lexem->beg[0])) >= 0) ;
|
||||
lexem->beg++)
|
||||
{
|
||||
lexem->code= (lexem->code << 4) + ch;
|
||||
}
|
||||
return MY_COLL_LEXEM_CHAR;
|
||||
}
|
||||
|
||||
return MY_COLL_LEXEM_ERROR;
|
||||
}
|
||||
return MY_COLL_LEXEM_EOF;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Collation rule item
|
||||
*/
|
||||
|
||||
typedef struct my_coll_rule_item_st
|
||||
{
|
||||
uint base; /* Base character */
|
||||
uint curr; /* Current character */
|
||||
int diff[3]; /* Primary, Secondary and Tertiary difference */
|
||||
} MY_COLL_RULE;
|
||||
|
||||
|
||||
/*
|
||||
Collation language syntax parser.
|
||||
Uses lexical parser.
|
||||
|
||||
SYNOPSIS
|
||||
my_coll_rule_parse
|
||||
rule Collation rule list to load to.
|
||||
str A string containin collation language expression.
|
||||
strend End of the string.
|
||||
USAGE
|
||||
|
||||
RETURN VALUES
|
||||
0 - OK
|
||||
1 - ERROR, e.g. too many items.
|
||||
*/
|
||||
|
||||
static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
|
||||
const char *str, const char *strend,
|
||||
char *errstr, size_t errsize)
|
||||
{
|
||||
MY_COLL_LEXEM lexem;
|
||||
my_coll_lexem_num lexnum;
|
||||
my_coll_lexem_num prevlexnum= MY_COLL_LEXEM_ERROR;
|
||||
MY_COLL_RULE item;
|
||||
int state= 0;
|
||||
size_t nitems= 0;
|
||||
|
||||
/* Init all variables */
|
||||
errstr[0]= '\0';
|
||||
bzero(&item, sizeof(item));
|
||||
my_coll_lexem_init(&lexem, str, strend);
|
||||
|
||||
while ((lexnum= my_coll_lexem_next(&lexem)))
|
||||
{
|
||||
if (lexnum == MY_COLL_LEXEM_ERROR)
|
||||
{
|
||||
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Unknown character");
|
||||
return -1;
|
||||
}
|
||||
|
||||
switch (state) {
|
||||
case 0:
|
||||
if (lexnum != MY_COLL_LEXEM_SHIFT)
|
||||
{
|
||||
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& expected");
|
||||
return -1;
|
||||
}
|
||||
prevlexnum= lexnum;
|
||||
state= 2;
|
||||
continue;
|
||||
|
||||
case 1:
|
||||
if (lexnum != MY_COLL_LEXEM_SHIFT && lexnum != MY_COLL_LEXEM_DIFF)
|
||||
{
|
||||
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& or < expected");
|
||||
return -1;
|
||||
}
|
||||
prevlexnum= lexnum;
|
||||
state= 2;
|
||||
continue;
|
||||
|
||||
case 2:
|
||||
if (lexnum != MY_COLL_LEXEM_CHAR)
|
||||
{
|
||||
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"character expected");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (prevlexnum == MY_COLL_LEXEM_SHIFT)
|
||||
{
|
||||
item.base= lexem.code;
|
||||
item.diff[0]= 0;
|
||||
item.diff[1]= 0;
|
||||
item.diff[2]= 0;
|
||||
}
|
||||
else if (prevlexnum == MY_COLL_LEXEM_DIFF)
|
||||
{
|
||||
item.curr= lexem.code;
|
||||
if (lexem.diff == 3)
|
||||
{
|
||||
item.diff[2]++;
|
||||
}
|
||||
else if (lexem.diff == 2)
|
||||
{
|
||||
item.diff[1]++;
|
||||
item.diff[2]= 0;
|
||||
}
|
||||
else if (lexem.diff == 1)
|
||||
{
|
||||
item.diff[0]++;
|
||||
item.diff[1]= 0;
|
||||
item.diff[2]= 0;
|
||||
}
|
||||
if (nitems >= mitems)
|
||||
{
|
||||
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Too many rules");
|
||||
return -1;
|
||||
}
|
||||
rule[nitems++]= item;
|
||||
}
|
||||
else
|
||||
{
|
||||
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Should never happen");
|
||||
return -1;
|
||||
}
|
||||
state= 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return (size_t) nitems;
|
||||
}
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int nchars;
|
||||
MY_UNI_IDX uidx;
|
||||
} uni_idx;
|
||||
|
||||
#define PLANE_SIZE 0x100
|
||||
#define PLANE_NUM 0x100
|
||||
#define PLANE_NUMBER(x) (((x)>>8) % PLANE_NUM)
|
||||
|
||||
|
||||
/*
|
||||
The code below implements this functionality:
|
||||
|
||||
@ -484,91 +136,6 @@ static void simple_cs_init_functions(CHARSET_INFO *cs)
|
||||
}
|
||||
|
||||
|
||||
static int pcmp(const void * f, const void * s)
|
||||
{
|
||||
const uni_idx *F= (const uni_idx*) f;
|
||||
const uni_idx *S= (const uni_idx*) s;
|
||||
int res;
|
||||
|
||||
if (!(res=((S->nchars)-(F->nchars))))
|
||||
res=((F->uidx.from)-(S->uidx.to));
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
static my_bool create_fromuni(CHARSET_INFO *cs)
|
||||
{
|
||||
uni_idx idx[PLANE_NUM];
|
||||
int i,n;
|
||||
|
||||
/* Clear plane statistics */
|
||||
bzero(idx,sizeof(idx));
|
||||
|
||||
/* Count number of characters in each plane */
|
||||
for (i=0; i< 0x100; i++)
|
||||
{
|
||||
uint16 wc=cs->tab_to_uni[i];
|
||||
int pl= PLANE_NUMBER(wc);
|
||||
|
||||
if (wc || !i)
|
||||
{
|
||||
if (!idx[pl].nchars)
|
||||
{
|
||||
idx[pl].uidx.from=wc;
|
||||
idx[pl].uidx.to=wc;
|
||||
}else
|
||||
{
|
||||
idx[pl].uidx.from=wc<idx[pl].uidx.from?wc:idx[pl].uidx.from;
|
||||
idx[pl].uidx.to=wc>idx[pl].uidx.to?wc:idx[pl].uidx.to;
|
||||
}
|
||||
idx[pl].nchars++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Sort planes in descending order */
|
||||
qsort(&idx,PLANE_NUM,sizeof(uni_idx),&pcmp);
|
||||
|
||||
for (i=0; i < PLANE_NUM; i++)
|
||||
{
|
||||
int ch,numchars;
|
||||
|
||||
/* Skip empty plane */
|
||||
if (!idx[i].nchars)
|
||||
break;
|
||||
|
||||
numchars=idx[i].uidx.to-idx[i].uidx.from+1;
|
||||
if (!(idx[i].uidx.tab=(uchar*) my_once_alloc(numchars *
|
||||
sizeof(*idx[i].uidx.tab),
|
||||
MYF(MY_WME))))
|
||||
return TRUE;
|
||||
|
||||
bzero(idx[i].uidx.tab,numchars*sizeof(*idx[i].uidx.tab));
|
||||
|
||||
for (ch=1; ch < PLANE_SIZE; ch++)
|
||||
{
|
||||
uint16 wc=cs->tab_to_uni[ch];
|
||||
if (wc >= idx[i].uidx.from && wc <= idx[i].uidx.to && wc)
|
||||
{
|
||||
int ofs= wc - idx[i].uidx.from;
|
||||
idx[i].uidx.tab[ofs]= ch;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Allocate and fill reverse table for each plane */
|
||||
n=i;
|
||||
if (!(cs->tab_from_uni= (MY_UNI_IDX*) my_once_alloc(sizeof(MY_UNI_IDX)*(n+1),
|
||||
MYF(MY_WME))))
|
||||
return TRUE;
|
||||
|
||||
for (i=0; i< n; i++)
|
||||
cs->tab_from_uni[i]= idx[i].uidx;
|
||||
|
||||
/* Set end-of-list marker */
|
||||
bzero(&cs->tab_from_uni[i],sizeof(MY_UNI_IDX));
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
static int simple_cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
|
||||
{
|
||||
@ -622,8 +189,6 @@ static int simple_cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
|
||||
if (!(to->tab_to_uni= (uint16*) my_once_memdup((char*)from->tab_to_uni,
|
||||
sz, MYF(MY_WME))))
|
||||
goto err;
|
||||
if (create_fromuni(to))
|
||||
goto err;
|
||||
}
|
||||
to->mbminlen= 1;
|
||||
to->mbmaxlen= 1;
|
||||
@ -754,117 +319,6 @@ static my_tailoring tailoring[]=
|
||||
}
|
||||
};
|
||||
|
||||
#define MY_MAX_COLL_RULE 64
|
||||
|
||||
/*
|
||||
This function copies an UCS2 collation from
|
||||
the default Unicode Collation Algorithm (UCA)
|
||||
weights applying tailorings, i.e. a set of
|
||||
alternative weights for some characters.
|
||||
|
||||
The default UCA weights are stored in my_charset_ucs2_general_uca.
|
||||
They consist of 256 pages, 256 character each.
|
||||
|
||||
If a page is not overwritten by tailoring rules,
|
||||
it is copies as is from UCA as is.
|
||||
|
||||
If a page contains some overwritten characters, it is
|
||||
allocated. Untouched characters are copied from the
|
||||
default weights.
|
||||
*/
|
||||
|
||||
static my_bool create_tailoring(CHARSET_INFO *cs)
|
||||
{
|
||||
MY_COLL_RULE rule[MY_MAX_COLL_RULE];
|
||||
char errstr[128];
|
||||
uchar *newlengths;
|
||||
uint16 **newweights;
|
||||
const uchar *deflengths= my_charset_ucs2_general_uca.sort_order;
|
||||
uint16 **defweights= my_charset_ucs2_general_uca.sort_order_big;
|
||||
int rc, i;
|
||||
|
||||
if (!cs->tailoring)
|
||||
return 1;
|
||||
|
||||
/* Parse ICU Collation Customization expression */
|
||||
if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE,
|
||||
cs->tailoring,
|
||||
cs->tailoring + strlen(cs->tailoring),
|
||||
errstr, sizeof(errstr))) <= 0)
|
||||
{
|
||||
/*
|
||||
TODO: add error message reporting.
|
||||
printf("Error: %d '%s'\n", rc, errstr);
|
||||
*/
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!(newweights= (uint16**) my_once_alloc(256*sizeof(uint16*),MYF(MY_WME))))
|
||||
return 1;
|
||||
bzero(newweights, 256*sizeof(uint16*));
|
||||
|
||||
if (!(newlengths= (uchar*) my_once_memdup(deflengths,256,MYF(MY_WME))))
|
||||
return 1;
|
||||
|
||||
/*
|
||||
Calculate maximum lenghts for the pages
|
||||
which will be overwritten.
|
||||
*/
|
||||
for (i=0; i < rc; i++)
|
||||
{
|
||||
uint pageb= (rule[i].base >> 8) & 0xFF;
|
||||
uint pagec= (rule[i].curr >> 8) & 0xFF;
|
||||
|
||||
if (newlengths[pagec] < deflengths[pageb])
|
||||
newlengths[pagec]= deflengths[pageb];
|
||||
}
|
||||
|
||||
for (i=0; i < rc; i++)
|
||||
{
|
||||
uint pageb= (rule[i].base >> 8) & 0xFF;
|
||||
uint pagec= (rule[i].curr >> 8) & 0xFF;
|
||||
uint chb, chc;
|
||||
|
||||
if (!newweights[pagec])
|
||||
{
|
||||
/* Alloc new page and copy the default UCA weights */
|
||||
uint size= 256*newlengths[pagec]*sizeof(uint16);
|
||||
|
||||
if (!(newweights[pagec]= (uint16*) my_once_alloc(size,MYF(MY_WME))))
|
||||
return 1;
|
||||
bzero((void*) newweights[pagec], size);
|
||||
|
||||
for (chc=0 ; chc < 256; chc++)
|
||||
{
|
||||
memcpy(newweights[pagec] + chc*newlengths[pagec],
|
||||
defweights[pagec] + chc*deflengths[pagec],
|
||||
deflengths[pagec]*sizeof(uint16));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Aply the alternative rule:
|
||||
shift to the base character and primary difference.
|
||||
*/
|
||||
chc= rule[i].curr & 0xFF;
|
||||
chb= rule[i].base & 0xFF;
|
||||
memcpy(newweights[pagec] + chc*newlengths[pagec],
|
||||
defweights[pageb] + chb*deflengths[pageb],
|
||||
deflengths[pageb]*sizeof(uint16));
|
||||
/* Apply primary difference */
|
||||
newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0];
|
||||
}
|
||||
|
||||
/* Copy non-overwritten pages from the default UCA weights */
|
||||
for (i= 0; i < 256 ; i++)
|
||||
if (!newweights[i])
|
||||
newweights[i]= defweights[i];
|
||||
|
||||
cs->sort_order= newlengths;
|
||||
cs->sort_order_big= newweights;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
|
||||
@ -894,7 +348,7 @@ static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
|
||||
to->mbminlen= 2;
|
||||
to->mbmaxlen= 2;
|
||||
|
||||
return create_tailoring(to);
|
||||
return 0;
|
||||
|
||||
err:
|
||||
return 1;
|
||||
@ -997,7 +451,7 @@ static my_bool init_uca_charsets()
|
||||
CHARSET_INFO cs= my_charset_ucs2_general_uca;
|
||||
char name[64];
|
||||
|
||||
cs.state= MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONTEXT;
|
||||
cs.state= MY_CS_STRNXFRM|MY_CS_UNICODE;
|
||||
for (t= tailoring; t->tailoring; t++)
|
||||
{
|
||||
cs.number= 128 + t->number;
|
||||
@ -1083,6 +537,10 @@ void add_compiled_collation(CHARSET_INFO *cs)
|
||||
cs->state|= MY_CS_AVAILABLE;
|
||||
}
|
||||
|
||||
static void *cs_alloc(uint size)
|
||||
{
|
||||
return my_once_alloc(size, MYF(MY_WME));
|
||||
}
|
||||
|
||||
|
||||
#ifdef __NETWARE__
|
||||
@ -1207,6 +665,14 @@ static CHARSET_INFO *get_internal_charset(uint cs_number, myf flags)
|
||||
cs= (cs->state & MY_CS_AVAILABLE) ? cs : NULL;
|
||||
}
|
||||
pthread_mutex_unlock(&THR_LOCK_charset);
|
||||
if (cs && !(cs->state & MY_CS_READY))
|
||||
{
|
||||
if ((cs->cset->init && cs->cset->init(cs, cs_alloc)) ||
|
||||
(cs->coll->init && cs->coll->init(cs, cs_alloc)))
|
||||
cs= NULL;
|
||||
else
|
||||
cs->state|= MY_CS_READY;
|
||||
}
|
||||
return cs;
|
||||
}
|
||||
|
||||
|
@ -373,7 +373,7 @@ Item *create_func_space(Item *a)
|
||||
CHARSET_INFO *cs= current_thd->variables.collation_connection;
|
||||
Item *sp;
|
||||
|
||||
if (cs->state & MY_CS_NONTEXT)
|
||||
if (cs->mbminlen > 1)
|
||||
{
|
||||
sp= new Item_string("",0,cs);
|
||||
if (sp)
|
||||
|
@ -329,7 +329,7 @@ char log_error_file[FN_REFLEN], glob_hostname[FN_REFLEN];
|
||||
char* log_error_file_ptr= log_error_file;
|
||||
char mysql_real_data_home[FN_REFLEN],
|
||||
language[LIBLEN],reg_ext[FN_EXTLEN], mysql_charsets_dir[FN_REFLEN],
|
||||
max_sort_char,*mysqld_user,*mysqld_chroot, *opt_init_file,
|
||||
*mysqld_user,*mysqld_chroot, *opt_init_file,
|
||||
*opt_init_connect, *opt_init_slave,
|
||||
def_ft_boolean_syntax[sizeof(ft_boolean_syntax)];
|
||||
|
||||
@ -5249,7 +5249,6 @@ static void mysql_init_variables(void)
|
||||
specialflag= opened_tables= created_tmp_tables= created_tmp_disk_tables= 0;
|
||||
binlog_cache_use= binlog_cache_disk_use= 0;
|
||||
max_used_connections= slow_launch_threads = 0;
|
||||
max_sort_char= 0;
|
||||
mysqld_user= mysqld_chroot= opt_init_file= opt_bin_logname = 0;
|
||||
errmesg= 0;
|
||||
mysqld_unix_port= opt_mysql_tmpdir= my_bind_addr_str= NullS;
|
||||
|
@ -370,7 +370,7 @@ bool String::copy(const char *str, uint32 arg_length,
|
||||
|
||||
bool String::set_ascii(const char *str, uint32 arg_length)
|
||||
{
|
||||
if (!(str_charset->state & MY_CS_NONTEXT))
|
||||
if (!(str_charset->mbminlen > 1))
|
||||
{
|
||||
set(str, arg_length, str_charset);
|
||||
return 0;
|
||||
|
@ -858,7 +858,7 @@ int mysql_prepare_table(THD *thd, HA_CREATE_INFO *create_info,
|
||||
sql_field->sql_type != FIELD_TYPE_VAR_STRING &&
|
||||
!f_is_blob(sql_field->pack_flag)) ||
|
||||
sql_field->charset == &my_charset_bin ||
|
||||
sql_field->charset->state & MY_CS_NONTEXT || // ucs2 doesn't work yet
|
||||
sql_field->charset->mbminlen > 1 || // ucs2 doesn't work yet
|
||||
(ft_key_charset && sql_field->charset != ft_key_charset))
|
||||
{
|
||||
my_printf_error(ER_BAD_FT_COLUMN,ER(ER_BAD_FT_COLUMN),MYF(0),
|
||||
|
@ -6269,6 +6269,7 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)),
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_big5,
|
||||
my_strnncollsp_big5,
|
||||
my_strnxfrm_big5,
|
||||
@ -6281,6 +6282,7 @@ static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler =
|
||||
|
||||
static MY_CHARSET_HANDLER my_charset_big5_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
ismbchar_big5,
|
||||
mbcharlen_big5,
|
||||
my_numchars_mb,
|
||||
|
@ -333,6 +333,7 @@ skip:
|
||||
|
||||
MY_COLLATION_HANDLER my_collation_8bit_bin_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_binary,
|
||||
my_strnncoll_binary,
|
||||
my_strnxfrm_bin,
|
||||
@ -346,6 +347,7 @@ MY_COLLATION_HANDLER my_collation_8bit_bin_handler =
|
||||
|
||||
static MY_CHARSET_HANDLER my_charset_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
NULL, /* ismbchar */
|
||||
my_mbcharlen_8bit, /* mbcharlen */
|
||||
my_numchars_8bit,
|
||||
|
@ -572,6 +572,7 @@ static MY_UNI_IDX idx_uni_8859_2[]={
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_latin2_czech_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_czech,
|
||||
my_strnncollsp_czech,
|
||||
my_strnxfrm_czech,
|
||||
|
@ -8637,6 +8637,7 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)),
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_simple, /* strnncoll */
|
||||
my_strnncollsp_simple,
|
||||
my_strnxfrm_simple, /* strnxfrm */
|
||||
@ -8649,6 +8650,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
|
||||
static MY_CHARSET_HANDLER my_charset_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
ismbchar_euc_kr,
|
||||
mbcharlen_euc_kr,
|
||||
my_numchars_mb,
|
||||
|
@ -5688,6 +5688,7 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)),
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_simple, /* strnncoll */
|
||||
my_strnncollsp_simple,
|
||||
my_strnxfrm_simple, /* strnxfrm */
|
||||
@ -5700,6 +5701,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
|
||||
static MY_CHARSET_HANDLER my_charset_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
ismbchar_gb2312,
|
||||
mbcharlen_gb2312,
|
||||
my_numchars_mb,
|
||||
|
@ -9918,6 +9918,7 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)),
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_gbk,
|
||||
my_strnncollsp_gbk,
|
||||
my_strnxfrm_gbk,
|
||||
@ -9930,6 +9931,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
|
||||
static MY_CHARSET_HANDLER my_charset_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
ismbchar_gbk,
|
||||
mbcharlen_gbk,
|
||||
my_numchars_mb,
|
||||
|
@ -380,6 +380,7 @@ int my_wc_mb_latin1(CHARSET_INFO *cs __attribute__((unused)),
|
||||
|
||||
static MY_CHARSET_HANDLER my_charset_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
NULL,
|
||||
my_mbcharlen_8bit,
|
||||
my_numchars_8bit,
|
||||
@ -674,6 +675,7 @@ void my_hash_sort_latin1_de(CHARSET_INFO *cs __attribute__((unused)),
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_german2_ci_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_latin1_de,
|
||||
my_strnncollsp_latin1_de,
|
||||
my_strnxfrm_latin1_de,
|
||||
|
@ -512,6 +512,7 @@ static int my_wildcmp_mb_bin(CHARSET_INFO *cs,
|
||||
|
||||
MY_COLLATION_HANDLER my_collation_mb_bin_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_mb_bin,
|
||||
my_strnncoll_mb_bin,
|
||||
my_strnxfrm_mb_bin,
|
||||
|
@ -1142,8 +1142,107 @@ skip:
|
||||
}
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int nchars;
|
||||
MY_UNI_IDX uidx;
|
||||
} uni_idx;
|
||||
|
||||
#define PLANE_SIZE 0x100
|
||||
#define PLANE_NUM 0x100
|
||||
#define PLANE_NUMBER(x) (((x)>>8) % PLANE_NUM)
|
||||
|
||||
static int pcmp(const void * f, const void * s)
|
||||
{
|
||||
const uni_idx *F= (const uni_idx*) f;
|
||||
const uni_idx *S= (const uni_idx*) s;
|
||||
int res;
|
||||
|
||||
if (!(res=((S->nchars)-(F->nchars))))
|
||||
res=((F->uidx.from)-(S->uidx.to));
|
||||
return res;
|
||||
}
|
||||
|
||||
static my_bool create_fromuni(CHARSET_INFO *cs, void *(*alloc)(uint))
|
||||
{
|
||||
uni_idx idx[PLANE_NUM];
|
||||
int i,n;
|
||||
|
||||
/* Clear plane statistics */
|
||||
bzero(idx,sizeof(idx));
|
||||
|
||||
/* Count number of characters in each plane */
|
||||
for (i=0; i< 0x100; i++)
|
||||
{
|
||||
uint16 wc=cs->tab_to_uni[i];
|
||||
int pl= PLANE_NUMBER(wc);
|
||||
|
||||
if (wc || !i)
|
||||
{
|
||||
if (!idx[pl].nchars)
|
||||
{
|
||||
idx[pl].uidx.from=wc;
|
||||
idx[pl].uidx.to=wc;
|
||||
}else
|
||||
{
|
||||
idx[pl].uidx.from=wc<idx[pl].uidx.from?wc:idx[pl].uidx.from;
|
||||
idx[pl].uidx.to=wc>idx[pl].uidx.to?wc:idx[pl].uidx.to;
|
||||
}
|
||||
idx[pl].nchars++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Sort planes in descending order */
|
||||
qsort(&idx,PLANE_NUM,sizeof(uni_idx),&pcmp);
|
||||
|
||||
for (i=0; i < PLANE_NUM; i++)
|
||||
{
|
||||
int ch,numchars;
|
||||
|
||||
/* Skip empty plane */
|
||||
if (!idx[i].nchars)
|
||||
break;
|
||||
|
||||
numchars=idx[i].uidx.to-idx[i].uidx.from+1;
|
||||
if (!(idx[i].uidx.tab=(uchar*) alloc(numchars * sizeof(*idx[i].uidx.tab))))
|
||||
return TRUE;
|
||||
|
||||
bzero(idx[i].uidx.tab,numchars*sizeof(*idx[i].uidx.tab));
|
||||
|
||||
for (ch=1; ch < PLANE_SIZE; ch++)
|
||||
{
|
||||
uint16 wc=cs->tab_to_uni[ch];
|
||||
if (wc >= idx[i].uidx.from && wc <= idx[i].uidx.to && wc)
|
||||
{
|
||||
int ofs= wc - idx[i].uidx.from;
|
||||
idx[i].uidx.tab[ofs]= ch;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Allocate and fill reverse table for each plane */
|
||||
n=i;
|
||||
if (!(cs->tab_from_uni= (MY_UNI_IDX*) alloc(sizeof(MY_UNI_IDX)*(n+1))))
|
||||
return TRUE;
|
||||
|
||||
for (i=0; i< n; i++)
|
||||
cs->tab_from_uni[i]= idx[i].uidx;
|
||||
|
||||
/* Set end-of-list marker */
|
||||
bzero(&cs->tab_from_uni[i],sizeof(MY_UNI_IDX));
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
static my_bool my_cset_init_8bit(CHARSET_INFO *cs, void *(*alloc)(uint))
|
||||
{
|
||||
return create_fromuni(cs, alloc);
|
||||
}
|
||||
|
||||
|
||||
|
||||
MY_CHARSET_HANDLER my_charset_8bit_handler=
|
||||
{
|
||||
my_cset_init_8bit,
|
||||
NULL, /* ismbchar */
|
||||
my_mbcharlen_8bit, /* mbcharlen */
|
||||
my_numchars_8bit,
|
||||
@ -1170,6 +1269,7 @@ MY_CHARSET_HANDLER my_charset_8bit_handler=
|
||||
|
||||
MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_simple,
|
||||
my_strnncollsp_simple,
|
||||
my_strnxfrm_simple,
|
||||
|
@ -4534,6 +4534,7 @@ my_mb_wc_sjis(CHARSET_INFO *cs __attribute__((unused)),
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_sjis,
|
||||
my_strnncollsp_sjis,
|
||||
my_strnxfrm_sjis,
|
||||
@ -4547,6 +4548,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
|
||||
static MY_CHARSET_HANDLER my_charset_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
ismbchar_sjis,
|
||||
mbcharlen_sjis,
|
||||
my_numchars_mb,
|
||||
|
@ -906,6 +906,7 @@ int my_wc_mb_tis620(CHARSET_INFO *cs __attribute__((unused)),
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_tis620,
|
||||
my_strnncollsp_tis620,
|
||||
my_strnxfrm_tis620,
|
||||
@ -918,6 +919,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
|
||||
static MY_CHARSET_HANDLER my_charset_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
NULL, /* ismbchar */
|
||||
my_mbcharlen_8bit, /* mbcharlen */
|
||||
my_numchars_8bit,
|
||||
|
@ -7036,8 +7036,464 @@ int my_wildcmp_uca(CHARSET_INFO *cs,
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Collation language is implemented according to
|
||||
subset of ICU Collation Customization (tailorings):
|
||||
http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
|
||||
|
||||
Collation language elements:
|
||||
Delimiters:
|
||||
space - skipped
|
||||
|
||||
<char> := A-Z | a-z | \uXXXX
|
||||
|
||||
Shift command:
|
||||
<shift> := & - reset at this letter.
|
||||
|
||||
Diff command:
|
||||
<d1> := < - Identifies a primary difference.
|
||||
<d2> := << - Identifies a secondary difference.
|
||||
<d3> := <<< - Idenfifies a tertiary difference.
|
||||
|
||||
|
||||
Collation rules:
|
||||
<ruleset> := <rule> { <ruleset> }
|
||||
|
||||
<rule> := <d1> <string>
|
||||
| <d2> <string>
|
||||
| <d3> <string>
|
||||
| <shift> <char>
|
||||
|
||||
<string> := <char> [ <string> ]
|
||||
|
||||
An example, Polish collation:
|
||||
|
||||
&A < \u0105 <<< \u0104
|
||||
&C < \u0107 <<< \u0106
|
||||
&E < \u0119 <<< \u0118
|
||||
&L < \u0142 <<< \u0141
|
||||
&N < \u0144 <<< \u0143
|
||||
&O < \u00F3 <<< \u00D3
|
||||
&S < \u015B <<< \u015A
|
||||
&Z < \u017A <<< \u017B
|
||||
*/
|
||||
|
||||
|
||||
typedef enum my_coll_lexem_num_en
|
||||
{
|
||||
MY_COLL_LEXEM_EOF = 0,
|
||||
MY_COLL_LEXEM_DIFF = 1,
|
||||
MY_COLL_LEXEM_SHIFT = 4,
|
||||
MY_COLL_LEXEM_CHAR = 5,
|
||||
MY_COLL_LEXEM_ERROR = 6
|
||||
} my_coll_lexem_num;
|
||||
|
||||
|
||||
typedef struct my_coll_lexem_st
|
||||
{
|
||||
const char *beg;
|
||||
const char *end;
|
||||
const char *prev;
|
||||
int diff;
|
||||
int code;
|
||||
} MY_COLL_LEXEM;
|
||||
|
||||
|
||||
/*
|
||||
Initialize collation rule lexical anilizer
|
||||
|
||||
SYNOPSIS
|
||||
my_coll_lexem_init
|
||||
lexem Lex analizer to init
|
||||
str Const string to parse
|
||||
strend End of the string
|
||||
USAGE
|
||||
|
||||
RETURN VALUES
|
||||
N/A
|
||||
*/
|
||||
|
||||
static void my_coll_lexem_init(MY_COLL_LEXEM *lexem,
|
||||
const char *str, const char *strend)
|
||||
{
|
||||
lexem->beg= str;
|
||||
lexem->prev= str;
|
||||
lexem->end= strend;
|
||||
lexem->diff= 0;
|
||||
lexem->code= 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Print collation customization expression parse error, with context.
|
||||
|
||||
SYNOPSIS
|
||||
my_coll_lexem_print_error
|
||||
lexem Lex analizer to take context from
|
||||
errstr sting to write error to
|
||||
errsize errstr size
|
||||
txt error message
|
||||
USAGE
|
||||
|
||||
RETURN VALUES
|
||||
N/A
|
||||
*/
|
||||
|
||||
static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem,
|
||||
char *errstr, size_t errsize,
|
||||
const char *txt)
|
||||
{
|
||||
char tail[30];
|
||||
size_t len= lexem->end - lexem->prev;
|
||||
strmake (tail, lexem->prev, min(len, sizeof(tail)-1));
|
||||
errstr[errsize-1]= '\0';
|
||||
my_snprintf(errstr,errsize-1,"%s at '%s'", txt, tail);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Convert a hex digit into its numeric value
|
||||
|
||||
SYNOPSIS
|
||||
ch2x
|
||||
ch hex digit to convert
|
||||
USAGE
|
||||
|
||||
RETURN VALUES
|
||||
an integer value in the range 0..15
|
||||
-1 on error
|
||||
*/
|
||||
|
||||
static int ch2x(int ch)
|
||||
{
|
||||
if (ch >= '0' && ch <= '9')
|
||||
return ch - '0';
|
||||
|
||||
if (ch >= 'a' && ch <= 'f')
|
||||
return 10 + ch - 'a';
|
||||
|
||||
if (ch >= 'A' && ch <= 'F')
|
||||
return 10 + ch - 'A';
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Collation language lexical parser:
|
||||
Scans the next lexem.
|
||||
|
||||
SYNOPSIS
|
||||
my_coll_lexem_next
|
||||
lexem Lex analizer, previously initialized by
|
||||
my_coll_lexem_init.
|
||||
USAGE
|
||||
Call this function in a loop
|
||||
|
||||
RETURN VALUES
|
||||
Lexem number: eof, diff, shift, char or error.
|
||||
*/
|
||||
|
||||
static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem)
|
||||
{
|
||||
for ( ;lexem->beg < lexem->end ; lexem->beg++)
|
||||
{
|
||||
lexem->prev= lexem->beg;
|
||||
if (lexem->beg[0] == ' ' || lexem->beg[0] == '\t' ||
|
||||
lexem->beg[0] == '\r' || lexem->beg[0] == '\n')
|
||||
continue;
|
||||
|
||||
if (lexem->beg[0] == '&')
|
||||
{
|
||||
lexem->beg++;
|
||||
return MY_COLL_LEXEM_SHIFT;
|
||||
}
|
||||
|
||||
if (lexem->beg[0] == '<')
|
||||
{
|
||||
for (lexem->beg++, lexem->diff=1;
|
||||
(lexem->beg < lexem->end) &&
|
||||
(lexem->beg[0] == '<') && (lexem->diff<3);
|
||||
lexem->beg++, lexem->diff++);
|
||||
return MY_COLL_LEXEM_DIFF;
|
||||
}
|
||||
|
||||
if ((lexem->beg[0] >= 'a' && lexem->beg[0] <= 'z') ||
|
||||
(lexem->beg[0] >= 'A' && lexem->beg[0] <= 'Z'))
|
||||
{
|
||||
lexem->code= lexem->beg[0];
|
||||
lexem->beg++;
|
||||
return MY_COLL_LEXEM_CHAR;
|
||||
}
|
||||
|
||||
if ((lexem->beg[0] == '\\') &&
|
||||
(lexem->beg+2 < lexem->end) &&
|
||||
(lexem->beg[1] == 'u'))
|
||||
{
|
||||
int ch;
|
||||
|
||||
lexem->code= 0;
|
||||
for (lexem->beg+=2;
|
||||
(lexem->beg < lexem->end) && ((ch= ch2x(lexem->beg[0])) >= 0) ;
|
||||
lexem->beg++)
|
||||
{
|
||||
lexem->code= (lexem->code << 4) + ch;
|
||||
}
|
||||
return MY_COLL_LEXEM_CHAR;
|
||||
}
|
||||
|
||||
return MY_COLL_LEXEM_ERROR;
|
||||
}
|
||||
return MY_COLL_LEXEM_EOF;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Collation rule item
|
||||
*/
|
||||
|
||||
typedef struct my_coll_rule_item_st
|
||||
{
|
||||
uint base; /* Base character */
|
||||
uint curr; /* Current character */
|
||||
int diff[3]; /* Primary, Secondary and Tertiary difference */
|
||||
} MY_COLL_RULE;
|
||||
|
||||
|
||||
/*
|
||||
Collation language syntax parser.
|
||||
Uses lexical parser.
|
||||
|
||||
SYNOPSIS
|
||||
my_coll_rule_parse
|
||||
rule Collation rule list to load to.
|
||||
str A string containin collation language expression.
|
||||
strend End of the string.
|
||||
USAGE
|
||||
|
||||
RETURN VALUES
|
||||
0 - OK
|
||||
1 - ERROR, e.g. too many items.
|
||||
*/
|
||||
|
||||
static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
|
||||
const char *str, const char *strend,
|
||||
char *errstr, size_t errsize)
|
||||
{
|
||||
MY_COLL_LEXEM lexem;
|
||||
my_coll_lexem_num lexnum;
|
||||
my_coll_lexem_num prevlexnum= MY_COLL_LEXEM_ERROR;
|
||||
MY_COLL_RULE item;
|
||||
int state= 0;
|
||||
size_t nitems= 0;
|
||||
|
||||
/* Init all variables */
|
||||
errstr[0]= '\0';
|
||||
bzero(&item, sizeof(item));
|
||||
my_coll_lexem_init(&lexem, str, strend);
|
||||
|
||||
while ((lexnum= my_coll_lexem_next(&lexem)))
|
||||
{
|
||||
if (lexnum == MY_COLL_LEXEM_ERROR)
|
||||
{
|
||||
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Unknown character");
|
||||
return -1;
|
||||
}
|
||||
|
||||
switch (state) {
|
||||
case 0:
|
||||
if (lexnum != MY_COLL_LEXEM_SHIFT)
|
||||
{
|
||||
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& expected");
|
||||
return -1;
|
||||
}
|
||||
prevlexnum= lexnum;
|
||||
state= 2;
|
||||
continue;
|
||||
|
||||
case 1:
|
||||
if (lexnum != MY_COLL_LEXEM_SHIFT && lexnum != MY_COLL_LEXEM_DIFF)
|
||||
{
|
||||
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& or < expected");
|
||||
return -1;
|
||||
}
|
||||
prevlexnum= lexnum;
|
||||
state= 2;
|
||||
continue;
|
||||
|
||||
case 2:
|
||||
if (lexnum != MY_COLL_LEXEM_CHAR)
|
||||
{
|
||||
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"character expected");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (prevlexnum == MY_COLL_LEXEM_SHIFT)
|
||||
{
|
||||
item.base= lexem.code;
|
||||
item.diff[0]= 0;
|
||||
item.diff[1]= 0;
|
||||
item.diff[2]= 0;
|
||||
}
|
||||
else if (prevlexnum == MY_COLL_LEXEM_DIFF)
|
||||
{
|
||||
item.curr= lexem.code;
|
||||
if (lexem.diff == 3)
|
||||
{
|
||||
item.diff[2]++;
|
||||
}
|
||||
else if (lexem.diff == 2)
|
||||
{
|
||||
item.diff[1]++;
|
||||
item.diff[2]= 0;
|
||||
}
|
||||
else if (lexem.diff == 1)
|
||||
{
|
||||
item.diff[0]++;
|
||||
item.diff[1]= 0;
|
||||
item.diff[2]= 0;
|
||||
}
|
||||
if (nitems >= mitems)
|
||||
{
|
||||
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Too many rules");
|
||||
return -1;
|
||||
}
|
||||
rule[nitems++]= item;
|
||||
}
|
||||
else
|
||||
{
|
||||
my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Should never happen");
|
||||
return -1;
|
||||
}
|
||||
state= 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return (size_t) nitems;
|
||||
}
|
||||
|
||||
#define MY_MAX_COLL_RULE 64
|
||||
|
||||
/*
|
||||
This function copies an UCS2 collation from
|
||||
the default Unicode Collation Algorithm (UCA)
|
||||
weights applying tailorings, i.e. a set of
|
||||
alternative weights for some characters.
|
||||
|
||||
The default UCA weights are stored in my_charset_ucs2_general_uca.
|
||||
They consist of 256 pages, 256 character each.
|
||||
|
||||
If a page is not overwritten by tailoring rules,
|
||||
it is copies as is from UCA as is.
|
||||
|
||||
If a page contains some overwritten characters, it is
|
||||
allocated. Untouched characters are copied from the
|
||||
default weights.
|
||||
*/
|
||||
|
||||
static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(uint))
|
||||
{
|
||||
MY_COLL_RULE rule[MY_MAX_COLL_RULE];
|
||||
char errstr[128];
|
||||
uchar *newlengths;
|
||||
uint16 **newweights;
|
||||
const uchar *deflengths= my_charset_ucs2_general_uca.sort_order;
|
||||
uint16 **defweights= my_charset_ucs2_general_uca.sort_order_big;
|
||||
int rc, i;
|
||||
|
||||
if (!cs->tailoring)
|
||||
return 1;
|
||||
|
||||
/* Parse ICU Collation Customization expression */
|
||||
if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE,
|
||||
cs->tailoring,
|
||||
cs->tailoring + strlen(cs->tailoring),
|
||||
errstr, sizeof(errstr))) <= 0)
|
||||
{
|
||||
/*
|
||||
TODO: add error message reporting.
|
||||
printf("Error: %d '%s'\n", rc, errstr);
|
||||
*/
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!(newweights= (uint16**) alloc(256*sizeof(uint16*))))
|
||||
return 1;
|
||||
bzero(newweights, 256*sizeof(uint16*));
|
||||
|
||||
if (!(newlengths= (uchar*) alloc(256)))
|
||||
return 1;
|
||||
|
||||
memcpy(newlengths, deflengths, 256);
|
||||
|
||||
/*
|
||||
Calculate maximum lenghts for the pages
|
||||
which will be overwritten.
|
||||
*/
|
||||
for (i=0; i < rc; i++)
|
||||
{
|
||||
uint pageb= (rule[i].base >> 8) & 0xFF;
|
||||
uint pagec= (rule[i].curr >> 8) & 0xFF;
|
||||
|
||||
if (newlengths[pagec] < deflengths[pageb])
|
||||
newlengths[pagec]= deflengths[pageb];
|
||||
}
|
||||
|
||||
for (i=0; i < rc; i++)
|
||||
{
|
||||
uint pageb= (rule[i].base >> 8) & 0xFF;
|
||||
uint pagec= (rule[i].curr >> 8) & 0xFF;
|
||||
uint chb, chc;
|
||||
|
||||
if (!newweights[pagec])
|
||||
{
|
||||
/* Alloc new page and copy the default UCA weights */
|
||||
uint size= 256*newlengths[pagec]*sizeof(uint16);
|
||||
|
||||
if (!(newweights[pagec]= (uint16*) alloc(size)))
|
||||
return 1;
|
||||
bzero((void*) newweights[pagec], size);
|
||||
|
||||
for (chc=0 ; chc < 256; chc++)
|
||||
{
|
||||
memcpy(newweights[pagec] + chc*newlengths[pagec],
|
||||
defweights[pagec] + chc*deflengths[pagec],
|
||||
deflengths[pagec]*sizeof(uint16));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Aply the alternative rule:
|
||||
shift to the base character and primary difference.
|
||||
*/
|
||||
chc= rule[i].curr & 0xFF;
|
||||
chb= rule[i].base & 0xFF;
|
||||
memcpy(newweights[pagec] + chc*newlengths[pagec],
|
||||
defweights[pageb] + chb*deflengths[pageb],
|
||||
deflengths[pageb]*sizeof(uint16));
|
||||
/* Apply primary difference */
|
||||
newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0];
|
||||
}
|
||||
|
||||
/* Copy non-overwritten pages from the default UCA weights */
|
||||
for (i= 0; i < 256 ; i++)
|
||||
if (!newweights[i])
|
||||
newweights[i]= defweights[i];
|
||||
|
||||
cs->sort_order= newlengths;
|
||||
cs->sort_order_big= newweights;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static my_bool my_coll_init_uca(CHARSET_INFO *cs, void *(*alloc)(uint))
|
||||
{
|
||||
return create_tailoring(cs, alloc);
|
||||
}
|
||||
|
||||
MY_COLLATION_HANDLER my_collation_ucs2_uca_handler =
|
||||
{
|
||||
my_coll_init_uca, /* init */
|
||||
my_strnncoll_uca,
|
||||
my_strnncollsp_uca,
|
||||
my_strnxfrm_uca,
|
||||
@ -7051,7 +7507,7 @@ MY_COLLATION_HANDLER my_collation_ucs2_uca_handler =
|
||||
CHARSET_INFO my_charset_ucs2_general_uca=
|
||||
{
|
||||
45,0,0, /* number */
|
||||
MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONTEXT,
|
||||
MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE,
|
||||
"ucs2", /* cs name */
|
||||
"ucs2_general_uca", /* name */
|
||||
"", /* comment */
|
||||
|
@ -1372,6 +1372,7 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_ucs2,
|
||||
my_strnncoll_ucs2,
|
||||
my_strnxfrm_ucs2,
|
||||
@ -1385,6 +1386,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_ucs2_bin,
|
||||
my_strnncoll_ucs2_bin,
|
||||
my_strnxfrm_ucs2_bin,
|
||||
@ -1398,6 +1400,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
|
||||
|
||||
MY_CHARSET_HANDLER my_charset_ucs2_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
my_ismbchar_ucs2, /* ismbchar */
|
||||
my_mbcharlen_ucs2, /* mbcharlen */
|
||||
my_numchars_ucs2,
|
||||
@ -1426,7 +1429,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
|
||||
CHARSET_INFO my_charset_ucs2_general_ci=
|
||||
{
|
||||
35,0,0, /* number */
|
||||
MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONTEXT,
|
||||
MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE,
|
||||
"ucs2", /* cs name */
|
||||
"ucs2_general_ci", /* name */
|
||||
"", /* comment */
|
||||
@ -1452,7 +1455,7 @@ CHARSET_INFO my_charset_ucs2_general_ci=
|
||||
CHARSET_INFO my_charset_ucs2_bin=
|
||||
{
|
||||
90,0,0, /* number */
|
||||
MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONTEXT,
|
||||
MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE,
|
||||
"ucs2", /* cs name */
|
||||
"ucs2_bin", /* name */
|
||||
"", /* comment */
|
||||
|
@ -8423,6 +8423,7 @@ my_wc_mb_euc_jp(CHARSET_INFO *c,my_wc_t wc, unsigned char *s, unsigned char *e)
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_simple,/* strnncoll */
|
||||
my_strnncollsp_simple,
|
||||
my_strnxfrm_simple, /* strnxfrm */
|
||||
@ -8435,14 +8436,15 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
|
||||
static MY_CHARSET_HANDLER my_charset_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
ismbchar_ujis,
|
||||
mbcharlen_ujis,
|
||||
my_numchars_mb,
|
||||
my_charpos_mb,
|
||||
my_well_formed_len_mb,
|
||||
my_lengthsp_8bit,
|
||||
my_mb_wc_euc_jp, /* mb_wc */
|
||||
my_wc_mb_euc_jp, /* wc_mb */
|
||||
my_mb_wc_euc_jp, /* mb_wc */
|
||||
my_wc_mb_euc_jp, /* wc_mb */
|
||||
my_caseup_str_mb,
|
||||
my_casedn_str_mb,
|
||||
my_caseup_mb,
|
||||
|
@ -2045,6 +2045,7 @@ static int my_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)) , uint c)
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_utf8,
|
||||
my_strnncollsp_utf8,
|
||||
my_strnxfrm_utf8,
|
||||
@ -2057,6 +2058,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
|
||||
|
||||
static MY_CHARSET_HANDLER my_charset_handler=
|
||||
{
|
||||
NULL, /* init */
|
||||
my_ismbchar_utf8,
|
||||
my_mbcharlen_utf8,
|
||||
my_numchars_mb,
|
||||
|
@ -605,6 +605,7 @@ my_like_range_win1250ch(CHARSET_INFO *cs __attribute__((unused)),
|
||||
|
||||
static MY_COLLATION_HANDLER my_collation_czech_ci_handler =
|
||||
{
|
||||
NULL, /* init */
|
||||
my_strnncoll_win1250ch,
|
||||
my_strnncollsp_win1250ch,
|
||||
my_strnxfrm_win1250ch,
|
||||
|
Loading…
x
Reference in New Issue
Block a user