44
45
cs < all_charsets+array_elements(all_charsets)-1 ;
47
if ( cs[0] && cs[0]->name &&
48
!my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->name, name))
48
if ( cs[0] && cs[0]->name &&
49
!my_strcasecmp(&my_charset_latin1, cs[0]->name, name))
49
50
return cs[0]->number;
55
static bool init_state_maps(CHARSET_INFO *cs)
56
static my_bool init_state_maps(CHARSET_INFO *cs)
58
unsigned char *state_map;
59
unsigned char *ident_map;
61
if (!(cs->state_map= (unsigned char*) malloc(256)))
62
if (!(cs->state_map= (uchar*) my_once_alloc(256, MYF(MY_WME))))
64
if (!(cs->ident_map= (unsigned char*) malloc(256)))
65
if (!(cs->ident_map= (uchar*) my_once_alloc(256, MYF(MY_WME))))
67
68
state_map= cs->state_map;
68
69
ident_map= cs->ident_map;
70
71
/* Fill state_map with states to get a faster parser */
71
72
for (i=0; i < 256 ; i++)
73
74
if (my_isalpha(cs,i))
74
state_map[i]=(unsigned char) MY_LEX_IDENT;
75
state_map[i]=(uchar) MY_LEX_IDENT;
75
76
else if (my_isdigit(cs,i))
76
state_map[i]=(unsigned char) MY_LEX_NUMBER_IDENT;
77
state_map[i]=(uchar) MY_LEX_NUMBER_IDENT;
77
78
#if defined(USE_MB) && defined(USE_MB_IDENT)
78
79
else if (my_mbcharlen(cs, i)>1)
79
state_map[i]=(unsigned char) MY_LEX_IDENT;
80
state_map[i]=(uchar) MY_LEX_IDENT;
81
82
else if (my_isspace(cs,i))
82
state_map[i]=(unsigned char) MY_LEX_SKIP;
83
state_map[i]=(uchar) MY_LEX_SKIP;
84
state_map[i]=(unsigned char) MY_LEX_CHAR;
85
state_map[i]=(uchar) MY_LEX_CHAR;
86
state_map[(unsigned char)'_']=state_map[(unsigned char)'$']=(unsigned char) MY_LEX_IDENT;
87
state_map[(unsigned char)'\'']=(unsigned char) MY_LEX_STRING;
88
state_map[(unsigned char)'.']=(unsigned char) MY_LEX_REAL_OR_POINT;
89
state_map[(unsigned char)'>']=state_map[(unsigned char)'=']=state_map[(unsigned char)'!']= (unsigned char) MY_LEX_CMP_OP;
90
state_map[(unsigned char)'<']= (unsigned char) MY_LEX_LONG_CMP_OP;
91
state_map[(unsigned char)'&']=state_map[(unsigned char)'|']=(unsigned char) MY_LEX_BOOL;
92
state_map[(unsigned char)'#']=(unsigned char) MY_LEX_COMMENT;
93
state_map[(unsigned char)';']=(unsigned char) MY_LEX_SEMICOLON;
94
state_map[(unsigned char)':']=(unsigned char) MY_LEX_SET_VAR;
95
state_map[0]=(unsigned char) MY_LEX_EOL;
96
state_map[(unsigned char)'\\']= (unsigned char) MY_LEX_ESCAPE;
97
state_map[(unsigned char)'/']= (unsigned char) MY_LEX_LONG_COMMENT;
98
state_map[(unsigned char)'*']= (unsigned char) MY_LEX_END_LONG_COMMENT;
99
state_map[(unsigned char)'@']= (unsigned char) MY_LEX_USER_END;
100
state_map[(unsigned char) '`']= (unsigned char) MY_LEX_USER_VARIABLE_DELIMITER;
101
state_map[(unsigned char)'"']= (unsigned char) MY_LEX_STRING_OR_DELIMITER;
87
state_map[(uchar)'_']=state_map[(uchar)'$']=(uchar) MY_LEX_IDENT;
88
state_map[(uchar)'\'']=(uchar) MY_LEX_STRING;
89
state_map[(uchar)'.']=(uchar) MY_LEX_REAL_OR_POINT;
90
state_map[(uchar)'>']=state_map[(uchar)'=']=state_map[(uchar)'!']= (uchar) MY_LEX_CMP_OP;
91
state_map[(uchar)'<']= (uchar) MY_LEX_LONG_CMP_OP;
92
state_map[(uchar)'&']=state_map[(uchar)'|']=(uchar) MY_LEX_BOOL;
93
state_map[(uchar)'#']=(uchar) MY_LEX_COMMENT;
94
state_map[(uchar)';']=(uchar) MY_LEX_SEMICOLON;
95
state_map[(uchar)':']=(uchar) MY_LEX_SET_VAR;
96
state_map[0]=(uchar) MY_LEX_EOL;
97
state_map[(uchar)'\\']= (uchar) MY_LEX_ESCAPE;
98
state_map[(uchar)'/']= (uchar) MY_LEX_LONG_COMMENT;
99
state_map[(uchar)'*']= (uchar) MY_LEX_END_LONG_COMMENT;
100
state_map[(uchar)'@']= (uchar) MY_LEX_USER_END;
101
state_map[(uchar) '`']= (uchar) MY_LEX_USER_VARIABLE_DELIMITER;
102
state_map[(uchar)'"']= (uchar) MY_LEX_STRING_OR_DELIMITER;
104
105
Create a second map to make it faster to find identifiers
106
107
for (i=0; i < 256 ; i++)
108
ident_map[i]= (unsigned char) (state_map[i] == MY_LEX_IDENT ||
109
ident_map[i]= (uchar) (state_map[i] == MY_LEX_IDENT ||
109
110
state_map[i] == MY_LEX_NUMBER_IDENT);
112
113
/* Special handling of hex and binary strings */
113
state_map[(unsigned char)'x']= state_map[(unsigned char)'X']= (unsigned char) MY_LEX_IDENT_OR_HEX;
114
state_map[(unsigned char)'b']= state_map[(unsigned char)'B']= (unsigned char) MY_LEX_IDENT_OR_BIN;
114
state_map[(uchar)'x']= state_map[(uchar)'X']= (uchar) MY_LEX_IDENT_OR_HEX;
115
state_map[(uchar)'b']= state_map[(uchar)'B']= (uchar) MY_LEX_IDENT_OR_BIN;
116
state_map[(uchar)'n']= state_map[(uchar)'N']= (uchar) MY_LEX_IDENT_OR_NCHAR;
121
static void simple_cs_init_functions(CHARSET_INFO *cs)
123
if (cs->state & MY_CS_BINSORT)
124
cs->coll= &my_collation_8bit_bin_handler;
126
cs->coll= &my_collation_8bit_simple_ci_handler;
128
cs->cset= &my_charset_8bit_handler;
133
static int cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
135
to->number= from->number ? from->number : to->number;
138
if (!(to->csname= my_once_strdup(from->csname,MYF(MY_WME))))
142
if (!(to->name= my_once_strdup(from->name,MYF(MY_WME))))
146
if (!(to->comment= my_once_strdup(from->comment,MYF(MY_WME))))
151
if (!(to->ctype= (uchar*) my_once_memdup((char*) from->ctype,
152
MY_CS_CTYPE_TABLE_SIZE,
155
if (init_state_maps(to))
159
if (!(to->to_lower= (uchar*) my_once_memdup((char*) from->to_lower,
160
MY_CS_TO_LOWER_TABLE_SIZE,
165
if (!(to->to_upper= (uchar*) my_once_memdup((char*) from->to_upper,
166
MY_CS_TO_UPPER_TABLE_SIZE,
169
if (from->sort_order)
171
if (!(to->sort_order= (uchar*) my_once_memdup((char*) from->sort_order,
172
MY_CS_SORT_ORDER_TABLE_SIZE,
177
if (from->tab_to_uni)
179
uint sz= MY_CS_TO_UNI_TABLE_SIZE*sizeof(uint16);
180
if (!(to->tab_to_uni= (uint16*) my_once_memdup((char*)from->tab_to_uni,
185
if (!(to->tailoring= my_once_strdup(from->tailoring,MYF(MY_WME))))
196
static my_bool simple_cs_is_full(CHARSET_INFO *cs)
198
return ((cs->csname && cs->tab_to_uni && cs->ctype && cs->to_upper &&
200
(cs->number && cs->name &&
201
(cs->sort_order || (cs->state & MY_CS_BINSORT) )));
206
copy_uca_collation(CHARSET_INFO *to, CHARSET_INFO *from)
208
to->cset= from->cset;
209
to->coll= from->coll;
210
to->strxfrm_multiply= from->strxfrm_multiply;
211
to->min_sort_char= from->min_sort_char;
212
to->max_sort_char= from->max_sort_char;
213
to->mbminlen= from->mbminlen;
214
to->mbmaxlen= from->mbmaxlen;
218
static int add_collation(CHARSET_INFO *cs)
220
if (cs->name && (cs->number ||
221
(cs->number=get_collation_number_internal(cs->name))))
223
if (!all_charsets[cs->number])
225
if (!(all_charsets[cs->number]=
226
(CHARSET_INFO*) my_once_alloc(sizeof(CHARSET_INFO),MYF(0))))
228
bzero((void*)all_charsets[cs->number],sizeof(CHARSET_INFO));
231
if (cs->primary_number == cs->number)
232
cs->state |= MY_CS_PRIMARY;
234
if (cs->binary_number == cs->number)
235
cs->state |= MY_CS_BINSORT;
237
all_charsets[cs->number]->state|= cs->state;
239
if (!(all_charsets[cs->number]->state & MY_CS_COMPILED))
241
CHARSET_INFO *newcs= all_charsets[cs->number];
242
if (cs_copy_data(all_charsets[cs->number],cs))
245
newcs->levels_for_compare= 1;
246
newcs->levels_for_order= 1;
248
if (!strcmp(cs->csname,"ucs2") )
250
#if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
251
copy_uca_collation(newcs, &my_charset_ucs2_unicode_ci);
252
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
255
else if (!strcmp(cs->csname, "utf8"))
257
#if defined (HAVE_CHARSET_utf8mb3) && defined(HAVE_UCA_COLLATIONS)
258
copy_uca_collation(newcs, &my_charset_utf8mb4_unicode_ci);
259
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED;
262
else if (!strcmp(cs->csname, "utf8mb3"))
264
#if defined (HAVE_CHARSET_utf8mb3) && defined(HAVE_UCA_COLLATIONS)
265
copy_uca_collation(newcs, &my_charset_utf8mb3_unicode_ci);
266
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED;
269
else if (!strcmp(cs->csname, "utf16"))
271
#if defined (HAVE_CHARSET_utf16) && defined(HAVE_UCA_COLLATIONS)
272
copy_uca_collation(newcs, &my_charset_utf16_unicode_ci);
273
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
276
else if (!strcmp(cs->csname, "utf32"))
278
#if defined (HAVE_CHARSET_utf32) && defined(HAVE_UCA_COLLATIONS)
279
copy_uca_collation(newcs, &my_charset_utf32_unicode_ci);
280
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
285
uchar *sort_order= all_charsets[cs->number]->sort_order;
286
simple_cs_init_functions(all_charsets[cs->number]);
289
if (simple_cs_is_full(all_charsets[cs->number]))
291
all_charsets[cs->number]->state |= MY_CS_LOADED;
293
all_charsets[cs->number]->state|= MY_CS_AVAILABLE;
296
Check if case sensitive sort order: A < a < B.
297
We need MY_CS_FLAG for regex library, and for
298
case sensitivity flag for 5.0 client protocol,
299
to support isCaseSensitive() method in JDBC driver
301
if (sort_order && sort_order['A'] < sort_order['a'] &&
302
sort_order['a'] < sort_order['B'])
303
all_charsets[cs->number]->state|= MY_CS_CSSORT;
305
if (my_charset_is_8bit_pure_ascii(all_charsets[cs->number]))
306
all_charsets[cs->number]->state|= MY_CS_PUREASCII;
307
if (!my_charset_is_ascii_compatible(cs))
308
all_charsets[cs->number]->state|= MY_CS_NONASCII;
314
We need the below to make get_charset_name()
315
and get_charset_number() working even if a
316
character set has not actually been compiled in.
317
The above functions are used for example
318
in error message compiler extra/comp_err.c.
319
If a character set was compiled, this information
320
will get lost and overwritten in add_compiled_collation().
322
CHARSET_INFO *dst= all_charsets[cs->number];
323
dst->number= cs->number;
325
if (!(dst->comment= my_once_strdup(cs->comment,MYF(MY_WME))))
327
if (cs->csname && !dst->csname)
328
if (!(dst->csname= my_once_strdup(cs->csname,MYF(MY_WME))))
330
if (cs->name && !dst->name)
331
if (!(dst->name= my_once_strdup(cs->name,MYF(MY_WME))))
335
cs->primary_number= 0;
336
cs->binary_number= 0;
339
cs->sort_order= NULL;
346
#define MY_MAX_ALLOWED_BUF 1024*1024
119
347
#define MY_CHARSET_INDEX "Index.xml"
121
349
const char *charsets_dir= NULL;
122
350
static int charset_initialized=0;
353
static my_bool my_read_charset_file(const char *filename, myf myflags)
360
if (!my_stat(filename, &stat_info, MYF(myflags)) ||
361
((len= (uint)stat_info.st_size) > MY_MAX_ALLOWED_BUF) ||
362
!(buf= (uchar*) my_malloc(len,myflags)))
365
if ((fd=my_open(filename,O_RDONLY,myflags)) < 0)
367
tmp_len=my_read(fd, buf, len, myflags);
368
my_close(fd,myflags);
372
if (my_parse_charset_xml((char*) buf,len,add_collation))
375
printf("ERROR at line %d pos %d '%s'\n",
376
my_xml_error_lineno(&p)+1,
377
my_xml_error_pos(&p),
378
my_xml_error_string(&p));
382
my_free(buf, myflags);
386
my_free(buf, myflags);
125
391
char *get_charsets_dir(char *buf)
393
const char *sharedir= SHAREDIR;
395
DBUG_ENTER("get_charsets_dir");
129
397
if (charsets_dir != NULL)
130
strncpy(buf, charsets_dir, FN_REFLEN-1);
398
strmake(buf, charsets_dir, FN_REFLEN-1);
133
if (test_if_hard_path(PKGDATADIR) ||
134
is_prefix(PKGDATADIR, PREFIX))
135
sprintf(buf,"%s/%s",PKGDATADIR,CHARSET_DIR);
401
if (test_if_hard_path(sharedir) ||
402
is_prefix(sharedir, DEFAULT_CHARSET_HOME))
403
strxmov(buf, sharedir, "/", CHARSET_DIR, NullS);
137
sprintf(buf,"%s/%s/%s",PREFIX,PKGDATADIR,CHARSET_DIR);
405
strxmov(buf, DEFAULT_CHARSET_HOME, "/", sharedir, "/", CHARSET_DIR,
139
res= convert_dirname(buf,buf,NULL);
408
res= convert_dirname(buf,buf,NullS);
409
DBUG_PRINT("info",("charsets dir: '%s'", buf));
143
413
CHARSET_INFO *all_charsets[256];
144
const CHARSET_INFO *default_charset_info = &my_charset_utf8_general_ci;
414
CHARSET_INFO *default_charset_info = &my_charset_latin1;
146
void add_compiled_collation(CHARSET_INFO * cs)
416
void add_compiled_collation(CHARSET_INFO *cs)
148
418
all_charsets[cs->number]= cs;
149
419
cs->state|= MY_CS_AVAILABLE;
152
void *cs_alloc(size_t size)
422
static void *cs_alloc(size_t size)
424
return my_once_alloc(size, MYF(MY_WME));
158
static bool init_available_charsets(myf myflags)
428
static my_bool init_available_charsets(myf myflags)
160
430
char fname[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
163
433
We have to use charset_initialized to avoid locking on THR_LOCK_charset
164
434
inside get_internal_charset...
375
650
The function tries to resolve a collation by the specified name. If there
376
651
is a collation with the given name, it is assigned to the "cl" parameter
377
and false is returned. If there is no such collation, "default_cl" is
378
assigned to the "cl" and true is returned.
652
and FALSE is returned. If there is no such collation, "default_cl" is
653
assigned to the "cl" and TRUE is returned.
380
655
@param[out] cl Variable to store collation.
381
656
@param[in] cl_name Collation name.
382
657
@param[in] default_cl Default collation.
384
@return false if collation was resolved successfully; true if there is no
659
@return FALSE if collation was resolved successfully; TRUE if there is no
385
660
collation with given name.
388
bool resolve_collation(const char *cl_name,
389
const CHARSET_INFO *default_cl,
390
const CHARSET_INFO **cl)
663
my_bool resolve_collation(const char *cl_name,
664
CHARSET_INFO *default_cl,
392
667
*cl= get_charset_by_name(cl_name, MYF(0));
680
Escape string with backslashes (\)
683
escape_string_for_mysql()
684
charset_info Charset of the strings
685
to Buffer for escaped string
686
to_length Length of destination buffer, or 0
687
from The string to escape
688
length The length of the string to escape
691
This escapes the contents of a string by adding backslashes before special
692
characters, and turning others into specific escape sequences, such as
693
turning newlines into \n and null bytes into \0.
696
To maintain compatibility with the old C API, to_length may be 0 to mean
700
(size_t) -1 The escaped string did not fit in the to buffer
701
# The length of the escaped string
704
size_t escape_string_for_mysql(CHARSET_INFO *charset_info,
705
char *to, size_t to_length,
706
const char *from, size_t length)
708
const char *to_start= to;
709
const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
710
my_bool overflow= FALSE;
712
my_bool use_mb_flag= use_mb(charset_info);
714
for (end= from + length; from < end; from++)
719
if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
721
if (to + tmp_length > to_end)
732
If the next character appears to begin a multi-byte character, we
733
escape the first byte of that apparent multi-byte character. (The
734
character just looks like a multi-byte character -- if it were actually
735
a multi-byte character, it would have been passed through in the test
738
Without this check, we can create a problem by converting an invalid
739
multi-byte character into a valid one. For example, 0xbf27 is not
740
a valid GBK character, but 0xbf5c is. (0x27 = ', 0x5c = \)
742
if (use_mb_flag && (tmp_length= my_mbcharlen(charset_info, *from)) > 1)
747
case 0: /* Must be escaped for 'mysql' */
750
case '\n': /* Must be escaped for logs */
762
case '"': /* Better safe than sorry */
765
case '\032': /* This gives problems on Win32 */
790
return overflow ? (size_t) -1 : (size_t) (to - to_start);