1
/* Copyright (C) 2000 MySQL AB
3
This program is free software; you can redistribute it and/or modify
4
it under the terms of the GNU General Public License as published by
5
the Free Software Foundation; version 2 of the License.
7
This program is distributed in the hope that it will be useful,
8
but WITHOUT ANY WARRANTY; without even the implied warranty of
9
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
GNU General Public License for more details.
12
You should have received a copy of the GNU General Public License
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16
#include "mysys_priv.h"
17
#include "mysys_err.h"
25
The code below implements this functionality:
27
- Initializing charset related structures
28
- Loading dynamic charsets
29
- Searching for a proper CHARSET_INFO
30
using charset name, collation name or collation ID
31
- Setting server default character set
34
/*
  Tell whether two CHARSET_INFO entries belong to the same character set.

  The result is true when both pointers are identical, or otherwise when
  their csname strings compare equal with strcmp(). When the pointers differ
  both cs1->csname and cs2->csname are read.
*/
my_bool my_charset_same(CHARSET_INFO *cs1, CHARSET_INFO *cs2)
36
return ((cs1 == cs2) || !strcmp(cs1->csname,cs2->csname));
41
/*
  Search all_charsets for a collation by name.

  Each populated slot whose "name" field is set is compared with the "name"
  argument using my_strcasecmp() under my_charset_latin1, so the match is
  case-insensitive.

  The loop bound is all_charsets+array_elements(all_charsets)-1, which means
  the final slot of the array is never examined.

  The statements that produce the result are on lines not visible here;
  callers in this file treat a zero result as "not found".
*/
get_collation_number_internal(const char *name)
44
for (cs= all_charsets;
45
cs < all_charsets+array_elements(all_charsets)-1 ;
48
if ( cs[0] && cs[0]->name &&
49
!my_strcasecmp(&my_charset_latin1, cs[0]->name, name))
56
/*
  Build the two 256-entry lexer lookup tables of a character set.

  cs->state_map and cs->ident_map are each allocated as 256 bytes with
  my_once_alloc(). state_map is then filled in three passes:

    1. A per-byte classification driven by the charset's ctype helpers
       (my_isdigit, my_isspace and, when USE_MB and USE_MB_IDENT are both
       defined, my_mbcharlen), with MY_LEX_CHAR as the last alternative.
    2. Fixed overrides for individual punctuation bytes and for byte 0
       (MY_LEX_EOL).
    3. After ident_map has been derived, overrides for x/X, b/B and n/N so
       that hex, binary and national string prefixes get their own states.

  ident_map[i] is 1 exactly when state_map[i] is MY_LEX_IDENT or
  MY_LEX_NUMBER_IDENT at the moment it is computed. Because that happens
  before pass 3, the letters x, b and n keep whatever identifier status
  pass 1 gave them in ident_map.

  The handling of a failed allocation and the return statements are on
  lines not visible in this excerpt.
*/
static my_bool init_state_maps(CHARSET_INFO *cs)
62
if (!(cs->state_map= (uchar*) my_once_alloc(256, MYF(MY_WME))))
65
if (!(cs->ident_map= (uchar*) my_once_alloc(256, MYF(MY_WME))))
68
state_map= cs->state_map;
69
ident_map= cs->ident_map;
71
/* Fill state_map with states to get a faster parser */
72
for (i=0; i < 256 ; i++)
75
state_map[i]=(uchar) MY_LEX_IDENT;
76
else if (my_isdigit(cs,i))
77
state_map[i]=(uchar) MY_LEX_NUMBER_IDENT;
78
#if defined(USE_MB) && defined(USE_MB_IDENT)
79
else if (my_mbcharlen(cs, i)>1)
80
state_map[i]=(uchar) MY_LEX_IDENT;
82
else if (my_isspace(cs,i))
83
state_map[i]=(uchar) MY_LEX_SKIP;
85
state_map[i]=(uchar) MY_LEX_CHAR;
87
state_map[(uchar)'_']=state_map[(uchar)'$']=(uchar) MY_LEX_IDENT;
88
state_map[(uchar)'\'']=(uchar) MY_LEX_STRING;
89
state_map[(uchar)'.']=(uchar) MY_LEX_REAL_OR_POINT;
90
state_map[(uchar)'>']=state_map[(uchar)'=']=state_map[(uchar)'!']= (uchar) MY_LEX_CMP_OP;
91
state_map[(uchar)'<']= (uchar) MY_LEX_LONG_CMP_OP;
92
state_map[(uchar)'&']=state_map[(uchar)'|']=(uchar) MY_LEX_BOOL;
93
state_map[(uchar)'#']=(uchar) MY_LEX_COMMENT;
94
state_map[(uchar)';']=(uchar) MY_LEX_SEMICOLON;
95
state_map[(uchar)':']=(uchar) MY_LEX_SET_VAR;
96
state_map[0]=(uchar) MY_LEX_EOL;
97
state_map[(uchar)'\\']= (uchar) MY_LEX_ESCAPE;
98
state_map[(uchar)'/']= (uchar) MY_LEX_LONG_COMMENT;
99
state_map[(uchar)'*']= (uchar) MY_LEX_END_LONG_COMMENT;
100
state_map[(uchar)'@']= (uchar) MY_LEX_USER_END;
101
state_map[(uchar) '`']= (uchar) MY_LEX_USER_VARIABLE_DELIMITER;
102
state_map[(uchar)'"']= (uchar) MY_LEX_STRING_OR_DELIMITER;
105
Create a second map to make it faster to find identifiers
107
for (i=0; i < 256 ; i++)
109
ident_map[i]= (uchar) (state_map[i] == MY_LEX_IDENT ||
110
state_map[i] == MY_LEX_NUMBER_IDENT);
113
/* Special handling of hex and binary strings */
114
state_map[(uchar)'x']= state_map[(uchar)'X']= (uchar) MY_LEX_IDENT_OR_HEX;
115
state_map[(uchar)'b']= state_map[(uchar)'B']= (uchar) MY_LEX_IDENT_OR_BIN;
116
state_map[(uchar)'n']= state_map[(uchar)'N']= (uchar) MY_LEX_IDENT_OR_NCHAR;
121
/*
  Attach the generic 8-bit handlers to a dynamically loaded character set.

  The collation handler depends on MY_CS_BINSORT in cs->state:
  my_collation_8bit_bin_handler when the flag is set, and
  my_collation_8bit_simple_ci_handler as the alternative (the branch keyword
  between the two assignments is on a line not shown here).
  The charset handler is always my_charset_8bit_handler.
*/
static void simple_cs_init_functions(CHARSET_INFO *cs)
123
if (cs->state & MY_CS_BINSORT)
124
cs->coll= &my_collation_8bit_bin_handler;
126
cs->coll= &my_collation_8bit_simple_ci_handler;
128
cs->cset= &my_charset_8bit_handler;
133
/*
  Copy the data of a parsed character set description into a table entry.

  - to->number is replaced only when from->number is non-zero.
  - csname, name, comment and tailoring are duplicated with
    my_once_strdup().
  - ctype, to_lower, to_upper, sort_order and tab_to_uni are duplicated with
    my_once_memdup() using the matching MY_CS_*_TABLE_SIZE constant;
    tab_to_uni is sized as MY_CS_TO_UNI_TABLE_SIZE uint16 elements.
  - init_state_maps(to) runs immediately after the ctype table is copied.
  - sort_order and tab_to_uni are copied only when present in "from".

  Every allocation result is tested; what happens on failure, the guards on
  the other fields and the return statements are on lines not visible in
  this excerpt.
*/
static int cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
135
to->number= from->number ? from->number : to->number;
138
if (!(to->csname= my_once_strdup(from->csname,MYF(MY_WME))))
142
if (!(to->name= my_once_strdup(from->name,MYF(MY_WME))))
146
if (!(to->comment= my_once_strdup(from->comment,MYF(MY_WME))))
151
if (!(to->ctype= (uchar*) my_once_memdup((char*) from->ctype,
152
MY_CS_CTYPE_TABLE_SIZE,
155
if (init_state_maps(to))
159
if (!(to->to_lower= (uchar*) my_once_memdup((char*) from->to_lower,
160
MY_CS_TO_LOWER_TABLE_SIZE,
165
if (!(to->to_upper= (uchar*) my_once_memdup((char*) from->to_upper,
166
MY_CS_TO_UPPER_TABLE_SIZE,
169
if (from->sort_order)
171
if (!(to->sort_order= (uchar*) my_once_memdup((char*) from->sort_order,
172
MY_CS_SORT_ORDER_TABLE_SIZE,
177
if (from->tab_to_uni)
179
uint sz= MY_CS_TO_UNI_TABLE_SIZE*sizeof(uint16);
180
if (!(to->tab_to_uni= (uint16*) my_once_memdup((char*)from->tab_to_uni,
185
if (!(to->tailoring= my_once_strdup(from->tailoring,MYF(MY_WME))))
196
/*
  Check whether a simple (8-bit) character set entry has everything it needs.

  The visible conditions require csname, tab_to_uni, ctype, to_upper, number
  and name to be set, plus either a sort_order table or the MY_CS_BINSORT
  flag in cs->state. One line of the expression is not visible in this
  excerpt and may add a further requirement.
*/
static my_bool simple_cs_is_full(CHARSET_INFO *cs)
198
return ((cs->csname && cs->tab_to_uni && cs->ctype && cs->to_upper &&
200
(cs->number && cs->name &&
201
(cs->sort_order || (cs->state & MY_CS_BINSORT) )));
206
/*
  Make "to" behave like the UCA collation "from".

  Copies the charset and collation handler pointers (cset, coll) together
  with strxfrm_multiply, min_sort_char, max_sort_char, mbminlen and mbmaxlen.
  No other field of "to" is touched by the visible code.
*/
copy_uca_collation(CHARSET_INFO *to, CHARSET_INFO *from)
208
to->cset= from->cset;
209
to->coll= from->coll;
210
to->strxfrm_multiply= from->strxfrm_multiply;
211
to->min_sort_char= from->min_sort_char;
212
to->max_sort_char= from->max_sort_char;
213
to->mbminlen= from->mbminlen;
214
to->mbmaxlen= from->mbmaxlen;
218
/*
  Register one parsed collation description in all_charsets.

  This function is handed to my_parse_charset_xml() as its callback by
  my_read_charset_file().

  Outline of the visible logic:
    - Nothing is registered unless cs->name is set and cs->number is either
      already non-zero or can be resolved from the name with
      get_collation_number_internal().
    - An empty all_charsets[cs->number] slot is allocated with
      my_once_alloc() and zero-filled.
    - MY_CS_PRIMARY / MY_CS_BINSORT are added to cs->state when
      primary_number / binary_number equal the collation number, and the
      state is OR-ed into the slot.
    - If the slot is not MY_CS_COMPILED, the data is copied with
      cs_copy_data() and the handlers are chosen by csname: the Unicode
      character sets (ucs2, utf8, utf8mb3, utf16, utf32) borrow a compiled
      *_unicode_ci collation through copy_uca_collation(); the remaining
      path wires the generic 8-bit handlers and derives the MY_CS_LOADED,
      MY_CS_AVAILABLE, MY_CS_CSSORT, MY_CS_PUREASCII and MY_CS_NONASCII
      flags.
    - Otherwise only number, comment, csname and name are filled into the
      slot.
    - Finally primary_number, binary_number and sort_order of "cs" are
      reset (further resets may sit on lines not shown).

  NOTE(review): all_charsets is indexed with cs->number and no range check
  is visible in this excerpt - confirm the parser cannot deliver a number
  beyond the array.
*/
static int add_collation(CHARSET_INFO *cs)
220
if (cs->name && (cs->number ||
221
(cs->number=get_collation_number_internal(cs->name))))
223
if (!all_charsets[cs->number])
225
if (!(all_charsets[cs->number]=
226
(CHARSET_INFO*) my_once_alloc(sizeof(CHARSET_INFO),MYF(0))))
228
bzero((void*)all_charsets[cs->number],sizeof(CHARSET_INFO));
231
if (cs->primary_number == cs->number)
232
cs->state |= MY_CS_PRIMARY;
234
if (cs->binary_number == cs->number)
235
cs->state |= MY_CS_BINSORT;
237
all_charsets[cs->number]->state|= cs->state;
239
if (!(all_charsets[cs->number]->state & MY_CS_COMPILED))
241
CHARSET_INFO *newcs= all_charsets[cs->number];
242
if (cs_copy_data(all_charsets[cs->number],cs))
245
newcs->levels_for_compare= 1;
246
newcs->levels_for_order= 1;
248
if (!strcmp(cs->csname,"ucs2") )
250
#if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
251
copy_uca_collation(newcs, &my_charset_ucs2_unicode_ci);
252
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
255
else if (!strcmp(cs->csname, "utf8"))
257
/*
  NOTE(review): the guard below tests HAVE_CHARSET_utf8mb3 while the body
  references my_charset_utf8mb4_unicode_ci. Every other branch tests the
  macro matching the symbol it uses - confirm whether HAVE_CHARSET_utf8mb4
  was intended here.
*/
#if defined (HAVE_CHARSET_utf8mb3) && defined(HAVE_UCA_COLLATIONS)
258
copy_uca_collation(newcs, &my_charset_utf8mb4_unicode_ci);
259
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED;
262
else if (!strcmp(cs->csname, "utf8mb3"))
264
#if defined (HAVE_CHARSET_utf8mb3) && defined(HAVE_UCA_COLLATIONS)
265
copy_uca_collation(newcs, &my_charset_utf8mb3_unicode_ci);
266
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED;
269
else if (!strcmp(cs->csname, "utf16"))
271
#if defined (HAVE_CHARSET_utf16) && defined(HAVE_UCA_COLLATIONS)
272
copy_uca_collation(newcs, &my_charset_utf16_unicode_ci);
273
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
276
else if (!strcmp(cs->csname, "utf32"))
278
#if defined (HAVE_CHARSET_utf32) && defined(HAVE_UCA_COLLATIONS)
279
copy_uca_collation(newcs, &my_charset_utf32_unicode_ci);
280
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
285
uchar *sort_order= all_charsets[cs->number]->sort_order;
286
simple_cs_init_functions(all_charsets[cs->number]);
289
if (simple_cs_is_full(all_charsets[cs->number]))
291
all_charsets[cs->number]->state |= MY_CS_LOADED;
293
all_charsets[cs->number]->state|= MY_CS_AVAILABLE;
296
Check if case sensitive sort order: A < a < B.
297
We need MY_CS_FLAG for regex library, and for
298
case sensitivity flag for 5.0 client protocol,
299
to support isCaseSensitive() method in JDBC driver
301
if (sort_order && sort_order['A'] < sort_order['a'] &&
302
sort_order['a'] < sort_order['B'])
303
all_charsets[cs->number]->state|= MY_CS_CSSORT;
305
if (my_charset_is_8bit_pure_ascii(all_charsets[cs->number]))
306
all_charsets[cs->number]->state|= MY_CS_PUREASCII;
307
if (!my_charset_is_ascii_compatible(cs))
308
all_charsets[cs->number]->state|= MY_CS_NONASCII;
314
We need the below to make get_charset_name()
315
and get_charset_number() working even if a
316
character set has not been really incompiled.
317
The above functions are used for example
318
in error message compiler extra/comp_err.c.
319
If a character set was compiled, this information
320
will get lost and overwritten in add_compiled_collation().
322
CHARSET_INFO *dst= all_charsets[cs->number];
323
dst->number= cs->number;
325
if (!(dst->comment= my_once_strdup(cs->comment,MYF(MY_WME))))
327
if (cs->csname && !dst->csname)
328
if (!(dst->csname= my_once_strdup(cs->csname,MYF(MY_WME))))
330
if (cs->name && !dst->name)
331
if (!(dst->name= my_once_strdup(cs->name,MYF(MY_WME))))
335
cs->primary_number= 0;
336
cs->binary_number= 0;
339
cs->sort_order= NULL;
346
/*
  Largest charset definition file, in bytes, that my_read_charset_file()
  accepts. The expansion is parenthesized so that the macro behaves as a
  single value inside any larger expression (for example as a divisor).
*/
#define MY_MAX_ALLOWED_BUF (1024*1024)
/* Name of the index file that is looked up in the charsets directory. */
#define MY_CHARSET_INDEX "Index.xml"

/* Optional override of the charsets directory; NULL selects the default. */
const char *charsets_dir= NULL;
/* Set to 1 once init_available_charsets() has filled all_charsets. */
static int charset_initialized=0;
353
/*
  Read a charset definition file and register the collations it describes.

  Visible sequence: my_stat() the file, reject it when its size exceeds
  MY_MAX_ALLOWED_BUF, allocate a buffer of that size with my_malloc(), open
  the file read-only, read it with my_read(), close it, and pass the buffer
  to my_parse_charset_xml() with add_collation as the callback. A parse
  failure prints an "ERROR at line ..." diagnostic with printf(). my_free()
  on the buffer appears twice, on two separate paths.

  How the my_read() result (tmp_len) is checked and the return statements
  are on lines not visible in this excerpt.

  NOTE(review): st_size is cast to uint before the size comparison; where
  uint is narrower than st_size a very large file could wrap below the
  limit - confirm.
*/
static my_bool my_read_charset_file(const char *filename, myf myflags)
360
if (!my_stat(filename, &stat_info, MYF(myflags)) ||
361
((len= (uint)stat_info.st_size) > MY_MAX_ALLOWED_BUF) ||
362
!(buf= (uchar*) my_malloc(len,myflags)))
365
if ((fd=my_open(filename,O_RDONLY,myflags)) < 0)
367
tmp_len=my_read(fd, buf, len, myflags);
368
my_close(fd,myflags);
372
if (my_parse_charset_xml((char*) buf,len,add_collation))
375
printf("ERROR at line %d pos %d '%s'\n",
376
my_xml_error_lineno(&p)+1,
377
my_xml_error_pos(&p),
378
my_xml_error_string(&p));
382
my_free(buf, myflags);
386
my_free(buf, myflags);
391
/*
  Write the directory that holds the charset definition files into "buf".

  - When the global charsets_dir is set, it is copied with strmake(),
    limited to FN_REFLEN-1 characters.
  - Otherwise the path is assembled from SHAREDIR: "<sharedir>/CHARSET_DIR"
    when sharedir is a hard path or starts with DEFAULT_CHARSET_HOME, and
    "DEFAULT_CHARSET_HOME/<sharedir>/CHARSET_DIR" in the remaining case.
  - convert_dirname() then rewrites buf in place; its result is kept in
    "res".

  Callers in this file append a file name directly at the returned pointer,
  so the return value is presumably the end of the string in buf - the
  return statement itself is not visible here.
  NOTE(review): strxmov() takes no length limit; buf presumably has to hold
  at least FN_REFLEN bytes - confirm against callers.
*/
char *get_charsets_dir(char *buf)
393
const char *sharedir= SHAREDIR;
395
DBUG_ENTER("get_charsets_dir");
397
if (charsets_dir != NULL)
398
strmake(buf, charsets_dir, FN_REFLEN-1);
401
if (test_if_hard_path(sharedir) ||
402
is_prefix(sharedir, DEFAULT_CHARSET_HOME))
403
strxmov(buf, sharedir, "/", CHARSET_DIR, NullS);
405
strxmov(buf, DEFAULT_CHARSET_HOME, "/", sharedir, "/", CHARSET_DIR,
408
res= convert_dirname(buf,buf,NullS);
409
DBUG_PRINT("info",("charsets dir: '%s'", buf));
413
/*
  Table of known collations. Code in this file indexes it by collation
  number (all_charsets[cs->number]); an empty slot is NULL.
*/
CHARSET_INFO *all_charsets[256];
414
/* Character set handed out by default; starts as latin1. */
CHARSET_INFO *default_charset_info = &my_charset_latin1;
416
/*
  Register a compiled-in collation.

  Stores "cs" in all_charsets at index cs->number, replacing whatever was in
  that slot, and marks it MY_CS_AVAILABLE.
  NOTE(review): cs->number is used as an index without a visible range
  check.
*/
void add_compiled_collation(CHARSET_INFO *cs)
418
all_charsets[cs->number]= cs;
419
cs->state|= MY_CS_AVAILABLE;
422
/*
  Allocator passed to the cset->init / coll->init hooks in
  get_internal_charset(): forwards to my_once_alloc() with MY_WME.
*/
static void *cs_alloc(size_t size)
424
return my_once_alloc(size, MYF(MY_WME));
428
/*
  Populate all_charsets once per process.

  charset_initialized is tested first without the mutex and then again after
  THR_LOCK_charset has been taken, so the work below runs only while the
  flag is still zero under the lock:
    - all_charsets is zero-filled and init_compiled_charsets() is called;
    - init_state_maps() is run on table entries (the loop stops one slot
      short of the end of the array; any per-entry guard is on lines not
      shown);
    - the index file MY_CHARSET_INDEX in the charsets directory is read with
      my_read_charset_file(), whose result is kept in "error";
    - charset_initialized is set to 1 and the mutex is released.

  NOTE(review): charset_initialized is a plain int that is read outside the
  lock - confirm this is acceptable on the supported platforms.
*/
static my_bool init_available_charsets(myf myflags)
430
char fname[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
433
We have to use charset_initialized to not lock on THR_LOCK_charset
434
inside get_internal_charset...
436
if (!charset_initialized)
440
To make things thread safe we are not allowing other threads to interfere
441
while we may changing the cs_info_table
443
pthread_mutex_lock(&THR_LOCK_charset);
444
if (!charset_initialized)
446
bzero(&all_charsets,sizeof(all_charsets));
447
init_compiled_charsets(myflags);
449
/* Copy compiled charsets */
450
for (cs=all_charsets;
451
cs < all_charsets+array_elements(all_charsets)-1 ;
457
if (init_state_maps(*cs))
462
strmov(get_charsets_dir(fname), MY_CHARSET_INDEX);
463
error= my_read_charset_file(fname,myflags);
464
charset_initialized=1;
466
pthread_mutex_unlock(&THR_LOCK_charset);
472
/*
  Clear the charset_initialized flag, so the next call to
  init_available_charsets() rebuilds all_charsets. No memory is released by
  the visible code.
*/
void free_charsets(void)
474
charset_initialized=0;
478
/*
  Public lookup of a collation number by collation name.

  Makes sure the charset table is initialized, then returns the result of
  get_collation_number_internal(name).
*/
uint get_collation_number(const char *name)
480
init_available_charsets(MYF(0));
481
return get_collation_number_internal(name);
485
/*
  Find the number of a collation that belongs to character set
  "charset_name" and has at least one of the bits in "cs_flags" set in its
  state.

  After making sure the charset table is initialized, the table is scanned
  and csname is compared case-insensitively (my_strcasecmp under
  my_charset_latin1). The number of the first match is returned. The scan
  stops one slot short of the end of all_charsets. The value returned when
  nothing matches is on a line not visible here.
*/
uint get_charset_number(const char *charset_name, uint cs_flags)
488
init_available_charsets(MYF(0));
490
for (cs= all_charsets;
491
cs < all_charsets+array_elements(all_charsets)-1 ;
494
if ( cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) &&
495
!my_strcasecmp(&my_charset_latin1, cs[0]->csname, charset_name))
496
return cs[0]->number;
502
/*
  Return the collation name stored for "charset_number".

  The name is returned when the slot is populated, its number field equals
  charset_number and its name is set; in every other case the string "?" is
  returned.
  NOTE(review): charset_number indexes all_charsets without a visible range
  check - confirm callers only pass values inside the array.
*/
const char *get_charset_name(uint charset_number)
505
init_available_charsets(MYF(0));
507
cs=all_charsets[charset_number];
508
if (cs && (cs->number == charset_number) && cs->name )
509
return (char*) cs->name;
511
return (char*) "?"; /* this mimics find_type() */
515
/*
  Fetch the entry for "cs_number" and make it ready for use.

  Everything between the lock and unlock calls runs under THR_LOCK_charset:
    - if the slot is populated but neither MY_CS_COMPILED nor MY_CS_LOADED,
      the file "<csname>.xml" in the charsets directory is read with
      my_read_charset_file();
    - the entry is dropped (cs becomes NULL) unless MY_CS_AVAILABLE is set;
    - if the entry is not yet MY_CS_READY, the optional cset->init and
      coll->init hooks are called with cs_alloc as allocator, and
      MY_CS_READY is set (the handling of a failing hook is on lines not
      shown).

  cs_number is used as an index into all_charsets here without a range
  check; get_charset() validates the range before calling.
*/
static CHARSET_INFO *get_internal_charset(uint cs_number, myf flags)
520
To make things thread safe we are not allowing other threads to interfere
521
while we may changing the cs_info_table
523
pthread_mutex_lock(&THR_LOCK_charset);
524
if ((cs= all_charsets[cs_number]))
526
if (!(cs->state & MY_CS_COMPILED) && !(cs->state & MY_CS_LOADED))
528
strxmov(get_charsets_dir(buf), cs->csname, ".xml", NullS);
529
my_read_charset_file(buf,flags);
531
cs= (cs->state & MY_CS_AVAILABLE) ? cs : NULL;
533
if (cs && !(cs->state & MY_CS_READY))
535
if ((cs->cset->init && cs->cset->init(cs, cs_alloc)) ||
536
(cs->coll->init && cs->coll->init(cs, cs_alloc)))
539
cs->state|= MY_CS_READY;
541
pthread_mutex_unlock(&THR_LOCK_charset);
546
/*
  Get a ready-to-use CHARSET_INFO by collation number.

  - A request for default_charset_info->number returns default_charset_info
    directly, before the table is initialized.
  - A number of 0, or one that is >= array_elements(all_charsets)-1, is
    rejected before get_internal_charset() is called.
  - When the lookup yields nothing and MY_WME is set in "flags",
    EE_UNKNOWN_CHARSET is raised through my_error(). The number is formatted
    with int10_to_str() starting at cs_string+1; the first byte of cs_string
    is set on a line not visible here.
*/
CHARSET_INFO *get_charset(uint cs_number, myf flags)
549
if (cs_number == default_charset_info->number)
550
return default_charset_info;
552
(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
554
if (!cs_number || cs_number >= array_elements(all_charsets)-1)
557
cs=get_internal_charset(cs_number, flags);
559
if (!cs && (flags & MY_WME))
561
char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)], cs_string[23];
562
strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
564
int10_to_str(cs_number, cs_string+1, 10);
565
my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_string, index_file);
570
/*
  Get a ready-to-use CHARSET_INFO by collation name.

  The name is resolved to a number with get_collation_number(); a zero
  number means "unknown" and skips get_internal_charset(). When nothing is
  found and MY_WME is set in "flags", EE_UNKNOWN_COLLATION is raised through
  my_error(), naming the collation and the path of the index file.
*/
CHARSET_INFO *get_charset_by_name(const char *cs_name, myf flags)
574
(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
576
cs_number=get_collation_number(cs_name);
577
cs= cs_number ? get_internal_charset(cs_number,flags) : NULL;
579
if (!cs && (flags & MY_WME))
581
char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
582
strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
583
my_error(EE_UNKNOWN_COLLATION, MYF(ME_BELL), cs_name, index_file);
590
/*
  Get a ready-to-use CHARSET_INFO by character set name.

  The name and cs_flags are resolved to a collation number with
  get_charset_number(); a zero number means "unknown" and skips
  get_internal_charset(). When nothing is found and MY_WME is set in
  "flags", EE_UNKNOWN_CHARSET is raised through my_error(), naming the
  character set and the path of the index file.
*/
CHARSET_INFO *get_charset_by_csname(const char *cs_name,
596
DBUG_ENTER("get_charset_by_csname");
597
DBUG_PRINT("enter",("name: '%s'", cs_name));
599
(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
601
cs_number= get_charset_number(cs_name, cs_flags);
602
cs= cs_number ? get_internal_charset(cs_number, flags) : NULL;
604
if (!cs && (flags & MY_WME))
606
char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
607
strmov(get_charsets_dir(index_file),MY_CHARSET_INDEX);
608
my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_name, index_file);
616
Resolve character set by the character set name (utf8, latin1, ...).
618
The function tries to resolve character set by the specified name. If
619
there is a character set with the given name, it is assigned to the "cs"
620
parameter and FALSE is returned. If there is no such character set,
621
"default_cs" is assigned to the "cs" and TRUE is returned.
623
@param[in] cs_name Character set name.
624
@param[in] default_cs Default character set.
625
@param[out] cs Variable to store character set.
627
@return FALSE if character set was resolved successfully; TRUE if there
628
is no character set with given name.
631
/*
  The lookup asks get_charset_by_csname() for the MY_CS_PRIMARY collation of
  cs_name and passes MYF(0), so a failed lookup raises no error message. The
  substitution of default_cs and the return statements are on lines not
  visible in this excerpt.
*/
my_bool resolve_charset(const char *cs_name,
632
CHARSET_INFO *default_cs,
635
*cs= get_charset_by_csname(cs_name, MY_CS_PRIMARY, MYF(0));
648
Resolve collation by the collation name (utf8_general_ci, ...).
650
The function tries to resolve collation by the specified name. If there
651
is a collation with the given name, it is assigned to the "cl" parameter
652
and FALSE is returned. If there is no such collation, "default_cl" is
653
assigned to the "cl" and TRUE is returned.
655
@param[out] cl Variable to store collation.
656
@param[in] cl_name Collation name.
657
@param[in] default_cl Default collation.
659
@return FALSE if collation was resolved successfully; TRUE if there is no
660
collation with given name.
663
/*
  The lookup goes through get_charset_by_name() with MYF(0), so a failed
  lookup raises no error message. The substitution of default_cl and the
  return statements are on lines not visible in this excerpt.
*/
my_bool resolve_collation(const char *cl_name,
664
CHARSET_INFO *default_cl,
667
*cl= get_charset_by_name(cl_name, MYF(0));
680
Escape string with backslashes (\)
683
escape_string_for_mysql()
684
charset_info Charset of the strings
685
to Buffer for escaped string
686
to_length Length of destination buffer, or 0
687
from The string to escape
688
length The length of the string to escape
691
This escapes the contents of a string by adding backslashes before special
692
characters, and turning others into specific escape sequences, such as
693
turning newlines into \n and null bytes into \0.
696
To maintain compatibility with the old C API, to_length may be 0 to mean
700
(size_t) -1 The escaped string did not fit in the to buffer
701
# The length of the escaped string
704
/*
  Implementation notes (visible code only):
    - to_end is to_start + to_length-1 when to_length is non-zero, and
      to_start + 2*length when to_length is 0.
    - use_mb(charset_info) is evaluated once before the loop.
    - A sequence accepted by my_ismbchar() is checked against to_end as a
      whole, using its full length.
    - A byte for which my_mbcharlen() reports more than 1 without
      my_ismbchar() having accepted the sequence is handled separately, as
      the explanation inside the loop describes.
    - The result is (size_t) -1 when "overflow" was set, otherwise the
      number of bytes between to_start and the final write position.
*/
size_t escape_string_for_mysql(CHARSET_INFO *charset_info,
705
char *to, size_t to_length,
706
const char *from, size_t length)
708
const char *to_start= to;
709
const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
710
my_bool overflow= FALSE;
712
my_bool use_mb_flag= use_mb(charset_info);
714
for (end= from + length; from < end; from++)
719
if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
721
if (to + tmp_length > to_end)
732
If the next character appears to begin a multi-byte character, we
733
escape that first byte of that apparent multi-byte character. (The
734
character just looks like a multi-byte character -- if it were actually
735
a multi-byte character, it would have been passed through in the test
738
Without this check, we can create a problem by converting an invalid
739
multi-byte character into a valid one. For example, 0xbf27 is not
740
a valid GBK character, but 0xbf5c is. (0x27 = ', 0x5c = \)
742
if (use_mb_flag && (tmp_length= my_mbcharlen(charset_info, *from)) > 1)
747
case 0: /* Must be escaped for 'mysql' */
750
case '\n': /* Must be escaped for logs */
762
case '"': /* Better safe than sorry */
765
case '\032': /* This gives problems on Win32 */
790
return overflow ? (size_t) -1 : (size_t) (to - to_start);
794
/*
  File-system character set detection, compiled only when BACKSLASH_MBTAIL
  is defined.

  fs_character_set() returns fs_cset_cache. The visible code fills the cache
  by asking GetLocaleInfo() for the system default ANSI code page (written
  at buf+2, leaving room for a two-character prefix set on a line not shown)
  and comparing the resulting string with "cp932":
  my_charset_cp932_japanese_ci on a match, my_charset_bin otherwise.
  The condition guarding the detection is on a line not visible here;
  presumably it runs only while the cache is still NULL.
*/
#ifdef BACKSLASH_MBTAIL
795
static CHARSET_INFO *fs_cset_cache= NULL;
797
CHARSET_INFO *fs_character_set()
802
GetLocaleInfo(LOCALE_SYSTEM_DEFAULT, LOCALE_IDEFAULTANSICODEPAGE,
803
buf+2, sizeof(buf)-3);
805
We cannot call get_charset_by_name here
806
because fs_character_set() is executed before
807
LOCK_THD_charset mutex initialization, which
808
is used inside get_charset_by_name.
809
As we're now interested in cp932 only,
810
let's just detect it using strcmp().
812
fs_cset_cache= !strcmp(buf, "cp932") ?
813
&my_charset_cp932_japanese_ci : &my_charset_bin;
815
return fs_cset_cache;
820
Escape apostrophes by doubling them up
823
escape_quotes_for_mysql()
824
charset_info Charset of the strings
825
to Buffer for escaped string
826
to_length Length of destination buffer, or 0
827
from The string to escape
828
length The length of the string to escape
831
This escapes the contents of a string by doubling up any apostrophes that
832
it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
833
effect on the server.
836
To be consistent with escape_string_for_mysql(), to_length may be 0 to
840
~0 The escaped string did not fit in the to buffer
841
>=0 The length of the escaped string
844
/*
  Implementation notes (visible code only):
    - to_end is to_start + to_length-1 when to_length is non-zero, and
      to_start + 2*length when to_length is 0.
    - use_mb(charset_info) is evaluated once before the loop.
    - A sequence accepted by my_ismbchar() is checked against to_end as a
      whole, using its full length.

  NOTE(review): the function is declared to return size_t, but both arms of
  the return expression are cast to ulong, whereas
  escape_string_for_mysql() uses (size_t) -1. Where ulong is narrower than
  size_t the overflow marker is therefore not equal to (size_t) -1. Confirm
  which sentinel callers compare against before changing it.
*/
size_t escape_quotes_for_mysql(CHARSET_INFO *charset_info,
845
char *to, size_t to_length,
846
const char *from, size_t length)
848
const char *to_start= to;
849
const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
850
my_bool overflow= FALSE;
852
my_bool use_mb_flag= use_mb(charset_info);
854
for (end= from + length; from < end; from++)
858
if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
860
if (to + tmp_length > to_end)
871
We don't have the same issue here with a non-multi-byte character being
872
turned into a multi-byte character by the addition of an escaping
873
character, because we are only escaping the ' character with itself.
897
return overflow ? (ulong)~0 : (ulong) (to - to_start);