12
12
You should have received a copy of the GNU General Public License
13
13
along with this program; if not, write to the Free Software
14
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
18
#include <drizzled/charset.h>
19
#include <drizzled/error.h>
20
#include <drizzled/charset_info.h>
21
#include <drizzled/internal/m_string.h>
22
#include <drizzled/configmake.h>
25
#include <drizzled/visibility.h>
33
We collect memory in this vector that we free on delete.
35
static vector<unsigned char*> memory_vector;
14
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16
#include "mysys_priv.h"
17
#include "mysys_err.h"
18
#include <mystrings/m_ctype.h>
19
#include <mystrings/m_string.h>
38
24
The code below implements this functionality:
40
26
- Initializing charset related structures
41
27
- Loading dynamic charsets
42
- Searching for a proper CHARSET_INFO
28
- Searching for a proper CHARSET_INFO
43
29
using charset name, collation name or collation ID
44
30
- Setting server default character set
54
40
get_collation_number_internal(const char *name)
56
for (CHARSET_INFO **cs= all_charsets;
57
cs < all_charsets+array_elements(all_charsets)-1;
43
for (cs= all_charsets;
44
cs < all_charsets+array_elements(all_charsets)-1 ;
60
if ( cs[0] && cs[0]->name && !my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->name, name))
47
if ( cs[0] && cs[0]->name &&
48
!my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->name, name))
62
49
return cs[0]->number;
68
static unsigned char *cs_alloc(size_t size)
70
memory_vector.push_back(new unsigned char[size]);
71
return memory_vector.back();
74
55
static bool init_state_maps(CHARSET_INFO *cs)
76
if (!(cs->state_map= cs_alloc(256)))
58
unsigned char *state_map;
59
unsigned char *ident_map;
61
if (!(cs->state_map= (unsigned char*) my_once_alloc(256, MYF(MY_WME))))
79
if (!(cs->ident_map= cs_alloc(256)))
64
if (!(cs->ident_map= (unsigned char*) my_once_alloc(256, MYF(MY_WME))))
82
unsigned char *state_map= cs->state_map;
83
unsigned char *ident_map= cs->ident_map;
67
state_map= cs->state_map;
68
ident_map= cs->ident_map;
85
70
/* Fill state_map with states to get a faster parser */
86
for (int i= 0; i < 256; i++)
71
for (i=0; i < 256 ; i++)
88
73
if (my_isalpha(cs,i))
89
state_map[i]= MY_LEX_IDENT;
74
state_map[i]=(unsigned char) MY_LEX_IDENT;
90
75
else if (my_isdigit(cs,i))
91
state_map[i]= MY_LEX_NUMBER_IDENT;
76
state_map[i]=(unsigned char) MY_LEX_NUMBER_IDENT;
77
#if defined(USE_MB) && defined(USE_MB_IDENT)
92
78
else if (my_mbcharlen(cs, i)>1)
93
state_map[i]= MY_LEX_IDENT;
79
state_map[i]=(unsigned char) MY_LEX_IDENT;
94
81
else if (my_isspace(cs,i))
95
state_map[i]= MY_LEX_SKIP;
82
state_map[i]=(unsigned char) MY_LEX_SKIP;
97
state_map[i]= MY_LEX_CHAR;
84
state_map[i]=(unsigned char) MY_LEX_CHAR;
99
state_map['_']=state_map['$']= MY_LEX_IDENT;
100
state_map['\'']= MY_LEX_STRING;
101
state_map['.']= MY_LEX_REAL_OR_POINT;
102
state_map['>']=state_map['=']=state_map['!']= MY_LEX_CMP_OP;
103
state_map['<']= MY_LEX_LONG_CMP_OP;
104
state_map['&']=state_map['|']= MY_LEX_BOOL;
105
state_map['#']= MY_LEX_COMMENT;
106
state_map[';']= MY_LEX_SEMICOLON;
107
state_map[':']= MY_LEX_SET_VAR;
108
state_map[0]= MY_LEX_EOL;
109
state_map['\\']= MY_LEX_ESCAPE;
110
state_map['/']= MY_LEX_LONG_COMMENT;
111
state_map['*']= MY_LEX_END_LONG_COMMENT;
112
state_map['@']= MY_LEX_USER_END;
113
state_map['`']= MY_LEX_USER_VARIABLE_DELIMITER;
114
state_map['"']= MY_LEX_STRING_OR_DELIMITER;
86
state_map[(unsigned char)'_']=state_map[(unsigned char)'$']=(unsigned char) MY_LEX_IDENT;
87
state_map[(unsigned char)'\'']=(unsigned char) MY_LEX_STRING;
88
state_map[(unsigned char)'.']=(unsigned char) MY_LEX_REAL_OR_POINT;
89
state_map[(unsigned char)'>']=state_map[(unsigned char)'=']=state_map[(unsigned char)'!']= (unsigned char) MY_LEX_CMP_OP;
90
state_map[(unsigned char)'<']= (unsigned char) MY_LEX_LONG_CMP_OP;
91
state_map[(unsigned char)'&']=state_map[(unsigned char)'|']=(unsigned char) MY_LEX_BOOL;
92
state_map[(unsigned char)'#']=(unsigned char) MY_LEX_COMMENT;
93
state_map[(unsigned char)';']=(unsigned char) MY_LEX_SEMICOLON;
94
state_map[(unsigned char)':']=(unsigned char) MY_LEX_SET_VAR;
95
state_map[0]=(unsigned char) MY_LEX_EOL;
96
state_map[(unsigned char)'\\']= (unsigned char) MY_LEX_ESCAPE;
97
state_map[(unsigned char)'/']= (unsigned char) MY_LEX_LONG_COMMENT;
98
state_map[(unsigned char)'*']= (unsigned char) MY_LEX_END_LONG_COMMENT;
99
state_map[(unsigned char)'@']= (unsigned char) MY_LEX_USER_END;
100
state_map[(unsigned char) '`']= (unsigned char) MY_LEX_USER_VARIABLE_DELIMITER;
101
state_map[(unsigned char)'"']= (unsigned char) MY_LEX_STRING_OR_DELIMITER;
117
104
Create a second map to make it faster to find identifiers
119
for (int i= 0; i < 256; i++)
106
for (i=0; i < 256 ; i++)
121
ident_map[i]= state_map[i] == MY_LEX_IDENT || state_map[i] == MY_LEX_NUMBER_IDENT;
108
ident_map[i]= (unsigned char) (state_map[i] == MY_LEX_IDENT ||
109
state_map[i] == MY_LEX_NUMBER_IDENT);
124
112
/* Special handling of hex and binary strings */
125
state_map['x']= state_map['X']= MY_LEX_IDENT_OR_HEX;
126
state_map['b']= state_map['B']= MY_LEX_IDENT_OR_BIN;
113
state_map[(unsigned char)'x']= state_map[(unsigned char)'X']= (unsigned char) MY_LEX_IDENT_OR_HEX;
114
state_map[(unsigned char)'b']= state_map[(unsigned char)'B']= (unsigned char) MY_LEX_IDENT_OR_BIN;
130
static bool charset_initialized= false;
132
DRIZZLED_API CHARSET_INFO *all_charsets[256];
133
const DRIZZLED_API CHARSET_INFO *default_charset_info = &my_charset_utf8_general_ci;
119
#define MY_CHARSET_INDEX "Index.xml"
121
const char *charsets_dir= NULL;
122
static int charset_initialized=0;
125
char *get_charsets_dir(char *buf)
127
const char *sharedir= SHAREDIR;
130
if (charsets_dir != NULL)
131
strmake(buf, charsets_dir, FN_REFLEN-1);
134
if (test_if_hard_path(sharedir) ||
135
is_prefix(sharedir, DEFAULT_CHARSET_HOME))
136
strxmov(buf, sharedir, "/", CHARSET_DIR, NULL);
138
strxmov(buf, DEFAULT_CHARSET_HOME, "/", sharedir, "/", CHARSET_DIR,
141
res= convert_dirname(buf,buf,NULL);
145
CHARSET_INFO *all_charsets[256];
146
const CHARSET_INFO *default_charset_info = &my_charset_utf8_general_ci;
135
148
void add_compiled_collation(CHARSET_INFO * cs)
138
151
cs->state|= MY_CS_AVAILABLE;
154
static void *cs_alloc(size_t size)
156
return my_once_alloc(size, MYF(MY_WME));
141
160
static bool init_available_charsets(myf myflags)
162
char fname[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
145
165
We have to use charset_initialized to avoid locking on THR_LOCK_charset
146
166
inside get_internal_charset...
148
if (charset_initialized == false)
168
if (!charset_initialized)
150
170
CHARSET_INFO **cs;
151
memset(&all_charsets, 0, sizeof(all_charsets));
152
init_compiled_charsets(myflags);
154
/* Copy compiled charsets */
155
for (cs=all_charsets;
156
cs < all_charsets+array_elements(all_charsets)-1 ;
172
To make things thread safe we are not allowing other threads to interfere
173
while we may be changing the cs_info_table
175
pthread_mutex_lock(&THR_LOCK_charset);
176
if (!charset_initialized)
178
memset(&all_charsets, 0, sizeof(all_charsets));
179
init_compiled_charsets(myflags);
181
/* Copy compiled charsets */
182
for (cs=all_charsets;
183
cs < all_charsets+array_elements(all_charsets)-1 ;
162
if (init_state_maps(*cs))
189
if (init_state_maps(*cs))
194
my_stpcpy(get_charsets_dir(fname), MY_CHARSET_INDEX);
195
charset_initialized=1;
167
charset_initialized= true;
197
pthread_mutex_unlock(&THR_LOCK_charset);
169
assert(charset_initialized);
203
void free_charsets(void)
177
charset_initialized= false;
179
while (not memory_vector.empty())
181
delete[] memory_vector.back();
182
memory_vector.pop_back();
205
charset_initialized=0;
243
268
cs->state|= MY_CS_READY;
270
pthread_mutex_unlock(&THR_LOCK_charset);
250
const CHARSET_INFO *get_charset(uint32_t cs_number)
275
const const CHARSET_INFO *get_charset(uint32_t cs_number, myf flags)
252
277
const CHARSET_INFO *cs;
253
278
if (cs_number == default_charset_info->number)
254
279
return default_charset_info;
256
281
(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
258
283
if (!cs_number || cs_number >= array_elements(all_charsets)-1)
261
286
cs= get_internal_charset(cs_number);
288
if (!cs && (flags & MY_WME))
290
char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)], cs_string[23];
291
my_stpcpy(get_charsets_dir(index_file),MY_CHARSET_INDEX);
293
int10_to_str(cs_number, cs_string+1, 10);
294
my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_string, index_file);
266
const CHARSET_INFO *get_charset_by_name(const char *cs_name)
299
const CHARSET_INFO *get_charset_by_name(const char *cs_name, myf flags)
268
301
uint32_t cs_number;
269
302
const CHARSET_INFO *cs;
270
303
(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
272
cs_number= get_collation_number(cs_name);
305
cs_number=get_collation_number(cs_name);
273
306
cs= cs_number ? get_internal_charset(cs_number) : NULL;
308
if (!cs && (flags & MY_WME))
310
char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
311
my_stpcpy(get_charsets_dir(index_file),MY_CHARSET_INDEX);
312
my_error(EE_UNKNOWN_COLLATION, MYF(ME_BELL), cs_name, index_file);
279
const CHARSET_INFO *get_charset_by_csname(const char *cs_name, uint32_t cs_flags)
319
const CHARSET_INFO *get_charset_by_csname(const char *cs_name,
281
323
uint32_t cs_number;
282
324
const CHARSET_INFO *cs;
286
328
cs_number= get_charset_number(cs_name, cs_flags);
287
329
cs= cs_number ? get_internal_charset(cs_number) : NULL;
331
if (!cs && (flags & MY_WME))
333
char index_file[FN_REFLEN + sizeof(MY_CHARSET_INDEX)];
334
my_stpcpy(get_charsets_dir(index_file),MY_CHARSET_INDEX);
335
my_error(EE_UNKNOWN_CHARSET, MYF(ME_BELL), cs_name, index_file);
343
Resolve character set by the character set name (utf8, latin1, ...).
345
The function tries to resolve the character set by the specified name. If
346
there is a character set with the given name, it is assigned to the "cs"
347
parameter and false is returned. If there is no such character set,
348
"default_cs" is assigned to "cs" and true is returned.
350
@param[in] cs_name Character set name.
351
@param[in] default_cs Default character set.
352
@param[out] cs Variable to store character set.
354
@return false if the character set was resolved successfully; true if there
355
is no character set with the given name.
358
bool resolve_charset(const char *cs_name,
359
const CHARSET_INFO *default_cs,
360
const CHARSET_INFO **cs)
362
*cs= get_charset_by_csname(cs_name, MY_CS_PRIMARY, MYF(0));
375
Resolve collation by the collation name (utf8_general_ci, ...).
377
The function tries to resolve the collation by the specified name. If there
378
is a collation with the given name, it is assigned to the "cl" parameter
379
and false is returned. If there is no such collation, "default_cl" is
380
assigned to "cl" and true is returned.
382
@param[out] cl Variable to store collation.
383
@param[in] cl_name Collation name.
384
@param[in] default_cl Default collation.
386
@return false if the collation was resolved successfully; true if there is no
387
collation with the given name.
390
bool resolve_collation(const char *cl_name,
391
const CHARSET_INFO *default_cl,
392
const CHARSET_INFO **cl)
394
*cl= get_charset_by_name(cl_name, MYF(0));
406
#ifdef BACKSLASH_MBTAIL
407
static CHARSET_INFO *fs_cset_cache= NULL;
409
CHARSET_INFO *fs_character_set()
414
GetLocaleInfo(LOCALE_SYSTEM_DEFAULT, LOCALE_IDEFAULTANSICODEPAGE,
415
buf+2, sizeof(buf)-3);
417
We cannot call get_charset_by_name here
418
because fs_character_set() is executed before
419
LOCK_THD_charset mutex initialization, which
420
is used inside get_charset_by_name.
421
As we're now interested in cp932 only,
422
let's just detect it using strcmp().
424
fs_cset_cache= !strcmp(buf, "cp932") ?
425
&my_charset_cp932_japanese_ci : &my_charset_bin;
427
return fs_cset_cache;
294
432
Escape apostrophes by doubling them up