13
13
along with this program; if not, write to the Free Software
14
14
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
18
#include <drizzled/charset.h>
19
#include <drizzled/error.h>
20
#include <drizzled/internal/m_string.h>
18
#include "drizzled/charset.h"
19
#include "drizzled/error.h"
20
#include "drizzled/charset_info.h"
21
#include "drizzled/internal/m_string.h"
21
22
#include <drizzled/configmake.h>
24
#include <drizzled/visibility.h>
26
25
using namespace std;
31
31
We collect memory in this vector that we free on delete.
33
static vector<unsigned char*> memory_vector;
35
extern charset_info_st my_charset_utf8mb4_icelandic_uca_ci;
36
extern charset_info_st my_charset_utf8mb4_latvian_uca_ci;
37
extern charset_info_st my_charset_utf8mb4_romanian_uca_ci;
38
extern charset_info_st my_charset_utf8mb4_slovenian_uca_ci;
39
extern charset_info_st my_charset_utf8mb4_polish_uca_ci;
40
extern charset_info_st my_charset_utf8mb4_estonian_uca_ci;
41
extern charset_info_st my_charset_utf8mb4_spanish_uca_ci;
42
extern charset_info_st my_charset_utf8mb4_swedish_uca_ci;
43
extern charset_info_st my_charset_utf8mb4_turkish_uca_ci;
44
extern charset_info_st my_charset_utf8mb4_czech_uca_ci;
45
extern charset_info_st my_charset_utf8mb4_danish_uca_ci;
46
extern charset_info_st my_charset_utf8mb4_lithuanian_uca_ci;
47
extern charset_info_st my_charset_utf8mb4_slovak_uca_ci;
48
extern charset_info_st my_charset_utf8mb4_spanish2_uca_ci;
49
extern charset_info_st my_charset_utf8mb4_roman_uca_ci;
50
extern charset_info_st my_charset_utf8mb4_persian_uca_ci;
51
extern charset_info_st my_charset_utf8mb4_esperanto_uca_ci;
52
extern charset_info_st my_charset_utf8mb4_hungarian_uca_ci;
53
extern charset_info_st my_charset_utf8mb4_sinhala_uca_ci;
33
static vector<void *>memory_vector;
56
36
The code below implements this functionality:
58
38
- Initializing charset related structures
59
39
- Loading dynamic charsets
60
- Searching for a proper charset_info_st
40
- Searching for a proper CHARSET_INFO
61
41
using charset name, collation name or collation ID
62
42
- Setting server default character set
65
bool my_charset_same(const charset_info_st *cs1, const charset_info_st *cs2)
45
bool my_charset_same(const CHARSET_INFO *cs1, const CHARSET_INFO *cs2)
67
return cs1 == cs2 || not strcmp(cs1->csname, cs2->csname);
47
return ((cs1 == cs2) || !strcmp(cs1->csname,cs2->csname));
70
static uint get_collation_number_internal(const char *name)
52
get_collation_number_internal(const char *name)
72
for (charset_info_st **cs= all_charsets;
73
cs < all_charsets+array_elements(all_charsets)-1;
55
for (cs= all_charsets;
56
cs < all_charsets+array_elements(all_charsets)-1 ;
76
if ( cs[0] && cs[0]->name && !my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->name, name))
59
if ( cs[0] && cs[0]->name &&
60
!my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->name, name))
78
61
return cs[0]->number;
84
static unsigned char* cs_alloc(size_t size)
86
memory_vector.push_back(new unsigned char[size]);
87
return memory_vector.back();
90
static void init_state_maps(charset_info_st *cs)
92
cs->state_map= cs_alloc(256);
93
cs->ident_map= cs_alloc(256);
95
unsigned char *state_map= cs->state_map;
96
unsigned char *ident_map= cs->ident_map;
67
static bool init_state_maps(CHARSET_INFO *cs)
70
unsigned char *state_map;
71
unsigned char *ident_map;
73
if (!(cs->state_map= (unsigned char*) cs_alloc(256)))
76
if (!(cs->ident_map= (unsigned char*) cs_alloc(256)))
79
state_map= cs->state_map;
80
ident_map= cs->ident_map;
98
82
/* Fill state_map with states to get a faster parser */
99
for (int i= 0; i < 256; i++)
83
for (i=0; i < 256 ; i++)
101
85
if (my_isalpha(cs,i))
102
state_map[i]= MY_LEX_IDENT;
86
state_map[i]=(unsigned char) MY_LEX_IDENT;
103
87
else if (my_isdigit(cs,i))
104
state_map[i]= MY_LEX_NUMBER_IDENT;
88
state_map[i]=(unsigned char) MY_LEX_NUMBER_IDENT;
105
89
else if (my_mbcharlen(cs, i)>1)
106
state_map[i]= MY_LEX_IDENT;
90
state_map[i]=(unsigned char) MY_LEX_IDENT;
107
91
else if (my_isspace(cs,i))
108
state_map[i]= MY_LEX_SKIP;
92
state_map[i]=(unsigned char) MY_LEX_SKIP;
110
state_map[i]= MY_LEX_CHAR;
94
state_map[i]=(unsigned char) MY_LEX_CHAR;
112
state_map['_']=state_map['$']= MY_LEX_IDENT;
113
state_map['\'']= MY_LEX_STRING;
114
state_map['.']= MY_LEX_REAL_OR_POINT;
115
state_map['>']=state_map['=']=state_map['!']= MY_LEX_CMP_OP;
116
state_map['<']= MY_LEX_LONG_CMP_OP;
117
state_map['&']=state_map['|']= MY_LEX_BOOL;
118
state_map['#']= MY_LEX_COMMENT;
119
state_map[';']= MY_LEX_SEMICOLON;
120
state_map[':']= MY_LEX_SET_VAR;
121
state_map[0]= MY_LEX_EOL;
122
state_map['\\']= MY_LEX_ESCAPE;
123
state_map['/']= MY_LEX_LONG_COMMENT;
124
state_map['*']= MY_LEX_END_LONG_COMMENT;
125
state_map['@']= MY_LEX_USER_END;
126
state_map['`']= MY_LEX_USER_VARIABLE_DELIMITER;
127
state_map['"']= MY_LEX_STRING_OR_DELIMITER;
96
state_map[(unsigned char)'_']=state_map[(unsigned char)'$']=(unsigned char) MY_LEX_IDENT;
97
state_map[(unsigned char)'\'']=(unsigned char) MY_LEX_STRING;
98
state_map[(unsigned char)'.']=(unsigned char) MY_LEX_REAL_OR_POINT;
99
state_map[(unsigned char)'>']=state_map[(unsigned char)'=']=state_map[(unsigned char)'!']= (unsigned char) MY_LEX_CMP_OP;
100
state_map[(unsigned char)'<']= (unsigned char) MY_LEX_LONG_CMP_OP;
101
state_map[(unsigned char)'&']=state_map[(unsigned char)'|']=(unsigned char) MY_LEX_BOOL;
102
state_map[(unsigned char)'#']=(unsigned char) MY_LEX_COMMENT;
103
state_map[(unsigned char)';']=(unsigned char) MY_LEX_SEMICOLON;
104
state_map[(unsigned char)':']=(unsigned char) MY_LEX_SET_VAR;
105
state_map[0]=(unsigned char) MY_LEX_EOL;
106
state_map[(unsigned char)'\\']= (unsigned char) MY_LEX_ESCAPE;
107
state_map[(unsigned char)'/']= (unsigned char) MY_LEX_LONG_COMMENT;
108
state_map[(unsigned char)'*']= (unsigned char) MY_LEX_END_LONG_COMMENT;
109
state_map[(unsigned char)'@']= (unsigned char) MY_LEX_USER_END;
110
state_map[(unsigned char) '`']= (unsigned char) MY_LEX_USER_VARIABLE_DELIMITER;
111
state_map[(unsigned char)'"']= (unsigned char) MY_LEX_STRING_OR_DELIMITER;
130
114
Create a second map to make it faster to find identifiers
132
for (int i= 0; i < 256; i++)
116
for (i=0; i < 256 ; i++)
134
ident_map[i]= state_map[i] == MY_LEX_IDENT || state_map[i] == MY_LEX_NUMBER_IDENT;
118
ident_map[i]= (unsigned char) (state_map[i] == MY_LEX_IDENT ||
119
state_map[i] == MY_LEX_NUMBER_IDENT);
137
122
/* Special handling of hex and binary strings */
138
state_map['x']= state_map['X']= MY_LEX_IDENT_OR_HEX;
139
state_map['b']= state_map['B']= MY_LEX_IDENT_OR_BIN;
123
state_map[(unsigned char)'x']= state_map[(unsigned char)'X']= (unsigned char) MY_LEX_IDENT_OR_HEX;
124
state_map[(unsigned char)'b']= state_map[(unsigned char)'B']= (unsigned char) MY_LEX_IDENT_OR_BIN;
142
129
static bool charset_initialized= false;
144
DRIZZLED_API charset_info_st *all_charsets[256];
145
const DRIZZLED_API charset_info_st *default_charset_info = &my_charset_utf8_general_ci;
131
CHARSET_INFO *all_charsets[256];
132
const CHARSET_INFO *default_charset_info = &my_charset_utf8_general_ci;
147
static void add_compiled_collation(charset_info_st * cs)
134
void add_compiled_collation(CHARSET_INFO * cs)
149
136
all_charsets[cs->number]= cs;
150
137
cs->state|= MY_CS_AVAILABLE;
153
static void init_compiled_charsets()
140
void *cs_alloc(size_t size)
155
add_compiled_collation(&my_charset_bin);
157
add_compiled_collation(&my_charset_utf8mb4_general_ci);
158
add_compiled_collation(&my_charset_utf8mb4_bin);
159
add_compiled_collation(&my_charset_utf8mb4_unicode_ci);
160
add_compiled_collation(&my_charset_utf8mb4_icelandic_uca_ci);
161
add_compiled_collation(&my_charset_utf8mb4_latvian_uca_ci);
162
add_compiled_collation(&my_charset_utf8mb4_romanian_uca_ci);
163
add_compiled_collation(&my_charset_utf8mb4_slovenian_uca_ci);
164
add_compiled_collation(&my_charset_utf8mb4_polish_uca_ci);
165
add_compiled_collation(&my_charset_utf8mb4_estonian_uca_ci);
166
add_compiled_collation(&my_charset_utf8mb4_spanish_uca_ci);
167
add_compiled_collation(&my_charset_utf8mb4_swedish_uca_ci);
168
add_compiled_collation(&my_charset_utf8mb4_turkish_uca_ci);
169
add_compiled_collation(&my_charset_utf8mb4_czech_uca_ci);
170
add_compiled_collation(&my_charset_utf8mb4_danish_uca_ci);
171
add_compiled_collation(&my_charset_utf8mb4_lithuanian_uca_ci);
172
add_compiled_collation(&my_charset_utf8mb4_slovak_uca_ci);
173
add_compiled_collation(&my_charset_utf8mb4_spanish2_uca_ci);
174
add_compiled_collation(&my_charset_utf8mb4_roman_uca_ci);
175
add_compiled_collation(&my_charset_utf8mb4_persian_uca_ci);
176
add_compiled_collation(&my_charset_utf8mb4_esperanto_uca_ci);
177
add_compiled_collation(&my_charset_utf8mb4_hungarian_uca_ci);
178
add_compiled_collation(&my_charset_utf8mb4_sinhala_uca_ci);
142
void *ptr= malloc(size);
144
memory_vector.push_back(ptr);
181
static void init_available_charsets()
151
static bool init_available_charsets(myf myflags)
184
155
We have to use charset_initialized to not lock on THR_LOCK_charset
185
156
inside get_internal_charset...
187
if (charset_initialized)
189
memset(&all_charsets, 0, sizeof(all_charsets));
190
init_compiled_charsets();
192
/* Copy compiled charsets */
193
for (charset_info_st**cs= all_charsets;
194
cs < all_charsets+array_elements(all_charsets)-1;
158
if (charset_initialized == false)
197
if (*cs && cs[0]->ctype)
198
init_state_maps(*cs);
161
memset(&all_charsets, 0, sizeof(all_charsets));
162
init_compiled_charsets(myflags);
164
/* Copy compiled charsets */
165
for (cs=all_charsets;
166
cs < all_charsets+array_elements(all_charsets)-1 ;
172
if (init_state_maps(*cs))
177
charset_initialized= true;
179
assert(charset_initialized);
185
void free_charsets(void)
201
187
charset_initialized= true;
206
charset_initialized= false;
208
while (not memory_vector.empty())
189
while (memory_vector.empty() == false)
210
delete[] memory_vector.back();
191
void *ptr= memory_vector.back();
211
192
memory_vector.pop_back();
195
memory_vector.clear();
215
200
uint32_t get_collation_number(const char *name)
217
init_available_charsets();
202
init_available_charsets(MYF(0));
218
203
return get_collation_number_internal(name);
221
207
uint32_t get_charset_number(const char *charset_name, uint32_t cs_flags)
223
charset_info_st **cs;
224
init_available_charsets();
210
init_available_charsets(MYF(0));
226
212
for (cs= all_charsets;
227
213
cs < all_charsets+array_elements(all_charsets)-1 ;
230
if ( cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) && !my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->csname, charset_name))
216
if ( cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) &&
217
!my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->csname, charset_name))
231
218
return cs[0]->number;
236
224
const char *get_charset_name(uint32_t charset_number)
238
init_available_charsets();
239
const charset_info_st* cs= all_charsets[charset_number];
240
return cs && cs->number == charset_number && cs->name ? cs->name : "?";
226
const CHARSET_INFO *cs;
227
init_available_charsets(MYF(0));
229
cs=all_charsets[charset_number];
230
if (cs && (cs->number == charset_number) && cs->name )
231
return (char*) cs->name;
233
return (char*) "?"; /* this mimics find_type() */
243
static const charset_info_st *get_internal_charset(uint32_t cs_number)
237
static const CHARSET_INFO *get_internal_charset(uint32_t cs_number)
245
charset_info_st* cs= all_charsets[cs_number];
247
241
To make things thread safe we are not allowing other threads to interfere
248
242
while we may be changing the cs_info_table
252
assert(not (not (cs->state & MY_CS_COMPILED) && not (cs->state & MY_CS_LOADED)));
253
if (not (cs->state & MY_CS_AVAILABLE))
255
if (not (cs->state & MY_CS_READY))
257
if (cs->coll->init && cs->coll->init(*cs, cs_alloc))
259
cs->state|= MY_CS_READY;
244
if ((cs= all_charsets[cs_number]))
246
if (!(cs->state & MY_CS_COMPILED) && !(cs->state & MY_CS_LOADED))
250
cs= (cs->state & MY_CS_AVAILABLE) ? cs : NULL;
252
if (cs && !(cs->state & MY_CS_READY))
254
if ((cs->cset->init && cs->cset->init(cs, cs_alloc)) ||
255
(cs->coll->init && cs->coll->init(cs, cs_alloc)))
258
cs->state|= MY_CS_READY;
264
const charset_info_st *get_charset(uint32_t cs_number)
265
const CHARSET_INFO *get_charset(uint32_t cs_number)
267
const CHARSET_INFO *cs;
266
268
if (cs_number == default_charset_info->number)
267
269
return default_charset_info;
269
init_available_charsets(); /* If it isn't initialized */
271
(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
271
273
if (!cs_number || cs_number >= array_elements(all_charsets)-1)
274
return get_internal_charset(cs_number);
277
const charset_info_st *get_charset_by_name(const char *cs_name)
279
init_available_charsets(); /* If it isn't initialized */
280
uint32_t cs_number= get_collation_number(cs_name);
281
return cs_number ? get_internal_charset(cs_number) : NULL;
284
const charset_info_st *get_charset_by_csname(const char *cs_name, uint32_t cs_flags)
286
init_available_charsets(); /* If it isn't initialized */
287
uint32_t cs_number= get_charset_number(cs_name, cs_flags);
288
return cs_number ? get_internal_charset(cs_number) : NULL;
276
cs= get_internal_charset(cs_number);
281
const CHARSET_INFO *get_charset_by_name(const char *cs_name)
284
const CHARSET_INFO *cs;
285
(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
287
cs_number= get_collation_number(cs_name);
288
cs= cs_number ? get_internal_charset(cs_number) : NULL;
294
const CHARSET_INFO *get_charset_by_csname(const char *cs_name, uint32_t cs_flags)
297
const CHARSET_INFO *cs;
299
(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
301
cs_number= get_charset_number(cs_name, cs_flags);
302
cs= cs_number ? get_internal_charset(cs_number) : NULL;