~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/* Copyright (C) 2000 MySQL AB
2
3
   This program is free software; you can redistribute it and/or modify
4
   it under the terms of the GNU General Public License as published by
5
   the Free Software Foundation; version 2 of the License.
6
7
   This program is distributed in the hope that it will be useful,
8
   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
   GNU General Public License for more details.
11
12
   You should have received a copy of the GNU General Public License
13
   along with this program; if not, write to the Free Software
1802.10.2 by Monty Taylor
Update all of the copyright headers to include the correct address.
14
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
1 by brian
clean slate
15
2173.2.1 by Monty Taylor
Fixes incorrect usage of include
16
#include <config.h>
1241.9.57 by Monty Taylor
Oy. Bigger change than I normally like - but this stuff is all intertwined.
17
2173.2.1 by Monty Taylor
Fixes incorrect usage of include
18
#include <drizzled/charset.h>
19
#include <drizzled/error.h>
20
#include <drizzled/internal/m_string.h>
722.1.4 by Monty Taylor
Removed all the setting of DEFS everywhere. Use configmake.h to get the values
21
#include <drizzled/configmake.h>
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
22
#include <vector>
23
2173.2.1 by Monty Taylor
Fixes incorrect usage of include
24
#include <drizzled/visibility.h>
2119.4.1 by Monty Taylor
Turns on -fvisibility=hidden by default. Symbols intended to be used by
25
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
26
using namespace std;
27
2318.4.7 by Olaf van der Spek
Refactor
28
namespace drizzled {
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
29
30
/*
31
  Every buffer handed out by cs_alloc() is recorded in this vector so that
  free_charsets() can release them all.
32
*/
2160.1.2 by Olaf van der Spek
casts
33
static vector<unsigned char*> memory_vector;
1 by brian
clean slate
34
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
35
extern charset_info_st my_charset_utf8mb4_icelandic_uca_ci;
36
extern charset_info_st my_charset_utf8mb4_latvian_uca_ci;
37
extern charset_info_st my_charset_utf8mb4_romanian_uca_ci;
38
extern charset_info_st my_charset_utf8mb4_slovenian_uca_ci;
39
extern charset_info_st my_charset_utf8mb4_polish_uca_ci;
40
extern charset_info_st my_charset_utf8mb4_estonian_uca_ci;
41
extern charset_info_st my_charset_utf8mb4_spanish_uca_ci;
42
extern charset_info_st my_charset_utf8mb4_swedish_uca_ci;
43
extern charset_info_st my_charset_utf8mb4_turkish_uca_ci;
44
extern charset_info_st my_charset_utf8mb4_czech_uca_ci;
45
extern charset_info_st my_charset_utf8mb4_danish_uca_ci;
46
extern charset_info_st my_charset_utf8mb4_lithuanian_uca_ci;
47
extern charset_info_st my_charset_utf8mb4_slovak_uca_ci;
48
extern charset_info_st my_charset_utf8mb4_spanish2_uca_ci;
49
extern charset_info_st my_charset_utf8mb4_roman_uca_ci;
50
extern charset_info_st my_charset_utf8mb4_persian_uca_ci;
51
extern charset_info_st my_charset_utf8mb4_esperanto_uca_ci;
52
extern charset_info_st my_charset_utf8mb4_hungarian_uca_ci;
53
extern charset_info_st my_charset_utf8mb4_sinhala_uca_ci;
54
1 by brian
clean slate
55
/*
56
  The code below implements this functionality:
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
57
1 by brian
clean slate
58
    - Initializing charset related structures
59
    - Loading dynamic charsets
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
60
    - Searching for a proper charset_info_st
1 by brian
clean slate
61
      using charset name, collation name or collation ID
62
    - Setting server default character set
63
*/
64
2318.6.17 by Olaf van der Spek
Silly icc
65
bool my_charset_same(const charset_info_st *cs1, const charset_info_st *cs2)
66
{
67
  return cs1 == cs2 || not strcmp(cs1->csname, cs2->csname);
1 by brian
clean slate
68
}
69
2318.4.7 by Olaf van der Spek
Refactor
70
/*
  Find a collation in all_charsets by its name.

  Names are compared case-insensitively through my_strcasecmp() using
  my_charset_utf8_general_ci. Empty slots and entries without a name are
  skipped, and the last slot of all_charsets is not examined.

  RETURN
    The number of the first matching entry, or 0 when nothing matches.
*/
static uint get_collation_number_internal(const char *name)
{
  charset_info_st **const last= all_charsets + array_elements(all_charsets) - 1;

  for (charset_info_st **slot= all_charsets; slot < last; ++slot)
  {
    charset_info_st *candidate= *slot;
    if (not candidate || not candidate->name)
      continue;
    if (my_strcasecmp(&my_charset_utf8_general_ci, candidate->name, name) == 0)
      return candidate->number;
  }
  return 0;
}
83
2318.4.7 by Olaf van der Spek
Refactor
84
static unsigned char* cs_alloc(size_t size)
2160.1.2 by Olaf van der Spek
casts
85
{
86
  memory_vector.push_back(new unsigned char[size]);
87
  return memory_vector.back();
88
}
1 by brian
clean slate
89
2318.4.7 by Olaf van der Spek
Refactor
90
/*
  Build the two 256-entry lookup tables of a charset:

    state_map   maps every byte value to a MY_LEX_* state
    ident_map   is non-zero for bytes whose state is MY_LEX_IDENT or
                MY_LEX_NUMBER_IDENT

  Both tables are obtained from cs_alloc(). ident_map is derived BEFORE the
  hex/binary prefixes (x, X, b, B) are re-classified, so those letters still
  count as identifier characters in ident_map.
*/
static void init_state_maps(charset_info_st *cs)
{
  cs->state_map= cs_alloc(256);
  cs->ident_map= cs_alloc(256);

  unsigned char *states= cs->state_map;
  unsigned char *idents= cs->ident_map;

  /* Default classification of every byte; the order of the tests matters. */
  for (int ch= 0; ch < 256; ch++)
  {
    unsigned char state;
    if (my_isalpha(cs, ch))
      state= MY_LEX_IDENT;
    else if (my_isdigit(cs, ch))
      state= MY_LEX_NUMBER_IDENT;
    else if (my_mbcharlen(cs, ch) > 1)
      state= MY_LEX_IDENT;
    else if (my_isspace(cs, ch))
      state= MY_LEX_SKIP;
    else
      state= MY_LEX_CHAR;
    states[ch]= state;
  }

  /* Bytes with a dedicated meaning override the default classification. */
  states['$']= MY_LEX_IDENT;
  states['_']= MY_LEX_IDENT;
  states['\'']= MY_LEX_STRING;
  states['.']= MY_LEX_REAL_OR_POINT;
  states['!']= MY_LEX_CMP_OP;
  states['=']= MY_LEX_CMP_OP;
  states['>']= MY_LEX_CMP_OP;
  states['<']= MY_LEX_LONG_CMP_OP;
  states['|']= MY_LEX_BOOL;
  states['&']= MY_LEX_BOOL;
  states['#']= MY_LEX_COMMENT;
  states[';']= MY_LEX_SEMICOLON;
  states[':']= MY_LEX_SET_VAR;
  states[0]= MY_LEX_EOL;
  states['\\']= MY_LEX_ESCAPE;
  states['/']= MY_LEX_LONG_COMMENT;
  states['*']= MY_LEX_END_LONG_COMMENT;
  states['@']= MY_LEX_USER_END;
  states['`']= MY_LEX_USER_VARIABLE_DELIMITER;
  states['"']= MY_LEX_STRING_OR_DELIMITER;

  /* Second table: a quick "can this byte be part of an identifier" test. */
  for (int ch= 0; ch < 256; ch++)
  {
    const bool is_ident= states[ch] == MY_LEX_IDENT ||
                         states[ch] == MY_LEX_NUMBER_IDENT;
    idents[ch]= is_ident;
  }

  /* Hex (x'..') and binary (b'..') string prefixes get their own states. */
  states['X']= MY_LEX_IDENT_OR_HEX;
  states['x']= MY_LEX_IDENT_OR_HEX;
  states['B']= MY_LEX_IDENT_OR_BIN;
  states['b']= MY_LEX_IDENT_OR_BIN;
}
141
861 by Brian Aker
Remove THR_LOCK_charset (we never recall it anymore)
142
static bool charset_initialized= false;
1 by brian
clean slate
143
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
144
DRIZZLED_API charset_info_st *all_charsets[256];
145
const DRIZZLED_API charset_info_st *default_charset_info = &my_charset_utf8_general_ci;
1 by brian
clean slate
146
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
147
/*
  Register a collation: flag it MY_CS_AVAILABLE and store it in all_charsets
  at the index given by its own number.
*/
static void add_compiled_collation(charset_info_st * cs)
{
  cs->state|= MY_CS_AVAILABLE;
  all_charsets[cs->number]= cs;
}
152
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
153
static void init_compiled_charsets()
154
{
155
  add_compiled_collation(&my_charset_bin);
156
157
  add_compiled_collation(&my_charset_utf8mb4_general_ci);
158
  add_compiled_collation(&my_charset_utf8mb4_bin);
159
  add_compiled_collation(&my_charset_utf8mb4_unicode_ci);
160
  add_compiled_collation(&my_charset_utf8mb4_icelandic_uca_ci);
161
  add_compiled_collation(&my_charset_utf8mb4_latvian_uca_ci);
162
  add_compiled_collation(&my_charset_utf8mb4_romanian_uca_ci);
163
  add_compiled_collation(&my_charset_utf8mb4_slovenian_uca_ci);
164
  add_compiled_collation(&my_charset_utf8mb4_polish_uca_ci);
165
  add_compiled_collation(&my_charset_utf8mb4_estonian_uca_ci);
166
  add_compiled_collation(&my_charset_utf8mb4_spanish_uca_ci);
167
  add_compiled_collation(&my_charset_utf8mb4_swedish_uca_ci);
168
  add_compiled_collation(&my_charset_utf8mb4_turkish_uca_ci);
169
  add_compiled_collation(&my_charset_utf8mb4_czech_uca_ci);
170
  add_compiled_collation(&my_charset_utf8mb4_danish_uca_ci);
171
  add_compiled_collation(&my_charset_utf8mb4_lithuanian_uca_ci);
172
  add_compiled_collation(&my_charset_utf8mb4_slovak_uca_ci);
173
  add_compiled_collation(&my_charset_utf8mb4_spanish2_uca_ci);
174
  add_compiled_collation(&my_charset_utf8mb4_roman_uca_ci);
175
  add_compiled_collation(&my_charset_utf8mb4_persian_uca_ci);
176
  add_compiled_collation(&my_charset_utf8mb4_esperanto_uca_ci);
177
  add_compiled_collation(&my_charset_utf8mb4_hungarian_uca_ci);
178
  add_compiled_collation(&my_charset_utf8mb4_sinhala_uca_ci);
179
}
180
181
static void init_available_charsets()
1 by brian
clean slate
182
{
183
  /*
184
    We have to use charset_initialized to not lock on THR_LOCK_charset
185
    inside get_internal_charset...
186
  */
2318.4.7 by Olaf van der Spek
Refactor
187
  if (charset_initialized)
188
    return;
189
  memset(&all_charsets, 0, sizeof(all_charsets));
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
190
  init_compiled_charsets();
2318.4.7 by Olaf van der Spek
Refactor
191
192
  /* Copy compiled charsets */
193
  for (charset_info_st**cs= all_charsets;
194
    cs < all_charsets+array_elements(all_charsets)-1;
195
    cs++)
1 by brian
clean slate
196
  {
2318.4.7 by Olaf van der Spek
Refactor
197
    if (*cs && cs[0]->ctype)
198
      init_state_maps(*cs);
1 by brian
clean slate
199
  }
861 by Brian Aker
Remove THR_LOCK_charset (we never recall it anymore)
200
2318.4.7 by Olaf van der Spek
Refactor
201
  charset_initialized= true;
1 by brian
clean slate
202
}
203
2160.1.2 by Olaf van der Spek
casts
204
void free_charsets()
1 by brian
clean slate
205
{
2160.1.7 by Olaf van der Spek
fix
206
  charset_initialized= false;
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
207
2160.1.6 by Olaf van der Spek
USe "not" instead of "!"
208
  while (not memory_vector.empty())
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
209
  {
2160.1.2 by Olaf van der Spek
casts
210
    delete[] memory_vector.back();
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
211
    memory_vector.pop_back();
212
  }
1 by brian
clean slate
213
}
214
482 by Brian Aker
Remove uint.
215
uint32_t get_collation_number(const char *name)
1 by brian
clean slate
216
{
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
217
  init_available_charsets();
1 by brian
clean slate
218
  return get_collation_number_internal(name);
219
}
220
482 by Brian Aker
Remove uint.
221
uint32_t get_charset_number(const char *charset_name, uint32_t cs_flags)
1 by brian
clean slate
222
{
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
223
  charset_info_st **cs;
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
224
  init_available_charsets();
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
225
1 by brian
clean slate
226
  for (cs= all_charsets;
227
       cs < all_charsets+array_elements(all_charsets)-1 ;
228
       cs++)
229
  {
2085.2.3 by Brian Aker
Fix strcasecmp issues (ie, check UTF-8).
230
    if ( cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) && !my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->csname, charset_name))
1 by brian
clean slate
231
      return cs[0]->number;
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
232
  }
1 by brian
clean slate
233
  return 0;
234
}
235
482 by Brian Aker
Remove uint.
236
const char *get_charset_name(uint32_t charset_number)
1 by brian
clean slate
237
{
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
238
  init_available_charsets();
239
  const charset_info_st* cs= all_charsets[charset_number];
240
  return cs && cs->number == charset_number && cs->name ? cs->name : "?";
1 by brian
clean slate
241
}
242
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
243
/*
  Return the charset stored at all_charsets[cs_number], preparing it for use
  on first access.

  cs_number is used as an index without a range check; callers must pass a
  valid index. No locking is performed here.

  RETURN
    NULL  the slot is empty, the entry is not MY_CS_AVAILABLE, or the
          collation's init routine returned non-zero
    else  the entry, with MY_CS_READY set
*/
static const charset_info_st *get_internal_charset(uint32_t cs_number)
{
  charset_info_st *entry= all_charsets[cs_number];
  if (entry == NULL)
    return NULL;

  /* Every registered entry must be either compiled in or loaded */
  assert((entry->state & MY_CS_COMPILED) || (entry->state & MY_CS_LOADED));

  if ((entry->state & MY_CS_AVAILABLE) == 0)
    return NULL;

  if (entry->state & MY_CS_READY)
    return entry;

  /* First use: let the collation set itself up, allocating via cs_alloc */
  if (entry->coll->init && entry->coll->init(*entry, cs_alloc))
    return NULL;

  entry->state|= MY_CS_READY;
  return entry;
}
263
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
264
/*
  Look up a charset by number.

  The default charset is returned directly when its number matches, before
  the table is even initialized. Otherwise the table is initialized if
  needed and the number must be non-zero and below the last slot of
  all_charsets.

  RETURN
    The charset, or NULL when the number is out of range or
    get_internal_charset() cannot provide the entry.
*/
const charset_info_st *get_charset(uint32_t cs_number)
{
  const charset_info_st *fallback= default_charset_info;
  if (fallback->number == cs_number)
    return fallback;

  init_available_charsets();	/* If it isn't initialized */

  const bool in_range= cs_number != 0 &&
                       cs_number < array_elements(all_charsets) - 1;
  return in_range ? get_internal_charset(cs_number) : NULL;
}
276
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
277
/*
  Look up a charset by collation name.

  RETURN
    The charset, or NULL when get_collation_number() does not know the name
    or get_internal_charset() cannot provide the entry.
*/
const charset_info_st *get_charset_by_name(const char *cs_name)
{
  init_available_charsets();	/* If it isn't initialized */

  const uint32_t number= get_collation_number(cs_name);
  if (number == 0)
    return NULL;
  return get_internal_charset(number);
}
283
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
284
/*
  Look up a charset by character set name, restricted to entries whose state
  matches cs_flags (see get_charset_number()).

  RETURN
    The charset, or NULL when no entry matches or get_internal_charset()
    cannot provide the entry.
*/
const charset_info_st *get_charset_by_csname(const char *cs_name, uint32_t cs_flags)
{
  init_available_charsets();	/* If it isn't initialized */

  const uint32_t number= get_charset_number(cs_name, cs_flags);
  if (number == 0)
    return NULL;
  return get_internal_charset(number);
}
290
291
292
/*
293
  Escape apostrophes by doubling them up
294
295
  SYNOPSIS
236.3.4 by Andrey Hristov
Rename escape_(string|quotes)_for_mysql to escape_(string|quotes)_for_drizzle
296
    escape_quotes_for_drizzle()
1 by brian
clean slate
297
    charset_info        Charset of the strings
298
    to                  Buffer for escaped string
299
    to_length           Length of destination buffer, or 0
300
    from                The string to escape
301
    length              The length of the string to escape
302
303
  DESCRIPTION
304
    This escapes the contents of a string by doubling up any apostrophes that
305
    it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
306
    effect on the server.
307
308
  NOTE
309
    To be consistent with escape_string_for_mysql(), to_length may be 0 to
310
    mean "big enough"
311
312
  RETURN VALUES
365.2.9 by Monty Taylor
Got rid of all instances of ~0
313
    UINT32_MAX  The escaped string did not fit in the to buffer
1 by brian
clean slate
314
    >=0         The length of the escaped string
315
*/
316
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
317
size_t escape_quotes_for_drizzle(const charset_info_st *charset_info,
236.3.4 by Andrey Hristov
Rename escape_(string|quotes)_for_mysql to escape_(string|quotes)_for_drizzle
318
                                 char *to, size_t to_length,
319
                                 const char *from, size_t length)
1 by brian
clean slate
320
{
321
  const char *to_start= to;
322
  const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
163 by Brian Aker
Merge Monty's code.
323
  bool overflow= false;
146 by Brian Aker
my_bool cleanup.
324
  bool use_mb_flag= use_mb(charset_info);
1 by brian
clean slate
325
  for (end= from + length; from < end; from++)
326
  {
327
    int tmp_length;
328
    if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
329
    {
330
      if (to + tmp_length > to_end)
331
      {
163 by Brian Aker
Merge Monty's code.
332
        overflow= true;
1 by brian
clean slate
333
        break;
334
      }
335
      while (tmp_length--)
336
	*to++= *from++;
337
      from--;
338
      continue;
339
    }
340
    /*
341
      We don't have the same issue here with a non-multi-byte character being
342
      turned into a multi-byte character by the addition of an escaping
343
      character, because we are only escaping the ' character with itself.
344
     */
345
    if (*from == '\'')
346
    {
347
      if (to + 2 > to_end)
348
      {
163 by Brian Aker
Merge Monty's code.
349
        overflow= true;
1 by brian
clean slate
350
        break;
351
      }
352
      *to++= '\'';
353
      *to++= '\'';
354
    }
355
    else
356
    {
357
      if (to + 1 > to_end)
358
      {
163 by Brian Aker
Merge Monty's code.
359
        overflow= true;
1 by brian
clean slate
360
        break;
361
      }
362
      *to++= *from;
363
    }
364
  }
365
  *to= 0;
2160.1.2 by Olaf van der Spek
casts
366
  return overflow ? UINT32_MAX : to - to_start;
1 by brian
clean slate
367
}
1280.1.10 by Monty Taylor
Put everything in drizzled into drizzled namespace.
368
369
} /* namespace drizzled */