~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/* Copyright (C) 2000 MySQL AB
2
3
   This program is free software; you can redistribute it and/or modify
4
   it under the terms of the GNU General Public License as published by
5
   the Free Software Foundation; version 2 of the License.
6
7
   This program is distributed in the hope that it will be useful,
8
   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
   GNU General Public License for more details.
11
12
   You should have received a copy of the GNU General Public License
13
   along with this program; if not, write to the Free Software
1802.10.2 by Monty Taylor
Update all of the copyright headers to include the correct address.
14
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
1 by brian
clean slate
15
2173.2.1 by Monty Taylor
Fixes incorrect usage of include
16
#include <config.h>
1241.9.57 by Monty Taylor
Oy. Bigger change than I normally like - but this stuff is all intertwined.
17
2173.2.1 by Monty Taylor
Fixes incorrect usage of include
18
#include <drizzled/charset.h>
19
#include <drizzled/error.h>
20
#include <drizzled/internal/m_string.h>
722.1.4 by Monty Taylor
Removed all the setting of DEFS everywhere. Use configmake.h to get the values
21
#include <drizzled/configmake.h>
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
22
#include <vector>
23
2173.2.1 by Monty Taylor
Fixes incorrect usage of include
24
#include <drizzled/visibility.h>
2119.4.1 by Monty Taylor
Turns on -fvisibility=hidden by default. Symbols intended to be used by
25
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
26
using namespace std;
27
2318.4.7 by Olaf van der Spek
Refactor
28
namespace drizzled {
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
29
30
/*
31
  Every buffer handed out by cs_alloc() is recorded in this vector so that
  free_charsets() can release them all.
32
*/
2160.1.2 by Olaf van der Spek
casts
33
static vector<unsigned char*> memory_vector;
1 by brian
clean slate
34
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
35
extern charset_info_st my_charset_utf8mb4_icelandic_uca_ci;
36
extern charset_info_st my_charset_utf8mb4_latvian_uca_ci;
37
extern charset_info_st my_charset_utf8mb4_romanian_uca_ci;
38
extern charset_info_st my_charset_utf8mb4_slovenian_uca_ci;
39
extern charset_info_st my_charset_utf8mb4_polish_uca_ci;
40
extern charset_info_st my_charset_utf8mb4_estonian_uca_ci;
41
extern charset_info_st my_charset_utf8mb4_spanish_uca_ci;
42
extern charset_info_st my_charset_utf8mb4_swedish_uca_ci;
43
extern charset_info_st my_charset_utf8mb4_turkish_uca_ci;
44
extern charset_info_st my_charset_utf8mb4_czech_uca_ci;
45
extern charset_info_st my_charset_utf8mb4_danish_uca_ci;
46
extern charset_info_st my_charset_utf8mb4_lithuanian_uca_ci;
47
extern charset_info_st my_charset_utf8mb4_slovak_uca_ci;
48
extern charset_info_st my_charset_utf8mb4_spanish2_uca_ci;
49
extern charset_info_st my_charset_utf8mb4_roman_uca_ci;
50
extern charset_info_st my_charset_utf8mb4_persian_uca_ci;
51
extern charset_info_st my_charset_utf8mb4_esperanto_uca_ci;
52
extern charset_info_st my_charset_utf8mb4_hungarian_uca_ci;
53
extern charset_info_st my_charset_utf8mb4_sinhala_uca_ci;
54
1 by brian
clean slate
55
/*
56
  The code below implements this functionality:
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
57
1 by brian
clean slate
58
    - Initializing charset related structures
59
    - Loading dynamic charsets
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
60
    - Searching for a proper charset_info_st
1 by brian
clean slate
61
      using charset name, collation name or collation ID
62
    - Setting server default character set
63
*/
64
2318.6.17 by Olaf van der Spek
Silly icc
65
bool my_charset_same(const charset_info_st *cs1, const charset_info_st *cs2)
66
{
67
  return cs1 == cs2 || not strcmp(cs1->csname, cs2->csname);
1 by brian
clean slate
68
}
69
2318.4.7 by Olaf van der Spek
Refactor
70
/*
  Look up a collation by name in all_charsets.

  The name comparison is case-insensitive and goes through
  my_charset_utf8_general_ci. The last slot of all_charsets is not scanned.

  Returns the matching collation's number, or 0 when nothing matches.
*/
static uint get_collation_number_internal(const char *name)
{
  const size_t scanned= array_elements(all_charsets) - 1;
  for (size_t idx= 0; idx < scanned; idx++)
  {
    const charset_info_st *candidate= all_charsets[idx];
    /* Skip empty slots and entries without a collation name. */
    if (not candidate || not candidate->name)
      continue;
    if (not my_charset_utf8_general_ci.strcasecmp(candidate->name, name))
      return candidate->number;
  }
  return 0;
}
81
2318.4.7 by Olaf van der Spek
Refactor
82
static unsigned char* cs_alloc(size_t size)
2160.1.2 by Olaf van der Spek
casts
83
{
84
  memory_vector.push_back(new unsigned char[size]);
85
  return memory_vector.back();
86
}
1 by brian
clean slate
87
2318.4.7 by Olaf van der Spek
Refactor
88
/*
  Build the two 256-entry lookup tables of a charset:

    state_map  maps each byte value to the lexer state it starts
    ident_map  is non-zero for byte values that may be part of an identifier

  Both tables are obtained from cs_alloc().
*/
static void init_state_maps(charset_info_st *cs)
{
  cs->state_map= cs_alloc(256);
  unsigned char *state_map= cs->state_map;
  cs->ident_map= cs_alloc(256);
  unsigned char *ident_map= cs->ident_map;

  /* Default classification of every byte value; first matching rule wins. */
  for (int ch= 0; ch < 256; ch++)
  {
    unsigned char state;
    if (cs->isalpha(ch))
      state= MY_LEX_IDENT;
    else if (cs->isdigit(ch))
      state= MY_LEX_NUMBER_IDENT;
    else if (my_mbcharlen(cs, ch) > 1)
      state= MY_LEX_IDENT;
    else if (cs->isspace(ch))
      state= MY_LEX_SKIP;
    else
      state= MY_LEX_CHAR;
    state_map[ch]= state;
  }

  /* Characters with a fixed meaning override the default classification. */
  state_map['$']= MY_LEX_IDENT;
  state_map['_']= MY_LEX_IDENT;
  state_map['\'']= MY_LEX_STRING;
  state_map['.']= MY_LEX_REAL_OR_POINT;
  state_map['!']= MY_LEX_CMP_OP;
  state_map['=']= MY_LEX_CMP_OP;
  state_map['>']= MY_LEX_CMP_OP;
  state_map['<']= MY_LEX_LONG_CMP_OP;
  state_map['|']= MY_LEX_BOOL;
  state_map['&']= MY_LEX_BOOL;
  state_map['#']= MY_LEX_COMMENT;
  state_map[';']= MY_LEX_SEMICOLON;
  state_map[':']= MY_LEX_SET_VAR;
  state_map[0]= MY_LEX_EOL;
  state_map['\\']= MY_LEX_ESCAPE;
  state_map['/']= MY_LEX_LONG_COMMENT;
  state_map['*']= MY_LEX_END_LONG_COMMENT;
  state_map['@']= MY_LEX_USER_END;
  state_map['`']= MY_LEX_USER_VARIABLE_DELIMITER;
  state_map['"']= MY_LEX_STRING_OR_DELIMITER;

  /*
    Derive the identifier map. This must happen before the hex/binary
    overrides below so that x, X, b and B still count as identifier bytes.
  */
  for (int ch= 0; ch < 256; ch++)
  {
    const unsigned char state= state_map[ch];
    ident_map[ch]= (state == MY_LEX_IDENT || state == MY_LEX_NUMBER_IDENT) ? 1 : 0;
  }

  /* Special handling of hex and binary strings */
  state_map['X']= MY_LEX_IDENT_OR_HEX;
  state_map['x']= MY_LEX_IDENT_OR_HEX;
  state_map['B']= MY_LEX_IDENT_OR_BIN;
  state_map['b']= MY_LEX_IDENT_OR_BIN;
}
139
861 by Brian Aker
Remove THR_LOCK_charset (we never recall it anymore)
140
/* Set by init_available_charsets() once all_charsets has been filled in; cleared again by free_charsets(). */
static bool charset_initialized= false;
1 by brian
clean slate
141
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
142
/* Collation table; add_compiled_collation() stores each entry at index cs->number. */
DRIZZLED_API charset_info_st *all_charsets[256];
143
/* Collation that get_charset() returns directly when asked for this collation's number. */
const DRIZZLED_API charset_info_st *default_charset_info = &my_charset_utf8_general_ci;
1 by brian
clean slate
144
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
145
/*
  Register a compiled-in collation: flag it MY_CS_AVAILABLE and store it in
  all_charsets at the index given by its own number.
*/
static void add_compiled_collation(charset_info_st * cs)
{
  cs->state|= MY_CS_AVAILABLE;
  all_charsets[cs->number]= cs;
}
150
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
151
static void init_compiled_charsets()
152
{
153
  add_compiled_collation(&my_charset_bin);
154
155
  add_compiled_collation(&my_charset_utf8mb4_general_ci);
156
  add_compiled_collation(&my_charset_utf8mb4_bin);
157
  add_compiled_collation(&my_charset_utf8mb4_unicode_ci);
158
  add_compiled_collation(&my_charset_utf8mb4_icelandic_uca_ci);
159
  add_compiled_collation(&my_charset_utf8mb4_latvian_uca_ci);
160
  add_compiled_collation(&my_charset_utf8mb4_romanian_uca_ci);
161
  add_compiled_collation(&my_charset_utf8mb4_slovenian_uca_ci);
162
  add_compiled_collation(&my_charset_utf8mb4_polish_uca_ci);
163
  add_compiled_collation(&my_charset_utf8mb4_estonian_uca_ci);
164
  add_compiled_collation(&my_charset_utf8mb4_spanish_uca_ci);
165
  add_compiled_collation(&my_charset_utf8mb4_swedish_uca_ci);
166
  add_compiled_collation(&my_charset_utf8mb4_turkish_uca_ci);
167
  add_compiled_collation(&my_charset_utf8mb4_czech_uca_ci);
168
  add_compiled_collation(&my_charset_utf8mb4_danish_uca_ci);
169
  add_compiled_collation(&my_charset_utf8mb4_lithuanian_uca_ci);
170
  add_compiled_collation(&my_charset_utf8mb4_slovak_uca_ci);
171
  add_compiled_collation(&my_charset_utf8mb4_spanish2_uca_ci);
172
  add_compiled_collation(&my_charset_utf8mb4_roman_uca_ci);
173
  add_compiled_collation(&my_charset_utf8mb4_persian_uca_ci);
174
  add_compiled_collation(&my_charset_utf8mb4_esperanto_uca_ci);
175
  add_compiled_collation(&my_charset_utf8mb4_hungarian_uca_ci);
176
  add_compiled_collation(&my_charset_utf8mb4_sinhala_uca_ci);
177
}
178
179
static void init_available_charsets()
1 by brian
clean slate
180
{
181
  /*
182
    We have to use charset_initialized to not lock on THR_LOCK_charset
183
    inside get_internal_charset...
184
  */
2318.4.7 by Olaf van der Spek
Refactor
185
  if (charset_initialized)
186
    return;
187
  memset(&all_charsets, 0, sizeof(all_charsets));
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
188
  init_compiled_charsets();
2318.4.7 by Olaf van der Spek
Refactor
189
190
  /* Copy compiled charsets */
191
  for (charset_info_st**cs= all_charsets;
192
    cs < all_charsets+array_elements(all_charsets)-1;
193
    cs++)
1 by brian
clean slate
194
  {
2318.4.7 by Olaf van der Spek
Refactor
195
    if (*cs && cs[0]->ctype)
196
      init_state_maps(*cs);
1 by brian
clean slate
197
  }
861 by Brian Aker
Remove THR_LOCK_charset (we never recall it anymore)
198
2318.4.7 by Olaf van der Spek
Refactor
199
  charset_initialized= true;
1 by brian
clean slate
200
}
201
2160.1.2 by Olaf van der Spek
casts
202
void free_charsets()
1 by brian
clean slate
203
{
2160.1.7 by Olaf van der Spek
fix
204
  charset_initialized= false;
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
205
2160.1.6 by Olaf van der Spek
USe "not" instead of "!"
206
  while (not memory_vector.empty())
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
207
  {
2160.1.2 by Olaf van der Spek
casts
208
    delete[] memory_vector.back();
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
209
    memory_vector.pop_back();
210
  }
1 by brian
clean slate
211
}
212
482 by Brian Aker
Remove uint.
213
uint32_t get_collation_number(const char *name)
1 by brian
clean slate
214
{
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
215
  init_available_charsets();
1 by brian
clean slate
216
  return get_collation_number_internal(name);
217
}
218
482 by Brian Aker
Remove uint.
219
uint32_t get_charset_number(const char *charset_name, uint32_t cs_flags)
1 by brian
clean slate
220
{
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
221
  init_available_charsets();
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
222
2456.1.4 by Olaf van der Spek
Refactor
223
  for (charset_info_st** cs= all_charsets; cs < all_charsets + array_elements(all_charsets) - 1; cs++)
1 by brian
clean slate
224
  {
2456.1.4 by Olaf van der Spek
Refactor
225
    if (cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) && not my_charset_utf8_general_ci.strcasecmp(cs[0]->csname, charset_name))
1 by brian
clean slate
226
      return cs[0]->number;
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
227
  }
1 by brian
clean slate
228
  return 0;
229
}
230
482 by Brian Aker
Remove uint.
231
const char *get_charset_name(uint32_t charset_number)
1 by brian
clean slate
232
{
2318.6.107 by Olaf van der Spek
MY_CHARSET_HANDLER::init is unused
233
  init_available_charsets();
234
  const charset_info_st* cs= all_charsets[charset_number];
235
  return cs && cs->number == charset_number && cs->name ? cs->name : "?";
1 by brian
clean slate
236
}
237
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
238
/*
  Fetch the collation stored at cs_number and make sure it is ready for use.

  cs_number is used as an index into all_charsets without a range check.

  Returns NULL when the slot is empty, when the collation is not flagged
  MY_CS_AVAILABLE, or when its collation handler's init hook reports
  failure. On success the collation is flagged MY_CS_READY, so the init hook
  is not run again for it.
*/
static const charset_info_st *get_internal_charset(uint32_t cs_number)
{
  charset_info_st* cs= all_charsets[cs_number];
  if (not cs)
    return NULL;

  /* Every registered collation must be either compiled in or loaded. */
  assert((cs->state & MY_CS_COMPILED) || (cs->state & MY_CS_LOADED));

  if (not (cs->state & MY_CS_AVAILABLE))
    return NULL;

  if (cs->state & MY_CS_READY)
    return cs;

  /* One-time initialization; the hook is optional and non-zero means failure. */
  if (cs->coll->init && cs->coll->init(*cs, cs_alloc))
    return NULL;
  cs->state|= MY_CS_READY;
  return cs;
}
258
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
259
/*
  Return the collation with number cs_number.

  The default collation is returned directly, without touching the table.
  Returns NULL for number 0, for numbers at or beyond the last slot of
  all_charsets, and whenever get_internal_charset() cannot supply the
  collation.
*/
const charset_info_st *get_charset(uint32_t cs_number)
{
  if (default_charset_info->number == cs_number)
    return default_charset_info;

  init_available_charsets();	/* If it isn't initialized */

  const bool in_range= cs_number != 0 && cs_number < array_elements(all_charsets) - 1;
  if (not in_range)
    return NULL;

  return get_internal_charset(cs_number);
}
271
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
272
/*
  Return the collation whose collation name is cs_name, or NULL when no
  such collation is registered or it cannot be made ready.
*/
const charset_info_st *get_charset_by_name(const char *cs_name)
{
  init_available_charsets();	/* If it isn't initialized */

  const uint32_t cs_number= get_collation_number(cs_name);
  if (cs_number == 0)
    return NULL;
  return get_internal_charset(cs_number);
}
278
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
279
/*
  Return the first collation of character set cs_name whose state matches
  cs_flags (see get_charset_number()), or NULL when there is none or it
  cannot be made ready.
*/
const charset_info_st *get_charset_by_csname(const char *cs_name, uint32_t cs_flags)
{
  init_available_charsets();	/* If it isn't initialized */

  const uint32_t cs_number= get_charset_number(cs_name, cs_flags);
  if (cs_number == 0)
    return NULL;
  return get_internal_charset(cs_number);
}
285
286
287
/*
288
  Escape apostrophes by doubling them up
289
290
  SYNOPSIS
236.3.4 by Andrey Hristov
Rename escape_(string|quotes)_for_mysql to escape_(string|quotes)_for_drizzle
291
    escape_quotes_for_drizzle()
1 by brian
clean slate
292
    charset_info        Charset of the strings
293
    to                  Buffer for escaped string
294
    to_length           Length of destination buffer, or 0
295
    from                The string to escape
296
    length              The length of the string to escape
297
298
  DESCRIPTION
299
    This escapes the contents of a string by doubling up any apostrophes that
300
    it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
301
    effect on the server.
302
303
  NOTE
304
    To be consistent with escape_string_for_drizzle(), to_length may be 0 to
305
    mean "big enough"
306
307
  RETURN VALUES
365.2.9 by Monty Taylor
Got rid of all instances of ~0
308
    UINT32_MAX  The escaped string did not fit in the to buffer
1 by brian
clean slate
309
    >=0         The length of the escaped string
310
*/
311
2254 by Brian Aker
Shift CHARSET_INFO to charset_info_st
312
size_t escape_quotes_for_drizzle(const charset_info_st *charset_info,
236.3.4 by Andrey Hristov
Rename escape_(string|quotes)_for_mysql to escape_(string|quotes)_for_drizzle
313
                                 char *to, size_t to_length,
314
                                 const char *from, size_t length)
1 by brian
clean slate
315
{
316
  const char *to_start= to;
317
  const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
163 by Brian Aker
Merge Monty's code.
318
  bool overflow= false;
146 by Brian Aker
my_bool cleanup.
319
  bool use_mb_flag= use_mb(charset_info);
1 by brian
clean slate
320
  for (end= from + length; from < end; from++)
321
  {
322
    int tmp_length;
323
    if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
324
    {
325
      if (to + tmp_length > to_end)
326
      {
163 by Brian Aker
Merge Monty's code.
327
        overflow= true;
1 by brian
clean slate
328
        break;
329
      }
330
      while (tmp_length--)
331
	*to++= *from++;
332
      from--;
333
      continue;
334
    }
335
    /*
336
      We don't have the same issue here with a non-multi-byte character being
337
      turned into a multi-byte character by the addition of an escaping
338
      character, because we are only escaping the ' character with itself.
339
     */
340
    if (*from == '\'')
341
    {
342
      if (to + 2 > to_end)
343
      {
163 by Brian Aker
Merge Monty's code.
344
        overflow= true;
1 by brian
clean slate
345
        break;
346
      }
347
      *to++= '\'';
348
      *to++= '\'';
349
    }
350
    else
351
    {
352
      if (to + 1 > to_end)
353
      {
163 by Brian Aker
Merge Monty's code.
354
        overflow= true;
1 by brian
clean slate
355
        break;
356
      }
357
      *to++= *from;
358
    }
359
  }
360
  *to= 0;
2160.1.2 by Olaf van der Spek
casts
361
  return overflow ? UINT32_MAX : to - to_start;
1 by brian
clean slate
362
}
1280.1.10 by Monty Taylor
Put everything in drizzled into drizzled namespace.
363
364
} /* namespace drizzled */