~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/* Copyright (C) 2000 MySQL AB
2
3
   This program is free software; you can redistribute it and/or modify
4
   it under the terms of the GNU General Public License as published by
5
   the Free Software Foundation; version 2 of the License.
6
7
   This program is distributed in the hope that it will be useful,
8
   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
   GNU General Public License for more details.
11
12
   You should have received a copy of the GNU General Public License
13
   along with this program; if not, write to the Free Software
1802.10.2 by Monty Taylor
Update all of the copyright headers to include the correct address.
14
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
1 by brian
clean slate
15
1241.9.57 by Monty Taylor
Oy. Bigger change than I normally like - but this stuff is all intertwined.
16
#include "config.h"
17
18
#include "drizzled/charset.h"
1271.5.3 by Tim Penhey
change the include files
19
#include "drizzled/error.h"
1241.9.61 by Monty Taylor
No more mystrings in drizzled/
20
#include "drizzled/charset_info.h"
1241.9.64 by Monty Taylor
Moved remaining non-public portions of mysys and mystrings to drizzled/internal.
21
#include "drizzled/internal/m_string.h"
722.1.4 by Monty Taylor
Removed all the setting of DEFS everywhere. Use configmake.h to get the values
22
#include <drizzled/configmake.h>
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
23
#include <vector>
24
25
using namespace std;
26
1280.1.10 by Monty Taylor
Put everything in drizzled into drizzled namespace.
27
namespace drizzled
28
{
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
29
30
/*
31
  We collect memory in this vector that we free on delete.
32
*/
33
static vector<void *>memory_vector;
1 by brian
clean slate
34
35
/*
36
  The code below implements this functionality:
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
37
1 by brian
clean slate
38
    - Initializing charset related structures
39
    - Loading dynamic charsets
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
40
    - Searching for a proper CHARSET_INFO
1 by brian
clean slate
41
      using charset name, collation name or collation ID
42
    - Setting server default character set
43
*/
44
236.3.9 by Andrey Hristov
- Fix build of exotic, mostly non-western, charsets (--with-extra-charsets)
45
bool my_charset_same(const CHARSET_INFO *cs1, const CHARSET_INFO *cs2)
1 by brian
clean slate
46
{
47
  return ((cs1 == cs2) || !strcmp(cs1->csname,cs2->csname));
48
}
49
50
51
/*
  Look up a collation by name in all_charsets.

  The comparison is done with my_strcasecmp() using
  my_charset_utf8_general_ci.  Empty slots and entries without a name
  are skipped, and the last slot of the table is not examined.

  RETURN
    number of the first matching entry, or 0 when nothing matches
*/
static uint
get_collation_number_internal(const char *name)
{
  const size_t slots= array_elements(all_charsets) - 1;

  for (size_t idx= 0; idx < slots; idx++)
  {
    const CHARSET_INFO *entry= all_charsets[idx];

    if (!entry || !entry->name)
      continue;

    if (my_strcasecmp(&my_charset_utf8_general_ci, entry->name, name) == 0)
      return entry->number;
  }

  return 0;
}
65
66
146 by Brian Aker
my_bool cleanup.
67
/*
  Build the two 256-entry lexer tables of a character set.

  cs->state_map[c] receives the MY_LEX_* state for byte value c.
  cs->ident_map[c] is 1 when that state is MY_LEX_IDENT or
  MY_LEX_NUMBER_IDENT and 0 otherwise.  Both tables come from cs_alloc().

  RETURN
    false  both tables were allocated and filled
    true   one of the allocations failed
*/
static bool init_state_maps(CHARSET_INFO *cs)
{
  /* Bytes whose lexer state is fixed, whatever the ctype tables say. */
  static const struct
  {
    unsigned char byte;
    unsigned char state;
  } fixed_states[]=
  {
    { '_',  (unsigned char) MY_LEX_IDENT },
    { '$',  (unsigned char) MY_LEX_IDENT },
    { '\'', (unsigned char) MY_LEX_STRING },
    { '.',  (unsigned char) MY_LEX_REAL_OR_POINT },
    { '>',  (unsigned char) MY_LEX_CMP_OP },
    { '=',  (unsigned char) MY_LEX_CMP_OP },
    { '!',  (unsigned char) MY_LEX_CMP_OP },
    { '<',  (unsigned char) MY_LEX_LONG_CMP_OP },
    { '&',  (unsigned char) MY_LEX_BOOL },
    { '|',  (unsigned char) MY_LEX_BOOL },
    { '#',  (unsigned char) MY_LEX_COMMENT },
    { ';',  (unsigned char) MY_LEX_SEMICOLON },
    { ':',  (unsigned char) MY_LEX_SET_VAR },
    { 0,    (unsigned char) MY_LEX_EOL },
    { '\\', (unsigned char) MY_LEX_ESCAPE },
    { '/',  (unsigned char) MY_LEX_LONG_COMMENT },
    { '*',  (unsigned char) MY_LEX_END_LONG_COMMENT },
    { '@',  (unsigned char) MY_LEX_USER_END },
    { '`',  (unsigned char) MY_LEX_USER_VARIABLE_DELIMITER },
    { '"',  (unsigned char) MY_LEX_STRING_OR_DELIMITER }
  };

  cs->state_map= (unsigned char*) cs_alloc(256);
  if (cs->state_map == NULL)
    return true;

  cs->ident_map= (unsigned char*) cs_alloc(256);
  if (cs->ident_map == NULL)
    return true;

  unsigned char *lex_states= cs->state_map;
  unsigned char *is_ident= cs->ident_map;

  /* Step 1: classify every byte value so the parser can use a lookup. */
  for (uint32_t ch= 0; ch < 256; ch++)
  {
    unsigned char state;

    if (my_isalpha(cs, ch))
      state= (unsigned char) MY_LEX_IDENT;
    else if (my_isdigit(cs, ch))
      state= (unsigned char) MY_LEX_NUMBER_IDENT;
    else if (my_mbcharlen(cs, ch) > 1)
      state= (unsigned char) MY_LEX_IDENT;
    else if (my_isspace(cs, ch))
      state= (unsigned char) MY_LEX_SKIP;
    else
      state= (unsigned char) MY_LEX_CHAR;

    lex_states[ch]= state;
  }

  /* Step 2: apply the fixed states on top of the classification. */
  for (size_t k= 0; k < sizeof(fixed_states) / sizeof(fixed_states[0]); k++)
    lex_states[fixed_states[k].byte]= fixed_states[k].state;

  /*
    Step 3: derive the identifier map.  This must happen before the
    hex/binary overrides below, so that x, X, b and B keep the value
    computed from their earlier state.
  */
  for (uint32_t ch= 0; ch < 256; ch++)
  {
    const bool ident= (lex_states[ch] == MY_LEX_IDENT ||
                       lex_states[ch] == MY_LEX_NUMBER_IDENT);
    is_ident[ch]= (unsigned char) ident;
  }

  /* Step 4: special handling of hex and binary strings. */
  lex_states[(unsigned char) 'x']= (unsigned char) MY_LEX_IDENT_OR_HEX;
  lex_states[(unsigned char) 'X']= (unsigned char) MY_LEX_IDENT_OR_HEX;
  lex_states[(unsigned char) 'b']= (unsigned char) MY_LEX_IDENT_OR_BIN;
  lex_states[(unsigned char) 'B']= (unsigned char) MY_LEX_IDENT_OR_BIN;

  return false;
}
127
128
861 by Brian Aker
Remove THR_LOCK_charset (we never recall it anymore)
129
/*
  Becomes true once init_available_charsets() has filled all_charsets,
  so that the setup work is done only on the first call.
*/
static bool charset_initialized= false;
1 by brian
clean slate
130
131
/*
  Collation table indexed by collation number (add_compiled_collation()
  stores each entry at cs->number).  The scans and range checks in this
  file stop one slot short of the end of the array.
*/
CHARSET_INFO *all_charsets[256];
383.1.12 by Brian Aker
Much closer toward UTF8 being around all the time...
132
/* Collation that get_charset() returns directly when its number is requested. */
const CHARSET_INFO *default_charset_info = &my_charset_utf8_general_ci;
1 by brian
clean slate
133
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
134
/*
  Register a collation in all_charsets under its own number and flag it
  with MY_CS_AVAILABLE.

  NOTE(review): cs->number is used as an index without a range check;
  confirm that every caller passes a number below the table size.
*/
void add_compiled_collation(CHARSET_INFO * cs)
{
  cs->state= cs->state | MY_CS_AVAILABLE;
  all_charsets[cs->number]= cs;
}
139
632.1.11 by Monty Taylor
Fixed Sun Studio warnings in mysys.
140
void *cs_alloc(size_t size)
1 by brian
clean slate
141
{
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
142
  void *ptr= malloc(size);
143
144
  memory_vector.push_back(ptr);
145
146
  return ptr;
1 by brian
clean slate
147
}
148
149
1241.9.67 by Monty Taylor
Fixed Solaris.
150
146 by Brian Aker
my_bool cleanup.
151
/*
  Populate all_charsets on first use.

  The first call clears the table, lets init_compiled_charsets() fill
  it, and builds the lexer maps of every entry that has a ctype table.
  An entry whose maps cannot be built is removed from the table.  Later
  calls do nothing because charset_initialized is already set.

  RETURN
    always false
*/
static bool init_available_charsets(myf myflags)
{
  if (!charset_initialized)
  {
    memset(&all_charsets, 0, sizeof(all_charsets));
    init_compiled_charsets(myflags);

    /* Prepare the compiled charsets; the last slot is not visited. */
    const size_t slots= array_elements(all_charsets) - 1;
    for (size_t idx= 0; idx < slots; idx++)
    {
      CHARSET_INFO *entry= all_charsets[idx];

      if (!entry || !entry->ctype)
        continue;

      if (init_state_maps(entry))
        all_charsets[idx]= NULL;
    }

    charset_initialized= true;
  }
  assert(charset_initialized);

  return false;
}
183
184
185
void free_charsets(void)
186
{
861 by Brian Aker
Remove THR_LOCK_charset (we never recall it anymore)
187
  charset_initialized= true;
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
188
189
  while (memory_vector.empty() == false)
190
  {
191
    void *ptr= memory_vector.back();
192
    memory_vector.pop_back();
193
    free(ptr);
194
  }
195
  memory_vector.clear();
196
1 by brian
clean slate
197
}
198
199
482 by Brian Aker
Remove uint.
200
uint32_t get_collation_number(const char *name)
1 by brian
clean slate
201
{
202
  init_available_charsets(MYF(0));
203
  return get_collation_number_internal(name);
204
}
205
206
482 by Brian Aker
Remove uint.
207
uint32_t get_charset_number(const char *charset_name, uint32_t cs_flags)
1 by brian
clean slate
208
{
209
  CHARSET_INFO **cs;
210
  init_available_charsets(MYF(0));
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
211
1 by brian
clean slate
212
  for (cs= all_charsets;
213
       cs < all_charsets+array_elements(all_charsets)-1 ;
214
       cs++)
215
  {
216
    if ( cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) &&
383.1.12 by Brian Aker
Much closer toward UTF8 being around all the time...
217
         !my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->csname, charset_name))
1 by brian
clean slate
218
      return cs[0]->number;
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
219
  }
1 by brian
clean slate
220
  return 0;
221
}
222
223
482 by Brian Aker
Remove uint.
224
const char *get_charset_name(uint32_t charset_number)
1 by brian
clean slate
225
{
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
226
  const CHARSET_INFO *cs;
1 by brian
clean slate
227
  init_available_charsets(MYF(0));
228
229
  cs=all_charsets[charset_number];
230
  if (cs && (cs->number == charset_number) && cs->name )
231
    return (char*) cs->name;
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
232
1 by brian
clean slate
233
  return (char*) "?";   /* this mimics find_type() */
234
}
235
236
482 by Brian Aker
Remove uint.
237
/*
  Fetch the collation stored under cs_number and make it ready for use.

  The caller must pass an index that is valid for all_charsets.  An
  entry that is not yet flagged MY_CS_READY has its cset and coll init
  hooks run (each only if present), with cs_alloc as the allocator, and
  is then flagged MY_CS_READY.

  NOTE(review): nothing in this function serialises concurrent callers;
  the older comment here spoke of thread safety - confirm that callers
  provide it.

  RETURN
    the collation, or NULL when the slot is empty, the entry is not
    flagged MY_CS_AVAILABLE, or one of the init hooks reports failure
*/
static const CHARSET_INFO *get_internal_charset(uint32_t cs_number)
{
  CHARSET_INFO *entry= all_charsets[cs_number];

  if (entry == NULL)
    return NULL;

  /* Every registered entry is expected to be compiled in or loaded. */
  if ((entry->state & (MY_CS_COMPILED | MY_CS_LOADED)) == 0)
  {
    assert(0);
  }

  if (!(entry->state & MY_CS_AVAILABLE))
    return NULL;

  if (entry->state & MY_CS_READY)
    return entry;

  /* First use: run the init hooks, charset handler first. */
  if (entry->cset->init && entry->cset->init(entry, cs_alloc))
    return NULL;
  if (entry->coll->init && entry->coll->init(entry, cs_alloc))
    return NULL;

  entry->state|= MY_CS_READY;
  return entry;
}
263
264
862 by Brian Aker
Remove charset directory code.
265
/*
  Return the collation with the given number.

  The default collation is returned straight away when its number is
  requested.  Otherwise the number must be non-zero and below the last
  slot of all_charsets.

  RETURN
    the collation, or NULL when the number is out of range or
    get_internal_charset() yields nothing
*/
const CHARSET_INFO *get_charset(uint32_t cs_number)
{
  if (default_charset_info->number == cs_number)
    return default_charset_info;

  (void) init_available_charsets(MYF(0));	/* If it isn't initialized */

  const bool in_range= (cs_number != 0 &&
                        cs_number < array_elements(all_charsets) - 1);
  if (!in_range)
    return NULL;

  return get_internal_charset(cs_number);
}
280
862 by Brian Aker
Remove charset directory code.
281
/*
  Return the collation whose name is cs_name, or NULL when no collation
  has that name or it cannot be made ready.
*/
const CHARSET_INFO *get_charset_by_name(const char *cs_name)
{
  (void) init_available_charsets(MYF(0));	/* If it isn't initialized */

  const uint32_t number= get_collation_number(cs_name);
  if (number == 0)
    return NULL;

  return get_internal_charset(number);
}
292
293
862 by Brian Aker
Remove charset directory code.
294
/*
  Return a collation of the character set called cs_name whose state
  has at least one bit of cs_flags set, or NULL when
  get_charset_number() finds none or it cannot be made ready.
*/
const CHARSET_INFO *get_charset_by_csname(const char *cs_name, uint32_t cs_flags)
{
  (void) init_available_charsets(MYF(0));	/* If it isn't initialized */

  const uint32_t number= get_charset_number(cs_name, cs_flags);
  if (number == 0)
    return NULL;

  return get_internal_charset(number);
}
306
307
308
/*
309
  Escape apostrophes by doubling them up
310
311
  SYNOPSIS
236.3.4 by Andrey Hristov
Rename escape_(string|quotes)_for_mysql to escape_(string|quotes)_for_drizzle
312
    escape_quotes_for_drizzle()
1 by brian
clean slate
313
    charset_info        Charset of the strings
314
    to                  Buffer for escaped string
315
    to_length           Length of destination buffer, or 0
316
    from                The string to escape
317
    length              The length of the string to escape
318
319
  DESCRIPTION
320
    This escapes the contents of a string by doubling up any apostrophes that
321
    it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
322
    effect on the server.
323
324
  NOTE
325
    To be consistent with escape_string_for_drizzle(), to_length may be 0 to
326
    mean "big enough"
327
328
  RETURN VALUES
365.2.9 by Monty Taylor
Got rid of all instances of ~0
329
    UINT32_MAX  The escaped string did not fit in the to buffer
1 by brian
clean slate
330
    >=0         The length of the escaped string
331
*/
332
236.3.9 by Andrey Hristov
- Fix build of exotic, mostly non-western, charsets (--with-extra-charsets)
333
size_t escape_quotes_for_drizzle(const CHARSET_INFO *charset_info,
236.3.4 by Andrey Hristov
Rename escape_(string|quotes)_for_mysql to escape_(string|quotes)_for_drizzle
334
                                 char *to, size_t to_length,
335
                                 const char *from, size_t length)
1 by brian
clean slate
336
{
337
  const char *to_start= to;
338
  const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
163 by Brian Aker
Merge Monty's code.
339
  bool overflow= false;
146 by Brian Aker
my_bool cleanup.
340
  bool use_mb_flag= use_mb(charset_info);
1 by brian
clean slate
341
  for (end= from + length; from < end; from++)
342
  {
343
    int tmp_length;
344
    if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
345
    {
346
      if (to + tmp_length > to_end)
347
      {
163 by Brian Aker
Merge Monty's code.
348
        overflow= true;
1 by brian
clean slate
349
        break;
350
      }
351
      while (tmp_length--)
352
	*to++= *from++;
353
      from--;
354
      continue;
355
    }
356
    /*
357
      We don't have the same issue here with a non-multi-byte character being
358
      turned into a multi-byte character by the addition of an escaping
359
      character, because we are only escaping the ' character with itself.
360
     */
361
    if (*from == '\'')
362
    {
363
      if (to + 2 > to_end)
364
      {
163 by Brian Aker
Merge Monty's code.
365
        overflow= true;
1 by brian
clean slate
366
        break;
367
      }
368
      *to++= '\'';
369
      *to++= '\'';
370
    }
371
    else
372
    {
373
      if (to + 1 > to_end)
374
      {
163 by Brian Aker
Merge Monty's code.
375
        overflow= true;
1 by brian
clean slate
376
        break;
377
      }
378
      *to++= *from;
379
    }
380
  }
381
  *to= 0;
365.2.9 by Monty Taylor
Got rid of all instances of ~0
382
  return overflow ? UINT32_MAX : (uint32_t) (to - to_start);
1 by brian
clean slate
383
}
1280.1.10 by Monty Taylor
Put everything in drizzled into drizzled namespace.
384
385
} /* namespace drizzled */