~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/* Copyright (C) 2000 MySQL AB
2
3
   This program is free software; you can redistribute it and/or modify
4
   it under the terms of the GNU General Public License as published by
5
   the Free Software Foundation; version 2 of the License.
6
7
   This program is distributed in the hope that it will be useful,
8
   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
   GNU General Public License for more details.
11
12
   You should have received a copy of the GNU General Public License
13
   along with this program; if not, write to the Free Software
14
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
15
1241.9.57 by Monty Taylor
Oy. Bigger change than I normally like - but this stuff is all intertwined.
16
#include "config.h"
17
18
#include "drizzled/charset.h"
1271.5.3 by Tim Penhey
change the include files
19
#include "drizzled/error.h"
1241.9.61 by Monty Taylor
No more mystrings in drizzled/
20
#include "drizzled/charset_info.h"
1241.9.64 by Monty Taylor
Moved remaining non-public portions of mysys and mystrings to drizzled/internal.
21
#include "drizzled/internal/m_string.h"
722.1.4 by Monty Taylor
Removed all the setting of DEFS everywhere. Use configmake.h to get the values
22
#include <drizzled/configmake.h>
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
23
#include <vector>
24
25
using namespace std;
26
1280.1.10 by Monty Taylor
Put everything in drizzled into drizzled namespace.
27
namespace drizzled
28
{
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
29
30
/*
31
  Every block handed out by cs_alloc() is recorded in this vector so that
  free_charsets() can release it.
32
*/
33
static vector<void *>memory_vector; /* filled by cs_alloc(), emptied by free_charsets() */
1 by brian
clean slate
34
35
/*
36
  The code below implements this functionality:
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
37
1 by brian
clean slate
38
    - Initializing charset related structures
39
    - Loading dynamic charsets
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
40
    - Searching for a proper CHARSET_INFO
1 by brian
clean slate
41
      using charset name, collation name or collation ID
42
    - Setting server default character set
43
*/
44
236.3.9 by Andrey Hristov
- Fix build of exotic, mostly non-western, charsets (--with-extra-charsets)
45
bool my_charset_same(const CHARSET_INFO *cs1, const CHARSET_INFO *cs2)
1 by brian
clean slate
46
{
47
  return ((cs1 == cs2) || !strcmp(cs1->csname,cs2->csname));
48
}
49
50
51
/*
  Look up a collation by name without triggering initialisation.

  Walks all_charsets (the final slot is deliberately not examined, as in
  the other scans in this file) and compares each entry's name to `name`
  case-insensitively using utf8_general_ci rules.

  Returns the collation number of the first match, or 0 if none matches.
*/
static uint
get_collation_number_internal(const char *name)
{
  CHARSET_INFO **const last= all_charsets + array_elements(all_charsets) - 1;

  for (CHARSET_INFO **slot= all_charsets; slot < last; slot++)
  {
    CHARSET_INFO *candidate= *slot;

    /* Skip empty slots and entries that carry no collation name. */
    if (!candidate || !candidate->name)
      continue;

    if (my_strcasecmp(&my_charset_utf8_general_ci, candidate->name, name) == 0)
      return candidate->number;
  }
  return 0;
}
65
66
146 by Brian Aker
my_bool cleanup.
67
/*
  Build the two 256-entry lexer lookup tables of a collation.

  cs->state_map maps every byte value to a MY_LEX_* state and
  cs->ident_map flags the bytes that may appear in an identifier.
  Both tables are obtained from cs_alloc().

  Returns false on success, true when either allocation fails.
*/
static bool init_state_maps(CHARSET_INFO *cs)
{
  unsigned char *state_map= (unsigned char*) cs_alloc(256);
  cs->state_map= state_map;
  if (!state_map)
    return true;

  unsigned char *ident_map= (unsigned char*) cs_alloc(256);
  cs->ident_map= ident_map;
  if (!ident_map)
    return true;

  /* Default classification of every byte, driven by the ctype data. */
  for (uint32_t ch= 0; ch < 256; ch++)
  {
    unsigned char state;

    if (my_isalpha(cs, ch))
      state= (unsigned char) MY_LEX_IDENT;
    else if (my_isdigit(cs, ch))
      state= (unsigned char) MY_LEX_NUMBER_IDENT;
    else if (my_mbcharlen(cs, ch) > 1)
      state= (unsigned char) MY_LEX_IDENT;      /* lead byte of a multi-byte char */
    else if (my_isspace(cs, ch))
      state= (unsigned char) MY_LEX_SKIP;
    else
      state= (unsigned char) MY_LEX_CHAR;

    state_map[ch]= state;
  }

  /* Characters with a fixed meaning override the defaults above. */
  state_map[(unsigned char)'$']= (unsigned char) MY_LEX_IDENT;
  state_map[(unsigned char)'_']= (unsigned char) MY_LEX_IDENT;
  state_map[(unsigned char)'\'']= (unsigned char) MY_LEX_STRING;
  state_map[(unsigned char)'.']= (unsigned char) MY_LEX_REAL_OR_POINT;
  state_map[(unsigned char)'!']= (unsigned char) MY_LEX_CMP_OP;
  state_map[(unsigned char)'=']= (unsigned char) MY_LEX_CMP_OP;
  state_map[(unsigned char)'>']= (unsigned char) MY_LEX_CMP_OP;
  state_map[(unsigned char)'<']= (unsigned char) MY_LEX_LONG_CMP_OP;
  state_map[(unsigned char)'|']= (unsigned char) MY_LEX_BOOL;
  state_map[(unsigned char)'&']= (unsigned char) MY_LEX_BOOL;
  state_map[(unsigned char)'#']= (unsigned char) MY_LEX_COMMENT;
  state_map[(unsigned char)';']= (unsigned char) MY_LEX_SEMICOLON;
  state_map[(unsigned char)':']= (unsigned char) MY_LEX_SET_VAR;
  state_map[0]= (unsigned char) MY_LEX_EOL;
  state_map[(unsigned char)'\\']= (unsigned char) MY_LEX_ESCAPE;
  state_map[(unsigned char)'/']= (unsigned char) MY_LEX_LONG_COMMENT;
  state_map[(unsigned char)'*']= (unsigned char) MY_LEX_END_LONG_COMMENT;
  state_map[(unsigned char)'@']= (unsigned char) MY_LEX_USER_END;
  state_map[(unsigned char)'`']= (unsigned char) MY_LEX_USER_VARIABLE_DELIMITER;
  state_map[(unsigned char)'"']= (unsigned char) MY_LEX_STRING_OR_DELIMITER;

  /*
    Derive the identifier map now, i.e. before the hex/binary overrides
    below, so that x, X, b and B keep the identifier status they have at
    this point.
  */
  for (uint32_t ch= 0; ch < 256; ch++)
  {
    const unsigned char state= state_map[ch];
    ident_map[ch]= (unsigned char) (state == MY_LEX_IDENT ||
                                    state == MY_LEX_NUMBER_IDENT);
  }

  /* Letters that may also introduce a hex or binary literal. */
  state_map[(unsigned char)'X']= (unsigned char) MY_LEX_IDENT_OR_HEX;
  state_map[(unsigned char)'x']= (unsigned char) MY_LEX_IDENT_OR_HEX;
  state_map[(unsigned char)'B']= (unsigned char) MY_LEX_IDENT_OR_BIN;
  state_map[(unsigned char)'b']= (unsigned char) MY_LEX_IDENT_OR_BIN;

  return false;
}
127
128
861 by Brian Aker
Remove THR_LOCK_charset (we never recall it anymore)
129
/* init_available_charsets() builds the collation table only while this is false. */
static bool charset_initialized= false;
1 by brian
clean slate
130
131
/* Collation table; add_compiled_collation() stores each entry at index cs->number. */
CHARSET_INFO *all_charsets[256];
383.1.12 by Brian Aker
Much closer toward UTF8 being around all the time...
132
/* Collation that get_charset() hands back directly when asked for its number. */
const CHARSET_INFO *default_charset_info = &my_charset_utf8_general_ci;
1 by brian
clean slate
133
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
134
/*
  Register a compiled-in collation.

  Marks the collation as available and stores it in all_charsets at the
  index given by its own number.  No range check is performed on
  cs->number, so it must be a valid index into all_charsets.
*/
void add_compiled_collation(CHARSET_INFO * cs)
{
  cs->state|= MY_CS_AVAILABLE;
  all_charsets[cs->number]= cs;
}
139
632.1.11 by Monty Taylor
Fixed Sun Studio warnings in mysys.
140
void *cs_alloc(size_t size)
1 by brian
clean slate
141
{
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
142
  void *ptr= malloc(size);
143
144
  memory_vector.push_back(ptr);
145
146
  return ptr;
1 by brian
clean slate
147
}
148
149
1241.9.67 by Monty Taylor
Fixed Solaris.
150
146 by Brian Aker
my_bool cleanup.
151
/*
  Populate all_charsets on first use.

  While charset_initialized is false this clears the table, lets
  init_compiled_charsets() register the compiled-in collations and then
  builds the lexer state maps for every registered collation that has
  ctype data.  A collation whose state maps cannot be built is removed
  from the table.  Later calls do nothing.

  There is no locking here, so the first call must not race with other
  users of the table.

  Always returns false.
*/
static bool init_available_charsets(myf myflags)
{
  if (!charset_initialized)
  {
    memset(&all_charsets, 0, sizeof(all_charsets));
    init_compiled_charsets(myflags);

    /* Prepare the compiled charsets; the final slot is not visited. */
    CHARSET_INFO **const last= all_charsets + array_elements(all_charsets) - 1;
    for (CHARSET_INFO **slot= all_charsets; slot < last; slot++)
    {
      if (*slot && (*slot)->ctype && init_state_maps(*slot))
        *slot= NULL;
    }

    charset_initialized= true;
  }
  assert(charset_initialized);

  return false;
}
183
184
185
void free_charsets(void)
186
{
861 by Brian Aker
Remove THR_LOCK_charset (we never recall it anymore)
187
  charset_initialized= true;
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
188
189
  while (memory_vector.empty() == false)
190
  {
191
    void *ptr= memory_vector.back();
192
    memory_vector.pop_back();
193
    free(ptr);
194
  }
195
  memory_vector.clear();
196
1 by brian
clean slate
197
}
198
199
482 by Brian Aker
Remove uint.
200
uint32_t get_collation_number(const char *name)
1 by brian
clean slate
201
{
202
  init_available_charsets(MYF(0));
203
  return get_collation_number_internal(name);
204
}
205
206
482 by Brian Aker
Remove uint.
207
uint32_t get_charset_number(const char *charset_name, uint32_t cs_flags)
1 by brian
clean slate
208
{
209
  CHARSET_INFO **cs;
210
  init_available_charsets(MYF(0));
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
211
1 by brian
clean slate
212
  for (cs= all_charsets;
213
       cs < all_charsets+array_elements(all_charsets)-1 ;
214
       cs++)
215
  {
216
    if ( cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) &&
383.1.12 by Brian Aker
Much closer toward UTF8 being around all the time...
217
         !my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->csname, charset_name))
1 by brian
clean slate
218
      return cs[0]->number;
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
219
  }
1 by brian
clean slate
220
  return 0;
221
}
222
223
482 by Brian Aker
Remove uint.
224
const char *get_charset_name(uint32_t charset_number)
1 by brian
clean slate
225
{
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
226
  const CHARSET_INFO *cs;
1 by brian
clean slate
227
  init_available_charsets(MYF(0));
228
229
  cs=all_charsets[charset_number];
230
  if (cs && (cs->number == charset_number) && cs->name )
231
    return (char*) cs->name;
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
232
1 by brian
clean slate
233
  return (char*) "?";   /* this mimics find_type() */
234
}
235
236
482 by Brian Aker
Remove uint.
237
/*
  Fetch the collation stored at index cs_number and make it ready for use.

  The caller must pass a valid index into all_charsets; no range check is
  done here.  A collation that is not yet marked MY_CS_READY has its
  charset and collation init hooks run (each receives cs_alloc as its
  allocator) and is then marked ready.

  Returns NULL when the slot is empty, when the collation is not flagged
  MY_CS_AVAILABLE, or when one of the init hooks reports failure.

  No locking is performed in this function.
*/
static const CHARSET_INFO *get_internal_charset(uint32_t cs_number)
{
  CHARSET_INFO *cs= all_charsets[cs_number];

  if (!cs)
    return NULL;

  /* Every registered entry is expected to be either compiled in or loaded. */
  if (!(cs->state & (MY_CS_COMPILED | MY_CS_LOADED)))
  {
    assert(0);
  }

  if (!(cs->state & MY_CS_AVAILABLE))
    return NULL;

  if (cs->state & MY_CS_READY)
    return cs;

  /* First use: run the charset hook, then the collation hook. */
  if (cs->cset->init && cs->cset->init(cs, cs_alloc))
    return NULL;

  if (cs->coll->init && cs->coll->init(cs, cs_alloc))
    return NULL;

  cs->state|= MY_CS_READY;
  return cs;
}
263
264
862 by Brian Aker
Remove charset directory code.
265
/*
  Return the collation with the given number, or NULL if there is none.

  The default collation is returned straight away without consulting the
  table.  Number 0 and numbers at or beyond the last slot of
  all_charsets are rejected.
*/
const CHARSET_INFO *get_charset(uint32_t cs_number)
{
  if (cs_number == default_charset_info->number)
    return default_charset_info;

  (void) init_available_charsets(MYF(0));	/* If it isn't initialized */

  const bool in_range= cs_number &&
                       cs_number < array_elements(all_charsets) - 1;

  return in_range ? get_internal_charset(cs_number) : NULL;
}
280
862 by Brian Aker
Remove charset directory code.
281
/*
  Return the collation whose collation name equals cs_name
  (case-insensitive), prepared for use, or NULL when the name is unknown
  or the collation cannot be prepared.

  Despite the parameter name, the lookup is by collation name: it goes
  through get_collation_number().
*/
const CHARSET_INFO *get_charset_by_name(const char *cs_name)
{
  (void) init_available_charsets(MYF(0));	/* If it isn't initialized */

  const uint32_t cs_number= get_collation_number(cs_name);
  if (cs_number == 0)
    return NULL;

  return get_internal_charset(cs_number);
}
292
293
862 by Brian Aker
Remove charset directory code.
294
/*
  Return the first collation of character set cs_name (case-insensitive)
  whose state has at least one bit of cs_flags set, prepared for use.

  Returns NULL when no such collation is registered or it cannot be
  prepared.
*/
const CHARSET_INFO *get_charset_by_csname(const char *cs_name, uint32_t cs_flags)
{
  (void) init_available_charsets(MYF(0));	/* If it isn't initialized */

  const uint32_t cs_number= get_charset_number(cs_name, cs_flags);
  if (cs_number == 0)
    return NULL;

  return get_internal_charset(cs_number);
}
306
307
308
/*
309
  Escape apostrophes by doubling them up
310
311
  SYNOPSIS
236.3.4 by Andrey Hristov
Rename escape_(string|quotes)_for_mysql to escape_(string|quotes)_for_drizzle
312
    escape_quotes_for_drizzle()
1 by brian
clean slate
313
    charset_info        Charset of the strings
314
    to                  Buffer for escaped string
315
    to_length           Length of destination buffer, or 0
316
    from                The string to escape
317
    length              The length of the string to escape
318
319
  DESCRIPTION
320
    This escapes the contents of a string by doubling up any apostrophes that
321
    it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
322
    effect on the server.
323
324
  NOTE
325
    To be consistent with escape_string_for_mysql(), to_length may be 0 to
326
    mean "big enough"
327
328
  RETURN VALUES
365.2.9 by Monty Taylor
Got rid of all instances of ~0
329
    UINT32_MAX  The escaped string did not fit in the to buffer
1 by brian
clean slate
330
    >=0         The length of the escaped string
331
*/
332
236.3.9 by Andrey Hristov
- Fix build of exotic, mostly non-western, charsets (--with-extra-charsets)
333
size_t escape_quotes_for_drizzle(const CHARSET_INFO *charset_info,
236.3.4 by Andrey Hristov
Rename escape_(string|quotes)_for_mysql to escape_(string|quotes)_for_drizzle
334
                                 char *to, size_t to_length,
335
                                 const char *from, size_t length)
1 by brian
clean slate
336
{
337
  const char *to_start= to;
338
  const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
163 by Brian Aker
Merge Monty's code.
339
  bool overflow= false;
146 by Brian Aker
my_bool cleanup.
340
  bool use_mb_flag= use_mb(charset_info);
1 by brian
clean slate
341
  for (end= from + length; from < end; from++)
342
  {
343
    int tmp_length;
344
    if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
345
    {
346
      if (to + tmp_length > to_end)
347
      {
163 by Brian Aker
Merge Monty's code.
348
        overflow= true;
1 by brian
clean slate
349
        break;
350
      }
351
      while (tmp_length--)
352
	*to++= *from++;
353
      from--;
354
      continue;
355
    }
356
    /*
357
      We don't have the same issue here with a non-multi-byte character being
358
      turned into a multi-byte character by the addition of an escaping
359
      character, because we are only escaping the ' character with itself.
360
     */
361
    if (*from == '\'')
362
    {
363
      if (to + 2 > to_end)
364
      {
163 by Brian Aker
Merge Monty's code.
365
        overflow= true;
1 by brian
clean slate
366
        break;
367
      }
368
      *to++= '\'';
369
      *to++= '\'';
370
    }
371
    else
372
    {
373
      if (to + 1 > to_end)
374
      {
163 by Brian Aker
Merge Monty's code.
375
        overflow= true;
1 by brian
clean slate
376
        break;
377
      }
378
      *to++= *from;
379
    }
380
  }
381
  *to= 0;
365.2.9 by Monty Taylor
Got rid of all instances of ~0
382
  return overflow ? UINT32_MAX : (uint32_t) (to - to_start);
1 by brian
clean slate
383
}
1280.1.10 by Monty Taylor
Put everything in drizzled into drizzled namespace.
384
385
} /* namespace drizzled */