~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/* Copyright (C) 2000 MySQL AB
2
3
   This program is free software; you can redistribute it and/or modify
4
   it under the terms of the GNU General Public License as published by
5
   the Free Software Foundation; version 2 of the License.
6
7
   This program is distributed in the hope that it will be useful,
8
   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
   GNU General Public License for more details.
11
12
   You should have received a copy of the GNU General Public License
13
   along with this program; if not, write to the Free Software
1802.10.2 by Monty Taylor
Update all of the copyright headers to include the correct address.
14
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
1 by brian
clean slate
15
1241.9.57 by Monty Taylor
Oy. Bigger change than I normally like - but this stuff is all intertwined.
16
#include "config.h"
17
18
#include "drizzled/charset.h"
1271.5.3 by Tim Penhey
change the include files
19
#include "drizzled/error.h"
1241.9.61 by Monty Taylor
No more mystrings in drizzled/
20
#include "drizzled/charset_info.h"
1241.9.64 by Monty Taylor
Moved remaining non-public portions of mysys and mystrings to drizzled/internal.
21
#include "drizzled/internal/m_string.h"
722.1.4 by Monty Taylor
Removed all the setting of DEFS everywhere. Use configmake.h to get the values
22
#include <drizzled/configmake.h>
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
23
#include <vector>
24
25
using namespace std;
26
1280.1.10 by Monty Taylor
Put everything in drizzled into drizzled namespace.
27
namespace drizzled
28
{
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
29
30
/*
  Every block handed out by cs_alloc() is recorded in this vector so that
  free_charsets() can release all of them.
*/
33
static vector<void *>memory_vector;
1 by brian
clean slate
34
35
/*
36
  The code below implements this functionality:
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
37
1 by brian
clean slate
38
    - Initializing charset related structures
39
    - Loading dynamic charsets
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
40
    - Searching for a proper CHARSET_INFO
1 by brian
clean slate
41
      using charset name, collation name or collation ID
42
    - Setting server default character set
43
*/
44
236.3.9 by Andrey Hristov
- Fix build of exotic, mostly non-western, charsets (--with-extra-charsets)
45
bool my_charset_same(const CHARSET_INFO *cs1, const CHARSET_INFO *cs2)
1 by brian
clean slate
46
{
47
  return ((cs1 == cs2) || !strcmp(cs1->csname,cs2->csname));
48
}
49
50
51
/*
  Find a collation by name in all_charsets.

  The name is compared case-insensitively using my_charset_utf8_general_ci.
  Returns the number of the first matching entry, or 0 when nothing matches.
  The final slot of all_charsets is not examined.
*/
static uint
get_collation_number_internal(const char *name)
{
  CHARSET_INFO **const last= all_charsets + array_elements(all_charsets) - 1;

  for (CHARSET_INFO **slot= all_charsets; slot < last; ++slot)
  {
    const CHARSET_INFO *candidate= *slot;

    /* Skip empty slots and entries that carry no collation name. */
    if (candidate == NULL || candidate->name == NULL)
      continue;

    if (my_strcasecmp(&my_charset_utf8_general_ci, candidate->name, name) == 0)
      return candidate->number;
  }

  return 0;
}
66
67
146 by Brian Aker
my_bool cleanup.
68
/*
  Build the two 256-entry lexer lookup tables of a charset.

  cs->state_map receives one MY_LEX_* state per byte value and cs->ident_map
  receives 1 for bytes that may appear in an identifier, 0 otherwise.
  Both tables are obtained from cs_alloc().

  Returns 0 on success and 1 when either allocation fails.
*/
static bool init_state_maps(CHARSET_INFO *cs)
{
  unsigned char *lex_states= (unsigned char*) cs_alloc(256);
  cs->state_map= lex_states;
  if (lex_states == NULL)
    return 1;

  unsigned char *ident_flags= (unsigned char*) cs_alloc(256);
  cs->ident_map= ident_flags;
  if (ident_flags == NULL)
    return 1;

  /* Default classification of every byte value, derived from the charset. */
  for (uint32_t ch= 0; ch < 256; ch++)
  {
    unsigned char state;

    if (my_isalpha(cs, ch))
      state= (unsigned char) MY_LEX_IDENT;
    else if (my_isdigit(cs, ch))
      state= (unsigned char) MY_LEX_NUMBER_IDENT;
    else if (my_mbcharlen(cs, ch) > 1)
      state= (unsigned char) MY_LEX_IDENT;
    else if (my_isspace(cs, ch))
      state= (unsigned char) MY_LEX_SKIP;
    else
      state= (unsigned char) MY_LEX_CHAR;

    lex_states[ch]= state;
  }

  /* Fixed overrides for characters with a special meaning to the lexer. */
  lex_states[(unsigned char) '$']= (unsigned char) MY_LEX_IDENT;
  lex_states[(unsigned char) '_']= (unsigned char) MY_LEX_IDENT;
  lex_states[(unsigned char) '\'']= (unsigned char) MY_LEX_STRING;
  lex_states[(unsigned char) '.']= (unsigned char) MY_LEX_REAL_OR_POINT;
  lex_states[(unsigned char) '!']= (unsigned char) MY_LEX_CMP_OP;
  lex_states[(unsigned char) '=']= (unsigned char) MY_LEX_CMP_OP;
  lex_states[(unsigned char) '>']= (unsigned char) MY_LEX_CMP_OP;
  lex_states[(unsigned char) '<']= (unsigned char) MY_LEX_LONG_CMP_OP;
  lex_states[(unsigned char) '|']= (unsigned char) MY_LEX_BOOL;
  lex_states[(unsigned char) '&']= (unsigned char) MY_LEX_BOOL;
  lex_states[(unsigned char) '#']= (unsigned char) MY_LEX_COMMENT;
  lex_states[(unsigned char) ';']= (unsigned char) MY_LEX_SEMICOLON;
  lex_states[(unsigned char) ':']= (unsigned char) MY_LEX_SET_VAR;
  lex_states[0]= (unsigned char) MY_LEX_EOL;
  lex_states[(unsigned char) '\\']= (unsigned char) MY_LEX_ESCAPE;
  lex_states[(unsigned char) '/']= (unsigned char) MY_LEX_LONG_COMMENT;
  lex_states[(unsigned char) '*']= (unsigned char) MY_LEX_END_LONG_COMMENT;
  lex_states[(unsigned char) '@']= (unsigned char) MY_LEX_USER_END;
  lex_states[(unsigned char) '`']= (unsigned char) MY_LEX_USER_VARIABLE_DELIMITER;
  lex_states[(unsigned char) '"']= (unsigned char) MY_LEX_STRING_OR_DELIMITER;

  /*
    Second table: quick "is this an identifier byte" test.  It must be filled
    before the hex/binary overrides below, so that x, X, b and B still count
    as identifier characters here.
  */
  for (uint32_t ch= 0; ch < 256; ch++)
  {
    const unsigned char state= lex_states[ch];

    ident_flags[ch]= (unsigned char) (state == MY_LEX_IDENT ||
                                      state == MY_LEX_NUMBER_IDENT);
  }

  /* Special handling of hex and binary strings */
  lex_states[(unsigned char) 'X']= (unsigned char) MY_LEX_IDENT_OR_HEX;
  lex_states[(unsigned char) 'x']= (unsigned char) MY_LEX_IDENT_OR_HEX;
  lex_states[(unsigned char) 'B']= (unsigned char) MY_LEX_IDENT_OR_BIN;
  lex_states[(unsigned char) 'b']= (unsigned char) MY_LEX_IDENT_OR_BIN;

  return 0;
}
128
129
861 by Brian Aker
Remove THR_LOCK_charset (we never recall it anymore)
130
static bool charset_initialized= false;
1 by brian
clean slate
131
132
CHARSET_INFO *all_charsets[256];
383.1.12 by Brian Aker
Much closer toward UTF8 being around all the time...
133
const CHARSET_INFO *default_charset_info = &my_charset_utf8_general_ci;
1 by brian
clean slate
134
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
135
/*
  Register a compiled-in collation.

  Marks the collation as MY_CS_AVAILABLE and stores it in all_charsets at the
  index given by cs->number.
*/
void add_compiled_collation(CHARSET_INFO * cs)
{
  cs->state= cs->state | MY_CS_AVAILABLE;
  all_charsets[cs->number]= cs;
}
140
632.1.11 by Monty Taylor
Fixed Sun Studio warnings in mysys.
141
void *cs_alloc(size_t size)
1 by brian
clean slate
142
{
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
143
  void *ptr= malloc(size);
144
145
  memory_vector.push_back(ptr);
146
147
  return ptr;
1 by brian
clean slate
148
}
149
150
1241.9.67 by Monty Taylor
Fixed Solaris.
151
146 by Brian Aker
my_bool cleanup.
152
/*
  Populate all_charsets the first time it is needed.

  While charset_initialized is false this clears all_charsets, lets
  init_compiled_charsets(myflags) register the compiled-in collations and then
  builds the lexer maps of every entry that has a ctype table.  An entry whose
  maps cannot be built is removed from the table.  Later calls do nothing.

  Always returns false.
*/
static bool init_available_charsets(myf myflags)
{
  if (charset_initialized)
    return false;

  memset(&all_charsets, 0, sizeof(all_charsets));
  init_compiled_charsets(myflags);

  /* Prepare the compiled charsets; the final slot is not visited. */
  CHARSET_INFO **const last= all_charsets + array_elements(all_charsets) - 1;

  for (CHARSET_INFO **slot= all_charsets; slot < last; ++slot)
  {
    CHARSET_INFO *entry= *slot;

    if (entry == NULL || entry->ctype == NULL)
      continue;

    if (init_state_maps(entry))
      *slot= NULL;
  }

  charset_initialized= true;

  return false;
}
184
185
186
void free_charsets(void)
187
{
861 by Brian Aker
Remove THR_LOCK_charset (we never recall it anymore)
188
  charset_initialized= true;
1106.1.1 by Brian Aker
Monty fixes pluss a few from me for charset.
189
190
  while (memory_vector.empty() == false)
191
  {
192
    void *ptr= memory_vector.back();
193
    memory_vector.pop_back();
194
    free(ptr);
195
  }
196
  memory_vector.clear();
197
1 by brian
clean slate
198
}
199
200
482 by Brian Aker
Remove uint.
201
uint32_t get_collation_number(const char *name)
1 by brian
clean slate
202
{
203
  init_available_charsets(MYF(0));
204
  return get_collation_number_internal(name);
205
}
206
207
482 by Brian Aker
Remove uint.
208
uint32_t get_charset_number(const char *charset_name, uint32_t cs_flags)
1 by brian
clean slate
209
{
210
  CHARSET_INFO **cs;
211
  init_available_charsets(MYF(0));
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
212
1 by brian
clean slate
213
  for (cs= all_charsets;
214
       cs < all_charsets+array_elements(all_charsets)-1 ;
215
       cs++)
216
  {
2085.2.3 by Brian Aker
Fix strcasecmp issues (ie, check UTF-8).
217
    if ( cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) && !my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->csname, charset_name))
1 by brian
clean slate
218
      return cs[0]->number;
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
219
  }
1 by brian
clean slate
220
  return 0;
221
}
222
223
482 by Brian Aker
Remove uint.
224
const char *get_charset_name(uint32_t charset_number)
1 by brian
clean slate
225
{
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
226
  const CHARSET_INFO *cs;
1 by brian
clean slate
227
  init_available_charsets(MYF(0));
228
229
  cs=all_charsets[charset_number];
230
  if (cs && (cs->number == charset_number) && cs->name )
231
    return (char*) cs->name;
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
232
1 by brian
clean slate
233
  return (char*) "?";   /* this mimics find_type() */
234
}
235
236
482 by Brian Aker
Remove uint.
237
/*
  Return the ready-to-use charset stored at all_charsets[cs_number].

  Returns NULL when the slot is empty, when the entry is not flagged
  MY_CS_AVAILABLE, or when one of its init hooks reports failure.  On first
  use the cset and coll init hooks (if present) are run with cs_alloc as the
  allocator and the entry is then flagged MY_CS_READY.

  cs_number is used as an index without a range check here; the callers in
  this file only pass numbers they have validated or taken from the table.
*/
static const CHARSET_INFO *get_internal_charset(uint32_t cs_number)
{
  CHARSET_INFO *entry= all_charsets[cs_number];

  if (entry == NULL)
    return NULL;

  /* Every registered charset is expected to be compiled in or loaded. */
  assert((entry->state & (MY_CS_COMPILED | MY_CS_LOADED)) != 0);

  if (!(entry->state & MY_CS_AVAILABLE))
    return NULL;

  if (entry->state & MY_CS_READY)
    return entry;

  if (entry->cset->init && entry->cset->init(entry, cs_alloc))
    return NULL;

  if (entry->coll->init && entry->coll->init(entry, cs_alloc))
    return NULL;

  entry->state|= MY_CS_READY;

  return entry;
}
263
264
862 by Brian Aker
Remove charset directory code.
265
/*
  Look up a charset by collation number.

  The default charset is returned directly when its number is requested.
  Otherwise returns NULL for 0 and for numbers at or beyond the last slot of
  all_charsets, and the result of get_internal_charset() for everything else.
*/
const CHARSET_INFO *get_charset(uint32_t cs_number)
{
  if (default_charset_info->number == cs_number)
    return default_charset_info;

  (void) init_available_charsets(MYF(0));	/* If it isn't initialized */

  const bool in_range= (cs_number != 0 &&
                        cs_number < array_elements(all_charsets) - 1);
  if (!in_range)
    return NULL;

  return get_internal_charset(cs_number);
}
280
862 by Brian Aker
Remove charset directory code.
281
/*
  Look up a charset by collation name.

  Returns NULL when get_collation_number() does not know the name, otherwise
  whatever get_internal_charset() returns for the resolved number.
*/
const CHARSET_INFO *get_charset_by_name(const char *cs_name)
{
  (void) init_available_charsets(MYF(0));	/* If it isn't initialized */

  const uint32_t collation_id= get_collation_number(cs_name);
  if (collation_id == 0)
    return NULL;

  return get_internal_charset(collation_id);
}
292
293
862 by Brian Aker
Remove charset directory code.
294
/*
  Look up a charset by character set name and state flags.

  Returns NULL when get_charset_number() finds no entry for cs_name with one
  of the cs_flags bits set, otherwise whatever get_internal_charset() returns
  for the resolved number.
*/
const CHARSET_INFO *get_charset_by_csname(const char *cs_name, uint32_t cs_flags)
{
  (void) init_available_charsets(MYF(0));	/* If it isn't initialized */

  const uint32_t charset_id= get_charset_number(cs_name, cs_flags);
  if (charset_id == 0)
    return NULL;

  return get_internal_charset(charset_id);
}
306
307
308
/*
309
  Escape apostrophes by doubling them up
310
311
  SYNOPSIS
236.3.4 by Andrey Hristov
Rename escape_(string|quotes)_for_mysql to escape_(string|quotes)_for_drizzle
312
    escape_quotes_for_drizzle()
1 by brian
clean slate
313
    charset_info        Charset of the strings
314
    to                  Buffer for escaped string
315
    to_length           Length of destination buffer, or 0
316
    from                The string to escape
317
    length              The length of the string to escape
318
319
  DESCRIPTION
320
    This escapes the contents of a string by doubling up any apostrophes that
321
    it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
322
    effect on the server.
323
324
  NOTE
325
    To be consistent with escape_string_for_mysql(), to_length may be 0 to
326
    mean "big enough"
327
328
  RETURN VALUES
365.2.9 by Monty Taylor
Got rid of all instances of ~0
329
    UINT32_MAX  The escaped string did not fit in the to buffer
1 by brian
clean slate
330
    >=0         The length of the escaped string
331
*/
332
236.3.9 by Andrey Hristov
- Fix build of exotic, mostly non-western, charsets (--with-extra-charsets)
333
size_t escape_quotes_for_drizzle(const CHARSET_INFO *charset_info,
236.3.4 by Andrey Hristov
Rename escape_(string|quotes)_for_mysql to escape_(string|quotes)_for_drizzle
334
                                 char *to, size_t to_length,
335
                                 const char *from, size_t length)
1 by brian
clean slate
336
{
337
  const char *to_start= to;
338
  const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
163 by Brian Aker
Merge Monty's code.
339
  bool overflow= false;
146 by Brian Aker
my_bool cleanup.
340
  bool use_mb_flag= use_mb(charset_info);
1 by brian
clean slate
341
  for (end= from + length; from < end; from++)
342
  {
343
    int tmp_length;
344
    if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
345
    {
346
      if (to + tmp_length > to_end)
347
      {
163 by Brian Aker
Merge Monty's code.
348
        overflow= true;
1 by brian
clean slate
349
        break;
350
      }
351
      while (tmp_length--)
352
	*to++= *from++;
353
      from--;
354
      continue;
355
    }
356
    /*
357
      We don't have the same issue here with a non-multi-byte character being
358
      turned into a multi-byte character by the addition of an escaping
359
      character, because we are only escaping the ' character with itself.
360
     */
361
    if (*from == '\'')
362
    {
363
      if (to + 2 > to_end)
364
      {
163 by Brian Aker
Merge Monty's code.
365
        overflow= true;
1 by brian
clean slate
366
        break;
367
      }
368
      *to++= '\'';
369
      *to++= '\'';
370
    }
371
    else
372
    {
373
      if (to + 1 > to_end)
374
      {
163 by Brian Aker
Merge Monty's code.
375
        overflow= true;
1 by brian
clean slate
376
        break;
377
      }
378
      *to++= *from;
379
    }
380
  }
381
  *to= 0;
365.2.9 by Monty Taylor
Got rid of all instances of ~0
382
  return overflow ? UINT32_MAX : (uint32_t) (to - to_start);
1 by brian
clean slate
383
}
1280.1.10 by Monty Taylor
Put everything in drizzled into drizzled namespace.
384
385
} /* namespace drizzled */