~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/* Copyright (C) 2000 MySQL AB
2
3
   This program is free software; you can redistribute it and/or modify
4
   it under the terms of the GNU General Public License as published by
5
   the Free Software Foundation; version 2 of the License.
6
7
   This program is distributed in the hope that it will be useful,
8
   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
   GNU General Public License for more details.
11
12
   You should have received a copy of the GNU General Public License
13
   along with this program; if not, write to the Free Software
14
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
15
212.5.39 by Monty Taylor
Phew. Moved my_base and my_global.
16
#include <m_string.h>
1 by brian
clean slate
17
#include <m_ctype.h>
18
#include <my_xml.h>
19
20
21
/*
  This file implements routines which parse XML-based
  character set and collation description files.

  Unicode collations are encoded according to

    Unicode Technical Standard #35
    Locale Data Markup Language (LDML)
    http://www.unicode.org/reports/tr35/

  and converted into an ICU string according to

    Collation Customization
    http://oss.software.ibm.com/icu/userguide/Collate_Customization.html

*/
38
39
/*
  Bounded copy: store the first min(l1, l2) bytes of src in str and
  append a terminating NUL.

  str  destination; must have room for min(l1, l2) + 1 bytes
  src  source bytes (need not be NUL-terminated)
  l1   number of bytes available in src
  l2   maximum number of bytes to copy

  Returns str.
*/
static char *mstr(char *str,const char *src,uint l1,uint l2)
{
  uint ncopy= l2;

  if (l1 < l2)
    ncopy= l1;

  memcpy(str, src, ncopy);
  str[ncopy]= '\0';
  return str;
}
46
47
/*
  One entry of the path -> state lookup table (see sec[]).
  Field order matters: the table uses positional initializers.
*/
struct my_cs_file_section_st
{
  int        state;  /* one of the _CS_* state codes                 */
  const char *str;   /* '/'-separated path; NULL in the end marker   */
};
52
53
/*
  Parser state codes stored in sec[].state.
  0 is not a member: it is used for "no matching section".
  _CS_RESET.._CS_DIFF3 must stay consecutive; cs_value() indexes an
  operator table with (state - _CS_RESET).
*/
enum
{
  _CS_MISC=       1,
  _CS_ID=         2,
  _CS_CSNAME=     3,
  _CS_FAMILY=     4,
  _CS_ORDER=      5,
  _CS_COLNAME=    6,
  _CS_FLAG=       7,
  _CS_CHARSET=    8,
  _CS_COLLATION=  9,
  _CS_UPPERMAP=   10,
  _CS_LOWERMAP=   11,
  _CS_UNIMAP=     12,
  _CS_COLLMAP=    13,
  _CS_CTYPEMAP=   14,
  _CS_PRIMARY_ID= 15,
  _CS_BINARY_ID=  16,
  _CS_CSDESCRIPT= 17,
  _CS_RESET=      18,
  _CS_DIFF1=      19,
  _CS_DIFF2=      20,
  _CS_DIFF3=      21
};
74
75
76
static struct my_cs_file_section_st sec[] =
77
{
78
  {_CS_MISC,		"xml"},
79
  {_CS_MISC,		"xml/version"},
80
  {_CS_MISC,		"xml/encoding"},
81
  {_CS_MISC,		"charsets"},
82
  {_CS_MISC,		"charsets/max-id"},
83
  {_CS_CHARSET,		"charsets/charset"},
84
  {_CS_PRIMARY_ID,	"charsets/charset/primary-id"},
85
  {_CS_BINARY_ID,	"charsets/charset/binary-id"},
86
  {_CS_CSNAME,		"charsets/charset/name"},
87
  {_CS_FAMILY,		"charsets/charset/family"},
88
  {_CS_CSDESCRIPT,	"charsets/charset/description"},
89
  {_CS_MISC,		"charsets/charset/alias"},
90
  {_CS_MISC,		"charsets/charset/ctype"},
91
  {_CS_CTYPEMAP,	"charsets/charset/ctype/map"},
92
  {_CS_MISC,		"charsets/charset/upper"},
93
  {_CS_UPPERMAP,	"charsets/charset/upper/map"},
94
  {_CS_MISC,		"charsets/charset/lower"},
95
  {_CS_LOWERMAP,	"charsets/charset/lower/map"},
96
  {_CS_MISC,		"charsets/charset/unicode"},
97
  {_CS_UNIMAP,		"charsets/charset/unicode/map"},
98
  {_CS_COLLATION,	"charsets/charset/collation"},
99
  {_CS_COLNAME,		"charsets/charset/collation/name"},
100
  {_CS_ID,		"charsets/charset/collation/id"},
101
  {_CS_ORDER,		"charsets/charset/collation/order"},
102
  {_CS_FLAG,		"charsets/charset/collation/flag"},
103
  {_CS_COLLMAP,		"charsets/charset/collation/map"},
104
  {_CS_RESET,		"charsets/charset/collation/rules/reset"},
105
  {_CS_DIFF1,		"charsets/charset/collation/rules/p"},
106
  {_CS_DIFF2,		"charsets/charset/collation/rules/s"},
107
  {_CS_DIFF3,		"charsets/charset/collation/rules/t"},
108
  {0,	NULL}
109
};
110
111
static struct my_cs_file_section_st * cs_file_sec(const char *attr, size_t len)
112
{
113
  struct my_cs_file_section_st *s;
114
  for (s=sec; s->str; s++)
115
  {
116
    if (!strncmp(attr,s->str,len))
117
      return s;
118
  }
119
  return NULL;
120
}
121
122
#define MY_CS_CSDESCR_SIZE	64
123
#define MY_CS_TAILORING_SIZE	1024
124
125
typedef struct my_cs_file_info
126
{
127
  char   csname[MY_CS_NAME_SIZE];
128
  char   name[MY_CS_NAME_SIZE];
129
  uchar  ctype[MY_CS_CTYPE_TABLE_SIZE];
130
  uchar  to_lower[MY_CS_TO_LOWER_TABLE_SIZE];
131
  uchar  to_upper[MY_CS_TO_UPPER_TABLE_SIZE];
132
  uchar  sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
206 by Brian Aker
Removed final uint dead types.
133
  uint16_t tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
1 by brian
clean slate
134
  char   comment[MY_CS_CSDESCR_SIZE];
135
  char   tailoring[MY_CS_TAILORING_SIZE];
136
  size_t tailoring_length;
137
  CHARSET_INFO cs;
138
  int (*add_collation)(CHARSET_INFO *cs);
139
} MY_CHARSET_LOADER;
140
141
142
143
/*
  Parse separator-delimited hexadecimal numbers from str into a[].

  a     destination array with room for `size` elements
  size  capacity of a, in elements
  str   input text; only the first len bytes are scanned
  len   number of bytes of str to scan

  Tokens are separated by ' ', '\t', '\r' or '\n'.  A NUL byte is treated
  as a separator too, because strchr() also matches the terminator of the
  separator list.  Each token is converted with strtoul(..., 16) and
  truncated to uchar.

  Parsing stops at the end of the input or as soon as `size` values have
  been stored.  The capacity test is ">=" so that a[size], one element
  past the end of the array, is never written.

  Always returns 0.
*/
static int fill_uchar(uchar *a,uint size,const char *str, uint len)
{
  uint i= 0;
  const char *s, *b, *e= str + len;

  for (s= str; s < e; i++)
  {
    /* Skip separators in front of the next token. */
    for ( ; (s < e) && strchr(" \t\r\n", s[0]); s++) ;
    b= s;
    /* Advance to the first separator after the token. */
    for ( ; (s < e) && !strchr(" \t\r\n", s[0]); s++) ;
    if (s == b || i >= size)
      break;
    a[i]= (uchar) strtoul(b, NULL, 16);
  }
  return 0;
}
159
206 by Brian Aker
Removed final uint dead types.
160
/*
  Parse separator-delimited hexadecimal numbers from str into a[].

  a     destination array with room for `size` elements
  size  capacity of a, in elements
  str   input text; only the first len bytes are scanned
  len   number of bytes of str to scan

  Tokens are separated by ' ', '\t', '\r' or '\n'.  A NUL byte is treated
  as a separator too, because strchr() also matches the terminator of the
  separator list.  Each token is converted with strtol(..., 16) and
  truncated to uint16_t.

  Parsing stops at the end of the input or as soon as `size` values have
  been stored.  The capacity test is ">=" so that a[size], one element
  past the end of the array, is never written.

  Always returns 0.
*/
static int fill_uint16(uint16_t *a,uint size,const char *str, size_t len)
{
  uint i= 0;
  const char *s, *b, *e= str + len;

  for (s= str; s < e; i++)
  {
    /* Skip separators in front of the next token. */
    for ( ; (s < e) && strchr(" \t\r\n", s[0]); s++) ;
    b= s;
    /* Advance to the first separator after the token. */
    for ( ; (s < e) && !strchr(" \t\r\n", s[0]); s++) ;
    if (s == b || i >= size)
      break;
    a[i]= (uint16_t) strtol(b, NULL, 16);
  }
  return 0;
}
176
177
178
/*
  Element-enter callback (installed with my_xml_set_enter_handler()).

  Entering a charset section zeroes the CHARSET_INFO being assembled;
  entering a collation section empties the tailoring buffer.
  Every other path is ignored.  Always returns MY_XML_OK.
*/
static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *loader= (struct my_cs_file_info *) st->user_data;
  struct my_cs_file_section_st *section= cs_file_sec(attr, len);

  if (section)
  {
    switch (section->state) {
    case _CS_CHARSET:
      memset(&loader->cs, 0, sizeof(loader->cs));
      break;
    case _CS_COLLATION:
      loader->tailoring_length= 0;
      break;
    default:
      break;
    }
  }
  return MY_XML_OK;
}
191
192
193
/*
  Element-leave callback (installed with my_xml_set_leave_handler()).

  When a collation section is closed and an add_collation callback is
  set, the assembled CHARSET_INFO is passed to it and its result is
  returned.  In every other case MY_XML_OK is returned.
*/
static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *loader= (struct my_cs_file_info *) st->user_data;
  struct my_cs_file_section_st *section= cs_file_sec(attr, len);

  if (section && section->state == _CS_COLLATION && loader->add_collation)
    return loader->add_collation(&loader->cs);

  return MY_XML_OK;
}
209
210
211
/*
  Value callback (installed with my_xml_set_value_handler()).

  st    parser; st->attr is the path of the current element/attribute
        and st->user_data is the struct my_cs_file_info being filled
  attr  the value text (len bytes)
  len   length of the value text

  The section is looked up from st->attr, and the value is stored in the
  matching field of the loader's CHARSET_INFO.  Values for unknown or
  uninteresting paths are ignored.  Always returns MY_XML_OK.
*/
static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *loader= (struct my_cs_file_info *) st->user_data;
  struct my_cs_file_section_st *section=
    cs_file_sec(st->attr, strlen(st->attr));
  const int state= section ? section->state : 0;

  switch (state) {

  /* Numeric identifiers, parsed as decimal. */
  case _CS_ID:
    loader->cs.number= strtol(attr, (char**) NULL, 10);
    break;
  case _CS_BINARY_ID:
    loader->cs.binary_number= strtol(attr, (char**) NULL, 10);
    break;
  case _CS_PRIMARY_ID:
    loader->cs.primary_number= strtol(attr, (char**) NULL, 10);
    break;

  /* Strings: copied (truncated if needed) into loader-owned buffers. */
  case _CS_COLNAME:
    loader->cs.name= mstr(loader->name, attr, len, MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSNAME:
    loader->cs.csname= mstr(loader->csname, attr, len, MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSDESCRIPT:
    loader->cs.comment= mstr(loader->comment, attr, len, MY_CS_CSDESCR_SIZE-1);
    break;

  /*
    Collation flags.
    NOTE(review): the comparison is limited to len bytes, so a value that
    is only a prefix of a keyword matches it as well.
  */
  case _CS_FLAG:
    if (strncmp("primary", attr, len) == 0)
      loader->cs.state|= MY_CS_PRIMARY;
    else if (strncmp("binary", attr, len) == 0)
      loader->cs.state|= MY_CS_BINSORT;
    else if (strncmp("compiled", attr, len) == 0)
      loader->cs.state|= MY_CS_COMPILED;
    break;

  /* Mapping tables: lists of hexadecimal numbers. */
  case _CS_UPPERMAP:
    fill_uchar(loader->to_upper, MY_CS_TO_UPPER_TABLE_SIZE, attr, len);
    loader->cs.to_upper= loader->to_upper;
    break;
  case _CS_LOWERMAP:
    fill_uchar(loader->to_lower, MY_CS_TO_LOWER_TABLE_SIZE, attr, len);
    loader->cs.to_lower= loader->to_lower;
    break;
  case _CS_UNIMAP:
    fill_uint16(loader->tab_to_uni, MY_CS_TO_UNI_TABLE_SIZE, attr, len);
    loader->cs.tab_to_uni= loader->tab_to_uni;
    break;
  case _CS_COLLMAP:
    fill_uchar(loader->sort_order, MY_CS_SORT_ORDER_TABLE_SIZE, attr, len);
    loader->cs.sort_order= loader->sort_order;
    break;
  case _CS_CTYPEMAP:
    fill_uchar(loader->ctype, MY_CS_CTYPE_TABLE_SIZE, attr, len);
    loader->cs.ctype= loader->ctype;
    break;

  case _CS_RESET:
  case _CS_DIFF1:
  case _CS_DIFF2:
  case _CS_DIFF3:
    {
      /*
        Convert collation description from
        Locale Data Markup Language (LDML)
        into ICU Collation Customization expression.

        The operator table is indexed by (state - _CS_RESET):
        reset -> "&", p -> "<", s -> "<<", t -> "<<<".
      */
      const char *ldml_op[]= {"&","<","<<","<<<"};
      char operand[16];

      loader->cs.tailoring= loader->tailoring;
      mstr(operand, attr, len, sizeof(operand)-1);

      /*
        Longest possible piece: ' ' + 3-char operator + ' ' + 15-char
        operand = 20 characters, plus the NUL written by sprintf().
        Pieces that would not fit are dropped.
      */
      if (loader->tailoring_length + 20 < sizeof(loader->tailoring))
      {
        char *dst= loader->tailoring + loader->tailoring_length;
        loader->tailoring_length+= sprintf(dst," %s %s",
                                           ldml_op[state - _CS_RESET],
                                           operand);
      }
      break;
    }

  default:
    break;
  }
  return MY_XML_OK;
}
288
289
276 by Brian Aker
Cleaned out my_bool from strings.
290
bool my_parse_charset_xml(const char *buf, size_t len,
1 by brian
clean slate
291
                             int (*add_collation)(CHARSET_INFO *cs))
292
{
293
  MY_XML_PARSER p;
294
  struct my_cs_file_info i;
276 by Brian Aker
Cleaned out my_bool from strings.
295
  bool rc;
1 by brian
clean slate
296
  
297
  my_xml_parser_create(&p);
298
  my_xml_set_enter_handler(&p,cs_enter);
299
  my_xml_set_value_handler(&p,cs_value);
300
  my_xml_set_leave_handler(&p,cs_leave);
301
  i.add_collation= add_collation;
302
  my_xml_set_user_data(&p,(void*)&i);
163 by Brian Aker
Merge Monty's code.
303
  rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? false : true;
1 by brian
clean slate
304
  my_xml_parser_free(&p);
305
  return rc;
306
}
307
308
309
/*
310
  Check repertoire: detect pure ascii strings
311
*/
312
uint
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
313
my_string_repertoire(const CHARSET_INFO * const cs, const char *str, ulong length)
1 by brian
clean slate
314
{
315
  const char *strend= str + length;
316
  if (cs->mbminlen == 1)
317
  {
318
    for ( ; str < strend; str++)
319
    {
320
      if (((uchar) *str) > 0x7F)
321
        return MY_REPERTOIRE_UNICODE30;
322
    }
323
  }
324
  else
325
  {
326
    my_wc_t wc;
327
    int chlen;
53.2.11 by Monty Taylor
Added cast for a type-signedness problem.
328
    for (; (chlen= cs->cset->mb_wc(cs, &wc, (uchar *)str, (uchar *)strend)) > 0; str+= chlen)
1 by brian
clean slate
329
    {
330
      if (wc > 0x7F)
331
        return MY_REPERTOIRE_UNICODE30;
332
    }
333
  }
334
  return MY_REPERTOIRE_ASCII;
335
}
336
337
338
/*
  Detect whether a character set is ASCII compatible.

  Returns true for:

  - all 8bit character sets whose Unicode mapping of 0x7B is '{'
    (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")

  - all multi-byte character sets having mbminlen == 1
    (ignores ucs2 whose mbminlen is 2)

  TODO:

  When merging to 5.2, this function should be changed
  to check a new flag MY_CS_NONASCII,

     return (cs->flag & MY_CS_NONASCII) ? 0 : 1;

  This flag was previously added into 5.2 under terms
  of WL#3759 "Optimize identifier conversion in client-server protocol"
  especially to mark character sets not compatible with ASCII.

  We won't backport this flag to 5.0 or 5.1.
  This function is Ok for 5.0 and 5.1, because we're not going
  to introduce new tricky character sets between 5.0 and 5.2.
*/
276 by Brian Aker
Cleaned out my_bool from strings.
364
bool
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
365
my_charset_is_ascii_based(const CHARSET_INFO * const cs)
1 by brian
clean slate
366
{
367
  return 
368
    (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') ||
369
    (cs->mbminlen == 1 && cs->mbmaxlen > 1);
370
}
371
372
373
/*
  Detect if a character set is 8bit,
  and it is pure ascii, i.e. doesn't have
  characters outside U+0000..U+007F
  This function is shared between "conf_to_src"
  and dynamic charsets loader in "mysqld".
*/
276 by Brian Aker
Cleaned out my_bool from strings.
380
bool
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
381
my_charset_is_8bit_pure_ascii(const CHARSET_INFO * const cs)
1 by brian
clean slate
382
{
383
  size_t code;
384
  if (!cs->tab_to_uni)
385
    return 0;
386
  for (code= 0; code < 256; code++)
387
  {
388
    if (cs->tab_to_uni[code] > 0x7F)
389
      return 0;
390
  }
391
  return 1;
392
}
393
394
395
/*
  Shared function between conf_to_src and mysys.
  Check if an 8bit character set is compatible with
  ASCII on the range 0x00..0x7F.
*/
276 by Brian Aker
Cleaned out my_bool from strings.
400
bool
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
401
my_charset_is_ascii_compatible(const CHARSET_INFO * const cs)
1 by brian
clean slate
402
{
403
  uint i;
404
  if (!cs->tab_to_uni)
405
    return 1;
406
  for (i= 0; i < 128; i++)
407
  {
408
    if (cs->tab_to_uni[i] != i)
409
      return 0;
410
  }
411
  return 1;
412
}