~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/* Copyright (C) 2000 MySQL AB
2
3
   This program is free software; you can redistribute it and/or modify
4
   it under the terms of the GNU General Public License as published by
5
   the Free Software Foundation; version 2 of the License.
6
7
   This program is distributed in the hope that it will be useful,
8
   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
   GNU General Public License for more details.
11
12
   You should have received a copy of the GNU General Public License
13
   along with this program; if not, write to the Free Software
14
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
15
16
#include <my_global.h>
17
#include <m_ctype.h>
18
#include <my_xml.h>
19
#ifndef SCO
20
#include <m_string.h>
21
#endif
22
23
24
/*
25
26
  This file implements routines which parse XML-based
27
  character set and collation description files.
28
  
29
  Unicode collations are encoded according to
30
  
31
    Unicode Technical Standard #35
32
    Locale Data Markup Language (LDML)
33
    http://www.unicode.org/reports/tr35/
34
  
35
  and converted into ICU string according to
36
  
37
    Collation Customization
38
    http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
39
  
40
*/
41
42
/*
  Bounded copy with termination.

  Copies min(l1, l2) bytes from src into str and appends a '\0'.
  str must therefore have room for min(l1, l2) + 1 bytes.

  Returns str.
*/
static char *mstr(char *str,const char *src,uint l1,uint l2)
{
  uint ncopy= l1;

  if (l2 <= ncopy)
    ncopy= l2;

  memcpy(str, src, ncopy);
  *(str + ncopy)= '\0';
  return str;
}
49
50
/*
  One row of the XML path lookup table: associates the full,
  slash-separated path of an element or attribute with a _CS_* state code.
  Field order matters: the table is filled with positional initializers.
*/
struct my_cs_file_section_st
{
  int        state;   /* _CS_* code; 0 in the terminating row */
  const char *str;    /* XML path; NULL in the terminating row */
};
55
56
/*
  State codes attached to the XML paths in the section table.

  _CS_MISC marks paths that are recognised but carry nothing to store.
  _CS_RESET, _CS_DIFF1, _CS_DIFF2 and _CS_DIFF3 must stay consecutive and
  in this order: cs_value() indexes its operator table with
  (state - _CS_RESET).
*/
enum
{
  _CS_MISC=        1,
  _CS_ID=          2,
  _CS_CSNAME=      3,
  _CS_FAMILY=      4,
  _CS_ORDER=       5,
  _CS_COLNAME=     6,
  _CS_FLAG=        7,
  _CS_CHARSET=     8,
  _CS_COLLATION=   9,
  _CS_UPPERMAP=    10,
  _CS_LOWERMAP=    11,
  _CS_UNIMAP=      12,
  _CS_COLLMAP=     13,
  _CS_CTYPEMAP=    14,
  _CS_PRIMARY_ID=  15,
  _CS_BINARY_ID=   16,
  _CS_CSDESCRIPT=  17,
  _CS_RESET=       18,
  _CS_DIFF1=       19,
  _CS_DIFF2=       20,
  _CS_DIFF3=       21
};
77
78
79
static struct my_cs_file_section_st sec[] =
80
{
81
  {_CS_MISC,		"xml"},
82
  {_CS_MISC,		"xml/version"},
83
  {_CS_MISC,		"xml/encoding"},
84
  {_CS_MISC,		"charsets"},
85
  {_CS_MISC,		"charsets/max-id"},
86
  {_CS_CHARSET,		"charsets/charset"},
87
  {_CS_PRIMARY_ID,	"charsets/charset/primary-id"},
88
  {_CS_BINARY_ID,	"charsets/charset/binary-id"},
89
  {_CS_CSNAME,		"charsets/charset/name"},
90
  {_CS_FAMILY,		"charsets/charset/family"},
91
  {_CS_CSDESCRIPT,	"charsets/charset/description"},
92
  {_CS_MISC,		"charsets/charset/alias"},
93
  {_CS_MISC,		"charsets/charset/ctype"},
94
  {_CS_CTYPEMAP,	"charsets/charset/ctype/map"},
95
  {_CS_MISC,		"charsets/charset/upper"},
96
  {_CS_UPPERMAP,	"charsets/charset/upper/map"},
97
  {_CS_MISC,		"charsets/charset/lower"},
98
  {_CS_LOWERMAP,	"charsets/charset/lower/map"},
99
  {_CS_MISC,		"charsets/charset/unicode"},
100
  {_CS_UNIMAP,		"charsets/charset/unicode/map"},
101
  {_CS_COLLATION,	"charsets/charset/collation"},
102
  {_CS_COLNAME,		"charsets/charset/collation/name"},
103
  {_CS_ID,		"charsets/charset/collation/id"},
104
  {_CS_ORDER,		"charsets/charset/collation/order"},
105
  {_CS_FLAG,		"charsets/charset/collation/flag"},
106
  {_CS_COLLMAP,		"charsets/charset/collation/map"},
107
  {_CS_RESET,		"charsets/charset/collation/rules/reset"},
108
  {_CS_DIFF1,		"charsets/charset/collation/rules/p"},
109
  {_CS_DIFF2,		"charsets/charset/collation/rules/s"},
110
  {_CS_DIFF3,		"charsets/charset/collation/rules/t"},
111
  {0,	NULL}
112
};
113
114
static struct my_cs_file_section_st * cs_file_sec(const char *attr, size_t len)
115
{
116
  struct my_cs_file_section_st *s;
117
  for (s=sec; s->str; s++)
118
  {
119
    if (!strncmp(attr,s->str,len))
120
      return s;
121
  }
122
  return NULL;
123
}
124
125
/*
  Byte capacities (terminator included) of the loader's buffers for the
  character set description and for the accumulated tailoring string.
*/
enum
{
  MY_CS_CSDESCR_SIZE=   64,
  MY_CS_TAILORING_SIZE= 1024
};
127
128
typedef struct my_cs_file_info
129
{
130
  char   csname[MY_CS_NAME_SIZE];
131
  char   name[MY_CS_NAME_SIZE];
132
  uchar  ctype[MY_CS_CTYPE_TABLE_SIZE];
133
  uchar  to_lower[MY_CS_TO_LOWER_TABLE_SIZE];
134
  uchar  to_upper[MY_CS_TO_UPPER_TABLE_SIZE];
135
  uchar  sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
136
  uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
137
  char   comment[MY_CS_CSDESCR_SIZE];
138
  char   tailoring[MY_CS_TAILORING_SIZE];
139
  size_t tailoring_length;
140
  CHARSET_INFO cs;
141
  int (*add_collation)(CHARSET_INFO *cs);
142
} MY_CHARSET_LOADER;
143
144
145
146
/*
  Parse whitespace-separated hexadecimal numbers into a byte array.

  a     destination array
  size  number of elements a can hold
  str   text to parse
  len   number of bytes of str to scan

  Tokens are separated by space, tab, CR or LF.  Parsing stops at the end
  of the scanned text or once size values have been stored, whichever
  comes first; further tokens are ignored.  Each parsed value is
  truncated to uchar.

  Always returns 0.
*/
static int fill_uchar(uchar *a,uint size,const char *str, uint len)
{
  uint i= 0;
  const char *s, *b, *e=str+len;

  for (s=str ; s < e ; i++)
  {
    /* Skip separators, then walk to the end of the next token */
    for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
    b=s;
    for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;

    /*
      Stop on an empty token (end of text) or a full array.
      The bound is i >= size: a[size] is already one past the end.
    */
    if (s == b || i >= size)
      break;
    a[i]= (uchar) strtoul(b,NULL,16);
  }
  return 0;
}
162
163
/*
  Parse whitespace-separated hexadecimal numbers into a uint16 array.

  a     destination array
  size  number of elements a can hold
  str   text to parse
  len   number of bytes of str to scan

  Tokens are separated by space, tab, CR or LF.  Parsing stops at the end
  of the scanned text or once size values have been stored, whichever
  comes first; further tokens are ignored.  Each parsed value is
  truncated to uint16.

  Always returns 0.
*/
static int fill_uint16(uint16 *a,uint size,const char *str, size_t len)
{
  uint i= 0;

  const char *s, *b, *e=str+len;
  for (s=str ; s < e ; i++)
  {
    /* Skip separators, then walk to the end of the next token */
    for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
    b=s;
    for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;

    /*
      Stop on an empty token (end of text) or a full array.
      The bound is i >= size: a[size] is already one past the end.
    */
    if (s == b || i >= size)
      break;
    a[i]= (uint16) strtol(b,NULL,16);
  }
  return 0;
}
179
180
181
/*
  Enter handler registered with the XML parser.

  attr/len is looked up in the section table.  A charset path clears the
  CHARSET_INFO being assembled; a collation path resets the accumulated
  tailoring length to zero.  Every other path is ignored.

  Always returns MY_XML_OK.
*/
static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *info= (struct my_cs_file_info *) st->user_data;
  struct my_cs_file_section_st *section= cs_file_sec(attr, len);

  if (section)
  {
    switch (section->state) {
    case _CS_CHARSET:
      bzero(&info->cs, sizeof(info->cs));
      break;
    case _CS_COLLATION:
      info->tailoring_length= 0;
      break;
    default:
      break;
    }
  }

  return MY_XML_OK;
}
194
195
196
/*
  Leave handler registered with the XML parser.

  When the path being closed is a collation and an add_collation callback
  was supplied, the assembled CHARSET_INFO is handed to the callback and
  its result is returned.  In every other case the result is MY_XML_OK.
*/
static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *info= (struct my_cs_file_info *) st->user_data;
  struct my_cs_file_section_st *section= cs_file_sec(attr, len);

  if (section && section->state == _CS_COLLATION && info->add_collation)
    return info->add_collation(&info->cs);

  return MY_XML_OK;
}
212
213
214
/*
  Value handler registered with the XML parser.

  attr/len is the value text.  The path it belongs to is taken from
  st->attr and looked up in the section table; the resulting state
  decides where the value is stored:

    - ids are parsed as decimal numbers into cs,
    - names and the description are copied (truncated to the buffer
      size) into the loader and referenced from cs,
    - flags set bits in cs.state,
    - maps are parsed as hex lists into the loader tables and
      referenced from cs,
    - reset/p/s/t rules are appended to the tailoring string.

  Values of any other path are ignored.  Always returns MY_XML_OK.
*/
static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *info= (struct my_cs_file_info *) st->user_data;
  struct my_cs_file_section_st *section= cs_file_sec(st->attr,
                                                     strlen(st->attr));
  int state= section ? (int) section->state : 0;

  switch (state) {

  /* Numeric identifiers */
  case _CS_ID:
    info->cs.number= strtol(attr, (char**) NULL, 10);
    break;
  case _CS_BINARY_ID:
    info->cs.binary_number= strtol(attr, (char**) NULL, 10);
    break;
  case _CS_PRIMARY_ID:
    info->cs.primary_number= strtol(attr, (char**) NULL, 10);
    break;

  /* Text attributes: copy into loader-owned storage */
  case _CS_COLNAME:
    info->cs.name= mstr(info->name, attr, len, MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSNAME:
    info->cs.csname= mstr(info->csname, attr, len, MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSDESCRIPT:
    info->cs.comment= mstr(info->comment, attr, len, MY_CS_CSDESCR_SIZE-1);
    break;

  /*
    Flags.
    NOTE(review): strncmp() limited to len accepts any value that is a
    prefix of the keyword, an empty value included.
  */
  case _CS_FLAG:
    if (strncmp("primary", attr, len) == 0)
      info->cs.state|= MY_CS_PRIMARY;
    else if (strncmp("binary", attr, len) == 0)
      info->cs.state|= MY_CS_BINSORT;
    else if (strncmp("compiled", attr, len) == 0)
      info->cs.state|= MY_CS_COMPILED;
    break;

  /* Hex-encoded tables */
  case _CS_UPPERMAP:
    fill_uchar(info->to_upper, MY_CS_TO_UPPER_TABLE_SIZE, attr, len);
    info->cs.to_upper= info->to_upper;
    break;
  case _CS_LOWERMAP:
    fill_uchar(info->to_lower, MY_CS_TO_LOWER_TABLE_SIZE, attr, len);
    info->cs.to_lower= info->to_lower;
    break;
  case _CS_UNIMAP:
    fill_uint16(info->tab_to_uni, MY_CS_TO_UNI_TABLE_SIZE, attr, len);
    info->cs.tab_to_uni= info->tab_to_uni;
    break;
  case _CS_COLLMAP:
    fill_uchar(info->sort_order, MY_CS_SORT_ORDER_TABLE_SIZE, attr, len);
    info->cs.sort_order= info->sort_order;
    break;
  case _CS_CTYPEMAP:
    fill_uchar(info->ctype, MY_CS_CTYPE_TABLE_SIZE, attr, len);
    info->cs.ctype= info->ctype;
    break;

  /*
    Tailoring rules: translate the LDML element into the matching
    ICU Collation Customization operator followed by its operand.
  */
  case _CS_RESET:
  case _CS_DIFF1:
  case _CS_DIFF2:
  case _CS_DIFF3:
    {
      char operand[16];
      /* Indexed by (state - _CS_RESET): reset, p, s, t */
      const char *icu_op[]= {"&","<","<<","<<<"};

      info->cs.tailoring= info->tailoring;
      mstr(operand, attr, len, sizeof(operand)-1);

      /*
        Longest possible addition is 20 characters
        (space + "<<<" + space + 15 operand bytes) plus the terminator.
        A rule that does not fit is dropped.
      */
      if (info->tailoring_length + 20 < sizeof(info->tailoring))
      {
        char *dst= info->tailoring + info->tailoring_length;
        info->tailoring_length+= sprintf(dst, " %s %s",
                                         icu_op[state-_CS_RESET], operand);
      }
    }
    break;

  default:
    break;
  }

  return MY_XML_OK;
}
291
292
293
my_bool my_parse_charset_xml(const char *buf, size_t len,
294
                             int (*add_collation)(CHARSET_INFO *cs))
295
{
296
  MY_XML_PARSER p;
297
  struct my_cs_file_info i;
298
  my_bool rc;
299
  
300
  my_xml_parser_create(&p);
301
  my_xml_set_enter_handler(&p,cs_enter);
302
  my_xml_set_value_handler(&p,cs_value);
303
  my_xml_set_leave_handler(&p,cs_leave);
304
  i.add_collation= add_collation;
305
  my_xml_set_user_data(&p,(void*)&i);
306
  rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE;
307
  my_xml_parser_free(&p);
308
  return rc;
309
}
310
311
312
/*
313
  Check repertoire: detect pure ascii strings
314
*/
315
uint
316
my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong length)
317
{
318
  const char *strend= str + length;
319
  if (cs->mbminlen == 1)
320
  {
321
    for ( ; str < strend; str++)
322
    {
323
      if (((uchar) *str) > 0x7F)
324
        return MY_REPERTOIRE_UNICODE30;
325
    }
326
  }
327
  else
328
  {
329
    my_wc_t wc;
330
    int chlen;
331
    for (; (chlen= cs->cset->mb_wc(cs, &wc, str, strend)) > 0; str+= chlen)
332
    {
333
      if (wc > 0x7F)
334
        return MY_REPERTOIRE_UNICODE30;
335
    }
336
  }
337
  return MY_REPERTOIRE_ASCII;
338
}
339
340
341
/*
342
  Detect whether a character set is ASCII compatible.
343
344
  Returns TRUE for:
345
  
346
  - all 8bit character sets whose Unicode mapping of 0x7B is '{'
347
    (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")
348
  
349
  - all multi-byte character sets having mbminlen == 1
350
    (ignores ucs2 whose mbminlen is 2)
351
  
352
  TODO:
353
  
354
  When merging to 5.2, this function should be changed
355
  to check a new flag MY_CS_NONASCII, 
356
  
357
     return (cs->flag & MY_CS_NONASCII) ? 0 : 1;
358
  
359
  This flag was previously added into 5.2 under terms
360
  of WL#3759 "Optimize identifier conversion in client-server protocol"
361
  especially to mark character sets not compatible with ASCII.
362
  
363
  We won't backport this flag to 5.0 or 5.1.
364
  This function is Ok for 5.0 and 5.1, because we're not going
365
  to introduce new tricky character sets between 5.0 and 5.2.
366
*/
367
my_bool
368
my_charset_is_ascii_based(CHARSET_INFO *cs)
369
{
370
  return 
371
    (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') ||
372
    (cs->mbminlen == 1 && cs->mbmaxlen > 1);
373
}
374
375
376
/*
377
  Detect if a character set is 8bit,
378
  and it is pure ascii, i.e. doesn't have
379
  characters outside U+0000..U+007F
380
  This functions is shared between "conf_to_src"
381
  and dynamic charsets loader in "mysqld".
382
*/
383
my_bool
384
my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs)
385
{
386
  size_t code;
387
  if (!cs->tab_to_uni)
388
    return 0;
389
  for (code= 0; code < 256; code++)
390
  {
391
    if (cs->tab_to_uni[code] > 0x7F)
392
      return 0;
393
  }
394
  return 1;
395
}
396
397
398
/*
399
  Shared function between conf_to_src and mysys.
400
  Check if a 8bit character set is compatible with
401
  ascii on the range 0x00..0x7F.
402
*/
403
my_bool
404
my_charset_is_ascii_compatible(CHARSET_INFO *cs)
405
{
406
  uint i;
407
  if (!cs->tab_to_uni)
408
    return 1;
409
  for (i= 0; i < 128; i++)
410
  {
411
    if (cs->tab_to_uni[i] != i)
412
      return 0;
413
  }
414
  return 1;
415
}