~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/* Copyright (C) 2000 MySQL AB
2
3
   This program is free software; you can redistribute it and/or modify
4
   it under the terms of the GNU General Public License as published by
5
   the Free Software Foundation; version 2 of the License.
6
7
   This program is distributed in the hope that it will be useful,
8
   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
   GNU General Public License for more details.
11
12
   You should have received a copy of the GNU General Public License
13
   along with this program; if not, write to the Free Software
14
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
15
685.1.3 by Monty Taylor
Turned off stdinc - and then fixed the carnage.
16
#include "m_string.h"
17
#include "m_ctype.h"
1 by brian
clean slate
18
19
20
/*
  This file implements routines which parse XML based
  character set and collation description files.

  Unicode collations are encoded according to

    Unicode Technical Standard #35
    Locale Data Markup Language (LDML)
    http://www.unicode.org/reports/tr35/

  and converted into ICU string according to

    Collation Customization
    http://oss.software.ibm.com/icu/userguide/Collate_Customization.html

*/
37
38
/*
  Pairs an integer state code with a string.

  NOTE(review): given the file's purpose (parsing XML charset
  descriptions) this presumably maps section/tag names to parser
  states - confirm against the table that uses this struct.
*/
struct my_cs_file_section_st
{
  int        state;  /* integer state code paired with str */
  const char *str;   /* string paired with state; not owned by the struct */
};
43
44
45
#define MY_CS_CSDESCR_SIZE	64
46
#define MY_CS_TAILORING_SIZE	1024
47
48
typedef struct my_cs_file_info
49
{
50
  char   csname[MY_CS_NAME_SIZE];
51
  char   name[MY_CS_NAME_SIZE];
481 by Brian Aker
Remove all of uchar.
52
  unsigned char  ctype[MY_CS_CTYPE_TABLE_SIZE];
53
  unsigned char  to_lower[MY_CS_TO_LOWER_TABLE_SIZE];
54
  unsigned char  to_upper[MY_CS_TO_UPPER_TABLE_SIZE];
55
  unsigned char  sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
206 by Brian Aker
Removed final uint dead types.
56
  uint16_t tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
1 by brian
clean slate
57
  char   comment[MY_CS_CSDESCR_SIZE];
58
  char   tailoring[MY_CS_TAILORING_SIZE];
59
  size_t tailoring_length;
60
  CHARSET_INFO cs;
61
  int (*add_collation)(CHARSET_INFO *cs);
62
} MY_CHARSET_LOADER;
63
64
65
66
/*
67
  Check repertoire: detect pure ascii strings
68
*/
69
uint
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
70
my_string_repertoire(const CHARSET_INFO * const cs, const char *str, ulong length)
1 by brian
clean slate
71
{
72
  const char *strend= str + length;
73
  if (cs->mbminlen == 1)
74
  {
75
    for ( ; str < strend; str++)
76
    {
481 by Brian Aker
Remove all of uchar.
77
      if (((unsigned char) *str) > 0x7F)
1 by brian
clean slate
78
        return MY_REPERTOIRE_UNICODE30;
79
    }
80
  }
81
  else
82
  {
83
    my_wc_t wc;
84
    int chlen;
481 by Brian Aker
Remove all of uchar.
85
    for (; (chlen= cs->cset->mb_wc(cs, &wc, (unsigned char *)str, (unsigned char *)strend)) > 0; str+= chlen)
1 by brian
clean slate
86
    {
87
      if (wc > 0x7F)
88
        return MY_REPERTOIRE_UNICODE30;
89
    }
90
  }
91
  return MY_REPERTOIRE_ASCII;
92
}
93
94
95
/*
96
  Detect whether a character set is ASCII compatible.
97
163 by Brian Aker
Merge Monty's code.
98
  Returns true for:
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
99
1 by brian
clean slate
100
  - all 8bit character sets whose Unicode mapping of 0x7B is '{'
101
    (ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
102
1 by brian
clean slate
103
  - all multi-byte character sets having mbminlen == 1
104
    (ignores ucs2 whose mbminlen is 2)
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
105
1 by brian
clean slate
106
  TODO:
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
107
1 by brian
clean slate
108
  When merging to 5.2, this function should be changed
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
109
  to check a new flag MY_CS_NONASCII,
110
1 by brian
clean slate
111
     return (cs->flag & MY_CS_NONASCII) ? 0 : 1;
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
112
1 by brian
clean slate
113
  This flag was previously added into 5.2 under terms
114
  of WL#3759 "Optimize identifier conversion in client-server protocol"
115
  especially to mark character sets not compatible with ASCII.
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
116
1 by brian
clean slate
117
  We won't backport this flag to 5.0 or 5.1.
118
  This function is Ok for 5.0 and 5.1, because we're not going
119
  to introduce new tricky character sets between 5.0 and 5.2.
120
*/
276 by Brian Aker
Cleaned out my_bool from strings.
121
bool
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
122
my_charset_is_ascii_based(const CHARSET_INFO * const cs)
1 by brian
clean slate
123
{
660.1.3 by Eric Herman
removed trailing whitespace with simple script:
124
  return
1 by brian
clean slate
125
    (cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') ||
126
    (cs->mbminlen == 1 && cs->mbmaxlen > 1);
127
}
128
129
130
/*
131
  Detect if a character set is 8bit,
132
  and it is pure ascii, i.e. doesn't have
133
  characters outside U+0000..U+007F
134
  This functions is shared between "conf_to_src"
135
  and dynamic charsets loader in "mysqld".
136
*/
276 by Brian Aker
Cleaned out my_bool from strings.
137
bool
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
138
my_charset_is_8bit_pure_ascii(const CHARSET_INFO * const cs)
1 by brian
clean slate
139
{
140
  size_t code;
141
  if (!cs->tab_to_uni)
142
    return 0;
143
  for (code= 0; code < 256; code++)
144
  {
145
    if (cs->tab_to_uni[code] > 0x7F)
146
      return 0;
147
  }
148
  return 1;
149
}
150
151
152
/*
153
  Shared function between conf_to_src and mysys.
154
  Check if a 8bit character set is compatible with
155
  ascii on the range 0x00..0x7F.
156
*/
276 by Brian Aker
Cleaned out my_bool from strings.
157
bool
264.2.6 by Andrey Hristov
Constify the usage of CHARSET_INFO almost to the last place in the code.
158
my_charset_is_ascii_compatible(const CHARSET_INFO * const cs)
1 by brian
clean slate
159
{
482 by Brian Aker
Remove uint.
160
  uint32_t i;
1 by brian
clean slate
161
  if (!cs->tab_to_uni)
162
    return 1;
163
  for (i= 0; i < 128; i++)
164
  {
165
    if (cs->tab_to_uni[i] != i)
166
      return 0;
167
  }
168
  return 1;
169
}