1
by brian
clean slate |
1 |
CHARSET_INFO |
2 |
============ |
|
3 |
A structure containing data for charset+collation pair implementation. |
|
4 |
||
5 |
Virtual functions which use this data are collected |
|
6 |
into separate structures MY_CHARSET_HANDLER and |
|
7 |
MY_COLLATION_HANDLER. |
|
8 |
||
9 |
||
10 |
typedef struct charset_info_st |
|
11 |
{ |
|
12 |
uint number; |
|
13 |
uint primary_number; |
|
14 |
uint binary_number; |
|
15 |
uint state; |
|
16 |
||
17 |
const char *csname; |
|
18 |
const char *name; |
|
19 |
const char *comment; |
|
20 |
||
21 |
uchar *ctype; |
|
22 |
uchar *to_lower; |
|
23 |
uchar *to_upper; |
|
24 |
uchar *sort_order; |
|
25 |
||
26 |
uint16 *tab_to_uni; |
|
27 |
MY_UNI_IDX *tab_from_uni; |
|
28 |
||
29 |
uchar state_map[256]; |
|
30 |
uchar ident_map[256]; |
|
31 |
||
32 |
uint strxfrm_multiply; |
|
33 |
uint mbminlen; |
|
34 |
uint mbmaxlen; |
|
35 |
uint16 max_sort_char; /* For LIKE optimization */ |
|
36 |
||
37 |
MY_CHARSET_HANDLER *cset; |
|
38 |
MY_COLLATION_HANDLER *coll; |
|
39 |
||
40 |
} CHARSET_INFO; |
|
41 |
||
42 |
||
43 |
CHARSET_INFO fields description: |
|
44 |
=============================== |
|
45 |
||
46 |
||
47 |
Numbers (identifiers) |
|
48 |
--------------------- |
|
49 |
||
50 |
number - an ID uniquely identifying this charset+collation pair. |
|
51 |
||
52 |
primary_number - ID of a charset+collation pair, which consists |
|
53 |
of the same character set and the default collation of this |
|
54 |
character set. Not really used now. Intended to optimize some |
|
55 |
parts of the code where we need to find the default collation |
|
56 |
using its non-default counterpart for the given character set. |
|
57 |
||
58 |
binary_number - ID of a charset+collation pair, which consists |
|
59 |
of the same character set and the binary collation of this |
|
60 |
character set. Not really used now. |
|
61 |
||
62 |
Names |
|
63 |
----- |
|
64 |
||
65 |
csname - name of the character set for this charset+collation pair. |
|
66 |
name - name of the collation for this charset+collation pair. |
|
67 |
comment - a text comment, displayed in "Description" column of |
|
68 |
SHOW CHARACTER SET output. |
|
69 |
||
70 |
Conversion tables |
|
71 |
----------------- |
|
72 |
||
73 |
ctype - pointer to array[257] of "type of characters" |
|
74 |
bit mask for each character, e.g. if a |
|
75 |
character is a digit or a letter or a separator, etc. |
|
76 |
||
77 |
Monty 2004-10-21: |
|
78 |
If you look at the macros, we use ctype[(char)+1]. |
|
79 |
ctype[0] is traditionally in most ctype libraries |
|
80 |
reserved for EOF (-1). The idea is that you can use |
|
81 |
the result from fgetc() directly with ctype[]. As |
|
82 |
we have to be compatible with external ctype[] versions, |
|
83 |
it's better to do it the same way as they do... |
|
84 |
||
85 |
to_lower - pointer to array[256] used in LCASE() |
|
86 |
to_upper - pointer to array[256] used in UCASE() |
|
87 |
sort_order - pointer to array[256] used for strings comparison |
|
88 |
||
89 |
||
90 |
||
91 |
Unicode conversion data |
|
92 |
----------------------- |
|
93 |
For 8bit character sets: |
|
94 |
||
95 |
tab_to_uni : array[256] of charset->Unicode translation |
|
96 |
tab_from_uni: a structure for Unicode->charset translation |
|
97 |
||
98 |
Non-8 bit charsets have their own structures per charset |
|
99 |
hidden in corresponding ctype-xxx.c file and don't use |
|
100 |
tab_to_uni and tab_from_uni tables. |
|
101 |
||
102 |
||
103 |
Parser maps |
|
104 |
----------- |
|
105 |
state_map[] |
|
106 |
ident_map[] |
|
107 |
||
108 |
These maps are to quickly identify if a character is |
|
109 |
an identifier part, a digit, a special character, |
|
110 |
or a part of other SQL language lexical item. |
|
111 |
||
112 |
Probably can be combined with ctype array in the future. |
|
113 |
But for some reasons these two arrays are used in the parser, |
|
114 |
while a separate ctype[] array is used in the other part of the |
|
115 |
code, like fulltext, etc. |
|
116 |
||
117 |
||
118 |
Misc fields |
|
119 |
----------- |
|
120 |
||
121 |
strxfrm_multiply - how many times a sort key (i.e. a string |
|
122 |
which can be passed into memcmp() for comparison) |
|
123 |
can be longer than the original string. |
|
124 |
Usually it is 1. For some complex |
|
125 |
collations it can be bigger. For example |
|
126 |
in latin1_german2_ci, a sort key is up to |
|
127 |
twice as long as the original string. |
|
128 |
e.g. Letter 'A' with two dots above is |
|
129 |
substituted with 'AE'. |
|
130 |
mbminlen - minimum multibyte sequence length. |
|
131 |
Now always 1 except ucs2. For ucs2 |
|
132 |
it is 2. |
|
133 |
mbmaxlen - maximum multibyte sequence length. |
|
134 |
1 for 8bit charsets. Can be also 2 or 3. |
|
135 |
||
136 |
max_sort_char - for LIKE range |
|
137 |
in case of 8bit character sets - native code |
|
138 |
of maximum character (max_str pad byte); |
|
139 |
in case of UTF8 and UCS2 - Unicode code of the maximum |
|
140 |
possible character (usually U+FFFF). This code is |
|
141 |
converted to multibyte representation (usually 0xEFBFBF) |
|
142 |
and then used as a pad sequence for max_str. |
|
143 |
in case of other multibyte character sets - |
|
144 |
max_str pad byte (usually 0xFF). |
|
145 |
||
146 |
MY_CHARSET_HANDLER |
|
147 |
================== |
|
148 |
||
149 |
MY_CHARSET_HANDLER is a collection of character-set |
|
150 |
related routines. Defined in m_ctype.h. It has the |
|
151 |
following set of functions: |
|
152 |
||
153 |
Multibyte routines |
|
154 |
------------------ |
|
155 |
ismbchar() - detects if the given string is a multibyte sequence |
|
156 |
mbcharlen() - returns length of multibyte sequence starting with |
|
157 |
the given character |
|
158 |
numchars() - returns number of characters in the given string, e.g. |
|
159 |
in SQL function CHAR_LENGTH(). |
|
160 |
charpos() - calculates the offset of the given position in the string. |
|
161 |
Used in SQL functions LEFT(), RIGHT(), SUBSTRING(), |
|
162 |
INSERT() |
|
163 |
||
164 |
well_formed_length() |
|
165 |
- finds the length of correctly formed multibyte beginning. |
|
166 |
Used in INSERTs to cut a beginning of the given string |
|
167 |
which is |
|
168 |
a) "well formed" according to the given character set. |
|
169 |
b) can fit into the given data type |
|
170 |
Terminates the string in the good position, taking into account |
|
171 |
multibyte character boundaries. |
|
172 |
||
173 |
lengthsp() - returns the length of the given string without trailing spaces. |
|
174 |
||
175 |
||
176 |
Unicode conversion routines |
|
177 |
--------------------------- |
|
178 |
mb_wc - converts the left multibyte sequence into its Unicode code. |
|
179 |
wc_mb - converts the given Unicode code into multibyte sequence. |
|
180 |
||
181 |
||
182 |
Case and sort conversion |
|
183 |
------------------------ |
|
184 |
caseup_str - converts the given 0-terminated string into the upper case |
|
185 |
casedn_str - converts the given 0-terminated string into the lower case |
|
186 |
caseup - converts the given string into the upper case using length |
|
187 |
casedn - converts the given string into the lower case using length |
|
188 |
||
189 |
Number-to-string conversion routines |
|
190 |
------------------------------------ |
|
191 |
snprintf() |
|
192 |
long10_to_str() |
|
193 |
longlong10_to_str() |
|
194 |
||
195 |
The names are pretty self-descriptive. |
|
196 |
||
197 |
String padding routines |
|
198 |
----------------------- |
|
199 |
fill() - writes the given Unicode value into the given string |
|
200 |
with the given length. Used to pad the string, usually |
|
201 |
with space character, according to the given charset. |
|
202 |
||
203 |
String-to-number conversion routines |
|
204 |
------------------------------------ |
|
205 |
strntol() |
|
206 |
strntoul() |
|
207 |
strntoll() |
|
208 |
strntoull() |
|
209 |
strntod() |
|
210 |
||
211 |
These functions do almost the same thing as their |
|
212 |
STDLIB counterparts, but also: |
|
213 |
- accept length instead of 0-terminator |
|
214 |
- and are character set dependent |
|
215 |
||
216 |
Simple scanner routines |
|
217 |
----------------------- |
|
218 |
scan() - to skip leading spaces in the given string. |
|
219 |
Used when a string value is inserted into a numeric field. |
|
220 |
||
221 |
||
222 |
||
223 |
MY_COLLATION_HANDLER |
|
224 |
==================== |
|
225 |
strnncoll() - compares two strings according to the given collation |
|
226 |
strnncollsp() - like the above but ignores trailing spaces |
|
227 |
strnxfrm() - makes a sort key suitable for memcmp() corresponding |
|
228 |
to the given string |
|
229 |
like_range() - creates a LIKE range, for optimizer |
|
230 |
wildcmp() - wildcard comparison, for LIKE |
|
231 |
strcasecmp() - 0-terminated string comparison |
|
232 |
instr() - finds the first substring occurrence in the string |
|
233 |
hash_sort() - calculates hash value taking into account |
|
234 |
the collation rules, e.g. case-insensitivity, |
|
235 |
accent sensitivity, etc. |
|
236 |
||
237 |
||
238 |