~drizzle-trunk/drizzle/development : contents of strings/ctype.c at revision 33

~drizzle-trunk/drizzle/development : (revision 33)

/* Copyright (C) 2000 MySQL AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

#include <my_global.h>
#include <m_ctype.h>
#include <my_xml.h>
#ifndef SCO
#include <m_string.h>
#endif


/*

  This file implements routines which parse XML-based
  character set and collation description files.
  
  Unicode collations are encoded according to
  
    Unicode Technical Standard #35
    Locale Data Markup Language (LDML)
    http://www.unicode.org/reports/tr35/
  
  and converted into ICU string according to
  
    Collation Customization
    http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
  
*/

/*
  Bounded string copy.

  Copies min(l1, l2) bytes from src to str and stores a '\0' right
  after them, so str must provide room for min(l1, l2) + 1 bytes.
  src does not have to be NUL-terminated.

  Returns str.
*/
static char *mstr(char *str,const char *src,uint l1,uint l2)
{
  uint ncopy= l2;

  if (l1 < l2)
    ncopy= l1;
  memcpy(str, src, ncopy);
  *(str + ncopy)= '\0';
  return str;
}

/*
  One row of the table that maps an XML element/attribute path
  to a parser state code.
*/
struct my_cs_file_section_st
{
  int        state;  /* one of the _CS_* codes; 0 in the terminating row */
  const char *str;   /* '/'-separated path;  NULL in the terminating row */
};

/*
  State codes stored in sec[].  The handlers in this file switch on
  them; codes that no handler tests are simply recognised and skipped.
*/
#define _CS_MISC        1   /* known path, nothing to store             */
#define _CS_ID          2   /* charsets/charset/collation/id            */
#define _CS_CSNAME      3   /* charsets/charset/name                    */
#define _CS_FAMILY      4   /* charsets/charset/family                  */
#define _CS_ORDER       5   /* charsets/charset/collation/order         */
#define _CS_COLNAME     6   /* charsets/charset/collation/name          */
#define _CS_FLAG        7   /* charsets/charset/collation/flag          */
#define _CS_CHARSET     8   /* charsets/charset                         */
#define _CS_COLLATION   9   /* charsets/charset/collation               */
#define _CS_UPPERMAP    10  /* charsets/charset/upper/map               */
#define _CS_LOWERMAP    11  /* charsets/charset/lower/map               */
#define _CS_UNIMAP      12  /* charsets/charset/unicode/map             */
#define _CS_COLLMAP     13  /* charsets/charset/collation/map           */
#define _CS_CTYPEMAP    14  /* charsets/charset/ctype/map               */
#define _CS_PRIMARY_ID  15  /* charsets/charset/primary-id              */
#define _CS_BINARY_ID   16  /* charsets/charset/binary-id               */
#define _CS_CSDESCRIPT  17  /* charsets/charset/description             */
/* The next four must stay consecutive: cs_value() uses
   (state - _CS_RESET) as an index into its operator table. */
#define _CS_RESET       18  /* charsets/charset/collation/rules/reset   */
#define _CS_DIFF1       19  /* charsets/charset/collation/rules/p       */
#define _CS_DIFF2       20  /* charsets/charset/collation/rules/s       */
#define _CS_DIFF3       21  /* charsets/charset/collation/rules/t       */


/* Path -> state table, terminated by a row with str == NULL. */
static struct my_cs_file_section_st sec[] =
{
  {_CS_MISC,        "xml"},
  {_CS_MISC,        "xml/version"},
  {_CS_MISC,        "xml/encoding"},
  {_CS_MISC,        "charsets"},
  {_CS_MISC,        "charsets/max-id"},
  {_CS_CHARSET,     "charsets/charset"},
  {_CS_PRIMARY_ID,  "charsets/charset/primary-id"},
  {_CS_BINARY_ID,   "charsets/charset/binary-id"},
  {_CS_CSNAME,      "charsets/charset/name"},
  {_CS_FAMILY,      "charsets/charset/family"},
  {_CS_CSDESCRIPT,  "charsets/charset/description"},
  {_CS_MISC,        "charsets/charset/alias"},
  {_CS_MISC,        "charsets/charset/ctype"},
  {_CS_CTYPEMAP,    "charsets/charset/ctype/map"},
  {_CS_MISC,        "charsets/charset/upper"},
  {_CS_UPPERMAP,    "charsets/charset/upper/map"},
  {_CS_MISC,        "charsets/charset/lower"},
  {_CS_LOWERMAP,    "charsets/charset/lower/map"},
  {_CS_MISC,        "charsets/charset/unicode"},
  {_CS_UNIMAP,      "charsets/charset/unicode/map"},
  {_CS_COLLATION,   "charsets/charset/collation"},
  {_CS_COLNAME,     "charsets/charset/collation/name"},
  {_CS_ID,          "charsets/charset/collation/id"},
  {_CS_ORDER,       "charsets/charset/collation/order"},
  {_CS_FLAG,        "charsets/charset/collation/flag"},
  {_CS_COLLMAP,     "charsets/charset/collation/map"},
  {_CS_RESET,       "charsets/charset/collation/rules/reset"},
  {_CS_DIFF1,       "charsets/charset/collation/rules/p"},
  {_CS_DIFF2,       "charsets/charset/collation/rules/s"},
  {_CS_DIFF3,       "charsets/charset/collation/rules/t"},
  {0, NULL}
};

/*
  Find the table row for an XML path.

  attr  path text; does not have to be NUL-terminated
  len   number of bytes of attr that form the path

  Returns the row of sec[] whose path is exactly the len bytes at attr,
  or NULL if there is no such row.

  The match must be exact.  Comparing only the first len bytes would
  let any path that is a proper prefix of a table entry resolve to that
  entry (e.g. "charsets/charset/co" to "charsets/charset/collation"),
  and would let an empty path resolve to the first row.
*/
static struct my_cs_file_section_st * cs_file_sec(const char *attr, size_t len)
{
  struct my_cs_file_section_st *s;
  for (s=sec; s->str; s++)
  {
    /* Length check first: it guarantees strncmp() compares whole paths. */
    if (strlen(s->str) == len && !strncmp(attr, s->str, len))
      return s;
  }
  return NULL;
}

/* Size of the character set description buffer, terminating NUL included. */
#define MY_CS_CSDESCR_SIZE	64
/* Size of the collation tailoring buffer, terminating NUL included. */
#define MY_CS_TAILORING_SIZE	1024

/*
  Working state of one my_parse_charset_xml() run.  A pointer to it is
  installed as the XML parser's user data and read back by the
  cs_enter/cs_value/cs_leave callbacks.

  The arrays are backing storage: cs_value() fills them from the XML
  values and then points the corresponding members of cs at them.
*/
typedef struct my_cs_file_info
{
  char   csname[MY_CS_NAME_SIZE];                 /* storage for cs.csname        */
  char   name[MY_CS_NAME_SIZE];                   /* storage for cs.name          */
  uchar  ctype[MY_CS_CTYPE_TABLE_SIZE];           /* storage for cs.ctype         */
  uchar  to_lower[MY_CS_TO_LOWER_TABLE_SIZE];     /* storage for cs.to_lower      */
  uchar  to_upper[MY_CS_TO_UPPER_TABLE_SIZE];     /* storage for cs.to_upper      */
  uchar  sort_order[MY_CS_SORT_ORDER_TABLE_SIZE]; /* storage for cs.sort_order    */
  uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];     /* storage for cs.tab_to_uni    */
  char   comment[MY_CS_CSDESCR_SIZE];             /* storage for cs.comment       */
  char   tailoring[MY_CS_TAILORING_SIZE];         /* storage for cs.tailoring     */
  size_t tailoring_length;  /* bytes of tailoring[] in use; reset by cs_enter() */
  CHARSET_INFO cs;          /* description being built; zeroed by cs_enter()    */
  /* Called by cs_leave() with &cs when a collation element is closed;
     may be NULL, in which case nothing is called. */
  int (*add_collation)(CHARSET_INFO *cs);
} MY_CHARSET_LOADER;



/*
  Parse whitespace-separated hexadecimal numbers into a byte table.

  a     destination table
  size  number of elements available in a
  str   text to parse
  len   number of bytes of str to look at

  Tokens are separated by blanks, tabs, CR or LF.  Parsing stops at the
  end of the text or once size elements have been stored, whichever
  comes first; extra tokens are ignored.  Each token is converted with
  strtoul(.., 16) and truncated to uchar.

  Always returns 0.
*/
static int fill_uchar(uchar *a,uint size,const char *str, uint len)
{
  uint i= 0;
  const char *s, *b, *e=str+len;

  for (s=str ; s < e ; i++)
  {
    /*
      Skip separators.  strchr() also matches the terminating NUL of
      its first argument, so a '\0' byte in the input is a separator too.
    */
    for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
    b=s;
    /* Advance to the end of the token. */
    for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;
    /*
      Stop when no token is left or the table is full.  The bound must
      be ">=": callers pass the exact element count of a[], so index
      "size" is already one past the end.
    */
    if (s == b || i >= size)
      break;
    a[i]= (uchar) strtoul(b,NULL,16);
  }
  return 0;
}

/*
  Parse whitespace-separated hexadecimal numbers into a uint16 table.

  a     destination table
  size  number of elements available in a
  str   text to parse
  len   number of bytes of str to look at

  Tokens are separated by blanks, tabs, CR or LF.  Parsing stops at the
  end of the text or once size elements have been stored, whichever
  comes first; extra tokens are ignored.  Each token is converted with
  strtol(.., 16) and truncated to uint16.

  Always returns 0.
*/
static int fill_uint16(uint16 *a,uint size,const char *str, size_t len)
{
  uint i= 0;

  const char *s, *b, *e=str+len;
  for (s=str ; s < e ; i++)
  {
    /*
      Skip separators.  strchr() also matches the terminating NUL of
      its first argument, so a '\0' byte in the input is a separator too.
    */
    for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
    b=s;
    /* Advance to the end of the token. */
    for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;
    /*
      Stop when no token is left or the table is full.  The bound must
      be ">=": callers pass the exact element count of a[], so index
      "size" is already one past the end.
    */
    if (s == b || i >= size)
      break;
    a[i]= (uint16) strtol(b,NULL,16);
  }
  return 0;
}


/*
  "Enter" callback of the XML parser.

  st    parser; st->user_data points to the struct my_cs_file_info
  attr  path of the item being entered
  len   length of attr

  Entering the charset section clears the CHARSET_INFO being built;
  entering a collation section empties the accumulated tailoring.
  Any other path is ignored.

  Always returns MY_XML_OK.
*/
static int cs_enter(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *info= (struct my_cs_file_info *) st->user_data;
  struct my_cs_file_section_st *section= cs_file_sec(attr, len);

  if (section)
  {
    switch (section->state) {
    case _CS_CHARSET:
      bzero(&info->cs, sizeof(info->cs));
      break;
    case _CS_COLLATION:
      info->tailoring_length= 0;
      break;
    default:
      break;
    }
  }
  return MY_XML_OK;
}


/*
  "Leave" callback of the XML parser.

  st    parser; st->user_data points to the struct my_cs_file_info
  attr  path of the item being left
  len   length of attr

  When a collation section is closed and an add_collation callback was
  supplied, the callback is invoked with the CHARSET_INFO built so far
  and its result is returned.  In every other case MY_XML_OK is
  returned.
*/
static int cs_leave(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *info= (struct my_cs_file_info *) st->user_data;
  struct my_cs_file_section_st *section= cs_file_sec(attr, len);

  if (section && section->state == _CS_COLLATION && info->add_collation)
    return info->add_collation(&info->cs);

  return MY_XML_OK;
}


/*
  "Value" callback of the XML parser.

  st    parser; st->user_data points to the struct my_cs_file_info and
        st->attr is the NUL-terminated path the value belongs to
  attr  the value text (despite the name)
  len   length of the value text

  Stores the value into the CHARSET_INFO being built, according to the
  state that st->attr maps to.  Values of paths with no case below are
  ignored.

  Always returns MY_XML_OK.
*/
static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
{
  struct my_cs_file_info *info= (struct my_cs_file_info *) st->user_data;
  struct my_cs_file_section_st *section= cs_file_sec(st->attr,
                                                     strlen(st->attr));
  int state= 0;

  if (section)
    state= section->state;

  switch (state) {

  /* Decimal identifiers. */
  case _CS_ID:
    info->cs.number= strtol(attr, (char**) NULL, 10);
    break;
  case _CS_BINARY_ID:
    info->cs.binary_number= strtol(attr, (char**) NULL, 10);
    break;
  case _CS_PRIMARY_ID:
    info->cs.primary_number= strtol(attr, (char**) NULL, 10);
    break;

  /* Names and description: copied (truncated if needed) into our buffers. */
  case _CS_COLNAME:
    info->cs.name= mstr(info->name, attr, len, MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSNAME:
    info->cs.csname= mstr(info->csname, attr, len, MY_CS_NAME_SIZE-1);
    break;
  case _CS_CSDESCRIPT:
    info->cs.comment= mstr(info->comment, attr, len, MY_CS_CSDESCR_SIZE-1);
    break;

  /* Collation flags: each recognised word sets one state bit. */
  case _CS_FLAG:
    if (strncmp("primary", attr, len) == 0)
      info->cs.state|= MY_CS_PRIMARY;
    else if (strncmp("binary", attr, len) == 0)
      info->cs.state|= MY_CS_BINSORT;
    else if (strncmp("compiled", attr, len) == 0)
      info->cs.state|= MY_CS_COMPILED;
    break;

  /* Conversion tables: parse the hex list, then publish the table. */
  case _CS_UPPERMAP:
    fill_uchar(info->to_upper, MY_CS_TO_UPPER_TABLE_SIZE, attr, len);
    info->cs.to_upper= info->to_upper;
    break;
  case _CS_LOWERMAP:
    fill_uchar(info->to_lower, MY_CS_TO_LOWER_TABLE_SIZE, attr, len);
    info->cs.to_lower= info->to_lower;
    break;
  case _CS_UNIMAP:
    fill_uint16(info->tab_to_uni, MY_CS_TO_UNI_TABLE_SIZE, attr, len);
    info->cs.tab_to_uni= info->tab_to_uni;
    break;
  case _CS_COLLMAP:
    fill_uchar(info->sort_order, MY_CS_SORT_ORDER_TABLE_SIZE, attr, len);
    info->cs.sort_order= info->sort_order;
    break;
  case _CS_CTYPEMAP:
    fill_uchar(info->ctype, MY_CS_CTYPE_TABLE_SIZE, attr, len);
    info->cs.ctype= info->ctype;
    break;

  /*
    Collation rules: convert the Locale Data Markup Language (LDML)
    element into an ICU Collation Customization expression and append
    it to the tailoring string.
  */
  case _CS_RESET:
  case _CS_DIFF1:
  case _CS_DIFF2:
  case _CS_DIFF3:
    {
      /* Indexed by (state - _CS_RESET): reset, p, s, t. */
      const char *op[]= {"&","<","<<","<<<"};
      char operand[16];

      info->cs.tailoring= info->tailoring;
      mstr(operand, attr, len, sizeof(operand)-1);
      /*
        One appended rule is at most 20 characters: a blank, an operator
        of up to 3, a blank and an operand of up to 15.  A rule that
        would not fit is dropped.
      */
      if (info->tailoring_length + 20 < sizeof(info->tailoring))
      {
        char *dst= info->tailoring + info->tailoring_length;
        info->tailoring_length+= sprintf(dst, " %s %s",
                                         op[state - _CS_RESET], operand);
      }
    }
    break;
  }
  return MY_XML_OK;
}


/*
  Parse an XML character set description.

  buf            XML text
  len            length of buf
  add_collation  callback invoked (through cs_leave) each time a
                 collation section is closed; may be NULL

  Returns FALSE if my_xml_parse() reports MY_XML_OK, TRUE otherwise.
*/
my_bool my_parse_charset_xml(const char *buf, size_t len,
                             int (*add_collation)(CHARSET_INFO *cs))
{
  struct my_cs_file_info loader;
  MY_XML_PARSER parser;
  my_bool failed;

  loader.add_collation= add_collation;

  my_xml_parser_create(&parser);
  my_xml_set_enter_handler(&parser, cs_enter);
  my_xml_set_value_handler(&parser, cs_value);
  my_xml_set_leave_handler(&parser, cs_leave);
  my_xml_set_user_data(&parser, (void*) &loader);

  if (my_xml_parse(&parser, buf, len) == MY_XML_OK)
    failed= FALSE;
  else
    failed= TRUE;

  my_xml_parser_free(&parser);
  return failed;
}


/*
  Check repertoire: detect pure ASCII strings.

  cs      character set of the string
  str     string to examine
  length  length of str in bytes

  If cs->mbminlen is 1 the bytes are examined directly; otherwise the
  string is decoded character by character with cs->cset->mb_wc(), and
  examination ends at the first position where mb_wc() returns a value
  that is not positive.

  Returns MY_REPERTOIRE_UNICODE30 as soon as a byte or character above
  0x7F is found, MY_REPERTOIRE_ASCII otherwise.
*/
uint
my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong length)
{
  const char *end= str + length;

  if (cs->mbminlen != 1)
  {
    my_wc_t wc;
    int nbytes;

    while ((nbytes= cs->cset->mb_wc(cs, &wc, str, end)) > 0)
    {
      if (wc > 0x7F)
        return MY_REPERTOIRE_UNICODE30;
      str+= nbytes;
    }
    return MY_REPERTOIRE_ASCII;
  }

  while (str < end)
  {
    if (((uchar) *str) > 0x7F)
      return MY_REPERTOIRE_UNICODE30;
    str++;
  }
  return MY_REPERTOIRE_ASCII;
}


/*
  Detect whether a character set is ASCII-based.

  Returns TRUE for:

  - all 8-bit character sets (mbmaxlen == 1) that have a Unicode
    mapping table in which 0x7B maps to '{'
    (this excludes swe7, which maps 0x7B to
    "LATIN LETTER A WITH DIAERESIS")

  - all multi-byte character sets having mbminlen == 1
    (this excludes ucs2, whose mbminlen is 2)

  TODO:

  When merging to 5.2, this function should be changed
  to check a new flag MY_CS_NONASCII:

     return (cs->flag & MY_CS_NONASCII) ? 0 : 1;

  This flag was previously added into 5.2 under terms
  of WL#3759 "Optimize identifier conversion in client-server protocol"
  especially to mark character sets not compatible with ASCII.

  We won't backport this flag to 5.0 or 5.1.
  This function is OK for 5.0 and 5.1, because we're not going
  to introduce new tricky character sets between 5.0 and 5.2.
*/
my_bool
my_charset_is_ascii_based(CHARSET_INFO *cs)
{
  /* 8-bit character set: decided by the mapping of '{'. */
  if (cs->mbmaxlen == 1)
    return cs->tab_to_uni && cs->tab_to_uni['{'] == '{';

  /* Otherwise: multi-byte with single-byte minimum length. */
  return cs->mbminlen == 1 && cs->mbmaxlen > 1;
}


/*
  Detect if a character set is 8-bit and pure ASCII, i.e. none of
  its 256 codes maps to a character outside U+0000..U+007F.

  A character set without a tab_to_uni table is reported as
  not pure ASCII.

  This function is shared between "conf_to_src"
  and the dynamic charsets loader in "mysqld".

  Returns 1 if pure ASCII, 0 otherwise.
*/
my_bool
my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs)
{
  size_t code= 0;

  if (cs->tab_to_uni == NULL)
    return 0;

  while (code < 256)
  {
    if (cs->tab_to_uni[code] > 0x7F)
      return 0;
    code++;
  }
  return 1;
}


/*
  Shared function between conf_to_src and mysys.

  Check if an 8-bit character set is compatible with ASCII on the
  range 0x00..0x7F, i.e. every code in that range maps to itself
  in tab_to_uni.

  A character set without a tab_to_uni table is reported as
  compatible.

  Returns 1 if compatible, 0 otherwise.
*/
my_bool
my_charset_is_ascii_compatible(CHARSET_INFO *cs)
{
  uint i= 0;

  if (cs->tab_to_uni == NULL)
    return 1;

  while (i < 128)
  {
    if (cs->tab_to_uni[i] != i)
      return 0;
    i++;
  }
  return 1;
}

1 by brian clean slate	1	/* Copyright (C) 2000 MySQL AB
	2
	3	This program is free software; you can redistribute it and/or modify
	4	it under the terms of the GNU General Public License as published by
	5	the Free Software Foundation; version 2 of the License.
	6
	7	This program is distributed in the hope that it will be useful,
	8	but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	GNU General Public License for more details.
	11
	12	You should have received a copy of the GNU General Public License
	13	along with this program; if not, write to the Free Software
	14	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
	15
	16	#include <my_global.h>
	17	#include <m_ctype.h>
	18	#include <my_xml.h>
	19	#ifndef SCO
	20	#include <m_string.h>
	21	#endif
	22
	23
	24	/*
	25
	26	This files implements routines which parse XML based
	27	character set and collation description files.
	28
	29	Unicode collations are encoded according to
	30
	31	Unicode Technical Standard #35
	32	Locale Data Markup Language (LDML)
	33	http://www.unicode.org/reports/tr35/
	34
	35	and converted into ICU string according to
	36
	37	Collation Customization
	38	http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
	39
	40	*/
	41
	42	static char mstr(char str,const char *src,uint l1,uint l2)
	43	{
	44	l1= l1<l2 ? l1 : l2;
	45	memcpy(str,src,l1);
	46	str[l1]='\0';
	47	return str;
	48	}
	49
	50	struct my_cs_file_section_st
	51	{
	52	int state;
	53	const char *str;
	54	};
	55
	56	#define _CS_MISC 1
	57	#define _CS_ID 2
	58	#define _CS_CSNAME 3
	59	#define _CS_FAMILY 4
	60	#define _CS_ORDER 5
	61	#define _CS_COLNAME 6
	62	#define _CS_FLAG 7
	63	#define _CS_CHARSET 8
	64	#define _CS_COLLATION 9
65	#define _CS_UPPERMAP 10
66	#define _CS_LOWERMAP 11
67	#define _CS_UNIMAP 12
68	#define _CS_COLLMAP 13
69	#define _CS_CTYPEMAP 14
70	#define _CS_PRIMARY_ID 15
71	#define _CS_BINARY_ID 16
72	#define _CS_CSDESCRIPT 17
73	#define _CS_RESET 18
74	#define _CS_DIFF1 19
75	#define _CS_DIFF2 20
76	#define _CS_DIFF3 21
77
78
79	static struct my_cs_file_section_st sec[] =
80	{
81	{_CS_MISC, "xml"},
82	{_CS_MISC, "xml/version"},
83	{_CS_MISC, "xml/encoding"},
84	{_CS_MISC, "charsets"},
85	{_CS_MISC, "charsets/max-id"},
86	{_CS_CHARSET, "charsets/charset"},
87	{_CS_PRIMARY_ID, "charsets/charset/primary-id"},
88	{_CS_BINARY_ID, "charsets/charset/binary-id"},
89	{_CS_CSNAME, "charsets/charset/name"},
90	{_CS_FAMILY, "charsets/charset/family"},
91	{_CS_CSDESCRIPT, "charsets/charset/description"},
92	{_CS_MISC, "charsets/charset/alias"},
93	{_CS_MISC, "charsets/charset/ctype"},
94	{_CS_CTYPEMAP, "charsets/charset/ctype/map"},
95	{_CS_MISC, "charsets/charset/upper"},
96	{_CS_UPPERMAP, "charsets/charset/upper/map"},
97	{_CS_MISC, "charsets/charset/lower"},
98	{_CS_LOWERMAP, "charsets/charset/lower/map"},
99	{_CS_MISC, "charsets/charset/unicode"},
100	{_CS_UNIMAP, "charsets/charset/unicode/map"},
101	{_CS_COLLATION, "charsets/charset/collation"},
102	{_CS_COLNAME, "charsets/charset/collation/name"},
103	{_CS_ID, "charsets/charset/collation/id"},
104	{_CS_ORDER, "charsets/charset/collation/order"},
105	{_CS_FLAG, "charsets/charset/collation/flag"},
106	{_CS_COLLMAP, "charsets/charset/collation/map"},
107	{_CS_RESET, "charsets/charset/collation/rules/reset"},
108	{_CS_DIFF1, "charsets/charset/collation/rules/p"},
109	{_CS_DIFF2, "charsets/charset/collation/rules/s"},
110	{_CS_DIFF3, "charsets/charset/collation/rules/t"},
111	{0, NULL}
112	};
113
114	static struct my_cs_file_section_st * cs_file_sec(const char *attr, size_t len)
115	{
116	struct my_cs_file_section_st *s;
117	for (s=sec; s->str; s++)
118	{
119	if (!strncmp(attr,s->str,len))
120	return s;
121	}
122	return NULL;
123	}
124
125	#define MY_CS_CSDESCR_SIZE 64
126	#define MY_CS_TAILORING_SIZE 1024
127
128	typedef struct my_cs_file_info
129	{
130	char csname[MY_CS_NAME_SIZE];
131	char name[MY_CS_NAME_SIZE];
132	uchar ctype[MY_CS_CTYPE_TABLE_SIZE];
133	uchar to_lower[MY_CS_TO_LOWER_TABLE_SIZE];
134	uchar to_upper[MY_CS_TO_UPPER_TABLE_SIZE];
135	uchar sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
136	uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
137	char comment[MY_CS_CSDESCR_SIZE];
138	char tailoring[MY_CS_TAILORING_SIZE];
139	size_t tailoring_length;
140	CHARSET_INFO cs;
141	int (add_collation)(CHARSET_INFO cs);
142	} MY_CHARSET_LOADER;
143
144
145
146	static int fill_uchar(uchar a,uint size,const char str, uint len)
147	{
148	uint i= 0;
149	const char s, b, *e=str+len;
150
151	for (s=str ; s < e ; i++)
152	{
153	for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
154	b=s;
155	for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;
156	if (s == b \|\| i > size)
157	break;
158	a[i]= (uchar) strtoul(b,NULL,16);
159	}
160	return 0;
161	}
162
163	static int fill_uint16(uint16 a,uint size,const char str, size_t len)
164	{
165	uint i= 0;
166
167	const char s, b, *e=str+len;
168	for (s=str ; s < e ; i++)
169	{
170	for ( ; (s < e) && strchr(" \t\r\n",s[0]); s++) ;
171	b=s;
172	for ( ; (s < e) && !strchr(" \t\r\n",s[0]); s++) ;
173	if (s == b \|\| i > size)
174	break;
175	a[i]= (uint16) strtol(b,NULL,16);
176	}
177	return 0;
178	}
179
180
181	static int cs_enter(MY_XML_PARSER st,const char attr, size_t len)
182	{
183	struct my_cs_file_info i= (struct my_cs_file_info )st->user_data;
184	struct my_cs_file_section_st *s= cs_file_sec(attr,len);
185
186	if ( s && (s->state == _CS_CHARSET))
187	bzero(&i->cs,sizeof(i->cs));
188
189	if (s && (s->state == _CS_COLLATION))
190	i->tailoring_length= 0;
191
192	return MY_XML_OK;
193	}
194
195
196	static int cs_leave(MY_XML_PARSER st,const char attr, size_t len)
197	{
198	struct my_cs_file_info i= (struct my_cs_file_info )st->user_data;
199	struct my_cs_file_section_st *s= cs_file_sec(attr,len);
200	int state= s ? s->state : 0;
201	int rc;
202
203	switch(state){
204	case _CS_COLLATION:
205	rc= i->add_collation ? i->add_collation(&i->cs) : MY_XML_OK;
206	break;
207	default:
208	rc=MY_XML_OK;
209	}
210	return rc;
211	}
212
213
214	static int cs_value(MY_XML_PARSER st,const char attr, size_t len)
215	{
216	struct my_cs_file_info i= (struct my_cs_file_info )st->user_data;
217	struct my_cs_file_section_st *s;
218	int state= (int)((s=cs_file_sec(st->attr, strlen(st->attr))) ? s->state :
219	0);
220
221	switch (state) {
222	case _CS_ID:
223	i->cs.number= strtol(attr,(char**)NULL,10);
224	break;
225	case _CS_BINARY_ID:
226	i->cs.binary_number= strtol(attr,(char**)NULL,10);
227	break;
228	case _CS_PRIMARY_ID:
229	i->cs.primary_number= strtol(attr,(char**)NULL,10);
230	break;
231	case _CS_COLNAME:
232	i->cs.name=mstr(i->name,attr,len,MY_CS_NAME_SIZE-1);
233	break;
234	case _CS_CSNAME:
235	i->cs.csname=mstr(i->csname,attr,len,MY_CS_NAME_SIZE-1);
236	break;
237	case _CS_CSDESCRIPT:
238	i->cs.comment=mstr(i->comment,attr,len,MY_CS_CSDESCR_SIZE-1);
239	break;
240	case _CS_FLAG:
241	if (!strncmp("primary",attr,len))
242	i->cs.state\|= MY_CS_PRIMARY;
243	else if (!strncmp("binary",attr,len))
244	i->cs.state\|= MY_CS_BINSORT;
245	else if (!strncmp("compiled",attr,len))
246	i->cs.state\|= MY_CS_COMPILED;
247	break;
248	case _CS_UPPERMAP:
249	fill_uchar(i->to_upper,MY_CS_TO_UPPER_TABLE_SIZE,attr,len);
250	i->cs.to_upper=i->to_upper;
251	break;
252	case _CS_LOWERMAP:
253	fill_uchar(i->to_lower,MY_CS_TO_LOWER_TABLE_SIZE,attr,len);
254	i->cs.to_lower=i->to_lower;
255	break;
256	case _CS_UNIMAP:
257	fill_uint16(i->tab_to_uni,MY_CS_TO_UNI_TABLE_SIZE,attr,len);
258	i->cs.tab_to_uni=i->tab_to_uni;
259	break;
260	case _CS_COLLMAP:
261	fill_uchar(i->sort_order,MY_CS_SORT_ORDER_TABLE_SIZE,attr,len);
262	i->cs.sort_order=i->sort_order;
263	break;
264	case _CS_CTYPEMAP:
265	fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len);
266	i->cs.ctype=i->ctype;
267	break;
268	case _CS_RESET:
269	case _CS_DIFF1:
270	case _CS_DIFF2:
271	case _CS_DIFF3:
272	{
273	/*
274	Convert collation description from
275	Locale Data Markup Language (LDML)
276	into ICU Collation Customization expression.
277	*/
278	char arg[16];
279	const char *cmd[]= {"&","<","<<","<<<"};
280	i->cs.tailoring= i->tailoring;
281	mstr(arg,attr,len,sizeof(arg)-1);
282	if (i->tailoring_length + 20 < sizeof(i->tailoring))
283	{
284	char *dst= i->tailoring_length + i->tailoring;
285	i->tailoring_length+= sprintf(dst," %s %s",cmd[state-_CS_RESET],arg);
286	}
287	}
288	}
289	return MY_XML_OK;
290	}
291
292
293	my_bool my_parse_charset_xml(const char *buf, size_t len,
294	int (add_collation)(CHARSET_INFO cs))
295	{
296	MY_XML_PARSER p;
297	struct my_cs_file_info i;
298	my_bool rc;
299
300	my_xml_parser_create(&p);
301	my_xml_set_enter_handler(&p,cs_enter);
302	my_xml_set_value_handler(&p,cs_value);
303	my_xml_set_leave_handler(&p,cs_leave);
304	i.add_collation= add_collation;
305	my_xml_set_user_data(&p,(void*)&i);
306	rc= (my_xml_parse(&p,buf,len) == MY_XML_OK) ? FALSE : TRUE;
307	my_xml_parser_free(&p);
308	return rc;
309	}
310
311
312	/*
313	Check repertoire: detect pure ascii strings
314	*/
315	uint
316	my_string_repertoire(CHARSET_INFO cs, const char str, ulong length)
317	{
318	const char *strend= str + length;
319	if (cs->mbminlen == 1)
320	{
321	for ( ; str < strend; str++)
322	{
323	if (((uchar) *str) > 0x7F)
324	return MY_REPERTOIRE_UNICODE30;
325	}
326	}
327	else
328	{
329	my_wc_t wc;
330	int chlen;
331	for (; (chlen= cs->cset->mb_wc(cs, &wc, str, strend)) > 0; str+= chlen)
332	{
333	if (wc > 0x7F)
334	return MY_REPERTOIRE_UNICODE30;
335	}
336	}
337	return MY_REPERTOIRE_ASCII;
338	}
339
340
341	/*
342	Detect whether a character set is ASCII compatible.
343
344	Returns TRUE for:
345
346	- all 8bit character sets whose Unicode mapping of 0x7B is '{'
347	(ignores swe7 which maps 0x7B to "LATIN LETTER A WITH DIAERESIS")
348
349	- all multi-byte character sets having mbminlen == 1
350	(ignores ucs2 whose mbminlen is 2)
351
352	TODO:
353
354	When merging to 5.2, this function should be changed
355	to check a new flag MY_CS_NONASCII,
356
357	return (cs->flag & MY_CS_NONASCII) ? 0 : 1;
358
359	This flag was previously added into 5.2 under terms
360	of WL#3759 "Optimize identifier conversion in client-server protocol"
361	especially to mark character sets not compatible with ASCII.
362
363	We won't backport this flag to 5.0 or 5.1.
364	This function is Ok for 5.0 and 5.1, because we're not going
365	to introduce new tricky character sets between 5.0 and 5.2.
366	*/
367	my_bool
368	my_charset_is_ascii_based(CHARSET_INFO *cs)
369	{
370	return
371	(cs->mbmaxlen == 1 && cs->tab_to_uni && cs->tab_to_uni['{'] == '{') \|\|
372	(cs->mbminlen == 1 && cs->mbmaxlen > 1);
373	}
374
375
376	/*
377	Detect if a character set is 8bit,
378	and it is pure ascii, i.e. doesn't have
379	characters outside U+0000..U+007F
380	This functions is shared between "conf_to_src"
381	and dynamic charsets loader in "mysqld".
382	*/
383	my_bool
384	my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs)
385	{
386	size_t code;
387	if (!cs->tab_to_uni)
388	return 0;
389	for (code= 0; code < 256; code++)
390	{
391	if (cs->tab_to_uni[code] > 0x7F)
392	return 0;
393	}
394	return 1;
395	}
396
397
398	/*
399	Shared function between conf_to_src and mysys.
400	Check if a 8bit character set is compatible with
401	ascii on the range 0x00..0x7F.
402	*/
403	my_bool
404	my_charset_is_ascii_compatible(CHARSET_INFO *cs)
405	{
406	uint i;
407	if (!cs->tab_to_uni)
408	return 1;
409	for (i= 0; i < 128; i++)
410	{
411	if (cs->tab_to_uni[i] != i)
412	return 0;
413	}
414	return 1;
415	}