~drizzle-trunk/drizzle/development : contents of mysys/charset.cc at revision 971.3.52

~drizzle-trunk/drizzle/development : (revision 971.3.52)

1 by brian clean slate	1	/* Copyright (C) 2000 MySQL AB
	2
	3	This program is free software; you can redistribute it and/or modify
	4	it under the terms of the GNU General Public License as published by
	5	the Free Software Foundation; version 2 of the License.
	6
	7	This program is distributed in the hope that it will be useful,
	8	but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	GNU General Public License for more details.
	11
	12	You should have received a copy of the GNU General Public License
	13	along with this program; if not, write to the Free Software
	14	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
	15
994.2.4 by Monty Taylor Blast. Fixed some make distcheck issues.	16	#include "mysys/mysys_priv.h"
	17	#include "mysys/mysys_err.h"
212.5.18 by Monty Taylor Moved m_ctype, m_string and my_bitmap. Removed t_ctype.	18	#include <mystrings/m_ctype.h>
	19	#include <mystrings/m_string.h>
722.1.4 by Monty Taylor Removed all the setting of DEFS everywhere. Use configmake.h to get the values	20	#include <drizzled/configmake.h>
1 by brian clean slate	21
	22
	23	/*
	24	The code below implements this functionality:
660.1.3 by Eric Herman removed trailing whitespace with simple script:	25
1 by brian clean slate	26	- Initializing charset related structures
1 by brian clean slate	27	- Loading dynamic charsets
660.1.3 by Eric Herman removed trailing whitespace with simple script:	28	- Searching for a proper CHARSET_INFO
1 by brian clean slate	29	using charset name, collation name or collation ID
	30	- Setting server default character set
	31	*/
	32
236.3.9 by Andrey Hristov - Fix build of exotic, mostly non-western, charsets (--with-extra-charsets)	33	bool my_charset_same(const CHARSET_INFO cs1, const CHARSET_INFO cs2)
1 by brian clean slate	34	{
	35	return ((cs1 == cs2) \|\| !strcmp(cs1->csname,cs2->csname));
	36	}
	37
	38
	39	static uint
	40	get_collation_number_internal(const char *name)
	41	{
	42	CHARSET_INFO **cs;
	43	for (cs= all_charsets;
	44	cs < all_charsets+array_elements(all_charsets)-1 ;
	45	cs++)
	46	{
660.1.3 by Eric Herman removed trailing whitespace with simple script:	47	if ( cs[0] && cs[0]->name &&
383.1.12 by Brian Aker Much closer toward UTF8 being around all the time...	48	!my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->name, name))
1 by brian clean slate	49	return cs[0]->number;
660.1.3 by Eric Herman removed trailing whitespace with simple script:	50	}
1 by brian clean slate	51	return 0;
	52	}
	53
	54
146 by Brian Aker my_bool cleanup.	55	static bool init_state_maps(CHARSET_INFO *cs)
1 by brian clean slate	56	{
482 by Brian Aker Remove uint.	57	uint32_t i;
481 by Brian Aker Remove all of uchar.	58	unsigned char *state_map;
481 by Brian Aker Remove all of uchar.	59	unsigned char *ident_map;
1 by brian clean slate	60
656.3.1 by Monty Taylor Got rid of my_once_alloc.	61	if (!(cs->state_map= (unsigned char*) malloc(256)))
1 by brian clean slate	62	return 1;
1 by brian clean slate	63
656.3.1 by Monty Taylor Got rid of my_once_alloc.	64	if (!(cs->ident_map= (unsigned char*) malloc(256)))
1 by brian clean slate	65	return 1;
	66
	67	state_map= cs->state_map;
	68	ident_map= cs->ident_map;
660.1.3 by Eric Herman removed trailing whitespace with simple script:	69
1 by brian clean slate	70	/* Fill state_map with states to get a faster parser */
	71	for (i=0; i < 256 ; i++)
	72	{
	73	if (my_isalpha(cs,i))
481 by Brian Aker Remove all of uchar.	74	state_map[i]=(unsigned char) MY_LEX_IDENT;
1 by brian clean slate	75	else if (my_isdigit(cs,i))
481 by Brian Aker Remove all of uchar.	76	state_map[i]=(unsigned char) MY_LEX_NUMBER_IDENT;
1 by brian clean slate	77	#if defined(USE_MB) && defined(USE_MB_IDENT)
1 by brian clean slate	78	else if (my_mbcharlen(cs, i)>1)
481 by Brian Aker Remove all of uchar.	79	state_map[i]=(unsigned char) MY_LEX_IDENT;
1 by brian clean slate	80	#endif
1 by brian clean slate	81	else if (my_isspace(cs,i))
481 by Brian Aker Remove all of uchar.	82	state_map[i]=(unsigned char) MY_LEX_SKIP;
1 by brian clean slate	83	else
481 by Brian Aker Remove all of uchar.	84	state_map[i]=(unsigned char) MY_LEX_CHAR;
1 by brian clean slate	85	}
481 by Brian Aker Remove all of uchar.	86	state_map[(unsigned char)'_']=state_map[(unsigned char)'$']=(unsigned char) MY_LEX_IDENT;
	87	state_map[(unsigned char)'\'']=(unsigned char) MY_LEX_STRING;
	88	state_map[(unsigned char)'.']=(unsigned char) MY_LEX_REAL_OR_POINT;
	89	state_map[(unsigned char)'>']=state_map[(unsigned char)'=']=state_map[(unsigned char)'!']= (unsigned char) MY_LEX_CMP_OP;
	90	state_map[(unsigned char)'<']= (unsigned char) MY_LEX_LONG_CMP_OP;
	91	state_map[(unsigned char)'&']=state_map[(unsigned char)'\|']=(unsigned char) MY_LEX_BOOL;
	92	state_map[(unsigned char)'#']=(unsigned char) MY_LEX_COMMENT;
	93	state_map[(unsigned char)';']=(unsigned char) MY_LEX_SEMICOLON;
	94	state_map[(unsigned char)':']=(unsigned char) MY_LEX_SET_VAR;
	95	state_map[0]=(unsigned char) MY_LEX_EOL;
	96	state_map[(unsigned char)'\\']= (unsigned char) MY_LEX_ESCAPE;
	97	state_map[(unsigned char)'/']= (unsigned char) MY_LEX_LONG_COMMENT;
	98	state_map[(unsigned char)'*']= (unsigned char) MY_LEX_END_LONG_COMMENT;
	99	state_map[(unsigned char)'@']= (unsigned char) MY_LEX_USER_END;
	100	state_map[(unsigned char) '`']= (unsigned char) MY_LEX_USER_VARIABLE_DELIMITER;
	101	state_map[(unsigned char)'"']= (unsigned char) MY_LEX_STRING_OR_DELIMITER;
1 by brian clean slate	102
	103	/*
	104	Create a second map to make it faster to find identifiers
	105	*/
	106	for (i=0; i < 256 ; i++)
	107	{
481 by Brian Aker Remove all of uchar.	108	ident_map[i]= (unsigned char) (state_map[i] == MY_LEX_IDENT \|\|
1 by brian clean slate	109	state_map[i] == MY_LEX_NUMBER_IDENT);
	110	}
	111
	112	/* Special handling of hex and binary strings */
481 by Brian Aker Remove all of uchar.	113	state_map[(unsigned char)'x']= state_map[(unsigned char)'X']= (unsigned char) MY_LEX_IDENT_OR_HEX;
481 by Brian Aker Remove all of uchar.	114	state_map[(unsigned char)'b']= state_map[(unsigned char)'B']= (unsigned char) MY_LEX_IDENT_OR_BIN;
1 by brian clean slate	115	return 0;
	116	}
	117
	118
/* Set to true by init_available_charsets() once all_charsets has been filled in. */
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	119	static bool charset_initialized= false;
1 by brian clean slate	120
/* Collation table; add_compiled_collation() stores each entry at index cs->number. */
1 by brian clean slate	121	CHARSET_INFO *all_charsets[256];
/* Default collation; get_charset() returns it directly when asked for its number. */
383.1.12 by Brian Aker Much closer toward UTF8 being around all the time...	122	const CHARSET_INFO *default_charset_info = &my_charset_utf8_general_ci;
1 by brian clean slate	123
264.2.6 by Andrey Hristov Constify the usage of CHARSET_INFO almost to the last place in the code.	124	void add_compiled_collation(CHARSET_INFO * cs)
1 by brian clean slate	125	{
	126	all_charsets[cs->number]= cs;
	127	cs->state\|= MY_CS_AVAILABLE;
	128	}
	129
/*
  Allocation callback handed to the charset/collation init hooks.

  Forwards the request to malloc(); the result is NULL when malloc() fails.
*/
void *cs_alloc(size_t size)
{
  void *block= malloc(size);
  return block;
}
	134
	135
146 by Brian Aker my_bool cleanup.	136	static bool init_available_charsets(myf myflags)
1 by brian clean slate	137	{
862 by Brian Aker Remove charset directory code.	138	bool error= false;
1 by brian clean slate	139	/*
	140	We have to use charset_initialized to not lock on THR_LOCK_charset
	141	inside get_internal_charset...
	142	*/
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	143	if (charset_initialized == false)
1 by brian clean slate	144	{
1 by brian clean slate	145	CHARSET_INFO **cs;
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	146	memset(&all_charsets, 0, sizeof(all_charsets));
	147	init_compiled_charsets(myflags);
	148
	149	/* Copy compiled charsets */
	150	for (cs=all_charsets;
	151	cs < all_charsets+array_elements(all_charsets)-1 ;
	152	cs++)
1 by brian clean slate	153	{
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	154	if (*cs)
1 by brian clean slate	155	{
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	156	if (cs[0]->ctype)
	157	if (init_state_maps(*cs))
	158	*cs= NULL;
1 by brian clean slate	159	}
1 by brian clean slate	160	}
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	161
	162	charset_initialized= true;
1 by brian clean slate	163	}
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	164	assert(charset_initialized);
	165
1 by brian clean slate	166	return error;
	167	}
	168
	169
	170	void free_charsets(void)
	171	{
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	172	charset_initialized= true;
1 by brian clean slate	173	}
	174
	175
482 by Brian Aker Remove uint.	176	uint32_t get_collation_number(const char *name)
1 by brian clean slate	177	{
	178	init_available_charsets(MYF(0));
	179	return get_collation_number_internal(name);
	180	}
	181
	182
482 by Brian Aker Remove uint.	183	uint32_t get_charset_number(const char *charset_name, uint32_t cs_flags)
1 by brian clean slate	184	{
	185	CHARSET_INFO **cs;
	186	init_available_charsets(MYF(0));
660.1.3 by Eric Herman removed trailing whitespace with simple script:	187
1 by brian clean slate	188	for (cs= all_charsets;
	189	cs < all_charsets+array_elements(all_charsets)-1 ;
	190	cs++)
	191	{
	192	if ( cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) &&
383.1.12 by Brian Aker Much closer toward UTF8 being around all the time...	193	!my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->csname, charset_name))
1 by brian clean slate	194	return cs[0]->number;
660.1.3 by Eric Herman removed trailing whitespace with simple script:	195	}
1 by brian clean slate	196	return 0;
	197	}
	198
	199
482 by Brian Aker Remove uint.	200	const char *get_charset_name(uint32_t charset_number)
1 by brian clean slate	201	{
264.2.6 by Andrey Hristov Constify the usage of CHARSET_INFO almost to the last place in the code.	202	const CHARSET_INFO *cs;
1 by brian clean slate	203	init_available_charsets(MYF(0));
	204
	205	cs=all_charsets[charset_number];
	206	if (cs && (cs->number == charset_number) && cs->name )
	207	return (char*) cs->name;
660.1.3 by Eric Herman removed trailing whitespace with simple script:	208
1 by brian clean slate	209	return (char) "?"; / this mimics find_type() */
	210	}
	211
	212
482 by Brian Aker Remove uint.	213	static const CHARSET_INFO *get_internal_charset(uint32_t cs_number)
1 by brian clean slate	214	{
	215	CHARSET_INFO *cs;
	216	/*
	217	To make things thread safe we are not allowing other threads to interfere
	218	while we may changing the cs_info_table
	219	*/
	220	if ((cs= all_charsets[cs_number]))
	221	{
	222	if (!(cs->state & MY_CS_COMPILED) && !(cs->state & MY_CS_LOADED))
	223	{
383.1.7 by Brian Aker Remove homebrew xml parser.	224	assert(0);
1 by brian clean slate	225	}
	226	cs= (cs->state & MY_CS_AVAILABLE) ? cs : NULL;
	227	}
	228	if (cs && !(cs->state & MY_CS_READY))
	229	{
	230	if ((cs->cset->init && cs->cset->init(cs, cs_alloc)) \|\|
	231	(cs->coll->init && cs->coll->init(cs, cs_alloc)))
	232	cs= NULL;
	233	else
	234	cs->state\|= MY_CS_READY;
	235	}
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	236
1 by brian clean slate	237	return cs;
	238	}
	239
	240
862 by Brian Aker Remove charset directory code.	241	const CHARSET_INFO *get_charset(uint32_t cs_number)
1 by brian clean slate	242	{
264.2.6 by Andrey Hristov Constify the usage of CHARSET_INFO almost to the last place in the code.	243	const CHARSET_INFO *cs;
1 by brian clean slate	244	if (cs_number == default_charset_info->number)
	245	return default_charset_info;
	246
	247	(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
660.1.3 by Eric Herman removed trailing whitespace with simple script:	248
1 by brian clean slate	249	if (!cs_number \|\| cs_number >= array_elements(all_charsets)-1)
1 by brian clean slate	250	return NULL;
660.1.3 by Eric Herman removed trailing whitespace with simple script:	251
383.1.7 by Brian Aker Remove homebrew xml parser.	252	cs= get_internal_charset(cs_number);
1 by brian clean slate	253
	254	return cs;
	255	}
	256
862 by Brian Aker Remove charset directory code.	257	const CHARSET_INFO get_charset_by_name(const char cs_name)
1 by brian clean slate	258	{
482 by Brian Aker Remove uint.	259	uint32_t cs_number;
264.2.6 by Andrey Hristov Constify the usage of CHARSET_INFO almost to the last place in the code.	260	const CHARSET_INFO *cs;
1 by brian clean slate	261	(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
1 by brian clean slate	262
1014.3.1 by Brian Aker Simplify the calling stack for getting schema collation. We need to extend	263	cs_number= get_collation_number(cs_name);
383.1.7 by Brian Aker Remove homebrew xml parser.	264	cs= cs_number ? get_internal_charset(cs_number) : NULL;
1 by brian clean slate	265
	266	return cs;
	267	}
	268
	269
862 by Brian Aker Remove charset directory code.	270	const CHARSET_INFO get_charset_by_csname(const char cs_name, uint32_t cs_flags)
1 by brian clean slate	271	{
482 by Brian Aker Remove uint.	272	uint32_t cs_number;
264.2.6 by Andrey Hristov Constify the usage of CHARSET_INFO almost to the last place in the code.	273	const CHARSET_INFO *cs;
1 by brian clean slate	274
	275	(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
	276
	277	cs_number= get_charset_number(cs_name, cs_flags);
383.1.7 by Brian Aker Remove homebrew xml parser.	278	cs= cs_number ? get_internal_charset(cs_number) : NULL;
1 by brian clean slate	279
51.3.22 by Jay Pipes Final round of removal of DBUG in mysys/, including Makefile	280	return(cs);
1 by brian clean slate	281	}
	282
	283
	284	/*
	285	Escape apostrophes by doubling them up
	286
	287	SYNOPSIS
236.3.4 by Andrey Hristov Rename escape_(string\|quotes)_for_mysql to escape_(string\|quotes)_for_drizzle	288	escape_quotes_for_drizzle()
1 by brian clean slate	289	charset_info Charset of the strings
	290	to Buffer for escaped string
	291	to_length Length of destination buffer, or 0
	292	from The string to escape
	293	length The length of the string to escape
	294
	295	DESCRIPTION
	296	This escapes the contents of a string by doubling up any apostrophes that
	297	it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
	298	effect on the server.
	299
	300	NOTE
	301	To be consistent with escape_string_for_mysql(), to_length may be 0 to
	302	mean "big enough"
	303
	304	RETURN VALUES
365.2.9 by Monty Taylor Got rid of all instances of ~0	305	UINT32_MAX The escaped string did not fit in the to buffer
1 by brian clean slate	306	>=0 The length of the escaped string
	307	*/
	308
236.3.9 by Andrey Hristov - Fix build of exotic, mostly non-western, charsets (--with-extra-charsets)	309	size_t escape_quotes_for_drizzle(const CHARSET_INFO *charset_info,
236.3.4 by Andrey Hristov Rename escape_(string\|quotes)_for_mysql to escape_(string\|quotes)_for_drizzle	310	char *to, size_t to_length,
	311	const char *from, size_t length)
1 by brian clean slate	312	{
	313	const char *to_start= to;
	314	const char end, to_end=to_start + (to_length ? to_length-1 : 2*length);
163 by Brian Aker Merge Monty's code.	315	bool overflow= false;
1 by brian clean slate	316	#ifdef USE_MB
146 by Brian Aker my_bool cleanup.	317	bool use_mb_flag= use_mb(charset_info);
1 by brian clean slate	318	#endif
	319	for (end= from + length; from < end; from++)
	320	{
	321	#ifdef USE_MB
	322	int tmp_length;
	323	if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
	324	{
	325	if (to + tmp_length > to_end)
	326	{
163 by Brian Aker Merge Monty's code.	327	overflow= true;
1 by brian clean slate	328	break;
	329	}
	330	while (tmp_length--)
	331	to++= from++;
	332	from--;
	333	continue;
	334	}
	335	/*
	336	We don't have the same issue here with a non-multi-byte character being
	337	turned into a multi-byte character by the addition of an escaping
	338	character, because we are only escaping the ' character with itself.
	339	*/
	340	#endif
	341	if (*from == '\'')
	342	{
	343	if (to + 2 > to_end)
	344	{
163 by Brian Aker Merge Monty's code.	345	overflow= true;
1 by brian clean slate	346	break;
	347	}
	348	*to++= '\'';
	349	*to++= '\'';
	350	}
	351	else
	352	{
	353	if (to + 1 > to_end)
	354	{
163 by Brian Aker Merge Monty's code.	355	overflow= true;
1 by brian clean slate	356	break;
	357	}
	358	to++= from;
	359	}
	360	}
	361	*to= 0;
365.2.9 by Monty Taylor Got rid of all instances of ~0	362	return overflow ? UINT32_MAX : (uint32_t) (to - to_start);
1 by brian clean slate	363	}