~drizzle-trunk/drizzle/development : contents of drizzled/charset.cc at revision 1912

~drizzle-trunk/drizzle/development : (revision 1912)

1 by brian clean slate	1	/* Copyright (C) 2000 MySQL AB
	2
	3	This program is free software; you can redistribute it and/or modify
	4	it under the terms of the GNU General Public License as published by
	5	the Free Software Foundation; version 2 of the License.
	6
	7	This program is distributed in the hope that it will be useful,
	8	but WITHOUT ANY WARRANTY; without even the implied warranty of
	9	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	10	GNU General Public License for more details.
	11
	12	You should have received a copy of the GNU General Public License
	13	along with this program; if not, write to the Free Software
1802.10.2 by Monty Taylor Update all of the copyright headers to include the correct address.	14	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
1 by brian clean slate	15
1241.9.57 by Monty Taylor Oy. Bigger change than I normally like - but this stuff is all intertwined.	16	#include "config.h"
	17
	18	#include "drizzled/charset.h"
1271.5.3 by Tim Penhey change the include files	19	#include "drizzled/error.h"
1241.9.61 by Monty Taylor No more mystrings in drizzled/	20	#include "drizzled/charset_info.h"
1241.9.64 by Monty Taylor Moved remaining non-public portions of mysys and mystrings to drizzled/internal.	21	#include "drizzled/internal/m_string.h"
722.1.4 by Monty Taylor Removed all the setting of DEFS everywhere. Use configmake.h to get the values	22	#include <drizzled/configmake.h>
1106.1.1 by Brian Aker Monty fixes pluss a few from me for charset.	23	#include <vector>
	24
	25	using namespace std;
	26
1280.1.10 by Monty Taylor Put everything in drizzled into drizzled namespace.	27	namespace drizzled
	28	{
1106.1.1 by Brian Aker Monty fixes pluss a few from me for charset.	29
	30	/*
	31	We collect memory in this vector that we free on delete.
	32	*/
	33	static vector<void *>memory_vector;
1 by brian clean slate	34
	35	/*
	36	The code below implements this functionality:
660.1.3 by Eric Herman removed trailing whitespace with simple script:	37
1 by brian clean slate	38	- Initializing charset related structures
1 by brian clean slate	39	- Loading dynamic charsets
660.1.3 by Eric Herman removed trailing whitespace with simple script:	40	- Searching for a proper CHARSET_INFO
1 by brian clean slate	41	using charset name, collation name or collation ID
	42	- Setting server default character set
	43	*/
	44
236.3.9 by Andrey Hristov - Fix build of exotic, mostly non-western, charsets (--with-extra-charsets)	45	bool my_charset_same(const CHARSET_INFO cs1, const CHARSET_INFO cs2)
1 by brian clean slate	46	{
	47	return ((cs1 == cs2) \|\| !strcmp(cs1->csname,cs2->csname));
	48	}
	49
	50
	51	static uint
	52	get_collation_number_internal(const char *name)
	53	{
	54	CHARSET_INFO **cs;
	55	for (cs= all_charsets;
	56	cs < all_charsets+array_elements(all_charsets)-1 ;
	57	cs++)
	58	{
660.1.3 by Eric Herman removed trailing whitespace with simple script:	59	if ( cs[0] && cs[0]->name &&
383.1.12 by Brian Aker Much closer toward UTF8 being around all the time...	60	!my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->name, name))
1 by brian clean slate	61	return cs[0]->number;
660.1.3 by Eric Herman removed trailing whitespace with simple script:	62	}
1 by brian clean slate	63	return 0;
	64	}
	65
	66
146 by Brian Aker my_bool cleanup.	67	static bool init_state_maps(CHARSET_INFO *cs)
1 by brian clean slate	68	{
482 by Brian Aker Remove uint.	69	uint32_t i;
481 by Brian Aker Remove all of uchar.	70	unsigned char *state_map;
481 by Brian Aker Remove all of uchar.	71	unsigned char *ident_map;
1 by brian clean slate	72
1106.1.1 by Brian Aker Monty fixes pluss a few from me for charset.	73	if (!(cs->state_map= (unsigned char*) cs_alloc(256)))
1 by brian clean slate	74	return 1;
1 by brian clean slate	75
1106.1.1 by Brian Aker Monty fixes pluss a few from me for charset.	76	if (!(cs->ident_map= (unsigned char*) cs_alloc(256)))
1 by brian clean slate	77	return 1;
	78
	79	state_map= cs->state_map;
	80	ident_map= cs->ident_map;
660.1.3 by Eric Herman removed trailing whitespace with simple script:	81
1 by brian clean slate	82	/* Fill state_map with states to get a faster parser */
	83	for (i=0; i < 256 ; i++)
	84	{
	85	if (my_isalpha(cs,i))
481 by Brian Aker Remove all of uchar.	86	state_map[i]=(unsigned char) MY_LEX_IDENT;
1 by brian clean slate	87	else if (my_isdigit(cs,i))
481 by Brian Aker Remove all of uchar.	88	state_map[i]=(unsigned char) MY_LEX_NUMBER_IDENT;
1 by brian clean slate	89	else if (my_mbcharlen(cs, i)>1)
481 by Brian Aker Remove all of uchar.	90	state_map[i]=(unsigned char) MY_LEX_IDENT;
1 by brian clean slate	91	else if (my_isspace(cs,i))
481 by Brian Aker Remove all of uchar.	92	state_map[i]=(unsigned char) MY_LEX_SKIP;
1 by brian clean slate	93	else
481 by Brian Aker Remove all of uchar.	94	state_map[i]=(unsigned char) MY_LEX_CHAR;
1 by brian clean slate	95	}
481 by Brian Aker Remove all of uchar.	96	state_map[(unsigned char)'_']=state_map[(unsigned char)'$']=(unsigned char) MY_LEX_IDENT;
	97	state_map[(unsigned char)'\'']=(unsigned char) MY_LEX_STRING;
	98	state_map[(unsigned char)'.']=(unsigned char) MY_LEX_REAL_OR_POINT;
	99	state_map[(unsigned char)'>']=state_map[(unsigned char)'=']=state_map[(unsigned char)'!']= (unsigned char) MY_LEX_CMP_OP;
	100	state_map[(unsigned char)'<']= (unsigned char) MY_LEX_LONG_CMP_OP;
	101	state_map[(unsigned char)'&']=state_map[(unsigned char)'\|']=(unsigned char) MY_LEX_BOOL;
	102	state_map[(unsigned char)'#']=(unsigned char) MY_LEX_COMMENT;
	103	state_map[(unsigned char)';']=(unsigned char) MY_LEX_SEMICOLON;
	104	state_map[(unsigned char)':']=(unsigned char) MY_LEX_SET_VAR;
	105	state_map[0]=(unsigned char) MY_LEX_EOL;
	106	state_map[(unsigned char)'\\']= (unsigned char) MY_LEX_ESCAPE;
	107	state_map[(unsigned char)'/']= (unsigned char) MY_LEX_LONG_COMMENT;
	108	state_map[(unsigned char)'*']= (unsigned char) MY_LEX_END_LONG_COMMENT;
	109	state_map[(unsigned char)'@']= (unsigned char) MY_LEX_USER_END;
	110	state_map[(unsigned char) '`']= (unsigned char) MY_LEX_USER_VARIABLE_DELIMITER;
	111	state_map[(unsigned char)'"']= (unsigned char) MY_LEX_STRING_OR_DELIMITER;
1 by brian clean slate	112
	113	/*
	114	Create a second map to make it faster to find identifiers
	115	*/
	116	for (i=0; i < 256 ; i++)
	117	{
481 by Brian Aker Remove all of uchar.	118	ident_map[i]= (unsigned char) (state_map[i] == MY_LEX_IDENT \|\|
1 by brian clean slate	119	state_map[i] == MY_LEX_NUMBER_IDENT);
	120	}
	121
	122	/* Special handling of hex and binary strings */
481 by Brian Aker Remove all of uchar.	123	state_map[(unsigned char)'x']= state_map[(unsigned char)'X']= (unsigned char) MY_LEX_IDENT_OR_HEX;
481 by Brian Aker Remove all of uchar.	124	state_map[(unsigned char)'b']= state_map[(unsigned char)'B']= (unsigned char) MY_LEX_IDENT_OR_BIN;
1 by brian clean slate	125	return 0;
	126	}
	127
	128
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	129	static bool charset_initialized= false;
1 by brian clean slate	130
1 by brian clean slate	131	CHARSET_INFO *all_charsets[256];
383.1.12 by Brian Aker Much closer toward UTF8 being around all the time...	132	const CHARSET_INFO *default_charset_info = &my_charset_utf8_general_ci;
1 by brian clean slate	133
264.2.6 by Andrey Hristov Constify the usage of CHARSET_INFO almost to the last place in the code.	134	void add_compiled_collation(CHARSET_INFO * cs)
1 by brian clean slate	135	{
	136	all_charsets[cs->number]= cs;
	137	cs->state\|= MY_CS_AVAILABLE;
	138	}
	139
632.1.11 by Monty Taylor Fixed Sun Studio warnings in mysys.	140	void *cs_alloc(size_t size)
1 by brian clean slate	141	{
1106.1.1 by Brian Aker Monty fixes pluss a few from me for charset.	142	void *ptr= malloc(size);
	143
	144	memory_vector.push_back(ptr);
	145
	146	return ptr;
1 by brian clean slate	147	}
	148
	149
1241.9.67 by Monty Taylor Fixed Solaris.	150
146 by Brian Aker my_bool cleanup.	151	static bool init_available_charsets(myf myflags)
1 by brian clean slate	152	{
862 by Brian Aker Remove charset directory code.	153	bool error= false;
1 by brian clean slate	154	/*
	155	We have to use charset_initialized to not lock on THR_LOCK_charset
	156	inside get_internal_charset...
	157	*/
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	158	if (charset_initialized == false)
1 by brian clean slate	159	{
1 by brian clean slate	160	CHARSET_INFO **cs;
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	161	memset(&all_charsets, 0, sizeof(all_charsets));
	162	init_compiled_charsets(myflags);
	163
	164	/* Copy compiled charsets */
	165	for (cs=all_charsets;
	166	cs < all_charsets+array_elements(all_charsets)-1 ;
	167	cs++)
1 by brian clean slate	168	{
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	169	if (*cs)
1 by brian clean slate	170	{
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	171	if (cs[0]->ctype)
	172	if (init_state_maps(*cs))
	173	*cs= NULL;
1 by brian clean slate	174	}
1 by brian clean slate	175	}
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	176
	177	charset_initialized= true;
1 by brian clean slate	178	}
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	179	assert(charset_initialized);
	180
1 by brian clean slate	181	return error;
	182	}
	183
	184
	185	void free_charsets(void)
	186	{
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	187	charset_initialized= true;
1106.1.1 by Brian Aker Monty fixes pluss a few from me for charset.	188
	189	while (memory_vector.empty() == false)
	190	{
	191	void *ptr= memory_vector.back();
	192	memory_vector.pop_back();
	193	free(ptr);
	194	}
	195	memory_vector.clear();
	196
1 by brian clean slate	197	}
	198
	199
482 by Brian Aker Remove uint.	200	uint32_t get_collation_number(const char *name)
1 by brian clean slate	201	{
	202	init_available_charsets(MYF(0));
	203	return get_collation_number_internal(name);
	204	}
	205
	206
482 by Brian Aker Remove uint.	207	uint32_t get_charset_number(const char *charset_name, uint32_t cs_flags)
1 by brian clean slate	208	{
	209	CHARSET_INFO **cs;
	210	init_available_charsets(MYF(0));
660.1.3 by Eric Herman removed trailing whitespace with simple script:	211
1 by brian clean slate	212	for (cs= all_charsets;
	213	cs < all_charsets+array_elements(all_charsets)-1 ;
	214	cs++)
	215	{
	216	if ( cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) &&
383.1.12 by Brian Aker Much closer toward UTF8 being around all the time...	217	!my_strcasecmp(&my_charset_utf8_general_ci, cs[0]->csname, charset_name))
1 by brian clean slate	218	return cs[0]->number;
660.1.3 by Eric Herman removed trailing whitespace with simple script:	219	}
1 by brian clean slate	220	return 0;
	221	}
	222
	223
482 by Brian Aker Remove uint.	224	const char *get_charset_name(uint32_t charset_number)
1 by brian clean slate	225	{
264.2.6 by Andrey Hristov Constify the usage of CHARSET_INFO almost to the last place in the code.	226	const CHARSET_INFO *cs;
1 by brian clean slate	227	init_available_charsets(MYF(0));
	228
	229	cs=all_charsets[charset_number];
	230	if (cs && (cs->number == charset_number) && cs->name )
	231	return (char*) cs->name;
660.1.3 by Eric Herman removed trailing whitespace with simple script:	232
1 by brian clean slate	233	return (char) "?"; / this mimics find_type() */
	234	}
	235
	236
482 by Brian Aker Remove uint.	237	static const CHARSET_INFO *get_internal_charset(uint32_t cs_number)
1 by brian clean slate	238	{
	239	CHARSET_INFO *cs;
	240	/*
	241	To make things thread safe we are not allowing other threads to interfere
	242	while we may changing the cs_info_table
	243	*/
	244	if ((cs= all_charsets[cs_number]))
	245	{
	246	if (!(cs->state & MY_CS_COMPILED) && !(cs->state & MY_CS_LOADED))
	247	{
383.1.7 by Brian Aker Remove homebrew xml parser.	248	assert(0);
1 by brian clean slate	249	}
	250	cs= (cs->state & MY_CS_AVAILABLE) ? cs : NULL;
	251	}
	252	if (cs && !(cs->state & MY_CS_READY))
	253	{
	254	if ((cs->cset->init && cs->cset->init(cs, cs_alloc)) \|\|
	255	(cs->coll->init && cs->coll->init(cs, cs_alloc)))
	256	cs= NULL;
	257	else
	258	cs->state\|= MY_CS_READY;
	259	}
861 by Brian Aker Remove THR_LOCK_charset (we never recall it anymore)	260
1 by brian clean slate	261	return cs;
	262	}
	263
	264
862 by Brian Aker Remove charset directory code.	265	const CHARSET_INFO *get_charset(uint32_t cs_number)
1 by brian clean slate	266	{
264.2.6 by Andrey Hristov Constify the usage of CHARSET_INFO almost to the last place in the code.	267	const CHARSET_INFO *cs;
1 by brian clean slate	268	if (cs_number == default_charset_info->number)
	269	return default_charset_info;
	270
	271	(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
660.1.3 by Eric Herman removed trailing whitespace with simple script:	272
1 by brian clean slate	273	if (!cs_number \|\| cs_number >= array_elements(all_charsets)-1)
1 by brian clean slate	274	return NULL;
660.1.3 by Eric Herman removed trailing whitespace with simple script:	275
383.1.7 by Brian Aker Remove homebrew xml parser.	276	cs= get_internal_charset(cs_number);
1 by brian clean slate	277
	278	return cs;
	279	}
	280
862 by Brian Aker Remove charset directory code.	281	const CHARSET_INFO get_charset_by_name(const char cs_name)
1 by brian clean slate	282	{
482 by Brian Aker Remove uint.	283	uint32_t cs_number;
264.2.6 by Andrey Hristov Constify the usage of CHARSET_INFO almost to the last place in the code.	284	const CHARSET_INFO *cs;
1 by brian clean slate	285	(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
1 by brian clean slate	286
1014.3.1 by Brian Aker Simplify the calling stack for getting schema collation. We need to extend	287	cs_number= get_collation_number(cs_name);
383.1.7 by Brian Aker Remove homebrew xml parser.	288	cs= cs_number ? get_internal_charset(cs_number) : NULL;
1 by brian clean slate	289
	290	return cs;
	291	}
	292
	293
862 by Brian Aker Remove charset directory code.	294	const CHARSET_INFO get_charset_by_csname(const char cs_name, uint32_t cs_flags)
1 by brian clean slate	295	{
482 by Brian Aker Remove uint.	296	uint32_t cs_number;
264.2.6 by Andrey Hristov Constify the usage of CHARSET_INFO almost to the last place in the code.	297	const CHARSET_INFO *cs;
1 by brian clean slate	298
	299	(void) init_available_charsets(MYF(0)); /* If it isn't initialized */
	300
	301	cs_number= get_charset_number(cs_name, cs_flags);
383.1.7 by Brian Aker Remove homebrew xml parser.	302	cs= cs_number ? get_internal_charset(cs_number) : NULL;
1 by brian clean slate	303
51.3.22 by Jay Pipes Final round of removal of DBUG in mysys/, including Makefile	304	return(cs);
1 by brian clean slate	305	}
	306
	307
	308	/*
	309	Escape apostrophes by doubling them up
	310
	311	SYNOPSIS
236.3.4 by Andrey Hristov Rename escape_(string\|quotes)_for_mysql to escape_(string\|quotes)_for_drizzle	312	escape_quotes_for_drizzle()
1 by brian clean slate	313	charset_info Charset of the strings
	314	to Buffer for escaped string
	315	to_length Length of destination buffer, or 0
	316	from The string to escape
	317	length The length of the string to escape
	318
	319	DESCRIPTION
	320	This escapes the contents of a string by doubling up any apostrophes that
	321	it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
	322	effect on the server.
	323
	324	NOTE
	325	To be consistent with escape_string_for_drizzle(), to_length may be 0 to
	326	mean "big enough"
	327
	328	RETURN VALUES
365.2.9 by Monty Taylor Got rid of all instances of ~0	329	UINT32_MAX The escaped string did not fit in the to buffer
1 by brian clean slate	330	>=0 The length of the escaped string
	331	*/
	332
236.3.9 by Andrey Hristov - Fix build of exotic, mostly non-western, charsets (--with-extra-charsets)	333	size_t escape_quotes_for_drizzle(const CHARSET_INFO *charset_info,
236.3.4 by Andrey Hristov Rename escape_(string\|quotes)_for_mysql to escape_(string\|quotes)_for_drizzle	334	char *to, size_t to_length,
	335	const char *from, size_t length)
1 by brian clean slate	336	{
	337	const char *to_start= to;
	338	const char end, to_end=to_start + (to_length ? to_length-1 : 2*length);
163 by Brian Aker Merge Monty's code.	339	bool overflow= false;
146 by Brian Aker my_bool cleanup.	340	bool use_mb_flag= use_mb(charset_info);
1 by brian clean slate	341	for (end= from + length; from < end; from++)
	342	{
	343	int tmp_length;
	344	if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
	345	{
	346	if (to + tmp_length > to_end)
	347	{
163 by Brian Aker Merge Monty's code.	348	overflow= true;
1 by brian clean slate	349	break;
	350	}
	351	while (tmp_length--)
	352	to++= from++;
	353	from--;
	354	continue;
	355	}
	356	/*
	357	We don't have the same issue here with a non-multi-byte character being
	358	turned into a multi-byte character by the addition of an escaping
	359	character, because we are only escaping the ' character with itself.
	360	*/
	361	if (*from == '\'')
	362	{
	363	if (to + 2 > to_end)
	364	{
163 by Brian Aker Merge Monty's code.	365	overflow= true;
1 by brian clean slate	366	break;
	367	}
	368	*to++= '\'';
	369	*to++= '\'';
	370	}
	371	else
	372	{
	373	if (to + 1 > to_end)
	374	{
163 by Brian Aker Merge Monty's code.	375	overflow= true;
1 by brian clean slate	376	break;
	377	}
	378	to++= from;
	379	}
	380	}
	381	*to= 0;
365.2.9 by Monty Taylor Got rid of all instances of ~0	382	return overflow ? UINT32_MAX : (uint32_t) (to - to_start);
1 by brian clean slate	383	}
1280.1.10 by Monty Taylor Put everything in drizzled into drizzled namespace.	384
	385	} /* namespace drizzled */