~drizzle-trunk/drizzle/development : contents of drizzled/lex_input

~drizzle-trunk/drizzle/development : (revision 2390)

/* -*- mode: c++; c-basic-offset: 2; indent-tabs-mode: nil; -*-
 *  vim:expandtab:shiftwidth=2:tabstop=2:smarttab:
 *
 *  Copyright (C) 2008 Sun Microsystems, Inc.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; version 2 of the License.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#pragma once

/**
  @brief This class represents the character input stream consumed during
  lexical analysis.

  In addition to consuming the input stream, this class performs some
  comment pre-processing, by filtering out-of-bound special text out of
  the query input stream.
  Two buffers, with pointers inside each buffer, are maintained in
  parallel. The 'raw' buffer is the original query text, which may
  contain out-of-bound comments. The 'cpp' (for comments pre-processor)
  buffer is the pre-processed buffer that contains only the query text
  that should be seen once out-of-bound data is removed.
*/

namespace drizzled {

class Lex_input_stream
{
public:
  /**
    Create an input stream for a query text.
    The constructor is defined out of line; see its implementation for how
    the buffers are initialised.
    @param session the session the stream belongs to.
    @param buff    the query text to parse.
    @param length  the length of buff.
  */
  Lex_input_stream(Session *session, const char* buff, unsigned int length);

  /**
    Set the echo mode.

    When echo is true, characters parsed from the raw input stream are
    copied to the pre-processed buffer. When false, characters parsed are
    consumed without being copied.
    @param echo the echo mode.
  */
  void set_echo(bool echo)
  {
    m_echo= echo;
  }

  /**
    Skip binary from the input stream.
    Behaves exactly like yySkipn(): the bytes are accepted, and copied to
    the pre-processed buffer when echo is on.
    @param n number of bytes to accept.
  */
  void skip_binary(int n)
  {
    yySkipn(n);
  }

  /**
    Get a character, and advance in the stream.
    The character is also appended to the pre-processed buffer when echo
    is on.
    @return the next character to parse.
  */
  char yyGet()
  {
    const char c= *m_ptr++;
    if (m_echo)
      *m_cpp_ptr++ = c;
    return c;
  }

  /**
    Get the last character accepted.
    @return the last character accepted.
  */
  char yyGetLast() const
  {
    return m_ptr[-1];
  }

  /**
    Look at the next character to parse, but do not accept it.
    @return the character at the current stream position.
  */
  char yyPeek() const
  {
    return m_ptr[0];
  }

  /**
    Look ahead at some character to parse.
    @param n offset of the character to look up, relative to the current
             stream position.
    @return the character at that offset.
  */
  char yyPeekn(int n) const
  {
    return m_ptr[n];
  }

  /**
    Cancel the effect of the last yyGet() or yySkip().
    Note that the echo mode should not change between calls to yyGet / yySkip
    and yyUnget. The caller is responsible for ensuring that.
  */
  void yyUnget()
  {
    m_ptr--;
    if (m_echo)
      m_cpp_ptr--;
  }

  /**
    Accept a character, by advancing the input stream.
    The character is copied to the pre-processed buffer when echo is on.
  */
  void yySkip()
  {
    if (m_echo)
      *m_cpp_ptr++ = *m_ptr++;
    else
      m_ptr++;
  }

  /**
    Accept multiple characters at once.
    The characters are copied to the pre-processed buffer when echo is on.
    @param n the number of characters to accept. It is passed to memcpy()
             as a size when echo is on, so it must not be negative.
  */
  void yySkipn(int n)
  {
    if (m_echo)
    {
      memcpy(m_cpp_ptr, m_ptr, n);
      m_cpp_ptr += n;
    }
    m_ptr += n;
  }

  /**
    End of file indicator for the query text to parse.
    @return true if there are no more characters to parse
  */
  bool eof() const
  {
    return (m_ptr >= m_end_of_query);
  }

  /**
    End of file indicator for the query text to parse.
    @param n number of characters expected
    @return true if n or fewer characters remain before the end of the
            query text
  */
  bool eof(int n) const
  {
    /*
      Compare distances rather than computing m_ptr + n, so that no pointer
      past the end of the raw buffer is ever formed.
    */
    return ((m_end_of_query - m_ptr) <= n);
  }

  /** Get the raw query buffer. */
  const char *get_buf() const
  {
    return m_buf;
  }

  /** Get the pre-processed query buffer. */
  const char *get_cpp_buf() const
  {
    return m_cpp_buf;
  }

  /** Get the end of the raw query buffer. */
  const char *get_end_of_query() const
  {
    return m_end_of_query;
  }

  /**
    Mark the stream position as the start of a new token.
    The start of the current token is remembered as the previous token
    start, in both the raw and the pre-processed buffers.
  */
  void start_token()
  {
    m_tok_start_prev= m_tok_start;
    m_tok_start= m_ptr;
    m_tok_end= m_ptr;

    m_cpp_tok_start_prev= m_cpp_tok_start;
    m_cpp_tok_start= m_cpp_ptr;
    m_cpp_tok_end= m_cpp_ptr;
  }

  /**
    Adjust the starting position of the current token.
    This is used to compensate for starting whitespace.
  */
  void restart_token()
  {
    m_tok_start= m_ptr;
    m_cpp_tok_start= m_cpp_ptr;
  }

  /** Get the token start position, in the raw buffer. */
  const char *get_tok_start() const
  {
    return m_tok_start;
  }

  /** Get the token start position, in the pre-processed buffer. */
  const char *get_cpp_tok_start() const
  {
    return m_cpp_tok_start;
  }

  /** Get the token end position, in the raw buffer. */
  const char *get_tok_end() const
  {
    return m_tok_end;
  }

  /** Get the token end position, in the pre-processed buffer. */
  const char *get_cpp_tok_end() const
  {
    return m_cpp_tok_end;
  }

  /** Get the previous token start position, in the raw buffer. */
  const char *get_tok_start_prev() const
  {
    return m_tok_start_prev;
  }

  /** Get the current stream pointer, in the raw buffer. */
  const char *get_ptr() const
  {
    return m_ptr;
  }

  /** Get the current stream pointer, in the pre-processed buffer. */
  const char *get_cpp_ptr() const
  {
    return m_cpp_ptr;
  }

  /** Get the length of the current token, in the raw buffer. */
  uint32_t yyLength() const
  {
    /*
      The assumption is that the lexical analyser is always 1 character ahead,
      which the -1 accounts for.
    */
    assert(m_ptr > m_tok_start);
    return static_cast<uint32_t>((m_ptr - m_tok_start) - 1);
  }

  /** Get the utf8-body string. */
  const char *get_body_utf8_str() const
  {
    return m_body_utf8;
  }

  /** Get the utf8-body length. */
  uint32_t get_body_utf8_length() const
  {
    return static_cast<uint32_t>(m_body_utf8_ptr - m_body_utf8);
  }

  /* The utf8-body append functions are defined out of line. */
  void body_utf8_append(const char *ptr);
  void body_utf8_append(const char *ptr, const char *end_ptr);
  void body_utf8_append_literal(const lex_string_t *txt,
                                const char *end_ptr);

  /** Current thread. */
  Session *m_session;

  /** Current line number. */
  uint32_t yylineno;

  /** Length of the last token parsed. */
  uint32_t yytoklen;

  /** Interface with bison, value of the last token parsed. */
  LEX_YYSTYPE yylval;

  /** LALR(2) resolution, look ahead token.*/
  int lookahead_token;

  /** LALR(2) resolution, value of the look ahead token.*/
  LEX_YYSTYPE lookahead_yylval;

private:
  /** Pointer to the current position in the raw input stream. */
  const char *m_ptr;

  /** Starting position of the last token parsed, in the raw buffer. */
  const char *m_tok_start;

  /** Ending position of the previous token parsed, in the raw buffer. */
  const char *m_tok_end;

  /** End of the query text in the input stream, in the raw buffer. */
  const char *m_end_of_query;

  /** Starting position of the previous token parsed, in the raw buffer. */
  const char *m_tok_start_prev;

  /** Beginning of the query text in the input stream, in the raw buffer. */
  const char *m_buf;

  /** Length of the raw buffer. */
  uint32_t m_buf_length;

  /** Echo the parsed stream to the pre-processed buffer. */
  bool m_echo;

  /** Pre-processed buffer. */
  char *m_cpp_buf;

  /** Pointer to the current position in the pre-processed input stream. */
  char *m_cpp_ptr;

  /**
    Starting position of the last token parsed,
    in the pre-processed buffer.
  */
  const char *m_cpp_tok_start;

  /**
    Starting position of the previous token parsed,
    in the pre-processed buffer.
  */
  const char *m_cpp_tok_start_prev;

  /**
    Ending position of the previous token parsed,
    in the pre-processed buffer.
  */
  const char *m_cpp_tok_end;

  /** UTF8-body buffer created during parsing. */
  char *m_body_utf8;

  /** Pointer to the current position in the UTF8-body buffer. */
  char *m_body_utf8_ptr;

  /**
    Position in the pre-processed buffer. The query from m_cpp_buf to
    m_cpp_utf8_processed_ptr is converted to UTF8-body.
  */
  const char *m_cpp_utf8_processed_ptr;

public:

  /** Current state of the lexical analyser. */
  enum my_lex_states next_state;

  /** Token character bitmaps, to detect 7bit strings. */
  unsigned char tok_bitmap;

  /** SQL_MODE = IGNORE_SPACE. */
  bool ignore_space;

  /** State of the lexical analyser for comments. */
  enum_comment_state in_comment;

  /**
    Starting position of the TEXT_STRING or IDENT in the pre-processed
    buffer.

    NOTE: this member must be used within base_sql_lex() function only.
  */
  const char *m_cpp_text_start;

  /**
    Ending position of the TEXT_STRING or IDENT in the pre-processed
    buffer.

    NOTE: this member must be used within base_sql_lex() function only.
  */
  const char *m_cpp_text_end;

};

} /* namespace drizzled */


1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	1	/* -- mode: c++; c-basic-offset: 2; indent-tabs-mode: nil; --
	2	* vim:expandtab:shiftwidth=2:tabstop=2:smarttab:
	3	*
1999.6.1 by kalebral at gmail update Copyright strings to a more common format to help with creating the master debian copyright file	4	* Copyright (C) 2008 Sun Microsystems, Inc.
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; version 2 of the License.
	9	*
	10	* This program is distributed in the hope that it will be useful,
	11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	13	* GNU General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
	18	*/
	19
2234 by Brian Aker Mass removal of ifdef/endif in favor of pragma once.	20	#pragma once
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	21
	22	/**
	23	@brief This class represents the character input stream consumed during
	24	lexical analysis.
	25
	26	In addition to consuming the input stream, this class performs some
	27	comment pre processing, by filtering out out of bound special text
	28	from the query input stream.
	29	Two buffers, with pointers inside each buffers, are maintained in
	30	parallel. The 'raw' buffer is the original query text, which may
	31	contain out-of-bound comments. The 'cpp' (for comments pre processor)
	32	is the pre-processed buffer that contains only the query text that
	33	should be seen once out-of-bound data is removed.
	34	*/
	35
2279.2.2 by Olaf van der Spek Prune	36	namespace drizzled {
1280.1.10 by Monty Taylor Put everything in drizzled into drizzled namespace.	37
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	38	class Lex_input_stream
	39	{
	40	public:
	41	Lex_input_stream(Session session, const char buff, unsigned int length);
	42
	43	/**
	44	Set the echo mode.
	45
	46	When echo is true, characters parsed from the raw input stream are
	47	preserved. When false, characters parsed are silently ignored.
	48	@param echo the echo mode.
	49	*/
	50	void set_echo(bool echo)
	51	{
	52	m_echo= echo;
	53	}
	54
	55	/**
	56	Skip binary from the input stream.
	57	@param n number of bytes to accept.
	58	*/
	59	void skip_binary(int n)
	60	{
	61	if (m_echo)
	62	{
	63	memcpy(m_cpp_ptr, m_ptr, n);
	64	m_cpp_ptr += n;
	65	}
	66	m_ptr += n;
	67	}
	68
	69	/**
	70	Get a character, and advance in the stream.
	71	@return the next character to parse.
	72	*/
	73	char yyGet()
	74	{
	75	char c= *m_ptr++;
	76	if (m_echo)
	77	*m_cpp_ptr++ = c;
	78	return c;
	79	}
	80
	81	/**
	82	Get the last character accepted.
	83	@return the last character accepted.
	84	*/
2318.8.7 by Olaf van der Spek Add const	85	char yyGetLast() const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	86	{
	87	return m_ptr[-1];
	88	}
	89
	90	/**
	91	Look at the next character to parse, but do not accept it.
	92	*/
2318.8.7 by Olaf van der Spek Add const	93	char yyPeek() const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	94	{
	95	return m_ptr[0];
	96	}
	97
	98	/**
	99	Look ahead at some character to parse.
	100	@param n offset of the character to look up
	101	*/
2318.8.7 by Olaf van der Spek Add const	102	char yyPeekn(int n) const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	103	{
	104	return m_ptr[n];
	105	}
	106
	107	/**
	108	Cancel the effect of the last yyGet() or yySkip().
	109	Note that the echo mode should not change between calls to yyGet / yySkip
	110	and yyUnget. The caller is responsible for ensuring that.
	111	*/
	112	void yyUnget()
	113	{
	114	m_ptr--;
	115	if (m_echo)
	116	m_cpp_ptr--;
	117	}
	118
	119	/**
	120	Accept a character, by advancing the input stream.
	121	*/
	122	void yySkip()
	123	{
	124	if (m_echo)
	125	m_cpp_ptr++ = m_ptr++;
	126	else
	127	m_ptr++;
	128	}
	129
	130	/**
	131	Accept multiple characters at once.
	132	@param n the number of characters to accept.
	133	*/
	134	void yySkipn(int n)
	135	{
	136	if (m_echo)
	137	{
	138	memcpy(m_cpp_ptr, m_ptr, n);
	139	m_cpp_ptr += n;
	140	}
	141	m_ptr += n;
	142	}
	143
	144	/**
	145	End of file indicator for the query text to parse.
	146	@return true if there are no more characters to parse
	147	*/
	148	bool eof()
	149	{
	150	return (m_ptr >= m_end_of_query);
	151	}
	152
	153	/**
	154	End of file indicator for the query text to parse.
	155	@param n number of characters expected
	156	@return true if there are less than n characters to parse
	157	*/
	158	bool eof(int n)
	159	{
	160	return ((m_ptr + n) >= m_end_of_query);
	161	}
	162
	163	/** Get the raw query buffer. */
	164	const char *get_buf()
	165	{
	166	return m_buf;
167	}
168
169	/** Get the pre-processed query buffer. */
170	const char *get_cpp_buf()
171	{
172	return m_cpp_buf;
173	}
174
175	/** Get the end of the raw query buffer. */
176	const char *get_end_of_query()
177	{
178	return m_end_of_query;
179	}
180
181	/** Mark the stream position as the start of a new token. */
182	void start_token()
183	{
184	m_tok_start_prev= m_tok_start;
185	m_tok_start= m_ptr;
186	m_tok_end= m_ptr;
187
188	m_cpp_tok_start_prev= m_cpp_tok_start;
189	m_cpp_tok_start= m_cpp_ptr;
190	m_cpp_tok_end= m_cpp_ptr;
191	}
192
193	/**
194	Adjust the starting position of the current token.
195	This is used to compensate for starting whitespace.
196	*/
197	void restart_token()
198	{
199	m_tok_start= m_ptr;
200	m_cpp_tok_start= m_cpp_ptr;
201	}
202
203	/** Get the token start position, in the raw buffer. */
2318.8.7 by Olaf van der Spek Add const	204	const char *get_tok_start() const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	205	{
	206	return m_tok_start;
	207	}
	208
	209	/** Get the token start position, in the pre-processed buffer. */
2318.8.7 by Olaf van der Spek Add const	210	const char *get_cpp_tok_start() const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	211	{
	212	return m_cpp_tok_start;
	213	}
	214
	215	/** Get the token end position, in the raw buffer. */
2318.8.7 by Olaf van der Spek Add const	216	const char *get_tok_end() const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	217	{
	218	return m_tok_end;
	219	}
	220
	221	/** Get the token end position, in the pre-processed buffer. */
2318.8.7 by Olaf van der Spek Add const	222	const char *get_cpp_tok_end() const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	223	{
	224	return m_cpp_tok_end;
	225	}
	226
	227	/** Get the previous token start position, in the raw buffer. */
2318.8.7 by Olaf van der Spek Add const	228	const char *get_tok_start_prev() const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	229	{
	230	return m_tok_start_prev;
	231	}
	232
	233	/** Get the current stream pointer, in the raw buffer. */
2318.8.7 by Olaf van der Spek Add const	234	const char *get_ptr() const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	235	{
	236	return m_ptr;
	237	}
	238
	239	/** Get the current stream pointer, in the pre-processed buffer. */
2318.8.7 by Olaf van der Spek Add const	240	const char *get_cpp_ptr() const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	241	{
	242	return m_cpp_ptr;
	243	}
	244
	245	/** Get the length of the current token, in the raw buffer. */
2318.8.7 by Olaf van der Spek Add const	246	uint32_t yyLength() const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	247	{
	248	/*
	249	The assumption is that the lexical analyser is always 1 character ahead,
	250	which the -1 account for.
	251	*/
	252	assert(m_ptr > m_tok_start);
	253	return (uint32_t) ((m_ptr - m_tok_start) - 1);
	254	}
	255
	256	/** Get the utf8-body string. */
2318.8.7 by Olaf van der Spek Add const	257	const char *get_body_utf8_str() const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	258	{
	259	return m_body_utf8;
	260	}
	261
	262	/** Get the utf8-body length. */
2318.8.7 by Olaf van der Spek Add const	263	uint32_t get_body_utf8_length() const
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	264	{
	265	return m_body_utf8_ptr - m_body_utf8;
	266	}
	267
	268	void body_utf8_append(const char *ptr);
	269	void body_utf8_append(const char ptr, const char end_ptr);
2371.1.2 by Brian Aker Remove the typedef on lexkey	270	void body_utf8_append_literal(const lex_string_t *txt,
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	271	const char *end_ptr);
	272
	273	/** Current thread. */
	274	Session *m_session;
	275
	276	/** Current line number. */
	277	uint32_t yylineno;
	278
	279	/** Length of the last token parsed. */
	280	uint32_t yytoklen;
	281
	282	/** Interface with bison, value of the last token parsed. */
	283	LEX_YYSTYPE yylval;
	284
	285	/** LALR(2) resolution, look ahead token.*/
	286	int lookahead_token;
	287
	288	/** LALR(2) resolution, value of the look ahead token.*/
	289	LEX_YYSTYPE lookahead_yylval;
	290
	291	private:
	292	/** Pointer to the current position in the raw input stream. */
	293	const char *m_ptr;
	294
	295	/** Starting position of the last token parsed, in the raw buffer. */
	296	const char *m_tok_start;
	297
	298	/** Ending position of the previous token parsed, in the raw buffer. */
	299	const char *m_tok_end;
	300
	301	/** End of the query text in the input stream, in the raw buffer. */
	302	const char *m_end_of_query;
	303
	304	/** Starting position of the previous token parsed, in the raw buffer. */
	305	const char *m_tok_start_prev;
	306
	307	/** Begining of the query text in the input stream, in the raw buffer. */
	308	const char *m_buf;
	309
	310	/** Length of the raw buffer. */
	311	uint32_t m_buf_length;
	312
	313	/** Echo the parsed stream to the pre-processed buffer. */
	314	bool m_echo;
	315
	316	/** Pre-processed buffer. */
	317	char *m_cpp_buf;
	318
	319	/** Pointer to the current position in the pre-processed input stream. */
	320	char *m_cpp_ptr;
	321
	322	/**
	323	Starting position of the last token parsed,
	324	in the pre-processed buffer.
	325	*/
	326	const char *m_cpp_tok_start;
	327
	328	/**
	329	Starting position of the previous token parsed,
	330	in the pre-procedded buffer.
	331	*/
	332	const char *m_cpp_tok_start_prev;
	333
	334	/**
335	Ending position of the previous token parsed,
336	in the pre-processed buffer.
337	*/
338	const char *m_cpp_tok_end;
339
340	/** UTF8-body buffer created during parsing. */
341	char *m_body_utf8;
342
343	/** Pointer to the current position in the UTF8-body buffer. */
344	char *m_body_utf8_ptr;
345
346	/**
347	Position in the pre-processed buffer. The query from m_cpp_buf to
348	m_cpp_utf_processed_ptr is converted to UTF8-body.
349	*/
350	const char *m_cpp_utf8_processed_ptr;
351
352	public:
353
354	/** Current state of the lexical analyser. */
355	enum my_lex_states next_state;
356
357	/** Token character bitmaps, to detect 7bit strings. */
358	unsigned char tok_bitmap;
359
360	/** SQL_MODE = IGNORE_SPACE. */
361	bool ignore_space;
362
363	/** State of the lexical analyser for comments. */
364	enum_comment_state in_comment;
365
366	/**
367	Starting position of the TEXT_STRING or IDENT in the pre-processed
368	buffer.
369
2172.3.6 by Brian Aker Namespace the parser just a bit, and update our call for the type of parser	370	NOTE: this member must be used within base_sql_lex() function only.
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	371	*/
	372	const char *m_cpp_text_start;
	373
	374	/**
	375	Ending position of the TEXT_STRING or IDENT in the pre-processed
	376	buffer.
	377
2172.3.6 by Brian Aker Namespace the parser just a bit, and update our call for the type of parser	378	NOTE: this member must be used within base_sql_lex() function only.
1014.4.8 by Jay Pipes Split Lex_input_stream out into its own header.	379	*/
	380	const char *m_cpp_text_end;
	381
	382	};
1280.1.10 by Monty Taylor Put everything in drizzled into drizzled namespace.	383
	384	} /* namespace drizzled */
	385