/* Copyright (C) 2005 MySQL AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

/* Include guard: opens here, closed by the #endif at the end of the file. */
#ifndef _my_plugin_ftparser_h
#define _my_plugin_ftparser_h

/*************************************************************************
  API for Full-text parser plugin. (MYSQL_FTPARSER_PLUGIN)
*/

/*
  Version of the full-text parser plugin interface described by this
  header.  This is the value a parser descriptor is expected to carry in
  st_mysql_ftparser::interface_version.
*/
#define MYSQL_FTPARSER_INTERFACE_VERSION 0x0100

/* Parsing modes. Set in MYSQL_FTPARSER_PARAM::mode */
enum enum_ftparser_mode
{
  /*
    Fast and simple mode.  This mode is used for indexing, and natural
    language queries.

    The parser is expected to return only those words that go into the
    index. Stopwords or too short/long words should not be returned. The
    'boolean_info' argument of mysql_add_word() does not have to be set.
  */
  MYSQL_FTPARSER_SIMPLE_MODE= 0,

  /*
    Parse with stopwords mode.  This mode is used in boolean searches for
    "phrase matching."

    The parser is not allowed to ignore words in this mode.  Every word
    should be returned, including stopwords and words that are too short
    or long.  The 'boolean_info' argument of mysql_add_word() does not
    have to be set.
  */
  MYSQL_FTPARSER_WITH_STOPWORDS= 1,

  /*
    Parse in boolean mode.  This mode is used to parse a boolean query string.

    The parser should provide a valid MYSQL_FTPARSER_BOOLEAN_INFO
    structure in the 'boolean_info' argument to mysql_add_word().
    Usually that means that the parser should recognize boolean operators
    in the parsing stream and set appropriate fields in
    MYSQL_FTPARSER_BOOLEAN_INFO structure accordingly.  As for
    MYSQL_FTPARSER_WITH_STOPWORDS mode, no word should be ignored.
    Instead, use FT_TOKEN_STOPWORD for the token type of such a word.
  */
  MYSQL_FTPARSER_FULL_BOOLEAN_INFO= 2
};

/*
  Token types for boolean mode searching (used for the type member of
  MYSQL_FTPARSER_BOOLEAN_INFO struct)

  FT_TOKEN_EOF: End of data.
  FT_TOKEN_WORD: Regular word.
  FT_TOKEN_LEFT_PAREN: Left parenthesis (start of group/sub-expression).
  FT_TOKEN_RIGHT_PAREN: Right parenthesis (end of group/sub-expression).
  FT_TOKEN_STOPWORD: Stopword.
*/

enum enum_ft_token_type
{
  FT_TOKEN_EOF= 0,
  FT_TOKEN_WORD= 1,
  FT_TOKEN_LEFT_PAREN= 2,
  FT_TOKEN_RIGHT_PAREN= 3,
  FT_TOKEN_STOPWORD= 4
};

/*
|
|
84 |
This structure is used in boolean search mode only. It conveys
|
|
85 |
boolean-mode metadata to the MySQL search engine for every word in
|
|
86 |
the search query. A valid instance of this structure must be filled
|
|
87 |
in by the plugin parser and passed as an argument in the call to
|
|
88 |
mysql_add_word (the callback function in the MYSQL_FTPARSER_PARAM
|
|
89 |
structure) when a query is parsed in boolean mode.
|
|
90 |
||
91 |
type: The token type. Should be one of the enum_ft_token_type values.
|
|
92 |
||
93 |
yesno: Whether the word must be present for a match to occur:
|
|
94 |
>0 Must be present
|
|
95 |
<0 Must not be present
|
|
96 |
0 Neither; the word is optional but its presence increases the relevance
|
|
97 |
With the default settings of the ft_boolean_syntax system variable,
|
|
98 |
>0 corresponds to the '+' operator, <0 corrresponds to the '-' operator,
|
|
99 |
and 0 means neither operator was used.
|
|
100 |
||
101 |
weight_adjust: A weighting factor that determines how much a match
|
|
102 |
for the word counts. Positive values increase, negative - decrease the
|
|
103 |
relative word's importance in the query.
|
|
104 |
||
105 |
wasign: The sign of the word's weight in the query. If it's non-negative
|
|
106 |
the match for the word will increase document relevance, if it's
|
|
107 |
negative - decrease (the word becomes a "noise word", the less of it the
|
|
108 |
better).
|
|
109 |
||
110 |
trunc: Corresponds to the '*' operator in the default setting of the
|
|
111 |
ft_boolean_syntax system variable.
|
|
112 |
*/
|
|
113 |
||
114 |
typedef struct st_mysql_ftparser_boolean_info |
|
115 |
{
|
|
116 |
enum enum_ft_token_type type; |
|
117 |
int yesno; |
|
118 |
int weight_adjust; |
|
119 |
char wasign; |
|
120 |
char trunc; |
|
121 |
/* These are parser state and must be removed. */
|
|
122 |
char prev; |
|
123 |
char *quot; |
|
124 |
} MYSQL_FTPARSER_BOOLEAN_INFO; |
|
125 |
||
126 |
/*
  The following flag means that buffer with a string (document, word)
  may be overwritten by the caller before the end of the parsing (that is
  before st_mysql_ftparser::deinit() call).  If the string needs to
  survive between two successive calls of the parsing function, the
  callee must save a copy of it.  The flag may be set by MySQL before
  calling st_mysql_ftparser::parse(), or it may be set by a plugin before
  calling st_mysql_ftparser_param::mysql_parse() or
  st_mysql_ftparser_param::mysql_add_word().
*/
#define MYSQL_FTFLAGS_NEED_COPY 1

/*
|
|
139 |
An argument of the full-text parser plugin. This structure is
|
|
140 |
filled in by MySQL server and passed to the parsing function of the
|
|
141 |
plugin as an in/out parameter.
|
|
142 |
||
143 |
mysql_parse: A pointer to the built-in parser implementation of the
|
|
144 |
server. It's set by the server and can be used by the parser plugin
|
|
145 |
to invoke the MySQL default parser. If plugin's role is to extract
|
|
146 |
textual data from .doc, .pdf or .xml content, it might extract
|
|
147 |
plaintext from the content, and then pass the text to the default
|
|
148 |
MySQL parser to be parsed.
|
|
149 |
||
150 |
mysql_add_word: A server callback to add a new word. When parsing
|
|
151 |
a document, the server sets this to point at a function that adds
|
|
152 |
the word to MySQL full-text index. When parsing a search query,
|
|
153 |
this function will add the new word to the list of words to search
|
|
154 |
for. The boolean_info argument can be NULL for all cases except
|
|
155 |
when mode is MYSQL_FTPARSER_FULL_BOOLEAN_INFO.
|
|
156 |
||
157 |
ftparser_state: A generic pointer. The plugin can set it to point
|
|
158 |
to information to be used internally for its own purposes.
|
|
159 |
||
160 |
mysql_ftparam: This is set by the server. It is used by MySQL functions
|
|
161 |
called via mysql_parse() and mysql_add_word() callback. The plugin
|
|
162 |
should not modify it.
|
|
163 |
||
164 |
cs: Information about the character set of the document or query string.
|
|
165 |
||
166 |
doc: A pointer to the document or query string to be parsed.
|
|
167 |
||
168 |
length: Length of the document or query string, in bytes.
|
|
169 |
||
170 |
flags: See MYSQL_FTFLAGS_* constants above.
|
|
171 |
||
172 |
mode: The parsing mode. With boolean operators, with stopwords, or
|
|
173 |
nothing. See enum_ftparser_mode above.
|
|
174 |
*/
|
|
175 |
||
176 |
typedef struct st_mysql_ftparser_param |
|
177 |
{
|
|
178 |
int (*mysql_parse)(struct st_mysql_ftparser_param *, |
|
179 |
char *doc, int doc_len); |
|
180 |
int (*mysql_add_word)(struct st_mysql_ftparser_param *, |
|
181 |
char *word, int word_len, |
|
182 |
MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info); |
|
183 |
void *ftparser_state; |
|
184 |
void *mysql_ftparam; |
|
185 |
struct charset_info_st *cs; |
|
186 |
char *doc; |
|
187 |
int length; |
|
188 |
int flags; |
|
189 |
enum enum_ftparser_mode mode; |
|
190 |
} MYSQL_FTPARSER_PARAM; |
|
191 |
||
192 |
/*
|
|
193 |
Full-text parser descriptor.
|
|
194 |
||
195 |
interface_version is, e.g., MYSQL_FTPARSER_INTERFACE_VERSION.
|
|
196 |
The parsing, initialization, and deinitialization functions are
|
|
197 |
invoked per SQL statement for which the parser is used.
|
|
198 |
*/
|
|
199 |
||
200 |
struct st_mysql_ftparser |
|
201 |
{
|
|
202 |
int interface_version; |
|
203 |
int (*parse)(MYSQL_FTPARSER_PARAM *param); |
|
204 |
int (*init)(MYSQL_FTPARSER_PARAM *param); |
|
205 |
int (*deinit)(MYSQL_FTPARSER_PARAM *param); |
|
206 |
};
|
|
207 |
||
208 |
||
209 |
#endif /* _my_plugin_ftparser_h */