5
* @version $Id: fulltext_native.php,v 1.60 2007/10/05 14:36:33 acydburn Exp $
6
* @copyright (c) 2005 phpBB Group
7
* @license http://opensource.org/licenses/gpl-license.php GNU Public License
14
if (!defined('IN_PHPBB'))
22
include_once($phpbb_root_path . 'includes/search/search.' . $phpEx);
26
* phpBB's own db driven fulltext search, version 2
29
class fulltext_native extends search_backend
32
var $word_length = array();
34
var $common_words = array();
36
var $must_contain_ids = array();
37
var $must_not_contain_ids = array();
38
var $must_exclude_one_ids = array();
41
* Initialises the fulltext_native search backend with min/max word length and makes sure the UTF-8 normalizer is loaded.
43
* @param boolean|string &$error is passed by reference and should either be set to false on success or an error message on failure.
47
function fulltext_native(&$error)
49
global $phpbb_root_path, $phpEx, $config;
51
$this->word_length = array('min' => $config['fulltext_native_min_chars'], 'max' => $config['fulltext_native_max_chars']);
56
if (!class_exists('utf_normalizer'))
58
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
66
* This function fills $this->search_query with the cleaned user search query.
68
* If $terms is 'any' then the words will be extracted from the search query
69
* and combined with | inside brackets. They will afterwards be treated like
70
* a standard search query.
72
* Then it analyses the query and fills the internal arrays $must_not_contain_ids,
73
* $must_contain_ids and $must_exclude_one_ids which are later used by keyword_search().
75
* @param string $keywords contains the search query string as entered by the user
76
* @param string $terms is either 'all' (use search query as entered, default words to 'must be contained in post')
77
* or 'any' (find all posts containing at least one of the given words)
78
* @return boolean false if no valid keywords were found and otherwise true
82
function split_keywords($keywords, $terms)
86
$keywords = trim($this->cleanup($keywords, '+-|()*'));
88
// allow word|word|word without brackets
89
if ((strpos($keywords, ' ') === false) && (strpos($keywords, '|') !== false) && (strpos($keywords, '(') === false))
91
$keywords = '(' . $keywords . ')';
94
$open_bracket = $space = false;
95
for ($i = 0, $n = strlen($keywords); $i < $n; $i++)
97
if ($open_bracket !== false)
99
switch ($keywords[$i])
102
if ($open_bracket + 1 == $i)
104
$keywords[$i - 1] = '|';
107
$open_bracket = false;
121
switch ($keywords[$i])
135
$space = $keywords[$i];
138
if ($space !== false)
140
$keywords[$i] = $space;
157
'#(\+|\-)(?:\+|\-)+#',
169
$keywords = preg_replace($match, $replace, $keywords);
171
// $keywords input format: each word separated by a space, words in a bracket are not separated
173
// the user wants to search for any word, convert the search query
178
preg_match_all('#([^\\s+\\-|()]+)(?:$|[\\s+\\-|()])#u', $keywords, $words);
179
if (sizeof($words[1]))
181
$keywords = '(' . implode('|', $words[1]) . ')';
185
// set the search_query which is shown to the user
186
$this->search_query = $keywords;
188
$exact_words = array();
189
preg_match_all('#([^\\s+\\-|*()]+)(?:$|[\\s+\\-|()])#u', $keywords, $exact_words);
190
$exact_words = $exact_words[1];
192
$common_ids = $words = array();
194
if (sizeof($exact_words))
196
$sql = 'SELECT word_id, word_text, word_common
197
FROM ' . SEARCH_WORDLIST_TABLE . '
198
WHERE ' . $db->sql_in_set('word_text', $exact_words);
199
$result = $db->sql_query($sql);
201
// store an array of words and ids, remove common words
202
while ($row = $db->sql_fetchrow($result))
204
if ($row['word_common'])
206
$this->common_words[] = $row['word_text'];
207
$common_ids[$row['word_text']] = (int) $row['word_id'];
211
$words[$row['word_text']] = (int) $row['word_id'];
213
$db->sql_freeresult($result);
217
// now analyse the search query, first split it using the spaces
218
$query = explode(' ', $keywords);
220
$this->must_contain_ids = array();
221
$this->must_not_contain_ids = array();
222
$this->must_exclude_one_ids = array();
225
$ignore_no_id = true;
227
foreach ($query as $word)
234
// words which should not be included
237
$word = substr($word, 1);
239
// a group of which at least one may not be in the resulting posts
242
$word = array_unique(explode('|', substr($word, 1, -1)));
243
$mode = 'must_exclude_one';
245
// one word which should not be in the resulting posts
248
$mode = 'must_not_contain';
250
$ignore_no_id = true;
252
// words which have to be included
255
// no prefix is the same as a +prefix
258
$word = substr($word, 1);
261
// a group of words of which at least one word should be in every resulting post
264
$word = array_unique(explode('|', substr($word, 1, -1)));
266
$ignore_no_id = false;
267
$mode = 'must_contain';
275
// if this is an array of words then retrieve an id for each
278
$non_common_words = array();
280
foreach ($word as $i => $word_part)
282
if (strpos($word_part, '*') !== false)
284
$id_words[] = '\'' . $db->sql_escape(str_replace('*', '%', $word_part)) . '\'';
285
$non_common_words[] = $word_part;
287
else if (isset($words[$word_part]))
289
$id_words[] = $words[$word_part];
290
$non_common_words[] = $word_part;
294
$len = utf8_strlen($word_part);
295
if ($len < $this->word_length['min'] || $len > $this->word_length['max'])
297
$this->common_words[] = $word_part;
301
if (sizeof($id_words))
304
if (sizeof($id_words) > 1)
306
$this->{$mode . '_ids'}[] = $id_words;
310
$mode = ($mode == 'must_exclude_one') ? 'must_not_contain' : $mode;
311
$this->{$mode . '_ids'}[] = $id_words[0];
314
// throw an error if we shall not ignore nonexistent words
315
else if (!$ignore_no_id && sizeof($non_common_words))
317
trigger_error(sprintf($user->lang['WORDS_IN_NO_POST'], implode(', ', $non_common_words)));
319
unset($non_common_words);
321
// else we only need one id
322
else if (($wildcard = strpos($word, '*') !== false) || isset($words[$word]))
326
$len = utf8_strlen(str_replace('*', '', $word));
327
if ($len >= $this->word_length['min'] && $len <= $this->word_length['max'])
329
$this->{$mode . '_ids'}[] = '\'' . $db->sql_escape(str_replace('*', '%', $word)) . '\'';
333
$this->common_words[] = $word;
338
$this->{$mode . '_ids'}[] = $words[$word];
341
// throw an error if we shall not ignore nonexistent words
342
else if (!$ignore_no_id)
344
if (!isset($common_ids[$word]))
346
$len = utf8_strlen($word);
347
if ($len >= $this->word_length['min'] && $len <= $this->word_length['max'])
349
trigger_error(sprintf($user->lang['WORD_IN_NO_POST'], $word));
353
$this->common_words[] = $word;
359
$len = utf8_strlen($word);
360
if ($len < $this->word_length['min'] || $len > $this->word_length['max'])
362
$this->common_words[] = $word;
367
// we can't search for negatives only
368
if (!sizeof($this->must_contain_ids))
373
sort($this->must_contain_ids);
374
sort($this->must_not_contain_ids);
375
sort($this->must_exclude_one_ids);
377
if (!empty($this->search_query))
385
* Performs a search on keywords depending on display specific params. You have to run split_keywords() first.
387
* @param string $type contains either posts or topics depending on what should be searched for
388
* @param string &$fields contains either titleonly (topic titles should be searched), msgonly (only message bodies should be searched), firstpost (only subject and body of the first post should be searched) or all (all post bodies and subjects should be searched)
389
* @param string &$terms is either 'all' (use query as entered, words without prefix should default to "have to be in field") or 'any' (ignore search query parts and just return all posts that contain any of the specified words)
390
* @param array &$sort_by_sql contains SQL code for the ORDER BY part of a query
391
* @param string &$sort_key is the key of $sort_by_sql for the selected sorting
392
* @param string &$sort_dir is either a or d representing ASC and DESC
393
* @param string &$sort_days specifies the maximum amount of days a post may be old
394
* @param array &$ex_fid_ary specifies an array of forum ids which should not be searched
395
* @param array &$m_approve_fid_ary specifies an array of forum ids in which the searcher is allowed to view unapproved posts
396
* @param int &$topic_id is set to 0 or a topic id, if it is not 0 then only posts in this topic should be searched
397
* @param array &$author_ary an array of author ids; if the author should be ignored during the search the array is empty
398
* @param array &$id_ary passed by reference, to be filled with ids for the page specified by $start and $per_page, should be ordered
399
* @param int $start indicates the first index of the page
400
* @param int $per_page number of ids each page is supposed to contain
401
* @return boolean|int total number of results
405
function keyword_search($type, &$fields, &$terms, &$sort_by_sql, &$sort_key, &$sort_dir, &$sort_days, &$ex_fid_ary, &$m_approve_fid_ary, &$topic_id, &$author_ary, &$id_ary, $start, $per_page)
409
// No keywords? No posts.
410
if (empty($this->search_query))
415
// generate a search_key from all the options to identify the results
416
$search_key = md5(implode('#', array(
417
serialize($this->must_contain_ids),
418
serialize($this->must_not_contain_ids),
419
serialize($this->must_exclude_one_ids),
426
implode(',', $ex_fid_ary),
427
implode(',', $m_approve_fid_ary),
428
implode(',', $author_ary)
431
// try reading the results from cache
433
if ($this->obtain_ids($search_key, $total_results, $id_ary, $start, $per_page, $sort_dir) == SEARCH_RESULT_IN_CACHE)
435
return $total_results;
440
$sql_where = array();
446
'SELECT' => ($type == 'posts') ? 'p.post_id' : 'p.topic_id',
448
SEARCH_WORDMATCH_TABLE => array(),
449
SEARCH_WORDLIST_TABLE => array(),
452
'LEFT_JOIN' => array()
454
$sql_where[] = 'm0.post_id = p.post_id';
458
// Build some display specific sql strings
462
$title_match = 'title_match = 1';
466
$sql_array['FROM'][TOPICS_TABLE] = 't';
467
$sql_where[] = 'p.post_id = t.topic_first_post_id';
471
$title_match = 'title_match = 0';
476
if ($type == 'topics')
478
if (!isset($sql_array['FROM'][TOPICS_TABLE]))
480
$sql_array['FROM'][TOPICS_TABLE] = 't';
481
$sql_where[] = 'p.topic_id = t.topic_id';
487
* @todo Add a query optimizer (handle stuff like "+(4|3) +4")
490
foreach ($this->must_contain_ids as $subquery)
492
if (is_array($subquery))
496
$word_id_sql = array();
498
foreach ($subquery as $id)
502
$sql_array['LEFT_JOIN'][] = array(
503
'FROM' => array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
504
'ON' => "w$w_num.word_text LIKE $id"
506
$word_ids[] = "w$w_num.word_id";
516
$sql_where[] = $db->sql_in_set("m$m_num.word_id", $word_ids);
521
else if (is_string($subquery))
523
$sql_array['FROM'][SEARCH_WORDLIST_TABLE][] = 'w' . $w_num;
525
$sql_where[] = "w$w_num.word_text LIKE $subquery";
526
$sql_where[] = "m$m_num.word_id = w$w_num.word_id";
533
$sql_where[] = "m$m_num.word_id = $subquery";
536
$sql_array['FROM'][SEARCH_WORDMATCH_TABLE][] = 'm' . $m_num;
540
$sql_where[] = "m$m_num.$title_match";
545
$sql_where[] = "m$m_num.post_id = m0.post_id";
550
foreach ($this->must_not_contain_ids as $key => $subquery)
552
if (is_string($subquery))
554
$sql_array['LEFT_JOIN'][] = array(
555
'FROM' => array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
556
'ON' => "w$w_num.word_text LIKE $subquery"
559
$this->must_not_contain_ids[$key] = "w$w_num.word_id";
566
if (sizeof($this->must_not_contain_ids))
568
$sql_array['LEFT_JOIN'][] = array(
569
'FROM' => array(SEARCH_WORDMATCH_TABLE => 'm' . $m_num),
570
'ON' => $db->sql_in_set("m$m_num.word_id", $this->must_not_contain_ids) . (($title_match) ? " AND m$m_num.$title_match" : '') . " AND m$m_num.post_id = m0.post_id"
573
$sql_where[] = "m$m_num.word_id IS NULL";
577
foreach ($this->must_exclude_one_ids as $ids)
579
$is_null_joins = array();
580
foreach ($ids as $id)
584
$sql_array['LEFT_JOIN'][] = array(
585
'FROM' => array(SEARCH_WORDLIST_TABLE => 'w' . $w_num),
586
'ON' => "w$w_num.word_text LIKE $id"
588
$id = "w$w_num.word_id";
594
$sql_array['LEFT_JOIN'][] = array(
595
'FROM' => array(SEARCH_WORDMATCH_TABLE => 'm' . $m_num),
596
'ON' => "m$m_num.word_id = $id AND m$m_num.post_id = m0.post_id" . (($title_match) ? " AND m$m_num.$title_match" : '')
598
$is_null_joins[] = "m$m_num.word_id IS NULL";
602
$sql_where[] = '(' . implode(' OR ', $is_null_joins) . ')';
605
if (!sizeof($m_approve_fid_ary))
607
$sql_where[] = 'p.post_approved = 1';
609
else if ($m_approve_fid_ary !== array(-1))
611
$sql_where[] = '(p.post_approved = 1 OR ' . $db->sql_in_set('p.forum_id', $m_approve_fid_ary, true) . ')';
616
$sql_where[] = 'p.topic_id = ' . $topic_id;
619
if (sizeof($author_ary))
621
$sql_where[] = $db->sql_in_set('p.poster_id', $author_ary);
624
if (sizeof($ex_fid_ary))
626
$sql_where[] = $db->sql_in_set('p.forum_id', $ex_fid_ary, true);
631
$sql_where[] = 'p.post_time >= ' . (time() - ($sort_days * 86400));
634
$sql_array['WHERE'] = implode(' AND ', $sql_where);
637
// if the total result count is not cached yet, retrieve it from the db
641
$sql_array_count = $sql_array;
643
switch ($db->sql_layer)
648
// MySQL 3.x does not support SQL_CALC_FOUND_ROWS
649
$sql_array['SELECT'] = 'SQL_CALC_FOUND_ROWS ' . $sql_array['SELECT'];
655
$sql_array_count['SELECT'] = ($type == 'posts') ? 'DISTINCT p.post_id' : 'DISTINCT p.topic_id';
656
$sql = 'SELECT COUNT(' . (($type == 'posts') ? 'post_id' : 'topic_id') . ') as total_results
657
FROM (' . $db->sql_build_query('SELECT', $sql_array_count) . ')';
662
$sql_array_count['SELECT'] = ($type == 'posts') ? 'COUNT(DISTINCT p.post_id) AS total_results' : 'COUNT(DISTINCT p.topic_id) AS total_results';
663
$sql = (!$sql) ? $db->sql_build_query('SELECT', $sql_array_count) : $sql;
665
$result = $db->sql_query($sql);
666
$total_results = (int) $db->sql_fetchfield('total_results');
667
$db->sql_freeresult($result);
676
unset($sql_array_count, $sql);
679
// Build sql strings for sorting
680
$sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC');
682
switch ($sql_sort[0])
685
$sql_array['FROM'][USERS_TABLE] = 'u';
686
$sql_where[] = 'u.user_id = p.poster_id ';
690
if (!isset($sql_array['FROM'][TOPICS_TABLE]))
692
$sql_array['FROM'][TOPICS_TABLE] = 't';
693
$sql_where[] = 'p.topic_id = t.topic_id';
698
$sql_array['FROM'][FORUMS_TABLE] = 'f';
699
$sql_where[] = 'f.forum_id = p.forum_id';
703
$sql_array['WHERE'] = implode(' AND ', $sql_where);
704
$sql_array['GROUP_BY'] = ($group_by) ? (($type == 'posts') ? 'p.post_id' : 'p.topic_id') . ', ' . $sort_by_sql[$sort_key] : '';
705
$sql_array['ORDER_BY'] = $sql_sort;
707
unset($sql_where, $sql_sort, $group_by);
709
$sql = $db->sql_build_query('SELECT', $sql_array);
710
$result = $db->sql_query_limit($sql, $config['search_block_size'], $start);
712
while ($row = $db->sql_fetchrow($result))
714
$id_ary[] = $row[(($type == 'posts') ? 'post_id' : 'topic_id')];
716
$db->sql_freeresult($result);
718
if (!sizeof($id_ary))
723
// if we use mysql and the total result count is not cached yet, retrieve it from the db
724
if (!$total_results && $is_mysql)
726
$sql = 'SELECT FOUND_ROWS() as total_results';
727
$result = $db->sql_query($sql);
728
$total_results = (int) $db->sql_fetchfield('total_results');
729
$db->sql_freeresult($result);
737
// store the ids, from start on then delete anything that isn't on the current page because we only need ids for one page
738
$this->save_ids($search_key, $this->search_query, $author_ary, $total_results, $id_ary, $start, $sort_dir);
739
$id_ary = array_slice($id_ary, 0, (int) $per_page);
741
return $total_results;
745
* Performs a search on an author's posts without caring about message contents. Depends on display specific params
747
* @param string $type contains either posts or topics depending on what should be searched for
748
* @param boolean $firstpost_only if true, only topic starting posts will be considered
749
* @param array &$sort_by_sql contains SQL code for the ORDER BY part of a query
750
* @param string &$sort_key is the key of $sort_by_sql for the selected sorting
751
* @param string &$sort_dir is either a or d representing ASC and DESC
752
* @param string &$sort_days specifies the maximum amount of days a post may be old
753
* @param array &$ex_fid_ary specifies an array of forum ids which should not be searched
754
* @param array &$m_approve_fid_ary specifies an array of forum ids in which the searcher is allowed to view unapproved posts
755
* @param int &$topic_id is set to 0 or a topic id, if it is not 0 then only posts in this topic should be searched
756
* @param array &$author_ary an array of author ids
757
* @param array &$id_ary passed by reference, to be filled with ids for the page specified by $start and $per_page, should be ordered
758
* @param int $start indicates the first index of the page
759
* @param int $per_page number of ids each page is supposed to contain
760
* @return boolean|int total number of results
764
function author_search($type, $firstpost_only, &$sort_by_sql, &$sort_key, &$sort_dir, &$sort_days, &$ex_fid_ary, &$m_approve_fid_ary, &$topic_id, &$author_ary, &$id_ary, $start, $per_page)
768
// No author? No posts.
769
if (!sizeof($author_ary))
774
// generate a search_key from all the options to identify the results
775
$search_key = md5(implode('#', array(
778
($firstpost_only) ? 'firstpost' : '',
784
implode(',', $ex_fid_ary),
785
implode(',', $m_approve_fid_ary),
786
implode(',', $author_ary)
789
// try reading the results from cache
791
if ($this->obtain_ids($search_key, $total_results, $id_ary, $start, $per_page, $sort_dir) == SEARCH_RESULT_IN_CACHE)
793
return $total_results;
798
// Create some display specific sql strings
799
$sql_author = $db->sql_in_set('p.poster_id', $author_ary);
800
$sql_fora = (sizeof($ex_fid_ary)) ? ' AND ' . $db->sql_in_set('p.forum_id', $ex_fid_ary, true) : '';
801
$sql_time = ($sort_days) ? ' AND p.post_time >= ' . (time() - ($sort_days * 86400)) : '';
802
$sql_topic_id = ($topic_id) ? ' AND p.topic_id = ' . (int) $topic_id : '';
803
$sql_firstpost = ($firstpost_only) ? ' AND p.post_id = t.topic_first_post_id' : '';
805
// Build sql strings for sorting
806
$sql_sort = $sort_by_sql[$sort_key] . (($sort_dir == 'a') ? ' ASC' : ' DESC');
807
$sql_sort_table = $sql_sort_join = '';
808
switch ($sql_sort[0])
811
$sql_sort_table = USERS_TABLE . ' u, ';
812
$sql_sort_join = ' AND u.user_id = p.poster_id ';
816
$sql_sort_table = ($type == 'posts') ? TOPICS_TABLE . ' t, ' : '';
817
$sql_sort_join = ($type == 'posts') ? ' AND t.topic_id = p.topic_id ' : '';
821
$sql_sort_table = FORUMS_TABLE . ' f, ';
822
$sql_sort_join = ' AND f.forum_id = p.forum_id ';
826
if (!sizeof($m_approve_fid_ary))
828
$m_approve_fid_sql = ' AND p.post_approved = 1';
830
else if ($m_approve_fid_ary == array(-1))
832
$m_approve_fid_sql = '';
836
$m_approve_fid_sql = ' AND (p.post_approved = 1 OR ' . $db->sql_in_set('p.forum_id', $m_approve_fid_ary, true) . ')';
839
$select = ($type == 'posts') ? 'p.post_id' : 't.topic_id';
842
// If the cache was completely empty count the results
845
switch ($db->sql_layer)
849
$select = 'SQL_CALC_FOUND_ROWS ' . $select;
854
if ($type == 'posts')
856
$sql = 'SELECT COUNT(p.post_id) as total_results
857
FROM ' . POSTS_TABLE . ' p' . (($firstpost_only) ? ', ' . TOPICS_TABLE . ' t ' : ' ') . "
867
if ($db->sql_layer == 'sqlite')
869
$sql = 'SELECT COUNT(topic_id) as total_results
870
FROM (SELECT DISTINCT t.topic_id';
874
$sql = 'SELECT COUNT(DISTINCT t.topic_id) as total_results';
877
$sql .= ' FROM ' . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p
883
AND t.topic_id = p.topic_id
884
$sql_time" . (($db->sql_layer == 'sqlite') ? ')' : '');
886
$result = $db->sql_query($sql);
888
$total_results = (int) $db->sql_fetchfield('total_results');
889
$db->sql_freeresult($result);
899
// Build the query for really selecting the post_ids
900
if ($type == 'posts')
902
$sql = "SELECT $select
903
FROM " . $sql_sort_table . POSTS_TABLE . ' p' . (($topic_id || $firstpost_only) ? ', ' . TOPICS_TABLE . ' t' : '') . "
916
$sql = "SELECT $select
917
FROM " . $sql_sort_table . TOPICS_TABLE . ' t, ' . POSTS_TABLE . " p
923
AND t.topic_id = p.topic_id
926
GROUP BY t.topic_id, " . $sort_by_sql[$sort_key] . '
927
ORDER BY ' . $sql_sort;
931
// Only read one block of posts from the db and then cache it
932
$result = $db->sql_query_limit($sql, $config['search_block_size'], $start);
934
while ($row = $db->sql_fetchrow($result))
936
$id_ary[] = $row[$field];
938
$db->sql_freeresult($result);
940
if (!$total_results && $is_mysql)
942
$sql = 'SELECT FOUND_ROWS() as total_results';
943
$result = $db->sql_query($sql);
944
$total_results = (int) $db->sql_fetchfield('total_results');
945
$db->sql_freeresult($result);
955
$this->save_ids($search_key, '', $author_ary, $total_results, $id_ary, $start, $sort_dir);
956
$id_ary = array_slice($id_ary, 0, $per_page);
958
return $total_results;
964
* Split a text into words of a given length
966
* The text is converted to UTF-8, cleaned up, and split. Then, words that
967
* conform to the defined length range are returned in an array.
969
* NOTE: duplicates are NOT removed from the return array
971
* @param string $text Text to split, encoded in UTF-8
972
* @return array Array of UTF-8 words
976
function split_message($text)
978
global $phpbb_root_path, $phpEx, $user;
980
$match = $words = array();
983
* Taken from the original code
986
$match[] = '#\[code(?:=.*?)?(\:?[0-9a-z]{5,})\].*?\[\/code(\:?[0-9a-z]{5,})\]#is';
988
$match[] = '#\[\/?[a-z0-9\*\+\-]+(?:=.*?)?(?::[a-z])?(\:?[0-9a-z]{5,})\]#';
990
$min = $this->word_length['min'];
991
$max = $this->word_length['max'];
993
$isset_min = $min - 1;
996
* Clean up the string, remove HTML tags, remove BBCodes
998
$word = strtok($this->cleanup(preg_replace($match, ' ', strip_tags($text)), -1), ' ');
1000
while (strlen($word))
1002
if (strlen($word) > 255 || strlen($word) <= $isset_min)
1005
* Words longer than 255 bytes are ignored. This will have to be
1006
* changed whenever we change the length of search_wordlist.word_text
1008
* Words of at most $isset_min bytes (i.e. shorter than the minimum word length) are ignored, too
1010
$word = strtok(' ');
1014
$len = utf8_strlen($word);
1017
* Test whether the word is too short to be indexed.
1019
* Note that this limit does NOT apply to CJK and Hangul
1024
* Note: this could be optimized. If the codepoint is lower than Hangul's range
1025
* we know that it will also be lower than CJK ranges
1027
if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0)
1028
&& (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0)
1029
&& (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0))
1031
$word = strtok(' ');
1037
$word = strtok(' ');
1044
* Updates wordlist and wordmatch tables when a message is posted or changed
1046
* @param string $mode Contains the post mode: edit, post, reply, quote
1047
* @param int $post_id The id of the post which is modified/created
1048
* @param string &$message New or updated post content
1049
* @param string &$subject New or updated post subject
1050
* @param int $poster_id Post author's user id
1051
* @param int $forum_id The id of the forum in which the post is located
1055
function index($mode, $post_id, &$message, &$subject, $poster_id, $forum_id)
1057
global $config, $db, $user;
1059
if (!$config['fulltext_native_load_upd'])
1062
* The search indexer is disabled, return
1067
// Split old and new post/subject to obtain array of 'words'
1068
$split_text = $this->split_message($message);
1069
$split_title = $this->split_message($subject);
1071
$cur_words = array('post' => array(), 'title' => array());
1074
if ($mode == 'edit')
1076
$words['add']['post'] = array();
1077
$words['add']['title'] = array();
1078
$words['del']['post'] = array();
1079
$words['del']['title'] = array();
1081
$sql = 'SELECT w.word_id, w.word_text, m.title_match
1082
FROM ' . SEARCH_WORDLIST_TABLE . ' w, ' . SEARCH_WORDMATCH_TABLE . " m
1083
WHERE m.post_id = $post_id
1084
AND w.word_id = m.word_id";
1085
$result = $db->sql_query($sql);
1087
while ($row = $db->sql_fetchrow($result))
1089
$which = ($row['title_match']) ? 'title' : 'post';
1090
$cur_words[$which][$row['word_text']] = $row['word_id'];
1092
$db->sql_freeresult($result);
1094
$words['add']['post'] = array_diff($split_text, array_keys($cur_words['post']));
1095
$words['add']['title'] = array_diff($split_title, array_keys($cur_words['title']));
1096
$words['del']['post'] = array_diff(array_keys($cur_words['post']), $split_text);
1097
$words['del']['title'] = array_diff(array_keys($cur_words['title']), $split_title);
1101
$words['add']['post'] = $split_text;
1102
$words['add']['title'] = $split_title;
1103
$words['del']['post'] = array();
1104
$words['del']['title'] = array();
1107
unset($split_title);
1109
// Get unique words from the above arrays
1110
$unique_add_words = array_unique(array_merge($words['add']['post'], $words['add']['title']));
1112
// We now have unique arrays of all words to be added and removed and
1113
// individual arrays of added and removed words for text and title. What
1114
// we need to do now is add the new words (if they don't already exist)
1115
// and then add (or remove) matches between the words and this post
1116
if (sizeof($unique_add_words))
1118
$sql = 'SELECT word_id, word_text
1119
FROM ' . SEARCH_WORDLIST_TABLE . '
1120
WHERE ' . $db->sql_in_set('word_text', $unique_add_words);
1121
$result = $db->sql_query($sql);
1123
$word_ids = array();
1124
while ($row = $db->sql_fetchrow($result))
1126
$word_ids[$row['word_text']] = $row['word_id'];
1128
$db->sql_freeresult($result);
1129
$new_words = array_diff($unique_add_words, array_keys($word_ids));
1131
$db->sql_transaction('begin');
1132
if (sizeof($new_words))
1136
foreach ($new_words as $word)
1138
$sql_ary[] = array('word_text' => (string) $word, 'word_count' => 0);
1140
$db->sql_return_on_error(true);
1141
$db->sql_multi_insert(SEARCH_WORDLIST_TABLE, $sql_ary);
1142
$db->sql_return_on_error(false);
1144
unset($new_words, $sql_ary);
1148
$db->sql_transaction('begin');
1151
// now update the search match table, remove links to removed words and add links to new words
1152
foreach ($words['del'] as $word_in => $word_ary)
1154
$title_match = ($word_in == 'title') ? 1 : 0;
1156
if (sizeof($word_ary))
1159
foreach ($word_ary as $word)
1161
$sql_in[] = $cur_words[$word_in][$word];
1164
$sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
1165
WHERE ' . $db->sql_in_set('word_id', $sql_in) . '
1166
AND post_id = ' . intval($post_id) . "
1167
AND title_match = $title_match";
1168
$db->sql_query($sql);
1170
$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
1171
SET word_count = word_count - 1
1172
WHERE ' . $db->sql_in_set('word_id', $sql_in) . '
1173
AND word_count > 0';
1174
$db->sql_query($sql);
1180
$db->sql_return_on_error(true);
1181
foreach ($words['add'] as $word_in => $word_ary)
1183
$title_match = ($word_in == 'title') ? 1 : 0;
1185
if (sizeof($word_ary))
1187
$sql = 'INSERT INTO ' . SEARCH_WORDMATCH_TABLE . ' (post_id, word_id, title_match)
1188
SELECT ' . (int) $post_id . ', word_id, ' . (int) $title_match . '
1189
FROM ' . SEARCH_WORDLIST_TABLE . '
1190
WHERE ' . $db->sql_in_set('word_text', $word_ary);
1191
$db->sql_query($sql);
1193
$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
1194
SET word_count = word_count + 1
1195
WHERE ' . $db->sql_in_set('word_text', $word_ary);
1196
$db->sql_query($sql);
1199
$db->sql_return_on_error(false);
1201
$db->sql_transaction('commit');
1203
// destroy cached search results containing any of the words removed or added
1204
$this->destroy_cache(array_unique(array_merge($words['add']['post'], $words['add']['title'], $words['del']['post'], $words['del']['title'])), array($poster_id));
1206
unset($unique_add_words);
1212
* Removes entries from the wordmatch table for the specified post_ids
1214
function index_remove($post_ids, $author_ids, $forum_ids)
1218
if (sizeof($post_ids))
1220
$sql = 'SELECT w.word_id, w.word_text, m.title_match
1221
FROM ' . SEARCH_WORDMATCH_TABLE . ' m, ' . SEARCH_WORDLIST_TABLE . ' w
1222
WHERE ' . $db->sql_in_set('m.post_id', $post_ids) . '
1223
AND w.word_id = m.word_id';
1224
$result = $db->sql_query($sql);
1226
$message_word_ids = $title_word_ids = $word_texts = array();
1227
while ($row = $db->sql_fetchrow($result))
1229
if ($row['title_match'])
1231
$title_word_ids[] = $row['word_id'];
1235
$message_word_ids[] = $row['word_id'];
1237
$word_texts[] = $row['word_text'];
1239
$db->sql_freeresult($result);
1241
if (sizeof($title_word_ids))
1243
$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
1244
SET word_count = word_count - 1
1245
WHERE ' . $db->sql_in_set('word_id', $title_word_ids) . '
1246
AND word_count > 0';
1247
$db->sql_query($sql);
1250
if (sizeof($message_word_ids))
1252
$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
1253
SET word_count = word_count - 1
1254
WHERE ' . $db->sql_in_set('word_id', $message_word_ids) . '
1255
AND word_count > 0';
1256
$db->sql_query($sql);
1259
unset($title_word_ids);
1260
unset($message_word_ids);
1262
$sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
1263
WHERE ' . $db->sql_in_set('post_id', $post_ids);
1264
$db->sql_query($sql);
1267
$this->destroy_cache(array_unique($word_texts), $author_ids);
1271
* Tidy up indexes: Tag 'common words' and remove
1272
* words no longer referenced in the match table
1276
global $db, $config;
1278
// Is the fulltext indexer disabled? If yes then we need not
1279
// carry on ... it's okay ... I know when I'm not wanted boo hoo
1280
if (!$config['fulltext_native_load_upd'])
1282
set_config('search_last_gc', time(), true);
1286
$destroy_cache_words = array();
1288
// Remove common words
1289
if ($config['num_posts'] >= 100 && $config['fulltext_native_common_thres'])
1291
$common_threshold = ((double) $config['fulltext_native_common_thres']) / 100.0;
1292
// First, get the IDs of common words
1293
$sql = 'SELECT word_id, word_text
1294
FROM ' . SEARCH_WORDLIST_TABLE . '
1295
WHERE word_count > ' . floor($config['num_posts'] * $common_threshold) . '
1296
OR word_common = 1';
1297
$result = $db->sql_query($sql);
1300
while ($row = $db->sql_fetchrow($result))
1302
$sql_in[] = $row['word_id'];
1303
$destroy_cache_words[] = $row['word_text'];
1305
$db->sql_freeresult($result);
1307
if (sizeof($sql_in))
1310
$sql = 'UPDATE ' . SEARCH_WORDLIST_TABLE . '
1312
WHERE ' . $db->sql_in_set('word_id', $sql_in);
1313
$db->sql_query($sql);
1315
// by setting search_last_gc to the new time here we make sure that if a user reloads because the
1316
// following query takes too long, he won't run into it again
1317
set_config('search_last_gc', time(), true);
1319
// Delete the matches
1320
$sql = 'DELETE FROM ' . SEARCH_WORDMATCH_TABLE . '
1321
WHERE ' . $db->sql_in_set('word_id', $sql_in);
1322
$db->sql_query($sql);
1327
if (sizeof($destroy_cache_words))
1329
// destroy cached search results containing any of the words that are now common or were removed
1330
$this->destroy_cache(array_unique($destroy_cache_words));
1333
set_config('search_last_gc', time(), true);
1337
* Deletes all words from the index
1339
function delete_index($acp_module, $u_action)
1343
switch ($db->sql_layer)
1347
$db->sql_query('DELETE FROM ' . SEARCH_WORDLIST_TABLE);
1348
$db->sql_query('DELETE FROM ' . SEARCH_WORDMATCH_TABLE);
1349
$db->sql_query('DELETE FROM ' . SEARCH_RESULTS_TABLE);
1353
$db->sql_query('TRUNCATE TABLE ' . SEARCH_WORDLIST_TABLE);
1354
$db->sql_query('TRUNCATE TABLE ' . SEARCH_WORDMATCH_TABLE);
1355
$db->sql_query('TRUNCATE TABLE ' . SEARCH_RESULTS_TABLE);
1361
* Returns true if both the word list and the word match tables contain entries
1363
function index_created()
1365
if (!sizeof($this->stats))
1370
return ($this->stats['total_words'] && $this->stats['total_matches']) ? true : false;
1374
* Returns an associative array containing information about the indexes
1376
function index_stats()
1380
if (!sizeof($this->stats))
1386
$user->lang['TOTAL_WORDS'] => $this->stats['total_words'],
1387
$user->lang['TOTAL_MATCHES'] => $this->stats['total_matches']);
1390
/**
* Counts the rows of the word list and word match tables and stores the
* results, cast to integers, in $this->stats under the keys total_words
* and total_matches.
*/
function get_stats()
1394
// Row count of the word list table
$sql = 'SELECT COUNT(*) as total_words
1395
FROM ' . SEARCH_WORDLIST_TABLE;
1396
$result = $db->sql_query($sql);
1397
$this->stats['total_words'] = (int) $db->sql_fetchfield('total_words');
1398
$db->sql_freeresult($result);
1400
// Row count of the word match table
$sql = 'SELECT COUNT(*) as total_matches
1401
FROM ' . SEARCH_WORDMATCH_TABLE;
1402
$result = $db->sql_query($sql);
1403
$this->stats['total_matches'] = (int) $db->sql_fetchfield('total_matches');
1404
$db->sql_freeresult($result);
1408
* Clean up a text to remove non-alphanumeric characters
1410
* This method receives a UTF-8 string, normalizes and validates it, replaces all
1411
* non-alphanumeric characters with spaces then returns the result.
1413
* Any number of "allowed chars" can be passed as a UTF-8 string in NFC.
1415
* @param string $text Text to split, in UTF-8 (not normalized or sanitized)
1416
* @param string $allowed_chars String of special chars to allow
1417
* @param string $encoding Text encoding
1418
* @return string Cleaned up text, only alphanumeric chars are left
1420
* @todo normalizer::cleanup being able to be used?
1422
function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
1424
global $phpbb_root_path, $phpEx;
1425
// Declared static, so the conversion map and the record of loaded table blocks persist across calls
static $conv = array(), $conv_loaded = array();
1426
// $allow collects the multi-byte chars from $allowed_chars (as array keys)
$words = $allow = array();
1428
// Convert the text to UTF-8
1429
$encoding = strtolower($encoding);
1430
if ($encoding != 'utf-8')
1432
$text = utf8_recode($text, $encoding);
1435
// Lookup table indexed by a lead byte masked with 0xF0; the value is used below as the
// byte length of the UTF-8 sequence starting at that byte
$utf_len_mask = array(
1443
* Replace HTML entities and NCRs
1445
$text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);
1448
* Load the UTF-8 normalizer
1450
* If we use it more widely, an instance of that class should be held in a
1451
* a global variable instead
1453
// The return value is discarded - NOTE(review): presumably nfc() normalizes $text in place; confirm against utf_normalizer
utf_normalizer::nfc($text);
1456
* The first thing we do is:
1458
* - convert ASCII-7 letters to lowercase
1459
* - remove the ASCII-7 non-alpha characters
1460
* - remove the bytes that should not appear in a valid UTF-8 string: 0xC0,
1461
* 0xC1 and 0xF5-0xFF
1463
* @todo in theory, the third one is already taken care of during normalization and those chars should have been replaced by Unicode replacement chars
1465
// $sb_match and $sb_replace are the from and to lists handed to strtr() further down;
// strtr() pairs them up byte by byte, so positions in the two strings must stay aligned
$sb_match = "ISTCPAMELRDOJBNHFGVWUQKYXZ\r\n\t!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\xC0\xC1\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";
1466
$sb_replace = 'istcpamelrdojbnhfgvwuqkyxz ';
1469
* This is the list of legal ASCII chars, it is automatically extended
1470
* with ASCII chars from $allowed_chars
1472
$legal_ascii = ' eaisntroludcpmghbfvq10xy2j9kw354867z';
1475
* Prepare an array containing the extra chars to allow
1477
// isset() on offset 0 is false for the null default and for an empty string,
// so this part only runs when at least one allowed char was passed
if (isset($allowed_chars[0]))
1480
$len = strlen($allowed_chars);
1483
// $c is the single byte at the current position of $allowed_chars
$c = $allowed_chars[$pos];
1490
// strpos() yields an int offset when $c is one of the bytes that would be replaced, false otherwise
$sb_pos = strpos($sb_match, $c);
1491
if (is_int($sb_pos))
1494
* Remove the char from $sb_match and its corresponding
1495
* replacement in $sb_replace
1497
$sb_match = substr($sb_match, 0, $sb_pos) . substr($sb_match, $sb_pos + 1);
1498
$sb_replace = substr($sb_replace, 0, $sb_pos) . substr($sb_replace, $sb_pos + 1);
1509
// Multi-byte case: look up the sequence length from the lead byte and register the whole sequence as allowed
$utf_len = $utf_len_mask[$c & "\xF0"];
1510
$allow[substr($allowed_chars, $pos, $utf_len)] = 1;
1514
while ($pos < $len);
1517
// Single pass over the text applying the (possibly trimmed) byte translation lists
$text = strtr($text, $sb_match, $sb_replace);
1521
$len = strlen($text);
1526
* Do all consecutive ASCII chars at once
1528
// strspn() returns the length of the run of legal ASCII bytes starting at $pos; 0 skips the copy
if ($spn = strspn($text, $legal_ascii, $pos))
1530
$ret .= substr($text, $pos, $spn);
1540
* Capture the UTF char
1542
$utf_len = $utf_len_mask[$text[$pos] & "\xF0"];
1543
$utf_char = substr($text, $pos, $utf_len);
1546
// Range checks are done with string comparison operators against the boundary constants
// NOTE(review): presumably the UTF8_* constants hold UTF-8 encoded boundary chars - confirm where they are defined
if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
1547
|| ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
1548
|| ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
1551
* All characters within these ranges are valid
1553
* We separate them with a space in order to index each character
1556
$ret .= ' ' . $utf_char . ' ';
1560
// Chars registered from $allowed_chars are looked up by key
if (isset($allow[$utf_char]))
1563
* The char is explicitly allowed
1569
// Next, consult the conversion entries already held in the static $conv map
if (isset($conv[$utf_char]))
1572
* The char is mapped to something, maybe to itself actually
1574
$ret .= $conv[$utf_char];
1579
* The char isn't mapped, but did we load its conversion table?
1581
* The search indexer table is split into blocks. The block number of
1582
* each char is equal to its codepoint right-shifted for 11 bits. It
1583
* means that out of the 11, 16 or 21 meaningful bits of a 2-, 3- or
1584
* 4- byte sequence we only keep the leftmost 0, 5 or 10 bits. Thus,
1585
* all UTF chars encoded in 2 bytes are in the same first block.
1587
if (isset($utf_char[2]))
1589
if (isset($utf_char[3]))
1592
* 1111 0nnn 10nn nnnn 10nx xxxx 10xx xxxx
1593
* 0000 0111 0011 1111 0010 0000
1595
$idx = ((ord($utf_char[0]) & 0x07) << 7) | ((ord($utf_char[1]) & 0x3F) << 1) | ((ord($utf_char[2]) & 0x20) >> 5);
1600
* 1110 nnnn 10nx xxxx 10xx xxxx
1601
* 0000 0111 0010 0000
1603
$idx = ((ord($utf_char[0]) & 0x07) << 1) | ((ord($utf_char[1]) & 0x20) >> 5);
1609
* 110x xxxx 10xx xxxx
1610
* 0000 0000 0000 0000
1616
* Check if the required conv table has been loaded already
1618
if (!isset($conv_loaded[$idx]))
1620
$conv_loaded[$idx] = 1;
1621
$file = $phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $phpEx;
1623
if (file_exists($file))
1625
$conv += include($file);
1629
if (isset($conv[$utf_char]))
1631
$ret .= $conv[$utf_char];
1636
* We add an entry to the conversion table so that we
1637
* don't have to convert to codepoint and perform the checks
1638
* that are above this block
1640
$conv[$utf_char] = ' ';
1650
* Returns a list of options for the ACP to display
1654
global $user, $config;
1658
* if we need any options, copied from fulltext_native for now, will have to be adjusted or removed
1663
<dt><label for="fulltext_native_load_upd">' . $user->lang['YES_SEARCH_UPDATE'] . ':</label><br /><span>' . $user->lang['YES_SEARCH_UPDATE_EXPLAIN'] . '</span></dt>
1664
<dd><label><input type="radio" id="fulltext_native_load_upd" name="config[fulltext_native_load_upd]" value="1"' . (($config['fulltext_native_load_upd']) ? ' checked="checked"' : '') . ' class="radio" /> ' . $user->lang['YES'] . '</label><label><input type="radio" name="config[fulltext_native_load_upd]" value="0"' . ((!$config['fulltext_native_load_upd']) ? ' checked="checked"' : '') . ' class="radio" /> ' . $user->lang['NO'] . '</label></dd>
1667
<dt><label for="fulltext_native_min_chars">' . $user->lang['MIN_SEARCH_CHARS'] . ':</label><br /><span>' . $user->lang['MIN_SEARCH_CHARS_EXPLAIN'] . '</span></dt>
1668
<dd><input id="fulltext_native_min_chars" type="text" size="3" maxlength="3" name="config[fulltext_native_min_chars]" value="' . (int) $config['fulltext_native_min_chars'] . '" /></dd>
1671
<dt><label for="fulltext_native_max_chars">' . $user->lang['MAX_SEARCH_CHARS'] . ':</label><br /><span>' . $user->lang['MAX_SEARCH_CHARS_EXPLAIN'] . '</span></dt>
1672
<dd><input id="fulltext_native_max_chars" type="text" size="3" maxlength="3" name="config[fulltext_native_max_chars]" value="' . (int) $config['fulltext_native_max_chars'] . '" /></dd>
1675
<dt><label for="fulltext_native_common_thres">' . $user->lang['COMMON_WORD_THRESHOLD'] . ':</label><br /><span>' . $user->lang['COMMON_WORD_THRESHOLD_EXPLAIN'] . '</span></dt>
1676
<dd><input id="fulltext_native_common_thres" type="text" size="3" maxlength="3" name="config[fulltext_native_common_thres]" value="' . (int) $config['fulltext_native_common_thres'] . '" /> %</dd>
1680
// These are fields required in the config table
1683
'config' => array('fulltext_native_load_upd' => 'bool', 'fulltext_native_min_chars' => 'integer:0:255', 'fulltext_native_max_chars' => 'integer:0:255', 'fulltext_native_common_thres' => 'double:0:100')
b'\\ No newline at end of file'