1 <?php # vim:ts=2:sw=2:et:
2 /* For licensing and copyright terms, see the file named LICENSE */
4 require_once 'Zend/Search/Lucene.php';
7 * Copyright (c) 2005 Richard Heyes (http://www.phpguru.org/)
8 * PHP5 Implementation of the Porter Stemmer algorithm. Certain elements
9 * were borrowed from the (broken) implementation by Jon Abernathy.
13 * Regex for matching a consonant
16 private static $regex_consonant =
17 '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';
20 * Regex for matching a vowel
23 private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';
/**
 * Stems a word with the Porter algorithm.
 *
 * Words of two characters or fewer are handed back untouched; longer
 * words are run through steps 1ab, 1c, 2, 3, 4 and 5, in that order.
 *
 * @param  string $word Word to stem
 * @return string       Stemmed word
 */
public static function Stem($word)
{
  if (strlen($word) <= 2) {
    return $word;
  }

  // Each step consumes the output of the previous one; the innermost
  // call runs first.
  return self::step5(
    self::step4(
      self::step3(
        self::step2(
          self::step1c(
            self::step1ab($word)
          )
        )
      )
    )
  );
}
50 private static function step1ab($word)
53 if (substr($word, -1) == 's') {
55 self::replace($word, 'sses', 'ss')
56 OR self::replace($word, 'ies', 'i')
57 OR self::replace($word, 'ss', 'ss')
58 OR self::replace($word, 's', '');
62 if (substr($word, -2, 1) != 'e' OR !self::replace($word, 'eed', 'ee', 0)) { // First rule
63 $v = self::$regex_vowel;
66 if ( preg_match("#$v+#", substr($word, 0, -3)) && self::replace($word, 'ing', '')
67 OR preg_match("#$v+#", substr($word, 0, -2)) && self::replace($word, 'ed', '')) { // Note use of && and OR, for precedence reasons
69 // If one of above two test successful
70 if ( !self::replace($word, 'at', 'ate')
71 AND !self::replace($word, 'bl', 'ble')
72 AND !self::replace($word, 'iz', 'ize')) {
74 // Double consonant ending
75 if ( self::doubleConsonant($word)
76 AND substr($word, -2) != 'll'
77 AND substr($word, -2) != 'ss'
78 AND substr($word, -2) != 'zz') {
80 $word = substr($word, 0, -1);
82 } else if (self::m($word) == 1 AND self::cvc($word)) {
/**
 * Step 1c: turn a terminal "y" into "i" when the rest of the word
 * contains a vowel.
 *
 * @param  string $word Word to stem
 * @return string       The word after step 1c
 */
private static function step1c($word)
{
  // Nothing to do unless the word ends in "y"
  if (substr($word, -1) != 'y') {
    return $word;
  }

  $v = self::$regex_vowel;
  $head = substr($word, 0, -1);

  // Only rewrite when the part before the "y" holds at least one vowel
  if (preg_match("#$v+#", $head)) {
    self::replace($word, 'y', 'i');
  }

  return $word;
}
111 * @param string $word Word to stem
113 private static function step2($word)
115 switch (substr($word, -2, 1)) {
117 self::replace($word, 'ational', 'ate', 0)
118 OR self::replace($word, 'tional', 'tion', 0);
122 self::replace($word, 'enci', 'ence', 0)
123 OR self::replace($word, 'anci', 'ance', 0);
127 self::replace($word, 'izer', 'ize', 0);
131 self::replace($word, 'logi', 'log', 0);
135 self::replace($word, 'entli', 'ent', 0)
136 OR self::replace($word, 'ousli', 'ous', 0)
137 OR self::replace($word, 'alli', 'al', 0)
138 OR self::replace($word, 'bli', 'ble', 0)
139 OR self::replace($word, 'eli', 'e', 0);
143 self::replace($word, 'ization', 'ize', 0)
144 OR self::replace($word, 'ation', 'ate', 0)
145 OR self::replace($word, 'ator', 'ate', 0);
149 self::replace($word, 'iveness', 'ive', 0)
150 OR self::replace($word, 'fulness', 'ful', 0)
151 OR self::replace($word, 'ousness', 'ous', 0)
152 OR self::replace($word, 'alism', 'al', 0);
156 self::replace($word, 'biliti', 'ble', 0)
157 OR self::replace($word, 'aliti', 'al', 0)
158 OR self::replace($word, 'iviti', 'ive', 0);
168 * @param string $word String to stem
170 private static function step3($word)
172 switch (substr($word, -2, 1)) {
174 self::replace($word, 'ical', 'ic', 0);
178 self::replace($word, 'ness', '', 0);
182 self::replace($word, 'icate', 'ic', 0)
183 OR self::replace($word, 'iciti', 'ic', 0);
187 self::replace($word, 'ful', '', 0);
191 self::replace($word, 'ative', '', 0);
195 self::replace($word, 'alize', 'al', 0);
205 * @param string $word Word to stem
207 private static function step4($word)
209 switch (substr($word, -2, 1)) {
211 self::replace($word, 'al', '', 1);
215 self::replace($word, 'ance', '', 1)
216 OR self::replace($word, 'ence', '', 1);
220 self::replace($word, 'er', '', 1);
224 self::replace($word, 'ic', '', 1);
228 self::replace($word, 'able', '', 1)
229 OR self::replace($word, 'ible', '', 1);
233 self::replace($word, 'ant', '', 1)
234 OR self::replace($word, 'ement', '', 1)
235 OR self::replace($word, 'ment', '', 1)
236 OR self::replace($word, 'ent', '', 1);
240 if (substr($word, -4) == 'tion' OR substr($word, -4) == 'sion') {
241 self::replace($word, 'ion', '', 1);
243 self::replace($word, 'ou', '', 1);
248 self::replace($word, 'ism', '', 1);
252 self::replace($word, 'ate', '', 1)
253 OR self::replace($word, 'iti', '', 1);
257 self::replace($word, 'ous', '', 1);
261 self::replace($word, 'ive', '', 1);
265 self::replace($word, 'ize', '', 1);
/**
 * Step 5: tidy up a trailing "e" and a trailing "ll".
 *
 * Part a removes a final "e" when the remaining stem has m() > 1, or
 * when it has m() == 1 and does not end in a cvc() sequence.
 * Part b removes one "l" from a final double "l" when m() > 1.
 *
 * @param  string $word Word to stem
 * @return string       The word after step 5
 */
private static function step5($word)
{
  // Part a
  if (substr($word, -1) == 'e') {
    $stem = substr($word, 0, -1);
    $measure = self::m($stem);

    if ($measure > 1 || ($measure == 1 && !self::cvc($stem))) {
      // Same effect as replacing the trailing "e" with nothing
      $word = $stem;
    }
  }

  // Part b
  if (substr($word, -1) == 'l'
      && self::m($word) > 1
      && self::doubleConsonant($word)) {
    $word = substr($word, 0, -1);
  }

  return $word;
}
/**
 * Swaps the ending $check for $repl on $str, in place.
 *
 * When $m is given, the swap only happens if the part of $str in front
 * of $check has an m() measure strictly greater than $m.
 *
 * @param  string $str   String to check (modified by reference)
 * @param  string $check Ending to check for
 * @param  string $repl  Replacement string
 * @param  int    $m     Optional measure that the stem's m() must exceed
 * @return bool          Whether $check was at the end of $str. True does
 *                       not necessarily mean that it was replaced.
 */
private static function replace(&$str, $check, $repl, $m = null)
{
  $cut = 0 - strlen($check);

  // Ending not present: report that and leave $str alone
  if (substr($str, $cut) != $check) {
    return false;
  }

  $stem = substr($str, 0, $cut);
  if ($m === null || self::m($stem) > $m) {
    $str = $stem . $repl;
  }

  // The ending matched, whether or not the measure allowed the swap
  return true;
}
/**
 * Measures the number of consonant sequences in $str.
 *
 * With c a consonant sequence, v a vowel sequence and <..> marking an
 * optional part, the count is the number of "vc" pairs left once a
 * leading <c> and a trailing <v> have been stripped:
 *
 *   <c>vc<v>     gives 1
 *   <c>vcvcvc<v> gives 3
 *
 * @param  string $str The string to return the m count for
 * @return int         The m count
 */
private static function m($str)
{
  $c = self::$regex_consonant;
  $v = self::$regex_vowel;

  // Patterns are applied in array order: leading consonants first,
  // then trailing vowels.
  $core = preg_replace(array("#^$c+#", "#$v+$#"), '', $str);

  preg_match_all("#($v+$c+)#", $core, $pairs);

  return count($pairs[1]);
}
/**
 * Returns true/false as to whether the given string ends in two
 * identical consonants.
 *
 * @param  string $str String to check
 * @return bool        Result
 */
private static function doubleConsonant($str)
{
  $c = self::$regex_consonant;

  // The pattern is the consonant regex followed by the quantifier {2},
  // anchored at the end of the string.
  if (!preg_match('#' . $c . '{2}$#', $str, $matches)) {
    return false;
  }

  // Square-bracket offsets: the old $s{0} form is deprecated in PHP 7.4
  // and is a compile-time fatal error from PHP 8.0.
  $pair = $matches[0];

  return $pair[0] === $pair[1];
}
/**
 * Checks for an ending consonant-vowel-consonant sequence where the
 * second consonant is not w, x or y.
 *
 * @param  string $str String to check
 * @return bool        Result
 */
private static function cvc($str)
{
  $c = self::$regex_consonant;
  $v = self::$regex_vowel;

  if (!preg_match('#(' . $c . $v . $c . ')$#', $str, $matches)) {
    return false;
  }

  $seq = $matches[1];
  if (strlen($seq) != 3) {
    return false;
  }

  // Square-bracket offset: the old $s{2} form is deprecated in PHP 7.4
  // and is a compile-time fatal error from PHP 8.0.
  $last = $seq[2];

  return $last != 'w' && $last != 'x' && $last != 'y';
}
/**
 * Token filter that replaces each token's text with its Porter stem.
 */
class MTrackSearchStemmer extends Zend_Search_Lucene_Analysis_TokenFilter {

  /**
   * Builds a new token holding the stemmed text of $tok, keeping the
   * original start/end offsets and position increment.
   *
   * @param  Zend_Search_Lucene_Analysis_Token $tok Token to stem
   * @return Zend_Search_Lucene_Analysis_Token      The stemmed token
   */
  public function normalize(Zend_Search_Lucene_Analysis_Token $tok)
  {
    $stemmed = new Zend_Search_Lucene_Analysis_Token(
      PorterStemmer::Stem($tok->getTermText()),
      $tok->getStartOffset(),
      $tok->getEndOffset()
    );
    $stemmed->setPositionIncrement($tok->getPositionIncrement());

    return $stemmed;
  }
}
408 class MTrackSearchDateToken extends Zend_Search_Lucene_Analysis_Token {
411 class MTrackSearchAnalyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common
414 private $_bytePosition;
415 private $_moreTokens = array();
419 $this->_position = 0;
420 $this->_bytePosition = 0;
425 if (count($this->_moreTokens)) {
426 $tok = array_shift($this->_moreTokens);
429 if ($this->_input == null) {
434 /* first check for date fields */
437 // 2008-12-22T05:42:42.285445Z
438 if (preg_match('/\d{4}-\d\d-\d\d(?:T\d\d:\d\d:\d\d(?:\.\d+)?Z?)?/u',
439 $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
441 } else if (!preg_match('/[\p{L}\p{N}_]+/u',
442 $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
445 if (!function_exists('mb_strtolower')) {
446 $matchedWord = strtolower($match[0][0]);
448 $matchedWord = mb_strtolower($match[0][0], 'UTF-8');
450 $binStartPos = $match[0][1];
451 $startPos = $this->_position +
452 iconv_strlen(substr($this->_input, $this->_bytePosition,
453 $binStartPos - $this->_bytePosition),
455 $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8');
456 $this->_bytePosition = $binStartPos + strlen($matchedWord);
457 $this->_position = $endPos;
460 // $this->_moreTokens[] = new MTrackSearchDateToken($matchedWord,
461 // $startPos, $endPos);
463 /* Seems very difficult to allow range searching on strings
464 * of the form "2009-10-10", so we just smush it together */
465 $no_sep = str_replace(array('-', ':'), array('', ''), $matchedWord);
466 list($no_sep) = explode('.', $no_sep);
468 /* full date and time */
469 // $this->_moreTokens[] = new MTrackSearchDateToken(
470 // $no_sep, $startPos, $endPos);
473 $date = substr($no_sep, 0, 8);
474 $this->_moreTokens[] = new MTrackSearchDateToken(
475 $date, $startPos, $endPos);
477 $token = new Zend_Search_Lucene_Analysis_Token(
478 $matchedWord, $startPos, $endPos);
479 $token = $this->normalize($token);
480 if ($token !== null) {
481 $this->_moreTokens[] = $token;
485 /* split by underscores and add those tokens too */
486 foreach (explode('_', $matchedWord) as $ele) {
487 $token = new Zend_Search_Lucene_Analysis_Token(
488 $ele, $startPos, $endPos);
489 $token = $this->normalize($token);
490 if ($token !== null) {
491 $this->_moreTokens[] = $token;
495 } while (count($this->_moreTokens) == 0);
496 return array_shift($this->_moreTokens);
499 function normalize(Zend_Search_Lucene_Analysis_Token $tok)
501 if ($tok instanceof MTrackSearchDateToken) {
504 return parent::normalize($tok);
508 class MTrackSearchQueryParser {
513 function __construct($q) {
514 $this->toks = $this->tokenize($q);
515 $this->alltoks = $this->toks;
516 // echo '<pre>', htmlentities(var_export($this->toks, true)), '</pre>';
518 $this->query = $this->expression();
521 function tokenize($string)
524 while (strlen($string)) {
525 if (preg_match("/^\s+/", $string, $M)) {
526 $toks[] = array('white', $M[0]);
527 $string = substr($string, strlen($M[0]));
530 if (preg_match("/^[+!(){}^~*?:\\\[\]-]/", $string)) {
531 $toks[] = array($string[0]);
532 $string = substr($string, 1);
535 if (!strncmp($string, "&&", 2)) {
536 $toks[] = array("&&");
537 $string = substr($string, 2);
540 if (preg_match("/^and\W/i", $string, $M)) {
541 $toks[] = array("&&", $M[0]);
542 $string = substr($string, 3);
545 if (preg_match("/^not\W/i", $string, $M)) {
546 $toks[] = array("!", $M[0]);
547 $string = substr($string, 3);
550 if (!strncmp($string, "||", 2)) {
551 $toks[] = array("||");
552 $string = substr($string, 2);
555 if (preg_match("/^or\W/i", $string, $M)) {
556 $toks[] = array("||", $M[0]);
557 $string = substr($string, 2);
560 if (preg_match('/^"([^"]*)"/', $string, $M)) {
561 $toks[] = array('literal', $M[1]);
562 $string = substr($string, strlen($M[0]));
565 if (preg_match("/^[a-zA-Z0-9_][a-zA-Z0-9_.+-]*/", $string, $M)) {
566 $toks[] = array('literal', $M[0]);
567 $string = substr($string, strlen($M[0]));
570 $string = trim($string);
571 if (strlen($string)) {
572 echo "Invalid search string: <b>" . htmlentities($string) . "</b>";
581 if (count($this->toks) == 0) {
584 $t = array_shift($this->toks);
585 $args = func_get_args();
589 foreach ($args as $expect) {
590 if ($t[0] == $expect) {
594 $expected[] = $expect;
598 $value = isset($t[1]) ? $t[1] : $t[0];
599 $ntoks = count($this->alltoks);
600 $rtoks = count($this->toks);
602 for ($i = 0; $i < $rtoks; $i++) {
603 $hint .= htmlentities($this->alltoks[$i][1], ENT_QUOTES, 'utf-8');
605 $hint .= "<b>$value</b>";
606 foreach ($this->toks as $tok) {
607 $hint .= htmlentities($tok[1]);
610 "Unexpected token '$value' of type $name expected " .
611 join(', ', $expected) . "<br>$hint");
619 if (!count($this->toks)) {
623 $args = func_get_args();
626 foreach ($args as $expect) {
627 if ($t[0] == $expect) {
639 function try_rule($name) {
642 return $this->$name();
643 } catch (Exception $e) {
649 function _make_term($t, $field = null)
651 if (function_exists('mb_strtolower')) {
652 $t[1] = mb_strtolower($t[1], 'UTF-8');
654 $t[1] = strtolower($t[1]);
656 if ($t[0] == 'literal') {
657 $bits = preg_split("/\s+/u", $t[1]);
659 /* only treat it as a phrase if it is a phrase */
660 if (count($bits) > 1) {
661 $q = new Zend_Search_Lucene_Search_Query_Phrase;
663 foreach ($bits as $w) {
664 $t = new Zend_Search_Lucene_Index_Term($w, $field);
671 /* underscores and periods!
672 * if we're searching for text delimited by underscores, we
673 * rewrite that as a phrase search also */
674 $bits = preg_split("/[._]/", $t[1]);
675 if (count($bits) > 1) {
676 $q = new Zend_Search_Lucene_Search_Query_Phrase;
678 foreach ($bits as $w) {
679 $t = new Zend_Search_Lucene_Index_Term($w, $field);
685 return new Zend_Search_Lucene_Index_Term((string)$t[1], $field);
690 if ($this->peek('literal')) {
692 if ($this->peek(':')) {
697 /* does it have a range? */
698 if ($this->peek('[')) {
703 $from = $this->get('literal');
704 $from = $this->_make_term($from, $field);
707 $t = $this->get('literal');
708 if (strcasecmp($t[1], 'to')) {
709 throw new Exception("Expected 'to'");
713 $to = $this->get('literal');
714 $to = $this->_make_term($to, $field);
716 $q = new Zend_Search_Lucene_Search_Query_Range(
725 $t = $this->get('literal');
727 return $this->_make_term($t, $field);
730 $t = $this->get('literal');
734 return $this->_make_term($t);
741 while ($this->peek('white')) {
746 function expression()
750 while (count($this->toks)) {
755 if ($this->peek('+')) {
759 if ($this->peek('-')) {
763 if ($modifier === null) {
769 $terms[] = array($t, $modifier);
775 if (count($terms) == 0) {
779 if (count($terms) == 1) {
780 if ($terms[0][0] instanceof Zend_Search_Lucene_Search_Query) {
781 if ($terms[0][1] === null) {
787 $q = new Zend_Search_Lucene_Search_Query_Boolean();
788 foreach ($terms as $term) {
789 list($t, $mod) = $term;
791 if ($t instanceof Zend_Search_Lucene_Search_Query) {
792 $q->addSubquery($t, $mod);
794 $sq = new Zend_Search_Lucene_Search_Query_MultiTerm;
796 $q->addSubquery($sq, $mod);
804 /* the highlighter insists on using html document things,
805 * so we force in our own dummy so that we can present the
806 * same text we used initially */
807 class MTrackSearchLuceneDummyDocument {
809 function __construct($text) {
812 function getFieldUtf8Value($name) {
818 implements Zend_Search_Lucene_Search_Highlighter_Interface {
820 public $context = array();
822 public $matched = array();
824 function setDocument(Zend_Search_Lucene_Document_Html $doc)
826 /* sure, I'll get right on that... */
829 function getDocument() {
830 /* we just return our dummy doc instead */
834 function highlight($words) {
835 if (!is_array($words)) {
836 $words = array($words);
838 foreach ($words as $word) {
839 foreach ($this->text as $line) {
840 $x = strpos($line, $word);
842 if (isset($this->matched[$word])) {
843 $this->matched[$word]++;
845 $this->matched[$word] = 1;
847 if (isset($this->context[$line])) {
848 $this->context[$line]++;
850 $this->context[$line] = 1;
857 function __construct($text, $query)
859 $this->doc = new MTrackSearchLuceneDummyDocument($text);
860 $text = wordwrap($text);
861 $this->text = preg_split("/\r?\n/", $text);
862 $query->htmlFragmenthighlightMatches($text, 'utf-8', $this);
866 class MTrackSearchResultLucene extends MTrackSearchResult {
869 function getExcerpt($text) {
870 $hl = new MTrackHLText($text, $this->_query);
872 foreach ($hl->context as $line => $count) {
874 if (!strlen($line)) continue;
875 $line = htmlentities($line, ENT_QUOTES, 'utf-8');
876 foreach ($hl->matched as $word => $wcount) {
877 $line = str_replace($word, "<span class='hl'>$word</span>", $line);
880 if (count($lines) > 6) {
884 $ex = join(" … ", $lines);
886 return "<div class='excerpt'>$ex</div>";
892 class MTrackSearchEngineLucene implements IMTrackSearchEngine
897 if ($this->idx) return $this->idx;
898 $ana = new MTrackSearchAnalyzer;
899 $ana->addFilter(new MTrackSearchStemmer);
900 Zend_Search_Lucene_Analysis_Analyzer::setDefault($ana);
902 $p = MTrackConfig::get('core', 'searchdb');
904 $idx = Zend_Search_Lucene::create($p);
907 $idx = Zend_Search_Lucene::open($p);
/**
 * Applies bulk-indexing settings to the index returned by getIdx():
 * a max-buffered-docs value of 64 and a merge factor of 15.
 */
public function setBatchMode()
{
  $maxBufferedDocs = 64;
  $mergeFactor = 15;

  $index = $this->getIdx();
  $index->setMaxBufferedDocs($maxBufferedDocs);
  $index->setMergeFactor($mergeFactor);
}
920 public function commit($optimize = false)
922 $idx = $this->getIdx();
930 public function add($object, $fields, $replace = false)
932 $idx = $this->getIdx();
935 $term = new Zend_Search_Lucene_Index_Term($object, 'object');
936 foreach ($idx->termDocs($term) as $id) {
941 $doc = new Zend_Search_Lucene_Document();
943 $doc->addField(Zend_Search_Lucene_Field::Text('object', $object, 'utf-8'));
944 foreach ($fields as $key => $value) {
945 if (!strlen($value)) continue;
946 if (!strncmp($key, 'stored:', 7)) {
947 $key = substr($key, 7);
948 $F = Zend_Search_Lucene_Field::Text($key, $value, 'utf-8');
950 $F = Zend_Search_Lucene_Field::UnStored($key, $value, 'utf-8');
955 $idx->addDocument($doc);
958 public function search($query) {
959 Zend_Search_Lucene::setTermsPerQueryLimit(150);
960 Zend_Search_Lucene::setResultSetLimit(250);
962 $p = new MTrackSearchQueryParser($query);
964 $idx = $this->getIdx();
965 $hits = $idx->find($q);
967 foreach ($hits as $hit) {
968 $r = new MTrackSearchResultLucene;
970 $r->objectid = $hit->object;
971 $r->score = $hit->score;