git.roojs.org Git - web.mtrack/blob - MTrack/search/lucene.php

   1 <?php # vim:ts=2:sw=2:et:
   2 /* For licensing and copyright terms, see the file named LICENSE */
   3
   4 require_once 'Zend/Search/Lucene.php';
   5
   6 /**
   7  * Copyright (c) 2005 Richard Heyes (http://www.phpguru.org/)
   8  * PHP5 Implementation of the Porter Stemmer algorithm. Certain elements
   9  * were borrowed from the (broken) implementation by Jon Abernathy.
  10  */
  11 class PorterStemmer {
  12   /**
  13    * Regex for matching a consonant
  14    * @var string
  15    */
  16   private static $regex_consonant =
  17     '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';
  18
  19   /**
  20    * Regex for matching a vowel
  21    * @var string
  22    */
  23   private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';
  24
  25   /**
  26    * Stems a word. Simple huh?
  27    *
  28    * @param  string $word Word to stem
  29    * @return string       Stemmed word
  30    */
  31   public static function Stem($word)
  32   {
  33     if (strlen($word) <= 2) {
  34       return $word;
  35     }
  36
  37     $word = self::step1ab($word);
  38     $word = self::step1c($word);
  39     $word = self::step2($word);
  40     $word = self::step3($word);
  41     $word = self::step4($word);
  42     $word = self::step5($word);
  43
  44     return $word;
  45   }
  46
  47   /**
  48    * Step 1
  49    */
  50   private static function step1ab($word)
  51   {
  52     // Part a
  53     if (substr($word, -1) == 's') {
  54
  55       self::replace($word, 'sses', 'ss')
  56         OR self::replace($word, 'ies', 'i')
  57         OR self::replace($word, 'ss', 'ss')
  58         OR self::replace($word, 's', '');
  59     }
  60
  61     // Part b
  62     if (substr($word, -2, 1) != 'e' OR !self::replace($word, 'eed', 'ee', 0)) { // First rule
  63       $v = self::$regex_vowel;
  64
  65       // ing and ed
  66       if (   preg_match("#$v+#", substr($word, 0, -3)) && self::replace($word, 'ing', '')
  67           OR preg_match("#$v+#", substr($word, 0, -2)) && self::replace($word, 'ed', '')) { // Note use of && and OR, for precedence reasons
  68
  69         // If one of above two test successful
  70         if (    !self::replace($word, 'at', 'ate')
  71             AND !self::replace($word, 'bl', 'ble')
  72             AND !self::replace($word, 'iz', 'ize')) {
  73
  74           // Double consonant ending
  75           if (    self::doubleConsonant($word)
  76               AND substr($word, -2) != 'll'
  77               AND substr($word, -2) != 'ss'
  78               AND substr($word, -2) != 'zz') {
  79
  80             $word = substr($word, 0, -1);
  81
  82           } else if (self::m($word) == 1 AND self::cvc($word)) {
  83             $word .= 'e';
  84           }
  85         }
  86       }
  87     }
  88
  89     return $word;
  90   }
  91
  92   /**
  93    * Step 1c
  94    *
  95    * @param string $word Word to stem
  96    */
  97   private static function step1c($word)
  98   {
  99     $v = self::$regex_vowel;
 100
 101     if (substr($word, -1) == 'y' && preg_match("#$v+#", substr($word, 0, -1))) {
 102       self::replace($word, 'y', 'i');
 103     }
 104
 105     return $word;
 106   }
 107
 108   /**
 109    * Step 2
 110    *
 111    * @param string $word Word to stem
 112    */
 113   private static function step2($word)
 114   {
 115     switch (substr($word, -2, 1)) {
 116       case 'a':
 117         self::replace($word, 'ational', 'ate', 0)
 118           OR self::replace($word, 'tional', 'tion', 0);
 119         break;
 120
 121       case 'c':
 122         self::replace($word, 'enci', 'ence', 0)
 123           OR self::replace($word, 'anci', 'ance', 0);
 124         break;
 125
 126       case 'e':
 127         self::replace($word, 'izer', 'ize', 0);
 128         break;
 129
 130       case 'g':
 131         self::replace($word, 'logi', 'log', 0);
 132         break;
 133
 134       case 'l':
 135         self::replace($word, 'entli', 'ent', 0)
 136           OR self::replace($word, 'ousli', 'ous', 0)
 137           OR self::replace($word, 'alli', 'al', 0)
 138           OR self::replace($word, 'bli', 'ble', 0)
 139           OR self::replace($word, 'eli', 'e', 0);
 140         break;
 141
 142       case 'o':
 143         self::replace($word, 'ization', 'ize', 0)
 144           OR self::replace($word, 'ation', 'ate', 0)
 145           OR self::replace($word, 'ator', 'ate', 0);
 146         break;
 147
 148       case 's':
 149         self::replace($word, 'iveness', 'ive', 0)
 150           OR self::replace($word, 'fulness', 'ful', 0)
 151           OR self::replace($word, 'ousness', 'ous', 0)
 152           OR self::replace($word, 'alism', 'al', 0);
 153         break;
 154
 155       case 't':
 156         self::replace($word, 'biliti', 'ble', 0)
 157           OR self::replace($word, 'aliti', 'al', 0)
 158           OR self::replace($word, 'iviti', 'ive', 0);
 159         break;
 160     }
 161
 162     return $word;
 163   }
 164
 165   /**
 166    * Step 3
 167    *
 168    * @param string $word String to stem
 169    */
 170   private static function step3($word)
 171   {
 172     switch (substr($word, -2, 1)) {
 173       case 'a':
 174         self::replace($word, 'ical', 'ic', 0);
 175         break;
 176
 177       case 's':
 178         self::replace($word, 'ness', '', 0);
 179         break;
 180
 181       case 't':
 182         self::replace($word, 'icate', 'ic', 0)
 183           OR self::replace($word, 'iciti', 'ic', 0);
 184         break;
 185
 186       case 'u':
 187         self::replace($word, 'ful', '', 0);
 188         break;
 189
 190       case 'v':
 191         self::replace($word, 'ative', '', 0);
 192         break;
 193
 194       case 'z':
 195         self::replace($word, 'alize', 'al', 0);
 196         break;
 197     }
 198
 199     return $word;
 200   }
 201
 202   /**
 203    * Step 4
 204    *
 205    * @param string $word Word to stem
 206    */
 207   private static function step4($word)
 208   {
 209     switch (substr($word, -2, 1)) {
 210       case 'a':
 211         self::replace($word, 'al', '', 1);
 212         break;
 213
 214       case 'c':
 215         self::replace($word, 'ance', '', 1)
 216           OR self::replace($word, 'ence', '', 1);
 217         break;
 218
 219       case 'e':
 220         self::replace($word, 'er', '', 1);
 221         break;
 222
 223       case 'i':
 224         self::replace($word, 'ic', '', 1);
 225         break;
 226
 227       case 'l':
 228         self::replace($word, 'able', '', 1)
 229           OR self::replace($word, 'ible', '', 1);
 230         break;
 231
 232       case 'n':
 233         self::replace($word, 'ant', '', 1)
 234           OR self::replace($word, 'ement', '', 1)
 235           OR self::replace($word, 'ment', '', 1)
 236           OR self::replace($word, 'ent', '', 1);
 237         break;
 238
 239       case 'o':
 240         if (substr($word, -4) == 'tion' OR substr($word, -4) == 'sion') {
 241           self::replace($word, 'ion', '', 1);
 242         } else {
 243           self::replace($word, 'ou', '', 1);
 244         }
 245         break;
 246
 247       case 's':
 248         self::replace($word, 'ism', '', 1);
 249         break;
 250
 251       case 't':
 252         self::replace($word, 'ate', '', 1)
 253           OR self::replace($word, 'iti', '', 1);
 254         break;
 255
 256       case 'u':
 257         self::replace($word, 'ous', '', 1);
 258         break;
 259
 260       case 'v':
 261         self::replace($word, 'ive', '', 1);
 262         break;
 263
 264       case 'z':
 265         self::replace($word, 'ize', '', 1);
 266         break;
 267     }
 268
 269     return $word;
 270   }
 271
 272   /**
 273    * Step 5
 274    *
 275    * @param string $word Word to stem
 276    */
 277   private static function step5($word)
 278   {
 279     // Part a
 280     if (substr($word, -1) == 'e') {
 281       if (self::m(substr($word, 0, -1)) > 1) {
 282         self::replace($word, 'e', '');
 283
 284       } else if (self::m(substr($word, 0, -1)) == 1) {
 285
 286         if (!self::cvc(substr($word, 0, -1))) {
 287           self::replace($word, 'e', '');
 288         }
 289       }
 290     }
 291
 292     // Part b
 293     if (self::m($word) > 1 AND
 294         self::doubleConsonant($word) AND substr($word, -1) == 'l') {
 295       $word = substr($word, 0, -1);
 296     }
 297
 298     return $word;
 299   }
 300
 301   /**
 302    * Replaces the first string with the second, at the end of the string. If third
 303    * arg is given, then the preceding string must match that m count at least.
 304    *
 305    * @param  string $str   String to check
 306    * @param  string $check Ending to check for
 307    * @param  string $repl  Replacement string
 308    * @param  int    $m     Optional minimum number of m() to meet
 309    * @return bool          Whether the $check string was at the end
 310    *                       of the $str string. True does not necessarily mean
 311    *                       that it was replaced.
 312    */
 313   private static function replace(&$str, $check, $repl, $m = null)
 314   {
 315     $len = 0 - strlen($check);
 316
 317     if (substr($str, $len) == $check) {
 318       $substr = substr($str, 0, $len);
 319       if (is_null($m) OR self::m($substr) > $m) {
 320         $str = $substr . $repl;
 321       }
 322
 323       return true;
 324     }
 325
 326     return false;
 327   }
 328
 329   /**
 330    * What, you mean it's not obvious from the name?
 331    *
 332    * m() measures the number of consonant sequences in $str. if c is
 333    * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
 334    * presence,
 335    *
 336    * <c><v>       gives 0
 337    * <c>vc<v>     gives 1
 338    * <c>vcvc<v>   gives 2
 339    * <c>vcvcvc<v> gives 3
 340    *
 341    * @param  string $str The string to return the m count for
 342    * @return int         The m count
 343    */
 344   private static function m($str)
 345   {
 346     $c = self::$regex_consonant;
 347     $v = self::$regex_vowel;
 348
 349     $str = preg_replace("#^$c+#", '', $str);
 350     $str = preg_replace("#$v+$#", '', $str);
 351
 352     preg_match_all("#($v+$c+)#", $str, $matches);
 353
 354     return count($matches[1]);
 355   }
 356
 357
 358   /**
 359    * Returns true/false as to whether the given string contains two
 360    * of the same consonant next to each other at the end of the string.
 361    *
 362    * @param  string $str String to check
 363    * @return bool        Result
 364    */
 365   private static function doubleConsonant($str)
 366   {
 367     $c = self::$regex_consonant;
 368
 369     return preg_match("#$c{2}$#", $str, $matches)
 370       AND $matches[0]{0} == $matches[0]{1};
 371   }
 372
 373
 374   /**
 375    * Checks for ending CVC sequence where second C is not W, X or Y
 376    *
 377    * @param  string $str String to check
 378    * @return bool        Result
 379    */
 380   private static function cvc($str)
 381   {
 382     $c = self::$regex_consonant;
 383     $v = self::$regex_vowel;
 384
 385     return     preg_match("#($c$v$c)$#", $str, $matches)
 386       AND strlen($matches[1]) == 3
 387       AND $matches[1]{2} != 'w'
 388       AND $matches[1]{2} != 'x'
 389       AND $matches[1]{2} != 'y';
 390   }
 391 }
 392
 393 class MTrackSearchStemmer extends
 394     Zend_Search_Lucene_Analysis_TokenFilter {
 395
 396   public function normalize(Zend_Search_Lucene_Analysis_Token $tok)
 397   {
 398     $text = $tok->getTermText();
 399     $text = PorterStemmer::Stem($text);
 400     $ntok = new Zend_Search_Lucene_Analysis_Token($text,
 401                   $tok->getStartOffset(),
 402                   $tok->getEndOffset());
 403     $ntok->setPositionIncrement($tok->getPositionIncrement());
 404     return $tok;
 405   }
 406 }
 407
/**
 * Marker subclass for tokens that were recognised as dates.
 * MTrackSearchAnalyzer::normalize() checks for this type and returns such
 * tokens without running them through the token filters.
 */
class MTrackSearchDateToken extends Zend_Search_Lucene_Analysis_Token {
}
 410
 411 class MTrackSearchAnalyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common
 412 {
 413   private $_position;
 414   private $_bytePosition;
 415   private $_moreTokens = array();
 416
 417   function reset()
 418   {
 419     $this->_position = 0;
 420     $this->_bytePosition = 0;
 421   }
 422
 423   function nextToken()
 424   {
 425     if (count($this->_moreTokens))  {
 426       $tok = array_shift($this->_moreTokens);
 427       return $tok;
 428     }
 429     if ($this->_input == null) {
 430       return null;
 431     }
 432
 433     do {
 434       /* first check for date fields */
 435
 436       $is_date = false;
 437       // 2008-12-22T05:42:42.285445Z
 438       if (preg_match('/\d{4}-\d\d-\d\d(?:T\d\d:\d\d:\d\d(?:\.\d+)?Z?)?/u',
 439           $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
 440         $is_date = true;
 441       } else if (!preg_match('/[\p{L}\p{N}_]+/u',
 442           $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
 443         return null;
 444       }
 445       if (!function_exists('mb_strtolower')) {
 446         $matchedWord = strtolower($match[0][0]);
 447       } else {
 448         $matchedWord = mb_strtolower($match[0][0], 'UTF-8');
 449       }
 450       $binStartPos = $match[0][1];
 451       $startPos = $this->_position +
 452           iconv_strlen(substr($this->_input, $this->_bytePosition,
 453             $binStartPos - $this->_bytePosition),
 454             'UTF-8');
 455       $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8');
 456       $this->_bytePosition = $binStartPos + strlen($matchedWord);
 457       $this->_position = $endPos;
 458
 459       if ($is_date) {
 460 //        $this->_moreTokens[] = new MTrackSearchDateToken($matchedWord,
 461 //          $startPos, $endPos);
 462
 463         /* Seems very difficult to allow range searching on strings
 464          * of the form "2009-10-10", so we just smush it together */
 465         $no_sep = str_replace(array('-', ':'), array('', ''), $matchedWord);
 466         list($no_sep) = explode('.', $no_sep);
 467
 468         /* full date and time */
 469 //        $this->_moreTokens[] = new MTrackSearchDateToken(
 470 //          $no_sep, $startPos, $endPos);
 471
 472         /* date only */
 473         $date = substr($no_sep, 0, 8);
 474         $this->_moreTokens[] = new MTrackSearchDateToken(
 475           $date, $startPos, $endPos);
 476       } else {
 477         $token = new Zend_Search_Lucene_Analysis_Token(
 478           $matchedWord, $startPos, $endPos);
 479         $token = $this->normalize($token);
 480         if ($token !== null) {
 481           $this->_moreTokens[] = $token;
 482         }
 483       }
 484       if (!$is_date) {
 485         /* split by underscores and add those tokens too */
 486         foreach (explode('_', $matchedWord) as $ele) {
 487           $token  = new Zend_Search_Lucene_Analysis_Token(
 488             $ele, $startPos, $endPos);
 489           $token = $this->normalize($token);
 490           if ($token !== null) {
 491             $this->_moreTokens[] = $token;
 492           }
 493         }
 494       }
 495     } while (count($this->_moreTokens) == 0);
 496     return array_shift($this->_moreTokens);
 497   }
 498
 499   function normalize(Zend_Search_Lucene_Analysis_Token $tok)
 500   {
 501     if ($tok instanceof MTrackSearchDateToken) {
 502       return $tok;
 503     }
 504     return parent::normalize($tok);
 505   }
 506 }
 507
 508 class MTrackSearchQueryParser {
 509   public $toks;
 510   public $syntax;
 511   public $query;
 512
 513   function __construct($q) {
 514     $this->toks = $this->tokenize($q);
 515     $this->alltoks = $this->toks;
 516 //    echo '<pre>', htmlentities(var_export($this->toks, true)), '</pre>';
 517
 518     $this->query = $this->expression();
 519   }
 520
 521   function tokenize($string)
 522   {
 523     $toks = array();
 524     while (strlen($string)) {
 525       if (preg_match("/^\s+/", $string, $M)) {
 526         $toks[] = array('white', $M[0]);
 527         $string = substr($string, strlen($M[0]));
 528         continue;
 529       }
 530       if (preg_match("/^[+!(){}^~*?:\\\[\]-]/", $string)) {
 531         $toks[] = array($string[0]);
 532         $string = substr($string, 1);
 533         continue;
 534       }
 535       if (!strncmp($string, "&&", 2)) {
 536         $toks[] = array("&&");
 537         $string = substr($string, 2);
 538         continue;
 539       }
 540       if (preg_match("/^and\W/i", $string, $M)) {
 541         $toks[] = array("&&", $M[0]);
 542         $string = substr($string, 3);
 543         continue;
 544       }
 545       if (preg_match("/^not\W/i", $string, $M)) {
 546         $toks[] = array("!", $M[0]);
 547         $string = substr($string, 3);
 548         continue;
 549       }
 550       if (!strncmp($string, "||", 2)) {
 551         $toks[] = array("||");
 552         $string = substr($string, 2);
 553         continue;
 554       }
 555       if (preg_match("/^or\W/i", $string, $M)) {
 556         $toks[] = array("||", $M[0]);
 557         $string = substr($string, 2);
 558         continue;
 559       }
 560       if (preg_match('/^"([^"]*)"/', $string, $M)) {
 561         $toks[] = array('literal', $M[1]);
 562         $string = substr($string, strlen($M[0]));
 563         continue;
 564       }
 565       if (preg_match("/^[a-zA-Z0-9_][a-zA-Z0-9_.+-]*/", $string, $M)) {
 566         $toks[] = array('literal', $M[0]);
 567         $string = substr($string, strlen($M[0]));
 568         continue;
 569       }
 570       $string = trim($string);
 571       if (strlen($string)) {
 572         echo "Invalid search string: <b>" . htmlentities($string) . "</b>";
 573         break;
 574       }
 575     }
 576     return $toks;
 577   }
 578
 579   function get()
 580   {
 581     if (count($this->toks) == 0) {
 582       return null;
 583     }
 584     $t = array_shift($this->toks);
 585     $args = func_get_args();
 586     if (count($args)) {
 587       $ok = false;
 588       $expected = array();
 589       foreach ($args as $expect) {
 590         if ($t[0] == $expect) {
 591           $ok = true;
 592           break;
 593         }
 594         $expected[] = $expect;
 595       }
 596       if (!$ok) {
 597         $name = $t[0];
 598         $value = isset($t[1]) ? $t[1] : $t[0];
 599         $ntoks = count($this->alltoks);
 600         $rtoks = count($this->toks);
 601         $hint = '';
 602         for ($i = 0; $i < $rtoks; $i++) {
 603           $hint .= htmlentities($this->alltoks[$i][1], ENT_QUOTES, 'utf-8');
 604         }
 605         $hint .= "<b>$value</b>";
 606         foreach ($this->toks as $tok) {
 607           $hint .= htmlentities($tok[1]);
 608         }
 609         throw new Exception(
 610           "Unexpected token '$value' of type $name expected " .
 611           join(', ', $expected) . "<br>$hint");
 612       }
 613     }
 614     return $t;
 615   }
 616
 617   function peek()
 618   {
 619     if (!count($this->toks)) {
 620       return null;
 621     }
 622     $t = $this->toks[0];
 623     $args = func_get_args();
 624     if (count($args)) {
 625       $ok = false;
 626       foreach ($args as $expect) {
 627         if ($t[0] == $expect) {
 628           $ok = true;
 629           break;
 630         }
 631       }
 632       if (!$ok) {
 633         return false;
 634       }
 635     }
 636     return $t;
 637   }
 638
 639   function try_rule($name) {
 640     $save = $this->toks;
 641     try {
 642       return $this->$name();
 643     } catch (Exception $e) {
 644       $this->toks = $save;
 645       return false;
 646     }
 647   }
 648
 649   function _make_term($t, $field = null)
 650   {
 651     if (function_exists('mb_strtolower')) {
 652       $t[1] = mb_strtolower($t[1], 'UTF-8');
 653     } else {
 654       $t[1] = strtolower($t[1]);
 655     }
 656     if ($t[0] == 'literal') {
 657       $bits = preg_split("/\s+/u", $t[1]);
 658
 659       /* only treat it as a phrase if it is a phrase */
 660       if (count($bits) > 1) {
 661         $q = new Zend_Search_Lucene_Search_Query_Phrase;
 662
 663         foreach ($bits as $w) {
 664           $t = new Zend_Search_Lucene_Index_Term($w, $field);
 665           $q->addTerm($t);
 666         }
 667         return $q;
 668       }
 669     }
 670
 671     /* underscores and periods!
 672      * if we're searching for text delimited by underscores, we
 673      * rewrite that as a phrase search also */
 674     $bits = preg_split("/[._]/", $t[1]);
 675     if (count($bits) > 1) {
 676       $q = new Zend_Search_Lucene_Search_Query_Phrase;
 677
 678       foreach ($bits as $w) {
 679         $t = new Zend_Search_Lucene_Index_Term($w, $field);
 680         $q->addTerm($t);
 681       }
 682       return $q;
 683     }
 684
 685     return new Zend_Search_Lucene_Index_Term((string)$t[1], $field);
 686   }
 687
 688   function term()
 689   {
 690     if ($this->peek('literal')) {
 691       $t = $this->get();
 692       if ($this->peek(':')) {
 693         /* specific field */
 694         $field = $t[1];
 695         $this->get();
 696
 697         /* does it have a range? */
 698         if ($this->peek('[')) {
 699           $this->get();
 700
 701           $this->skipwhite();
 702
 703           $from = $this->get('literal');
 704           $from = $this->_make_term($from, $field);
 705
 706           $this->skipwhite();
 707           $t = $this->get('literal');
 708           if (strcasecmp($t[1], 'to')) {
 709             throw new Exception("Expected 'to'");
 710           }
 711           $this->skipwhite();
 712
 713           $to = $this->get('literal');
 714           $to = $this->_make_term($to, $field);
 715
 716           $q = new Zend_Search_Lucene_Search_Query_Range(
 717                 $from, $to, true);
 718           $this->skipwhite();
 719
 720           $this->get(']');
 721
 722           return $q;
 723         }
 724
 725         $t = $this->get('literal');
 726
 727         return $this->_make_term($t, $field);
 728       }
 729     } else {
 730       $t = $this->get('literal');
 731     }
 732
 733     if ($t) {
 734       return $this->_make_term($t);
 735     }
 736     return null;
 737   }
 738
 739   function skipwhite()
 740   {
 741     while ($this->peek('white')) {
 742       $this->get();
 743     }
 744   }
 745
 746   function expression()
 747   {
 748     $terms = array();
 749
 750     while (count($this->toks)) {
 751       $modifier = null;
 752
 753       $this->skipwhite();
 754
 755       if ($this->peek('+')) {
 756         $this->get();
 757         $modifier = true;
 758       }
 759       if ($this->peek('-')) {
 760         $this->get();
 761         $modifier = false;
 762       }
 763       if ($modifier === null) {
 764         $modifier = true;
 765       }
 766
 767       $t = $this->term();
 768       if ($t) {
 769         $terms[] = array($t, $modifier);
 770       } else {
 771         break;
 772       }
 773     }
 774
 775     if (count($terms) == 0) {
 776       return null;
 777     }
 778
 779     if (count($terms) == 1) {
 780       if ($terms[0][0] instanceof Zend_Search_Lucene_Search_Query) {
 781         if ($terms[0][1] === null) {
 782           return $terms[0][0];
 783         }
 784       }
 785     }
 786
 787     $q = new Zend_Search_Lucene_Search_Query_Boolean();
 788     foreach ($terms as $term) {
 789       list($t, $mod) = $term;
 790
 791       if ($t instanceof Zend_Search_Lucene_Search_Query) {
 792         $q->addSubquery($t, $mod);
 793       } else {
 794         $sq = new Zend_Search_Lucene_Search_Query_MultiTerm;
 795         $sq->addTerm($t);
 796         $q->addSubquery($sq, $mod);
 797       }
 798     }
 799
 800     return $q;
 801   }
 802 }
 803
 804 /* the highlighter insists on using html document things,
 805  * so we force in our own dummy so that we can present the
 806  * same text we used initially */
 807 class MTrackSearchLuceneDummyDocument {
 808   public $text;
 809   function __construct($text) {
 810     $this->text = $text;
 811   }
 812   function getFieldUtf8Value($name) {
 813     return $this->text;
 814   }
 815 }
 816
 817 class MTrackHLText
 818     implements Zend_Search_Lucene_Search_Highlighter_Interface {
 819   public $doc;
 820   public $context = array();
 821   public $text;
 822   public $matched = array();
 823
 824   function setDocument(Zend_Search_Lucene_Document_Html $doc)
 825   {
 826     /* sure, I'll get right on that... */
 827   }
 828
 829   function getDocument() {
 830     /* we just return our dummy doc instead */
 831     return $this->doc;
 832   }
 833
 834   function highlight($words) {
 835     if (!is_array($words)) {
 836       $words = array($words);
 837     }
 838     foreach ($words as $word) {
 839       foreach ($this->text as $line) {
 840         $x = strpos($line, $word);
 841         if ($x !== false) {
 842           if (isset($this->matched[$word])) {
 843             $this->matched[$word]++;
 844           } else {
 845             $this->matched[$word] = 1;
 846           }
 847           if (isset($this->context[$line])) {
 848             $this->context[$line]++;
 849           } else {
 850             $this->context[$line] = 1;
 851           }
 852         }
 853       }
 854     }
 855   }
 856
 857   function __construct($text, $query)
 858   {
 859     $this->doc = new MTrackSearchLuceneDummyDocument($text);
 860     $text = wordwrap($text);
 861     $this->text = preg_split("/\r?\n/", $text);
 862     $query->htmlFragmenthighlightMatches($text, 'utf-8', $this);
 863   }
 864 }
 865
 866 class MTrackSearchResultLucene extends MTrackSearchResult {
 867   var $_query;
 868
 869   function getExcerpt($text) {
 870     $hl = new MTrackHLText($text, $this->_query);
 871     $lines = array();
 872     foreach ($hl->context as $line => $count) {
 873       $line = trim($line);
 874       if (!strlen($line)) continue;
 875       $line = htmlentities($line, ENT_QUOTES, 'utf-8');
 876       foreach ($hl->matched as $word => $wcount) {
 877         $line = str_replace($word, "<span class='hl'>$word</span>", $line);
 878       }
 879       $lines[] = $line;
 880       if (count($lines) > 6) {
 881         break;
 882       }
 883     }
 884     $ex = join(" &hellip; ", $lines);
 885     if (strlen($ex)) {
 886       return "<div class='excerpt'>$ex</div>";
 887     }
 888     return '';
 889   }
 890 }
 891
 892 class MTrackSearchEngineLucene implements IMTrackSearchEngine
 893 {
 894   var $idx = null;
 895
 896   function getIdx() {
 897     if ($this->idx) return $this->idx;
 898     $ana = new MTrackSearchAnalyzer;
 899     $ana->addFilter(new MTrackSearchStemmer);
 900     Zend_Search_Lucene_Analysis_Analyzer::setDefault($ana);
 901
 902     $p = MTrackConfig::get('core', 'searchdb');
 903     if (!is_dir($p)) {
 904       $idx = Zend_Search_Lucene::create($p);
 905       chmod($p, 0777);
 906     } else {
 907       $idx = Zend_Search_Lucene::open($p);
 908     }
 909     $this->index = $idx;
 910     return $idx;
 911   }
 912
 913   public function setBatchMode()
 914   {
 915     $idx = $this->getIdx();
 916     $idx->setMaxBufferedDocs(64);
 917     $idx->setMergeFactor(15);
 918   }
 919
 920   public function commit($optimize = false)
 921   {
 922     $idx = $this->getIdx();
 923     if ($optimize) {
 924       $idx->optimize();
 925     }
 926     $idx->commit();
 927     $this->idx = null;
 928   }
 929
 930   public function add($object, $fields, $replace = false)
 931   {
 932     $idx = $this->getIdx();
 933
 934     if ($replace) {
 935       $term = new Zend_Search_Lucene_Index_Term($object, 'object');
 936       foreach ($idx->termDocs($term) as $id) {
 937         $idx->delete($id);
 938       }
 939     }
 940
 941     $doc = new Zend_Search_Lucene_Document();
 942
 943     $doc->addField(Zend_Search_Lucene_Field::Text('object', $object, 'utf-8'));
 944     foreach ($fields as $key => $value) {
 945       if (!strlen($value)) continue;
 946       if (!strncmp($key, 'stored:', 7)) {
 947         $key = substr($key, 7);
 948         $F = Zend_Search_Lucene_Field::Text($key, $value, 'utf-8');
 949       } else {
 950         $F = Zend_Search_Lucene_Field::UnStored($key, $value, 'utf-8');
 951       }
 952       $doc->addField($F);
 953     }
 954
 955     $idx->addDocument($doc);
 956   }
 957
 958   public function search($query) {
 959     Zend_Search_Lucene::setTermsPerQueryLimit(150);
 960     Zend_Search_Lucene::setResultSetLimit(250);
 961
 962     $p = new MTrackSearchQueryParser($query);
 963     $q = $p->query;
 964     $idx = $this->getIdx();
 965     $hits = $idx->find($q);
 966     $result = array();
 967     foreach ($hits as $hit) {
 968       $r = new MTrackSearchResultLucene;
 969       $r->_query = $q;
 970       $r->objectid = $hit->object;
 971       $r->score = $hit->score;
 972       $result[] = $r;
 973     }
 974     return $result;
 975   }
 976
 977
 978 }
 979
 980