git.roojs.org Git - web.mtrack/blob - inc/lib/Zend/Search/Lucene/Search/Query/Preprocessing/Term.php

   1 <?php\r
   2 /**\r
   3  * Zend Framework\r
   4  *\r
   5  * LICENSE\r
   6  *\r
   7  * This source file is subject to the new BSD license that is bundled\r
   8  * with this package in the file LICENSE.txt.\r
   9  * It is also available through the world-wide-web at this URL:\r
  10  * http://framework.zend.com/license/new-bsd\r
  11  * If you did not receive a copy of the license and are unable to\r
  12  * obtain it through the world-wide-web, please send an email\r
  13  * to license@zend.com so we can send you a copy immediately.\r
  14  *\r
  15  * @category   Zend\r
  16  * @package    Zend_Search_Lucene\r
  17  * @subpackage Search\r
  18  * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)\r
  19  * @license    http://framework.zend.com/license/new-bsd     New BSD License\r
  20  * @version    $Id: Term.php 16971 2009-07-22 18:05:45Z mikaelkael $\r
  21  */\r
  22 \r
  23 \r
/** Zend_Search_Lucene_Search_Query_Preprocessing */
  25 require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';\r
  26 \r
  27 /** Zend_Search_Lucene_Search_Query_Phrase */\r
  28 require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';\r
  29 \r
  30 /** Zend_Search_Lucene_Search_Query_Insignificant */\r
  31 require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';\r
  32 \r
  33 /** Zend_Search_Lucene_Search_Query_Empty */\r
  34 require_once 'Zend/Search/Lucene/Search/Query/Empty.php';\r
  35 \r
  36 /** Zend_Search_Lucene_Search_Query_Term */\r
  37 require_once 'Zend/Search/Lucene/Search/Query/Term.php';\r
  38 \r
  39 /** Zend_Search_Lucene_Index_Term */\r
  40 require_once 'Zend/Search/Lucene/Index/Term.php';\r
  41 \r
  42 \r
  43 /**\r
 * It's an internal class intended to finalize query processing after query parsing.
  45  * This type of query is not actually involved into query execution.\r
  46  *\r
  47  * @category   Zend\r
  48  * @package    Zend_Search_Lucene\r
  49  * @subpackage Search\r
  50  * @internal\r
  51  * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)\r
  52  * @license    http://framework.zend.com/license/new-bsd     New BSD License\r
  53  */\r
  54 class Zend_Search_Lucene_Search_Query_Preprocessing_Term extends Zend_Search_Lucene_Search_Query_Preprocessing\r
  55 {\r
  56     /**\r
  57      * word (query parser lexeme) to find.\r
  58      *\r
  59      * @var string\r
  60      */\r
  61     private $_word;\r
  62 \r
  63     /**\r
  64      * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).\r
  65      *\r
  66      * @var string\r
  67      */\r
  68     private $_encoding;\r
  69 \r
  70 \r
  71     /**\r
  72      * Field name.\r
  73      *\r
  74      * @var string\r
  75      */\r
  76     private $_field;\r
  77 \r
  78     /**\r
  79      * Class constructor.  Create a new preprocessing object for prase query.\r
  80      *\r
  81      * @param string $word       Non-tokenized word (query parser lexeme) to search.\r
  82      * @param string $encoding   Word encoding.\r
  83      * @param string $fieldName  Field name.\r
  84      */\r
  85     public function __construct($word, $encoding, $fieldName)\r
  86     {\r
  87         $this->_word     = $word;\r
  88         $this->_encoding = $encoding;\r
  89         $this->_field    = $fieldName;\r
  90     }\r
  91 \r
  92     /**\r
  93      * Re-write query into primitive queries in the context of specified index\r
  94      *\r
  95      * @param Zend_Search_Lucene_Interface $index\r
  96      * @return Zend_Search_Lucene_Search_Query\r
  97      */\r
  98     public function rewrite(Zend_Search_Lucene_Interface $index)\r
  99     {\r
 100         if ($this->_field === null) {\r
 101             $query = new Zend_Search_Lucene_Search_Query_MultiTerm();\r
 102             $query->setBoost($this->getBoost());\r
 103 \r
 104             $hasInsignificantSubqueries = false;\r
 105 \r
 106             if (Zend_Search_Lucene::getDefaultSearchField() === null) {\r
 107                 $searchFields = $index->getFieldNames(true);\r
 108             } else {\r
 109                 $searchFields = array(Zend_Search_Lucene::getDefaultSearchField());\r
 110             }\r
 111 \r
 112             foreach ($searchFields as $fieldName) {\r
 113                 $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Term($this->_word,\r
 114                                                                                    $this->_encoding,\r
 115                                                                                    $fieldName);\r
 116                 $rewrittenSubquery = $subquery->rewrite($index);\r
 117                 foreach ($rewrittenSubquery->getQueryTerms() as $term) {\r
 118                         $query->addTerm($term);\r
 119                 }\r
 120 \r
 121                 if ($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {\r
 122                         $hasInsignificantSubqueries = true;\r
 123                 }\r
 124             }\r
 125 \r
 126             if (count($query->getTerms()) == 0) {\r
 127                 $this->_matches = array();\r
 128                 if ($hasInsignificantSubqueries) {\r
 129                         return new Zend_Search_Lucene_Search_Query_Insignificant();\r
 130                 } else {\r
 131                         return new Zend_Search_Lucene_Search_Query_Empty();\r
 132                 }\r
 133             }\r
 134 \r
 135             $this->_matches = $query->getQueryTerms();\r
 136             return $query;\r
 137         }\r
 138 \r
 139         // -------------------------------------\r
 140         // Recognize exact term matching (it corresponds to Keyword fields stored in the index)\r
 141         // encoding is not used since we expect binary matching\r
 142         $term = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);\r
 143         if ($index->hasTerm($term)) {\r
 144             $query = new Zend_Search_Lucene_Search_Query_Term($term);\r
 145             $query->setBoost($this->getBoost());\r
 146 \r
 147             $this->_matches = $query->getQueryTerms();\r
 148             return $query;\r
 149         }\r
 150 \r
 151 \r
 152         // -------------------------------------\r
 153         // Recognize wildcard queries\r
 154 \r
 155         /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */\r
 156         if (@preg_match('/\pL/u', 'a') == 1) {\r
 157                 $word = iconv($this->_encoding, 'UTF-8', $this->_word);\r
 158                 $wildcardsPattern = '/[*?]/u';\r
 159                 $subPatternsEncoding = 'UTF-8';\r
 160         } else {\r
 161                 $word = $this->_word;\r
 162                 $wildcardsPattern = '/[*?]/';\r
 163             $subPatternsEncoding = $this->_encoding;\r
 164         }\r
 165 \r
 166         $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);\r
 167 \r
 168         if (count($subPatterns) > 1) {\r
 169                 // Wildcard query is recognized\r
 170 \r
 171                 $pattern = '';\r
 172 \r
 173             foreach ($subPatterns as $id => $subPattern) {\r
 174                 // Append corresponding wildcard character to the pattern before each sub-pattern (except first)\r
 175                 if ($id != 0) {\r
 176                         $pattern .= $word[ $subPattern[1] - 1 ];\r
 177                 }\r
 178 \r
 179                 // Check if each subputtern is a single word in terms of current analyzer\r
 180                 $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding);\r
 181                 if (count($tokens) > 1) {\r
 182                     require_once 'Zend/Search/Lucene/Search/QueryParserException.php';\r
 183                     throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard search is supported only for non-multiple word terms');\r
 184                 }\r
 185                 foreach ($tokens as $token) {\r
 186                     $pattern .= $token->getTermText();\r
 187                 }\r
 188             }\r
 189 \r
 190             $term  = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);\r
 191             $query = new Zend_Search_Lucene_Search_Query_Wildcard($term);\r
 192             $query->setBoost($this->getBoost());\r
 193 \r
 194             // Get rewritten query. Important! It also fills terms matching container.\r
 195             $rewrittenQuery = $query->rewrite($index);\r
 196             $this->_matches = $query->getQueryTerms();\r
 197 \r
 198             return $rewrittenQuery;\r
 199         }\r
 200 \r
 201 \r
 202         // -------------------------------------\r
 203         // Recognize one-term multi-term and "insignificant" queries\r
 204         $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);\r
 205 \r
 206         if (count($tokens) == 0) {\r
 207                 $this->_matches = array();\r
 208             return new Zend_Search_Lucene_Search_Query_Insignificant();\r
 209         }\r
 210 \r
 211         if (count($tokens) == 1) {\r
 212             $term  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);\r
 213             $query = new Zend_Search_Lucene_Search_Query_Term($term);\r
 214             $query->setBoost($this->getBoost());\r
 215 \r
 216             $this->_matches = $query->getQueryTerms();\r
 217             return $query;\r
 218         }\r
 219 \r
 220         //It's not insignificant or one term query\r
 221         $query = new Zend_Search_Lucene_Search_Query_MultiTerm();\r
 222 \r
 223         /**\r
 224          * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other\r
 225          * analizer design features\r
 226          */\r
 227         foreach ($tokens as $token) {\r
 228             $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);\r
 229             $query->addTerm($term, true); // all subterms are required\r
 230         }\r
 231 \r
 232         $query->setBoost($this->getBoost());\r
 233 \r
 234         $this->_matches = $query->getQueryTerms();\r
 235         return $query;\r
 236     }\r
 237 \r
 238     /**\r
 239      * Query specific matches highlighting\r
 240      *\r
 241      * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)\r
 242      */\r
 243     protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)\r
 244     {\r
 245         /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */\r
 246 \r
 247         /** Skip exact term matching recognition, keyword fields highlighting is not supported */\r
 248 \r
 249         // -------------------------------------\r
 250         // Recognize wildcard queries\r
 251         /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */\r
 252         if (@preg_match('/\pL/u', 'a') == 1) {\r
 253             $word = iconv($this->_encoding, 'UTF-8', $this->_word);\r
 254             $wildcardsPattern = '/[*?]/u';\r
 255             $subPatternsEncoding = 'UTF-8';\r
 256         } else {\r
 257             $word = $this->_word;\r
 258             $wildcardsPattern = '/[*?]/';\r
 259             $subPatternsEncoding = $this->_encoding;\r
 260         }\r
 261         $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);\r
 262         if (count($subPatterns) > 1) {\r
 263             // Wildcard query is recognized\r
 264 \r
 265             $pattern = '';\r
 266 \r
 267             foreach ($subPatterns as $id => $subPattern) {\r
 268                 // Append corresponding wildcard character to the pattern before each sub-pattern (except first)\r
 269                 if ($id != 0) {\r
 270                     $pattern .= $word[ $subPattern[1] - 1 ];\r
 271                 }\r
 272 \r
 273                 // Check if each subputtern is a single word in terms of current analyzer\r
 274                 $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding);\r
 275                 if (count($tokens) > 1) {\r
 276                         // Do nothing (nothing is highlighted)\r
 277                     return;\r
 278                 }\r
 279                 foreach ($tokens as $token) {\r
 280                     $pattern .= $token->getTermText();\r
 281                 }\r
 282             }\r
 283 \r
 284             $term  = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);\r
 285             $query = new Zend_Search_Lucene_Search_Query_Wildcard($term);\r
 286 \r
 287             $query->_highlightMatches($highlighter);\r
 288             return;\r
 289         }\r
 290 \r
 291         // -------------------------------------\r
 292         // Recognize one-term multi-term and "insignificant" queries\r
 293         $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);\r
 294 \r
 295         if (count($tokens) == 0) {\r
 296             // Do nothing\r
 297             return;\r
 298         }\r
 299 \r
 300         if (count($tokens) == 1) {\r
 301             $highlighter->highlight($tokens[0]->getTermText());\r
 302             return;\r
 303         }\r
 304 \r
 305         //It's not insignificant or one term query\r
 306         $words = array();\r
 307         foreach ($tokens as $token) {\r
 308             $words[] = $token->getTermText();\r
 309         }\r
 310         $highlighter->highlight($words);\r
 311     }\r
 312 \r
 313     /**\r
 314      * Print a query\r
 315      *\r
 316      * @return string\r
 317      */\r
 318     public function __toString()\r
 319     {\r
 320         // It's used only for query visualisation, so we don't care about characters escaping\r
 321         if ($this->_field !== null) {\r
 322             $query = $this->_field . ':';\r
 323         } else {\r
 324             $query = '';\r
 325         }\r
 326 \r
 327         $query .= $this->_word;\r
 328 \r
 329         if ($this->getBoost() != 1) {\r
 330             $query .= '^' . round($this->getBoost(), 4);\r
 331         }\r
 332 \r
 333         return $query;\r
 334     }\r
 335 }\r