git.roojs.org Git - web.mtrack/blob - inc/lib/Zend/Search/Lucene/Search/Query/Preprocessing/Fuzzy.php

   1 <?php\r
   2 /**\r
   3  * Zend Framework\r
   4  *\r
   5  * LICENSE\r
   6  *\r
   7  * This source file is subject to the new BSD license that is bundled\r
   8  * with this package in the file LICENSE.txt.\r
   9  * It is also available through the world-wide-web at this URL:\r
  10  * http://framework.zend.com/license/new-bsd\r
  11  * If you did not receive a copy of the license and are unable to\r
  12  * obtain it through the world-wide-web, please send an email\r
  13  * to license@zend.com so we can send you a copy immediately.\r
  14  *\r
  15  * @category   Zend\r
  16  * @package    Zend_Search_Lucene\r
  17  * @subpackage Search\r
  18  * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)\r
  19  * @license    http://framework.zend.com/license/new-bsd     New BSD License\r
  20  * @version    $Id: Fuzzy.php 16971 2009-07-22 18:05:45Z mikaelkael $\r
  21  */\r
  22 \r
  23 \r
  24 /** Zend_Search_Lucene_Search_Query_Preprocessing */\r
  25 require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';\r
  26 \r
  27 /** Zend_Search_Lucene_Search_Query_Phrase */\r
  28 require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';\r
  29 \r
  30 /** Zend_Search_Lucene_Search_Query_Insignificant */\r
  31 require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';\r
  32 \r
  33 /** Zend_Search_Lucene_Search_Query_Empty */\r
  34 require_once 'Zend/Search/Lucene/Search/Query/Empty.php';\r
  35 \r
  36 /** Zend_Search_Lucene_Search_Query_Term */\r
  37 require_once 'Zend/Search/Lucene/Search/Query/Term.php';\r
  38 \r
  39 /** Zend_Search_Lucene_Index_Term */\r
  40 require_once 'Zend/Search/Lucene/Index/Term.php';\r
  41 \r
  42 \r
  43 /**\r
  44  * It's an internal abstract class intended to finalize ase a query processing after query parsing.\r
  45  * This type of query is not actually involved into query execution.\r
  46  *\r
  47  * @category   Zend\r
  48  * @package    Zend_Search_Lucene\r
  49  * @subpackage Search\r
  50  * @internal\r
  51  * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)\r
  52  * @license    http://framework.zend.com/license/new-bsd     New BSD License\r
  53  */\r
  54 class Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy extends Zend_Search_Lucene_Search_Query_Preprocessing\r
  55 {\r
  56     /**\r
  57      * word (query parser lexeme) to find.\r
  58      *\r
  59      * @var string\r
  60      */\r
  61     private $_word;\r
  62 \r
  63     /**\r
  64      * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).\r
  65      *\r
  66      * @var string\r
  67      */\r
  68     private $_encoding;\r
  69 \r
  70 \r
  71     /**\r
  72      * Field name.\r
  73      *\r
  74      * @var string\r
  75      */\r
  76     private $_field;\r
  77 \r
  78     /**\r
  79      * A value between 0 and 1 to set the required similarity\r
  80      *  between the query term and the matching terms. For example, for a\r
  81      *  _minimumSimilarity of 0.5 a term of the same length\r
  82      *  as the query term is considered similar to the query term if the edit distance\r
  83      *  between both terms is less than length(term)*0.5\r
  84      *\r
  85      * @var float\r
  86      */\r
  87     private $_minimumSimilarity;\r
  88 \r
  89     /**\r
  90      * Class constructor.  Create a new preprocessing object for prase query.\r
  91      *\r
  92      * @param string $word       Non-tokenized word (query parser lexeme) to search.\r
  93      * @param string $encoding   Word encoding.\r
  94      * @param string $fieldName  Field name.\r
  95      * @param float  $minimumSimilarity minimum similarity\r
  96      */\r
  97     public function __construct($word, $encoding, $fieldName, $minimumSimilarity)\r
  98     {\r
  99         $this->_word     = $word;\r
 100         $this->_encoding = $encoding;\r
 101         $this->_field    = $fieldName;\r
 102         $this->_minimumSimilarity = $minimumSimilarity;\r
 103     }\r
 104 \r
 105     /**\r
 106      * Re-write query into primitive queries in the context of specified index\r
 107      *\r
 108      * @param Zend_Search_Lucene_Interface $index\r
 109      * @return Zend_Search_Lucene_Search_Query\r
 110      */\r
 111     public function rewrite(Zend_Search_Lucene_Interface $index)\r
 112     {\r
 113         if ($this->_field === null) {\r
 114             $query = new Zend_Search_Lucene_Search_Query_Boolean();\r
 115 \r
 116             $hasInsignificantSubqueries = false;\r
 117 \r
 118             if (Zend_Search_Lucene::getDefaultSearchField() === null) {\r
 119                 $searchFields = $index->getFieldNames(true);\r
 120             } else {\r
 121                 $searchFields = array(Zend_Search_Lucene::getDefaultSearchField());\r
 122             }\r
 123 \r
 124             foreach ($searchFields as $fieldName) {\r
 125                 $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy($this->_word,\r
 126                                                                                     $this->_encoding,\r
 127                                                                                     $fieldName,\r
 128                                                                                     $this->_minimumSimilarity);\r
 129 \r
 130                 $rewrittenSubquery = $subquery->rewrite($index);\r
 131 \r
 132                 if ( !($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Insignificant  ||\r
 133                        $rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Empty) ) {\r
 134                     $query->addSubquery($rewrittenSubquery);\r
 135                 }\r
 136 \r
 137                 if ($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {\r
 138                         $hasInsignificantSubqueries = true;\r
 139                 }\r
 140             }\r
 141 \r
 142             $subqueries = $query->getSubqueries();\r
 143 \r
 144             if (count($subqueries) == 0) {\r
 145                 $this->_matches = array();\r
 146                 if ($hasInsignificantSubqueries) {\r
 147                     return new Zend_Search_Lucene_Search_Query_Insignificant();\r
 148                 } else {\r
 149                     return new Zend_Search_Lucene_Search_Query_Empty();\r
 150                 }\r
 151             }\r
 152 \r
 153             if (count($subqueries) == 1) {\r
 154                 $query = reset($subqueries);\r
 155             }\r
 156 \r
 157             $query->setBoost($this->getBoost());\r
 158 \r
 159             $this->_matches = $query->getQueryTerms();\r
 160             return $query;\r
 161         }\r
 162 \r
 163         // -------------------------------------\r
 164         // Recognize exact term matching (it corresponds to Keyword fields stored in the index)\r
 165         // encoding is not used since we expect binary matching\r
 166         $term = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);\r
 167         if ($index->hasTerm($term)) {\r
 168             $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);\r
 169             $query->setBoost($this->getBoost());\r
 170 \r
 171             // Get rewritten query. Important! It also fills terms matching container.\r
 172             $rewrittenQuery = $query->rewrite($index);\r
 173             $this->_matches = $query->getQueryTerms();\r
 174 \r
 175             return $rewrittenQuery;\r
 176         }\r
 177 \r
 178 \r
 179         // -------------------------------------\r
 180         // Recognize wildcard queries\r
 181 \r
 182         /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */\r
 183         if (@preg_match('/\pL/u', 'a') == 1) {\r
 184                 $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));\r
 185         } else {\r
 186                 $subPatterns = preg_split('/[*?]/', $this->_word);\r
 187         }\r
 188         if (count($subPatterns) > 1) {\r
 189             require_once 'Zend/Search/Lucene/Search/QueryParserException.php';\r
 190             throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search doesn\'t support wildcards (except within Keyword fields).');\r
 191         }\r
 192 \r
 193 \r
 194         // -------------------------------------\r
 195         // Recognize one-term multi-term and "insignificant" queries\r
 196         $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);\r
 197 \r
 198         if (count($tokens) == 0) {\r
 199                 $this->_matches = array();\r
 200             return new Zend_Search_Lucene_Search_Query_Insignificant();\r
 201         }\r
 202 \r
 203         if (count($tokens) == 1) {\r
 204             $term  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);\r
 205             $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);\r
 206             $query->setBoost($this->getBoost());\r
 207 \r
 208             // Get rewritten query. Important! It also fills terms matching container.\r
 209             $rewrittenQuery = $query->rewrite($index);\r
 210             $this->_matches = $query->getQueryTerms();\r
 211 \r
 212             return $rewrittenQuery;\r
 213         }\r
 214 \r
 215         // Word is tokenized into several tokens\r
 216         require_once 'Zend/Search/Lucene/Search/QueryParserException.php';\r
 217         throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');\r
 218     }\r
 219 \r
 220     /**\r
 221      * Query specific matches highlighting\r
 222      *\r
 223      * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)\r
 224      */\r
 225     protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)\r
 226     {\r
 227         /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */\r
 228 \r
 229         /** Skip exact term matching recognition, keyword fields highlighting is not supported */\r
 230 \r
 231         // -------------------------------------\r
 232         // Recognize wildcard queries\r
 233 \r
 234         /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */\r
 235         if (@preg_match('/\pL/u', 'a') == 1) {\r
 236             $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));\r
 237         } else {\r
 238             $subPatterns = preg_split('/[*?]/', $this->_word);\r
 239         }\r
 240         if (count($subPatterns) > 1) {\r
 241             // Do nothing\r
 242             return;\r
 243         }\r
 244 \r
 245         // -------------------------------------\r
 246         // Recognize one-term multi-term and "insignificant" queries\r
 247         $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);\r
 248         if (count($tokens) == 0) {\r
 249             // Do nothing\r
 250             return;\r
 251         }\r
 252         if (count($tokens) == 1) {\r
 253             $term  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);\r
 254             $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);\r
 255 \r
 256             $query->_highlightMatches($highlighter);\r
 257             return;\r
 258         }\r
 259 \r
 260         // Word is tokenized into several tokens\r
 261         // But fuzzy search is supported only for non-multiple word terms\r
 262         // Do nothing\r
 263     }\r
 264 \r
 265     /**\r
 266      * Print a query\r
 267      *\r
 268      * @return string\r
 269      */\r
 270     public function __toString()\r
 271     {\r
 272         // It's used only for query visualisation, so we don't care about characters escaping\r
 273         if ($this->_field !== null) {\r
 274             $query = $this->_field . ':';\r
 275         } else {\r
 276             $query = '';\r
 277         }\r
 278 \r
 279         $query .= $this->_word;\r
 280 \r
 281         if ($this->getBoost() != 1) {\r
 282             $query .= '^' . round($this->getBoost(), 4);\r
 283         }\r
 284 \r
 285         return $query;\r
 286     }\r
 287 }\r