7 * This source file is subject to the new BSD license that is bundled
\r
8 * with this package in the file LICENSE.txt.
\r
9 * It is also available through the world-wide-web at this URL:
\r
10 * http://framework.zend.com/license/new-bsd
\r
11 * If you did not receive a copy of the license and are unable to
\r
12 * obtain it through the world-wide-web, please send an email
\r
13 * to license@zend.com so we can send you a copy immediately.
\r
16 * @package Zend_Search_Lucene
\r
17 * @subpackage Search
\r
18 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
\r
19 * @license http://framework.zend.com/license/new-bsd New BSD License
\r
20 * @version $Id: Fuzzy.php 16971 2009-07-22 18:05:45Z mikaelkael $
\r
24 /** Zend_Search_Lucene_Search_Query_Preprocessing */
\r
25 require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';
\r
27 /** Zend_Search_Lucene_Search_Query_Phrase */
\r
28 require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
\r
30 /** Zend_Search_Lucene_Search_Query_Insignificant */
\r
31 require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
\r
33 /** Zend_Search_Lucene_Search_Query_Empty */
\r
34 require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
\r
36 /** Zend_Search_Lucene_Search_Query_Term */
\r
37 require_once 'Zend/Search/Lucene/Search/Query/Term.php';
\r
39 /** Zend_Search_Lucene_Index_Term */
\r
40 require_once 'Zend/Search/Lucene/Index/Term.php';
\r
44 * It's an internal class intended to finalize query processing after query parsing.
\r
45 * This type of query is not actually involved in query execution.
\r
48 * @package Zend_Search_Lucene
\r
49 * @subpackage Search
\r
51 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
\r
52 * @license http://framework.zend.com/license/new-bsd New BSD License
\r
54 class Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy extends Zend_Search_Lucene_Search_Query_Preprocessing
\r
57 * word (query parser lexeme) to find.
\r
64 * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
\r
79 * A value between 0 and 1 to set the required similarity
\r
80 * between the query term and the matching terms. For example, for a
\r
81 * _minimumSimilarity of 0.5 a term of the same length
\r
82 * as the query term is considered similar to the query term if the edit distance
\r
83 * between both terms is less than length(term)*0.5
\r
87 private $_minimumSimilarity;
\r
/**
 * Class constructor. Creates a new preprocessing object for a fuzzy query.
 *
 * The arguments are only stored on the object here; nothing is validated
 * or converted at construction time.
 *
 * @param string $word              Non-tokenized word (query parser lexeme) to search.
 * @param string $encoding          Word encoding.
 * @param string $fieldName         Field name.
 * @param float  $minimumSimilarity Minimum similarity.
 */
public function __construct($word, $encoding, $fieldName, $minimumSimilarity)
{
    // Search target: which field to look in and how close a match must be.
    $this->_field             = $fieldName;
    $this->_minimumSimilarity = $minimumSimilarity;

    // The raw lexeme and the encoding it was supplied in.
    $this->_encoding = $encoding;
    $this->_word     = $word;
}
\r
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * Without a field, the word is rewritten once per search field (the default
 * search field when one is configured, otherwise the names returned by
 * $index->getFieldNames(true)) and the significant results are combined.
 * With a field, the word becomes a single fuzzy query: on the exact term if
 * the index contains it, otherwise on the single token the default analyzer
 * produces for the word.
 *
 * Side effect: $this->_matches is set to the terms of the resulting query
 * (or to an empty array when nothing significant is left).
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Search_QueryParserException If the word contains
 *         wildcard characters (and is not an exact index term) or is
 *         tokenized into more than one token.
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    if ($this->_field === null) {
        // Loaded on demand, like the exception class further down, so this
        // method does not depend on another file having loaded the class.
        require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
        $query = new Zend_Search_Lucene_Search_Query_Boolean();

        $hasInsignificantSubqueries = false;

        if (Zend_Search_Lucene::getDefaultSearchField() === null) {
            $searchFields = $index->getFieldNames(true);
        } else {
            $searchFields = array(Zend_Search_Lucene::getDefaultSearchField());
        }

        foreach ($searchFields as $fieldName) {
            $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy($this->_word,
                                                                                $this->_encoding,
                                                                                $fieldName,
                                                                                $this->_minimumSimilarity);

            $rewrittenSubquery = $subquery->rewrite($index);

            if ($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
                // Remembered so an all-insignificant result is reported as such.
                $hasInsignificantSubqueries = true;
            } elseif (!($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Empty)) {
                $query->addSubquery($rewrittenSubquery);
            }
        }

        $subqueries = $query->getSubqueries();

        if (count($subqueries) === 0) {
            $this->_matches = array();
            if ($hasInsignificantSubqueries) {
                return new Zend_Search_Lucene_Search_Query_Insignificant();
            }

            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        if (count($subqueries) === 1) {
            // A single subquery does not need the boolean wrapper.
            $query = reset($subqueries);
        }

        $query->setBoost($this->getBoost());

        $this->_matches = $query->getQueryTerms();
        return $query;
    }

    // -------------------------------------
    // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
    // encoding is not used since we expect binary matching
    $term = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);
    if ($index->hasTerm($term)) {
        require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
        $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);
        $query->setBoost($this->getBoost());

        // Get rewritten query. Important! It also fills terms matching container.
        $rewrittenQuery = $query->rewrite($index);
        $this->_matches = $query->getQueryTerms();

        return $rewrittenQuery;
    }

    // -------------------------------------
    // Recognize wildcard queries

    /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
    // The probe match is silenced because it only tests whether PCRE accepts the /u modifier here.
    if (@preg_match('/\pL/u', 'a') === 1) {
        $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));
    } else {
        $subPatterns = preg_split('/[*?]/', $this->_word);
    }
    if (count($subPatterns) > 1) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search doesn\'t support wildcards (except within Keyword fields).');
    }

    // -------------------------------------
    // Recognize one-term, multi-term and "insignificant" queries
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
    $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

    if (count($tokens) === 0) {
        $this->_matches = array();
        return new Zend_Search_Lucene_Search_Query_Insignificant();
    }

    if (count($tokens) === 1) {
        require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
        $term  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
        $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);
        $query->setBoost($this->getBoost());

        // Get rewritten query. Important! It also fills terms matching container.
        $rewrittenQuery = $query->rewrite($index);
        $this->_matches = $query->getQueryTerms();

        return $rewrittenQuery;
    }

    // Word is tokenized into several tokens
    require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
    throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');
}
\r
/**
 * Query specific matches highlighting
 *
 * Delegates to a fuzzy query built from the word when the default analyzer
 * produces exactly one token for it. Nothing is highlighted when the word
 * contains wildcard characters (* or ?), produces no tokens, or produces
 * several tokens.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */

    /** Skip exact term matching recognition, keyword fields highlighting is not supported */

    // -------------------------------------
    // Recognize wildcard queries

    /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
    // The probe match is silenced because it only tests whether PCRE accepts the /u modifier here.
    if (@preg_match('/\pL/u', 'a') === 1) {
        $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));
    } else {
        $subPatterns = preg_split('/[*?]/', $this->_word);
    }
    if (count($subPatterns) > 1) {
        // Wildcards are not supported by fuzzy search: do nothing
        return;
    }

    // -------------------------------------
    // Recognize one-term, multi-term and "insignificant" queries
    // Loaded on demand so this method does not depend on another file
    // having loaded the analyzer class.
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
    $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

    if (count($tokens) !== 1) {
        // No tokens: nothing to highlight.
        // Several tokens: fuzzy search is supported only for non-multiple word terms.
        // Do nothing in both cases.
        return;
    }

    require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
    $term  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
    $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);

    $query->_highlightMatches($highlighter);
}
\r
/**
 * Print a query
 *
 * Builds "<field>:<word>^<boost>"; the field prefix is left out when no
 * field is set and the boost suffix is left out when the boost equals 1.
 *
 * @return string
 */
public function __toString()
{
    // Only used to visualise the query, so characters are not escaped.
    $fieldPrefix = ($this->_field === null) ? '' : $this->_field . ':';

    $boostSuffix = '';
    if ($this->getBoost() != 1) {
        $boostSuffix = '^' . round($this->getBoost(), 4);
    }

    return $fieldPrefix . $this->_word . $boostSuffix;
}
\r