7 * This source file is subject to the new BSD license that is bundled
\r
8 * with this package in the file LICENSE.txt.
\r
9 * It is also available through the world-wide-web at this URL:
\r
10 * http://framework.zend.com/license/new-bsd
\r
11 * If you did not receive a copy of the license and are unable to
\r
12 * obtain it through the world-wide-web, please send an email
\r
13 * to license@zend.com so we can send you a copy immediately.
\r
16 * @package Zend_Search_Lucene
\r
17 * @subpackage Search
\r
18 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
\r
19 * @license http://framework.zend.com/license/new-bsd New BSD License
\r
20 * @version $Id: Term.php 16971 2009-07-22 18:05:45Z mikaelkael $
\r
24 /** Zend_Search_Lucene_Search_Query_Preprocessing */
\r
25 require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';
\r
27 /** Zend_Search_Lucene_Search_Query_Phrase */
\r
28 require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
\r
30 /** Zend_Search_Lucene_Search_Query_Insignificant */
\r
31 require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
\r
33 /** Zend_Search_Lucene_Search_Query_Empty */
\r
34 require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
\r
36 /** Zend_Search_Lucene_Search_Query_Term */
\r
37 require_once 'Zend/Search/Lucene/Search/Query/Term.php';
\r
39 /** Zend_Search_Lucene_Index_Term */
\r
40 require_once 'Zend/Search/Lucene/Index/Term.php';
\r
44 * It's an internal class intended to finalize query processing after query parsing.
\r
45 * This type of query is not actually involved in query execution.
\r
48 * @package Zend_Search_Lucene
\r
49 * @subpackage Search
\r
51 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
\r
52 * @license http://framework.zend.com/license/new-bsd New BSD License
\r
class Zend_Search_Lucene_Search_Query_Preprocessing_Term extends Zend_Search_Lucene_Search_Query_Preprocessing
{
    /**
     * Word (query parser lexeme) to find.
     *
     * @var string
     */
    private $_word;

    /**
     * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
     *
     * @var string
     */
    private $_encoding;

    /**
     * Field name. NULL means "no explicit field": rewrite() then expands the
     * word over the default search field or over the index field names.
     *
     * @var string|null
     */
    private $_field;

    /**
     * Class constructor. Create a new preprocessing object for a term query.
     *
     * @param string $word       Non-tokenized word (query parser lexeme) to search.
     * @param string $encoding   Word encoding.
     * @param string $fieldName  Field name.
     */
    public function __construct($word, $encoding, $fieldName)
    {
        $this->_word     = $word;
        $this->_encoding = $encoding;
        $this->_field    = $fieldName;
    }

    /**
     * Re-write query into primitive queries in the context of specified index.
     *
     * Recognition order: field expansion (when no field is set), exact term
     * match, wildcard pattern, then analyzer tokenization (no tokens, one
     * token, several tokens). The matched terms are stored in $this->_matches.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Search_QueryParserException  if a wildcard sub-pattern
     *         is tokenized into more than one term
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        if ($this->_field === null) {
            require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
            $query = new Zend_Search_Lucene_Search_Query_MultiTerm();
            $query->setBoost($this->getBoost());

            $hasInsignificantSubqueries = false;

            require_once 'Zend/Search/Lucene.php';
            if (Zend_Search_Lucene::getDefaultSearchField() === null) {
                $searchFields = $index->getFieldNames(true);
            } else {
                $searchFields = array(Zend_Search_Lucene::getDefaultSearchField());
            }

            // Rewrite the same word once per candidate field and merge the resulting terms
            foreach ($searchFields as $fieldName) {
                $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Term($this->_word,
                                                                                   $this->_encoding,
                                                                                   $fieldName);
                $rewrittenSubquery = $subquery->rewrite($index);
                foreach ($rewrittenSubquery->getQueryTerms() as $term) {
                    $query->addTerm($term);
                }

                if ($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
                    $hasInsignificantSubqueries = true;
                }
            }

            if (count($query->getTerms()) == 0) {
                $this->_matches = array();
                if ($hasInsignificantSubqueries) {
                    return new Zend_Search_Lucene_Search_Query_Insignificant();
                } else {
                    return new Zend_Search_Lucene_Search_Query_Empty();
                }
            }

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // -------------------------------------
        // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
        // encoding is not used since we expect binary matching
        $term = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);
        if ($index->hasTerm($term)) {
            $query = new Zend_Search_Lucene_Search_Query_Term($term);
            $query->setBoost($this->getBoost());

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // -------------------------------------
        // Recognize wildcard queries
        $pattern = $this->_buildWildcardPattern();
        if ($pattern === false) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard search is supported only for non-multiple word terms');
        }
        if ($pattern !== null) {
            // Wildcard query is recognized
            require_once 'Zend/Search/Lucene/Search/Query/Wildcard.php';
            $term  = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);
            $query = new Zend_Search_Lucene_Search_Query_Wildcard($term);
            $query->setBoost($this->getBoost());

            // Get rewritten query. Important! It also fills terms matching container.
            $rewrittenQuery = $query->rewrite($index);
            $this->_matches = $query->getQueryTerms();

            return $rewrittenQuery;
        }

        // -------------------------------------
        // Recognize one-term multi-term and "insignificant" queries
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
        $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

        if (count($tokens) == 0) {
            $this->_matches = array();
            return new Zend_Search_Lucene_Search_Query_Insignificant();
        }

        if (count($tokens) == 1) {
            $term  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
            $query = new Zend_Search_Lucene_Search_Query_Term($term);
            $query->setBoost($this->getBoost());

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // It's not insignificant or one term query
        require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
        $query = new Zend_Search_Lucene_Search_Query_MultiTerm();

        /**
         * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
         * analyzer design features
         */
        foreach ($tokens as $token) {
            $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
            $query->addTerm($term, true); // all subterms are required
        }

        $query->setBoost($this->getBoost());

        $this->_matches = $query->getQueryTerms();
        return $query;
    }

    /**
     * Query specific matches highlighting
     *
     * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
    {
        /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */

        /** Skip exact term matching recognition, keyword fields highlighting is not supported */

        // -------------------------------------
        // Recognize wildcard queries
        $pattern = $this->_buildWildcardPattern();
        if ($pattern === false) {
            // Do nothing (nothing is highlighted)
            return;
        }
        if ($pattern !== null) {
            // Wildcard query is recognized
            require_once 'Zend/Search/Lucene/Search/Query/Wildcard.php';
            $term  = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);
            $query = new Zend_Search_Lucene_Search_Query_Wildcard($term);

            $query->_highlightMatches($highlighter);
            return;
        }

        // -------------------------------------
        // Recognize one-term multi-term and "insignificant" queries
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
        $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);

        if (count($tokens) == 0) {
            // Do nothing
            return;
        }

        if (count($tokens) == 1) {
            $highlighter->highlight($tokens[0]->getTermText());
            return;
        }

        // It's not insignificant or one term query
        $words = array();
        foreach ($tokens as $token) {
            $words[] = $token->getTermText();
        }
        $highlighter->highlight($words);
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        if ($this->_field !== null) {
            $query = $this->_field . ':';
        } else {
            $query = '';
        }

        $query .= $this->_word;

        if ($this->getBoost() != 1) {
            $query .= '^' . round($this->getBoost(), 4);
        }

        return $query;
    }

    /**
     * Build a wildcard term pattern from the word, if it contains '*' or '?'.
     *
     * Shared by rewrite() and _highlightMatches() so both recognize wildcard
     * words in exactly the same way. The word is split on the wildcard
     * characters; every piece is passed through the default analyzer and the
     * resulting term texts are joined back together with the original wildcards.
     *
     * @return string|null|false  The pattern; NULL if the word contains no wildcard;
     *                            FALSE if some piece is tokenized into more than one term
     *                            (the caller decides how to report that).
     */
    private function _buildWildcardPattern()
    {
        /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
        if (@preg_match('/\pL/u', 'a') == 1) {
            $word                = iconv($this->_encoding, 'UTF-8', $this->_word);
            $wildcardsPattern    = '/[*?]/u';
            $subPatternsEncoding = 'UTF-8';
        } else {
            $word                = $this->_word;
            $wildcardsPattern    = '/[*?]/';
            $subPatternsEncoding = $this->_encoding;
        }

        $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE);

        // preg_split() yields FALSE on failure; treat it like "no wildcard found"
        if ($subPatterns === false || count($subPatterns) <= 1) {
            return null;
        }

        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

        $pattern = '';

        foreach ($subPatterns as $id => $subPattern) {
            // Append corresponding wildcard character to the pattern before each sub-pattern (except first).
            // $subPattern[1] is the offset of the piece, so the wildcard that delimited it sits one position earlier.
            if ($id != 0) {
                $pattern .= $word[ $subPattern[1] - 1 ];
            }

            // Check if each sub-pattern is a single word in terms of current analyzer
            $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding);
            if (count($tokens) > 1) {
                return false;
            }
            foreach ($tokens as $token) {
                $pattern .= $token->getTermText();
            }
        }

        return $pattern;
    }
}
\r