7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
16 * @package Zend_Search_Lucene
18 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
20 * @version $Id: Phrase.php 16541 2009-07-07 06:59:03Z bkarwin $
25 * Zend_Search_Lucene_Search_Query
27 require_once 'Zend/Search/Lucene/Search/Query.php';
30 * Zend_Search_Lucene_Search_Weight_Phrase
32 require_once 'Zend/Search/Lucene/Search/Weight/Phrase.php';
36 * A Query that matches documents containing a particular sequence of terms.
39 * @package Zend_Search_Lucene
41 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
42 * @license http://framework.zend.com/license/new-bsd New BSD License
44 class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Query
48 * Array of Zend_Search_Lucene_Index_Term objects.
55 * Term positions (relative positions of terms within the phrase).
63 * Sets the number of other words permitted between words in query phrase.
64 * If zero, then this is an exact phrase search. For larger values this works
65 * like a WITHIN or NEAR operator.
67 * The slop is in fact an edit-distance, where the units correspond to
68 * moves of terms in the query phrase out of position. For example, to switch
69 * the order of two words requires two moves (the first move places the words
70 * atop one another), so to permit re-orderings of phrases, the slop must be at least two.
72 * More exact matches are scored higher than sloppier matches, thus search
73 * results are sorted by exactness.
75 * The slop is zero by default, requiring exact matches.
86 private $_resVector = null;
89 * Terms positions vectors.
91 * term1Id => (docId => array( pos1, pos2, ... ), ...)
92 * term2Id => (docId => array( pos1, pos2, ... ), ...)
96 private $_termsPositions = array();
/**
 * Class constructor. Creates a new phrase query.
 *
 * Terms are stored under the keys they have in $terms, and the rest of the
 * class looks up each offset by its term's key, so an explicit $offsets
 * array is expected to use the same keys as $terms.
 *
 * @param array|null  $terms   Term texts (array of strings). Null creates an empty phrase.
 * @param array|null  $offsets Relative term positions (array of integers). Null numbers
 *                             the terms 0, 1, 2, ... in iteration order.
 * @param string|null $field   Field to search. When null, terms are created without
 *                             passing a field to Zend_Search_Lucene_Index_Term.
 * @throws Zend_Search_Lucene_Exception If $terms or $offsets is neither an array nor null,
 *                                      or if both are given and their sizes differ.
 */
public function __construct($terms = null, $offsets = null, $field = null)
{
    // Exact phrase matching is the default.
    $this->_slop = 0;

    if ($terms !== null && !is_array($terms)) {
        throw new Zend_Search_Lucene_Exception('terms argument must be array of strings or null');
    }

    $this->_terms = array();
    if ($terms !== null) {
        foreach ($terms as $termId => $termText) {
            if ($field !== null) {
                $this->_terms[$termId] = new Zend_Search_Lucene_Index_Term($termText, $field);
            } else {
                $this->_terms[$termId] = new Zend_Search_Lucene_Index_Term($termText);
            }
        }
    }

    if ($offsets !== null && !is_array($offsets)) {
        // Offsets are positions, i.e. integers (the message used to say "strings").
        throw new Zend_Search_Lucene_Exception('offsets argument must be array of integers or null');
    }

    if ($offsets !== null) {
        if (count($this->_terms) != count($offsets)) {
            throw new Zend_Search_Lucene_Exception('terms and offsets arguments must have the same size.');
        }
        $this->_offsets = $offsets;
        return;
    }

    // No explicit offsets: assign consecutive positions, keyed like the terms.
    $this->_offsets = array();
    $position = 0;
    foreach ($this->_terms as $termId => $term) {
        $this->_offsets[$termId] = $position;
        $position++;
    }
}
/**
 * Set slop.
 *
 * score() treats a slop of zero as an exact phrase search and any other
 * value as a sloppy one.
 *
 * @param integer $slop
 */
public function setSlop($slop)
{
    $this->_slop = $slop;
}
/**
 * Get slop.
 *
 * @return integer The value stored by setSlop() (zero after construction).
 */
public function getSlop()
{
    return $this->_slop;
}
/**
 * Adds a term to the end of the query phrase.
 *
 * The relative position of the term is either given explicitly or taken as
 * the one immediately after the last offset added (0 for the first term).
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $position
 * @throws Zend_Search_Lucene_Exception If the term's field differs from the field
 *                                      of the last term already in the phrase.
 */
public function addTerm(Zend_Search_Lucene_Index_Term $term, $position = null)
{
    if (count($this->_terms) != 0) {
        $previousTerm = end($this->_terms);
        if ($previousTerm->field != $term->field) {
            throw new Zend_Search_Lucene_Exception(
                'All phrase terms must be in the same field: ' . $term->field . ':' . $term->text
            );
        }
    }

    if ($position !== null) {
        $nextOffset = $position;
    } else if (count($this->_offsets) != 0) {
        $nextOffset = end($this->_offsets) + 1;
    } else {
        $nextOffset = 0;
    }

    $this->_terms[]   = $term;
    $this->_offsets[] = $nextOffset;
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * - An empty phrase becomes an Empty query.
 * - A phrase whose first term already has a field is returned unchanged.
 * - Otherwise a Boolean query is built with one phrase subquery per field
 *   name reported by the index, each carrying this query's slop and offsets.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    if (count($this->_terms) == 0) {
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    // Terms keep the keys supplied to the constructor, so the first term is
    // not necessarily stored under key 0; reset() fetches it whatever its key.
    $firstTerm = reset($this->_terms);
    if ($firstTerm->field !== null) {
        return $this;
    }

    $query = new Zend_Search_Lucene_Search_Query_Boolean();
    $query->setBoost($this->getBoost());

    // NOTE(review): the `true` argument presumably limits the list to indexed
    // fields - confirm against Zend_Search_Lucene_Interface::getFieldNames().
    foreach ($index->getFieldNames(true) as $fieldName) {
        $subquery = new Zend_Search_Lucene_Search_Query_Phrase();
        $subquery->setSlop($this->getSlop());

        foreach ($this->_terms as $termId => $term) {
            $qualifiedTerm = new Zend_Search_Lucene_Index_Term($term->text, $fieldName);

            $subquery->addTerm($qualifiedTerm, $this->_offsets[$termId]);
        }

        $query->addSubquery($subquery);
    }

    return $query;
}
/**
 * Optimize query in the context of specified index
 *
 * Returns an Empty query when the phrase has no terms or when the index
 * lacks any of them, a single Term query (with this query's boost) when the
 * phrase has exactly one term, and this query itself otherwise.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function optimize(Zend_Search_Lucene_Interface $index)
{
    // Check, that index contains all phrase terms
    foreach ($this->_terms as $term) {
        if (!$index->hasTerm($term)) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }
    }

    $termCount = count($this->_terms);

    if ($termCount == 1) {
        // It's one term query
        $optimizedQuery = new Zend_Search_Lucene_Search_Query_Term(reset($this->_terms));
        $optimizedQuery->setBoost($this->getBoost());

        return $optimizedQuery;
    }

    if ($termCount == 0) {
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    // Two or more terms, all present in the index: nothing to simplify.
    return $this;
}
/**
 * Returns query terms
 *
 * @return array Zend_Search_Lucene_Index_Term objects, keyed as they were added.
 */
public function getTerms()
{
    return $this->_terms;
}
/**
 * Set weight for specified term
 *
 * NOTE(review): writes to $this->_weights, which is not declared in the
 * visible part of this class - confirm it is declared in a parent class.
 *
 * @param integer $num
 * @param Zend_Search_Lucene_Search_Weight_Term $weight
 */
public function setWeight($num, $weight)
{
    $this->_weights[$num] = $weight;
}
/**
 * Constructs an appropriate Weight implementation for this query.
 *
 * The new weight object is also stored in $this->_weight, which score() reads.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @return Zend_Search_Lucene_Search_Weight
 */
public function createWeight(Zend_Search_Lucene_Interface $reader)
{
    $phraseWeight  = new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader);
    $this->_weight = $phraseWeight;

    return $phraseWeight;
}
/**
 * Score calculator for exact phrase queries (terms sequence is fixed)
 *
 * Counts how many times the phrase occurs in the document: every position of
 * the rarest term is taken as an anchor, and the occurrence is counted only
 * if each other term is found at the position its offset dictates.
 *
 * Reads $this->_termsPositions, which execute() fills in.
 *
 * @param integer $docId
 * @return integer Number of exact phrase occurrences in the document.
 */
public function _exactPhraseFreq($docId)
{
    $freq = 0;

    // Term Id with lowest cardinality
    $lowCardTermId = null;

    // Calculate $lowCardTermId
    foreach ($this->_terms as $termId => $term) {
        if ($lowCardTermId === null ||
            count($this->_termsPositions[$termId][$docId]) <
            count($this->_termsPositions[$lowCardTermId][$docId])) {
            $lowCardTermId = $termId;
        }
    }

    // Walk through positions of the term with lowest cardinality
    foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) {
        // We expect phrase to be found
        $freq++;

        // Walk through other terms
        foreach ($this->_terms as $termId => $term) {
            // Both ids are keys of the same array, so an identity check is
            // exact; a loose comparison can equate an integer key with a
            // non-numeric string key on older PHP versions.
            if ($termId !== $lowCardTermId) {
                $expectedPosition = $lowCardPos +
                                    ($this->_offsets[$termId] -
                                     $this->_offsets[$lowCardTermId]);

                if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) {
                    $freq--;  // Phrase wasn't found.
                    break;
                }
            }
        }
    }

    return $freq;
}
/**
 * Score calculator for sloppy phrase queries (terms sequence is fixed)
 *
 * Builds candidate phrases (one position per term) from the document's term
 * positions, then measures for each candidate how far it is from the layout
 * given by the term offsets. Each candidate whose best distance is within
 * the slop adds the similarity's sloppyFreq(distance) to the result.
 *
 * Reads $this->_termsPositions, which execute() fills in.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return integer|float Accumulated sloppy frequency; 0 if no candidate is within the slop.
 */
public function _sloppyPhraseFreq($docId, Zend_Search_Lucene_Interface $reader)
{
    $freq = 0;

    $phraseQueue    = array();
    $phraseQueue[0] = array(); // empty phrase
    $lastTerm       = null;

    // Walk through the terms to create phrases.
    foreach ($this->_terms as $termId => $term) {
        // Only phrases that existed before this term are extended or forked.
        $queueSize = count($phraseQueue);
        $firstPass = true;

        // Walk through the term positions.
        // Each term position produces a set of phrases.
        foreach ($this->_termsPositions[$termId][$docId] as $termPosition) {
            if ($firstPass) {
                // The first position is written into the existing phrases in place.
                for ($count = 0; $count < $queueSize; $count++) {
                    $phraseQueue[$count][$termId] = $termPosition;
                }
            } else {
                // Every further position forks the existing phrases, unless its gap to
                // the previous term deviates from the expected gap by more than the slop.
                for ($count = 0; $count < $queueSize; $count++) {
                    if ($lastTerm !== null &&
                        abs($termPosition - $phraseQueue[$count][$lastTerm] -
                            ($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) {
                        continue;
                    }

                    $newPhraseId = count($phraseQueue);
                    $phraseQueue[$newPhraseId]          = $phraseQueue[$count];
                    $phraseQueue[$newPhraseId][$termId] = $termPosition;
                }
            }

            $firstPass = false;
        }
        $lastTerm = $termId;
    }

    foreach ($phraseQueue as $phrasePos) {
        $minDistance = null;

        // Try every start position within the slop around the first term and
        // keep the smallest total displacement of the terms.
        for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) {
            $distance = 0;
            $start = reset($phrasePos) - reset($this->_offsets) + $shift;

            foreach ($this->_terms as $termId => $term) {
                $distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start);

                if ($distance > $this->_slop) {
                    // Already too far for this shift; no need to add the rest.
                    break;
                }
            }

            if ($minDistance === null || $distance < $minDistance) {
                $minDistance = $distance;
            }
        }

        if ($minDistance <= $this->_slop) {
            $freq += $reader->getSimilarity()->sloppyFreq($minDistance);
        }
    }

    return $freq;
}
/**
 * Execute query in context of index reader
 * It also initializes necessary internal structures
 *
 * Stores in $this->_resVector the ids (as array keys) of the documents that
 * contain every phrase term, records each term's positions in
 * $this->_termsPositions, and finally calls _initWeight().
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter Not referenced by this method.
 */
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
    $this->_resVector = null;

    if (count($this->_terms) == 0) {
        $this->_resVector = array();
    }

    $resVectors      = array();
    $resVectorsSizes = array();
    $resVectorsIds   = array(); // is used to prevent arrays comparison
    foreach ($this->_terms as $termId => $term) {
        // Flip so that document ids become keys (cheap isset() lookups below).
        $resVectors[]      = array_flip($reader->termDocs($term));
        $resVectorsSizes[] = count(end($resVectors));
        $resVectorsIds[]   = $termId;

        $this->_termsPositions[$termId] = $reader->termPositions($term);
    }
    // sort resvectors in order of subquery cardinality increasing
    array_multisort($resVectorsSizes, SORT_ASC, SORT_NUMERIC,
                    $resVectorsIds,   SORT_ASC, SORT_NUMERIC,
                    $resVectors);

    foreach ($resVectors as $nextResVector) {
        if ($this->_resVector === null) {
            $this->_resVector = $nextResVector;
        } else {
            //$this->_resVector = array_intersect_key($this->_resVector, $nextResVector);

            /**
             * This code is used as workaround for array_intersect_key() slowness problem.
             */
            $updatedVector = array();
            foreach ($this->_resVector as $id => $value) {
                if (isset($nextResVector[$id])) {
                    $updatedVector[$id] = $value;
                }
            }
            $this->_resVector = $updatedVector;
        }

        if (count($this->_resVector) == 0) {
            // Empty result set, we don't need to check other terms
            break;
        }
    }

    // ksort($this->_resVector, SORT_NUMERIC);
    // Docs are returned ordered. Used algorithm doesn't change elements order.

    // Initialize weight if it's not done yet
    $this->_initWeight($reader);
}
/**
 * Get document ids likely matching the query
 *
 * It's an array with document ids as keys (performance considerations).
 * The value is whatever execute() last stored in $this->_resVector.
 *
 * @return array
 */
public function matchedDocs()
{
    return $this->_resVector;
}
/**
 * Score specified document
 *
 * The score is tf(freq) * weight * norm * boost, where freq is the exact
 * phrase frequency when the slop is zero and the sloppy one otherwise.
 * Documents outside the result vector, or with a zero frequency, score 0.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
    if (!isset($this->_resVector[$docId])) {
        return 0;
    }

    if ($this->_slop == 0) {
        $freq = $this->_exactPhraseFreq($docId);
    } else {
        $freq = $this->_sloppyPhraseFreq($docId, $reader);
    }

    if ($freq == 0) {
        // Included in result, but calculated freq is zero
        return 0;
    }

    $tf     = $reader->getSimilarity()->tf($freq);
    $weight = $this->_weight->getValue();
    $norm   = $reader->norm($docId, reset($this->_terms)->field);

    return $tf * $weight * $norm * $this->getBoost();
}
/**
 * Return query terms
 *
 * @return array Zend_Search_Lucene_Index_Term objects of the phrase.
 */
public function getQueryTerms()
{
    return $this->_terms;
}
/**
 * Query specific matches highlighting
 *
 * Passes the text of every phrase term to the highlighter.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    // Start from an empty list so that a phrase without terms still passes an array.
    $words = array();
    foreach ($this->_terms as $term) {
        $words[] = $term->text;
    }

    $highlighter->highlight($words);
}
/**
 * Print a query
 *
 * Format: [field:]"term1 term2 ..."[~slop][^boost]. The slop is appended
 * only when it is non-zero, the boost only when it differs from 1.
 *
 * @return string
 */
public function __toString()
{
    // It's used only for query visualisation, so we don't care about characters escaping

    // Terms keep the keys supplied to the constructor, so take the first term
    // with reset() rather than assuming it is stored under key 0.
    $firstTerm = reset($this->_terms);
    if ($firstTerm !== false && $firstTerm->field !== null) {
        $query = $firstTerm->field . ':';
    } else {
        $query = '';
    }

    $words = array();
    foreach ($this->_terms as $term) {
        $words[] = $term->text;
    }

    // Single space between terms, whatever keys the terms are stored under.
    $query .= '"' . implode(' ', $words) . '"';

    if ($this->_slop != 0) {
        $query .= '~' . $this->_slop;
    }

    if ($this->getBoost() != 1) {
        $query .= '^' . round($this->getBoost(), 4);
    }

    return $query;
}