7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
16 * @package Zend_Search_Lucene
18 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
20 * @version $Id: MultiTerm.php 16541 2009-07-07 06:59:03Z bkarwin $
24 /** Zend_Search_Lucene_Search_Query */
25 require_once 'Zend/Search/Lucene/Search/Query.php';
27 /** Zend_Search_Lucene_Search_Weight_MultiTerm */
28 require_once 'Zend/Search/Lucene/Search/Weight/MultiTerm.php';
33 * @package Zend_Search_Lucene
35 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
36 * @license http://framework.zend.com/license/new-bsd New BSD License
38 class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Search_Query
/**
 * Terms to find.
 * Array of Zend_Search_Lucene_Index_Term
 *
 * @var array
 */
private $_terms = array();

/**
 * Term signs, keyed like $_terms.
 * If true then term is required.
 * If false then term is prohibited.
 * If null then term is neither prohibited, nor required
 *
 * If the whole property is null then all terms are required
 *
 * @var array|null
 */
private $_signs = null;

/**
 * Result vector: matched document ids stored as array keys.
 * Null until one of the _calculate*Result() methods has run.
 *
 * @var array|null
 */
private $_resVector = null;

/**
 * Terms frequency vectors.
 * Array of arrays:
 * term1Id => (docId => freq, ...)
 * term2Id => (docId => freq, ...)
 *
 * @var array
 */
private $_termsFreqs = array();

/**
 * A score factor based on the fraction of all query terms
 * that a document contains.
 * float for conjunction queries
 * array of float (indexed by number of matched terms) for non conjunction queries
 *
 * @var mixed
 */
private $_coord = null;

/**
 * Terms weights, keyed by term id.
 * array of Zend_Search_Lucene_Search_Weight
 *
 * @var array
 */
private $_weights = array();
/**
 * Class constructor. Create a new multi-term query object.
 *
 * If the $signs array is omitted, or every sign in it is TRUE, all terms are
 * treated as required and the signs list is stored as NULL. This differs from
 * addTerm(), whose default sign is NULL (term is optional).
 *
 * When $terms is not an array nothing is initialised; terms can then be
 * supplied one at a time through addTerm().
 *
 * @param array|null $terms Array of Zend_Search_Lucene_Index_Term objects
 * @param array|null $signs Array of signs. Sign is boolean|null.
 * @throws Zend_Search_Lucene_Exception when the number of terms exceeds the
 *                                      configured terms-per-query limit
 */
public function __construct($terms = null, $signs = null)
{
    if (!is_array($terms)) {
        return;
    }

    // Zend_Search_Lucene documents a terms-per-query limit of 0 as "no limit",
    // so a zero limit must not reject every non-empty term list.
    $termsLimit = Zend_Search_Lucene::getTermsPerQueryLimit();
    if ($termsLimit != 0 && count($terms) > $termsLimit) {
        throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
    }

    $this->_terms = $terms;
    $this->_signs = null;

    // Keep the signs list only if at least one term is not required;
    // a NULL signs list is the compact form of "all terms are required".
    if (is_array($signs)) {
        foreach ($signs as $sign) {
            if ($sign !== true) {
                $this->_signs = $signs;
                break;
            }
        }
    }
}
/**
 * Add a $term (Zend_Search_Lucene_Index_Term) to this query.
 *
 * The sign is specified as:
 *     TRUE  - term is required
 *     FALSE - term is prohibited
 *     NULL  - term is neither prohibited, nor required
 *
 * While every term added so far is required the signs list stays NULL.
 * It is materialised the first time a non-required term is added.
 *
 * @param  Zend_Search_Lucene_Index_Term $term
 * @param  boolean|null $sign
 * @return void
 */
public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign = null)
{
    if ($sign === true && $this->_signs === null) {
        // Still an "all terms are required" query: no signs list is needed.
        $this->_terms[] = $term;
        return;
    }

    if ($this->_signs === null) {
        // All previous terms were required. Back-fill their signs under the
        // terms' own keys: the term list may have gaps (e.g. after optimize()
        // removed entries), and every lookup is done as $_signs[$termId].
        $this->_signs = array();
        foreach ($this->_terms as $prevTermId => $prevTerm) {
            $this->_signs[$prevTermId] = true;
        }
    }

    $this->_terms[] = $term;

    // Store the sign under exactly the key the new term received, so the two
    // lists can never drift apart.
    end($this->_terms);
    $this->_signs[key($this->_terms)] = $sign;
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * An empty term list yields an empty query. If every term names its field
 * the query is returned unchanged; otherwise it is converted into a boolean
 * query with one (rewritten) term subquery per term, carrying the same boost
 * and the same signs.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    if (count($this->_terms) == 0) {
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    // Look for a term that is not bound to a particular field
    $hasUnqualifiedTerm = false;
    foreach ($this->_terms as $candidate) {
        if ($candidate->field === null) {
            $hasUnqualifiedTerm = true;
            break;
        }
    }

    if (!$hasUnqualifiedTerm) {
        // All fields are qualified, nothing to rewrite
        return $this;
    }

    // Transform multi-term query to boolean and apply rewrite() to subqueries
    $booleanQuery = new Zend_Search_Lucene_Search_Query_Boolean();
    $booleanQuery->setBoost($this->getBoost());

    foreach ($this->_terms as $termId => $term) {
        $termQuery = new Zend_Search_Lucene_Search_Query_Term($term);
        $rewritten = $termQuery->rewrite($index);

        // A NULL signs list means "all terms are required"
        if ($this->_signs === null) {
            $sign = true;
        } else {
            $sign = $this->_signs[$termId];
        }

        $booleanQuery->addSubquery($rewritten, $sign);
    }

    return $booleanQuery;
}
/**
 * Optimize query in the context of specified index
 *
 * Terms that do not exist in the index are dropped when they are optional or
 * prohibited; a missing required term makes the whole query empty. A query
 * left with only prohibited terms, or with no terms at all, is empty too.
 * A single remaining term becomes a plain term query.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function optimize(Zend_Search_Lucene_Interface $index)
{
    // Work on copies, this query object itself is left untouched
    $keptTerms = $this->_terms;
    $keptSigns = $this->_signs;

    foreach ($keptTerms as $termId => $term) {
        if ($index->hasTerm($term)) {
            continue;
        }

        if ($keptSigns === null || $keptSigns[$termId] === true) {
            // A required term is absent from the index: nothing can match
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        // Term is optional or prohibited
        // Remove it from terms and signs list
        unset($keptTerms[$termId]);
        unset($keptSigns[$termId]);
    }

    // Check if all presented terms are prohibited
    // (an empty signs list counts as "all prohibited" as well)
    if ($keptSigns !== null) {
        $hasMatchableTerm = false;
        foreach ($keptSigns as $sign) {
            if ($sign !== false) {
                $hasMatchableTerm = true;
                break;
            }
        }

        if (!$hasMatchableTerm) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }
    }

    /**
     * @todo make an optimization for repeated terms
     * (they may have different signs)
     */

    $termsLeft = count($keptTerms);

    if ($termsLeft == 0) {
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    if ($termsLeft == 1) {
        // It's already checked, that it's not a prohibited term,
        // so it's a one term query with one required or optional element
        $optimizedQuery = new Zend_Search_Lucene_Search_Query_Term(reset($keptTerms));
    } else {
        $optimizedQuery = new Zend_Search_Lucene_Search_Query_MultiTerm($keptTerms, $keptSigns);
    }

    $optimizedQuery->setBoost($this->getBoost());

    return $optimizedQuery;
}
/**
 * Returns the query terms exactly as they are stored in this object.
 *
 * @return array Array of Zend_Search_Lucene_Index_Term objects
 */
public function getTerms()
{
    return $this->_terms;
}
/**
 * Returns the terms signs.
 *
 * NULL means that all terms are required; otherwise each entry is
 * TRUE (required), FALSE (prohibited) or NULL (optional).
 *
 * @return array|null
 */
public function getSigns()
{
    return $this->_signs;
}
/**
 * Set weight for specified term
 *
 * The weight is stored under $num; the score calculators later look
 * weights up by term id, so $num has to be the id of the term.
 *
 * @param integer $num
 * @param Zend_Search_Lucene_Search_Weight_Term $weight
 * @return void
 */
public function setWeight($num, $weight)
{
    $this->_weights[$num] = $weight;
}
/**
 * Constructs an appropriate Weight implementation for this query.
 *
 * The new weight object is also remembered in $this->_weight
 * (a property declared outside of the code visible here).
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @return Zend_Search_Lucene_Search_Weight
 */
public function createWeight(Zend_Search_Lucene_Interface $reader)
{
    $queryWeight = new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader);
    $this->_weight = $queryWeight;

    return $queryWeight;
}
/**
 * Calculate result vector for Conjunction query
 * (like '+something +another')
 *
 * Fills $_resVector (doc ids as keys) and $_termsFreqs.
 *
 * NOTE: array_multisort() re-indexes numeric keys, so after this call
 * $_terms is renumbered in selectivity order.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @return void
 */
private function _calculateConjunctionResult(Zend_Search_Lucene_Interface $reader)
{
    $this->_resVector = null;

    if (count($this->_terms) == 0) {
        // No terms, so nothing can match. Stop here: the code below would
        // otherwise flip a docs list that was never retrieved and replace
        // the empty result with an invalid value.
        $this->_resVector = array();
        return;
    }

    // Order terms by selectivity
    $docFreqs = array();
    $ids      = array();
    foreach ($this->_terms as $id => $term) {
        $docFreqs[] = $reader->docFreq($term);
        $ids[]      = $id; // Used to keep original order for terms with the same selectivity and omit terms comparison
    }
    array_multisort($docFreqs, SORT_ASC, SORT_NUMERIC,
                    $ids,      SORT_ASC, SORT_NUMERIC,
                    $this->_terms);

    // The same filter object is handed to every termDocs() call
    $docsFilter = new Zend_Search_Lucene_Index_DocsFilter();
    foreach ($this->_terms as $term) {
        $termDocs = $reader->termDocs($term, $docsFilter);
    }
    // Treat last retrieved docs vector as a result set
    // (filter collects data for other terms)
    $this->_resVector = array_flip($termDocs);

    foreach ($this->_terms as $termId => $term) {
        $this->_termsFreqs[$termId] = $reader->termFreqs($term, $docsFilter);
    }

    // No ksort($this->_resVector, SORT_NUMERIC) is needed here:
    // docs are returned ordered and the algorithms used don't change elements order.
}
/**
 * Calculate result vector for non Conjunction query
 * (like '+something -another')
 *
 * The result is the intersection of the docs of all required terms, or,
 * when there are no required terms, the union of the docs of all optional
 * terms; docs of prohibited terms are then removed. Fills $_resVector
 * (doc ids as keys, sorted by doc id) and $_termsFreqs.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @return void
 */
private function _calculateNonConjunctionResult(Zend_Search_Lucene_Interface $reader)
{
    $requiredSets     = array();
    $requiredSetSizes = array();
    $requiredSetIds   = array(); // is used to prevent arrays comparison

    $optionalDocs   = array();
    $prohibitedDocs = array();

    foreach ($this->_terms as $termId => $term) {
        // doc ids become keys, which makes membership tests cheap
        $docsOfTerm = array_flip($reader->termDocs($term));
        $sign       = $this->_signs[$termId];

        if ($sign === true) {
            // required
            $requiredSets[]     = $docsOfTerm;
            $requiredSetSizes[] = count($docsOfTerm);
            $requiredSetIds[]   = $termId;
        } elseif ($sign === false) {
            // prohibited (array union)
            $prohibitedDocs += $docsOfTerm;
        } else {
            // neither required, nor prohibited (array union)
            $optionalDocs += $docsOfTerm;
        }

        $this->_termsFreqs[$termId] = $reader->termFreqs($term);
    }

    // sort required doc sets in order of cardinality increasing
    array_multisort($requiredSetSizes, SORT_ASC, SORT_NUMERIC,
                    $requiredSetIds,   SORT_ASC, SORT_NUMERIC,
                    $requiredSets);

    $matched = null;
    foreach ($requiredSets as $docSet) {
        if ($matched === null) {
            $matched = $docSet;
        } else {
            // Hand-written replacement for array_intersect_key($matched, $docSet),
            // kept as a workaround for the array_intersect_key() slowness problem.
            $survivors = array();
            foreach ($matched as $docId => $value) {
                if (isset($docSet[$docId])) {
                    $survivors[$docId] = $value;
                }
            }
            $matched = $survivors;
        }

        if (count($matched) == 0) {
            // Empty result set, we don't need to check other terms
            break;
        }
    }

    // Optional terms define the result only if nothing is required
    $this->_resVector = ($matched !== null) ? $matched : $optionalDocs;

    if (count($prohibitedDocs) != 0) {
        // Hand-written replacement for array_diff_key($this->_resVector, $prohibitedDocs),
        // kept as a workaround for the array_diff_key() slowness problem.
        // The loop always runs over the smaller of the two sets.
        $filtered = $this->_resVector;

        if (count($this->_resVector) < count($prohibitedDocs)) {
            foreach ($this->_resVector as $docId => $value) {
                if (isset($prohibitedDocs[$docId])) {
                    unset($filtered[$docId]);
                }
            }
        } else {
            foreach ($prohibitedDocs as $docId => $value) {
                unset($filtered[$docId]);
            }
        }

        $this->_resVector = $filtered;
    }

    ksort($this->_resVector, SORT_NUMERIC);
}
/**
 * Score calculator for conjunction queries (all terms are required)
 *
 * Sums tf * term weight * field norm over all terms and multiplies the sum
 * by the coord factor (computed once and cached) and the query boost.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
{
    if ($this->_coord === null) {
        // Every term matches in a conjunction, so overlap == max overlap
        $termsCount   = count($this->_terms);
        $this->_coord = $reader->getSimilarity()->coord($termsCount, $termsCount);
    }

    $total = 0.0;

    foreach ($this->_terms as $termId => $term) {
        /**
         * We don't need to check that term freq is not 0
         * Score calculation is performed only for matched docs
         */
        $tf     = $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]);
        $weight = $this->_weights[$termId]->getValue();
        $norm   = $reader->norm($docId, $term->field);

        $total += $tf * $weight * $norm;
    }

    return $total * $this->_coord * $this->getBoost();
}
/**
 * Score calculator for non conjunction queries (not all terms are required)
 *
 * Only terms which are not prohibited and which occur in the document
 * contribute. The sum is multiplied by the coord factor for the number of
 * matched terms (table built once and cached) and by the query boost.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _nonConjunctionScore($docId, $reader)
{
    if ($this->_coord === null) {
        $this->_coord = array();

        // Max overlap is the number of terms which are not prohibited
        $scorableTerms = 0;
        foreach ($this->_signs as $sign) {
            if ($sign !== false) {
                $scorableTerms++;
            }
        }

        // One coord factor per possible number of matched terms
        for ($overlap = 0; $overlap <= $scorableTerms; $overlap++) {
            $this->_coord[$overlap] = $reader->getSimilarity()->coord($overlap, $scorableTerms);
        }
    }

    $total        = 0.0;
    $matchedTerms = 0;

    foreach ($this->_terms as $termId => $term) {
        if ($this->_signs[$termId] === false) {
            // prohibited
            continue;
        }
        if (!isset($this->_termsFreqs[$termId][$docId])) {
            // not matched
            continue;
        }

        $matchedTerms++;

        /**
         * We don't need to check that term freq is not 0
         * Score calculation is performed only for matched docs
         */
        $tf     = $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]);
        $weight = $this->_weights[$termId]->getValue();
        $norm   = $reader->norm($docId, $term->field);

        $total += $tf * $weight * $norm;
    }

    return $total * $this->_coord[$matchedTerms] * $this->getBoost();
}
/**
 * Execute query in context of index reader
 * It also initializes necessary internal structures
 *
 * The $docsFilter argument is accepted but not referenced by this
 * implementation.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return void
 */
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
    // A NULL signs list means that all terms are required
    $allTermsRequired = ($this->_signs === null);

    if ($allTermsRequired) {
        $this->_calculateConjunctionResult($reader);
    } else {
        $this->_calculateNonConjunctionResult($reader);
    }

    // Initialize weight if it's not done yet
    $this->_initWeight($reader);
}
/**
 * Get document ids likely matching the query
 *
 * It's an array with document ids as keys (performance considerations).
 * The value is whatever the last result calculation stored in the
 * result vector.
 *
 * @return array
 */
public function matchedDocs()
{
    return $this->_resVector;
}
/**
 * Score specified document
 *
 * Documents which are not part of the result vector score 0; all others
 * are scored by the calculator matching the query type.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
    if (!isset($this->_resVector[$docId])) {
        return 0;
    }

    // A NULL signs list means that all terms are required
    return ($this->_signs === null)
        ? $this->_conjunctionScore($docId, $reader)
        : $this->_nonConjunctionScore($docId, $reader);
}
/**
 * Return query terms which are not prohibited
 *
 * With a NULL signs list (all terms required) the stored terms array is
 * returned as is; otherwise a new list without the prohibited terms is built.
 *
 * @return array Array of Zend_Search_Lucene_Index_Term objects
 */
public function getQueryTerms()
{
    if ($this->_signs === null) {
        return $this->_terms;
    }

    $allowedTerms = array();

    foreach ($this->_signs as $termId => $sign) {
        if ($sign === false) {
            // prohibited term
            continue;
        }

        $allowedTerms[] = $this->_terms[$termId];
    }

    return $allowedTerms;
}
/**
 * Query specific matches highlighting
 *
 * Collects the text of every term which is not prohibited and passes the
 * list to the highlighter in a single call.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
 * @return void
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    $words = array();

    if ($this->_signs !== null) {
        foreach ($this->_signs as $termId => $sign) {
            if ($sign === false) {
                // prohibited terms are not highlighted
                continue;
            }

            $words[] = $this->_terms[$termId]->text;
        }
    } else {
        // All terms are required
        foreach ($this->_terms as $term) {
            $words[] = $term->text;
        }
    }

    $highlighter->highlight($words);
}
631 public function __toString()
633 // It's used only for query visualisation, so we don't care about characters escaping
637 foreach ($this->_terms as $id => $term) {
642 if ($this->_signs === null || $this->_signs[$id] === true) {
644 } else if ($this->_signs[$id] === false) {
648 if ($term->field !== null) {
649 $query .= $term->field . ':';
651 $query .= $term->text;
654 if ($this->getBoost() != 1) {
655 $query = '(' . $query . ')^' . round($this->getBoost(), 4);