final move of files
[web.mtrack] / Zend / Search / Lucene / Search / Query / MultiTerm.php
diff --git a/Zend/Search/Lucene/Search/Query/MultiTerm.php b/Zend/Search/Lucene/Search/Query/MultiTerm.php
new file mode 100644 (file)
index 0000000..c57bcb5
--- /dev/null
@@ -0,0 +1,661 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ * @version    $Id: MultiTerm.php 16541 2009-07-07 06:59:03Z bkarwin $
+ */
+
+
+/** Zend_Search_Lucene_Search_Query */
+require_once 'Zend/Search/Lucene/Search/Query.php';
+
+/** Zend_Search_Lucene_Search_Weight_MultiTerm */
+require_once 'Zend/Search/Lucene/Search/Weight/MultiTerm.php';
+
+
+/**
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Search_Query
+{
+
+    /**
+     * Terms to find.
+     * Array of Zend_Search_Lucene_Index_Term objects.
+     *
+     * The array keys act as term ids: the same ids are used to index
+     * $_signs, $_termsFreqs and $_weights.
+     *
+     * @var array
+     */
+    private $_terms = array();
+
+    /**
+     * Term signs, indexed by term id.
+     * If true then term is required.
+     * If false then term is prohibited.
+     * If null then term is neither prohibited, nor required
+     *
+     * If the whole property is null (instead of an array) then all terms
+     * are required and the query is treated as a conjunction.
+     *
+     * @var array|null
+     */
+    private $_signs;
+
+    /**
+     * Result vector.
+     *
+     * Array with matched document ids as keys. It stays null until one of
+     * the _calculate*Result() methods has filled it.
+     *
+     * @var array|null
+     */
+    private $_resVector = null;
+
+    /**
+     * Terms frequencies vectors (not positions).
+     * Array of Arrays:
+     * term1Id => (docId => freq, ...)
+     * term2Id => (docId => freq, ...)
+     *
+     * @var array
+     */
+    private $_termsFreqs = array();
+
+
+    /**
+     * A score factor based on the fraction of all query terms
+     * that a document contains.
+     * float for conjunction queries
+     * array of float (indexed by number of matched terms) for non
+     * conjunction queries
+     *
+     * Computed lazily on the first scoring call; null until then.
+     *
+     * @var mixed
+     */
+    private $_coord = null;
+
+
+    /**
+     * Terms weights, indexed by term id (see setWeight()).
+     * array of Zend_Search_Lucene_Search_Weight
+     *
+     * @var array
+     */
+    private $_weights = array();
+
+
+    /**
+     * Class constructor.  Builds a multi-term query, optionally pre-filled.
+     *
+     * Nothing is initialized when $terms is not an array.
+     *
+     * When $signs is omitted, or when every sign is strictly TRUE, the sign
+     * list is stored as null, which marks the query as "all terms required".
+     * Note that this differs from addTerm(), whose default sign is null
+     * (optional term).
+     *
+     * @param array $terms    Array of Zend_Search_Lucene_Index_Term objects
+     * @param array $signs    Array of signs.  Sign is boolean|null.
+     * @throws Zend_Search_Lucene_Exception  if the number of terms exceeds
+     *                                       Zend_Search_Lucene::getTermsPerQueryLimit()
+     */
+    public function __construct($terms = null, $signs = null)
+    {
+        if (!is_array($terms)) {
+            return;
+        }
+
+        if (count($terms) > Zend_Search_Lucene::getTermsPerQueryLimit()) {
+            throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
+        }
+
+        $this->_terms = $terms;
+        $this->_signs = null;
+
+        if (!is_array($signs)) {
+            return;
+        }
+
+        // Keep the signs array only if at least one term is not required
+        foreach ($signs as $flag) {
+            if ($flag !== true) {
+                $this->_signs = $signs;
+                return;
+            }
+        }
+    }
+
+
+    /**
+     * Add a $term (Zend_Search_Lucene_Index_Term) to this query.
+     *
+     * The sign is specified as:
+     *     TRUE  - term is required
+     *     FALSE - term is prohibited
+     *     NULL  - term is neither prohibited, nor required
+     *
+     * While every term added so far is required, the signs list is kept as
+     * null. It is materialized into an array the first time a non-required
+     * term is added.
+     *
+     * Signs are always stored under the same key as the term they belong to.
+     * The terms array may have non-sequential keys (for example when it was
+     * handed to the constructor after some entries were unset), so the key
+     * must be taken from the terms array rather than generated with [].
+     *
+     * @param  Zend_Search_Lucene_Index_Term $term
+     * @param  boolean|null $sign
+     * @return void
+     */
+    public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign = null) {
+        // Signs are tracked unless this term and all previous ones are required
+        $trackSign = ($sign !== true || $this->_signs !== null);
+
+        if ($trackSign && $this->_signs === null) {
+            // All previous terms are required: mark them under their own ids
+            $this->_signs = array();
+            foreach ($this->_terms as $prevTermId => $prevTerm) {
+                $this->_signs[$prevTermId] = true;
+            }
+        }
+
+        $this->_terms[] = $term;
+
+        if ($trackSign) {
+            // Look up the key the term was actually stored under
+            end($this->_terms);
+            $termId = key($this->_terms);
+
+            $this->_signs[$termId] = $sign;
+        }
+    }
+
+
+    /**
+     * Re-write query into primitive queries in the context of specified index
+     *
+     * - a query without terms becomes an Empty query;
+     * - a query whose terms all have a field is returned unchanged;
+     * - otherwise the query is converted into a Boolean query made of
+     *   rewritten single-term subqueries carrying the same signs and boost.
+     *
+     * @param Zend_Search_Lucene_Interface $index
+     * @return Zend_Search_Lucene_Search_Query
+     */
+    public function rewrite(Zend_Search_Lucene_Interface $index)
+    {
+        if (count($this->_terms) == 0) {
+            return new Zend_Search_Lucene_Search_Query_Empty();
+        }
+
+        // Look for a term which is not bound to a field
+        $hasUnqualifiedTerm = false;
+        foreach ($this->_terms as $candidate) {
+            if ($candidate->field === null) {
+                $hasUnqualifiedTerm = true;
+                break;
+            }
+        }
+
+        if (!$hasUnqualifiedTerm) {
+            return $this;
+        }
+
+        // Transform multiterm query to boolean and rewrite each subquery
+        $booleanQuery = new Zend_Search_Lucene_Search_Query_Boolean();
+        $booleanQuery->setBoost($this->getBoost());
+
+        foreach ($this->_terms as $termId => $term) {
+            $termQuery = new Zend_Search_Lucene_Search_Query_Term($term);
+            $rewritten = $termQuery->rewrite($index);
+
+            // A null signs list means that every term is required
+            if ($this->_signs === null) {
+                $sign = true;
+            } else {
+                $sign = $this->_signs[$termId];
+            }
+
+            $booleanQuery->addSubquery($rewritten, $sign);
+        }
+
+        return $booleanQuery;
+    }
+
+    /**
+     * Optimize query in the context of specified index
+     *
+     * Terms missing from the index are dropped when they are optional or
+     * prohibited; a missing required term makes the whole query Empty.
+     * The query is also Empty when only prohibited terms remain or when no
+     * terms remain at all. A single remaining term is turned into a Term
+     * query. The boost of this query is copied to the optimized one.
+     *
+     * @param Zend_Search_Lucene_Interface $index
+     * @return Zend_Search_Lucene_Search_Query
+     */
+    public function optimize(Zend_Search_Lucene_Interface $index)
+    {
+        $keptTerms = $this->_terms;
+        $keptSigns = $this->_signs;
+
+        foreach ($keptTerms as $termId => $term) {
+            if ($index->hasTerm($term)) {
+                continue;
+            }
+
+            if ($keptSigns === null  ||  $keptSigns[$termId] === true) {
+                // A required term is absent from the index: nothing can match
+                return new Zend_Search_Lucene_Search_Query_Empty();
+            }
+
+            // Optional or prohibited term which is absent from the index:
+            // it has no influence on the result
+            unset($keptTerms[$termId]);
+            unset($keptSigns[$termId]);
+        }
+
+        // A query consisting of prohibited terms only matches nothing
+        if ($keptSigns !== null) {
+            $hasNonProhibited = false;
+            foreach ($keptSigns as $sign) {
+                if ($sign !== false) {
+                    $hasNonProhibited = true;
+                    break;
+                }
+            }
+
+            if (!$hasNonProhibited) {
+                return new Zend_Search_Lucene_Search_Query_Empty();
+            }
+        }
+
+        /**
+         * @todo make an optimization for repeated terms
+         * (they may have different signs)
+         */
+
+        $termsLeft = count($keptTerms);
+
+        if ($termsLeft == 0) {
+            return new Zend_Search_Lucene_Search_Query_Empty();
+        }
+
+        if ($termsLeft == 1) {
+            // The only term is required or optional (prohibited-only case
+            // has been handled above), so a plain term query is enough
+            $optimizedQuery = new Zend_Search_Lucene_Search_Query_Term(reset($keptTerms));
+        } else {
+            $optimizedQuery = new Zend_Search_Lucene_Search_Query_MultiTerm($keptTerms, $keptSigns);
+        }
+
+        $optimizedQuery->setBoost($this->getBoost());
+
+        return $optimizedQuery;
+    }
+
+
+    /**
+     * Returns all query terms (required, optional and prohibited ones)
+     *
+     * @return array  Array of Zend_Search_Lucene_Index_Term, indexed by term id
+     */
+    public function getTerms()
+    {
+        $allTerms = $this->_terms;
+
+        return $allTerms;
+    }
+
+
+    /**
+     * Return terms signs
+     *
+     * @return array|null  Signs indexed by term id,
+     *                     or null if all terms are required
+     */
+    public function getSigns()
+    {
+        $termSigns = $this->_signs;
+
+        return $termSigns;
+    }
+
+
+    /**
+     * Store the weight object of the term identified by $num
+     *
+     * An already stored weight for the same term id is replaced.
+     *
+     * @param integer $num  Term id (key of the term within the terms array)
+     * @param Zend_Search_Lucene_Search_Weight_Term $weight
+     */
+    public function setWeight($num, $weight)
+    {
+        $termId = $num;
+
+        $this->_weights[$termId] = $weight;
+    }
+
+
+    /**
+     * Constructs an appropriate Weight implementation for this query.
+     *
+     * The created multi-term weight is remembered in $this->_weight
+     * and returned.
+     *
+     * @param Zend_Search_Lucene_Interface $reader
+     * @return Zend_Search_Lucene_Search_Weight
+     */
+    public function createWeight(Zend_Search_Lucene_Interface $reader)
+    {
+        $queryWeight = new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader);
+
+        $this->_weight = $queryWeight;
+
+        return $queryWeight;
+    }
+
+
+    /**
+     * Calculate result vector for Conjunction query
+     * (like '+something +another')
+     *
+     * Fills $this->_resVector (document ids as keys) and $this->_termsFreqs
+     * (term id => term frequencies as returned by the reader).
+     *
+     * A query without terms produces an empty result vector and does not
+     * touch the reader at all.
+     *
+     * @param Zend_Search_Lucene_Interface $reader
+     */
+    private function _calculateConjunctionResult(Zend_Search_Lucene_Interface $reader)
+    {
+        $this->_resVector = null;
+
+        if (count($this->_terms) == 0) {
+            // Nothing to search for. Stop here: the code below expects at
+            // least one term and would otherwise flip an undefined docs list.
+            $this->_resVector = array();
+            return;
+        }
+
+        // Order terms by selectivity
+        $docFreqs = array();
+        $ids      = array();
+        foreach ($this->_terms as $id => $term) {
+            $docFreqs[] = $reader->docFreq($term);
+            $ids[]      = $id; // Used to keep original order for terms with the same selectivity and omit terms comparison
+        }
+        // Note: array_multisort() re-indexes integer keys, so after this call
+        // the terms are numbered in sort order.
+        array_multisort($docFreqs, SORT_ASC, SORT_NUMERIC,
+                        $ids,      SORT_ASC, SORT_NUMERIC,
+                        $this->_terms);
+
+        // The same filter object is handed to every termDocs() call
+        $docsFilter = new Zend_Search_Lucene_Index_DocsFilter();
+        foreach ($this->_terms as $termId => $term) {
+            $termDocs = $reader->termDocs($term, $docsFilter);
+        }
+        // Treat last retrieved docs vector as a result set
+        // (filter collects data for other terms)
+        $this->_resVector = array_flip($termDocs);
+
+        foreach ($this->_terms as $termId => $term) {
+            $this->_termsFreqs[$termId] = $reader->termFreqs($term, $docsFilter);
+        }
+
+        // ksort($this->_resVector, SORT_NUMERIC);
+        // Docs are returned ordered. Used algorithms doesn't change elements order.
+    }
+
+
+    /**
+     * Calculate result vector for non Conjunction query
+     * (like '+something -another')
+     *
+     * The result is the intersection of the docs of all required terms, or,
+     * if there are no required terms, the union of the docs of all optional
+     * terms. Docs of prohibited terms are then removed and the result is
+     * sorted by document id.
+     *
+     * Fills $this->_resVector (document ids as keys) and $this->_termsFreqs
+     * (term id => term frequencies as returned by the reader).
+     *
+     * @param Zend_Search_Lucene_Interface $reader
+     */
+    private function _calculateNonConjunctionResult(Zend_Search_Lucene_Interface $reader)
+    {
+        $mandatoryVectors = array();
+        $mandatorySizes   = array();
+        $mandatoryIds     = array(); // tie-breaker, prevents arrays comparison while sorting
+
+        $optionalDocs = array();
+        $excludedDocs = array();
+
+        foreach ($this->_terms as $termId => $term) {
+            $docIds = array_flip($reader->termDocs($term));
+            $sign   = $this->_signs[$termId];
+
+            if ($sign === false) {
+                // prohibited term: collect docs to exclude (array union)
+                $excludedDocs += $docIds;
+            } elseif ($sign === true) {
+                // required term: keep its docs vector for the intersection
+                $mandatoryVectors[] = $docIds;
+                $mandatorySizes[]   = count($docIds);
+                $mandatoryIds[]     = $termId;
+            } else {
+                // neither required, nor prohibited (array union)
+                $optionalDocs += $docIds;
+            }
+
+            $this->_termsFreqs[$termId] = $reader->termFreqs($term);
+        }
+
+        // Process the smallest vectors first
+        array_multisort($mandatorySizes,   SORT_ASC, SORT_NUMERIC,
+                        $mandatoryIds,     SORT_ASC, SORT_NUMERIC,
+                        $mandatoryVectors);
+
+        $intersection = null;
+        foreach ($mandatoryVectors as $vector) {
+            if ($intersection !== null) {
+                // Manual intersection by key
+                // (workaround for array_intersect_key() slowness problem)
+                $survivors = array();
+                foreach ($intersection as $docId => $value) {
+                    if (isset($vector[$docId])) {
+                        $survivors[$docId] = $value;
+                    }
+                }
+                $intersection = $survivors;
+            } else {
+                $intersection = $vector;
+            }
+
+            if (count($intersection) == 0) {
+                // Empty result set, we don't need to check other terms
+                break;
+            }
+        }
+
+        // Optional terms define the result only if nothing is required
+        $result = ($intersection !== null) ? $intersection : $optionalDocs;
+
+        if (count($excludedDocs) != 0) {
+            // Manual difference by key
+            // (workaround for array_diff_key() slowness problem).
+            // The shorter of the two arrays is the one which is iterated.
+            if (count($result) < count($excludedDocs)) {
+                $allowed = array();
+                foreach ($result as $docId => $value) {
+                    if (!isset($excludedDocs[$docId])) {
+                        $allowed[$docId] = $value;
+                    }
+                }
+                $result = $allowed;
+            } else {
+                foreach ($excludedDocs as $docId => $value) {
+                    unset($result[$docId]);
+                }
+            }
+        }
+
+        ksort($result, SORT_NUMERIC);
+
+        $this->_resVector = $result;
+    }
+
+
+    /**
+     * Score calculator for conjunction queries (all terms are required)
+     *
+     * The score is the sum of tf * weight * norm over all terms, multiplied
+     * by the coord factor and by the query boost. The coord factor is
+     * calculated on the first call and reused afterwards.
+     *
+     * @param integer $docId
+     * @param Zend_Search_Lucene_Interface $reader
+     * @return float
+     */
+    public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
+    {
+        if ($this->_coord === null) {
+            // Every term matches in a conjunction: overlap == max overlap
+            $termsCount   = count($this->_terms);
+            $this->_coord = $reader->getSimilarity()->coord($termsCount, $termsCount);
+        }
+
+        $sum = 0.0;
+
+        foreach ($this->_terms as $termId => $term) {
+            /**
+             * We don't need to check that term freq is not 0
+             * Score calculation is performed only for matched docs
+             */
+            $tf     = $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]);
+            $weight = $this->_weights[$termId]->getValue();
+            $norm   = $reader->norm($docId, $term->field);
+
+            $sum += $tf * $weight * $norm;
+        }
+
+        return $sum * $this->_coord * $this->getBoost();
+    }
+
+
+    /**
+     * Score calculator for non conjunction queries (not all terms are required)
+     *
+     * The score is the sum of tf * weight * norm over all non-prohibited
+     * terms found in the document, multiplied by the coord factor for the
+     * number of matched terms and by the query boost. The table of coord
+     * factors is built on the first call and reused afterwards.
+     *
+     * @param integer $docId
+     * @param Zend_Search_Lucene_Interface $reader
+     * @return float
+     */
+    public function _nonConjunctionScore($docId, $reader)
+    {
+        if ($this->_coord === null) {
+            $this->_coord = array();
+
+            // Maximum overlap is the number of non-prohibited terms
+            $maxOverlap = 0;
+            foreach ($this->_signs as $sign) {
+                if ($sign === false) {
+                    continue;
+                }
+                $maxOverlap++;
+            }
+
+            // One coord factor per possible number of matched terms
+            for ($overlap = 0; $overlap <= $maxOverlap; $overlap++) {
+                $this->_coord[$overlap] = $reader->getSimilarity()->coord($overlap, $maxOverlap);
+            }
+        }
+
+        $sum     = 0.0;
+        $matched = 0;
+
+        foreach ($this->_terms as $termId => $term) {
+            if ($this->_signs[$termId] === false) {
+                // prohibited terms do not contribute to the score
+                continue;
+            }
+            if (!isset($this->_termsFreqs[$termId][$docId])) {
+                // term is not present in the document
+                continue;
+            }
+
+            $matched++;
+
+            /**
+             * We don't need to check that term freq is not 0
+             * Score calculation is performed only for matched docs
+             */
+            $tf     = $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]);
+            $weight = $this->_weights[$termId]->getValue();
+            $norm   = $reader->norm($docId, $term->field);
+
+            $sum += $tf * $weight * $norm;
+        }
+
+        return $sum * $this->_coord[$matched] * $this->getBoost();
+    }
+
+    /**
+     * Execute query in context of index reader
+     * It also initializes necessary internal structures
+     *
+     * The conjunction algorithm is used when all terms are required
+     * (signs list is null), the non conjunction one otherwise.
+     *
+     * Note: $docsFilter is accepted but is not used by this implementation.
+     *
+     * @param Zend_Search_Lucene_Interface $reader
+     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
+     */
+    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
+    {
+        if ($this->_signs !== null) {
+            $this->_calculateNonConjunctionResult($reader);
+        } else {
+            $this->_calculateConjunctionResult($reader);
+        }
+
+        // Initialize weight if it's not done yet
+        // (_initWeight() is not defined in this class; presumably inherited)
+        $this->_initWeight($reader);
+    }
+
+    /**
+     * Get document ids likely matching the query
+     *
+     * Document ids are the keys of the returned array
+     * (performance considerations).
+     *
+     * @return array
+     */
+    public function matchedDocs()
+    {
+        $matched = $this->_resVector;
+
+        return $matched;
+    }
+
+    /**
+     * Score specified document
+     *
+     * Documents which are not part of the result vector get a score of 0.
+     *
+     * @param integer $docId
+     * @param Zend_Search_Lucene_Interface $reader
+     * @return float
+     */
+    public function score($docId, Zend_Search_Lucene_Interface $reader)
+    {
+        if (!isset($this->_resVector[$docId])) {
+            return 0;
+        }
+
+        // A null signs list means that all terms are required
+        if ($this->_signs !== null) {
+            return $this->_nonConjunctionScore($docId, $reader);
+        }
+
+        return $this->_conjunctionScore($docId, $reader);
+    }
+
+    /**
+     * Return query terms
+     *
+     * Prohibited terms are left out. If all terms are required then the
+     * terms array is returned as is, otherwise the result is a re-indexed
+     * list.
+     *
+     * @return array
+     */
+    public function getQueryTerms()
+    {
+        if ($this->_signs !== null) {
+            $nonProhibited = array();
+
+            foreach ($this->_signs as $termId => $sign) {
+                if ($sign === false) {
+                    continue;
+                }
+                $nonProhibited[] = $this->_terms[$termId];
+            }
+
+            return $nonProhibited;
+        }
+
+        return $this->_terms;
+    }
+
+    /**
+     * Query specific matches highlighting
+     *
+     * Passes the text of every non-prohibited term to the highlighter.
+     *
+     * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
+     */
+    protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
+    {
+        $words = array();
+
+        if ($this->_signs !== null) {
+            foreach ($this->_signs as $termId => $sign) {
+                if ($sign === false) {
+                    // prohibited terms are not highlighted
+                    continue;
+                }
+                $words[] = $this->_terms[$termId]->text;
+            }
+        } else {
+            // all terms are required
+            foreach ($this->_terms as $requiredTerm) {
+                $words[] = $requiredTerm->text;
+            }
+        }
+
+        $highlighter->highlight($words);
+    }
+
+    /**
+     * Print a query
+     *
+     * Terms are separated by a single space. Required terms are prefixed
+     * with '+', prohibited ones with '-', optional ones have no prefix.
+     * A boost other than 1 is appended as '(...)^boost'.
+     *
+     * @return string
+     */
+    public function __toString()
+    {
+        // It's used only for query visualisation, so we don't care about characters escaping
+
+        $query = '';
+
+        // The separator must not depend on the term id: ids are not
+        // guaranteed to start at 0, which would produce a leading space.
+        $isFirstTerm = true;
+
+        foreach ($this->_terms as $id => $term) {
+            if (!$isFirstTerm) {
+                $query .= ' ';
+            }
+            $isFirstTerm = false;
+
+            if ($this->_signs === null || $this->_signs[$id] === true) {
+                $query .= '+';
+            } else if ($this->_signs[$id] === false) {
+                $query .= '-';
+            }
+
+            if ($term->field !== null) {
+                $query .= $term->field . ':';
+            }
+            $query .= $term->text;
+        }
+
+        if ($this->getBoost() != 1) {
+            $query = '(' . $query . ')^' . round($this->getBoost(), 4);
+        }
+
+        return $query;
+    }
+}
+