final move of files
[web.mtrack] / Zend / Search / Lucene / Search / Query / Preprocessing / Fuzzy.php
diff --git a/Zend/Search/Lucene/Search/Query/Preprocessing/Fuzzy.php b/Zend/Search/Lucene/Search/Query/Preprocessing/Fuzzy.php
new file mode 100644 (file)
index 0000000..eed96cc
--- /dev/null
@@ -0,0 +1,287 @@
+<?php\r
+/**\r
+ * Zend Framework\r
+ *\r
+ * LICENSE\r
+ *\r
+ * This source file is subject to the new BSD license that is bundled\r
+ * with this package in the file LICENSE.txt.\r
+ * It is also available through the world-wide-web at this URL:\r
+ * http://framework.zend.com/license/new-bsd\r
+ * If you did not receive a copy of the license and are unable to\r
+ * obtain it through the world-wide-web, please send an email\r
+ * to license@zend.com so we can send you a copy immediately.\r
+ *\r
+ * @category   Zend\r
+ * @package    Zend_Search_Lucene\r
+ * @subpackage Search\r
+ * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)\r
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License\r
+ * @version    $Id: Fuzzy.php 16971 2009-07-22 18:05:45Z mikaelkael $\r
+ */\r
+\r
+\r
+/** Zend_Search_Lucene_Search_Query_Preprocessing */\r
+require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';\r
+\r
+/** Zend_Search_Lucene_Search_Query_Phrase */\r
+require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';\r
+\r
+/** Zend_Search_Lucene_Search_Query_Insignificant */\r
+require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';\r
+\r
+/** Zend_Search_Lucene_Search_Query_Empty */\r
+require_once 'Zend/Search/Lucene/Search/Query/Empty.php';\r
+\r
+/** Zend_Search_Lucene_Search_Query_Term */\r
+require_once 'Zend/Search/Lucene/Search/Query/Term.php';\r
+\r
+/** Zend_Search_Lucene_Index_Term */\r
+require_once 'Zend/Search/Lucene/Index/Term.php';\r
+\r
+\r
+/**\r
+ * Internal (non-abstract) preprocessing query for fuzzy search; it finalizes query processing after query parsing.\r
+ * This type of query is not actually involved in query execution.\r
+ *\r
+ * @category   Zend\r
+ * @package    Zend_Search_Lucene\r
+ * @subpackage Search\r
+ * @internal\r
+ * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)\r
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License\r
+ */\r
+class Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy extends Zend_Search_Lucene_Search_Query_Preprocessing\r
+{\r
+    /**\r
+     * Word (query parser lexeme) to find.\r
+     *\r
+     * @var string\r
+     */\r
+    private $_word;\r
+\r
+    /**\r
+     * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).\r
+     *\r
+     * @var string\r
+     */\r
+    private $_encoding;\r
+\r
+    /**\r
+     * Field name. When null, rewrite() uses the default search field or every field name reported by the index.\r
+     *\r
+     * @var string|null\r
+     */\r
+    private $_field;\r
+\r
+    /**\r
+     * A value between 0 and 1 to set the required similarity\r
+     *  between the query term and the matching terms. For example, for a\r
+     *  _minimumSimilarity of 0.5 a term of the same length\r
+     *  as the query term is considered similar to the query term if the edit distance\r
+     *  between both terms is less than length(term)*0.5\r
+     *\r
+     * @var float\r
+     */\r
+    private $_minimumSimilarity;\r
+\r
+    /**\r
+     * Class constructor.  Create a new preprocessing object for fuzzy query.\r
+     *\r
+     * @param string      $word               Non-tokenized word (query parser lexeme) to search.\r
+     * @param string      $encoding           Word encoding.\r
+     * @param string|null $fieldName          Field name.\r
+     * @param float       $minimumSimilarity  Minimum similarity.\r
+     */\r
+    public function __construct($word, $encoding, $fieldName, $minimumSimilarity)\r
+    {\r
+        $this->_word              = $word;\r
+        $this->_encoding          = $encoding;\r
+        $this->_field             = $fieldName;\r
+        $this->_minimumSimilarity = $minimumSimilarity;\r
+    }\r
+\r
+    /**\r
+     * Re-write query into primitive queries in the context of specified index\r
+     *\r
+     * Without a field, the word is rewritten once per search field and the results are combined;\r
+     * with a field, it becomes a single fuzzy query. Matched terms are kept in $this->_matches.\r
+     *\r
+     * @param Zend_Search_Lucene_Interface $index\r
+     * @return Zend_Search_Lucene_Search_Query\r
+     * @throws Zend_Search_Lucene_Search_QueryParserException  If the word contains wildcards or is tokenized into several tokens\r
+     */\r
+    public function rewrite(Zend_Search_Lucene_Interface $index)\r
+    {\r
+        if ($this->_field === null) {\r
+            // Loaded on demand, like QueryParserException below: this file's top-level requires do not name them\r
+            require_once 'Zend/Search/Lucene.php';\r
+            require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';\r
+\r
+            $query = new Zend_Search_Lucene_Search_Query_Boolean();\r
+            $hasInsignificantSubqueries = false;\r
+\r
+            if (Zend_Search_Lucene::getDefaultSearchField() === null) {\r
+                $searchFields = $index->getFieldNames(true);\r
+            } else {\r
+                $searchFields = array(Zend_Search_Lucene::getDefaultSearchField());\r
+            }\r
+\r
+            foreach ($searchFields as $fieldName) {\r
+                $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy($this->_word,\r
+                                                                                    $this->_encoding,\r
+                                                                                    $fieldName,\r
+                                                                                    $this->_minimumSimilarity);\r
+\r
+                $rewrittenSubquery = $subquery->rewrite($index);\r
+\r
+                // Insignificant and empty results are left out of the boolean query\r
+                if ($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {\r
+                    $hasInsignificantSubqueries = true;\r
+                } else if (!($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Empty)) {\r
+                    $query->addSubquery($rewrittenSubquery);\r
+                }\r
+            }\r
+\r
+            $subqueries = $query->getSubqueries();\r
+\r
+            if (count($subqueries) == 0) {\r
+                $this->_matches = array();\r
+                if ($hasInsignificantSubqueries) {\r
+                    return new Zend_Search_Lucene_Search_Query_Insignificant();\r
+                } else {\r
+                    return new Zend_Search_Lucene_Search_Query_Empty();\r
+                }\r
+            }\r
+\r
+            if (count($subqueries) == 1) {\r
+                // A single subquery is used directly instead of a one-element boolean query\r
+                $query = reset($subqueries);\r
+            }\r
+\r
+            $query->setBoost($this->getBoost());\r
+\r
+            $this->_matches = $query->getQueryTerms();\r
+            return $query;\r
+        }\r
+\r
+        require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';\r
+\r
+        // -------------------------------------\r
+        // Recognize exact term matching (it corresponds to Keyword fields stored in the index)\r
+        // encoding is not used since we expect binary matching\r
+        $term = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);\r
+        if ($index->hasTerm($term)) {\r
+            $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);\r
+            $query->setBoost($this->getBoost());\r
+\r
+            // Get rewritten query. Important! It also fills terms matching container.\r
+            $rewrittenQuery = $query->rewrite($index);\r
+            $this->_matches = $query->getQueryTerms();\r
+\r
+            return $rewrittenQuery;\r
+        }\r
+\r
+        // -------------------------------------\r
+        // Recognize wildcard queries\r
+\r
+        /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */\r
+        if (@preg_match('/\pL/u', 'a') == 1) {\r
+            $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));\r
+        } else {\r
+            $subPatterns = preg_split('/[*?]/', $this->_word);\r
+        }\r
+        if (count($subPatterns) > 1) {\r
+            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';\r
+            throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search doesn\'t support wildcards (except within Keyword fields).');\r
+        }\r
+\r
+        // -------------------------------------\r
+        // Recognize one-term multi-term and "insignificant" queries\r
+        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';\r
+        $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);\r
+\r
+        if (count($tokens) == 0) {\r
+            $this->_matches = array();\r
+            return new Zend_Search_Lucene_Search_Query_Insignificant();\r
+        }\r
+\r
+        if (count($tokens) == 1) {\r
+            $term  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);\r
+            $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);\r
+            $query->setBoost($this->getBoost());\r
+\r
+            // Get rewritten query. Important! It also fills terms matching container.\r
+            $rewrittenQuery = $query->rewrite($index);\r
+            $this->_matches = $query->getQueryTerms();\r
+\r
+            return $rewrittenQuery;\r
+        }\r
+\r
+        // Word is tokenized into several tokens\r
+        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';\r
+        throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');\r
+    }\r
+\r
+    /**\r
+     * Query specific matches highlighting\r
+     *\r
+     * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)\r
+     */\r
+    protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)\r
+    {\r
+        /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */\r
+\r
+        /** Skip exact term matching recognition, keyword fields highlighting is not supported */\r
+\r
+        // -------------------------------------\r
+        // Recognize wildcard queries\r
+\r
+        /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */\r
+        if (@preg_match('/\pL/u', 'a') == 1) {\r
+            $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));\r
+        } else {\r
+            $subPatterns = preg_split('/[*?]/', $this->_word);\r
+        }\r
+        if (count($subPatterns) > 1) {\r
+            // Do nothing\r
+            return;\r
+        }\r
+\r
+        // -------------------------------------\r
+        // Recognize one-term multi-term and "insignificant" queries\r
+        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';\r
+        $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);\r
+        if (count($tokens) != 1) {\r
+            // Do nothing: the word produced no tokens, or several tokens\r
+            // (fuzzy search is supported only for non-multiple word terms)\r
+            return;\r
+        }\r
+\r
+        require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';\r
+        $term  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);\r
+        $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);\r
+\r
+        $query->_highlightMatches($highlighter);\r
+    }\r
+\r
+    /**\r
+     * Print a query\r
+     *\r
+     * NOTE(review): only field, word and boost are rendered; neither a fuzzy marker nor the\r
+     * minimum similarity appears in the string - confirm this is intended.\r
+     *\r
+     * @return string\r
+     */\r
+    public function __toString()\r
+    {\r
+        // It's used only for query visualisation, so we don't care about characters escaping\r
+        $query = (($this->_field !== null) ? $this->_field . ':' : '') . $this->_word;\r
+\r
+        if ($this->getBoost() != 1) {\r
+            $query .= '^' . round($this->getBoost(), 4);\r
+        }\r
+\r
+        return $query;\r
+    }\r
+}\r