git.roojs.org Git - web.mtrack/blob - inc/lib/Zend/Search/Lucene/Analysis/Analyzer.php

   1 <?php
   2 /**
   3  * Zend Framework
   4  *
   5  * LICENSE
   6  *
   7  * This source file is subject to the new BSD license that is bundled
   8  * with this package in the file LICENSE.txt.
   9  * It is also available through the world-wide-web at this URL:
  10  * http://framework.zend.com/license/new-bsd
  11  * If you did not receive a copy of the license and are unable to
  12  * obtain it through the world-wide-web, please send an email
  13  * to license@zend.com so we can send you a copy immediately.
  14  *
  15  * @category   Zend
  16  * @package    Zend_Search_Lucene
  17  * @subpackage Analysis
  18  * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  20  * @version    $Id: Analyzer.php 16541 2009-07-07 06:59:03Z bkarwin $
  21  */
  22
  23
  24 /** Zend_Search_Lucene_Analysis_Token */
  25 require_once 'Zend/Search/Lucene/Analysis/Token.php';
  26
  27 /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
  28 require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
  29
  30 /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
  31 require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';
  32
  33 /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
  34 require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
  35
  36 /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
  37 require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';
  38
  39 /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
  40 require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
  41
  42 /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
  43 require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
  44
  45 /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
  46 require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
  47
  48 /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
  49 require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
  50
  51 /** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */
  52 require_once 'Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php';
  53
  54 /** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */
  55 require_once 'Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php';
  56
  57
  58 /**
  59  * An Analyzer is used to analyze text.
  60  * It thus represents a policy for extracting index terms from text.
  61  *
  62  * Note:
  63  * The Java implementation of Lucene is stream-oriented, which lets it work
  64  * efficiently with huge documents (more than 20 MB).
  65  * This engine, however, is not designed for such documents.
  66  * Thus the Zend_Search_Lucene analysis API works with data strings and sets (arrays).
  67  *
  68  * @category   Zend
  69  * @package    Zend_Search_Lucene
  70  * @subpackage Analysis
  71  * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  72  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  73  */
  74
  75 abstract class Zend_Search_Lucene_Analysis_Analyzer
  76 {
  77     /**
  78      * The Analyzer implementation used by default.
  79      *
  80      * @var Zend_Search_Lucene_Analysis_Analyzer
  81      */
  82     private static $_defaultImpl;
  83
  84     /**
  85      * Input string
  86      *
  87      * @var string
  88      */
  89     protected $_input = null;
  90
  91     /**
  92      * Input string encoding
  93      *
  94      * @var string
  95      */
  96     protected $_encoding = '';
  97
  98     /**
  99      * Tokenize text to a terms
 100      * Returns array of Zend_Search_Lucene_Analysis_Token objects
 101      *
 102      * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
 103      *
 104      * @param string $data
 105      * @return array
 106      */
 107     public function tokenize($data, $encoding = '')
 108     {
 109         $this->setInput($data, $encoding);
 110
 111         $tokenList = array();
 112         while (($nextToken = $this->nextToken()) !== null) {
 113             $tokenList[] = $nextToken;
 114         }
 115
 116         return $tokenList;
 117     }
 118
 119
 120     /**
 121      * Tokenization stream API
 122      * Set input
 123      *
 124      * @param string $data
 125      */
 126     public function setInput($data, $encoding = '')
 127     {
 128         $this->_input    = $data;
 129         $this->_encoding = $encoding;
 130         $this->reset();
 131     }
 132
 133     /**
 134      * Reset token stream
 135      */
 136     abstract public function reset();
 137
 138     /**
 139      * Tokenization stream API
 140      * Get next token
 141      * Returns null at the end of stream
 142      *
 143      * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
 144      *
 145      * @return Zend_Search_Lucene_Analysis_Token|null
 146      */
 147     abstract public function nextToken();
 148
 149
 150
 151
 152     /**
 153      * Set the default Analyzer implementation used by indexing code.
 154      *
 155      * @param Zend_Search_Lucene_Analysis_Analyzer $similarity
 156      */
 157     public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
 158     {
 159         self::$_defaultImpl = $analyzer;
 160     }
 161
 162
 163     /**
 164      * Return the default Analyzer implementation used by indexing code.
 165      *
 166      * @return Zend_Search_Lucene_Analysis_Analyzer
 167      */
 168     public static function getDefault()
 169     {
 170         if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
 171             self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
 172         }
 173
 174         return self::$_defaultImpl;
 175     }
 176 }
 177