7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
16 * @package Zend_Search_Lucene
17 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
18 * @license http://framework.zend.com/license/new-bsd New BSD License
19 * @version $Id: Lucene.php 17164 2009-07-27 03:59:23Z matthew $
22 /** Zend_Search_Lucene_Document */
23 require_once 'Zend/Search/Lucene/Document.php';
25 /** Zend_Search_Lucene_Document_Html */
26 require_once 'Zend/Search/Lucene/Document/Html.php';
28 /** Zend_Search_Lucene_Document_Docx */
29 require_once 'Zend/Search/Lucene/Document/Docx.php';
31 /** Zend_Search_Lucene_Document_Pptx */
32 require_once 'Zend/Search/Lucene/Document/Pptx.php';
34 /** Zend_Search_Lucene_Document_Xlsx */
35 require_once 'Zend/Search/Lucene/Document/Xlsx.php';
37 /** Zend_Search_Lucene_Storage_Directory_Filesystem */
38 require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php';
40 /** Zend_Search_Lucene_Storage_File_Memory */
41 require_once 'Zend/Search/Lucene/Storage/File/Memory.php';
43 /** Zend_Search_Lucene_Index_Term */
44 require_once 'Zend/Search/Lucene/Index/Term.php';
46 /** Zend_Search_Lucene_Index_TermInfo */
47 require_once 'Zend/Search/Lucene/Index/TermInfo.php';
49 /** Zend_Search_Lucene_Index_SegmentInfo */
50 require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
52 /** Zend_Search_Lucene_Index_FieldInfo */
53 require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
55 /** Zend_Search_Lucene_Index_Writer */
56 require_once 'Zend/Search/Lucene/Index/Writer.php';
58 /** Zend_Search_Lucene_Search_QueryParser */
59 require_once 'Zend/Search/Lucene/Search/QueryParser.php';
61 /** Zend_Search_Lucene_Search_QueryHit */
62 require_once 'Zend/Search/Lucene/Search/QueryHit.php';
64 /** Zend_Search_Lucene_Search_Similarity */
65 require_once 'Zend/Search/Lucene/Search/Similarity.php';
67 /** Zend_Search_Lucene_Index_TermsPriorityQueue */
68 require_once 'Zend/Search/Lucene/Index/TermsPriorityQueue.php';
70 /** Zend_Search_Lucene_TermStreamsPriorityQueue */
71 require_once 'Zend/Search/Lucene/TermStreamsPriorityQueue.php';
73 /** Zend_Search_Lucene_Index_DocsFilter */
74 require_once 'Zend/Search/Lucene/Index/DocsFilter.php';
76 /** Zend_Search_Lucene_LockManager */
77 require_once 'Zend/Search/Lucene/LockManager.php';
79 /** Zend_Search_Lucene_Interface */
80 require_once 'Zend/Search/Lucene/Interface.php';
82 /** Zend_Search_Lucene_Proxy */
83 require_once 'Zend/Search/Lucene/Proxy.php';
87 * @package Zend_Search_Lucene
88 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
89 * @license http://framework.zend.com/license/new-bsd New BSD License
91 class Zend_Search_Lucene implements Zend_Search_Lucene_Interface
/**
 * Default field name for search.
 *
 * Null means search through all fields.
 *
 * @var string|null
 */
private static $_defaultSearchField = null;

/**
 * Result set limit.
 *
 * 0 means no limit.
 *
 * @var integer
 */
private static $_resultSetLimit = 0;

/**
 * Terms per query limit.
 *
 * @var integer
 */
private static $_termsPerQueryLimit = 1024;

/**
 * File system adapter.
 *
 * @var Zend_Search_Lucene_Storage_Directory
 */
private $_directory = null;

/**
 * File system adapter closing option: when true, the directory is closed
 * together with the index.
 *
 * @var boolean
 */
private $_closeDirOnExit = true;

/**
 * Writer for this index, not instantiated unless required.
 *
 * @var Zend_Search_Lucene_Index_Writer
 */
private $_writer = null;

/**
 * Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index,
 * keyed by segment name.
 *
 * @var array Zend_Search_Lucene_Index_SegmentInfo
 */
private $_segmentInfos = array();

/**
 * Number of documents in this index.
 *
 * @var integer
 */
private $_docCount = 0;

/**
 * Flag for index changes.
 *
 * @var boolean
 */
private $_hasChanges = false;

/**
 * Signal, that index is already closed, changes are fixed and resources are cleaned up.
 *
 * @var boolean
 */
private $_closed = false;

/**
 * Number of references to the index object.
 *
 * @var integer
 */
private $_refCount = 0;

/**
 * Current segment generation.
 *
 * @var integer
 */
private $_generation;

/** Supported index format versions */
const FORMAT_PRE_2_1 = 0;
const FORMAT_2_1     = 1;
const FORMAT_2_3     = 2;

/**
 * Index format version (one of the FORMAT_* constants).
 *
 * @var integer
 */
private $_formatVersion;
/**
 * Create a new index in the given directory and return it wrapped in a proxy.
 *
 * @param mixed $directory
 * @return Zend_Search_Lucene_Interface
 */
public static function create($directory)
{
    $index = new Zend_Search_Lucene($directory, true);

    return new Zend_Search_Lucene_Proxy($index);
}
/**
 * Open an existing index in the given directory and return it wrapped in a proxy.
 *
 * @param mixed $directory
 * @return Zend_Search_Lucene_Interface
 */
public static function open($directory)
{
    $index = new Zend_Search_Lucene($directory, false);

    return new Zend_Search_Lucene_Proxy($index);
}
/** Number of attempts made to read a consistent generation value */
const GENERATION_RETRIEVE_COUNT = 10;

/** Pause between generation retrieving attempts, in milliseconds */
const GENERATION_RETRIEVE_PAUSE = 50;
/**
 * Get current generation number.
 *
 * Returns generation number:
 *  0 means pre-2.1 index format,
 * -1 means there are no segments files.
 *
 * The generation is read from the segments.gen file. Apache Lucene index
 * format documentation mentions this method only as a fallback, but it is
 * used here for performance reasons.
 *
 * @todo check if we can use some modification of Apache Lucene generation
 *       determination algorithm without performance problems
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @return integer
 * @throws Zend_Search_Lucene_Exception
 */
public static function getActualGeneration(Zend_Search_Lucene_Storage_Directory $directory)
{
    require_once 'Zend/Search/Lucene/Exception.php';

    try {
        for ($attempt = 0; $attempt < self::GENERATION_RETRIEVE_COUNT; $attempt++) {
            // Try to get generation file
            $genFile = $directory->getFileObject('segments.gen', false);

            if ($genFile->readInt() != (int)0xFFFFFFFE) {
                throw new Zend_Search_Lucene_Exception('Wrong segments.gen file format');
            }

            // The generation is stored twice; equal values mean the file is consistent
            $firstCopy  = $genFile->readLong();
            $secondCopy = $genFile->readLong();

            if ($firstCopy == $secondCopy) {
                return $firstCopy;
            }

            usleep(self::GENERATION_RETRIEVE_PAUSE * 1000);
        }

        // All passes are failed
        throw new Zend_Search_Lucene_Exception('Index is under processing now');
    } catch (Zend_Search_Lucene_Exception $e) {
        if (strpos($e->getMessage(), 'is not readable') === false) {
            throw $e;
        }

        try {
            // Try to open old style segments file
            $directory->getFileObject('segments', false);

            // It's pre-2.1 index
            return 0;
        } catch (Zend_Search_Lucene_Exception $e) {
            if (strpos($e->getMessage(), 'is not readable') === false) {
                throw $e;
            }

            return -1;
        }
    }

    return -1;
}
/**
 * Get segments file name.
 *
 * Generation 0 maps to the plain 'segments' file; any other generation is
 * appended to 'segments_' in base 36.
 *
 * @param integer $generation
 * @return string
 */
public static function getSegmentFileName($generation)
{
    return ($generation == 0)
        ? 'segments'
        : 'segments_' . base_convert($generation, 10, 36);
}
/**
 * Get index format version.
 *
 * @return integer One of the FORMAT_* constants
 */
public function getFormatVersion()
{
    return $this->_formatVersion;
}
/**
 * Set index format version.
 * Index is converted to this format at the nearest update time.
 *
 * @param int $formatVersion One of the FORMAT_* constants
 * @throws Zend_Search_Lucene_Exception If the version is not supported
 */
public function setFormatVersion($formatVersion)
{
    $supported = array(self::FORMAT_PRE_2_1, self::FORMAT_2_1, self::FORMAT_2_3);

    // Loose comparison on purpose: matches the original `!=` checks
    if (!in_array($formatVersion, $supported)) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Unsupported index format');
    }

    $this->_formatVersion = $formatVersion;
}
/**
 * Read segments file for pre-2.1 Lucene index format.
 *
 * Fills $_segmentInfos and $_docCount from the 'segments' file.
 *
 * @throws Zend_Search_Lucene_Exception
 */
private function _readPre21SegmentsFile()
{
    $file = $this->_directory->getFileObject('segments');

    if ($file->readInt() != (int)0xFFFFFFFF) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong segments file format');
    }

    // read version (value is not used)
    $file->readLong();

    // read segment name counter (value is not used)
    $file->readInt();

    $segmentCount = $file->readInt();

    $this->_docCount = 0;

    // read segmentInfos
    for ($i = 0; $i < $segmentCount; $i++) {
        $segName = $file->readString();
        $segSize = $file->readInt();

        $this->_docCount += $segSize;
        $this->_segmentInfos[$segName] = new Zend_Search_Lucene_Index_SegmentInfo(
            $this->_directory,
            $segName,
            $segSize
        );
    }

    // Use 2.1 as a target version. Index will be reorganized at update time.
    $this->_formatVersion = self::FORMAT_2_1;
}
/**
 * Read segments file (2.1+ index formats).
 *
 * Detects the format version from the file marker, then fills
 * $_segmentInfos and $_docCount.
 *
 * @throws Zend_Search_Lucene_Exception
 */
private function _readSegmentsFile()
{
    $file = $this->_directory->getFileObject(self::getSegmentFileName($this->_generation));

    $format = $file->readInt();

    if ($format == (int)0xFFFFFFFC) {
        $this->_formatVersion = self::FORMAT_2_3;
    } else if ($format == (int)0xFFFFFFFD) {
        $this->_formatVersion = self::FORMAT_2_1;
    } else {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Unsupported segments file format');
    }

    // read version (value is not used)
    $file->readLong();

    // read segment name counter (value is not used)
    $file->readInt();

    $segmentCount = $file->readInt();

    $this->_docCount = 0;

    // read segmentInfos
    for ($i = 0; $i < $segmentCount; $i++) {
        $segName = $file->readString();
        $segSize = $file->readInt();

        // 2.1+ specific properties
        $delGen = $file->readLong();

        // Shared doc store description is only present in the 2.3 format
        $docStoreOptions = null;
        if ($this->_formatVersion == self::FORMAT_2_3) {
            $docStoreOffset = $file->readInt();

            if ($docStoreOffset != (int)0xFFFFFFFF) {
                $docStoreSegment        = $file->readString();
                $docStoreIsCompoundFile = $file->readByte();

                $docStoreOptions = array(
                    'offset'     => $docStoreOffset,
                    'segment'    => $docStoreSegment,
                    'isCompound' => ($docStoreIsCompoundFile == 1),
                );
            }
        }

        $hasSingleNormFile = $file->readByte();
        $numField          = $file->readInt();

        $normGens = array();
        if ($numField != (int)0xFFFFFFFF) {
            for ($j = 0; $j < $numField; $j++) {
                $normGens[] = $file->readLong();
            }

            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Separate norm files are not supported. Optimize index to use it with Zend_Search_Lucene.');
        }

        $isCompoundByte = $file->readByte();

        if ($isCompoundByte == 0xFF) {
            // The segment is not a compound file
            $isCompound = false;
        } else if ($isCompoundByte == 0x00) {
            // The status is unknown
            $isCompound = null;
        } else if ($isCompoundByte == 0x01) {
            // The segment is a compound file
            $isCompound = true;
        }

        $this->_docCount += $segSize;

        $this->_segmentInfos[$segName] = new Zend_Search_Lucene_Index_SegmentInfo(
            $this->_directory,
            $segName,
            $segSize,
            $delGen,
            $docStoreOptions,
            $hasSingleNormFile,
            $isCompound
        );
    }
}
/**
 * Opens the index.
 *
 * IndexReader constructor needs Directory as a parameter. It should be
 * a string with a path to the index folder or a Directory object.
 *
 * @param mixed   $directory Path string or Zend_Search_Lucene_Storage_Directory_Filesystem object
 * @param boolean $create    When true, a new (empty) index generation is created
 * @throws Zend_Search_Lucene_Exception
 */
public function __construct($directory = null, $create = false)
{
    if ($directory === null) {
        require_once 'Zend/Search/Lucene/Exception.php';
        // Throw the Lucene-specific exception (it is the class loaded above and
        // the one documented in @throws); it extends Zend_Search_Exception, so
        // callers catching the parent type are unaffected.
        throw new Zend_Search_Lucene_Exception('No index directory specified');
    }

    if ($directory instanceof Zend_Search_Lucene_Storage_Directory_Filesystem) {
        // Caller owns the directory object: do not close it with the index
        $this->_directory      = $directory;
        $this->_closeDirOnExit = false;
    } else {
        $this->_directory      = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory);
        $this->_closeDirOnExit = true;
    }

    $this->_segmentInfos = array();

    // Mark index as "under processing" to prevent other processes from premature index cleaning
    Zend_Search_Lucene_LockManager::obtainReadLock($this->_directory);

    $this->_generation = self::getActualGeneration($this->_directory);

    if ($create) {
        require_once 'Zend/Search/Lucene/Exception.php';
        try {
            Zend_Search_Lucene_LockManager::obtainWriteLock($this->_directory);
        } catch (Zend_Search_Lucene_Exception $e) {
            Zend_Search_Lucene_LockManager::releaseReadLock($this->_directory);

            if (strpos($e->getMessage(), 'Can\'t obtain exclusive index lock') === false) {
                throw $e;
            } else {
                throw new Zend_Search_Lucene_Exception('Can\'t create index. It\'s under processing now');
            }
        }

        if ($this->_generation == -1) {
            // Directory doesn't contain existing index, start from 1
            $this->_generation = 1;
            $nameCounter = 0;
        } else {
            // Directory contains existing index
            $segmentsFile = $this->_directory->getFileObject(self::getSegmentFileName($this->_generation));
            $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)

            $nameCounter = $segmentsFile->readInt();
            $this->_generation++;
        }

        Zend_Search_Lucene_Index_Writer::createIndex($this->_directory, $this->_generation, $nameCounter);

        Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
    }

    if ($this->_generation == -1) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Index doesn\'t exists in the specified directory.');
    } else if ($this->_generation == 0) {
        $this->_readPre21SegmentsFile();
    } else {
        $this->_readSegmentsFile();
    }
}
/**
 * Close current index and free resources.
 *
 * Commits pending changes, releases the read lock, closes the directory
 * if this object owns it, and drops internal references. Does nothing
 * when the index is already closed.
 */
private function _close()
{
    if ($this->_closed) {
        // index is already closed and resources are cleaned up
        return;
    }

    $this->commit();

    // Release "under processing" flag
    Zend_Search_Lucene_LockManager::releaseReadLock($this->_directory);

    if ($this->_closeDirOnExit) {
        $this->_directory->close();
    }

    $this->_directory    = null;
    $this->_writer       = null;
    $this->_segmentInfos = null;

    $this->_closed = true;
}
/**
 * Add reference to the index object.
 *
 * @internal
 */
public function addReference()
{
    $this->_refCount++;
}
/**
 * Remove reference from the index object.
 *
 * When reference count becomes zero, index is closed and resources are cleaned up.
 *
 * @internal
 */
public function removeReference()
{
    if (--$this->_refCount == 0) {
        $this->_close();
    }
}
/**
 * Object destructor: closes the index if it is still open.
 */
public function __destruct()
{
    $this->_close();
}
/**
 * Returns an instance of Zend_Search_Lucene_Index_Writer for the index.
 *
 * The writer is created on first use and cached afterwards.
 *
 * @return Zend_Search_Lucene_Index_Writer
 */
private function _getIndexWriter()
{
    if ($this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
        return $this->_writer;
    }

    $this->_writer = new Zend_Search_Lucene_Index_Writer(
        $this->_directory,
        $this->_segmentInfos,
        $this->_formatVersion
    );

    return $this->_writer;
}
/**
 * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
 *
 * @return Zend_Search_Lucene_Storage_Directory
 */
public function getDirectory()
{
    return $this->_directory;
}
/**
 * Returns the total number of documents in this index (including deleted documents).
 *
 * @return integer
 */
public function count()
{
    return $this->_docCount;
}
/**
 * Returns one greater than the largest possible document number.
 * This may be used to, e.g., determine how big to allocate a structure which will have
 * an element for every document number in an index.
 *
 * Delegates to count().
 *
 * @return integer
 */
public function maxDoc()
{
    return $this->count();
}
/**
 * Returns the total number of non-deleted documents in this index.
 *
 * @return integer Sum of numDocs() over all segments
 */
public function numDocs()
{
    $total = 0;

    foreach ($this->_segmentInfos as $segment) {
        $total += $segment->numDocs();
    }

    return $total;
}
/**
 * Checks, that document is deleted.
 *
 * @param integer $id Index-wide document id
 * @return boolean
 * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
 */
public function isDeleted($id)
{
    if ($id >= $this->_docCount) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
    }

    // Walk segments until the one whose id range contains $id is found
    $segmentStartId = 0;
    foreach ($this->_segmentInfos as $segmentInfo) {
        $segmentEndId = $segmentStartId + $segmentInfo->count();
        if ($segmentEndId > $id) {
            break;
        }

        $segmentStartId = $segmentEndId;
    }

    // Translate the index-wide id into a segment-local id
    return $segmentInfo->isDeleted($id - $segmentStartId);
}
/**
 * Set default search field.
 *
 * Null means, that search is performed through all fields by default.
 *
 * Default value is null.
 *
 * @param string $fieldName
 */
public static function setDefaultSearchField($fieldName)
{
    self::$_defaultSearchField = $fieldName;
}
/**
 * Get default search field.
 *
 * Null means, that search is performed through all fields by default.
 *
 * @return string|null
 */
public static function getDefaultSearchField()
{
    return self::$_defaultSearchField;
}
/**
 * Set result set limit.
 *
 * 0 (default) means no limit.
 *
 * @param integer $limit
 */
public static function setResultSetLimit($limit)
{
    self::$_resultSetLimit = $limit;
}
/**
 * Get result set limit.
 *
 * 0 means no limit.
 *
 * @return integer
 */
public static function getResultSetLimit()
{
    return self::$_resultSetLimit;
}
/**
 * Set terms per query limit.
 *
 * The initial value is 1024.
 *
 * @param integer $limit
 */
public static function setTermsPerQueryLimit($limit)
{
    self::$_termsPerQueryLimit = $limit;
}
/**
 * Get terms per query limit.
 *
 * The initial value is 1024.
 * NOTE(review): the previous docblock said "0 (default) means no limit",
 * which contradicts the declared default; whether 0 disables the limit
 * depends on the query code that consumes this value — confirm there.
 *
 * @return integer
 */
public static function getTermsPerQueryLimit()
{
    return self::$_termsPerQueryLimit;
}
/**
 * Retrieve index maxBufferedDocs option.
 *
 * maxBufferedDocs is a minimal number of documents required before
 * the buffered in-memory documents are written into a new Segment.
 *
 * Default value is 10.
 *
 * @return integer
 */
public function getMaxBufferedDocs()
{
    $writer = $this->_getIndexWriter();

    return $writer->maxBufferedDocs;
}
/**
 * Set index maxBufferedDocs option.
 *
 * maxBufferedDocs is a minimal number of documents required before
 * the buffered in-memory documents are written into a new Segment.
 *
 * Default value is 10.
 *
 * @param integer $maxBufferedDocs
 */
public function setMaxBufferedDocs($maxBufferedDocs)
{
    $writer = $this->_getIndexWriter();
    $writer->maxBufferedDocs = $maxBufferedDocs;
}
/**
 * Retrieve index maxMergeDocs option.
 *
 * maxMergeDocs is a largest number of documents ever merged by addDocument().
 * Small values (e.g., less than 10,000) are best for interactive indexing,
 * as this limits the length of pauses while indexing to a few seconds.
 * Larger values are best for batched indexing and speedier searches.
 *
 * Default value is PHP_INT_MAX.
 *
 * @return integer
 */
public function getMaxMergeDocs()
{
    $writer = $this->_getIndexWriter();

    return $writer->maxMergeDocs;
}
/**
 * Set index maxMergeDocs option.
 *
 * maxMergeDocs is a largest number of documents ever merged by addDocument().
 * Small values (e.g., less than 10,000) are best for interactive indexing,
 * as this limits the length of pauses while indexing to a few seconds.
 * Larger values are best for batched indexing and speedier searches.
 *
 * Default value is PHP_INT_MAX.
 *
 * @param integer $maxMergeDocs
 */
public function setMaxMergeDocs($maxMergeDocs)
{
    $writer = $this->_getIndexWriter();
    $writer->maxMergeDocs = $maxMergeDocs;
}
/**
 * Retrieve index mergeFactor option.
 *
 * mergeFactor determines how often segment indices are merged by addDocument().
 * With smaller values, less RAM is used while indexing,
 * and searches on unoptimized indices are faster,
 * but indexing speed is slower.
 * With larger values, more RAM is used during indexing,
 * and while searches on unoptimized indices are slower,
 * indexing is faster.
 * Thus larger values (> 10) are best for batch index creation,
 * and smaller values (< 10) for indices that are interactively maintained.
 *
 * Default value is 10.
 *
 * @return integer
 */
public function getMergeFactor()
{
    $writer = $this->_getIndexWriter();

    return $writer->mergeFactor;
}
/**
 * Set index mergeFactor option.
 *
 * mergeFactor determines how often segment indices are merged by addDocument().
 * With smaller values, less RAM is used while indexing,
 * and searches on unoptimized indices are faster,
 * but indexing speed is slower.
 * With larger values, more RAM is used during indexing,
 * and while searches on unoptimized indices are slower,
 * indexing is faster.
 * Thus larger values (> 10) are best for batch index creation,
 * and smaller values (< 10) for indices that are interactively maintained.
 *
 * Default value is 10.
 *
 * @param integer $mergeFactor
 */
public function setMergeFactor($mergeFactor)
{
    $writer = $this->_getIndexWriter();
    $writer->mergeFactor = $mergeFactor;
}
/**
 * Performs a query against the index and returns an array
 * of Zend_Search_Lucene_Search_QueryHit objects.
 * Input is a string or Zend_Search_Lucene_Search_Query.
 *
 * Additional arguments may follow $query to sort by stored field values:
 * each field name (string) may be followed by up to two integer
 * array_multisort() flags (sort order and/or sort type). Without extra
 * arguments hits are sorted by score (descending), then by id.
 *
 * @param mixed $query
 * @return array Zend_Search_Lucene_Search_QueryHit
 * @throws Zend_Search_Lucene_Exception
 */
public function find($query)
{
    if (is_string($query)) {
        $query = Zend_Search_Lucene_Search_QueryParser::parse($query);
    }

    if (!$query instanceof Zend_Search_Lucene_Search_Query) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object');
    }

    // Make pending changes visible to the search
    $this->commit();

    $hits   = array();
    $scores = array();
    $ids    = array();

    $query = $query->rewrite($this)->optimize($this);

    $query->execute($this);

    $topScore = 0;

    foreach ($query->matchedDocs() as $id => $num) {
        $docScore = $query->score($id, $this);
        if ($docScore != 0) {
            $hit = new Zend_Search_Lucene_Search_QueryHit($this);
            $hit->id    = $id;
            $hit->score = $docScore;

            $hits[]   = $hit;
            $ids[]    = $id;
            $scores[] = $docScore;

            if ($docScore > $topScore) {
                $topScore = $docScore;
            }
        }

        if (self::$_resultSetLimit != 0 && count($hits) >= self::$_resultSetLimit) {
            break;
        }
    }

    if (count($hits) == 0) {
        // skip sorting, which may cause a error on empty index
        return array();
    }

    // Normalize scores so that the best hit never exceeds 1
    if ($topScore > 1) {
        foreach ($hits as $hit) {
            $hit->score /= $topScore;
        }
    }

    if (func_num_args() == 1) {
        // sort by scores
        array_multisort($scores, SORT_DESC, SORT_NUMERIC,
                        $ids,    SORT_ASC,  SORT_NUMERIC,
                        $hits);
    } else {
        // sort by given field names

        $argList    = func_get_args();
        $fieldNames = $this->getFieldNames();
        $sortArgs   = array();

        // PHP 5.3 now expects all arguments to array_multisort be passed by
        // reference; since constants can't be passed by reference, create
        // some placeholder variables.
        $sortReg    = SORT_REGULAR;
        $sortAsc    = SORT_ASC;
        $sortNum    = SORT_NUMERIC;

        require_once 'Zend/Search/Lucene/Exception.php';
        for ($count = 1; $count < count($argList); $count++) {
            $fieldName = $argList[$count];

            if (!is_string($fieldName)) {
                throw new Zend_Search_Lucene_Exception('Field name must be a string.');
            }

            if (!in_array($fieldName, $fieldNames)) {
                throw new Zend_Search_Lucene_Exception('Wrong field name.');
            }

            $valuesArray = array();
            foreach ($hits as $hit) {
                try {
                    $value = $hit->getDocument()->getFieldValue($fieldName);
                } catch (Zend_Search_Lucene_Exception $e) {
                    if (strpos($e->getMessage(), 'not found') === false) {
                        throw $e;
                    } else {
                        // Document has no such field: sort it as null
                        $value = null;
                    }
                }

                $valuesArray[] = $value;
            }

            $sortArgs[] = &$valuesArray;
            // Break the reference binding. Without this, the next iteration's
            // "$valuesArray = array()" would overwrite the array just stored in
            // $sortArgs, so every sort field would alias the last field's values.
            unset($valuesArray);

            if ($count + 1 < count($argList) && is_integer($argList[$count+1])) {
                // First optional flag following the field name
                $count++;
                $sortArgs[] = &$argList[$count];

                if ($count + 1 < count($argList) && is_integer($argList[$count+1])) {
                    // Second optional flag
                    $count++;
                    $sortArgs[] = &$argList[$count];
                } else {
                    // Only one flag given: supply the default for the other one
                    if ($argList[$count] == SORT_ASC || $argList[$count] == SORT_DESC) {
                        $sortArgs[] = &$sortReg;
                    } else {
                        $sortArgs[] = &$sortAsc;
                    }
                }
            } else {
                // No flags given: ascending, regular comparison
                $sortArgs[] = &$sortAsc;
                $sortArgs[] = &$sortReg;
            }
        }

        // Sort by id's if values are equal
        $sortArgs[] = &$ids;
        $sortArgs[] = &$sortAsc;
        $sortArgs[] = &$sortNum;

        // Array to be sorted
        $sortArgs[] = &$hits;

        // Do sort
        call_user_func_array('array_multisort', $sortArgs);
    }

    return $hits;
}
/**
 * Returns a list of all unique field names that exist in this index.
 *
 * The result is built by array_merge()-ing getFields($indexed) of every segment.
 *
 * @param boolean $indexed Passed through to each segment's getFields()
 * @return array
 */
public function getFieldNames($indexed = false)
{
    $names = array();

    foreach ($this->_segmentInfos as $segment) {
        $names = array_merge($names, $segment->getFields($indexed));
    }

    return $names;
}
/**
 * Returns a Zend_Search_Lucene_Document object for the document
 * number $id in this index.
 *
 * @param integer|Zend_Search_Lucene_Search_QueryHit $id
 * @return Zend_Search_Lucene_Document
 * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
 */
public function getDocument($id)
{
    if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
        /* @var $id Zend_Search_Lucene_Search_QueryHit */
        $id = $id->id;
    }

    if ($id >= $this->_docCount) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
    }

    // Locate the segment whose id range contains $id
    $segmentStartId = 0;
    foreach ($this->_segmentInfos as $segmentInfo) {
        $segmentEndId = $segmentStartId + $segmentInfo->count();
        if ($segmentEndId > $id) {
            break;
        }

        $segmentStartId = $segmentEndId;
    }

    $localId = $id - $segmentStartId;

    // .fdx holds one 8-byte pointer per document into the .fdt file
    $fdxFile = $segmentInfo->openCompoundFile('.fdx');
    $fdxFile->seek($localId * 8, SEEK_CUR);
    $fieldValuesPosition = $fdxFile->readLong();

    $fdtFile = $segmentInfo->openCompoundFile('.fdt');
    $fdtFile->seek($fieldValuesPosition, SEEK_CUR);
    $fieldCount = $fdtFile->readVInt();

    $doc = new Zend_Search_Lucene_Document();
    for ($fieldIdx = 0; $fieldIdx < $fieldCount; $fieldIdx++) {
        $fieldNum = $fdtFile->readVInt();
        $bits     = $fdtFile->readByte();

        $fieldInfo = $segmentInfo->getField($fieldNum);

        if (!($bits & 2)) { // Text data
            $field = new Zend_Search_Lucene_Field(
                $fieldInfo->name,
                $fdtFile->readString(),
                'UTF-8',
                true,
                $fieldInfo->isIndexed,
                $bits & 1
            );
        } else {            // Binary data
            $field = new Zend_Search_Lucene_Field(
                $fieldInfo->name,
                $fdtFile->readBinary(),
                '',
                true,
                $fieldInfo->isIndexed,
                $bits & 1,
                true
            );
        }

        $doc->addField($field);
    }

    return $doc;
}
/**
 * Returns true if index contain documents with specified term.
 *
 * Is used for query optimization.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return boolean
 */
public function hasTerm(Zend_Search_Lucene_Index_Term $term)
{
    foreach ($this->_segmentInfos as $segment) {
        $termInfo = $segment->getTermInfo($term);

        if ($termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
            return true;
        }
    }

    return false;
}
/**
 * Returns IDs of all documents containing term.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array
 */
public function termDocs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
    $subResults        = array();
    $segmentStartDocId = 0;

    foreach ($this->_segmentInfos as $segmentInfo) {
        $subResults[] = $segmentInfo->termDocs($term, $segmentStartDocId, $docsFilter);

        $segmentStartDocId += $segmentInfo->count();
    }

    if (count($subResults) == 0) {
        return array();
    } else if (count($subResults) == 1) {
        // Index is optimized (only one segment)
        // Do not perform array reindexing
        // (this branch previously repeated the "== 0" test and was unreachable)
        return reset($subResults);
    } else {
        $result = call_user_func_array('array_merge', $subResults);
    }

    return $result;
}
/**
 * Returns documents filter for all documents containing term.
 *
 * It performs the same operation as termDocs.
 * NOTE(review): this implementation returns the merged array of per-segment
 * termDocs() results, not a Zend_Search_Lucene_Index_DocsFilter object as
 * the name suggests; the return shape is kept unchanged for callers.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array
 */
public function termDocsFilter(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
    // Initialise explicitly: with no segments the loop never runs and
    // count() would otherwise be called on an undefined variable.
    $subResults        = array();
    $segmentStartDocId = 0;

    foreach ($this->_segmentInfos as $segmentInfo) {
        $subResults[] = $segmentInfo->termDocs($term, $segmentStartDocId, $docsFilter);

        $segmentStartDocId += $segmentInfo->count();
    }

    if (count($subResults) == 0) {
        return array();
    } else if (count($subResults) == 1) {
        // Index is optimized (only one segment)
        // Do not perform array reindexing
        // (this branch previously repeated the "== 0" test and was unreachable)
        return reset($subResults);
    } else {
        $result = call_user_func_array('array_merge', $subResults);
    }

    return $result;
}
/**
 * Returns an array of all term freqs.
 * Result array structure: array(docId => freq, ...)
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array
 */
public function termFreqs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
    $freqs    = array();
    $docIdBase = 0;

    foreach ($this->_segmentInfos as $segment) {
        // Array union keeps the docId keys returned by each segment
        $freqs += $segment->termFreqs($term, $docIdBase, $docsFilter);

        $docIdBase += $segment->count();
    }

    return $freqs;
}
/**
 * Returns an array of all term positions in the documents.
 * Result array structure: array(docId => array(pos1, pos2, ...), ...)
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array
 */
public function termPositions(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
    $positions = array();
    $docIdBase = 0;

    foreach ($this->_segmentInfos as $segment) {
        // Array union keeps the docId keys returned by each segment
        $positions += $segment->termPositions($term, $docIdBase, $docsFilter);

        $docIdBase += $segment->count();
    }

    return $positions;
}
/**
 * Returns the number of documents in this index containing the $term.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return integer Sum of docFreq over all segments that know the term
 */
public function docFreq(Zend_Search_Lucene_Index_Term $term)
{
    $total = 0;

    foreach ($this->_segmentInfos as $segment) {
        $termInfo = $segment->getTermInfo($term);
        if ($termInfo === null) {
            continue;
        }

        $total += $termInfo->docFreq;
    }

    return $total;
}
/**
 * Retrieve similarity used by index reader.
 *
 * @return Zend_Search_Lucene_Search_Similarity
 */
public function getSimilarity()
{
    return Zend_Search_Lucene_Search_Similarity::getDefault();
}
/**
 * Returns a normalization factor for "field, document" pair.
 *
 * @param integer $id
 * @param string $fieldName
 * @return float|null Null if $id is out of range, 0 for a deleted document
 */
public function norm($id, $fieldName)
{
    if ($id >= $this->_docCount) {
        return null;
    }

    // Locate the segment whose id range contains $id
    $segmentStartId = 0;
    foreach ($this->_segmentInfos as $segInfo) {
        $segmentEndId = $segmentStartId + $segInfo->count();
        if ($segmentEndId > $id) {
            break;
        }

        $segmentStartId = $segmentEndId;
    }

    $localId = $id - $segmentStartId;

    if ($segInfo->isDeleted($localId)) {
        return 0;
    }

    return $segInfo->norm($localId, $fieldName);
}
/**
 * Returns true if any documents have been deleted from this index.
 *
 * @return boolean
 */
public function hasDeletions()
{
    foreach ($this->_segmentInfos as $segment) {
        if ($segment->hasDeletions()) {
            return true;
        }
    }

    return false;
}
/**
 * Deletes a document from the index.
 * $id is an internal document id
 *
 * @param integer|Zend_Search_Lucene_Search_QueryHit $id
 * @throws Zend_Search_Lucene_Exception
 */
public function delete($id)
{
    if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
        /* @var $id Zend_Search_Lucene_Search_QueryHit */
        $id = $id->id;
    }

    if ($id >= $this->_docCount) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
    }

    // Locate the segment whose id range contains $id
    $segmentStartId = 0;
    foreach ($this->_segmentInfos as $segmentInfo) {
        $segmentEndId = $segmentStartId + $segmentInfo->count();
        if ($segmentEndId > $id) {
            break;
        }

        $segmentStartId = $segmentEndId;
    }

    $segmentInfo->delete($id - $segmentStartId);

    // Remember that there is something to commit
    $this->_hasChanges = true;
}
/**
 * Adds a document to this index.
 *
 * @param Zend_Search_Lucene_Document $document
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    $writer = $this->_getIndexWriter();
    $writer->addDocument($document);

    $this->_docCount++;

    // Remember that there is something to commit
    $this->_hasChanges = true;
}
/**
 * Update document counter: recompute $_docCount as the sum of count()
 * over all current segments.
 */
private function _updateDocCount()
{
    $this->_docCount = 0;

    foreach ($this->_segmentInfos as $segment) {
        $this->_docCount += $segment->count();
    }
}
/**
 * Commit changes resulting from delete() or undeleteAll() operations.
 *
 * Does nothing when no changes are pending.
 *
 * @todo undeleteAll processing.
 */
public function commit()
{
    if (!$this->_hasChanges) {
        return;
    }

    $this->_getIndexWriter()->commit();

    $this->_updateDocCount();

    $this->_hasChanges = false;
}
/**
 * Optimize index.
 *
 * Merges all segments into one
 */
public function optimize()
{
    // Commit changes if any changes have been made
    $this->commit();

    $needsOptimization = count($this->_segmentInfos) > 1 || $this->hasDeletions();

    if ($needsOptimization) {
        $this->_getIndexWriter()->optimize();
        $this->_updateDocCount();
    }
}
/**
 * Returns an array of all terms in this index.
 *
 * Terms streams of all segments are merged through a priority queue;
 * a term is emitted only once even if several segments contain it.
 *
 * @return array
 */
public function terms()
{
    $terms = array();

    $queue = new Zend_Search_Lucene_Index_TermsPriorityQueue();

    foreach ($this->_segmentInfos as $segment) {
        $segment->resetTermsStream();

        // Skip "empty" segments
        if ($segment->currentTerm() !== null) {
            $queue->put($segment);
        }
    }

    while (($segment = $queue->pop()) !== null) {
        $next = $queue->top();

        // Emit the term unless the next queued segment is positioned on the same one
        if ($next === null ||
            $next->currentTerm()->key() != $segment->currentTerm()->key()) {
            // We got new term
            $terms[] = $segment->currentTerm();
        }

        if ($segment->nextTerm() !== null) {
            // Put segment back into the priority queue
            $queue->put($segment);
        }
    }

    return $terms;
}
/**
 * Terms stream priority queue object.
 *
 * Null until resetTermsStream() is called, and again after closeTermsStream().
 *
 * @var Zend_Search_Lucene_TermStreamsPriorityQueue
 */
private $_termsStream = null;
/**
 * Reset terms stream.
 *
 * Creates the stream over all segments on first use; afterwards resets
 * the existing stream.
 */
public function resetTermsStream()
{
    if ($this->_termsStream !== null) {
        $this->_termsStream->resetTermsStream();
        return;
    }

    $this->_termsStream = new Zend_Search_Lucene_TermStreamsPriorityQueue($this->_segmentInfos);
}
/**
 * Skip terms stream up to specified term prefix.
 *
 * Prefix contains fully specified field info and portion of searched term.
 * The terms stream must have been initialised with resetTermsStream() first.
 *
 * @param Zend_Search_Lucene_Index_Term $prefix
 */
public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
{
    $this->_termsStream->skipTo($prefix);
}
/**
 * Scans terms dictionary and returns next term.
 *
 * The terms stream must have been initialised with resetTermsStream() first.
 *
 * @return Zend_Search_Lucene_Index_Term|null
 */
public function nextTerm()
{
    return $this->_termsStream->nextTerm();
}
/**
 * Returns term in current position.
 *
 * The terms stream must have been initialised with resetTermsStream() first.
 *
 * @return Zend_Search_Lucene_Index_Term|null
 */
public function currentTerm()
{
    return $this->_termsStream->currentTerm();
}
/**
 * Close terms stream.
 *
 * Should be used for resources clean up if stream is not read up to the end.
 * The stream object is dropped afterwards.
 */
public function closeTermsStream()
{
    $stream = $this->_termsStream;
    $stream->closeTermsStream();

    $this->_termsStream = null;
}
1510 /*************************************************************************
1512 *************************************************************************/
1514 * Undeletes all documents currently marked as deleted in this index.
1516 * @todo Implementation
1518 public function undeleteAll()