7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
16 * @package Zend_Search_Lucene
18 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
20 * @version $Id: SegmentInfo.php 16541 2009-07-07 06:59:03Z bkarwin $
23 /** Zend_Search_Lucene_Index_DictionaryLoader */
24 require_once 'Zend/Search/Lucene/Index/DictionaryLoader.php';
26 /** Zend_Search_Lucene_Index_DocsFilter */
27 require_once 'Zend/Search/Lucene/Index/DocsFilter.php';
29 /** Zend_Search_Lucene_Index_TermsStream_Interface */
30 require_once 'Zend/Search/Lucene/Index/TermsStream/Interface.php';
34 * @package Zend_Search_Lucene
36 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
37 * @license http://framework.zend.com/license/new-bsd New BSD License
39 class Zend_Search_Lucene_Index_SegmentInfo implements Zend_Search_Lucene_Index_TermsStream_Interface
42 * "Full scan vs fetch" boundary.
44 * If filter selectivity is less than this value, then full scan is performed
45 * (since term entries fetching has some additional overhead).
47 const FULL_SCAN_VS_FETCH_BOUNDARY = 5;
50 * Number of docs in a segment
64 * Term Dictionary Index
66 * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because
67 * of performance considerations)
69 * [1] -> $termFieldNum
71 * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
75 private $_termDictionary;
78 * Term Dictionary Index TermInfos
80 * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because
81 * of performance considerations)
86 * [4] -> $indexPointer
90 private $_termDictionaryInfos;
93 * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
100 * Field positions in a dictionary.
101 * (Term dictionary contains fields ordered by names)
105 private $_fieldsDicPositions;
109 * Associative array where the key is the file name and the value is data offset
110 * in a compound segment file (.cfs).
117 * Associative array where the key is the file name and the value is file size (.cfs).
121 private $_segFileSizes;
124 * Delete file generation number
126 * -2 means autodetect latest delete generation
127 * -1 means 'there is no delete file'
128 * 0 means pre-2.1 format delete file
129 * X specifies used delete file
136 * Segment has single norms file
138 * If true then one .nrm file is used for all fields
139 * Otherwise .fN files are used
143 private $_hasSingleNormFile;
146 * Use compound segment file (*.cfs) to collect all other segment files
147 * (excluding .del files)
151 private $_isCompound;
155 * File system adapter.
157 * @var Zend_Search_Lucene_Storage_Directory_Filesystem
162 * Normalization factors.
163 * An array fieldName => normVector
164 * normVector is a binary string.
165 * Each byte corresponds to an indexed document in a segment and
166 * encodes normalization factor (float value, encoded by
167 * Zend_Search_Lucene_Search_Similarity::encodeNorm())
171 private $_norms = array();
174 * List of deleted documents.
175 * bitset if bitset extension is loaded or array otherwise.
179 private $_deleted = null;
182 * $this->_deleted update flag
186 private $_deletedDirty = false;
189 * True if segment uses shared doc store
193 private $_usesSharedDocStore;
196 * Shared doc store options.
197 * It's an associative array with the following items:
198 * - 'offset' => $docStoreOffset The starting document in the shared doc store files where this segment's documents begin
199 * - 'segment' => $docStoreSegment The name of the segment that has the shared doc store files.
200 * - 'isCompound' => $docStoreIsCompoundFile True, if compound file format is used for the shared doc store files (.cfx file).
202 private $_sharedDocStoreOptions;
206 * Zend_Search_Lucene_Index_SegmentInfo constructor
208 * @param Zend_Search_Lucene_Storage_Directory $directory
209 * @param string $name
210 * @param integer $docCount
211 * @param integer $delGen
212 * @param array|null $docStoreOptions
213 * @param boolean $hasSingleNormFile
214 * @param boolean $isCompound
public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name, $docCount, $delGen = 0, $docStoreOptions = null, $hasSingleNormFile = false, $isCompound = null)
{
    // Opens a segment: stores its identity, reads the compound-file
    // directories ('.cfx' for a shared doc store, '.cfs' for the segment
    // itself) when they are used, loads field infos from '.fnm' and finally
    // loads the list of deleted documents.
    $this->_directory = $directory;
    $this->_name      = $name;
    $this->_docCount  = $docCount;

    if ($docStoreOptions !== null) {
        $this->_usesSharedDocStore    = true;
        $this->_sharedDocStoreOptions = $docStoreOptions;

        if ($docStoreOptions['isCompound']) {
            list($cfxFiles, $cfxFileSizes) =
                $this->_readCompoundFileDirectory($docStoreOptions['segment'] . '.cfx');

            $this->_sharedDocStoreOptions['files']     = $cfxFiles;
            $this->_sharedDocStoreOptions['fileSizes'] = $cfxFileSizes;
        }
    }

    $this->_hasSingleNormFile = $hasSingleNormFile;
    $this->_delGen            = $delGen;
    $this->_termDictionary    = null;

    if ($isCompound !== null) {
        $this->_isCompound = $isCompound;
    } else {
        // It's a pre-2.1 segment or isCompound is set to 'unknown'
        // Detect if segment uses compound file
        require_once 'Zend/Search/Lucene/Exception.php';
        try {
            // Try to open compound file
            $this->_directory->getFileObject($name . '.cfs');

            // Compound file is found
            $this->_isCompound = true;
        } catch (Zend_Search_Lucene_Exception $e) {
            if (strpos($e->getMessage(), 'is not readable') !== false) {
                // Compound file is not found or is not readable
                $this->_isCompound = false;
            } else {
                // Any other failure is not a "file is absent" signal
                throw $e;
            }
        }
    }

    $this->_segFiles = array();
    if ($this->_isCompound) {
        list($segFiles, $segFileSizes) = $this->_readCompoundFileDirectory($name . '.cfs');

        $this->_segFiles     = $segFiles;
        $this->_segFileSizes = $segFileSizes;
    }

    // Load field infos. Each '.fnm' entry is a name followed by a flags byte.
    $fnmFile       = $this->openCompoundFile('.fnm');
    $fieldsCount   = $fnmFile->readVInt();
    $fieldNames    = array();
    $fieldNums     = array();
    $this->_fields = array();
    for ($count = 0; $count < $fieldsCount; $count++) {
        $fieldName = $fnmFile->readString();
        $fieldBits = $fnmFile->readByte();
        $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
                                                                        $fieldBits & 0x01 /* field is indexed */,
                                                                        $count            /* field number */,
                                                                        $fieldBits & 0x02 /* termvectors are stored */,
                                                                        $fieldBits & 0x10 /* norms are omitted */,
                                                                        $fieldBits & 0x20 /* payloads are stored */);
        if ($fieldBits & 0x10) {
            // norms are omitted for the indexed field
            $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
        }

        $fieldNums[$count]  = $count;
        $fieldNames[$count] = $fieldName;
    }

    // Sort field numbers by field name; flipping the result maps
    // field number => position of the field in the (name-ordered) dictionary.
    array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
    $this->_fieldsDicPositions = array_flip($fieldNums);

    if ($this->_delGen == -2) {
        // SegmentInfo constructor is invoked from index writer
        // Autodetect current delete file generation number
        $this->_delGen = $this->_detectLatestDelGen();
    }

    // Load deletions
    $this->_deleted = $this->_loadDelFile();
}

/**
 * Reads the directory of a compound file ('.cfs' / '.cfx').
 *
 * The directory is a VInt entry count followed by (data offset, file name)
 * pairs. The size of an entry is the distance to the next entry's offset;
 * the last entry extends to the end of the compound file.
 *
 * @param string $compoundFileName
 * @return array  array(fileName => dataOffset), array(fileName => size)
 */
private function _readCompoundFileDirectory($compoundFileName)
{
    $compoundFile = $this->_directory->getFileObject($compoundFileName);
    $filesCount   = $compoundFile->readVInt();

    $offsets  = array();
    $sizes    = array();
    $prevName = null;

    for ($count = 0; $count < $filesCount; $count++) {
        $dataOffset = $compoundFile->readLong();
        if ($prevName !== null) {
            $sizes[$prevName] = $dataOffset - $offsets[$prevName];
        }
        $prevName           = $compoundFile->readString();
        $offsets[$prevName] = $dataOffset;
    }
    if ($prevName !== null) {
        $sizes[$prevName] = $this->_directory->fileLength($compoundFileName) - $offsets[$prevName];
    }

    return array($offsets, $sizes);
}
331 * Load deletions file
333 * Returns bitset or an array depending on bitset extension availability
336 * @throws Zend_Search_Lucene_Exception
private function _loadDelFile()
{
    // Dispatch on the delete-file generation number held in $this->_delGen.
    $generation = $this->_delGen;

    if ($generation == -1) {
        // There is no delete file for this segment
        return null;
    }

    if ($generation == 0) {
        // It's a segment with pre-2.1 format delete file
        // Try to load deletions file
        return $this->_loadPre21DelFile();
    }

    // It's 2.1+ format deletions file
    return $this->_load21DelFile();
}
354 * Load pre-2.1 deletions file
356 * Returns bitset or an array depending on bitset extension availability
359 * @throws Zend_Search_Lucene_Exception
private function _loadPre21DelFile()
{
    // Reads '<segment>.del'. Returns the raw byte string when the bitset
    // extension is loaded, an array(docId => 1) otherwise, or null (and
    // switches $this->_delGen to -1) when the file cannot be read.
    require_once 'Zend/Search/Lucene/Exception.php';
    try {
        // '.del' files always stored in a separate file
        // Segment compound is not used
        $delFile = $this->_directory->getFileObject($this->_name . '.del');

        $byteCount = $delFile->readInt();
        $byteCount = ceil($byteCount/8);
        $bitCount  = $delFile->readInt();

        if ($bitCount == 0) {
            // Nothing is deleted, so there is no payload to read
            $delBytes = '';
        } else {
            $delBytes = $delFile->readBytes($byteCount);
        }

        if (extension_loaded('bitset')) {
            return $delBytes;
        }

        // Unpack the bit vector. Iterate over the bytes actually held in
        // $delBytes rather than the declared count, so an empty payload
        // does not index past the end of the string.
        $deletions   = array();
        $bytesLoaded = strlen($delBytes);
        for ($count = 0; $count < $bytesLoaded; $count++) {
            $byte = ord($delBytes[$count]);
            for ($bit = 0; $bit < 8; $bit++) {
                if ($byte & (1<<$bit)) {
                    $deletions[$count*8 + $bit] = 1;
                }
            }
        }

        return $deletions;
    } catch(Zend_Search_Lucene_Exception $e) {
        if (strpos($e->getMessage(), 'is not readable') === false) {
            throw $e;
        }
        // There is no deletion file
        $this->_delGen = -1;

        return null;
    }
}
406 * Load 2.1+ format deletions file
408 * Returns bitset or an array depending on bitset extension availability
private function _load21DelFile()
{
    // Reads '<segment>_<generation in base 36>.del'.
    // Returns a bitset (bitset extension loaded) or array(docId => 1);
    // the array form yields null when no document is marked as deleted.
    $delFileName = $this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del';
    $delFile     = $this->_directory->getFileObject($delFileName);
    $useBitset   = extension_loaded('bitset');

    $format = $delFile->readInt();

    if ($format == (int)0xFFFFFFFF) {
        // Sparse representation: a sequence of (gap to next non-zero byte,
        // non-zero byte) pairs that runs up to the end of the file.
        $deletions = $useBitset ? bitset_empty() : array();

        // Header fields are not needed for decoding but must be consumed
        $byteCount = $delFile->readInt();
        $bitCount  = $delFile->readInt();

        $delFileSize = $this->_directory->fileLength($delFileName);
        $byteNum     = 0;

        // Every pair has to be decoded before returning. Returning from
        // inside the loop body would stop after the first pair and lose
        // all following deletions.
        while ($delFile->tell() < $delFileSize) {
            $dgap        = $delFile->readVInt();
            $nonZeroByte = $delFile->readByte();

            $byteNum += $dgap;

            for ($bit = 0; $bit < 8; $bit++) {
                if ($nonZeroByte & (1<<$bit)) {
                    if ($useBitset) {
                        bitset_incl($deletions, $byteNum*8 + $bit);
                    } else {
                        $deletions[$byteNum*8 + $bit] = 1;
                    }
                }
            }
        }

        if ($useBitset) {
            return $deletions;
        }

        return (count($deletions) > 0) ? $deletions : null;
    }

    // Dense representation: $format is actually byte count
    $byteCount = ceil($format/8);
    $bitCount  = $delFile->readInt();

    if ($bitCount == 0) {
        // Nothing is deleted, so there is no payload to read
        $delBytes = '';
    } else {
        $delBytes = $delFile->readBytes($byteCount);
    }

    if ($useBitset) {
        return $delBytes;
    }

    // Iterate over the bytes actually loaded so that an empty payload
    // does not index past the end of the string.
    $deletions   = array();
    $bytesLoaded = strlen($delBytes);
    for ($count = 0; $count < $bytesLoaded; $count++) {
        $byte = ord($delBytes[$count]);
        for ($bit = 0; $bit < 8; $bit++) {
            if ($byte & (1<<$bit)) {
                $deletions[$count*8 + $bit] = 1;
            }
        }
    }

    return (count($deletions) > 0) ? $deletions : null;
}
485 * Opens index file stored within compound index file
487 * @param string $extension
488 * @param boolean $shareHandler
489 * @throws Zend_Search_Lucene_Exception
490 * @return Zend_Search_Lucene_Storage_File
public function openCompoundFile($extension, $shareHandler = true)
{
    // Returns a file object positioned at the start of the requested
    // segment file. Stored-field files ('.fdx'/'.fdt') are redirected to the
    // shared doc store when the segment uses one.
    $isStoredFieldsFile = ($extension == '.fdx' || $extension == '.fdt');

    if ($isStoredFieldsFile && $this->_usesSharedDocStore) {
        $docStore = $this->_sharedDocStoreOptions;
        $fdxFName = $docStore['segment'] . '.fdx';
        $fdtFName = $docStore['segment'] . '.fdt';

        if (!$docStore['isCompound']) {
            // Shared doc store files are plain files in the directory
            $fdxFile = $this->_directory->getFileObject($fdxFName, $shareHandler);
            // Skip the index entries (8 bytes each) of preceding documents
            $fdxFile->seek($docStore['offset']*8, SEEK_CUR);

            if ($extension == '.fdx') {
                // '.fdx' file is requested
                return $fdxFile;
            }

            // '.fdt' file is requested: its start offset is stored in '.fdx'
            $fdtStartOffset = $fdxFile->readLong();

            $fdtFile = $this->_directory->getFileObject($fdtFName, $shareHandler);
            $fdtFile->seek($fdtStartOffset, SEEK_CUR);

            return $fdtFile;
        }

        if (!isset($docStore['files'][$fdxFName])) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
                                     . $fdxFName . ' file.' );
        }
        if (!isset($docStore['files'][$fdtFName])) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
                                     . $fdtFName . ' file.' );
        }

        // Open shared docstore segment file
        $cfxFile = $this->_directory->getFileObject($docStore['segment'] . '.cfx', $shareHandler);
        // Seek to the start of '.fdx' file within compound file
        $cfxFile->seek($docStore['files'][$fdxFName]);
        // Seek to the start of current segment documents section
        $cfxFile->seek($docStore['offset']*8, SEEK_CUR);

        if ($extension == '.fdx') {
            // '.fdx' file is requested
            return $cfxFile;
        }

        // '.fdt' file is requested
        $fdtStartOffset = $cfxFile->readLong();

        // Seek to the start of '.fdt' file within compound file
        $cfxFile->seek($docStore['files'][$fdtFName]);
        // Seek to the start of current segment documents section
        $cfxFile->seek($fdtStartOffset, SEEK_CUR);

        // The '.fdt' data lives inside the same '.cfx' handle
        return $cfxFile;
    }

    $filename = $this->_name . $extension;

    if (!$this->_isCompound) {
        return $this->_directory->getFileObject($filename, $shareHandler);
    }

    if (!isset($this->_segFiles[$filename])) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Segment compound file doesn\'t contain '
                                 . $filename . ' file.' );
    }

    // Position the compound file handle at the start of the embedded file
    $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
    $file->seek($this->_segFiles[$filename]);

    return $file;
}
568 * Get compound file length
570 * @param string $extension
public function compoundFileLength($extension)
{
    // Returns the length of a segment file, whether it is stored as a
    // separate file or embedded into a compound file.
    $isStoredFieldsFile = ($extension == '.fdx' || $extension == '.fdt');

    if ($isStoredFieldsFile && $this->_usesSharedDocStore) {
        $filename = $this->_sharedDocStoreOptions['segment'] . $extension;

        if (!$this->_sharedDocStoreOptions['isCompound']) {
            return $this->_directory->fileLength($filename);
        }

        if (isset($this->_sharedDocStoreOptions['fileSizes'][$filename])) {
            return $this->_sharedDocStoreOptions['fileSizes'][$filename];
        }

        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Shared doc store compound file doesn\'t contain '
                                 . $filename . ' file.' );
    }

    $filename = $this->_name . $extension;

    // Try to get common file first
    if ($this->_directory->fileExists($filename)) {
        return $this->_directory->fileLength($filename);
    }

    if (isset($this->_segFileSizes[$filename])) {
        return $this->_segFileSizes[$filename];
    }

    require_once 'Zend/Search/Lucene/Exception.php';
    throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                             . $filename . ' file.' );
}
609 * Returns field index or -1 if field is not found
611 * @param string $fieldName
public function getFieldNum($fieldName)
{
    // Linear scan over the segment's field infos; -1 means "no such field".
    $fieldNum = -1;

    foreach ($this->_fields as $fieldInfo) {
        if ($fieldInfo->name == $fieldName) {
            $fieldNum = $fieldInfo->number;
            break;
        }
    }

    return $fieldNum;
}
626 * Returns field info for specified field
628 * @param integer $fieldNum
629 * @return Zend_Search_Lucene_Index_FieldInfo
public function getField($fieldNum)
{
    // Field infos are stored under their field number
    $fieldInfo = $this->_fields[$fieldNum];

    return $fieldInfo;
}
637 * Returns array of fields.
638 * if $indexed parameter is true, then returns only indexed fields.
640 * @param boolean $indexed
public function getFields($indexed = false)
{
    // Builds a name => name map of the segment's fields, optionally
    // restricted to indexed fields.
    $names = array();

    foreach ($this->_fields as $fieldInfo) {
        if ($indexed && !$fieldInfo->isIndexed) {
            continue;
        }

        $names[$fieldInfo->name] = $fieldInfo->name;
    }

    return $names;
}
655 * Returns array of FieldInfo objects.
public function getFieldInfos()
{
    // Hands out the FieldInfo list loaded from '.fnm' by the constructor
    $fieldInfos = $this->_fields;

    return $fieldInfos;
}
665 * Returns actual deletions file generation number.
public function getDelGen()
{
    // Current delete file generation number of this segment
    $generation = $this->_delGen;

    return $generation;
}
675 * Returns the total number of documents in this segment (including deleted documents).
public function count()
{
    // Total document count, deleted documents included
    $total = $this->_docCount;

    return $total;
}
685 * Returns number of deleted documents.
private function _deletedCount()
{
    // Counts the entries of the deletions list, which is a bitset when the
    // bitset extension is loaded and an array otherwise.
    if ($this->_deleted === null) {
        return 0;
    }

    $deletedDocs = extension_loaded('bitset')
                 ? bitset_to_array($this->_deleted)
                 : $this->_deleted;

    return count($deletedDocs);
}
703 * Returns the total number of non-deleted documents in this segment.
public function numDocs()
{
    // Live documents = all documents minus the deleted ones
    $deleted = $this->hasDeletions() ? $this->_deletedCount() : 0;

    return $this->_docCount - $deleted;
}
717 * Get field position in a fields dictionary
719 * @param integer $fieldNum
private function _getFieldPosition($fieldNum) {
    // Translate a field number into its position in the fields dictionary
    if (isset($this->_fieldsDicPositions[$fieldNum])) {
        return $this->_fieldsDicPositions[$fieldNum];
    }

    // Treat values which are not in a translation table as a 'direct value'
    return $fieldNum;
}
729 * Return segment name
733 public function getName()
743 * Numbers are used instead of class constants because of performance considerations
747 private $_termInfoCache = array();
private function _cleanUpTermInfoCache()
{
    // Trims the term info cache down to its 768 most recently used entries.
    // Entries are kept in usage order (getTermInfo() re-appends an entry on
    // every hit), so the oldest ones are at the front of the array.
    $keep = 768;

    if (count($this->_termInfoCache) <= $keep) {
        // Nothing to evict. Dropping entries until the size is exactly
        // $keep would never stop here and would empty the whole cache.
        return;
    }

    // preserve_keys = true: cache keys must survive even if they look numeric
    $this->_termInfoCache = array_slice($this->_termInfoCache, -$keep, $keep, true);
}
763 * Load terms dictionary index
765 * @throws Zend_Search_Lucene_Exception
private function _loadDictionaryIndex()
{
    // Fills $this->_termDictionary / $this->_termDictionaryInfos, either
    // from the serialized '.sti' cache file or, failing that, from the
    // '.tii' file (in which case the '.sti' cache is (re)generated).

    // Check, if index is already serialized
    if ($this->_directory->fileExists($this->_name . '.sti')) {
        // Load serialized dictionary index data
        $stiFile     = $this->_directory->getFileObject($this->_name . '.sti');
        $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . '.sti'));

        // NOTE(review): unserialize() is applied to file content; this is
        // only safe while the index directory is not writable by untrusted
        // parties.
        $unserializedData = @unserialize($stiFileData);

        // Accept the cached data only if it has the expected shape
        // (a pair of arrays); otherwise fall through and rebuild the cache.
        if (is_array($unserializedData)
            && isset($unserializedData[0], $unserializedData[1])
            && is_array($unserializedData[0])
            && is_array($unserializedData[1])) {
            list($this->_termDictionary, $this->_termDictionaryInfos) = $unserializedData;
            return;
        }
    }

    // Load data from .tii file and generate .sti file

    // Prefetch dictionary index data
    $tiiFile     = $this->openCompoundFile('.tii');
    $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));

    // Load dictionary index data
    list($this->_termDictionary, $this->_termDictionaryInfos) =
                Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);

    $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
    $stiFile     = $this->_directory->createFile($this->_name . '.sti');
    $stiFile->writeBytes($stiFileData);
}
798 * Scans terms dictionary and returns term info
800 * @param Zend_Search_Lucene_Index_Term $term
801 * @return Zend_Search_Lucene_Index_TermInfo
public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
{
    // Looks the term up in three stages: the term info cache, a binary
    // search over the in-memory dictionary index, and finally a sequential
    // scan of the '.tis' file starting from the closest preceding index
    // entry. Returns a TermInfo object or null if the term is not found.
    $termKey = $term->key();
    if (isset($this->_termInfoCache[$termKey])) {
        $termInfo = $this->_termInfoCache[$termKey];

        // Move termInfo to the end of cache
        unset($this->_termInfoCache[$termKey]);
        $this->_termInfoCache[$termKey] = $termInfo;

        return $termInfo;
    }

    if ($this->_termDictionary === null) {
        $this->_loadDictionaryIndex();
    }

    $searchField = $this->getFieldNum($term->field);

    if ($searchField == -1) {
        // The segment has no such field
        return null;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // search for appropriate value in dictionary
    $lowIndex  = 0;
    $highIndex = count($this->_termDictionary)-1;
    while ($highIndex >= $lowIndex) {
        // $mid = ($highIndex - $lowIndex)/2;
        $mid     = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        // Terms are ordered by field dictionary position first, then by text
        $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
        $delta    = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($term->text, $midTerm[1] /* text */);
        }

        if ($delta < 0) {
            $highIndex = $mid-1;
        } elseif ($delta > 0) {
            $lowIndex  = $mid+1;
        } else {
            // return $this->_termDictionaryInfos[$mid]; // We got it!
            $a = $this->_termDictionaryInfos[$mid];
            $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

            // Put loaded termInfo into cache
            $this->_termInfoCache[$termKey] = $termInfo;

            return $termInfo;
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        return null;
    }

    // $highIndex now points at the last index entry preceding the term
    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    $tisFile   = $this->openCompoundFile('.tis');
    $tiVersion = $tisFile->readInt();
    if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
        $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $termCount     = $tisFile->readLong();
    $indexInterval = $tisFile->readInt();
    $skipInterval  = $tisFile->readInt();
    if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
        // Not used here, but has to be consumed to keep the header aligned
        $maxSkipLevels = $tisFile->readInt();
    }

    // The index pointer is absolute; the header has already been read,
    // so its size is subtracted from the relative seek.
    $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR);

    $termValue    = $prevTerm[1] /* text */;
    $termFieldNum = $prevTerm[0] /* field */;
    $freqPointer  = $prevTermInfo[1] /* freqPointer */;
    $proxPointer  = $prevTermInfo[2] /* proxPointer */;
    for ($count = $prevPosition*$indexInterval + 1;
         $count <= $termCount &&
         ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
          ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
           strcmp($termValue, $term->text) < 0) );
         $count++) {
        // Term text is prefix-compressed against the previous term
        $termPrefixLength = $tisFile->readVInt();
        $termSuffix       = $tisFile->readString();
        $termFieldNum     = $tisFile->readVInt();
        $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

        // Pointers are stored as deltas against the previous term
        $docFreq      = $tisFile->readVInt();
        $freqPointer += $tisFile->readVInt();
        $proxPointer += $tisFile->readVInt();
        if( $docFreq >= $skipInterval ) {
            $skipOffset = $tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }
    }

    // Compare term text byte-for-byte. A loose '==' treats numeric-looking
    // strings as numbers ("100" == "1e2"), which would return the term info
    // of a different term.
    if ($termFieldNum == $searchField && (string)$termValue === (string)$term->text) {
        $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
    } else {
        $termInfo = null;
    }

    // Put loaded termInfo into cache
    $this->_termInfoCache[$termKey] = $termInfo;

    if (count($this->_termInfoCache) == 1024) {
        $this->_cleanUpTermInfoCache();
    }

    return $termInfo;
}
926 * Returns IDs of all the documents containing term.
928 * @param Zend_Search_Lucene_Index_Term $term
929 * @param integer $shift
930 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
public function termDocs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    // Returns the ids ($shift + segment-local id) of the documents that
    // contain the term. When a documents filter is given, the result is
    // restricted to the documents the filter already holds for this segment
    // (if any), and the filter entry for this segment is replaced by the
    // set of documents returned.
    $termInfo = $this->getTermInfo($term);

    if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
        if ($docsFilter !== null  &&  $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer,SEEK_CUR);

    $useFilter = ($docsFilter !== null);
    if ($useFilter  &&  !$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
    }

    // null means "no restriction": either no filter at all, or the filter
    // doesn't have data for the current segment yet.
    $filter = null;
    if ($useFilter  &&  isset($docsFilter->segmentFilters[$this->_name])) {
        // Filter already has some data for the current segment
        $filter = $docsFilter->segmentFilters[$this->_name];

        // Check if filter is not empty
        if (count($filter) == 0) {
            return array();
        }
    }

    // The "fetch" and "full scan" strategies selected through
    // FULL_SCAN_VS_FETCH_BOUNDARY were implemented by identical code,
    // so a single scan loop serves every case.
    $docId       = 0;
    $result      = array();
    $matchedDocs = array();

    for ($count = 0; $count < $termInfo->docFreq; $count++) {
        $docDelta = $frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            // Odd delta: frequency is 1 and is not stored
            $docId += ($docDelta-1)/2;
        } else {
            $docId += $docDelta/2;
            // read freq (must be consumed to stay in sync with the stream)
            $frqFile->readVInt();
        }

        if ($filter === null  ||  isset($filter[$docId])) {
            $result[] = $shift + $docId;
            if ($useFilter) {
                $matchedDocs[$docId] = 1; // 1 is just some constant value
            }
        }
    }

    if ($useFilter) {
        $docsFilter->segmentFilters[$this->_name] = $matchedDocs;
    }

    return $result;
}
1046 * Returns term freqs array.
1047 * Result array structure: array(docId => freq, ...)
1049 * @param Zend_Search_Lucene_Index_Term $term
1050 * @param integer $shift
1051 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
1052 * @return array
public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    // Returns array($shift + docId => term frequency) for the documents
    // that contain the term. When a documents filter is given, the result
    // is restricted to the documents the filter already holds for this
    // segment (if any), and the filter entry for this segment is replaced
    // by the set of documents returned.
    $termInfo = $this->getTermInfo($term);

    if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
        if ($docsFilter !== null  &&  $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer,SEEK_CUR);

    $useFilter = ($docsFilter !== null);
    if ($useFilter  &&  !$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
    }

    // null means "no restriction": either no filter at all, or the filter
    // doesn't have data for the current segment yet.
    $filter = null;
    if ($useFilter  &&  isset($docsFilter->segmentFilters[$this->_name])) {
        // Filter already has some data for the current segment
        $filter = $docsFilter->segmentFilters[$this->_name];

        // Check if filter is not empty
        if (count($filter) == 0) {
            return array();
        }
    }

    $docId       = 0;
    $result      = array();
    $matchedDocs = array();

    for ($count = 0; $count < $termInfo->docFreq; $count++) {
        $docDelta = $frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            // Odd delta: frequency is 1 and is not stored
            $docId += ($docDelta-1)/2;
            $freq   = 1;
        } else {
            $docId += $docDelta/2;
            // The frequency has to be consumed even when the document is
            // rejected by the filter. Otherwise it stays in the stream and
            // is misread as the next document delta.
            $freq   = $frqFile->readVInt();
        }

        if ($filter === null  ||  isset($filter[$docId])) {
            $result[$shift + $docId] = $freq;
            if ($useFilter) {
                $matchedDocs[$docId] = 1; // 1 is just some constant value
            }
        }
    }

    if ($useFilter) {
        $docsFilter->segmentFilters[$this->_name] = $matchedDocs;
    }

    return $result;
}
1171 * Returns term positions array.
1172 * Result array structure: array(docId => array(pos1, pos2, ...), ...)
1174 * @param Zend_Search_Lucene_Index_Term $term
1175 * @param integer $shift
1176 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
1177 * @return array
1179 public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
1181 $termInfo = $this->getTermInfo($term);
1183 if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
1184 if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
1185 $docsFilter->segmentFilters[$this->_name] = array();
1190 $frqFile = $this->openCompoundFile('.frq');
1191 $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
1197 if ($docsFilter !== null) {
1198 if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
1199 require_once 'Zend/Search/Lucene/Exception.php';
1200 throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
1203 if (isset($docsFilter->segmentFilters[$this->_name])) {
1204 // Filter already has some data for the current segment
1206 // Make short name for the filter (which doesn't need additional dereferencing)
1207 $filter = &$docsFilter->segmentFilters[$this->_name];
1209 // Check if filter is not empty
1210 if (count($filter) == 0) {
1214 if ($this->_docCount/count($filter) < self::FULL_SCAN_VS_FETCH_BOUNDARY) {
1216 // ---------------------------------------------------------------
1217 for ($count = 0; $count < $termInfo->docFreq; $count++) {
1218 $docDelta = $frqFile->readVInt();
1219 if ($docDelta % 2 == 1) {
1220 $docId += ($docDelta-1)/2;
1223 $docId += $docDelta/2;
1224 $freqs[$docId] = $frqFile->readVInt();
1228 $updatedFilterData = array();
1230 $prxFile = $this->openCompoundFile('.prx');
1231 $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
1232 foreach ($freqs as $docId => $freq) {
1234 $positions = array();
1236 // we have to read .prx file to get right position for next doc
1237 // even filter doesn't match current document
1238 for ($count = 0; $count < $freq; $count++ ) {
1239 $termPosition += $prxFile->readVInt();
1240 $positions[] = $termPosition;
1243 // Include into updated filter and into result only if doc is matched by filter
1244 if (isset($filter[$docId])) {
1245 $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
1246 $result[$shift + $docId] = $positions;
1250 $docsFilter->segmentFilters[$this->_name] = $updatedFilterData;
1251 // ---------------------------------------------------------------
1253 // Perform full scan
1254 for ($count = 0; $count < $termInfo->docFreq; $count++) {
1255 $docDelta = $frqFile->readVInt();
1256 if ($docDelta % 2 == 1) {
1257 $docId += ($docDelta-1)/2;
1260 $docId += $docDelta/2;
1261 $freqs[$docId] = $frqFile->readVInt();
1265 $updatedFilterData = array();
1267 $prxFile = $this->openCompoundFile('.prx');
1268 $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
1269 foreach ($freqs as $docId => $freq) {
1271 $positions = array();
1273 // we have to read .prx file to get right position for next doc
1274 // even filter doesn't match current document
1275 for ($count = 0; $count < $freq; $count++ ) {
1276 $termPosition += $prxFile->readVInt();
1277 $positions[] = $termPosition;
1280 // Include into updated filter and into result only if doc is matched by filter
1281 if (isset($filter[$docId])) {
1282 $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
1283 $result[$shift + $docId] = $positions;
1287 $docsFilter->segmentFilters[$this->_name] = $updatedFilterData;
1290 // Filter doesn't has data for current segment
1291 for ($count = 0; $count < $termInfo->docFreq; $count++) {
1292 $docDelta = $frqFile->readVInt();
1293 if ($docDelta % 2 == 1) {
1294 $docId += ($docDelta-1)/2;
1297 $docId += $docDelta/2;
1298 $freqs[$docId] = $frqFile->readVInt();
1302 $filterData = array();
1304 $prxFile = $this->openCompoundFile('.prx');
1305 $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
1306 foreach ($freqs as $docId => $freq) {
1307 $filterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
1310 $positions = array();
1312 for ($count = 0; $count < $freq; $count++ ) {
1313 $termPosition += $prxFile->readVInt();
1314 $positions[] = $termPosition;
1317 $result[$shift + $docId] = $positions;
1320 $docsFilter->segmentFilters[$this->_name] = $filterData;
1323 for ($count = 0; $count < $termInfo->docFreq; $count++) {
1324 $docDelta = $frqFile->readVInt();
1325 if ($docDelta % 2 == 1) {
1326 $docId += ($docDelta-1)/2;
1329 $docId += $docDelta/2;
1330 $freqs[$docId] = $frqFile->readVInt();
1335 $prxFile = $this->openCompoundFile('.prx');
1336 $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
1337 foreach ($freqs as $docId => $freq) {
1339 $positions = array();
1341 for ($count = 0; $count < $freq; $count++ ) {
1342 $termPosition += $prxFile->readVInt();
1343 $positions[] = $termPosition;
1346 $result[$shift + $docId] = $positions;
/**
 * Load normalization factors from an index file
 *
 * Results are stored in $this->_norms, keyed by field number. A segment with a single
 * norms file ('.nrm') gets the norms of all indexed fields loaded at once; otherwise
 * only the '.f<fieldNum>' file of the requested field is read.
 *
 * @param integer $fieldNum
 * @throws Zend_Search_Lucene_Exception
 */
private function _loadNorm($fieldNum)
{
    if (!$this->_hasSingleNormFile) {
        // Norms of each field live in their own file
        $fieldNormsFile = $this->openCompoundFile('.f' . $fieldNum);
        $this->_norms[$fieldNum] = $fieldNormsFile->readBytes($this->_docCount);

        return;
    }

    $normsFile = $this->openCompoundFile('.nrm');

    // The file starts with a 3-byte signature followed by a format version byte
    $signature     = $normsFile->readBytes(3);
    $formatVersion = $normsFile->readByte();

    $headerIsValid = ($signature == 'NRM'  &&  $formatVersion == (int)0xFF);
    if (!$headerIsValid) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong norms file format.');
    }

    // Read $this->_docCount bytes for every indexed field, in field order
    foreach ($this->_fields as $number => $info) {
        if (!$info->isIndexed) {
            continue;
        }

        $this->_norms[$number] = $normsFile->readBytes($this->_docCount);
    }
}
/**
 * Returns normalization factor for specified document
 *
 * Norms of the field are loaded lazily on first access.
 *
 * @param integer $id         internal document id
 * @param string  $fieldName
 * @return float|null         decoded norm (result of Zend_Search_Lucene_Search_Similarity::decodeNorm()),
 *                            or null if the field is not present in the segment or is not indexed
 */
public function norm($id, $fieldName)
{
    $fieldNum = $this->getFieldNum($fieldName);

    // getFieldNum() reports a field missing from the segment as -1; check it before
    // touching $this->_fields, the same way normVector() does
    if ($fieldNum == -1  ||  !($this->_fields[$fieldNum]->isIndexed)) {
        return null;
    }

    if (!isset($this->_norms[$fieldNum])) {
        $this->_loadNorm($fieldNum);
    }

    // Norms are kept as a byte string: one encoded byte per document
    return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
}
/**
 * Returns norm vector, encoded in a byte string
 *
 * If the field is missing from the segment or is not indexed, a vector filled with the
 * default similarity's encoded norm for a zero-length field is returned instead.
 *
 * @param string $fieldName
 * @return string
 */
public function normVector($fieldName)
{
    $fieldNum = $this->getFieldNum($fieldName);

    $hasStoredNorms = ($fieldNum != -1)  &&  $this->_fields[$fieldNum]->isIndexed;
    if (!$hasStoredNorms) {
        $similarity  = Zend_Search_Lucene_Search_Similarity::getDefault();
        $defaultNorm = $similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) );

        // One identical byte per document of the segment
        return str_repeat(chr($defaultNorm), $this->_docCount);
    }

    if (!isset($this->_norms[$fieldNum])) {
        $this->_loadNorm($fieldNum);
    }

    return $this->_norms[$fieldNum];
}
/**
 * Tells whether any documents have been deleted from this index segment.
 *
 * The deletions container stays null until a deletion is recorded or loaded.
 *
 * @return boolean
 */
public function hasDeletions()
{
    $noDeletions = ($this->_deleted === null);

    return !$noDeletions;
}
/**
 * Tells whether the segment has a single norms file.
 *
 * @return boolean
 */
public function hasSingleNormFile()
{
    // The flag is normalized to a strict boolean
    return (bool) $this->_hasSingleNormFile;
}
/**
 * Tells whether the segment is stored using a compound segment file.
 *
 * @return boolean
 */
public function isCompound()
{
    $compoundFlag = $this->_isCompound;

    return $compoundFlag;
}
/**
 * Deletes a document from the index segment.
 * $id is an internal document id
 *
 * The deletion is only recorded in memory and marks the segment as dirty.
 * Deletions are kept as a bitset when the 'bitset' extension is loaded and as an
 * array keyed by document id otherwise.
 *
 * @param integer $id
 */
public function delete($id)
{
    $this->_deletedDirty = true;

    $useBitset = extension_loaded('bitset');

    // Create the deletions container on first use
    if ($this->_deleted === null) {
        $this->_deleted = $useBitset ? bitset_empty($id) : array();
    }

    if ($useBitset) {
        bitset_incl($this->_deleted, $id);
    } else {
        $this->_deleted[$id] = 1;
    }
}
/**
 * Checks whether the document is deleted
 *
 * @param integer $id   internal document id
 * @return boolean
 */
public function isDeleted($id)
{
    // No deletions have been recorded for the segment at all
    if ($this->_deleted === null) {
        return false;
    }

    return extension_loaded('bitset')
         ? bitset_in($this->_deleted, $id)
         : isset($this->_deleted[$id]);
}
/**
 * Detect latest delete generation
 *
 * Is actually used from writeChanges() method or from the constructor if it's invoked from
 * Index writer. In both cases index write lock is already obtained, so we shouldn't care
 * about it
 *
 * Generation 0 stands for a plain '<segment_name>.del' file; '<segment_name>_NNN.del'
 * files carry the generation number NNN written in base 36.
 *
 * @return integer  highest generation found in the directory, or -1 if the segment
 *                  has no deletions file
 */
private function _detectLatestDelGen()
{
    $plainDelFileName = $this->_name . '.del';

    // Segment name is quoted, so it is always matched literally even if it contains
    // characters which are special for regular expressions. The pattern doesn't depend
    // on the processed file, so it's built once.
    $generationPattern = '/^' . preg_quote($this->_name, '/') . '_([a-zA-Z0-9]+)\.del$/i';

    $delFileList = array();
    foreach ($this->_directory->fileList() as $file) {
        if ($file == $plainDelFileName) {
            // Matches <segment_name>.del file name
            $delFileList[] = 0;
        } else if (preg_match($generationPattern, $file, $matches)) {
            // Matches <segment_name>_NNN.del file names
            $delFileList[] = (int)base_convert($matches[1], 36, 10);
        }
    }

    if (count($delFileList) == 0) {
        // There is no deletions file for current segment in the directory.
        // Report it with the special generation number -1
        return -1;
    }

    // There are some deletions files for current segment in the directory.
    // Report the highest generation number
    return max($delFileList);
}
/**
 * Write changes if it's necessary.
 *
 * This method must be invoked only from the Writer _updateSegments() method,
 * so index Write lock has to be already obtained.
 *
 * Without local deletions it only reloads the deletions file if a newer generation is
 * found in the directory. Otherwise local deletions are merged with the newest
 * deletions file (if any) and stored as the next generation of the deletions file.
 *
 * @internal
 * @throws Zend_Search_Lucene_Exception
 */
public function writeChanges()
{
    // Newest deletions generation present in the directory
    $newestGen = $this->_detectLatestDelGen();

    if (!$this->_deletedDirty) {
        // Current process has not deleted anything

        if ($newestGen == $this->_delGen) {
            // Nobody else has touched the deletions file, nothing to do
            return;
        }

        if ($newestGen > $this->_delGen) {
            // Some concurrent process has written a newer deletions file, reload it
            $this->_delGen  = $newestGen;
            $this->_deleted = $this->_loadDelFile();

            return;
        }

        // Generation in the directory is older than the one we have already seen
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Delete file processing workflow is corrupted for the segment \'' . $this->_name . '\'.');
    }

    $useBitset = extension_loaded('bitset');

    if ($newestGen > $this->_delGen) {
        // Combine own deletions with the ones stored in the newest deletions file
        $this->_delGen = $newestGen;

        $storedDeletions = $this->_loadDelFile();

        if ($useBitset) {
            $this->_deleted = bitset_union($this->_deleted, $storedDeletions);
        } else {
            $this->_deleted += $storedDeletions;
        }
    }

    if ($useBitset) {
        // Bitset is written as is
        $delBytes = $this->_deleted;
        $bitCount = count(bitset_to_array($delBytes));
    } else {
        // Pack array keys into a bit vector, lowest bit of each byte first
        $byteCount = floor($this->_docCount/8)+1;
        $delBytes  = str_repeat(chr(0), $byteCount);

        for ($byteNum = 0; $byteNum < $byteCount; $byteNum++) {
            $packed = 0;
            for ($bitNum = 0; $bitNum < 8; $bitNum++) {
                if (isset($this->_deleted[$byteNum*8 + $bitNum])) {
                    $packed |= (1<<$bitNum);
                }
            }
            $delBytes[$byteNum] = chr($packed);
        }

        $bitCount = count($this->_deleted);
    }

    // The first deletions file gets generation 1, later ones the next generation number
    if ($this->_delGen == -1) {
        $this->_delGen = 1;
    } else {
        $this->_delGen++;
    }

    $delFileName = $this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del';

    $delFile = $this->_directory->createFile($delFileName);
    $delFile->writeInt($this->_docCount);
    $delFile->writeInt($bitCount);
    $delFile->writeBytes($delBytes);

    $this->_deletedDirty = false;
}
/**
 * Term Dictionary (.tis) file object used for stream like terms reading.
 * Null while the terms stream is closed.
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_tisFile = null;

/**
 * Offset of the .tis file data
 * (position reported by the file object right after it's opened)
 *
 * @var integer
 */
private $_tisFileOffset;

/**
 * Frequencies (.frq) file object used for stream like terms reading.
 * Opened only in SM_FULL_INFO and SM_MERGE_INFO scan modes.
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_frqFile = null;

/**
 * Offset of the .frq file data
 * (position reported by the file object right after it's opened)
 *
 * @var integer
 */
private $_frqFileOffset;

/**
 * Positions (.prx) file object used for stream like terms reading.
 * Opened only in SM_FULL_INFO and SM_MERGE_INFO scan modes.
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_prxFile = null;

/**
 * Offset of the .prx file data
 * (position reported by the file object right after it's opened)
 *
 * @var integer
 */
private $_prxFileOffset;


/**
 * Number of terms which are still to be read from the terms stream
 * (decreased by each nextTerm() call)
 *
 * @var integer
 */
private $_termCount = 0;

/**
 * Overall number of terms in the terms stream
 * (as it's read from the .tis file header)
 *
 * @var integer
 */
private $_termNum = 0;

/**
 * Segment index interval (read from the .tis file header)
 *
 * @var integer
 */
private $_indexInterval;

/**
 * Segment skip interval (read from the .tis file header)
 *
 * @var integer
 */
private $_skipInterval;

/**
 * TermInfo of the term most recently read from the terms stream
 *
 * @var Zend_Search_Lucene_Index_TermInfo
 */
private $_lastTermInfo = null;

/**
 * Term most recently read from the terms stream
 *
 * @var Zend_Search_Lucene_Index_Term
 */
private $_lastTerm = null;

/**
 * Map of the document IDs
 * Used to get new docID after removing deleted documents.
 * It's not very efficient from a memory usage point of view,
 * but it's much faster than other methods
 *
 * @var array|null
 */
private $_docMap = null;

/**
 * An array of all positions of the last read term in the documents.
 * Array structure: array( docId => array( pos1, pos2, ...), ...)
 *
 * Is set to null if term positions loading has to be skipped
 *
 * @var array|null
 */
private $_lastTermPositions;


/**
 * Terms scan mode
 *
 * Values:
 *
 * self::SM_TERMS_ONLY - terms are scanned, no additional info is retrieved
 * self::SM_FULL_INFO  - terms are scanned, frequency and position info is retrieved
 * self::SM_MERGE_INFO - terms are scanned, frequency and position info is retrieved
 *                       document numbers are compacted (shifted if segment has deleted documents)
 *
 * @var integer
 */
private $_termsScanMode;

/** Scan modes */
const SM_TERMS_ONLY = 0;    // terms are scanned, no additional info is retrieved
const SM_FULL_INFO  = 1;    // terms are scanned, frequency and position info is retrieved
const SM_MERGE_INFO = 2;    // terms are scanned, frequency and position info is retrieved
                            // document numbers are compacted (shifted if segment contains deleted documents)
/**
 * Reset terms stream
 *
 * $startId - id for the first document
 * $mode    - terms scan mode (one of the self::SM_* constants)
 *
 * Returns start document id for the next segment
 *
 * @param integer $startId
 * @param integer $mode
 * @throws Zend_Search_Lucene_Exception
 * @return integer
 */
public function resetTermsStream(/** $startId = 0, $mode = self::SM_TERMS_ONLY */)
{
    /**
     * SegmentInfo->resetTermsStream() method actually takes two optional parameters:
     *   $startId (default value is 0)
     *   $mode (default value is self::SM_TERMS_ONLY)
     */
    $arguments = func_get_args();
    switch (count($arguments)) {
        case 0:
            $startId = 0;
            $mode    = self::SM_TERMS_ONLY;
            break;

        case 1:
            $startId = $arguments[0];
            $mode    = self::SM_TERMS_ONLY;
            break;

        case 2:
            $startId = $arguments[0];
            $mode    = $arguments[1];
            break;

        default:
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong number of arguments');
    }

    // Drop previously used dictionary file object before opening a new one
    $this->_tisFile = null;

    $this->_tisFile       = $this->openCompoundFile('.tis', false);
    $this->_tisFileOffset = $this->_tisFile->tell();

    $formatPre21 = (int)0xFFFFFFFE; /* pre-2.1 format */
    $format21    = (int)0xFFFFFFFD; /* 2.1+ format    */

    $tiVersion = $this->_tisFile->readInt();
    if (!($tiVersion == $formatPre21  ||  $tiVersion == $format21)) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    // Rest of the header: terms count, index interval, skip interval
    $this->_termNum       = $this->_tisFile->readLong();
    $this->_termCount     = $this->_termNum;
    $this->_indexInterval = $this->_tisFile->readInt();
    $this->_skipInterval  = $this->_tisFile->readInt();
    if ($tiVersion == $format21) {
        // Value is not used, but it has to be consumed to reach the first term entry
        $maxSkipLevels = $this->_tisFile->readInt();
    }

    $this->_frqFile = null;
    $this->_prxFile = null;
    $this->_docMap  = array();

    // Start entry: empty term with zero pointers, so deltas of the first term are absolute
    $this->_lastTerm          = new Zend_Search_Lucene_Index_Term('', -1);
    $this->_lastTermInfo      = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
    $this->_lastTermPositions = null;

    $this->_termsScanMode = $mode;

    $compactDocIds = ($mode == self::SM_MERGE_INFO);

    if ($mode == self::SM_TERMS_ONLY) {
        // Frequencies and positions are not needed
    } else if ($mode == self::SM_FULL_INFO  ||  $compactDocIds) {
        $this->_frqFile       = $this->openCompoundFile('.frq', false);
        $this->_frqFileOffset = $this->_frqFile->tell();

        $this->_prxFile       = $this->openCompoundFile('.prx', false);
        $this->_prxFileOffset = $this->_prxFile->tell();

        // Map each live document to its id in the result:
        // next free number if ids are compacted, unchanged position otherwise
        for ($docNum = 0; $docNum < $this->_docCount; $docNum++) {
            if ($this->isDeleted($docNum)) {
                continue;
            }

            $this->_docMap[$docNum] = $startId + ($compactDocIds ? count($this->_docMap) : $docNum);
        }
    } else {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
    }

    return $startId + ($compactDocIds ? count($this->_docMap) : $this->_docCount);
}
/**
 * Skip terms stream up to specified term prefix.
 *
 * Prefix contains fully specified field info and portion of searched term
 *
 * The dictionary index is searched for the last entry which is not greater than the
 * prefix, the stream is repositioned to that entry and then scanned forward until
 * the current term matches or is greater than the prefix. If the field is not present
 * in the segment or the prefix precedes the whole dictionary index, the stream is closed
 * and the current term is set to null.
 *
 * @param Zend_Search_Lucene_Index_Term $prefix
 * @throws Zend_Search_Lucene_Exception
 */
public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
{
    if ($this->_termDictionary === null) {
        $this->_loadDictionaryIndex();
    }

    $searchField = $this->getFieldNum($prefix->field);

    if ($searchField == -1) {
        /**
         * Field is not present in this segment
         * Go to the end of dictionary
         */
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;

        return;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // search for appropriate value in dictionary
    $lowIndex = 0;
    $highIndex = count($this->_termDictionary)-1;
    while ($highIndex >= $lowIndex) {
        $mid = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
        $delta = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($prefix->text, $midTerm[1] /* text */);
        }

        if ($delta < 0) {
            $highIndex = $mid-1;
        } elseif ($delta > 0) {
            $lowIndex  = $mid+1;
        } else {
            // We have reached term we are looking for.
            // $highIndex is used below as the position to restart from, so it has to
            // point to the matched entry. Otherwise the stream would be repositioned
            // to some later index entry and terms in between would be skipped.
            $highIndex = $mid;
            break;
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;

        return;
    }

    // Last dictionary index entry which is not greater than the prefix
    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    if ($this->_tisFile === null) {
        // The end of terms stream is reached and terms dictionary file is closed
        // Perform mini-reset operation
        $this->_tisFile = $this->openCompoundFile('.tis', false);

        if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
            $this->_frqFile = $this->openCompoundFile('.frq', false);
            $this->_prxFile = $this->openCompoundFile('.prx', false);
        }
    }
    // Element [4] of the dictionary info is used as the entry offset within the .tis data
    $this->_tisFile->seek($this->_tisFileOffset + $prevTermInfo[4], SEEK_SET);

    $this->_lastTerm     = new Zend_Search_Lucene_Index_Term($prevTerm[1] /* text */,
                                                             ($prevTerm[0] == -1) ? '' : $this->_fields[$prevTerm[0] /* field */]->name);
    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($prevTermInfo[0] /* docFreq */,
                                                                 $prevTermInfo[1] /* freqPointer */,
                                                                 $prevTermInfo[2] /* proxPointer */,
                                                                 $prevTermInfo[3] /* skipOffset */);
    $this->_termCount  =  $this->_termNum - $prevPosition*$this->_indexInterval;

    if ($highIndex == 0) {
        // skip start entry
        $this->nextTerm();
    } else if ($prefix->field == $this->_lastTerm->field  &&  $prefix->text  == $this->_lastTerm->text) {
        // We got exact match in the dictionary index

        if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
            $this->_lastTermPositions = array();

            // Collect document ids and term frequencies.
            // Odd delta means frequency 1, otherwise frequency is stored explicitly
            $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
            $freqs = array();   $docId = 0;
            for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) {
                $docDelta = $this->_frqFile->readVInt();
                if( $docDelta % 2 == 1 ) {
                    $docId += ($docDelta-1)/2;
                    $freqs[ $docId ] = 1;
                } else {
                    $docId += $docDelta/2;
                    $freqs[ $docId ] = $this->_frqFile->readVInt();
                }
            }

            $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
            foreach ($freqs as $docId => $freq) {
                $termPosition = 0;  $positions = array();

                // Positions are delta-encoded and have to be read even for documents
                // which are not in the map, to get right position for the next doc
                for ($count = 0; $count < $freq; $count++ ) {
                    $termPosition += $this->_prxFile->readVInt();
                    $positions[] = $termPosition;
                }

                if (isset($this->_docMap[$docId])) {
                    $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
                }
            }
        }

        return;
    }

    // Search term matching specified prefix
    while ($this->_lastTerm !== null) {
        if ( strcmp($this->_lastTerm->field, $prefix->field) > 0  ||
             ($prefix->field == $this->_lastTerm->field  &&  strcmp($this->_lastTerm->text, $prefix->text) >= 0) ) {
            // Current term matches or is greater than the pattern
            return;
        }

        $this->nextTerm();
    }
}
/**
 * Scans terms dictionary and returns next term
 *
 * In SM_FULL_INFO and SM_MERGE_INFO scan modes positions of the term are also loaded
 * and can be retrieved using currentTermPositions(). Stream files are released as soon
 * as the last term is read.
 *
 * @return Zend_Search_Lucene_Index_Term|null
 */
public function nextTerm()
{
    $streamIsExhausted = ($this->_tisFile === null  ||  $this->_termCount == 0);
    if ($streamIsExhausted) {
        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;
        $this->_docMap            = null;

        // may be necessary for "empty" segment
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        return null;
    }

    // Term text is stored as a length of the prefix shared with the previous term + suffix
    $sharedPrefixLength = $this->_tisFile->readVInt();
    $suffix             = $this->_tisFile->readString();
    $fieldNumber        = $this->_tisFile->readVInt();
    $text               = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $sharedPrefixLength) . $suffix;

    $this->_lastTerm = new Zend_Search_Lucene_Index_Term($text, $this->_fields[$fieldNumber]->name);

    // File pointers are stored as deltas to the pointers of the previous term
    $docFreq     = $this->_tisFile->readVInt();
    $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
    $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
    // Skip offset is stored only if docFreq is not less than skip interval
    $skipOffset  = ($docFreq >= $this->_skipInterval) ? $this->_tisFile->readVInt() : 0;

    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);

    $loadPositions = ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO);
    if ($loadPositions) {
        $this->_lastTermPositions = array();

        // Collect document ids and term frequencies.
        // Odd delta means frequency 1, otherwise frequency is stored explicitly
        $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
        $docFrequencies = array();
        $currentDocId   = 0;
        for ($docNum = 0; $docNum < $this->_lastTermInfo->docFreq; $docNum++) {
            $delta = $this->_frqFile->readVInt();
            if ($delta % 2 == 1) {
                $currentDocId += ($delta-1)/2;
                $docFrequencies[$currentDocId] = 1;
            } else {
                $currentDocId += $delta/2;
                $docFrequencies[$currentDocId] = $this->_frqFile->readVInt();
            }
        }

        $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
        foreach ($docFrequencies as $currentDocId => $frequency) {
            // Positions are delta-encoded and have to be read even for documents
            // which are not in the map, to get right position for the next doc
            $position     = 0;
            $docPositions = array();
            for ($posNum = 0; $posNum < $frequency; $posNum++) {
                $position += $this->_prxFile->readVInt();
                $docPositions[] = $position;
            }

            if (!isset($this->_docMap[$currentDocId])) {
                continue;
            }
            $this->_lastTermPositions[$this->_docMap[$currentDocId]] = $docPositions;
        }
    }

    $this->_termCount--;
    if ($this->_termCount == 0) {
        // It was the last term, stream files are not necessary any more
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;
    }

    return $this->_lastTerm;
}
/**
 * Close terms stream
 *
 * Should be used for resources clean up if stream is not read up to the end
 */
public function closeTermsStream()
{
    // Release stream file objects
    $this->_tisFile = null;
    $this->_frqFile = null;
    $this->_prxFile = null;

    // Forget current stream position
    $this->_lastTerm          = null;
    $this->_lastTermInfo      = null;
    $this->_lastTermPositions = null;

    // Drop document ids map
    $this->_docMap = null;
}
/**
 * Returns the term at the current position of the terms stream
 * (null if there is no current term)
 *
 * @return Zend_Search_Lucene_Index_Term|null
 */
public function currentTerm()
{
    $term = $this->_lastTerm;

    return $term;
}
/**
 * Returns an array of all positions of the current term in the documents.
 * Return array structure: array( docId => array( pos1, pos2, ...), ...)
 *
 * Null is returned if term positions have not been loaded.
 *
 * @return array|null
 */
public function currentTermPositions()
{
    $positions = $this->_lastTermPositions;

    return $positions;
}