--- /dev/null
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ * @version $Id: SegmentWriter.php 16541 2009-07-07 06:59:03Z bkarwin $
+ */
+
+/** Zend_Search_Lucene_Index_SegmentInfo */
+require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
+
+/**
+ * @category Zend
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://framework.zend.com/license/new-bsd New BSD License
+ */
+abstract class Zend_Search_Lucene_Index_SegmentWriter
+{
+ /**
+ * Expert: The fraction of terms in the "dictionary" which should be stored
+ * in RAM. Smaller values use more memory, but make searching slightly
+ * faster, while larger values use less memory and make searching slightly
+ * slower. Searching is typically not dominated by dictionary lookup, so
+ * tweaking this is rarely useful.
+ *
+ * The value is written into the '.tis' and '.tii' headers by
+ * initializeDictionaryFiles(), and addTerm() copies every
+ * $indexInterval-th term into the '.tii' index file.
+ *
+ * @var integer
+ */
+ public static $indexInterval = 128;
+
+ /**
+ * Expert: The fraction of TermDocs entries stored in skip tables.
+ * Larger values result in smaller indexes, greater acceleration, but fewer
+ * accelerable cases, while smaller values result in bigger indexes,
+ * less acceleration and more
+ * accelerable cases. More detailed experiments would be useful here.
+ *
+ * 0x7FFFFFFF indicates that we don't use skip data
+ *
+ * Note: skip data itself is not written by the current implementation.
+ * The value is only stored in the '.tis'/'.tii' headers and compared with
+ * the number of documents of a term in addTerm().
+ *
+ * @var integer
+ */
+ public static $skipInterval = 0x7FFFFFFF;
+
+ /**
+ * Expert: The maximum number of skip levels. Smaller values result in
+ * slightly smaller indexes, but slower skipping in big posting lists.
+ *
+ * 0 indicates that we don't use skip data
+ *
+ * Note: apart from being written to the '.tis'/'.tii' headers, the value
+ * is not used in the current implementation.
+ *
+ * @var integer
+ */
+ public static $maxSkipLevels = 0;
+
+ /**
+ * Number of docs in a segment.
+ * Incremented once per addStoredFields() call.
+ *
+ * @var integer
+ */
+ protected $_docCount = 0;
+
+ /**
+ * Segment name. Used as the base name of every file of the segment.
+ *
+ * @var string
+ */
+ protected $_name;
+
+ /**
+ * File system adapter.
+ *
+ * @var Zend_Search_Lucene_Storage_Directory
+ */
+ protected $_directory;
+
+ /**
+ * List of the index file names (strings) created by this writer.
+ * Used for automatic compound file generation (see _generateCFS())
+ *
+ * @var array
+ */
+ protected $_files = array();
+
+ /**
+ * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment,
+ * keyed by field name.
+ *
+ * @var array
+ */
+ protected $_fields = array();
+
+ /**
+ * Normalization factors.
+ * An array fieldName => normVector
+ * normVector is a binary string.
+ * Each byte corresponds to an indexed document in a segment and
+ * encodes normalization factor (float value, encoded by
+ * Zend_Search_Lucene_Search_Similarity::encodeNorm())
+ *
+ * Read by _dumpFNM(). This class never fills the array itself; presumably
+ * concrete subclasses populate it (to be confirmed against them).
+ *
+ * @var array
+ */
+ protected $_norms = array();
+
+
+ /**
+ * '.fdx' file - Stored Fields, the field index.
+ * Created lazily by the first addStoredFields() call.
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ protected $_fdxFile = null;
+
+ /**
+ * '.fdt' file - Stored Fields, the field data.
+ * Created together with the '.fdx' file.
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ protected $_fdtFile = null;
+
+
+ /**
+  * Create a writer for the segment $name located in $directory.
+  *
+  * The constructor only remembers both values; it does not touch the
+  * directory.
+  *
+  * @param Zend_Search_Lucene_Storage_Directory $directory  storage the segment files are written to
+  * @param string $name  segment name, used as base name of the segment files
+  */
+ public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
+ {
+     $this->_name      = $name;
+     $this->_directory = $directory;
+ }
+
+
+ /**
+  * Register a document field with the segment.
+  *
+  * A field name seen for the first time gets the next free field number
+  * (the current size of the field list). For an already known name the
+  * stored FieldInfo is kept and its isIndexed / storeTermVector flags are
+  * switched on when the given field has them set; flags are never
+  * switched off.
+  *
+  * @param Zend_Search_Lucene_Field $field
+  * @return integer  number of the field within this segment
+  */
+ public function addField(Zend_Search_Lucene_Field $field)
+ {
+     $fieldName = $field->name;
+
+     if (isset($this->_fields[$fieldName])) {
+         $known = $this->_fields[$fieldName];
+
+         // Logical OR keeps the flags boolean. The former bitwise "|="
+         // silently converted them to integers.
+         $known->isIndexed       = $known->isIndexed       || $field->isIndexed;
+         $known->storeTermVector = $known->storeTermVector || $field->storeTermVector;
+
+         return $known->number;
+     }
+
+     $fieldNumber = count($this->_fields);
+     $this->_fields[$fieldName] =
+         new Zend_Search_Lucene_Index_FieldInfo($fieldName,
+                                                $field->isIndexed,
+                                                $fieldNumber,
+                                                $field->storeTermVector);
+
+     return $fieldNumber;
+ }
+
+ /**
+  * Register a field described by an existing FieldInfo object.
+  *
+  * The given object is never stored itself. For an unknown field name a
+  * new FieldInfo is created with the next free field number (the current
+  * size of the field list). For a known name the stored FieldInfo keeps
+  * its number and only gets its isIndexed / storeTermVector flags switched
+  * on when $fieldInfo has them set; flags are never switched off.
+  *
+  * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
+  * @return integer  number of the field within this segment
+  */
+ public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
+ {
+     $fieldName = $fieldInfo->name;
+
+     if (isset($this->_fields[$fieldName])) {
+         $known = $this->_fields[$fieldName];
+
+         // Logical OR keeps the flags boolean. The former bitwise "|="
+         // silently converted them to integers.
+         $known->isIndexed       = $known->isIndexed       || $fieldInfo->isIndexed;
+         $known->storeTermVector = $known->storeTermVector || $fieldInfo->storeTermVector;
+
+         return $known->number;
+     }
+
+     $fieldNumber = count($this->_fields);
+     $this->_fields[$fieldName] =
+         new Zend_Search_Lucene_Index_FieldInfo($fieldName,
+                                                $fieldInfo->isIndexed,
+                                                $fieldNumber,
+                                                $fieldInfo->storeTermVector);
+
+     return $fieldNumber;
+ }
+
+ /**
+  * Get the fields registered so far.
+  *
+  * @return array  Zend_Search_Lucene_Index_FieldInfo objects keyed by field name
+  */
+ public function getFieldInfos()
+ {
+     $fieldInfos = $this->_fields;
+
+     return $fieldInfos;
+ }
+
+ /**
+  * Write the stored fields of one document and count the document.
+  *
+  * The '.fdx' and '.fdt' files are created on the first call. For every
+  * call the current '.fdt' position is appended to '.fdx', followed in
+  * '.fdt' by the number of fields and, per field: its field number, a
+  * flag byte (0x01 tokenized, 0x02 binary) and the value. Binary values
+  * are written as length + raw bytes, all others as a UTF-8 string.
+  *
+  * Each field must already be registered (see addField()), because its
+  * number is looked up by name.
+  *
+  * @param array $storedFields array of Zend_Search_Lucene_Field objects
+  */
+ public function addStoredFields($storedFields)
+ {
+     if (!isset($this->_fdxFile)) {
+         $fdxName = $this->_name . '.fdx';
+         $fdtName = $this->_name . '.fdt';
+
+         $this->_fdxFile = $this->_directory->createFile($fdxName);
+         $this->_fdtFile = $this->_directory->createFile($fdtName);
+
+         $this->_files[] = $fdxName;
+         $this->_files[] = $fdtName;
+     }
+
+     $fdt = $this->_fdtFile;
+
+     // Index entry: where this document's data starts in the data file
+     $this->_fdxFile->writeLong($fdt->tell());
+     $fdt->writeVInt(count($storedFields));
+
+     foreach ($storedFields as $storedField) {
+         $fdt->writeVInt($this->_fields[$storedField->name]->number);
+
+         // 0x04 (third bit, ZLIB compression) is never set
+         $flags = 0x00;
+         if ($storedField->isTokenized) {
+             $flags |= 0x01;
+         }
+         if ($storedField->isBinary) {
+             $flags |= 0x02;
+         }
+         $fdt->writeByte($flags);
+
+         if (!$storedField->isBinary) {
+             $fdt->writeString($storedField->getUtf8Value());
+             continue;
+         }
+
+         $fdt->writeVInt(strlen($storedField->value));
+         $fdt->writeBytes($storedField->value);
+     }
+
+     $this->_docCount += 1;
+ }
+
+ /**
+  * Number of documents added to this segment so far
+  * (one per addStoredFields() call).
+  *
+  * @return integer
+  */
+ public function count()
+ {
+     $documents = $this->_docCount;
+
+     return $documents;
+ }
+
+ /**
+  * Get the name of the segment as passed to the constructor.
+  *
+  * @return string
+  */
+ public function getName()
+ {
+     $segmentName = $this->_name;
+
+     return $segmentName;
+ }
+
+ /**
+  * Dump the Field Info ('.fnm') and the norms ('.nrm') segment files.
+  *
+  * '.fnm' receives the number of fields and, per field, its name and a
+  * flag byte (0x01 indexed, 0x02 term vector stored). '.nrm' receives the
+  * 'NRM' header, the format byte 0xFF and the norm vector of every indexed
+  * field, in field order. Per-field norm files (pre-2.1 index mode) are
+  * not written. Both file names are appended to the segment file list.
+  */
+ protected function _dumpFNM()
+ {
+     $fnmName = $this->_name . '.fnm';
+     $nrmName = $this->_name . '.nrm';
+
+     $fieldInfoFile = $this->_directory->createFile($fnmName);
+     $fieldInfoFile->writeVInt(count($this->_fields));
+
+     $normsFile = $this->_directory->createFile($nrmName);
+     $normsFile->writeBytes('NRM');     // header
+     $normsFile->writeByte((int)0xFF);  // format specifier
+
+     foreach ($this->_fields as $fieldInfo) {
+         // Not supported yet:
+         //   0x04 - term positions are stored with the term vectors
+         //   0x08 - term offsets are stored with the term vectors
+         $flags = 0x00;
+         if ($fieldInfo->isIndexed) {
+             $flags |= 0x01;
+         }
+         if ($fieldInfo->storeTermVector) {
+             $flags |= 0x02;
+         }
+
+         $fieldInfoFile->writeString($fieldInfo->name);
+         $fieldInfoFile->writeByte($flags);
+
+         if (!$fieldInfo->isIndexed) {
+             continue;
+         }
+
+         $normsFile->writeBytes($this->_norms[$fieldInfo->name]);
+     }
+
+     $this->_files[] = $fnmName;
+     $this->_files[] = $nrmName;
+ }
+
+
+
+ /**
+ * Term Dictionary file ('.tis'), created by initializeDictionaryFiles()
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_tisFile = null;
+
+ /**
+ * Term Dictionary index file ('.tii'), created by initializeDictionaryFiles()
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_tiiFile = null;
+
+ /**
+ * Frequencies file ('.frq'), created by initializeDictionaryFiles()
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_frqFile = null;
+
+ /**
+ * Positions file ('.prx'), created by initializeDictionaryFiles()
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_prxFile = null;
+
+ /**
+ * Number of written terms.
+ * Reset by initializeDictionaryFiles(), incremented by addTerm() and
+ * written into the dictionary headers by closeDictionaryFiles().
+ *
+ * @var integer
+ */
+ private $_termCount;
+
+
+ /**
+ * Last term saved to the '.tis' file (updated by _dumpTermDictEntry())
+ *
+ * @var Zend_Search_Lucene_Index_Term
+ */
+ private $_prevTerm;
+
+ /**
+ * Term info of the last term saved to the '.tis' file
+ *
+ * @var Zend_Search_Lucene_Index_TermInfo
+ */
+ private $_prevTermInfo;
+
+ /**
+ * Last term saved to the '.tii' index file
+ *
+ * @var Zend_Search_Lucene_Index_Term
+ */
+ private $_prevIndexTerm;
+
+ /**
+ * Term info of the last term saved to the '.tii' index file
+ *
+ * @var Zend_Search_Lucene_Index_TermInfo
+ */
+ private $_prevIndexTermInfo;
+
+ /**
+ * Position in the '.tis' file recorded when the last '.tii' entry was
+ * written; addTerm() stores the distance to it as IndexDelta.
+ * Starts at 24 (see initializeDictionaryFiles()).
+ *
+ * @var integer
+ */
+ private $_lastIndexPosition;
+
+ /**
+  * Create dictionary, frequency and positions files and write necessary headers.
+  *
+  * Both dictionary files ('.tis' and '.tii') start with the same header:
+  * the int 0xFFFFFFFD (presumably the format marker), a placeholder term
+  * count that closeDictionaryFiles() overwrites later, and the three
+  * static tuning values. The '.tii' file additionally receives an initial
+  * entry for the empty term. The four file names are appended to the
+  * segment file list and all dictionary writing state is reset.
+  */
+ public function initializeDictionaryFiles()
+ {
+     $segment = $this->_name;
+
+     // Term dictionary header
+     $this->_tisFile = $this->_directory->createFile($segment . '.tis');
+     $tis = $this->_tisFile;
+     $tis->writeInt((int)0xFFFFFFFD);
+     $tis->writeLong(0 /* dummy data for terms count */);
+     $tis->writeInt(self::$indexInterval);
+     $tis->writeInt(self::$skipInterval);
+     $tis->writeInt(self::$maxSkipLevels);
+
+     // Term dictionary index header
+     $this->_tiiFile = $this->_directory->createFile($segment . '.tii');
+     $tii = $this->_tiiFile;
+     $tii->writeInt((int)0xFFFFFFFD);
+     $tii->writeLong(0 /* dummy data for terms count */);
+     $tii->writeInt(self::$indexInterval);
+     $tii->writeInt(self::$skipInterval);
+     $tii->writeInt(self::$maxSkipLevels);
+
+     // Initial index entry describing the empty term
+     $tii->writeVInt(0);               // prefix length
+     $tii->writeString('');            // suffix
+     $tii->writeInt((int)0xFFFFFFFF);  // field number
+     $tii->writeByte((int)0x0F);
+     $tii->writeVInt(0);               // DocFreq
+     $tii->writeVInt(0);               // FreqDelta
+     $tii->writeVInt(0);               // ProxDelta
+     $tii->writeVInt(24);              // IndexDelta
+
+     $this->_frqFile = $this->_directory->createFile($segment . '.frq');
+     $this->_prxFile = $this->_directory->createFile($segment . '.prx');
+
+     foreach (array('.tis', '.tii', '.frq', '.prx') as $extension) {
+         $this->_files[] = $segment . $extension;
+     }
+
+     // Nothing has been written to the dictionary yet
+     $this->_prevTerm          = null;
+     $this->_prevTermInfo      = null;
+     $this->_prevIndexTerm     = null;
+     $this->_prevIndexTermInfo = null;
+     $this->_termCount         = 0;
+     // Same value as the initial IndexDelta above; presumably the size of
+     // the '.tis' header (4 + 8 + 4 + 4 + 4 bytes)
+     $this->_lastIndexPosition = 24;
+ }
+
+ /**
+  * Add term
+  *
+  * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
+  *
+  * For every document the doubled doc id delta is written to the
+  * frequencies file: followed by the number of positions when there is
+  * more than one, otherwise with the lowest bit set. The positions go to
+  * the positions file as deltas. Afterwards the term is appended to the
+  * '.tis' dictionary, and every $indexInterval-th term is also written to
+  * the '.tii' index together with the distance to the previously recorded
+  * '.tis' position.
+  *
+  * initializeDictionaryFiles() has to be called before, and the field of
+  * the term has to be registered already.
+  *
+  * @param Zend_Search_Lucene_Index_Term $termEntry
+  * @param array $termDocs
+  */
+ public function addTerm($termEntry, $termDocs)
+ {
+     $frq = $this->_frqFile;
+     $prx = $this->_prxFile;
+
+     $freqStart = $frq->tell();
+     $proxStart = $prx->tell();
+
+     $lastDocId = 0;
+     foreach ($termDocs as $docId => $positions) {
+         $shiftedDelta = ($docId - $lastDocId)*2;
+         $lastDocId    = $docId;
+
+         $positionCount = count($positions);
+         if ($positionCount > 1) {
+             $frq->writeVInt($shiftedDelta);
+             $frq->writeVInt($positionCount);
+         } else {
+             $frq->writeVInt($shiftedDelta + 1);
+         }
+
+         $lastPosition = 0;
+         foreach ($positions as $position) {
+             $prx->writeVInt($position - $lastPosition);
+             $lastPosition = $position;
+         }
+     }
+
+     $docFreq = count($termDocs);
+
+     /**
+      * @todo Write Skip Data to a freq file.
+      * It's not used now, but make index more optimal
+      */
+     $skipOffset = ($docFreq >= self::$skipInterval) ? $frq->tell() - $freqStart : 0;
+
+     $fieldNumber = $this->_fields[$termEntry->field]->number;
+     $term        = new Zend_Search_Lucene_Index_Term($termEntry->text, $fieldNumber);
+     $termInfo    = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqStart, $proxStart, $skipOffset);
+
+     $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);
+
+     $termNumber = $this->_termCount + 1;
+     if ($termNumber % self::$indexInterval == 0) {
+         $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);
+
+         $tisPosition = $this->_tisFile->tell();
+         $this->_tiiFile->writeVInt($tisPosition - $this->_lastIndexPosition);
+         $this->_lastIndexPosition = $tisPosition;
+     }
+     $this->_termCount = $termNumber;
+ }
+
+ /**
+  * Finish the dictionary: overwrite the placeholder term counts.
+  *
+  * Seeks to offset 4 of the '.tis' and '.tii' files and writes the real
+  * number of entries there. This method itself does not close any file.
+  */
+ public function closeDictionaryFiles()
+ {
+     $terms    = $this->_termCount;
+     $interval = self::$indexInterval;
+
+     $this->_tisFile->seek(4);
+     $this->_tisFile->writeLong($terms);
+
+     $this->_tiiFile->seek(4);
+     // Number of complete intervals, i.e. of terms copied to the index by addTerm()
+     $indexedTerms = ($terms - $terms % $interval)/$interval;
+     // + 1 is used to count an additional special index entry (empty term at the start of the list)
+     $this->_tiiFile->writeLong($indexedTerms + 1);
+ }
+
+
+ /**
+ * Dump Term Dictionary segment file entry.
+ * Used to write entry to .tis or .tii files
+ *
+ * An entry consists of: prefix length (in characters) shared with the
+ * previous term of the same field, the remaining suffix, the field number,
+ * the document frequency, the frequency and position pointers (absolute
+ * for the first entry, otherwise deltas to the previous entry) and, only
+ * when it is not 0, the skip offset.
+ *
+ * $prevTerm and $prevTermInfo are passed by reference and are replaced by
+ * $term and $termInfo, so the caller's state always describes the last
+ * entry written to $dicFile.
+ *
+ * @param Zend_Search_Lucene_Storage_File $dicFile
+ * @param Zend_Search_Lucene_Index_Term $prevTerm
+ * @param Zend_Search_Lucene_Index_Term $term
+ * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
+ * @param Zend_Search_Lucene_Index_TermInfo $termInfo
+ */
+ protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
+ &$prevTerm, Zend_Search_Lucene_Index_Term $term,
+ &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
+ {
+ if (isset($prevTerm) && $prevTerm->field == $term->field) {
+ // Count the leading bytes both term texts have in common
+ $matchedBytes = 0;
+ $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
+ while ($matchedBytes < $maxBytes &&
+ $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
+ $matchedBytes++;
+ }
+
+ // Calculate actual matched UTF-8 pattern:
+ // convert the byte match into whole characters, because the prefix
+ // length is stored as a character count
+ $prefixBytes = 0;
+ $prefixChars = 0;
+ while ($prefixBytes < $matchedBytes) {
+ // Character length is derived from the lead byte: both top bits
+ // set means at least 2 bytes, bit 0x20 adds a third byte and
+ // bit 0x10 a fourth one
+ $charBytes = 1;
+ if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
+ $charBytes++;
+ if (ord($term->text[$prefixBytes]) & 0x20 ) {
+ $charBytes++;
+ if (ord($term->text[$prefixBytes]) & 0x10 ) {
+ $charBytes++;
+ }
+ }
+ }
+
+ if ($prefixBytes + $charBytes > $matchedBytes) {
+ // char crosses matched bytes boundary
+ // skip char
+ break;
+ }
+
+ $prefixChars++;
+ $prefixBytes += $charBytes;
+ }
+
+ // Write prefix length
+ $dicFile->writeVInt($prefixChars);
+ // Write suffix
+ $dicFile->writeString(substr($term->text, $prefixBytes));
+ } else {
+ // First term or a term of another field: nothing is shared
+ // Write prefix length
+ $dicFile->writeVInt(0);
+ // Write suffix
+ $dicFile->writeString($term->text);
+ }
+ // Write field number
+ $dicFile->writeVInt($term->field);
+ // DocFreq (the count of documents which contain the term)
+ $dicFile->writeVInt($termInfo->docFreq);
+
+ $prevTerm = $term;
+
+ if (!isset($prevTermInfo)) {
+ // No previous entry: the pointers are written as absolute values
+ // Write FreqDelta
+ $dicFile->writeVInt($termInfo->freqPointer);
+ // Write ProxDelta
+ $dicFile->writeVInt($termInfo->proxPointer);
+ } else {
+ // Write FreqDelta
+ $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
+ // Write ProxDelta
+ $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
+ }
+ // Write SkipOffset - addTerm() sets it when $termInfo->docFreq >= self::$skipInterval;
+ // a value of 0 is not written at all
+ if ($termInfo->skipOffset != 0) {
+ $dicFile->writeVInt($termInfo->skipOffset);
+ }
+
+ $prevTermInfo = $termInfo;
+ }
+
+
+ /**
+  * Generate compound index file
+  *
+  * Writes '<segment>.cfs': the number of files, then per file a data
+  * offset (placeholder first) and the file name, then the contents of all
+  * files in the same order. While the contents are copied the placeholder
+  * of each file is overwritten with its real data offset, and the source
+  * file is deleted from the directory afterwards.
+  *
+  * @throws RuntimeException  when a file delivers fewer bytes than the
+  *                           directory reports for it
+  */
+ protected function _generateCFS()
+ {
+     $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
+     $cfsFile->writeVInt(count($this->_files));
+
+     // Table of contents; the offsets are patched in the second pass
+     $dataOffsetPointers = array();
+     foreach ($this->_files as $fileName) {
+         $dataOffsetPointers[$fileName] = $cfsFile->tell();
+         $cfsFile->writeLong(0); // write dummy data
+         $cfsFile->writeString($fileName);
+     }
+
+     foreach ($this->_files as $fileName) {
+         // Get actual data offset
+         $dataOffset = $cfsFile->tell();
+         // Seek to the data offset pointer
+         $cfsFile->seek($dataOffsetPointers[$fileName]);
+         // Write actual data offset value
+         $cfsFile->writeLong($dataOffset);
+         // Seek back to the end of file
+         $cfsFile->seek($dataOffset);
+
+         $dataFile = $this->_directory->getFileObject($fileName);
+
+         $byteCount = $this->_directory->fileLength($fileName);
+         while ($byteCount > 0) {
+             $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
+
+             // A read that yields nothing would never reduce $byteCount
+             // and the loop would spin forever; fail loudly instead.
+             if (!is_string($data) || $data === '') {
+                 throw new RuntimeException(
+                     "Unexpected end of file '" . $fileName . "' while generating compound file: "
+                     . $byteCount . ' byte(s) missing.'
+                 );
+             }
+
+             $byteCount -= strlen($data);
+             $cfsFile->writeBytes($data);
+         }
+
+         $this->_directory->deleteFile($fileName);
+     }
+ }
+
+
+ /**
+ * Close segment, write it to disk and return segment info
+ *
+ * Abstract: this class provides no implementation; every concrete segment
+ * writer has to define how the segment is finalized.
+ *
+ * @return Zend_Search_Lucene_Index_SegmentInfo
+ */
+ abstract public function close();
+}
+