/**
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @package    Zend_Search_Lucene
 * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id: SegmentWriter.php 16541 2009-07-07 06:59:03Z bkarwin $
 */
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
/**
 * Base class for index segment writers.
 *
 * Collects field definitions, stored fields and term dictionary entries of a
 * single segment and writes the corresponding segment files through the
 * storage directory passed to the constructor. Concrete writers implement
 * close().
 *
 * @package    Zend_Search_Lucene
 * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
abstract class Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Expert: The fraction of terms in the "dictionary" which should be stored
     * in RAM.  Smaller values use more memory, but make searching slightly
     * faster, while larger values use less memory and make searching slightly
     * slower.  Searching is typically not dominated by dictionary lookup, so
     * tweaking this is rarely useful.
     *
     * @var integer
     */
    public static $indexInterval = 128;

    /**
     * Expert: The fraction of TermDocs entries stored in skip tables.
     * Larger values result in smaller indexes, greater acceleration, but fewer
     * accelerable cases, while smaller values result in bigger indexes,
     * less acceleration and more
     * accelerable cases. More detailed experiments would be useful here.
     *
     * 0x7FFFFFFF indicates that we don't use skip data
     *
     * Note: not used in current implementation
     *
     * @var integer
     */
    public static $skipInterval = 0x7FFFFFFF;

    /**
     * Expert: The maximum number of skip levels. Smaller values result in
     * slightly smaller indexes, but slower skipping in big posting lists.
     *
     * 0 indicates that we don't use skip data
     *
     * Note: not used in current implementation
     *
     * @var integer
     */
    public static $maxSkipLevels = 0;

    /**
     * Number of docs in a segment
     *
     * @var integer
     */
    protected $_docCount = 0;

    /**
     * Segment name. Used as the base name of every segment file.
     *
     * @var string
     */
    protected $_name;

    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    protected $_directory;

    /**
     * List of the index files.
     * Used for automatic compound file generation
     *
     * @var array
     */
    protected $_files = array();

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
     * (keyed by field name)
     *
     * @var array
     */
    protected $_fields = array();

    /**
     * Normalization factors.
     * An array fieldName => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * @var array
     */
    protected $_norms = array();

    /**
     * '.fdx'  file - Stored Fields, the field index.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdxFile = null;

    /**
     * '.fdt'  file - Stored Fields, the field data.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdtFile = null;

    /**
     * Term Dictionary file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tisFile = null;

    /**
     * Term Dictionary index file
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tiiFile = null;

    /**
     * '.frq' file - receives document deltas and term frequencies in addTerm()
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_frqFile = null;

    /**
     * '.prx' file - receives term position deltas in addTerm()
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Number of written terms
     *
     * @var integer
     */
    private $_termCount;

    /**
     * Last saved term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevTerm;

    /**
     * Last saved term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevTermInfo;

    /**
     * Last saved index term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevIndexTerm;

    /**
     * Last saved index term info
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevIndexTermInfo;

    /**
     * Last term dictionary file position
     *
     * @var integer
     */
    private $_lastIndexPosition;


    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        $this->_directory = $directory;
        $this->_name      = $name;
    }


    /**
     * Add field to the segment
     *
     * Returns actual field number
     *
     * @param Zend_Search_Lucene_Field $field
     * @return integer
     */
    public function addField(Zend_Search_Lucene_Field $field)
    {
        return $this->_registerField($field->name, $field->isIndexed, $field->storeTermVector);
    }

    /**
     * Add fieldInfo to the segment
     *
     * Returns actual field number
     *
     * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
     * @return integer
     */
    public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
    {
        return $this->_registerField($fieldInfo->name, $fieldInfo->isIndexed, $fieldInfo->storeTermVector);
    }

    /**
     * Register a field by name or merge its flags into an existing entry.
     *
     * A new field gets the next free number (the current size of the field
     * list). For a known field the isIndexed and storeTermVector flags are
     * OR-ed with the supplied ones, so a flag once set is never cleared.
     *
     * @param string  $name
     * @param boolean $isIndexed
     * @param boolean $storeTermVector
     * @return integer  field number
     */
    private function _registerField($name, $isIndexed, $storeTermVector)
    {
        if (!isset($this->_fields[$name])) {
            $fieldNumber = count($this->_fields);
            $this->_fields[$name] =
                                new Zend_Search_Lucene_Index_FieldInfo($name,
                                                                       $isIndexed,
                                                                       $fieldNumber,
                                                                       $storeTermVector);

            return $fieldNumber;
        }

        $this->_fields[$name]->isIndexed       |= $isIndexed;
        $this->_fields[$name]->storeTermVector |= $storeTermVector;

        return $this->_fields[$name]->number;
    }

    /**
     * Returns array of FieldInfo objects.
     *
     * @return array
     */
    public function getFieldInfos()
    {
        return $this->_fields;
    }

    /**
     * Add stored fields information
     *
     * Writes one document entry: the current '.fdt' position goes to the
     * '.fdx' file, then the field count and each field (number, flag byte,
     * value) go to the '.fdt' file. Both files are created on first use.
     * Every field must already be registered through addField()/addFieldInfo(),
     * because its number is looked up by name.
     *
     * @param array $storedFields array of Zend_Search_Lucene_Field objects
     */
    public function addStoredFields($storedFields)
    {
        if (!isset($this->_fdxFile)) {
            $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
            $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

            $this->_files[] = $this->_name . '.fdx';
            $this->_files[] = $this->_name . '.fdt';
        }

        $this->_fdxFile->writeLong($this->_fdtFile->tell());
        $this->_fdtFile->writeVInt(count($storedFields));
        foreach ($storedFields as $field) {
            $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
            $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
                         ($field->isBinary ?    0x02 : 0x00) |
                         0x00; /* 0x04 - third bit, compressed (ZLIB) */
            $this->_fdtFile->writeByte($fieldBits);
            if ($field->isBinary) {
                // Binary values are written as raw bytes with an explicit length
                $this->_fdtFile->writeVInt(strlen($field->value));
                $this->_fdtFile->writeBytes($field->value);
            } else {
                $this->_fdtFile->writeString($field->getUtf8Value());
            }
        }

        // One call corresponds to one document of the segment
        $this->_docCount++;
    }

    /**
     * Returns the total number of documents in this segment.
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }

    /**
     * Return segment name
     *
     * @return string
     */
    public function getName()
    {
        return $this->_name;
    }

    /**
     * Dump Field Info (.fnm) segment file
     *
     * Also writes the '.nrm' file: an 'NRM' header, a format byte and the
     * norm vector of every indexed field, in field order.
     */
    protected function _dumpFNM()
    {
        $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
        $fnmFile->writeVInt(count($this->_fields));

        $nrmFile = $this->_directory->createFile($this->_name . '.nrm');
        // Write header
        $nrmFile->writeBytes('NRM');
        // Write format specifier
        $nrmFile->writeByte((int)0xFF);

        foreach ($this->_fields as $field) {
            $fnmFile->writeString($field->name);
            $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
                                ($field->storeTermVector ? 0x02 : 0x00)
// not supported yet            0x04 /* term positions are stored with the term vectors */ |
// not supported yet            0x08 /* term offsets are stored with the term vectors */   |
                               );

            if ($field->isIndexed) {
                // pre-2.1 index mode (not used now)
                // $normFileName = $this->_name . '.f' . $field->number;
                // $fFile = $this->_directory->createFile($normFileName);
                // $fFile->writeBytes($this->_norms[$field->name]);
                // $this->_files[] = $normFileName;

                $nrmFile->writeBytes($this->_norms[$field->name]);
            }
        }

        $this->_files[] = $this->_name . '.fnm';
        $this->_files[] = $this->_name . '.nrm';
    }


    /**
     * Create dictionary, frequency and positions files and write necessary headers
     *
     * Resets the term counter and all "previous term" state, so it has to be
     * called before the first addTerm() call.
     */
    public function initializeDictionaryFiles()
    {
        $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
        $this->_tisFile->writeInt((int)0xFFFFFFFD);
        $this->_tisFile->writeLong(0 /* dummy data for terms count */);
        $this->_tisFile->writeInt(self::$indexInterval);
        $this->_tisFile->writeInt(self::$skipInterval);
        $this->_tisFile->writeInt(self::$maxSkipLevels);

        $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
        $this->_tiiFile->writeInt((int)0xFFFFFFFD);
        $this->_tiiFile->writeLong(0 /* dummy data for terms count */);
        $this->_tiiFile->writeInt(self::$indexInterval);
        $this->_tiiFile->writeInt(self::$skipInterval);
        $this->_tiiFile->writeInt(self::$maxSkipLevels);

        /** Dump dictionary header */
        $this->_tiiFile->writeVInt(0);                    // prefix length
        $this->_tiiFile->writeString('');                 // suffix
        $this->_tiiFile->writeInt((int)0xFFFFFFFF);       // field number
        $this->_tiiFile->writeByte((int)0x0F);
        $this->_tiiFile->writeVInt(0);                    // DocFreq
        $this->_tiiFile->writeVInt(0);                    // FreqDelta
        $this->_tiiFile->writeVInt(0);                    // ProxDelta
        $this->_tiiFile->writeVInt(24);                   // IndexDelta

        $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
        $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');

        $this->_files[] = $this->_name . '.tis';
        $this->_files[] = $this->_name . '.tii';
        $this->_files[] = $this->_name . '.frq';
        $this->_files[] = $this->_name . '.prx';

        $this->_prevTerm          = null;
        $this->_prevTermInfo      = null;
        $this->_prevIndexTerm     = null;
        $this->_prevIndexTermInfo = null;
        // Must match the IndexDelta written above; presumably the size of the
        // '.tis' header (Int + Long + 3 * Int) - confirm against the storage layer
        $this->_lastIndexPosition = 24;
        $this->_termCount         = 0;
    }

    /**
     * Adds term
     *
     * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
     *
     * For every document the '.frq' file gets the doc id delta shifted left by
     * one bit; the low bit is set instead of writing a separate frequency when
     * the document has at most one position. Position deltas go to '.prx'.
     * Every self::$indexInterval-th term is additionally written to the
     * dictionary index ('.tii').
     *
     * @param Zend_Search_Lucene_Index_Term $termEntry
     * @param array $termDocs
     */
    public function addTerm($termEntry, $termDocs)
    {
        $freqPointer = $this->_frqFile->tell();
        $proxPointer = $this->_prxFile->tell();

        $prevDoc = 0;
        foreach ($termDocs as $docId => $termPositions) {
            $docDelta = ($docId - $prevDoc)*2;
            $prevDoc = $docId;
            if (count($termPositions) > 1) {
                $this->_frqFile->writeVInt($docDelta);
                $this->_frqFile->writeVInt(count($termPositions));
            } else {
                $this->_frqFile->writeVInt($docDelta + 1);
            }

            // Positions are delta-encoded per document
            $prevPosition = 0;
            foreach ($termPositions as $position) {
                $this->_prxFile->writeVInt($position - $prevPosition);
                $prevPosition = $position;
            }
        }

        if (count($termDocs) >= self::$skipInterval) {
            /**
             * @todo Write Skip Data to a freq file.
             * It's not used now, but make index more optimal
             */
            $skipOffset = $this->_frqFile->tell() - $freqPointer;
        } else {
            $skipOffset = 0;
        }

        // The dictionary stores the field number, not the field name
        $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
                                                  $this->_fields[$termEntry->field]->number);
        $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs),
                                                          $freqPointer, $proxPointer, $skipOffset);

        $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

        if (($this->_termCount + 1) % self::$indexInterval == 0) {
            $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

            $indexPosition = $this->_tisFile->tell();
            $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
            $this->_lastIndexPosition = $indexPosition;
        }

        $this->_termCount++;
    }

    /**
     * Close dictionary
     *
     * Replaces the dummy term counts written by initializeDictionaryFiles()
     * with the real values (offset 4 of the '.tis' and '.tii' files).
     */
    public function closeDictionaryFiles()
    {
        $this->_tisFile->seek(4);
        $this->_tisFile->writeLong($this->_termCount);

        $this->_tiiFile->seek(4);
        // + 1 is used to count an additional special index entry (empty term at the start of the list)
        $indexTermCount = ($this->_termCount - $this->_termCount % self::$indexInterval)/self::$indexInterval + 1;
        $this->_tiiFile->writeLong($indexTermCount);
    }


    /**
     * Dump Term Dictionary segment file entry.
     * Used to write entry to .tis or .tii files
     *
     * The term text is prefix-compressed against $prevTerm when both terms
     * belong to the same field. $prevTerm and $prevTermInfo are passed by
     * reference and are replaced with $term and $termInfo on return.
     *
     * @param Zend_Search_Lucene_Storage_File $dicFile
     * @param Zend_Search_Lucene_Index_Term $prevTerm
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
     */
    protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
    {
        if (isset($prevTerm) && $prevTerm->field == $term->field) {
            // Length of the common byte prefix
            $matchedBytes = 0;
            $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
            while ($matchedBytes < $maxBytes  &&
                   $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
                $matchedBytes++;
            }

            // Calculate actual matched UTF-8 pattern
            $prefixBytes = 0;
            $prefixChars = 0;
            while ($prefixBytes < $matchedBytes) {
                // Character length is derived from the bits of its lead byte
                $charBytes = 1;
                if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
                    $charBytes++;
                    if (ord($term->text[$prefixBytes]) & 0x20 ) {
                        $charBytes++;
                        if (ord($term->text[$prefixBytes]) & 0x10 ) {
                            $charBytes++;
                        }
                    }
                }

                if ($prefixBytes + $charBytes > $matchedBytes) {
                    // char crosses matched bytes boundary
                    // skip char
                    break;
                }

                $prefixChars++;
                $prefixBytes += $charBytes;
            }

            // Write prefix length (in characters)
            $dicFile->writeVInt($prefixChars);
            // Write suffix
            $dicFile->writeString(substr($term->text, $prefixBytes));
        } else {
            // Write prefix length
            $dicFile->writeVInt(0);
            // Write suffix
            $dicFile->writeString($term->text);
        }
        // Write field number
        $dicFile->writeVInt($term->field);
        // DocFreq (the count of documents which contain the term)
        $dicFile->writeVInt($termInfo->docFreq);

        $prevTerm = $term;

        if (!isset($prevTermInfo)) {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer);
        } else {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
        }
        // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval
        if ($termInfo->skipOffset != 0) {
            $dicFile->writeVInt($termInfo->skipOffset);
        }

        $prevTermInfo = $termInfo;
    }


    /**
     * Generate compound index file
     *
     * Writes a '.cfs' file containing a directory (data offset + name for
     * every file in $this->_files) followed by the contents of those files.
     * Each source file is deleted from the directory after it is copied.
     *
     * @throws RuntimeException if a source file yields no data before the
     *                          length reported by the directory is reached
     */
    protected function _generateCFS()
    {
        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
        $cfsFile->writeVInt(count($this->_files));

        // First pass: write the directory with placeholder offsets
        $dataOffsetPointers = array();
        foreach ($this->_files as $fileName) {
            $dataOffsetPointers[$fileName] = $cfsFile->tell();
            $cfsFile->writeLong(0); // write dummy data
            $cfsFile->writeString($fileName);
        }

        // Second pass: patch the real offset, then append the file data
        foreach ($this->_files as $fileName) {
            // Get actual data offset
            $dataOffset = $cfsFile->tell();
            // Seek to the data offset pointer
            $cfsFile->seek($dataOffsetPointers[$fileName]);
            // Write actual data offset value
            $cfsFile->writeLong($dataOffset);
            // Seek back to the end of file
            $cfsFile->seek($dataOffset);

            $dataFile = $this->_directory->getFileObject($fileName);

            $byteCount = $this->_directory->fileLength($fileName);
            while ($byteCount > 0) {
                $data      = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
                $bytesRead = strlen($data);
                if ($bytesRead == 0) {
                    // Without this check a short read would loop forever
                    throw new RuntimeException(
                        "Unexpected end of data while copying '" . $fileName . "' into the compound file."
                    );
                }
                $byteCount -= $bytesRead;
                $cfsFile->writeBytes($data);
            }

            $this->_directory->deleteFile($fileName);
        }
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo
     */
    abstract public function close();
}