final move of files
[web.mtrack] / Zend / Search / Lucene / Index / SegmentWriter.php
1 <?php
2 /**
3  * Zend Framework
4  *
5  * LICENSE
6  *
7  * This source file is subject to the new BSD license that is bundled
8  * with this package in the file LICENSE.txt.
9  * It is also available through the world-wide-web at this URL:
10  * http://framework.zend.com/license/new-bsd
11  * If you did not receive a copy of the license and are unable to
12  * obtain it through the world-wide-web, please send an email
13  * to license@zend.com so we can send you a copy immediately.
14  *
15  * @category   Zend
16  * @package    Zend_Search_Lucene
17  * @subpackage Index
18  * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
20  * @version    $Id: SegmentWriter.php 16541 2009-07-07 06:59:03Z bkarwin $
21  */
22
23 /** Zend_Search_Lucene_Index_SegmentInfo */
24 require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
25
26 /**
27  * @category   Zend
28  * @package    Zend_Search_Lucene
29  * @subpackage Index
30  * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
31  * @license    http://framework.zend.com/license/new-bsd     New BSD License
32  */
33 abstract class Zend_Search_Lucene_Index_SegmentWriter
34 {
    /**
     * Expert: The fraction of terms in the "dictionary" which should be stored
     * in RAM.  Smaller values use more memory, but make searching slightly
     * faster, while larger values use less memory and make searching slightly
     * slower.  Searching is typically not dominated by dictionary lookup, so
     * tweaking this is rarely useful.
     *
     * Within this class the value is written into the headers of the '.tis'
     * and '.tii' files, and addTerm() copies every $indexInterval-th term
     * into the '.tii' file.
     *
     * @var integer
     */
    public static $indexInterval = 128;

    /**
     * Expert: The fraction of TermDocs entries stored in skip tables.
     * Larger values result in smaller indexes, greater acceleration, but fewer
     * accelerable cases, while smaller values result in bigger indexes,
     * less acceleration and more
     * accelerable cases. More detailed experiments would be useful here.
     *
     * 0x7FFFFFFF indicates that we don't use skip data
     *
     * Note: no skip data is written by this class. The value is only stored
     * in the '.tis'/'.tii' headers and compared with the number of documents
     * of a term in addTerm().
     *
     * @var integer
     */
    public static $skipInterval = 0x7FFFFFFF;

    /**
     * Expert: The maximum number of skip levels. Smaller values result in
     * slightly smaller indexes, but slower skipping in big posting lists.
     *
     * 0 indicates that we don't use skip data
     *
     * Note: not used in current implementation apart from being written into
     * the '.tis'/'.tii' headers.
     *
     * @var integer
     */
    public static $maxSkipLevels = 0;
72
    /**
     * Number of docs in a segment.
     * Incremented by addStoredFields(), returned by count().
     *
     * @var integer
     */
    protected $_docCount = 0;

    /**
     * Segment name.
     * Used as the base name of every file created by this class
     * (name + extension such as '.fdx', '.tis', '.cfs').
     *
     * @var string
     */
    protected $_name;

    /**
     * File system adapter.
     * All segment files are created, read and deleted through it.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    protected $_directory;

    /**
     * List of the index files (file names, in the order they were registered).
     * Used for automatic compound file generation: _generateCFS() packs and
     * then deletes every file listed here.
     *
     * @var array
     */
    protected $_files = array();

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment,
     * keyed by field name (see addField() and addFieldInfo()).
     *
     * @var array
     */
    protected $_fields = array();

    /**
     * Normalization factors.
     * An array fieldName => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * This class only reads the vectors (in _dumpFNM()); presumably they are
     * filled in by subclasses -- confirm against the concrete writers.
     *
     * @var array
     */
    protected $_norms = array();


    /**
     * '.fdx'  file - Stored Fields, the field index.
     * Created lazily by the first addStoredFields() call.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdxFile = null;

    /**
     * '.fdt'  file - Stored Fields, the field data.
     * Created lazily by the first addStoredFields() call.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    protected $_fdtFile = null;
135
136
137     /**
138      * Object constructor.
139      *
140      * @param Zend_Search_Lucene_Storage_Directory $directory
141      * @param string $name
142      */
143     public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
144     {
145         $this->_directory = $directory;
146         $this->_name      = $name;
147     }
148
149
150     /**
151      * Add field to the segment
152      *
153      * Returns actual field number
154      *
155      * @param Zend_Search_Lucene_Field $field
156      * @return integer
157      */
158     public function addField(Zend_Search_Lucene_Field $field)
159     {
160         if (!isset($this->_fields[$field->name])) {
161             $fieldNumber = count($this->_fields);
162             $this->_fields[$field->name] =
163                                 new Zend_Search_Lucene_Index_FieldInfo($field->name,
164                                                                        $field->isIndexed,
165                                                                        $fieldNumber,
166                                                                        $field->storeTermVector);
167
168             return $fieldNumber;
169         } else {
170             $this->_fields[$field->name]->isIndexed       |= $field->isIndexed;
171             $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;
172
173             return $this->_fields[$field->name]->number;
174         }
175     }
176
177     /**
178      * Add fieldInfo to the segment
179      *
180      * Returns actual field number
181      *
182      * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
183      * @return integer
184      */
185     public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
186     {
187         if (!isset($this->_fields[$fieldInfo->name])) {
188             $fieldNumber = count($this->_fields);
189             $this->_fields[$fieldInfo->name] =
190                                 new Zend_Search_Lucene_Index_FieldInfo($fieldInfo->name,
191                                                                        $fieldInfo->isIndexed,
192                                                                        $fieldNumber,
193                                                                        $fieldInfo->storeTermVector);
194
195             return $fieldNumber;
196         } else {
197             $this->_fields[$fieldInfo->name]->isIndexed       |= $fieldInfo->isIndexed;
198             $this->_fields[$fieldInfo->name]->storeTermVector |= $fieldInfo->storeTermVector;
199
200             return $this->_fields[$fieldInfo->name]->number;
201         }
202     }
203
204     /**
205      * Returns array of FieldInfo objects.
206      *
207      * @return array
208      */
209     public function getFieldInfos()
210     {
211         return $this->_fields;
212     }
213
214     /**
215      * Add stored fields information
216      *
217      * @param array $storedFields array of Zend_Search_Lucene_Field objects
218      */
219     public function addStoredFields($storedFields)
220     {
221         if (!isset($this->_fdxFile)) {
222             $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
223             $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
224
225             $this->_files[] = $this->_name . '.fdx';
226             $this->_files[] = $this->_name . '.fdt';
227         }
228
229         $this->_fdxFile->writeLong($this->_fdtFile->tell());
230         $this->_fdtFile->writeVInt(count($storedFields));
231         foreach ($storedFields as $field) {
232             $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
233             $fieldBits = ($field->isTokenized ? 0x01 : 0x00) |
234                          ($field->isBinary ?    0x02 : 0x00) |
235                          0x00; /* 0x04 - third bit, compressed (ZLIB) */
236             $this->_fdtFile->writeByte($fieldBits);
237             if ($field->isBinary) {
238                 $this->_fdtFile->writeVInt(strlen($field->value));
239                 $this->_fdtFile->writeBytes($field->value);
240             } else {
241                 $this->_fdtFile->writeString($field->getUtf8Value());
242             }
243         }
244
245         $this->_docCount++;
246     }
247
248     /**
249      * Returns the total number of documents in this segment.
250      *
251      * @return integer
252      */
253     public function count()
254     {
255         return $this->_docCount;
256     }
257
258     /**
259      * Return segment name
260      *
261      * @return string
262      */
263     public function getName()
264     {
265         return $this->_name;
266     }
267
268     /**
269      * Dump Field Info (.fnm) segment file
270      */
271     protected function _dumpFNM()
272     {
273         $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
274         $fnmFile->writeVInt(count($this->_fields));
275
276         $nrmFile = $this->_directory->createFile($this->_name . '.nrm');
277         // Write header
278         $nrmFile->writeBytes('NRM');
279         // Write format specifier
280         $nrmFile->writeByte((int)0xFF);
281
282         foreach ($this->_fields as $field) {
283             $fnmFile->writeString($field->name);
284             $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
285                                 ($field->storeTermVector ? 0x02 : 0x00)
286 // not supported yet            0x04 /* term positions are stored with the term vectors */ |
287 // not supported yet            0x08 /* term offsets are stored with the term vectors */   |
288                                );
289
290             if ($field->isIndexed) {
291                 // pre-2.1 index mode (not used now)
292                 // $normFileName = $this->_name . '.f' . $field->number;
293                 // $fFile = $this->_directory->createFile($normFileName);
294                 // $fFile->writeBytes($this->_norms[$field->name]);
295                 // $this->_files[] = $normFileName;
296
297                 $nrmFile->writeBytes($this->_norms[$field->name]);
298             }
299         }
300
301         $this->_files[] = $this->_name . '.fnm';
302         $this->_files[] = $this->_name . '.nrm';
303     }
304
305
306
    /**
     * Term Dictionary file ('.tis').
     * Created by initializeDictionaryFiles().
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tisFile = null;

    /**
     * Term Dictionary index file ('.tii').
     * Created by initializeDictionaryFiles().
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_tiiFile = null;

    /**
     * Frequencies file ('.frq').
     * Created by initializeDictionaryFiles(), filled by addTerm().
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_frqFile = null;

    /**
     * Positions file ('.prx').
     * Created by initializeDictionaryFiles(), filled by addTerm().
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_prxFile = null;

    /**
     * Number of written terms.
     * Reset to 0 by initializeDictionaryFiles(), incremented by addTerm()
     * and written into the file headers by closeDictionaryFiles().
     *
     * @var integer
     */
    private $_termCount;


    /**
     * Last saved term (previous entry of the '.tis' file).
     * Null until the first term is written.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevTerm;

    /**
     * Last saved term info (previous entry of the '.tis' file).
     * Null until the first term is written.
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevTermInfo;

    /**
     * Last saved index term (previous entry of the '.tii' file).
     * Null until the first index term is written.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_prevIndexTerm;

    /**
     * Last saved index term info (previous entry of the '.tii' file).
     * Null until the first index term is written.
     *
     * @var Zend_Search_Lucene_Index_TermInfo
     */
    private $_prevIndexTermInfo;

    /**
     * Last term dictionary file position.
     * The '.tis' position recorded when the previous '.tii' entry was
     * written; initializeDictionaryFiles() sets it to 24.
     *
     * @var integer
     */
    private $_lastIndexPosition;
377
378     /**
379      * Create dicrionary, frequency and positions files and write necessary headers
380      */
381     public function initializeDictionaryFiles()
382     {
383         $this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
384         $this->_tisFile->writeInt((int)0xFFFFFFFD);
385         $this->_tisFile->writeLong(0 /* dummy data for terms count */);
386         $this->_tisFile->writeInt(self::$indexInterval);
387         $this->_tisFile->writeInt(self::$skipInterval);
388         $this->_tisFile->writeInt(self::$maxSkipLevels);
389
390         $this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
391         $this->_tiiFile->writeInt((int)0xFFFFFFFD);
392         $this->_tiiFile->writeLong(0 /* dummy data for terms count */);
393         $this->_tiiFile->writeInt(self::$indexInterval);
394         $this->_tiiFile->writeInt(self::$skipInterval);
395         $this->_tiiFile->writeInt(self::$maxSkipLevels);
396
397         /** Dump dictionary header */
398         $this->_tiiFile->writeVInt(0);                    // preffix length
399         $this->_tiiFile->writeString('');                 // suffix
400         $this->_tiiFile->writeInt((int)0xFFFFFFFF);       // field number
401         $this->_tiiFile->writeByte((int)0x0F);
402         $this->_tiiFile->writeVInt(0);                    // DocFreq
403         $this->_tiiFile->writeVInt(0);                    // FreqDelta
404         $this->_tiiFile->writeVInt(0);                    // ProxDelta
405         $this->_tiiFile->writeVInt(24);                   // IndexDelta
406
407         $this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
408         $this->_prxFile = $this->_directory->createFile($this->_name . '.prx');
409
410         $this->_files[] = $this->_name . '.tis';
411         $this->_files[] = $this->_name . '.tii';
412         $this->_files[] = $this->_name . '.frq';
413         $this->_files[] = $this->_name . '.prx';
414
415         $this->_prevTerm          = null;
416         $this->_prevTermInfo      = null;
417         $this->_prevIndexTerm     = null;
418         $this->_prevIndexTermInfo = null;
419         $this->_lastIndexPosition = 24;
420         $this->_termCount         = 0;
421
422     }
423
424     /**
425      * Add term
426      *
427      * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
428      *
429      * @param Zend_Search_Lucene_Index_Term $termEntry
430      * @param array $termDocs
431      */
432     public function addTerm($termEntry, $termDocs)
433     {
434         $freqPointer = $this->_frqFile->tell();
435         $proxPointer = $this->_prxFile->tell();
436
437         $prevDoc = 0;
438         foreach ($termDocs as $docId => $termPositions) {
439             $docDelta = ($docId - $prevDoc)*2;
440             $prevDoc = $docId;
441             if (count($termPositions) > 1) {
442                 $this->_frqFile->writeVInt($docDelta);
443                 $this->_frqFile->writeVInt(count($termPositions));
444             } else {
445                 $this->_frqFile->writeVInt($docDelta + 1);
446             }
447
448             $prevPosition = 0;
449             foreach ($termPositions as $position) {
450                 $this->_prxFile->writeVInt($position - $prevPosition);
451                 $prevPosition = $position;
452             }
453         }
454
455         if (count($termDocs) >= self::$skipInterval) {
456             /**
457              * @todo Write Skip Data to a freq file.
458              * It's not used now, but make index more optimal
459              */
460             $skipOffset = $this->_frqFile->tell() - $freqPointer;
461         } else {
462             $skipOffset = 0;
463         }
464
465         $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
466                                                   $this->_fields[$termEntry->field]->number);
467         $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($termDocs),
468                                                           $freqPointer, $proxPointer, $skipOffset);
469
470         $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);
471
472         if (($this->_termCount + 1) % self::$indexInterval == 0) {
473             $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);
474
475             $indexPosition = $this->_tisFile->tell();
476             $this->_tiiFile->writeVInt($indexPosition - $this->_lastIndexPosition);
477             $this->_lastIndexPosition = $indexPosition;
478
479         }
480         $this->_termCount++;
481     }
482
483     /**
484      * Close dictionary
485      */
486     public function closeDictionaryFiles()
487     {
488         $this->_tisFile->seek(4);
489         $this->_tisFile->writeLong($this->_termCount);
490
491         $this->_tiiFile->seek(4);
492         // + 1 is used to count an additional special index entry (empty term at the start of the list)
493         $this->_tiiFile->writeLong(($this->_termCount - $this->_termCount % self::$indexInterval)/self::$indexInterval + 1);
494     }
495
496
497     /**
498      * Dump Term Dictionary segment file entry.
499      * Used to write entry to .tis or .tii files
500      *
501      * @param Zend_Search_Lucene_Storage_File $dicFile
502      * @param Zend_Search_Lucene_Index_Term $prevTerm
503      * @param Zend_Search_Lucene_Index_Term $term
504      * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
505      * @param Zend_Search_Lucene_Index_TermInfo $termInfo
506      */
507     protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
508                                         &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
509                                         &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
510     {
511         if (isset($prevTerm) && $prevTerm->field == $term->field) {
512             $matchedBytes = 0;
513             $maxBytes = min(strlen($prevTerm->text), strlen($term->text));
514             while ($matchedBytes < $maxBytes  &&
515                    $prevTerm->text[$matchedBytes] == $term->text[$matchedBytes]) {
516                 $matchedBytes++;
517             }
518
519             // Calculate actual matched UTF-8 pattern
520             $prefixBytes = 0;
521             $prefixChars = 0;
522             while ($prefixBytes < $matchedBytes) {
523                 $charBytes = 1;
524                 if ((ord($term->text[$prefixBytes]) & 0xC0) == 0xC0) {
525                     $charBytes++;
526                     if (ord($term->text[$prefixBytes]) & 0x20 ) {
527                         $charBytes++;
528                         if (ord($term->text[$prefixBytes]) & 0x10 ) {
529                             $charBytes++;
530                         }
531                     }
532                 }
533
534                 if ($prefixBytes + $charBytes > $matchedBytes) {
535                     // char crosses matched bytes boundary
536                     // skip char
537                     break;
538                 }
539
540                 $prefixChars++;
541                 $prefixBytes += $charBytes;
542             }
543
544             // Write preffix length
545             $dicFile->writeVInt($prefixChars);
546             // Write suffix
547             $dicFile->writeString(substr($term->text, $prefixBytes));
548         } else {
549             // Write preffix length
550             $dicFile->writeVInt(0);
551             // Write suffix
552             $dicFile->writeString($term->text);
553         }
554         // Write field number
555         $dicFile->writeVInt($term->field);
556         // DocFreq (the count of documents which contain the term)
557         $dicFile->writeVInt($termInfo->docFreq);
558
559         $prevTerm = $term;
560
561         if (!isset($prevTermInfo)) {
562             // Write FreqDelta
563             $dicFile->writeVInt($termInfo->freqPointer);
564             // Write ProxDelta
565             $dicFile->writeVInt($termInfo->proxPointer);
566         } else {
567             // Write FreqDelta
568             $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
569             // Write ProxDelta
570             $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
571         }
572         // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval
573         if ($termInfo->skipOffset != 0) {
574             $dicFile->writeVInt($termInfo->skipOffset);
575         }
576
577         $prevTermInfo = $termInfo;
578     }
579
580
581     /**
582      * Generate compound index file
583      */
584     protected function _generateCFS()
585     {
586         $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
587         $cfsFile->writeVInt(count($this->_files));
588
589         $dataOffsetPointers = array();
590         foreach ($this->_files as $fileName) {
591             $dataOffsetPointers[$fileName] = $cfsFile->tell();
592             $cfsFile->writeLong(0); // write dummy data
593             $cfsFile->writeString($fileName);
594         }
595
596         foreach ($this->_files as $fileName) {
597             // Get actual data offset
598             $dataOffset = $cfsFile->tell();
599             // Seek to the data offset pointer
600             $cfsFile->seek($dataOffsetPointers[$fileName]);
601             // Write actual data offset value
602             $cfsFile->writeLong($dataOffset);
603             // Seek back to the end of file
604             $cfsFile->seek($dataOffset);
605
606             $dataFile = $this->_directory->getFileObject($fileName);
607
608             $byteCount = $this->_directory->fileLength($fileName);
609             while ($byteCount > 0) {
610                 $data = $dataFile->readBytes(min($byteCount, 131072 /*128Kb*/));
611                 $byteCount -= strlen($data);
612                 $cfsFile->writeBytes($data);
613             }
614
615             $this->_directory->deleteFile($fileName);
616         }
617     }
618
619
    /**
     * Close segment, write it to disk and return segment info
     *
     * Abstract: each concrete segment writer supplies its own implementation.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo
     */
    abstract public function close();
626 }
627