import
[web.mtrack] / inc / lib / Zend / Search / Lucene / Index / SegmentMerger.php
1 <?php
2 /**
3  * Zend Framework
4  *
5  * LICENSE
6  *
7  * This source file is subject to the new BSD license that is bundled
8  * with this package in the file LICENSE.txt.
9  * It is also available through the world-wide-web at this URL:
10  * http://framework.zend.com/license/new-bsd
11  * If you did not receive a copy of the license and are unable to
12  * obtain it through the world-wide-web, please send an email
13  * to license@zend.com so we can send you a copy immediately.
14  *
15  * @category   Zend
16  * @package    Zend_Search_Lucene
17  * @subpackage Index
18  * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
20  * @version    $Id: SegmentMerger.php 16541 2009-07-07 06:59:03Z bkarwin $
21  */
22
23 /** Zend_Search_Lucene_Index_SegmentInfo */
24 require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
25
26 /** Zend_Search_Lucene_Index_SegmentWriter_StreamWriter */
27 require_once 'Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php';
28
29 /** Zend_Search_Lucene_Index_TermsPriorityQueue */
30 require_once 'Zend/Search/Lucene/Index/TermsPriorityQueue.php';
31
/**
 * Merges a set of existing index segments into one newly written segment.
 *
 * Source segments are registered with addSource(); merge() then copies their
 * field infos, norms, stored fields and term dictionaries into the target
 * segment writer. Norms and stored fields of documents flagged as deleted
 * are not copied.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_SegmentMerger
{
    /**
     * Target segment writer
     *
     * @var Zend_Search_Lucene_Index_SegmentWriter_StreamWriter
     */
    private $_writer;

    /**
     * Number of docs in a new segment.
     *
     * Recomputed by _mergeStoredFields(); not read anywhere else in this class.
     *
     * @var integer
     */
    private $_docCount;

    /**
     * A set of segments to be merged, indexed by segment name
     *
     * @var array Zend_Search_Lucene_Index_SegmentInfo
     */
    private $_segmentInfos = array();

    /**
     * Flag to signal, that merge is already done
     *
     * @var boolean
     */
    private $_mergeDone = false;

    /**
     * Field map
     * [<segment_name>][<field_number>] => <target_field_number>
     *
     * Filled by _mergeFields(); not read anywhere else in this class.
     *
     * @var array
     */
    private $_fieldsMap = array();


    /**
     * Object constructor.
     *
     * Creates new segment merger with $directory as target to merge segments into
     * and $name as a name of new segment
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct($directory, $name)
    {
        $this->_writer = new Zend_Search_Lucene_Index_SegmentWriter_StreamWriter($directory, $name);
    }


    /**
     * Add segment to a collection of segments to be merged.
     *
     * Segments are keyed by their name, so adding a segment with a name that
     * was already registered replaces the earlier entry.
     *
     * @param Zend_Search_Lucene_Index_SegmentInfo $segmentInfo
     */
    public function addSource(Zend_Search_Lucene_Index_SegmentInfo $segmentInfo)
    {
        $this->_segmentInfos[$segmentInfo->getName()] = $segmentInfo;
    }


    /**
     * Do merge.
     *
     * Merges fields, norms, stored fields and terms of all registered source
     * segments (in that order) and closes the target writer. May only be
     * called once per merger object.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo  whatever the writer's close() returns
     * @throws Zend_Search_Lucene_Exception  if merge was already performed or no
     *                                       source segment has been added
     */
    public function merge()
    {
        if ($this->_mergeDone) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Merge is already done.');
        }

        $segmentsCount = count($this->_segmentInfos);
        if ($segmentsCount < 1) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong number of segments to be merged ('
                                                 . $segmentsCount
                                                 . ').');
        }

        $this->_mergeFields();
        $this->_mergeNorms();
        $this->_mergeStoredFields();
        $this->_mergeTerms();

        $this->_mergeDone = true;

        return $this->_writer->close();
    }


    /**
     * Merge fields information.
     *
     * Registers every field of every source segment with the writer and
     * remembers the field number the writer reports for it.
     */
    private function _mergeFields()
    {
        foreach ($this->_segmentInfos as $segName => $segmentInfo) {
            foreach ($segmentInfo->getFieldInfos() as $fieldInfo) {
                $this->_fieldsMap[$segName][$fieldInfo->number] = $this->_writer->addFieldInfo($fieldInfo);
            }
        }
    }

    /**
     * Merge field's normalization factors.
     *
     * For each indexed field known to the writer, appends the norm vector of
     * every source segment. For segments with deletions the vector is rebuilt
     * without the entries of deleted documents.
     */
    private function _mergeNorms()
    {
        foreach ($this->_writer->getFieldInfos() as $fieldInfo) {
            if (!$fieldInfo->isIndexed) {
                continue;
            }

            foreach ($this->_segmentInfos as $segmentInfo) {
                if ($segmentInfo->hasDeletions()) {
                    $srcNorm = $segmentInfo->normVector($fieldInfo->name);
                    $norm    = '';
                    $docs    = $segmentInfo->count();
                    for ($count = 0; $count < $docs; $count++) {
                        if (!$segmentInfo->isDeleted($count)) {
                            // One norm entry per document: keep only live documents
                            $norm .= $srcNorm[$count];
                        }
                    }
                    $this->_writer->addNorm($fieldInfo->name, $norm);
                } else {
                    $this->_writer->addNorm($fieldInfo->name, $segmentInfo->normVector($fieldInfo->name));
                }
            }
        }
    }

    /**
     * Merge stored fields.
     *
     * Reads the stored fields of every document from each segment's '.fdt'
     * stream and hands them to the writer, skipping deleted documents.
     * Also recalculates the number of documents in the new segment.
     */
    private function _mergeStoredFields()
    {
        $this->_docCount = 0;

        foreach ($this->_segmentInfos as $segmentInfo) {
            $fdtFile = $segmentInfo->openCompoundFile('.fdt');

            // Document count is loop-invariant; fetch it once (as _mergeNorms() does)
            $docs = $segmentInfo->count();

            for ($count = 0; $count < $docs; $count++) {
                $fieldCount   = $fdtFile->readVInt();
                $storedFields = array();

                // Fields of deleted documents still have to be read
                // to advance the stream to the next document
                for ($count2 = 0; $count2 < $fieldCount; $count2++) {
                    $fieldNum  = $fdtFile->readVInt();
                    $bits      = $fdtFile->readByte();
                    $fieldInfo = $segmentInfo->getField($fieldNum);

                    // Bit 2 marks binary data; bit 1 is passed through to the field
                    // (presumably the "tokenized" flag - confirm against Zend_Search_Lucene_Field)
                    if (!($bits & 2)) { // Text data
                        $storedFields[] =
                                 new Zend_Search_Lucene_Field($fieldInfo->name,
                                                              $fdtFile->readString(),
                                                              'UTF-8',
                                                              true,
                                                              $fieldInfo->isIndexed,
                                                              $bits & 1 );
                    } else {            // Binary data
                        $storedFields[] =
                                 new Zend_Search_Lucene_Field($fieldInfo->name,
                                                              $fdtFile->readBinary(),
                                                              '',
                                                              true,
                                                              $fieldInfo->isIndexed,
                                                              $bits & 1,
                                                              true);
                    }
                }

                if (!$segmentInfo->isDeleted($count)) {
                    $this->_docCount++;
                    $this->_writer->addStoredFields($storedFields);
                }
            }
        }
    }


    /**
     * Merge term dictionaries.
     *
     * Walks the terms streams of all source segments in parallel through a
     * priority queue, collects the positions of equal terms from different
     * segments and writes each term once.
     */
    private function _mergeTerms()
    {
        $segmentInfoQueue = new Zend_Search_Lucene_Index_TermsPriorityQueue();

        // Each segment is given the running start id returned by the previous one
        $segmentStartId = 0;
        foreach ($this->_segmentInfos as $segmentInfo) {
            $segmentStartId = $segmentInfo->resetTermsStream($segmentStartId, Zend_Search_Lucene_Index_SegmentInfo::SM_MERGE_INFO);

            // Skip "empty" segments
            if ($segmentInfo->currentTerm() !== null) {
                $segmentInfoQueue->put($segmentInfo);
            }
        }

        $this->_writer->initializeDictionaryFiles();

        $termDocs = array();
        while (($segmentInfo = $segmentInfoQueue->pop()) !== null) {
            // Merge positions array (array union: already collected keys are kept)
            $termDocs += $segmentInfo->currentTermPositions();

            $nextSegmentInfo = $segmentInfoQueue->top();

            // Keys are compared strictly: a loose comparison would treat distinct
            // numeric-looking strings (e.g. "1000" and "1e3") as the same term
            if ($nextSegmentInfo === null ||
                $nextSegmentInfo->currentTerm()->key() !== $segmentInfo->currentTerm()->key()) {
                // We got new term
                ksort($termDocs, SORT_NUMERIC);

                // Add term if it's contained in any document
                if (count($termDocs) > 0) {
                    $this->_writer->addTerm($segmentInfo->currentTerm(), $termDocs);
                }
                $termDocs = array();
            }

            $segmentInfo->nextTerm();
            // check, if segment dictionary is finished
            if ($segmentInfo->currentTerm() !== null) {
                // Put segment back into the priority queue
                $segmentInfoQueue->put($segmentInfo);
            }
        }

        $this->_writer->closeDictionaryFiles();
    }
}