final move of files
[web.mtrack] / Zend / Search / Lucene / Index / DictionaryLoader.php
diff --git a/Zend/Search/Lucene/Index/DictionaryLoader.php b/Zend/Search/Lucene/Index/DictionaryLoader.php
new file mode 100644 (file)
index 0000000..bc1a41c
--- /dev/null
@@ -0,0 +1,268 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to the new BSD license that is bundled
+ * with this package in the file LICENSE.txt.
+ * It is also available through the world-wide-web at this URL:
+ * http://framework.zend.com/license/new-bsd
+ * If you did not receive a copy of the license and are unable to
+ * obtain it through the world-wide-web, please send an email
+ * to license@zend.com so we can send you a copy immediately.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ * @version    $Id: DictionaryLoader.php 16971 2009-07-22 18:05:45Z mikaelkael $
+ */
+
+/**
+ * Dictionary loader
+ *
+ * It's a dummy class which is created to encapsulate non-good structured code.
+ * Manual "method inlining" is performed to increase dictionary index loading operation
+ * which is major bottelneck for search performance.
+ *
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://framework.zend.com/license/new-bsd     New BSD License
+ */
+class Zend_Search_Lucene_Index_DictionaryLoader
+{
+    /**
+     * Dictionary index loader.
+     *
+     * It takes a string which is actually <segment_name>.tii index file data and
+     * returns two arrays - term and tremInfo lists.
+     *
+     * See Zend_Search_Lucene_Index_SegmintInfo class for details
+     *
+     * @param string $data
+     * @return array
+     * @throws Zend_Search_Lucene_Exception
+     */
+    public static function load($data)
+    {
+        $termDictionary = array();
+        $termInfos      = array();
+        $pos = 0;
+
+        // $tiVersion = $tiiFile->readInt();
+        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8  | ord($data[3]);
+        $pos += 4;
+        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
+            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
+                require_once 'Zend/Search/Lucene/Exception.php';
+                throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
+        }
+
+        // $indexTermCount = $tiiFile->readLong();
+        if (PHP_INT_SIZE > 4) {
+            $indexTermCount = ord($data[$pos]) << 56  |
+                              ord($data[$pos+1]) << 48  |
+                              ord($data[$pos+2]) << 40  |
+                              ord($data[$pos+3]) << 32  |
+                              ord($data[$pos+4]) << 24  |
+                              ord($data[$pos+5]) << 16  |
+                              ord($data[$pos+6]) << 8   |
+                              ord($data[$pos+7]);
+        } else {
+            if ((ord($data[$pos])            != 0) ||
+                (ord($data[$pos+1])          != 0) ||
+                (ord($data[$pos+2])          != 0) ||
+                (ord($data[$pos+3])          != 0) ||
+                ((ord($data[$pos+4]) & 0x80) != 0)) {
+                    require_once 'Zend/Search/Lucene/Exception.php';
+                    throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
+                 }
+
+            $indexTermCount = ord($data[$pos+4]) << 24  |
+                              ord($data[$pos+5]) << 16  |
+                              ord($data[$pos+6]) << 8   |
+                              ord($data[$pos+7]);
+        }
+        $pos += 8;
+
+        //                  $tiiFile->readInt();  // IndexInterval
+        $pos += 4;
+
+        // $skipInterval   = $tiiFile->readInt();
+        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8  | ord($data[$pos+3]);
+        $pos += 4;
+        if ($indexTermCount < 1) {
+            require_once 'Zend/Search/Lucene/Exception.php';
+            throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
+        }
+
+        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
+            /* Skip MaxSkipLevels value */
+            $pos += 4;
+        }
+
+        $prevTerm     = '';
+        $freqPointer  =  0;
+        $proxPointer  =  0;
+        $indexPointer =  0;
+        for ($count = 0; $count < $indexTermCount; $count++) {
+            //$termPrefixLength = $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $termPrefixLength = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $termPrefixLength |= ($nbyte & 0x7F) << $shift;
+            }
+
+            // $termSuffix       = $tiiFile->readString();
+            $nbyte = ord($data[$pos++]);
+            $len = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $len |= ($nbyte & 0x7F) << $shift;
+            }
+            if ($len == 0) {
+                $termSuffix = '';
+            } else {
+                $termSuffix = substr($data, $pos, $len);
+                $pos += $len;
+                for ($count1 = 0; $count1 < $len; $count1++ ) {
+                    if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
+                        $addBytes = 1;
+                        if (ord($termSuffix[$count1]) & 0x20 ) {
+                            $addBytes++;
+
+                            // Never used for Java Lucene created index.
+                            // Java2 doesn't encode strings in four bytes
+                            if (ord($termSuffix[$count1]) & 0x10 ) {
+                                $addBytes++;
+                            }
+                        }
+                        $termSuffix .= substr($data, $pos, $addBytes);
+                        $pos += $addBytes;
+                        $len += $addBytes;
+
+                        // Check for null character. Java2 encodes null character
+                        // in two bytes.
+                        if (ord($termSuffix[$count1]) == 0xC0 &&
+                            ord($termSuffix[$count1+1]) == 0x80   ) {
+                            $termSuffix[$count1] = 0;
+                            $termSuffix = substr($termSuffix,0,$count1+1)
+                                        . substr($termSuffix,$count1+2);
+                        }
+                        $count1 += $addBytes;
+                    }
+                }
+            }
+
+            // $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
+            $pb = 0; $pc = 0;
+            while ($pb < strlen($prevTerm)  &&  $pc < $termPrefixLength) {
+                $charBytes = 1;
+                if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
+                    $charBytes++;
+                    if (ord($prevTerm[$pb]) & 0x20 ) {
+                        $charBytes++;
+                        if (ord($prevTerm[$pb]) & 0x10 ) {
+                            $charBytes++;
+                        }
+                    }
+                }
+
+                if ($pb + $charBytes > strlen($data)) {
+                    // wrong character
+                    break;
+                }
+
+                $pc++;
+                $pb += $charBytes;
+            }
+            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;
+
+            // $termFieldNum     = $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $termFieldNum = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $termFieldNum |= ($nbyte & 0x7F) << $shift;
+            }
+
+            // $docFreq          = $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $docFreq = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $docFreq |= ($nbyte & 0x7F) << $shift;
+            }
+
+            // $freqPointer     += $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $vint = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $vint |= ($nbyte & 0x7F) << $shift;
+            }
+            $freqPointer += $vint;
+
+            // $proxPointer     += $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $vint = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $vint |= ($nbyte & 0x7F) << $shift;
+            }
+            $proxPointer += $vint;
+
+            if( $docFreq >= $skipInterval ) {
+                // $skipDelta = $tiiFile->readVInt();
+                $nbyte = ord($data[$pos++]);
+                $vint = $nbyte & 0x7F;
+                for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                    $nbyte = ord($data[$pos++]);
+                    $vint |= ($nbyte & 0x7F) << $shift;
+                }
+                $skipDelta = $vint;
+            } else {
+                $skipDelta = 0;
+            }
+
+            // $indexPointer += $tiiFile->readVInt();
+            $nbyte = ord($data[$pos++]);
+            $vint = $nbyte & 0x7F;
+            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
+                $nbyte = ord($data[$pos++]);
+                $vint |= ($nbyte & 0x7F) << $shift;
+            }
+            $indexPointer += $vint;
+
+
+            // $this->_termDictionary[] =  new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
+            $termDictionary[] = array($termFieldNum, $termValue);
+
+            $termInfos[] =
+                 // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
+                 array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
+
+            $prevTerm = $termValue;
+        }
+
+        // Check special index entry mark
+        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
+            require_once 'Zend/Search/Lucene/Exception.php';
+            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
+        }
+
+        if (PHP_INT_SIZE > 4) {
+            // Treat 64-bit 0xFFFFFFFF as -1
+            $termDictionary[0][0] = -1;
+        }
+
+        return array($termDictionary, $termInfos);
+    }
+}
+