/**
 * Parses the raw contents of a .tii index file and returns two parallel
 * arrays - the term list and the termInfo list.
 *
 * See Zend_Search_Lucene_Index_SegmentInfo class for details
 *
 * Shape of the returned value:
 *   [0] => list of array($termFieldNum, $termValue)
 *   [1] => list of array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer)
 * Entry N of the first list corresponds to entry N of the second one.
 *
 * The stream-reader calls are kept inlined on purpose: every section is
 * preceded by the commented-out call it replaces.
 * NOTE(review): the inlining is presumably a speed optimisation - confirm
 * before refactoring the repeated VInt reads into a helper.
 *
 * @param string $data  binary contents of the .tii file
 * @return array
 * @throws Zend_Search_Lucene_Exception  on an unknown format marker, a term
 *         count below 1, a term count that does not fit a 32-bit integer,
 *         or a missing special first index entry
 */
public static function load($data)
{
    $termDictionary = array();
    $termInfos      = array();
    $pos            = 0;

    // $tiVersion = $tiiFile->readInt();
    $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8 | ord($data[3]);
    $pos += 4;
    if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
        $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
    }

    // $indexTermCount = $tiiFile->readLong();
    if (PHP_INT_SIZE > 4) {
        $indexTermCount = ord($data[$pos])   << 56 |
                          ord($data[$pos+1]) << 48 |
                          ord($data[$pos+2]) << 40 |
                          ord($data[$pos+3]) << 32 |
                          ord($data[$pos+4]) << 24 |
                          ord($data[$pos+5]) << 16 |
                          ord($data[$pos+6]) << 8  |
                          ord($data[$pos+7]);
    } else {
        // 32-bit integers: the upper four bytes and the sign bit of the
        // lower four must all be clear, otherwise the value is not representable.
        if ((ord($data[$pos])            != 0) ||
            (ord($data[$pos+1])          != 0) ||
            (ord($data[$pos+2])          != 0) ||
            (ord($data[$pos+3])          != 0) ||
            ((ord($data[$pos+4]) & 0x80) != 0)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
        }

        $indexTermCount = ord($data[$pos+4]) << 24 |
                          ord($data[$pos+5]) << 16 |
                          ord($data[$pos+6]) << 8  |
                          ord($data[$pos+7]);
    }
    $pos += 8;

    // $tiiFile->readInt(); // IndexInterval
    $pos += 4;

    // $skipInterval = $tiiFile->readInt();
    $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8 | ord($data[$pos+3]);
    $pos += 4;
    if ($indexTermCount < 1) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
    }

    if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
        /* Skip MaxSkipLevels value */
        $pos += 4;
    }

    // Terms are prefix-compressed against the previous term and the three
    // pointers are stored as deltas, so running values are carried over
    // from one entry to the next.
    $prevTerm     = '';
    $freqPointer  = 0;
    $proxPointer  = 0;
    $indexPointer = 0;
    for ($count = 0; $count < $indexTermCount; $count++) {
        // $termPrefixLength = $tiiFile->readVInt();
        // VInt: 7 payload bits per byte, low-order group first, high bit
        // set on every byte except the last one.
        $nbyte = ord($data[$pos++]);
        $termPrefixLength = $nbyte & 0x7F;
        for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
            $nbyte = ord($data[$pos++]);
            $termPrefixLength |= ($nbyte & 0x7F) << $shift;
        }

        // $termSuffix = $tiiFile->readString();
        // The stored length counts characters, not bytes.
        $nbyte = ord($data[$pos++]);
        $len = $nbyte & 0x7F;
        for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
            $nbyte = ord($data[$pos++]);
            $len |= ($nbyte & 0x7F) << $shift;
        }
        if ($len == 0) {
            $termSuffix = '';
        } else {
            // Start with one byte per character, then pull in the extra
            // bytes of every multi-byte character found while scanning.
            $termSuffix = substr($data, $pos, $len);
            $pos += $len;
            for ($count1 = 0; $count1 < $len; $count1++) {
                $leadByte = ord($termSuffix[$count1]);
                if (($leadByte & 0xC0) == 0xC0) {
                    $addBytes = 1;
                    if ($leadByte & 0x20) {
                        $addBytes++;

                        // Never used for Java Lucene created index.
                        // Java2 doesn't encode strings in four bytes
                        if ($leadByte & 0x10) {
                            $addBytes++;
                        }
                    }
                    $termSuffix .= substr($data, $pos, $addBytes);
                    $pos += $addBytes;
                    $len += $addBytes;

                    // Check for null character. Java2 encodes null character
                    // in two bytes.
                    if ($leadByte == 0xC0 &&
                        ord($termSuffix[$count1+1]) == 0x80) {
                        // Store a real NUL byte: assigning the integer 0 to a
                        // string offset would write the digit "0" instead.
                        // NOTE(review): any sibling reader of the same string
                        // format (not visible here) should decode NUL the
                        // same way - confirm.
                        $termSuffix[$count1] = "\0";
                        $termSuffix = substr($termSuffix, 0, $count1+1)
                                    . substr($termSuffix, $count1+2);

                        // The string just lost one byte, so undo the length
                        // bump and do NOT skip ahead: the byte that now
                        // follows is the lead byte of the next character.
                        $len--;
                    } else {
                        // Skip the continuation bytes of this character.
                        $count1 += $addBytes;
                    }
                }
            }
        }

        // $termValue = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
        // $pb - prefix length in bytes, $pc - prefix length in characters
        $prevTermLength = strlen($prevTerm);
        $pb = 0; $pc = 0;
        while ($pb < $prevTermLength && $pc < $termPrefixLength) {
            $charBytes = 1;
            if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
                $charBytes++;
                if (ord($prevTerm[$pb]) & 0x20) {
                    $charBytes++;
                    if (ord($prevTerm[$pb]) & 0x10) {
                        $charBytes++;
                    }
                }
            }

            // The character must fit inside the previous term (the string
            // being walked), not inside the whole file buffer.
            if ($pb + $charBytes > $prevTermLength) {
                // wrong character
                break;
            }

            $pc++;
            $pb += $charBytes;
        }
        $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

        // $termFieldNum = $tiiFile->readVInt();
        $nbyte = ord($data[$pos++]);
        $termFieldNum = $nbyte & 0x7F;
        for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
            $nbyte = ord($data[$pos++]);
            $termFieldNum |= ($nbyte & 0x7F) << $shift;
        }

        // $docFreq = $tiiFile->readVInt();
        $nbyte = ord($data[$pos++]);
        $docFreq = $nbyte & 0x7F;
        for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
            $nbyte = ord($data[$pos++]);
            $docFreq |= ($nbyte & 0x7F) << $shift;
        }

        // $freqPointer += $tiiFile->readVInt();
        $nbyte = ord($data[$pos++]);
        $vint = $nbyte & 0x7F;
        for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
            $nbyte = ord($data[$pos++]);
            $vint |= ($nbyte & 0x7F) << $shift;
        }
        $freqPointer += $vint;

        // $proxPointer += $tiiFile->readVInt();
        $nbyte = ord($data[$pos++]);
        $vint = $nbyte & 0x7F;
        for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
            $nbyte = ord($data[$pos++]);
            $vint |= ($nbyte & 0x7F) << $shift;
        }
        $proxPointer += $vint;

        // The skip delta is only present when the term occurs in at least
        // $skipInterval documents.
        if ($docFreq >= $skipInterval) {
            // $skipDelta = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $skipDelta = $vint;
        } else {
            $skipDelta = 0;
        }

        // $indexPointer += $tiiFile->readVInt();
        $nbyte = ord($data[$pos++]);
        $vint = $nbyte & 0x7F;
        for ($shift = 7; ($nbyte & 0x80) != 0; $shift += 7) {
            $nbyte = ord($data[$pos++]);
            $vint |= ($nbyte & 0x7F) << $shift;
        }
        $indexPointer += $vint;


        // $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
        $termDictionary[] = array($termFieldNum, $termValue);

        $termInfos[] =
             // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
             array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

        $prevTerm = $termValue;
    }

    // Check special index entry mark
    if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
    }

    if (PHP_INT_SIZE > 4) {
        // Treat 64-bit 0xFFFFFFFF as -1
        $termDictionary[0][0] = -1;
    }

    return array($termDictionary, $termInfos);
}