git.roojs.org Git - web.mtrack/blob - inc/lib/Zend/Search/Lucene/Search/Similarity.php

   1 <?php
   2 /**
   3  * Zend Framework
   4  *
   5  * LICENSE
   6  *
   7  * This source file is subject to the new BSD license that is bundled
   8  * with this package in the file LICENSE.txt.
   9  * It is also available through the world-wide-web at this URL:
  10  * http://framework.zend.com/license/new-bsd
  11  * If you did not receive a copy of the license and are unable to
  12  * obtain it through the world-wide-web, please send an email
  13  * to license@zend.com so we can send you a copy immediately.
  14  *
  15  * @category   Zend
  16  * @package    Zend_Search_Lucene
  17  * @subpackage Search
  18  * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  20  * @version    $Id: Similarity.php 16541 2009-07-07 06:59:03Z bkarwin $
  21  */
  22
  23
  24 /** Zend_Search_Lucene_Search_Similarity_Default */
  25 require_once 'Zend/Search/Lucene/Search/Similarity/Default.php';
  26
  27
  28 /**
  29  * @category   Zend
  30  * @package    Zend_Search_Lucene
  31  * @subpackage Search
  32  * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
  33  * @license    http://framework.zend.com/license/new-bsd     New BSD License
  34  */
  35 abstract class Zend_Search_Lucene_Search_Similarity
  36 {
  37     /**
  38      * The Similarity implementation used by default.
  39      *
  40      * @var Zend_Search_Lucene_Search_Similarity
  41      */
  42     private static $_defaultImpl;
  43
  44     /**
  45      * Cache of decoded bytes.
  46      * Array of floats
  47      *
  48      * @var array
  49      */
  50     private static $_normTable = array( 0   => 0.0,
  51                                         1   => 5.820766E-10,
  52                                         2   => 6.9849193E-10,
  53                                         3   => 8.1490725E-10,
  54                                         4   => 9.313226E-10,
  55                                         5   => 1.1641532E-9,
  56                                         6   => 1.3969839E-9,
  57                                         7   => 1.6298145E-9,
  58                                         8   => 1.8626451E-9,
  59                                         9   => 2.3283064E-9,
  60                                         10  => 2.7939677E-9,
  61                                         11  => 3.259629E-9,
  62                                         12  => 3.7252903E-9,
  63                                         13  => 4.656613E-9,
  64                                         14  => 5.5879354E-9,
  65                                         15  => 6.519258E-9,
  66                                         16  => 7.4505806E-9,
  67                                         17  => 9.313226E-9,
  68                                         18  => 1.1175871E-8,
  69                                         19  => 1.3038516E-8,
  70                                         20  => 1.4901161E-8,
  71                                         21  => 1.8626451E-8,
  72                                         22  => 2.2351742E-8,
  73                                         23  => 2.6077032E-8,
  74                                         24  => 2.9802322E-8,
  75                                         25  => 3.7252903E-8,
  76                                         26  => 4.4703484E-8,
  77                                         27  => 5.2154064E-8,
  78                                         28  => 5.9604645E-8,
  79                                         29  => 7.4505806E-8,
  80                                         30  => 8.940697E-8,
  81                                         31  => 1.0430813E-7,
  82                                         32  => 1.1920929E-7,
  83                                         33  => 1.4901161E-7,
  84                                         34  => 1.7881393E-7,
  85                                         35  => 2.0861626E-7,
  86                                         36  => 2.3841858E-7,
  87                                         37  => 2.9802322E-7,
  88                                         38  => 3.5762787E-7,
  89                                         39  => 4.172325E-7,
  90                                         40  => 4.7683716E-7,
  91                                         41  => 5.9604645E-7,
  92                                         42  => 7.1525574E-7,
  93                                         43  => 8.34465E-7,
  94                                         44  => 9.536743E-7,
  95                                         45  => 1.1920929E-6,
  96                                         46  => 1.4305115E-6,
  97                                         47  => 1.66893E-6,
  98                                         48  => 1.9073486E-6,
  99                                         49  => 2.3841858E-6,
 100                                         50  => 2.861023E-6,
 101                                         51  => 3.33786E-6,
 102                                         52  => 3.8146973E-6,
 103                                         53  => 4.7683716E-6,
 104                                         54  => 5.722046E-6,
 105                                         55  => 6.67572E-6,
 106                                         56  => 7.6293945E-6,
 107                                         57  => 9.536743E-6,
 108                                         58  => 1.1444092E-5,
 109                                         59  => 1.335144E-5,
 110                                         60  => 1.5258789E-5,
 111                                         61  => 1.9073486E-5,
 112                                         62  => 2.2888184E-5,
 113                                         63  => 2.670288E-5,
 114                                         64  => 3.0517578E-5,
 115                                         65  => 3.8146973E-5,
 116                                         66  => 4.5776367E-5,
 117                                         67  => 5.340576E-5,
 118                                         68  => 6.1035156E-5,
 119                                         69  => 7.6293945E-5,
 120                                         70  => 9.1552734E-5,
 121                                         71  => 1.0681152E-4,
 122                                         72  => 1.2207031E-4,
 123                                         73  => 1.5258789E-4,
 124                                         74  => 1.8310547E-4,
 125                                         75  => 2.1362305E-4,
 126                                         76  => 2.4414062E-4,
 127                                         77  => 3.0517578E-4,
 128                                         78  => 3.6621094E-4,
 129                                         79  => 4.272461E-4,
 130                                         80  => 4.8828125E-4,
 131                                         81  => 6.1035156E-4,
 132                                         82  => 7.324219E-4,
 133                                         83  => 8.544922E-4,
 134                                         84  => 9.765625E-4,
 135                                         85  => 0.0012207031,
 136                                         86  => 0.0014648438,
 137                                         87  => 0.0017089844,
 138                                         88  => 0.001953125,
 139                                         89  => 0.0024414062,
 140                                         90  => 0.0029296875,
 141                                         91  => 0.0034179688,
 142                                         92  => 0.00390625,
 143                                         93  => 0.0048828125,
 144                                         94  => 0.005859375,
 145                                         95  => 0.0068359375,
 146                                         96  => 0.0078125,
 147                                         97  => 0.009765625,
 148                                         98  => 0.01171875,
 149                                         99  => 0.013671875,
 150                                         100 => 0.015625,
 151                                         101 => 0.01953125,
 152                                         102 => 0.0234375,
 153                                         103 => 0.02734375,
 154                                         104 => 0.03125,
 155                                         105 => 0.0390625,
 156                                         106 => 0.046875,
 157                                         107 => 0.0546875,
 158                                         108 => 0.0625,
 159                                         109 => 0.078125,
 160                                         110 => 0.09375,
 161                                         111 => 0.109375,
 162                                         112 => 0.125,
 163                                         113 => 0.15625,
 164                                         114 => 0.1875,
 165                                         115 => 0.21875,
 166                                         116 => 0.25,
 167                                         117 => 0.3125,
 168                                         118 => 0.375,
 169                                         119 => 0.4375,
 170                                         120 => 0.5,
 171                                         121 => 0.625,
 172                                         122 => 0.75,
 173                                         123 => 0.875,
 174                                         124 => 1.0,
 175                                         125 => 1.25,
 176                                         126 => 1.5,
 177                                         127 => 1.75,
 178                                         128 => 2.0,
 179                                         129 => 2.5,
 180                                         130 => 3.0,
 181                                         131 => 3.5,
 182                                         132 => 4.0,
 183                                         133 => 5.0,
 184                                         134 => 6.0,
 185                                         135 => 7.0,
 186                                         136 => 8.0,
 187                                         137 => 10.0,
 188                                         138 => 12.0,
 189                                         139 => 14.0,
 190                                         140 => 16.0,
 191                                         141 => 20.0,
 192                                         142 => 24.0,
 193                                         143 => 28.0,
 194                                         144 => 32.0,
 195                                         145 => 40.0,
 196                                         146 => 48.0,
 197                                         147 => 56.0,
 198                                         148 => 64.0,
 199                                         149 => 80.0,
 200                                         150 => 96.0,
 201                                         151 => 112.0,
 202                                         152 => 128.0,
 203                                         153 => 160.0,
 204                                         154 => 192.0,
 205                                         155 => 224.0,
 206                                         156 => 256.0,
 207                                         157 => 320.0,
 208                                         158 => 384.0,
 209                                         159 => 448.0,
 210                                         160 => 512.0,
 211                                         161 => 640.0,
 212                                         162 => 768.0,
 213                                         163 => 896.0,
 214                                         164 => 1024.0,
 215                                         165 => 1280.0,
 216                                         166 => 1536.0,
 217                                         167 => 1792.0,
 218                                         168 => 2048.0,
 219                                         169 => 2560.0,
 220                                         170 => 3072.0,
 221                                         171 => 3584.0,
 222                                         172 => 4096.0,
 223                                         173 => 5120.0,
 224                                         174 => 6144.0,
 225                                         175 => 7168.0,
 226                                         176 => 8192.0,
 227                                         177 => 10240.0,
 228                                         178 => 12288.0,
 229                                         179 => 14336.0,
 230                                         180 => 16384.0,
 231                                         181 => 20480.0,
 232                                         182 => 24576.0,
 233                                         183 => 28672.0,
 234                                         184 => 32768.0,
 235                                         185 => 40960.0,
 236                                         186 => 49152.0,
 237                                         187 => 57344.0,
 238                                         188 => 65536.0,
 239                                         189 => 81920.0,
 240                                         190 => 98304.0,
 241                                         191 => 114688.0,
 242                                         192 => 131072.0,
 243                                         193 => 163840.0,
 244                                         194 => 196608.0,
 245                                         195 => 229376.0,
 246                                         196 => 262144.0,
 247                                         197 => 327680.0,
 248                                         198 => 393216.0,
 249                                         199 => 458752.0,
 250                                         200 => 524288.0,
 251                                         201 => 655360.0,
 252                                         202 => 786432.0,
 253                                         203 => 917504.0,
 254                                         204 => 1048576.0,
 255                                         205 => 1310720.0,
 256                                         206 => 1572864.0,
 257                                         207 => 1835008.0,
 258                                         208 => 2097152.0,
 259                                         209 => 2621440.0,
 260                                         210 => 3145728.0,
 261                                         211 => 3670016.0,
 262                                         212 => 4194304.0,
 263                                         213 => 5242880.0,
 264                                         214 => 6291456.0,
 265                                         215 => 7340032.0,
 266                                         216 => 8388608.0,
 267                                         217 => 1.048576E7,
 268                                         218 => 1.2582912E7,
 269                                         219 => 1.4680064E7,
 270                                         220 => 1.6777216E7,
 271                                         221 => 2.097152E7,
 272                                         222 => 2.5165824E7,
 273                                         223 => 2.9360128E7,
 274                                         224 => 3.3554432E7,
 275                                         225 => 4.194304E7,
 276                                         226 => 5.0331648E7,
 277                                         227 => 5.8720256E7,
 278                                         228 => 6.7108864E7,
 279                                         229 => 8.388608E7,
 280                                         230 => 1.00663296E8,
 281                                         231 => 1.17440512E8,
 282                                         232 => 1.34217728E8,
 283                                         233 => 1.6777216E8,
 284                                         234 => 2.01326592E8,
 285                                         235 => 2.34881024E8,
 286                                         236 => 2.68435456E8,
 287                                         237 => 3.3554432E8,
 288                                         238 => 4.02653184E8,
 289                                         239 => 4.69762048E8,
 290                                         240 => 5.3687091E8,
 291                                         241 => 6.7108864E8,
 292                                         242 => 8.0530637E8,
 293                                         243 => 9.395241E8,
 294                                         244 => 1.07374182E9,
 295                                         245 => 1.34217728E9,
 296                                         246 => 1.61061274E9,
 297                                         247 => 1.87904819E9,
 298                                         248 => 2.14748365E9,
 299                                         249 => 2.68435456E9,
 300                                         250 => 3.22122547E9,
 301                                         251 => 3.75809638E9,
 302                                         252 => 4.2949673E9,
 303                                         253 => 5.3687091E9,
 304                                         254 => 6.4424509E9,
 305                                         255 => 7.5161928E9 );
 306
 307
 308     /**
 309      * Set the default Similarity implementation used by indexing and search
 310      * code.
 311      *
 312      * @param Zend_Search_Lucene_Search_Similarity $similarity
 313      */
 314     public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
 315     {
 316         self::$_defaultImpl = $similarity;
 317     }
 318
 319
 320     /**
 321      * Return the default Similarity implementation used by indexing and search
 322      * code.
 323      *
 324      * @return Zend_Search_Lucene_Search_Similarity
 325      */
 326     public static function getDefault()
 327     {
 328         if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
 329             self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();
 330         }
 331
 332         return self::$_defaultImpl;
 333     }
 334
 335
 336     /**
 337      * Computes the normalization value for a field given the total number of
 338      * terms contained in a field.  These values, together with field boosts, are
 339      * stored in an index and multipled into scores for hits on each field by the
 340      * search code.
 341      *
 342      * Matches in longer fields are less precise, so implemenations of this
 343      * method usually return smaller values when 'numTokens' is large,
 344      * and larger values when 'numTokens' is small.
 345      *
 346      * That these values are computed under
 347      * IndexWriter::addDocument(Document) and stored then using
 348      * encodeNorm(float).  Thus they have limited precision, and documents
 349      * must be re-indexed if this method is altered.
 350      *
 351      * fieldName - name of field
 352      * numTokens - the total number of tokens contained in fields named
 353      *             'fieldName' of 'doc'.
 354      * Returns a normalization factor for hits on this field of this document
 355      *
 356      * @param string $fieldName
 357      * @param integer $numTokens
 358      * @return float
 359      */
 360     abstract public function lengthNorm($fieldName, $numTokens);
 361
 362     /**
 363      * Computes the normalization value for a query given the sum of the squared
 364      * weights of each of the query terms.  This value is then multipled into the
 365      * weight of each query term.
 366      *
 367      * This does not affect ranking, but rather just attempts to make scores
 368      * from different queries comparable.
 369      *
 370      * sumOfSquaredWeights - the sum of the squares of query term weights
 371      * Returns a normalization factor for query weights
 372      *
 373      * @param float $sumOfSquaredWeights
 374      * @return float
 375      */
 376     abstract public function queryNorm($sumOfSquaredWeights);
 377
 378
 379     /**
 380      *  Decodes a normalization factor stored in an index.
 381      *
 382      * @param integer $byte
 383      * @return float
 384      */
 385     public static function decodeNorm($byte)
 386     {
 387         return self::$_normTable[$byte & 0xFF];
 388     }
 389
 390
 391     /**
 392      * Encodes a normalization factor for storage in an index.
 393      *
 394      * The encoding uses a five-bit exponent and three-bit mantissa, thus
 395      * representing values from around 7x10^9 to 2x10^-9 with about one
 396      * significant decimal digit of accuracy.  Zero is also represented.
 397      * Negative numbers are rounded up to zero.  Values too large to represent
 398      * are rounded down to the largest representable value.  Positive values too
 399      * small to represent are rounded up to the smallest positive representable
 400      * value.
 401      *
 402      * @param float $f
 403      * @return integer
 404      */
 405     static function encodeNorm($f)
 406     {
 407       return self::_floatToByte($f);
 408     }
 409
 410     /**
 411      * Float to byte conversion
 412      *
 413      * @param integer $b
 414      * @return float
 415      */
 416     private static function _floatToByte($f)
 417     {
 418         // round negatives up to zero
 419         if ($f <= 0.0) {
 420             return 0;
 421         }
 422
 423         // search for appropriate value
 424         $lowIndex = 0;
 425         $highIndex = 255;
 426         while ($highIndex >= $lowIndex) {
 427             // $mid = ($highIndex - $lowIndex)/2;
 428             $mid = ($highIndex + $lowIndex) >> 1;
 429             $delta = $f - self::$_normTable[$mid];
 430
 431             if ($delta < 0) {
 432                 $highIndex = $mid-1;
 433             } elseif ($delta > 0) {
 434                 $lowIndex  = $mid+1;
 435             } else {
 436                 return $mid; // We got it!
 437             }
 438         }
 439
 440         // round to closest value
 441         if ($highIndex != 255 &&
 442             $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex+1] - $f ) {
 443             return $highIndex + 1;
 444         } else {
 445             return $highIndex;
 446         }
 447     }
 448
 449
 450     /**
 451      * Computes a score factor based on a term or phrase's frequency in a
 452      * document.  This value is multiplied by the idf(Term, Searcher)
 453      * factor for each term in the query and these products are then summed to
 454      * form the initial score for a document.
 455      *
 456      * Terms and phrases repeated in a document indicate the topic of the
 457      * document, so implementations of this method usually return larger values
 458      * when 'freq' is large, and smaller values when 'freq'
 459      * is small.
 460      *
 461      * freq - the frequency of a term within a document
 462      * Returns a score factor based on a term's within-document frequency
 463      *
 464      * @param float $freq
 465      * @return float
 466      */
 467     abstract public function tf($freq);
 468
 469     /**
 470      * Computes the amount of a sloppy phrase match, based on an edit distance.
 471      * This value is summed for each sloppy phrase match in a document to form
 472      * the frequency that is passed to tf(float).
 473      *
 474      * A phrase match with a small edit distance to a document passage more
 475      * closely matches the document, so implementations of this method usually
 476      * return larger values when the edit distance is small and smaller values
 477      * when it is large.
 478      *
 479      * distance - the edit distance of this sloppy phrase match
 480      * Returns the frequency increment for this match
 481      *
 482      * @param integer $distance
 483      * @return float
 484      */
 485     abstract public function sloppyFreq($distance);
 486
 487
 488     /**
 489      * Computes a score factor for a simple term or a phrase.
 490      *
 491      * The default implementation is:
 492      *   return idfFreq(searcher.docFreq(term), searcher.maxDoc());
 493      *
 494      * input - the term in question or array of terms
 495      * reader - reader the document collection being searched
 496      * Returns a score factor for the term
 497      *
 498      * @param mixed $input
 499      * @param Zend_Search_Lucene_Interface $reader
 500      * @return a score factor for the term
 501      */
 502     public function idf($input, Zend_Search_Lucene_Interface $reader)
 503     {
 504         if (!is_array($input)) {
 505             return $this->idfFreq($reader->docFreq($input), $reader->count());
 506         } else {
 507             $idf = 0.0;
 508             foreach ($input as $term) {
 509                 $idf += $this->idfFreq($reader->docFreq($term), $reader->count());
 510             }
 511             return $idf;
 512         }
 513     }
 514
 515     /**
 516      * Computes a score factor based on a term's document frequency (the number
 517      * of documents which contain the term).  This value is multiplied by the
 518      * tf(int) factor for each term in the query and these products are
 519      * then summed to form the initial score for a document.
 520      *
 521      * Terms that occur in fewer documents are better indicators of topic, so
 522      * implemenations of this method usually return larger values for rare terms,
 523      * and smaller values for common terms.
 524      *
 525      * docFreq - the number of documents which contain the term
 526      * numDocs - the total number of documents in the collection
 527      * Returns a score factor based on the term's document frequency
 528      *
 529      * @param integer $docFreq
 530      * @param integer $numDocs
 531      * @return float
 532      */
 533     abstract public function idfFreq($docFreq, $numDocs);
 534
 535     /**
 536      * Computes a score factor based on the fraction of all query terms that a
 537      * document contains.  This value is multiplied into scores.
 538      *
 539      * The presence of a large portion of the query terms indicates a better
 540      * match with the query, so implemenations of this method usually return
 541      * larger values when the ratio between these parameters is large and smaller
 542      * values when the ratio between them is small.
 543      *
 544      * overlap - the number of query terms matched in the document
 545      * maxOverlap - the total number of terms in the query
 546      * Returns a score factor based on term overlap with the query
 547      *
 548      * @param integer $overlap
 549      * @param integer $maxOverlap
 550      * @return float
 551      */
 552     abstract public function coord($overlap, $maxOverlap);
 553 }
 554