7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
16 * @package Zend_Search_Lucene
18 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
20 * @version $Id: Similarity.php 16541 2009-07-07 06:59:03Z bkarwin $
24 /** Zend_Search_Lucene_Search_Similarity_Default */
25 require_once 'Zend/Search/Lucene/Search/Similarity/Default.php';
30 * @package Zend_Search_Lucene
32 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
33 * @license http://framework.zend.com/license/new-bsd New BSD License
35 abstract class Zend_Search_Lucene_Search_Similarity
38 * The Similarity implementation used by default.
40 * @var Zend_Search_Lucene_Search_Similarity
42 private static $_defaultImpl;
45 * Cache of decoded bytes.
50 private static $_normTable = array( 0 => 0.0,
305 255 => 7.5161928E9 );
309 * Set the default Similarity implementation used by indexing and search
312 * @param Zend_Search_Lucene_Search_Similarity $similarity
314 public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
316 self::$_defaultImpl = $similarity;
321 * Return the default Similarity implementation used by indexing and search
324 * @return Zend_Search_Lucene_Search_Similarity
326 public static function getDefault()
328 if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
329 self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();
332 return self::$_defaultImpl;
337 * Computes the normalization value for a field given the total number of
338 * terms contained in a field. These values, together with field boosts, are
339 * stored in an index and multiplied into scores for hits on each field by the
342 * Matches in longer fields are less precise, so implementations of this
343 * method usually return smaller values when 'numTokens' is large,
344 * and larger values when 'numTokens' is small.
346 * Note that these values are computed under
347 * IndexWriter::addDocument(Document) and then stored using
348 * encodeNorm(float). Thus they have limited precision, and documents
349 * must be re-indexed if this method is altered.
351 * fieldName - name of field
352 * numTokens - the total number of tokens contained in fields named
353 * 'fieldName' of 'doc'.
354 * Returns a normalization factor for hits on this field of this document
356 * @param string $fieldName
357 * @param integer $numTokens
360 abstract public function lengthNorm($fieldName, $numTokens);
363 * Computes the normalization value for a query given the sum of the squared
364 * weights of each of the query terms. This value is then multiplied into the
365 * weight of each query term.
367 * This does not affect ranking, but rather just attempts to make scores
368 * from different queries comparable.
370 * sumOfSquaredWeights - the sum of the squares of query term weights
371 * Returns a normalization factor for query weights
373 * @param float $sumOfSquaredWeights
376 abstract public function queryNorm($sumOfSquaredWeights);
380 * Decodes a normalization factor stored in an index.
382 * @param integer $byte
385 public static function decodeNorm($byte)
387 return self::$_normTable[$byte & 0xFF];
392 * Encodes a normalization factor for storage in an index.
394 * The encoding uses a five-bit exponent and three-bit mantissa, thus
395 * representing values from around 7x10^9 to 2x10^-9 with about one
396 * significant decimal digit of accuracy. Zero is also represented.
397 * Negative numbers are rounded up to zero. Values too large to represent
398 * are rounded down to the largest representable value. Positive values too
399 * small to represent are rounded up to the smallest positive representable
405 static function encodeNorm($f)
407 return self::_floatToByte($f);
411 * Float to byte conversion
416 private static function _floatToByte($f)
418 // round negatives up to zero
423 // search for appropriate value
426 while ($highIndex >= $lowIndex) {
427 // $mid = ($highIndex - $lowIndex)/2;
428 $mid = ($highIndex + $lowIndex) >> 1;
429 $delta = $f - self::$_normTable[$mid];
433 } elseif ($delta > 0) {
436 return $mid; // We got it!
440 // round to closest value
441 if ($highIndex != 255 &&
442 $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex+1] - $f ) {
443 return $highIndex + 1;
451 * Computes a score factor based on a term or phrase's frequency in a
452 * document. This value is multiplied by the idf(Term, Searcher)
453 * factor for each term in the query and these products are then summed to
454 * form the initial score for a document.
456 * Terms and phrases repeated in a document indicate the topic of the
457 * document, so implementations of this method usually return larger values
458 * when 'freq' is large, and smaller values when 'freq'
461 * freq - the frequency of a term within a document
462 * Returns a score factor based on a term's within-document frequency
467 abstract public function tf($freq);
470 * Computes the amount of a sloppy phrase match, based on an edit distance.
471 * This value is summed for each sloppy phrase match in a document to form
472 * the frequency that is passed to tf(float).
474 * A phrase match with a small edit distance to a document passage more
475 * closely matches the document, so implementations of this method usually
476 * return larger values when the edit distance is small and smaller values
479 * distance - the edit distance of this sloppy phrase match
480 * Returns the frequency increment for this match
482 * @param integer $distance
485 abstract public function sloppyFreq($distance);
489 * Computes a score factor for a simple term or a phrase.
491 * The default implementation is:
492 * return idfFreq(searcher.docFreq(term), searcher.maxDoc());
494 * input - the term in question or array of terms
495 * reader - the reader for the document collection being searched
496 * Returns a score factor for the term
498 * @param mixed $input
499 * @param Zend_Search_Lucene_Interface $reader
500 * @return float a score factor for the term
502 public function idf($input, Zend_Search_Lucene_Interface $reader)
504 if (!is_array($input)) {
505 return $this->idfFreq($reader->docFreq($input), $reader->count());
508 foreach ($input as $term) {
509 $idf += $this->idfFreq($reader->docFreq($term), $reader->count());
516 * Computes a score factor based on a term's document frequency (the number
517 * of documents which contain the term). This value is multiplied by the
518 * tf(int) factor for each term in the query and these products are
519 * then summed to form the initial score for a document.
521 * Terms that occur in fewer documents are better indicators of topic, so
522 * implementations of this method usually return larger values for rare terms,
523 * and smaller values for common terms.
525 * docFreq - the number of documents which contain the term
526 * numDocs - the total number of documents in the collection
527 * Returns a score factor based on the term's document frequency
529 * @param integer $docFreq
530 * @param integer $numDocs
533 abstract public function idfFreq($docFreq, $numDocs);
536 * Computes a score factor based on the fraction of all query terms that a
537 * document contains. This value is multiplied into scores.
539 * The presence of a large portion of the query terms indicates a better
540 * match with the query, so implementations of this method usually return
541 * larger values when the ratio between these parameters is large and smaller
542 * values when the ratio between them is small.
544 * overlap - the number of query terms matched in the document
545 * maxOverlap - the total number of terms in the query
546 * Returns a score factor based on term overlap with the query
548 * @param integer $overlap
549 * @param integer $maxOverlap
552 abstract public function coord($overlap, $maxOverlap);