diff options
Diffstat (limited to '3rdparty/clucene/src/CLucene/search/Similarity.cpp')
-rw-r--r-- | 3rdparty/clucene/src/CLucene/search/Similarity.cpp | 233 |
1 files changed, 233 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/search/Similarity.cpp b/3rdparty/clucene/src/CLucene/search/Similarity.cpp new file mode 100644 index 000000000..d33a036e4 --- /dev/null +++ b/3rdparty/clucene/src/CLucene/search/Similarity.cpp @@ -0,0 +1,233 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "Similarity.h" + +#include "CLucene/index/Term.h" +#include "SearchHeader.h" + +CL_NS_USE(index) +CL_NS_DEF(search) + +#ifdef _CL_HAVE_NO_FLOAT_BYTE + #if defined(_LUCENE_PRAGMA_WARNINGS) + #pragma message ("==================Using fallback float<->byte encodings!!!==================") + #else + #warning "==================Using fallback float<->byte encodings!!!==================" + #endif + + //if the autoconf figured out that we can't do the conversions properly, then + //we fall back on the old, inaccurate way of doing things. + qreal NORM_TABLE[] = { + 0.0,5.820766E-10,6.9849193E-10,8.1490725E-10,9.313226E-10,1.1641532E-9,1.3969839E-9, + 1.6298145E-9,1.8626451E-9,2.3283064E-9,2.7939677E-9,3.259629E-9,3.7252903E-9, + 4.656613E-9,5.5879354E-9,6.519258E-9,7.4505806E-9,9.313226E-9,1.1175871E-8,1.3038516E-8, + 1.4901161E-8,1.8626451E-8,2.2351742E-8,2.6077032E-8,2.9802322E-8,3.7252903E-8,4.4703484E-8, + 5.2154064E-8,5.9604645E-8,7.4505806E-8,8.940697E-8,1.0430813E-7,1.1920929E-7,1.4901161E-7, + 1.7881393E-7,2.0861626E-7,2.3841858E-7,2.9802322E-7,3.5762787E-7,4.172325E-7,4.7683716E-7, + 5.9604645E-7,7.1525574E-7,8.34465E-7,9.536743E-7,1.1920929E-6,1.4305115E-6,1.66893E-6, + 1.9073486E-6,2.3841858E-6,2.861023E-6,3.33786E-6,3.8146973E-6,4.7683716E-6,5.722046E-6, + 6.67572E-6,7.6293945E-6,9.536743E-6,1.1444092E-5,1.335144E-5,1.5258789E-5,1.9073486E-5, + 2.2888184E-5,2.670288E-5,3.0517578E-5,3.8146973E-5,4.5776367E-5,5.340576E-5,6.1035156E-5, + 7.6293945E-5,9.1552734E-5,1.0681152E-4,1.2207031E-4,1.5258789E-4,1.8310547E-4,2.1362305E-4, + 2.4414062E-4,3.0517578E-4,3.6621094E-4,4.272461E-4,4.8828125E-4,6.1035156E-4,7.324219E-4, + 8.544922E-4,9.765625E-4,0.0012207031,0.0014648438,0.0017089844,0.001953125,0.0024414062, + 0.0029296875,0.0034179688,0.00390625,0.0048828125,0.005859375,0.0068359375, + 0.0078125,0.009765625,0.01171875,0.013671875,0.015625,0.01953125,0.0234375, + 0.02734375,0.03125,0.0390625,0.046875,0.0546875,0.0625,0.078125,0.09375,0.109375, + 0.125,0.15625,0.1875,0.21875,0.25,0.3125,0.375,0.4375,0.5,0.625,0.75, + 0.875,1.0,1.25,1.5,1.75,2,2.5,3,3.5,4.0,5.0,6.0,7.0,8.0,10.0,12.0,14.0,16.0,20.0,24.0,28.0,32.0,40.0,48.0,56.0, + 64.0,80.0,96.0,112.0,128.0,160.0,192.0,224.0,256.0,320.0,384.0,448.0,512.0,640.0,768.0,896.0,1024.0,1280.0,1536.0,1792.0, + 2048.0,2560.0,3072.0,3584.0,4096.0,5120.0,6144.0,7168.0,8192.0,10240.0,12288.0,14336.0,16384.0,20480.0,24576.0, + 28672.0,32768.0,40960.0,49152.0,57344.0,65536.0,81920.0,98304.0,114688.0,131072.0,163840.0,196608.0, + 229376.0,262144.0,327680.0,393216.0,458752.0,524288.0,655360.0,786432.0,917504.0,1048576.0,1310720.0, + 1572864.0,1835008.0,2097152.0,2621440.0,3145728.0,3670016.0,4194304.0,5242880.0,6291456.0,7340032.0, + 8388608.0,10485760.0,12582912.0,14680064.0,16777216.0,20971520.0,25165824.0,29360128.0,33554432.0, + 41943040.0,50331648.0,58720256.0,67108864.0,83886080.0,100663296.0,117440512.0,134217728.0, + 167772160.0,201326592.0,234881024.0,268435456.0,335544320.0,402653184.0,469762048.0,536870912.0, + 671088640.0,805306368.0,939524096.0,1073741824.0,1342177280.0,1610612736.0,1879048192.0, + 2147483648.0,2684354560.0,3221225472.0,3758096384.0,4294967296.0,5368709120.0,6442450944.0,7516192768.0 + }; + + qreal Similarity::byteToFloat(uint8_t b) { + return NORM_TABLE[b]; + } + + uint8_t Similarity::floatToByte(qreal f) { + return Similarity::encodeNorm(f); + } + +#else + + /** Cache of decoded bytes. */ + qreal NORM_TABLE[256]; + bool NORM_TABLE_initd=false; + + //float to bits conversion utilities... + union clvalue { + int32_t i; + float f; //must use a float type, else types dont match up + }; + + int32_t floatToIntBits(qreal value) + { + clvalue u; + int32_t e, f; + u.f = (float)value; + e = u.i & 0x7f800000; + f = u.i & 0x007fffff; + + if (e == 0x7f800000 && f != 0) + u.i = 0x7fc00000; + + return u.i; + } + + qreal intBitsToFloat(int32_t bits) + { + clvalue u; + u.i = bits; + return u.f; + } + + + qreal Similarity::byteToFloat(uint8_t b) { + if (b == 0) // zero is a special case + return 0.0f; + int32_t mantissa = b & 7; + int32_t exponent = (b >> 3) & 31; + int32_t bits = ((exponent+(63-15)) << 24) | (mantissa << 21); + return intBitsToFloat(bits); + } + + uint8_t Similarity::floatToByte(qreal f) { + if (f < 0.0f) // round negatives up to zero + f = 0.0f; + + if (f == 0.0f) // zero is a special case + return 0; + + int32_t bits = floatToIntBits(f); // parse qreal into parts + int32_t mantissa = (bits & 0xffffff) >> 21; + int32_t exponent = (((bits >> 24) & 0x7f) - 63) + 15; + + if (exponent > 31) { // overflow: use max value + exponent = 31; + mantissa = 7; + } + + if (exponent < 0) { // underflow: use min value + exponent = 0; + mantissa = 1; + } + + return (uint8_t)((exponent << 3) | mantissa); // pack into a uint8_t + } +#endif + + /** The Similarity implementation used by default. */ + Similarity* _defaultImpl=NULL; + + void Similarity::setDefault(Similarity* similarity) { + _defaultImpl = similarity; + } + + Similarity* Similarity::getDefault() { + if ( _defaultImpl == NULL ){ + _defaultImpl = _CLNEW DefaultSimilarity(); + } + return _defaultImpl; + } + + qreal Similarity::decodeNorm(uint8_t b) { +#ifndef _CL_HAVE_NO_FLOAT_BYTE + if ( !NORM_TABLE_initd ){ + for (int i = 0; i < 256; i++) + NORM_TABLE[i] = byteToFloat(i); + NORM_TABLE_initd=true; + } +#endif + return NORM_TABLE[b]; + } + + uint8_t Similarity::encodeNorm(qreal f) { +#ifdef _CL_HAVE_NO_FLOAT_BYTE + int32_t i=0; + if ( f <= 0 ) + return 0; + + while ( i<256 && f > NORM_TABLE[i] ){ + i++; + } + if ( i == 0 ) + return 0; + else if ( i == 255 && f>NORM_TABLE[255] ) + return 255; + else + return i; +#else + return floatToByte(f); +#endif + } + + + qreal Similarity::idf(Term* term, Searcher* searcher) { + return idf(searcher->docFreq(term), searcher->maxDoc()); + } + + + qreal Similarity::idf(CL_NS(util)::CLVector<Term*>* terms, Searcher* searcher) { + qreal _idf = 0.0f; + for (CL_NS(util)::CLVector<Term*>::iterator i = terms->begin(); i != terms->end(); i++ ) { + _idf += idf((Term*)*i, searcher); + } + return _idf; + } + + Similarity::~Similarity(){ + } + + + + + DefaultSimilarity::DefaultSimilarity(){ + } + DefaultSimilarity::~DefaultSimilarity(){ + } + + qreal DefaultSimilarity::lengthNorm(const TCHAR* fieldName, int32_t numTerms) { + if ( numTerms == 0 ) //prevent div by zero + return 0; + qreal ret = (qreal)(1.0 / sqrt((qreal)numTerms)); + return ret; + } + + qreal DefaultSimilarity::queryNorm(qreal sumOfSquaredWeights) { + if ( sumOfSquaredWeights == 0 ) //prevent div by zero + return 0.0f; + qreal ret = (qreal)(1.0 / sqrt(sumOfSquaredWeights)); + return ret; + } + + qreal DefaultSimilarity::tf(qreal freq) { + return sqrt(freq); + } + + qreal DefaultSimilarity::sloppyFreq(int32_t distance) { + return 1.0f / (distance + 1); + } + + qreal DefaultSimilarity::idf(int32_t docFreq, int32_t numDocs) { + return (qreal)(log(numDocs/(qreal)(docFreq+1)) + 1.0); + } + + qreal DefaultSimilarity::coord(int32_t overlap, int32_t maxOverlap) { + if ( maxOverlap == 0 ) + return 0.0f; + return overlap / (qreal)maxOverlap; + } +CL_NS_END |