path: root/src/libs/3rdparty/sqlite/okapi_bm25.h
diff options
Diffstat (limited to 'src/libs/3rdparty/sqlite/okapi_bm25.h')
1 files changed, 0 insertions, 231 deletions
diff --git a/src/libs/3rdparty/sqlite/okapi_bm25.h b/src/libs/3rdparty/sqlite/okapi_bm25.h
deleted file mode 100644
index d527012c19..0000000000
--- a/src/libs/3rdparty/sqlite/okapi_bm25.h
+++ /dev/null
@@ -1,231 +0,0 @@
-#include <math.h>
-#include <assert.h>
-#include "sqlite3.h"
-/*
- * SQL scalar function: Okapi BM25 relevance score for a single FTS column.
- *
- * Arguments:
- *   apVal[0]  matchinfo blob, read as 32-bit unsigned integers laid out as
- *             p, c, x, n, a, l (phrase count, column count, per phrase/column
- *             triples, total rows, average column lengths, row column lengths).
- *   apVal[1]  index of the column to score.
- *   apVal[2]  optional K1 parameter (default 1.2).
- *   apVal[3]  optional B parameter (default 0.75).
- *
- * The score is handed back through sqlite3_result_double(). Malformed input
- * (too few arguments, a blob that is too short, a column index outside the
- * table) is reported through sqlite3_result_error() instead of being read.
- */
-static void okapi_bm25(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal) {
-    assert(sizeof(int) == 4);
-
-    /* apVal[1] is read unconditionally below, so it has to exist. */
-    if (nVal < 2) {
-        sqlite3_result_error(pCtx, "wrong number of arguments to function okapi_bm25()", -1);
-        return;
-    }
-
-    const unsigned int *matchinfo = (const unsigned int *)sqlite3_value_blob(apVal[0]);
-    /* The size is queried after the pointer, the call order SQLite documents as safe. */
-    int blobBytes = sqlite3_value_bytes(apVal[0]);
-    unsigned long long intCount =
-        (blobBytes > 0) ? (unsigned long long)blobBytes / sizeof(unsigned int) : 0;
-
-    int searchTextCol = sqlite3_value_int(apVal[1]);
-    double K1 = ((nVal >= 3) ? sqlite3_value_double(apVal[2]) : 1.2);
-    double B = ((nVal >= 4) ? sqlite3_value_double(apVal[3]) : 0.75);
-
-    const int P_OFFSET = 0;
-    const int C_OFFSET = 1;
-    const int X_OFFSET = 2;
-
-    if (!matchinfo || intCount < 2) {
-        sqlite3_result_error(pCtx, "okapi_bm25(): invalid matchinfo blob", -1);
-        return;
-    }
-
-    unsigned long long termCount = matchinfo[P_OFFSET];
-    unsigned long long colCount = matchinfo[C_OFFSET];
-
-    /* Bounding both counts first keeps the offset arithmetic below from wrapping. */
-    if (termCount > intCount || colCount > intCount) {
-        sqlite3_result_error(pCtx, "okapi_bm25(): invalid matchinfo blob", -1);
-        return;
-    }
-
-    unsigned long long N_OFFSET = X_OFFSET + 3 * termCount * colCount;
-    unsigned long long A_OFFSET = N_OFFSET + 1;
-    unsigned long long L_OFFSET = A_OFFSET + colCount;
-
-    if (L_OFFSET + colCount > intCount) {
-        sqlite3_result_error(pCtx, "okapi_bm25(): invalid matchinfo blob", -1);
-        return;
-    }
-    if (searchTextCol < 0 || (unsigned long long)searchTextCol >= colCount) {
-        sqlite3_result_error(pCtx, "okapi_bm25(): column index out of range", -1);
-        return;
-    }
-
-    double totalDocs = matchinfo[N_OFFSET];
-    double avgLength = matchinfo[A_OFFSET + searchTextCol];
-    double docLength = matchinfo[L_OFFSET + searchTextCol];
-
-    /* An average length of zero would divide by zero; use a neutral ratio instead. */
-    double lengthRatio = (avgLength > 0.0) ? (docLength / avgLength) : 1.0;
-    /* Identical for every term, so it is computed once outside the loop. */
-    double lengthNorm = K1 * (1 - B + (B * lengthRatio));
-
-    double sum = 0.0;
-
-    for (unsigned long long i = 0; i < termCount; i++) {
-        /*
-         * The x section stores one triple per (phrase, column) pair, ordered
-         * phrase-major: the triple for phrase i and column c starts at
-         * 3 * (c + i * colCount) within that section.
-         */
-        unsigned long long currentX = X_OFFSET + 3 * (searchTextCol + i * colCount);
-        double termFrequency = matchinfo[currentX];
-        double docsWithTerm = matchinfo[currentX + 2];
-
-        double idf = log(
-            (totalDocs - docsWithTerm + 0.5) /
-            (docsWithTerm + 0.5)
-        );
-
-        double rightSide = (
-            (termFrequency * (K1 + 1)) /
-            (termFrequency + lengthNorm)
-        );
-
-        sum += (idf * rightSide);
-    }
-
-    sqlite3_result_double(pCtx, sum);
-}
-// Created by Joshua Wilson on 27/05/14.
-// Copyright (c) 2014 Joshua Wilson. All rights reserved.
-// https://github.com/neozenith/sqlite-okapi-bm25
-// This is an extension to the work of "Radford 'rads' Smith"
-// found at: https://github.com/rads/sqlite-okapi-bm25
-// which is covered by the MIT License
-// http://opensource.org/licenses/MIT
-// the following code shall also be covered by the same MIT License
-/*
- * SQL scalar function: BM25F-style relevance score summed over every column
- * of the FTS table, with an optional weight per column.
- *
- * Arguments:
- *   apVal[0]        matchinfo blob, read as 32-bit unsigned integers laid out
- *                   as p, c, x, n, a, l (the 'pcxnals' order presented in the
- *                   SQLite FTS3 documentation,
- *                   http://www.sqlite.org/fts3.html#matchinfo).
- *   apVal[col + 1]  optional weight for column `col`; columns without a
- *                   supplied weight use 1.0.
- *
- * K1 and B are fixed at 1.2 and 0.75 so that the extra arguments can carry
- * the column weights. For an explanation of the maths and how to choose
- * these values see http://stackoverflow.com/a/23161886/622276
- *
- * The score is handed back through sqlite3_result_double(). Malformed input
- * is reported through sqlite3_result_error().
- */
-static void okapi_bm25f(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal) {
-    assert(sizeof(int) == 4);
-
-    if (nVal < 1) {
-        sqlite3_result_error(pCtx, "wrong number of arguments to function okapi_bm25f()", -1);
-        return;
-    }
-
-    const unsigned int *matchinfo = (const unsigned int *)sqlite3_value_blob(apVal[0]);
-    /* The size is queried after the pointer, the call order SQLite documents as safe. */
-    int blobBytes = sqlite3_value_bytes(apVal[0]);
-    unsigned long long intCount =
-        (blobBytes > 0) ? (unsigned long long)blobBytes / sizeof(unsigned int) : 0;
-
-    const double K1 = 1.2;
-    const double B = 0.75;
-
-    const int P_OFFSET = 0;
-    const int C_OFFSET = 1;
-    const int X_OFFSET = 2;
-
-    if (!matchinfo || intCount < 2) {
-        sqlite3_result_error(pCtx, "okapi_bm25f(): invalid matchinfo blob", -1);
-        return;
-    }
-
-    unsigned long long termCount = matchinfo[P_OFFSET];
-    unsigned long long colCount = matchinfo[C_OFFSET];
-
-    /* Bounding both counts first keeps the offset arithmetic below from wrapping. */
-    if (termCount > intCount || colCount > intCount) {
-        sqlite3_result_error(pCtx, "okapi_bm25f(): invalid matchinfo blob", -1);
-        return;
-    }
-
-    unsigned long long N_OFFSET = X_OFFSET + 3 * termCount * colCount;
-    unsigned long long A_OFFSET = N_OFFSET + 1;
-    unsigned long long L_OFFSET = A_OFFSET + colCount;
-    /* An 's' section, if requested, would follow at L_OFFSET + colCount and
-     * could serve as a pseudo proximity weighting per column; it is not used. */
-
-    if (L_OFFSET + colCount > intCount) {
-        sqlite3_result_error(pCtx, "okapi_bm25f(): invalid matchinfo blob", -1);
-        return;
-    }
-
-    double totalDocs = matchinfo[N_OFFSET];
-
-    /* Lengths are accumulated over all columns. */
-    double avgLength = 0.0;
-    double docLength = 0.0;
-    for (unsigned long long col = 0; col < colCount; col++) {
-        avgLength += matchinfo[A_OFFSET + col];
-        docLength += matchinfo[L_OFFSET + col];
-    }
-
-    /*
-     * Common terms could have no effect (epsilon = 0.0) or a very small effect
-     * (epsilon = 1 / number of tokens, which asymptotes to 0.0). With an empty
-     * corpus the token count is zero, so fall back to 0.0 rather than divide.
-     */
-    double tokenCount = totalDocs * avgLength;
-    double epsilon = (tokenCount > 0.0) ? (1.0 / tokenCount) : 0.0;
-
-    /* An average length of zero would divide by zero; use a neutral ratio instead. */
-    double lengthRatio = (avgLength > 0.0) ? (docLength / avgLength) : 1.0;
-    /* Identical for every term and column, so it is computed once. */
-    double lengthNorm = K1 * (1 - B + (B * lengthRatio));
-
-    double sum = 0.0;
-
-    for (unsigned long long t = 0; t < termCount; t++) {
-        for (unsigned long long col = 0; col < colCount; col++) {
-            /*
-             * The x section stores one triple per (phrase, column) pair,
-             * ordered phrase-major: the triple for phrase t and column col
-             * starts at 3 * (col + t * colCount) within that section.
-             */
-            unsigned long long currentX = X_OFFSET + 3 * (col + t * colCount);
-            double termFrequency = matchinfo[currentX];
-            double docsWithTerm = matchinfo[currentX + 2];
-
-            double idf = log(
-                (totalDocs - docsWithTerm + 0.5) /
-                (docsWithTerm + 0.5)
-            );
-            /*
-             * "...terms appearing in more than half of the corpus will provide
-             * negative contributions to the final document score."
-             * http://en.wikipedia.org/wiki/Okapi_BM25
-             */
-            idf = (idf < 0) ? epsilon : idf;
-
-            double rightSide = (
-                (termFrequency * (K1 + 1)) /
-                (termFrequency + lengthNorm)
-            );
-            /*
-             * To comply with BM25+, which solves a lower-bounding issue where
-             * large documents that match are unfairly scored as having similar
-             * relevancy to short documents that do not contain as many terms.
-             * Yuanhua Lv and ChengXiang Zhai. 'Lower-bounding term frequency
-             * normalization.' In Proceedings of CIKM'2011, pages 7-16.
-             * http://sifaka.cs.uiuc.edu/~ylv2/pub/cikm11-lowerbound.pdf
-             */
-            rightSide += 1.0;
-
-            /* nVal >= 1 was checked above, so the conversion cannot go negative. */
-            double weight = ((unsigned long long)nVal > col + 1)
-                ? sqlite3_value_double(apVal[col + 1])
-                : 1.0;
-
-            sum += (idf * rightSide) * weight;
-        }
-    }
-
-    sqlite3_result_double(pCtx, sum);
-}
-/*
- * SQL scalar function: same scoring as the weighted multi-column BM25F
- * variant, but with K1 and B supplied by the caller.
- *
- * Arguments:
- *   apVal[0]        matchinfo blob, read as 32-bit unsigned integers laid out
- *                   as p, c, x, n, a, l (the 'pcxnals' order presented in the
- *                   SQLite FTS3 documentation,
- *                   http://www.sqlite.org/fts3.html#matchinfo).
- *   apVal[1]        K1 parameter (required).
- *   apVal[2]        B parameter (required).
- *   apVal[col + 3]  optional weight for column `col`; columns without a
- *                   supplied weight use 1.0.
- *
- * For an explanation of the maths and how to choose K1 and B see
- * http://stackoverflow.com/a/23161886/622276
- *
- * The score is handed back through sqlite3_result_double(). Missing
- * parameters and malformed blobs are reported through sqlite3_result_error().
- */
-static void okapi_bm25f_kb(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal) {
-    assert(sizeof(int) == 4);
-
-    /* Return after reporting: falling through would read past the end of apVal. */
-    if (nVal < 2) {
-        sqlite3_result_error(pCtx, "wrong number of arguments to function okapi_bm25_kb(), expected k1 parameter", -1);
-        return;
-    }
-    if (nVal < 3) {
-        sqlite3_result_error(pCtx, "wrong number of arguments to function okapi_bm25_kb(), expected b parameter", -1);
-        return;
-    }
-
-    const unsigned int *matchinfo = (const unsigned int *)sqlite3_value_blob(apVal[0]);
-    /* The size is queried after the pointer, the call order SQLite documents as safe. */
-    int blobBytes = sqlite3_value_bytes(apVal[0]);
-    unsigned long long intCount =
-        (blobBytes > 0) ? (unsigned long long)blobBytes / sizeof(unsigned int) : 0;
-
-    double K1 = sqlite3_value_double(apVal[1]);
-    double B = sqlite3_value_double(apVal[2]);
-
-    const int P_OFFSET = 0;
-    const int C_OFFSET = 1;
-    const int X_OFFSET = 2;
-
-    if (!matchinfo || intCount < 2) {
-        sqlite3_result_error(pCtx, "okapi_bm25f_kb(): invalid matchinfo blob", -1);
-        return;
-    }
-
-    unsigned long long termCount = matchinfo[P_OFFSET];
-    unsigned long long colCount = matchinfo[C_OFFSET];
-
-    /* Bounding both counts first keeps the offset arithmetic below from wrapping. */
-    if (termCount > intCount || colCount > intCount) {
-        sqlite3_result_error(pCtx, "okapi_bm25f_kb(): invalid matchinfo blob", -1);
-        return;
-    }
-
-    unsigned long long N_OFFSET = X_OFFSET + 3 * termCount * colCount;
-    unsigned long long A_OFFSET = N_OFFSET + 1;
-    unsigned long long L_OFFSET = A_OFFSET + colCount;
-    /* An 's' section, if requested, would follow at L_OFFSET + colCount and
-     * could serve as a pseudo proximity weighting per column; it is not used. */
-
-    if (L_OFFSET + colCount > intCount) {
-        sqlite3_result_error(pCtx, "okapi_bm25f_kb(): invalid matchinfo blob", -1);
-        return;
-    }
-
-    double totalDocs = matchinfo[N_OFFSET];
-
-    /* Lengths are accumulated over all columns. */
-    double avgLength = 0.0;
-    double docLength = 0.0;
-    for (unsigned long long col = 0; col < colCount; col++) {
-        avgLength += matchinfo[A_OFFSET + col];
-        docLength += matchinfo[L_OFFSET + col];
-    }
-
-    /*
-     * Common terms could have no effect (epsilon = 0.0) or a very small effect
-     * (epsilon = 1 / number of tokens, which asymptotes to 0.0). With an empty
-     * corpus the token count is zero, so fall back to 0.0 rather than divide.
-     */
-    double tokenCount = totalDocs * avgLength;
-    double epsilon = (tokenCount > 0.0) ? (1.0 / tokenCount) : 0.0;
-
-    /* An average length of zero would divide by zero; use a neutral ratio instead. */
-    double lengthRatio = (avgLength > 0.0) ? (docLength / avgLength) : 1.0;
-    /* Identical for every term and column, so it is computed once. */
-    double lengthNorm = K1 * (1 - B + (B * lengthRatio));
-
-    double sum = 0.0;
-
-    for (unsigned long long t = 0; t < termCount; t++) {
-        for (unsigned long long col = 0; col < colCount; col++) {
-            /*
-             * The x section stores one triple per (phrase, column) pair,
-             * ordered phrase-major: the triple for phrase t and column col
-             * starts at 3 * (col + t * colCount) within that section.
-             */
-            unsigned long long currentX = X_OFFSET + 3 * (col + t * colCount);
-            double termFrequency = matchinfo[currentX];
-            double docsWithTerm = matchinfo[currentX + 2];
-
-            double idf = log(
-                (totalDocs - docsWithTerm + 0.5) /
-                (docsWithTerm + 0.5)
-            );
-            /*
-             * "...terms appearing in more than half of the corpus will provide
-             * negative contributions to the final document score."
-             * http://en.wikipedia.org/wiki/Okapi_BM25
-             */
-            idf = (idf < 0) ? epsilon : idf;
-
-            double rightSide = (
-                (termFrequency * (K1 + 1)) /
-                (termFrequency + lengthNorm)
-            );
-            /*
-             * To comply with BM25+, which solves a lower-bounding issue where
-             * large documents that match are unfairly scored as having similar
-             * relevancy to short documents that do not contain as many terms.
-             * Yuanhua Lv and ChengXiang Zhai. 'Lower-bounding term frequency
-             * normalization.' In Proceedings of CIKM'2011, pages 7-16.
-             * http://sifaka.cs.uiuc.edu/~ylv2/pub/cikm11-lowerbound.pdf
-             */
-            rightSide += 1.0;
-
-            /* nVal >= 3 was checked above, so the conversion cannot go negative. */
-            double weight = ((unsigned long long)nVal > col + 3)
-                ? sqlite3_value_double(apVal[col + 3])
-                : 1.0;
-
-            sum += (idf * rightSide) * weight;
-        }
-    }
-
-    sqlite3_result_double(pCtx, sum);
-}