summary refs log tree commit diff stats
path: root/3rdparty/clucene/src/CLucene/index/TermInfosReader.cpp
diff options
context:
space:
mode:
Diffstat (limited to '3rdparty/clucene/src/CLucene/index/TermInfosReader.cpp')
-rw-r--r-- 3rdparty/clucene/src/CLucene/index/TermInfosReader.cpp 443
1 files changed, 443 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/index/TermInfosReader.cpp b/3rdparty/clucene/src/CLucene/index/TermInfosReader.cpp
new file mode 100644
index 000000000..8f9e43dec
--- /dev/null
+++ b/3rdparty/clucene/src/CLucene/index/TermInfosReader.cpp
@@ -0,0 +1,443 @@
+/*
+ * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+ *
+ * Distributable under the terms of either the Apache License (Version 2.0) or
+ * the GNU Lesser General Public License, as specified in the COPYING file.
+ *
+ * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
+*/
+#include "CLucene/StdHeader.h"
+#include "TermInfosReader.h"
+
+#include "CLucene/store/Directory.h"
+#include "CLucene/util/Misc.h"
+#include "FieldInfos.h"
+#include "Term.h"
+#include "Terms.h"
+#include "TermInfo.h"
+#include "TermInfosWriter.h"
+
+CL_NS_USE(store)
+CL_NS_USE(util)
+CL_NS_DEF(index)
+
+TermInfosReader::TermInfosReader(Directory* dir, const QString& seg,
+    FieldInfos* fis)
+    : directory(dir)
+    , fieldInfos (fis)
+{
+    //Func - Constructor.
+    //       Opens the Term Infos file (.tis) and the Term Info Index file
+    //       (.tii) of the segment and wraps each of them in a
+    //       SegmentTermEnum. The index arrays themselves are only filled
+    //       later, by ensureIndexIsRead().
+    //Pre  - dir is a reference to a valid Directory
+    //       fis contains a valid reference to a FieldInfos instance
+    //       seg is not empty and contains the name of the segment
+    //Post - An instance has been created, both enumerations of the segment
+    //       named seg are open and _size holds the number of terms reported
+    //       by the .tis enumeration. (Remember a segment is nothing more
+    //       than an independently readable index.)
+    //       If opening either enumeration throws, everything this
+    //       constructor had already opened is released before the
+    //       exception is passed on.
+
+    CND_PRECONDITION(!seg.isEmpty(), "seg is NULL");
+
+    //Initialize the name of the segment
+    segment = seg;
+
+    //Nothing of the index has been read yet. Every owning pointer starts
+    //out as NULL so that close() can run on a partially built instance.
+    indexTerms = NULL;
+    indexInfos = NULL;
+    indexPointers = NULL;
+    indexTermsLength = 0;
+    origEnum = NULL;
+    indexEnum = NULL;
+
+    //Create the file names of the Term Info file and the Term Info Index file
+    const QString tisFile = Misc::segmentname(segment, QLatin1String(".tis"));
+    const QString tiiFile = Misc::segmentname(segment, QLatin1String(".tii"));
+
+    try {
+        //Create the SegmentTermEnums used to read the terms of the segment
+        origEnum = _CLNEW SegmentTermEnum( directory->openInput( tisFile ), fieldInfos, false);
+        indexEnum = _CLNEW SegmentTermEnum( directory->openInput( tiiFile ), fieldInfos, true);
+    } catch (...) {
+        //The destructor is not run for an object whose constructor throws,
+        //so release the enumeration that was opened successfully here.
+        close();
+        throw;
+    }
+
+    //Check if the enumerators point to valid instances
+    CND_CONDITION(origEnum != NULL, "No memory could be allocated for orig enumerator");
+    CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator");
+
+    //Get the size of the enumeration and store it in _size
+    _size = origEnum->size;
+}
+
+TermInfosReader::~TermInfosReader()
+{
+    //Func - Destructor
+    //Pre  - true
+    //Post - The instance has been destroyed
+
+    //All cleanup lives in close(): it closes the enumerations and deletes
+    //the arrays indexTerms, indexInfos and indexPointers. Calling it here
+    //makes sure this happens even if the owner never called close() itself.
+    this->close();
+}
+
+void TermInfosReader::close()
+{
+    //Func - Releases everything the reader owns: the in-memory term index
+    //       arrays and both enumerations together with their IndexInputs
+    //Pre  - true
+    //Post - indexTerms, indexInfos and indexPointers have been deleted.
+    //       origEnum and indexEnum, if still present, have been closed and
+    //       deleted together with the IndexInput each of them was reading.
+
+#ifdef _DEBUG
+    //Every cached index term is expected to be referenced by this reader only
+    if (indexTerms != NULL) {
+        for (int32_t i = 0; i < indexTermsLength; ++i) {
+            CND_PRECONDITION(indexTerms[i].__cl_refcount == 1,
+                "TermInfosReader term was references more than internally");
+        }
+    }
+#endif
+
+    //Delete each array on its own. ensureIndexIsRead() allocates them one
+    //after another, so after a failed allocation only some of them exist;
+    //requiring indexTerms AND indexInfos before deleting would leak the
+    //array that did get allocated.
+    _CLDELETE_ARRAY(indexTerms);
+    _CLDELETE_ARRAY(indexInfos);
+    _CLDELETE_ARRAY(indexPointers);
+
+    if (origEnum != NULL) {
+        origEnum->close();
+
+        //The IndexInput was opened in the constructor through
+        //directory->openInput( tisFile ). Fetch the pointer before the
+        //enumeration that holds it is deleted.
+        IndexInput* tisInput = origEnum->input;
+
+        //Delete the enumeration
+        _CLDELETE(origEnum);
+
+        //Delete the IndexInput
+        _CLDELETE(tisInput);
+    }
+
+    if (indexEnum != NULL) {
+        indexEnum->close();
+
+        //The IndexInput was opened in the constructor through
+        //directory->openInput( tiiFile ). Fetch the pointer before the
+        //enumeration that holds it is deleted.
+        IndexInput* tiiInput = indexEnum->input;
+
+        //Delete the enumeration
+        _CLDELETE(indexEnum);
+
+        //Delete the IndexInput
+        _CLDELETE(tiiInput);
+    }
+}
+
+int64_t TermInfosReader::size() const
+{
+    //Func - Reports the number of entries in the enumeration of TermInfos,
+    //       as stored in _size by the constructor
+    //Pre  - true
+    //Post - The size has been returned
+
+    return this->_size;
+}
+
+Term* TermInfosReader::get(const int32_t position)
+{
+    //Func - Returns the nth term in the set
+    //Pre  - true
+    //Post - The n-th term in the set has been returned, or NULL when the
+    //       set is empty or position lies outside [0, size())
+
+    //Check if the size is 0 because then there are no terms
+    if (_size == 0)
+        return NULL;
+
+    //A position outside the set can never be found, and seeking to it
+    //would index past the end of the in-memory index arrays
+    if (position < 0 || position >= _size)
+        return NULL;
+
+    SegmentTermEnum* enumerator = getEnum();
+
+    if (enumerator->term(false) != NULL // term is at or past current
+        && position >= enumerator->position
+        && position < (enumerator->position + enumerator->indexInterval)) {
+        return scanEnum(position); // can avoid seek
+    }
+
+    //random-access: must seek. seekEnum() reads indexTerms, indexInfos and
+    //indexPointers, which only exist once the index file has been loaded.
+    ensureIndexIsRead();
+    seekEnum(position / enumerator->indexInterval);
+
+    //Get the Term at position
+    return scanEnum(position);
+}
+
+// TODO: currently there is no way of cleaning up a thread, if the thread ends.
+// we are stuck with the terminfosreader of that thread. Hopefully this won't
+// be too big a problem... solutions anyone?
+SegmentTermEnum* TermInfosReader::getEnum()
+{
+    //Func - Returns the enumeration stored in enumerators, creating and
+    //       storing a fresh one through terms() when none is stored yet
+    //Pre  - true
+    //Post - The stored enumeration has been returned
+
+    SegmentTermEnum* cached = enumerators.get();
+    if (cached != NULL)
+        return cached;
+
+    //Nothing stored yet: create an enumeration and remember it
+    cached = terms();
+    enumerators.set(cached);
+    return cached;
+}
+
+TermInfo* TermInfosReader::get(const Term* term)
+{
+    //Func - Looks up the TermInfo that belongs to a term
+    //Pre  - term holds a valid reference to a Term
+    //Post - If term was found the result of scanEnum(term) has been
+    //       returned, otherwise NULL
+
+    //An empty enumeration cannot contain the term
+    if (_size == 0)
+        return NULL;
+
+    ensureIndexIsRead();
+
+    SegmentTermEnum* cached = getEnum();
+
+    //Optimize sequential access: first try to scan the cached enumeration
+    //without seeking. That only works when it still has a current term ...
+    if (cached->term(false) != NULL) {
+        //... and term lies after the previous term of the enumeration ...
+        const bool afterPrev = cached->prev != NULL
+            && term->compareTo(cached->prev) > 0;
+
+        //... or at/after its current term
+        if (afterPrev || term->compareTo(cached->term(false)) >= 0) {
+            //Index entry that starts the block following the current one
+            const int32_t nextBlock = static_cast<int32_t>(
+                cached->position / cached->indexInterval) + 1;
+
+            //Either there is no following block, or term sorts before the
+            //index term that starts it: term can only be in the current block
+            if (indexTermsLength == nextBlock
+                || term->compareTo(&indexTerms[nextBlock]) < 0) {
+                //no need to seek, retrieve the TermInfo for term
+                return scanEnum(term);
+            }
+        }
+    }
+
+    //Random access: reposition the enumeration through the index first
+    seekEnum(getIndexOffset(term));
+
+    //Return the TermInfo for term
+    return scanEnum(term);
+}
+
+int64_t TermInfosReader::getPosition(const Term* term)
+{
+    //Func - Returns the position of a Term in the set
+    //Pre  - term holds a valid reference to a Term
+    //Post - If term was found then its position is returned otherwise -1
+
+    //if the enumeration is empty then return -1
+    if (_size == 0)
+        return -1;
+
+    ensureIndexIsRead();
+
+    //Position the enumeration on the index entry at or before term
+    const int32_t indexOffset = getIndexOffset(term);
+    seekEnum(indexOffset);
+
+    SegmentTermEnum* enumerator = getEnum();
+
+    //Advance until the current term is no longer smaller than term, or
+    //until the enumeration is exhausted
+    while(term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) {}
+
+    //Guard against a missing current term before comparing, in the same
+    //way scanEnum(const Term*) does.
+    //NOTE(review): presumably term(false) is NULL once next() has failed -
+    //confirm against SegmentTermEnum.
+    Term* current = enumerator->term(false);
+    if (current != NULL && term->equals(current))
+        return enumerator->position;
+
+    return -1;
+}
+
+SegmentTermEnum* TermInfosReader::terms(const Term* term)
+{
+    //Func - Returns an enumeration of terms starting at or after the named
+    //       term. If term is NULL a clone of origEnum is returned instead.
+    //Pre  - term is NULL or holds a valid reference to a Term
+    //Post - A clone of the selected enumeration has been returned
+
+    //Without a term the clone is taken from the original enumeration
+    SegmentTermEnum* source = origEnum;
+
+    if (term != NULL) {
+        //Looking the term up seeks the enumeration returned by getEnum()
+        //to it; the TermInfo that comes back is not needed and is deleted
+        TermInfo* found = get(term);
+        _CLDELETE(found);
+        source = getEnum();
+    }
+
+    //Hand out a clone of the selected enumeration
+    SegmentTermEnum* copy = source->clone();
+
+    //Check if the clone points to a valid instance
+    CND_CONDITION(copy != NULL, "cln is NULL");
+
+    return copy;
+}
+
+void TermInfosReader::ensureIndexIsRead()
+{
+    //Func - Reads the term info index file (.tii) once.
+    //       This file contains every IndexInterval-th entry from the .tis
+    //       file, along with its location in the "tis" file. It is read
+    //       entirely into memory and used to provide random access to the
+    //       "tis" file.
+    //Pre  - true
+    //Post - indexTerms, indexInfos and indexPointers hold the entries of
+    //       the index file; if indexTerms already existed nothing happened.
+    //       After an actual read indexEnum and its IndexInput are deleted.
+
+    SCOPED_LOCK_MUTEX(THIS_LOCK)
+
+    //A non-NULL indexTerms means the index was loaded by an earlier call
+    if ( indexTerms != NULL )
+        return;
+
+    try {
+        indexTermsLength = (size_t)indexEnum->size;
+
+        //One block of Terms, so that each one doesn't have to be new'd
+        indexTerms = _CL_NEWARRAY(Term,indexTermsLength);
+        CND_CONDITION(indexTerms != NULL,
+            "No memory could be allocated for indexTerms");
+
+        //One block of TermInfos, so that each one doesn't have to be new'd
+        indexInfos = _CL_NEWARRAY(TermInfo,indexTermsLength);
+        CND_CONDITION(indexInfos != NULL,
+            "No memory could be allocated for indexInfos");
+
+        //The pointer stored with each index entry
+        indexPointers = _CL_NEWARRAY(int64_t,indexTermsLength);
+        CND_CONDITION(indexPointers != NULL,
+            "No memory could be allocated for indexPointers");
+
+        //Copy every entry of indexEnum into the three parallel arrays
+        int32_t slot = 0;
+        while (indexEnum->next()) {
+            indexTerms[slot].set(indexEnum->term(false), indexEnum->term(false)->text());
+            indexEnum->getTermInfo(&indexInfos[slot]);
+            indexPointers[slot] = indexEnum->indexPointer;
+            ++slot;
+        }
+    } _CLFINALLY (
+        indexEnum->close();
+        // The index enumeration is no longer needed: delete its IndexInput
+        // and then the enumeration itself.
+        _CLDELETE( indexEnum->input );
+        _CLDELETE( indexEnum );
+    );
+}
+
+int32_t TermInfosReader::getIndexOffset(const Term* term)
+{
+    //Func - Binary search over indexTerms: returns the offset of the
+    //       greatest index entry which is less than or equal to term.
+    //Pre  - term holds a reference to a valid term
+    //       indexTerms != NULL
+    //Post - The offset has been returned. When term compares smaller than
+    //       every index entry the result is -1.
+
+    //Check if indexTerms is a valid array
+    CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL");
+
+    int32_t lo = 0;
+    int32_t hi = indexTermsLength - 1;
+
+    while (hi >= lo) {
+        //Start in the middle between lo and hi. Written as lo + half the
+        //distance so the sum of both bounds can never overflow.
+        const int32_t mid = lo + ((hi - lo) >> 1);
+
+        //mid has to be a valid index before indexTerms[mid] is touched
+        CND_PRECONDITION(mid < indexTermsLength, "mid >= indexTermsLength");
+
+        //Determine if term is before mid or after mid
+        const int32_t delta = term->compareTo(&indexTerms[mid]);
+        if (delta < 0) {
+            //term sorts before mid: continue in the lower half
+            hi = mid - 1;
+        } else if (delta > 0) {
+            //term sorts after mid: continue in the upper half
+            lo = mid + 1;
+        } else {
+            //term has been found so return its position
+            return mid;
+        }
+    }
+
+    //No exact match: hi now points at the last entry smaller than term
+    return hi;
+}
+
+void TermInfosReader::seekEnum(const int32_t indexOffset)
+{
+    //Func - Repositions the enumeration returned by getEnum() to the index
+    //       entry at indexOffset
+    //Pre  - 0 <= indexOffset < indexTermsLength
+    //       indexTerms != NULL
+    //       indexInfos != NULL
+    //       indexPointers != NULL
+    //Post - The enumeration has been given the pointer, Term and TermInfo
+    //       stored at indexOffset, together with the position
+    //       (indexOffset * indexInterval) - 1
+
+    CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number");
+    CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL");
+    CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL");
+    CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL");
+    //indexOffset indexes all three arrays below, so it must lie inside them
+    CND_PRECONDITION(indexOffset < indexTermsLength,
+        "indexOffset is beyond the end of the term index");
+
+    SegmentTermEnum* enumerator = getEnum();
+    enumerator->seek(indexPointers[indexOffset],
+        (indexOffset * enumerator->indexInterval) - 1,
+        &indexTerms[indexOffset], &indexInfos[indexOffset]);
+}
+
+TermInfo* TermInfosReader::scanEnum(const Term* term)
+{
+    //Func - Scans the enumeration of terms for term, starting from the
+    //       current term, and returns the corresponding TermInfo
+    //Pre  - term contains a valid reference to a Term
+    //Post - If term has been found the result of getTermInfo() has been
+    //       returned, otherwise NULL
+
+    SegmentTermEnum* cursor = getEnum();
+    cursor->scanTo(term);
+
+    //The scan left the enumeration without a current term
+    if (cursor->term(false) == NULL)
+        return NULL;
+
+    //The scan stopped on a different term, so term is not in the set
+    if (!term->equals(cursor->term(false)))
+        return NULL;
+
+    //The enumeration is positioned on term: hand out its TermInfo
+    return cursor->getTermInfo();
+}
+
+Term* TermInfosReader::scanEnum(const int32_t position)
+{
+    //Func - Advances the enumeration to the requested position and returns
+    //       the Term located there
+    //Pre  - position >= 0
+    //Post - The Term at the requested position has been returned, or NULL
+    //       when the enumeration ended before reaching it
+
+    SegmentTermEnum* cursor = getEnum();
+
+    while (true) {
+        //Reached (or already past) the requested position: return its Term
+        if (cursor->position >= position)
+            return cursor->term();
+
+        //Otherwise step forward; no next term means the requested
+        //position was too big
+        if (!cursor->next())
+            return NULL;
+    }
+}
+
+CL_NS_END