diff options
Diffstat (limited to '3rdparty/clucene/src/CLucene/index/TermInfosReader.cpp')
-rw-r--r-- | 3rdparty/clucene/src/CLucene/index/TermInfosReader.cpp | 443 |
1 files changed, 443 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/index/TermInfosReader.cpp b/3rdparty/clucene/src/CLucene/index/TermInfosReader.cpp new file mode 100644 index 000000000..8f9e43dec --- /dev/null +++ b/3rdparty/clucene/src/CLucene/index/TermInfosReader.cpp @@ -0,0 +1,443 @@ +/* + * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team + * + * Distributable under the terms of either the Apache License (Version 2.0) or + * the GNU Lesser General Public License, as specified in the COPYING file. + * + * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved. +*/ +#include "CLucene/StdHeader.h" +#include "TermInfosReader.h" + +#include "CLucene/store/Directory.h" +#include "CLucene/util/Misc.h" +#include "FieldInfos.h" +#include "Term.h" +#include "Terms.h" +#include "TermInfo.h" +#include "TermInfosWriter.h" + +CL_NS_USE(store) +CL_NS_USE(util) +CL_NS_DEF(index) + +TermInfosReader::TermInfosReader(Directory* dir, const QString& seg, + FieldInfos* fis) + : directory(dir) + , fieldInfos (fis) +{ + //Func - Constructor. + // Reads the TermInfos file (.tis) and eventually the Term Info Index file (.tii) + //Pre - dir is a reference to a valid Directory + // Fis contains a valid reference to an FieldInfos instance + // seg != NULL and contains the name of the segment + //Post - An instance has been created and the index named seg has been read. (Remember + // a segment is nothing more then an independently readable index) + + CND_PRECONDITION(!seg.isEmpty(), "seg is NULL"); + + //Initialize the name of the segment + segment = seg; + //There are no indexTerms yet + indexTerms = NULL; + //So there are no indexInfos + indexInfos = NULL; + //So there are no indexPointers + indexPointers = NULL; + //Create a filname fo a Term Info File + QString tisFile = Misc::segmentname(segment, QLatin1String(".tis")); + QString tiiFile = Misc::segmentname(segment, QLatin1String(".tii")); + + //Create an SegmentTermEnum for storing all the terms read of the segment + origEnum = _CLNEW SegmentTermEnum( directory->openInput( tisFile ), fieldInfos, false); + indexEnum = _CLNEW SegmentTermEnum( directory->openInput( tiiFile ), fieldInfos, true); + + //Check if enumerator points to a valid instance + CND_CONDITION(origEnum != NULL, "No memory could be allocated for orig enumerator"); + CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator"); + + //Get the size of the enumeration and store it in size + _size = origEnum->size; +} + +TermInfosReader::~TermInfosReader() +{ + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + + //Close the TermInfosReader to be absolutly sure that enumerator has been closed + //and the arrays indexTerms, indexPointers and indexInfos and their elements + //have been destroyed + close(); +} + +void TermInfosReader::close() +{ + //Func - Close the enumeration of TermInfos + //Pre - true + //Post - The _enumeration has been closed and the arrays + + //Check if indexTerms and indexInfos exist + if (indexTerms && indexInfos){ + //Iterate through arrays indexTerms and indexPointer to + //destroy their elements +#ifdef _DEBUG + for (int32_t i = 0; i < indexTermsLength; ++i) { + if (indexTerms[i].__cl_refcount != 1) { + CND_PRECONDITION(indexTerms[i].__cl_refcount == 1, + "TermInfosReader term was references more than internally"); + } + // _CLDECDELETE(indexTerms[i]); + //_CLDELETE(indexInfos[i]); + } +#endif + //Delete the arrays + _CLDELETE_ARRAY(indexTerms); + _CLDELETE_ARRAY(indexInfos); + } + + //Delete the arrays + _CLDELETE_ARRAY(indexPointers); + + if (origEnum != NULL) { + origEnum->close(); + + //Get a pointer to IndexInput used by the enumeration but + //instantiated in the constructor by directory.open( tisFile ) + IndexInput *is = origEnum->input; + + //Delete the enumuration enumerator + _CLDELETE(origEnum); + + //Delete the IndexInput + _CLDELETE(is); + } + + if (indexEnum != NULL){ + indexEnum->close(); + + //Get a pointer to IndexInput used by the enumeration but + //instantiated in the constructor by directory.open( tiiFile ) + IndexInput *is = indexEnum->input; + + //Delete the enumuration enumerator + _CLDELETE(indexEnum); + + //Delete the IndexInput + _CLDELETE(is); + } +} + +int64_t TermInfosReader::size() const +{ + //Func - Return the size of the enumeration of TermInfos + //Pre - true + //Post - size has been returened + + return _size; +} + +Term* TermInfosReader::get(const int32_t position) +{ + //Func - Returns the nth term in the set + //Pre - position > = 0 + //Post - The n-th term in the set has been returned + + //Check if the size is 0 because then there are no terms + if (_size == 0) + return NULL; + + SegmentTermEnum* enumerator = getEnum(); + + if (enumerator != NULL //an enumeration exists + && enumerator->term(false) != NULL // term is at or past current + && position >= enumerator->position + && position < (enumerator->position + enumerator->indexInterval)) { + return scanEnum(position); // can avoid seek + } + + //random-access: must seek + seekEnum(position / enumerator->indexInterval); + + //Get the Term at position + return scanEnum(position); +} + +// TODO: currently there is no way of cleaning up a thread, if the thread ends. +// we are stuck with the terminfosreader of that thread. Hopefully this won't +// be too big a problem... solutions anyone? +SegmentTermEnum* TermInfosReader::getEnum() +{ + SegmentTermEnum* termEnum = enumerators.get(); + if (termEnum == NULL) { + termEnum = terms(); + enumerators.set(termEnum); + } + return termEnum; +} + +TermInfo* TermInfosReader::get(const Term* term) +{ + //Func - Returns a TermInfo for a term + //Pre - term holds a valid reference to term + //Post - if term can be found its TermInfo has been returned otherwise NULL + + //If the size of the enumeration is 0 then no Terms have been read + if (_size == 0) + return NULL; + + ensureIndexIsRead(); + + // optimize sequential access: first try scanning cached enum w/o seeking + SegmentTermEnum* enumerator = getEnum(); + + // optimize sequential access: first try scanning cached enumerator w/o seeking + // if the current term of the enumeration enumerator is not at the end + if (enumerator->term(false) != NULL + // AND there exists a previous current called prev and term is + // positioned after this prev + && ((enumerator->prev != NULL && term->compareTo(enumerator->prev) > 0) + // OR term is positioned at the same position as the current of + // enumerator or at a higher position + || term->compareTo(enumerator->term(false)) >= 0)) { + //Calculate the offset for the position + int32_t _enumOffset = (int32_t) + (enumerator->position / enumerator->indexInterval) + 1; + + // but before end of block the length of indexTerms (the number of + // terms in enumerator) equals _enum_offset + if (indexTermsLength == _enumOffset + // OR term is positioned in front of term found at _enumOffset in + // indexTerms + || term->compareTo(&indexTerms[_enumOffset]) < 0) { + //no need to seek, retrieve the TermInfo for term + return scanEnum(term); + } + } + + //Reposition current term in the enumeration + seekEnum(getIndexOffset(term)); + //Return the TermInfo for term + return scanEnum(term); +} + +int64_t TermInfosReader::getPosition(const Term* term) +{ + //Func - Returns the position of a Term in the set + //Pre - term holds a valid reference to a Term + // enumerator != NULL + //Post - If term was found then its position is returned otherwise -1 + + //if the enumeration is empty then return -1 + if (_size == 0) + return -1; + + ensureIndexIsRead(); + + //Retrieve the indexOffset for term + int32_t indexOffset = getIndexOffset(term); + seekEnum(indexOffset); + + SegmentTermEnum* enumerator = getEnum(); + + while(term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) {} + + if (term->equals(enumerator->term(false))) + return enumerator->position; + + return -1; +} + +SegmentTermEnum* TermInfosReader::terms(const Term* term) +{ + //Func - Returns an enumeration of terms starting at or after the named term. + // If term is null then enumerator is set to the beginning + //Pre - term holds a valid reference to a Term + // enumerator != NULL + //Post - An enumeration of terms starting at or after the named term has been returned + + SegmentTermEnum* enumerator = NULL; + if (term != NULL) { + //Seek enumerator to term; delete the new TermInfo that's returned. + TermInfo* ti = get(term); + _CLDELETE(ti); + enumerator = getEnum(); + } else { + enumerator = origEnum; + } + //Clone the entire enumeration + SegmentTermEnum* cln = enumerator->clone(); + + //Check if cln points to a valid instance + CND_CONDITION(cln != NULL, "cln is NULL"); + + return cln; +} + +void TermInfosReader::ensureIndexIsRead() +{ + //Func - Reads the term info index file or .tti file. + // This file contains every IndexInterval-th entry from the .tis file, + // along with its location in the "tis" file. This is designed to be + // read entirely into memory and used to provide random access to the + // "tis" file. + //Pre - indexTerms = NULL + // indexInfos = NULL + // indexPointers = NULL + //Post - The term info index file has been read into memory + + SCOPED_LOCK_MUTEX(THIS_LOCK) + + if ( indexTerms != NULL ) + return; + + try { + indexTermsLength = (size_t)indexEnum->size; + + // Instantiate an block of Term's,so that each one doesn't have to be new'd + indexTerms = _CL_NEWARRAY(Term,indexTermsLength); + + // Check if is indexTerms is a valid array + CND_CONDITION(indexTerms != NULL, + "No memory could be allocated for indexTerms"); + + // Instantiate an big block of TermInfo's, so that each one doesn't + // have to be new'd + indexInfos = _CL_NEWARRAY(TermInfo,indexTermsLength); + + // Check if is indexInfos is a valid array + CND_CONDITION(indexInfos != NULL, + "No memory could be allocated for indexInfos"); + + // Instantiate an array indexPointers that contains pointers to the + // term info index file + indexPointers = _CL_NEWARRAY(int64_t,indexTermsLength); + + // Check if is indexPointers is a valid array + CND_CONDITION(indexPointers != NULL, + "No memory could be allocated for indexPointers"); + + //Iterate through the terms of indexEnum + for (int32_t i = 0; indexEnum->next(); ++i) { + indexTerms[i].set(indexEnum->term(false), indexEnum->term(false)->text()); + indexEnum->getTermInfo(&indexInfos[i]); + indexPointers[i] = indexEnum->indexPointer; + } + } _CLFINALLY ( + indexEnum->close(); + // Close and delete the IndexInput is. The close is done by the destructor. + _CLDELETE( indexEnum->input ); + _CLDELETE( indexEnum ); + ); +} + +int32_t TermInfosReader::getIndexOffset(const Term* term) +{ + //Func - Returns the offset of the greatest index entry which is less than + // or equal to term. + //Pre - term holds a reference to a valid term + // indexTerms != NULL + //Post - The new offset has been returned + + //Check if is indexTerms is a valid array + CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL"); + + int32_t lo = 0; + int32_t hi = indexTermsLength - 1; + int32_t mid; + int32_t delta; + + while (hi >= lo) { + //Start in the middle betwee hi and lo + mid = (lo + hi) >> 1; + + //Check if is indexTerms[mid] is a valid instance of Term + CND_PRECONDITION(&indexTerms[mid] != NULL, "indexTerms[mid] is NULL"); + CND_PRECONDITION(mid < indexTermsLength, "mid >= indexTermsLength"); + + //Determine if term is before mid or after mid + delta = term->compareTo(&indexTerms[mid]); + if (delta < 0) { + //Calculate the new hi + hi = mid - 1; + } else if (delta > 0) { + //Calculate the new lo + lo = mid + 1; + } else { + //term has been found so return its position + return mid; + } + } + // the new starting offset + return hi; +} + +void TermInfosReader::seekEnum(const int32_t indexOffset) +{ + //Func - Reposition the current Term and TermInfo to indexOffset + //Pre - indexOffset >= 0 + // indexTerms != NULL + // indexInfos != NULL + // indexPointers != NULL + //Post - The current Term and Terminfo have been repositioned to indexOffset + + CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number"); + CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL"); + CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL"); + CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL"); + + SegmentTermEnum* enumerator = getEnum(); + enumerator->seek(indexPointers[indexOffset], + (indexOffset * enumerator->indexInterval) - 1, + &indexTerms[indexOffset], &indexInfos[indexOffset]); +} + +TermInfo* TermInfosReader::scanEnum(const Term* term) +{ + //Func - Scans the Enumeration of terms for term and returns the + // corresponding TermInfo instance if found. The search is started + // from the current term. + //Pre - term contains a valid reference to a Term + // enumerator != NULL + //Post - if term has been found the corresponding TermInfo has been returned + // otherwise NULL has been returned + + SegmentTermEnum* enumerator = getEnum(); + enumerator->scanTo(term); + + //Check if the at the position the Term term can be found + if (enumerator->term(false) != NULL && term->equals(enumerator->term(false))) { + //Return the TermInfo instance about term + return enumerator->getTermInfo(); + } + + //term was not found so no TermInfo can be returned + return NULL; +} + +Term* TermInfosReader::scanEnum(const int32_t position) +{ + //Func - Scans the enumeration to the requested position and returns the + // Term located at that position + //Pre - position > = 0 + // enumerator != NULL + //Post - The Term at the requested position has been returned + + SegmentTermEnum* enumerator = getEnum(); + + // As long the position of the enumeration enumerator is smaller than the + // requested one + while(enumerator->position < position) { + //Move the current of enumerator to the next + if (!enumerator->next()) { + //If there is no next it means that the requested position was to big + return NULL; + } + } + + //Return the Term a the requested position + return enumerator->term(); +} + +CL_NS_END |