summaryrefslogtreecommitdiffstats
path: root/3rdparty/clucene/src/CLucene/index/MultiReader.cpp
diff options
context:
space:
mode:
Diffstat (limited to '3rdparty/clucene/src/CLucene/index/MultiReader.cpp')
-rw-r--r--3rdparty/clucene/src/CLucene/index/MultiReader.cpp722
1 files changed, 722 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/index/MultiReader.cpp b/3rdparty/clucene/src/CLucene/index/MultiReader.cpp
new file mode 100644
index 000000000..1260d04dc
--- /dev/null
+++ b/3rdparty/clucene/src/CLucene/index/MultiReader.cpp
@@ -0,0 +1,722 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/StdHeader.h"
+#include "MultiReader.h"
+
+#include "IndexReader.h"
+#include "CLucene/document/Document.h"
+#include "Terms.h"
+#include "SegmentMergeQueue.h"
+
+CL_NS_USE(store)
+CL_NS_USE(util)
+CL_NS_DEF(index)
+
+MultiReader::MultiReader(IndexReader** subReaders):
+    IndexReader((subReaders != NULL && subReaders[0] != NULL) ? subReaders[0]->getDirectory() : NULL),
+    normsCache(true, true)
+{
+//Func - Constructor
+//Pre  - subReaders is NULL or points to at least one readable element
+//Post - The IndexReader base received the directory of subReaders[0], or NULL
+//       when there is no first reader; initialize() has processed subReaders
+
+    this->initialize(subReaders);
+}
+
+MultiReader::MultiReader(Directory* directory, SegmentInfos* sis, IndexReader** subReaders):
+    IndexReader(directory, sis, false),
+    normsCache(true, true)
+{
+//Func - Constructor taking an explicit directory and segment infos
+//Pre  - true
+//Post - directory, sis and false were forwarded to the IndexReader base
+//       constructor; initialize() has processed subReaders
+
+    this->initialize(subReaders);
+}
+
+
+MultiReader::~MultiReader() {
+//Func - Destructor
+//Pre  - true
+//Post - The ones and starts arrays, every reader stored in subReaders and
+//       the subReaders array itself have been released
+
+    _CLDELETE_ARRAY(ones);
+    _CLDELETE_ARRAY(starts);
+
+    //Release each sub reader before releasing the array that holds them
+    if (subReaders != NULL) {
+        int32_t idx = 0;
+        while (idx < subReadersLength) {
+            _CLDELETE(subReaders[idx]);
+            ++idx;
+        }
+    }
+    _CLDELETE_ARRAY(subReaders);
+}
+
+void MultiReader::initialize(IndexReader** subReaders){
+//Func - Stores the reader array and derives the document numbering from it
+//Pre  - subReaders is NULL or an array terminated by a NULL entry
+//Post - subReadersLength holds the number of entries before the terminator,
+//       starts[i] holds the first document number of reader i,
+//       starts[subReadersLength] and _maxDoc hold the sum of all maxDoc() values,
+//       _hasDeletions is true if and only if some reader reported deletions
+
+    this->subReadersLength = 0;
+    this->subReaders = subReaders;
+
+    //count the entries up to the NULL terminator
+    if ( subReaders != NULL ){
+        while ( subReaders[subReadersLength] != NULL ){
+            subReadersLength++;
+        }
+    }
+    _maxDoc = 0;
+    _numDocs = -1;    // -1 marks the cached count as not computed yet (see numDocs())
+    ones = NULL;
+    //Give the flag a defined starting value: the loop below only ever raises it
+    _hasDeletions = false;
+
+    starts = _CL_NEWARRAY(int32_t,subReadersLength + 1); // build starts array
+    for (int32_t i = 0; i < subReadersLength; i++) {
+        starts[i] = _maxDoc;
+
+        // compute maxDocs
+        _maxDoc += subReaders[i]->maxDoc();
+        if (subReaders[i]->hasDeletions())
+            _hasDeletions = true;
+    }
+    //the extra last slot holds the total number of documents
+    starts[subReadersLength] = _maxDoc;
+}
+
+bool MultiReader::getTermFreqVectors(int32_t n, Array<TermFreqVector*>& result){
+    //Locate the sub reader holding document n and translate n into that
+    //reader's own numbering before forwarding the call
+    const int32_t segment = readerIndex(n);
+    const int32_t localDoc = n - starts[segment];
+    return subReaders[segment]->getTermFreqVectors(localDoc, result);
+}
+
+TermFreqVector* MultiReader::getTermFreqVector(int32_t n, const TCHAR* field){
+    //Forward to the sub reader holding document n, using its local document number
+    const int32_t segment = readerIndex(n);
+    const int32_t localDoc = n - starts[segment];
+    return subReaders[segment]->getTermFreqVector(localDoc, field);
+}
+
+
+int32_t MultiReader::numDocs() {
+//Func - Returns the sum of numDocs() over all sub readers
+//Post - The sum is kept in _numDocs; -1 there means it must be recomputed
+
+    SCOPED_LOCK_MUTEX(THIS_LOCK)
+    if (_numDocs != -1)
+        return _numDocs;                      // cached value is still valid
+
+    int32_t total = 0;
+    int32_t idx = 0;
+    while (idx < subReadersLength) {
+        total += subReaders[idx]->numDocs();
+        ++idx;
+    }
+    _numDocs = total;
+    return _numDocs;
+}
+
+int32_t MultiReader::maxDoc() const {
+    //_maxDoc is the sum of the sub readers' maxDoc() values computed in initialize()
+    return this->_maxDoc;
+}
+
+bool MultiReader::document(int32_t n, CL_NS(document)::Document* doc){
+    //Route the request to the sub reader owning n, using its local document number
+    const int32_t segment = readerIndex(n);
+    IndexReader* owner = subReaders[segment];
+    return owner->document(n - starts[segment], doc);
+}
+
+bool MultiReader::isDeleted(const int32_t n) {
+    //Ask the sub reader owning n, using its local document number
+    const int32_t segment = readerIndex(n);
+    IndexReader* owner = subReaders[segment];
+    return owner->isDeleted(n - starts[segment]);
+}
+
+uint8_t* MultiReader::norms(const TCHAR* field){
+//Func - Returns an array with one norm byte per document for field
+//Post - A previously cached array is returned when normsCache has one.
+//       When no sub reader has norms for field, the fakeNorms() array is returned.
+//       Otherwise a new array of maxDoc() bytes is filled by the sub readers,
+//       stored in normsCache under a private copy of field and returned.
+
+    SCOPED_LOCK_MUTEX(THIS_LOCK)
+    uint8_t* cached = normsCache.get(field);
+    if (cached != NULL)
+        return cached;                        // cache hit
+
+    if ( !hasNorms(field) )
+        return fakeNorms();
+
+    uint8_t* merged = _CL_NEWARRAY(uint8_t,maxDoc());
+    int32_t idx = 0;
+    while (idx < subReadersLength) {
+        //each sub reader writes its norms at its own start offset
+        subReaders[idx]->norms(field, merged + starts[idx]);
+        ++idx;
+    }
+
+    //The cache must not keep the caller's string: it may be deleted while the
+    //map still uses it as a key, so a duplicate of field is stored instead.
+    TCHAR* ownedKey = STRDUP_TtoT(field);
+    normsCache.put(ownedKey, merged);
+
+    return merged;
+}
+
+void MultiReader::norms(const TCHAR* field, uint8_t* result) {
+//Func - Writes the norms of field for all documents into result
+//Pre  - result has room for at least maxDoc() bytes
+//Post - If a cached array exists, or no sub reader has norms for field (in
+//       which case the fakeNorms() array is used), its maxDoc() bytes have
+//       been copied into result. Afterwards every sub reader has written its
+//       norms into result at its own start offset.
+
+    SCOPED_LOCK_MUTEX(THIS_LOCK)
+    uint8_t* bytes = normsCache.get(field);
+    if (bytes==NULL && !hasNorms(field))
+        bytes=fakeNorms();
+
+    if (bytes != NULL){ // cache hit
+        //Norm arrays hold one uint8_t per document (see norms(field)), so the
+        //copy must be maxDoc() bytes; sizing it by int32_t overran both buffers.
+        const int32_t len = maxDoc();
+        memcpy(result, bytes, len * sizeof(uint8_t));
+    }
+
+    for (int32_t i = 0; i < subReadersLength; i++) // read from segments
+        subReaders[i]->norms(field, result + starts[i]);
+}
+
+
+void MultiReader::doSetNorm(int32_t n, const TCHAR* field, uint8_t value){
+    //The cached array for field is about to become stale, so drop it first
+    normsCache.remove(field);
+
+    //Apply the change in the sub reader owning n, using its local document number
+    const int32_t segment = readerIndex(n);
+    const int32_t localDoc = n - starts[segment];
+    subReaders[segment]->setNorm(localDoc, field, value);
+}
+
+TermEnum* MultiReader::terms() const {
+    //A NULL start term makes MultiTermEnum enumerate the terms of every sub reader
+    TermEnum* merged = _CLNEW MultiTermEnum(subReaders, starts, NULL);
+    return merged;
+}
+
+TermEnum* MultiReader::terms(const Term* term) const {
+    //term is handed to MultiTermEnum, which passes it to each sub reader's terms(term)
+    TermEnum* merged = _CLNEW MultiTermEnum(subReaders, starts, term);
+    return merged;
+}
+
+int32_t MultiReader::docFreq(const Term* t) const {
+    //The frequency over the whole index is the sum over all sub readers
+    int32_t sum = 0;
+    int32_t idx = 0;
+    while (idx < subReadersLength) {
+        sum += subReaders[idx]->docFreq(t);
+        ++idx;
+    }
+    return sum;
+}
+
+TermDocs* MultiReader::termDocs() const {
+    //Returns a newly allocated MultiTermDocs over the sub readers and their start offsets
+    return _CLNEW MultiTermDocs(subReaders, starts);
+}
+
+TermPositions* MultiReader::termPositions() const {
+    //Returns a newly allocated MultiTermPositions over the sub readers and their start offsets
+    return (TermPositions*)_CLNEW MultiTermPositions(subReaders, starts);
+}
+
+void MultiReader::doDelete(const int32_t n) {
+    //The cached document count is no longer valid once a document is deleted
+    _numDocs = -1;
+
+    //Delete in the sub reader owning n, using its local document number
+    const int32_t segment = readerIndex(n);
+    const int32_t localDoc = n - starts[segment];
+    subReaders[segment]->deleteDocument(localDoc);
+
+    _hasDeletions = true;
+}
+
+int32_t MultiReader::readerIndex(const int32_t n) const {
+//Func - Binary search over the first subReadersLength entries of starts
+//Post - Returns the index of the sub reader whose start is the largest one
+//       not greater than n. When several consecutive readers share that
+//       start, the last of them is returned.
+
+    int32_t low = 0;
+    int32_t high = subReadersLength - 1;
+
+    while (low <= high) {
+        int32_t probe = (low + high) >> 1;
+        const int32_t probeStart = starts[probe];
+
+        if (n == probeStart) {
+            //exact match: move on to the last reader having this same start
+            for ( ; probe+1 < subReadersLength && starts[probe+1] == probeStart; ++probe) {
+            }
+            return probe;
+        }
+
+        if (n < probeStart)
+            high = probe - 1;
+        else
+            low = probe + 1;
+    }
+    //no exact match: high now indexes the last start that is smaller than n
+    return high;
+}
+
+bool MultiReader::hasNorms(const TCHAR* field) {
+    //True as soon as one sub reader reports norms for field; later readers are not asked
+    bool found = false;
+    for (int idx = 0; !found && idx < subReadersLength; ++idx)
+        found = subReaders[idx]->hasNorms(field);
+    return found;
+}
+uint8_t* MultiReader::fakeNorms() {
+    //The array is created on first use and kept in ones until the destructor frees it
+    if (ones != NULL)
+        return ones;
+    ones = SegmentReader::createFakeNorms(maxDoc());
+    return ones;
+}
+
+void MultiReader::doUndeleteAll(){
+    //Undelete in every sub reader, then reset the deletion flag and the cached count
+    int32_t idx = 0;
+    while (idx < subReadersLength) {
+        subReaders[idx]->undeleteAll();
+        ++idx;
+    }
+    _hasDeletions = false;
+    _numDocs = -1;      // force numDocs() to recompute
+}
+void MultiReader::doCommit() {
+    //Committing the multi reader means committing each sub reader in order
+    int32_t idx = 0;
+    while (idx < subReadersLength) {
+        subReaders[idx]->commit();
+        ++idx;
+    }
+}
+
+void MultiReader::doClose() {
+    //Close every sub reader while holding this reader's lock;
+    //the readers themselves are deleted in the destructor
+    SCOPED_LOCK_MUTEX(THIS_LOCK)
+    int32_t idx = 0;
+    while (idx < subReadersLength) {
+        subReaders[idx]->close();
+        ++idx;
+    }
+}
+
+
+void MultiReader::getFieldNames(FieldOption fldOption, StringArrayWithDeletor& retarray){
+//Func - Collects the field names of all sub readers without duplicates
+//Post - retarray has received one newly duplicated string per distinct name
+
+    StringArrayWithDeletor collected;
+    CLHashList<TCHAR*> uniqueNames;
+
+    for (int32_t r = 0; r < subReadersLength; ++r) {
+        //collected is not emptied between readers, so names of earlier readers
+        //are seen again; the lookup below keeps them from being added twice
+        subReaders[r]->getFieldNames(fldOption, collected);
+
+        for (StringArrayWithDeletor::iterator name = collected.begin(); name != collected.end(); ++name) {
+            if ( uniqueNames.find(*name) != uniqueNames.end() )
+                continue;                         // already known
+            uniqueNames.insert(STRDUP_TtoT(*name));
+        }
+    }
+
+    //hand the duplicated strings over to the caller; they were copied above,
+    //so no further copy is made here
+    for (CLHashList<TCHAR*>::iterator unique = uniqueNames.begin(); unique != uniqueNames.end(); ++unique)
+        retarray.push_back(*unique);
+}
+
+
+MultiTermDocs::MultiTermDocs(){
+//Func - Default constructor
+//       Exists so that the constructor of MultiTermPositions can set up
+//       the instance by itself
+//Pre  - true
+//Post - An empty MultiTermDocs: no readers, no starts, no term, no current
+//       TermDocs, and counters at zero
+
+    //pointers
+    subReaders = NULL;
+    starts = NULL;
+    readerTermDocs = NULL;
+    current = NULL;
+    term = NULL;
+
+    //counters
+    subReadersLength = 0;
+    base = 0;
+    pointer = 0;
+}
+
+MultiTermDocs::MultiTermDocs(IndexReader** r, const int32_t* s){
+//Func - Constructor
+//Pre  - r is NULL or an array of readers terminated by a NULL entry
+//       s != NULL
+//Post - The instance refers to r and s (neither is copied). When r holds at
+//       least one reader, readerTermDocs is an array of subReadersLength+1
+//       NULL entries; otherwise readerTermDocs is NULL.
+
+    subReadersLength = 0;
+    subReaders = r;
+
+    CND_PRECONDITION(s != NULL, "s is NULL");
+
+    //count readers up to the NULL terminator
+    if ( subReaders != NULL ){
+        for ( ; subReaders[subReadersLength] != NULL; ++subReadersLength ) {
+        }
+    }
+
+    starts = s;
+    base = 0;
+    pointer = 0;
+    current = NULL;
+    term = NULL;
+    readerTermDocs = NULL;
+
+    //Nothing more to set up without readers
+    if (subReaders == NULL || subReadersLength <= 0)
+        return;
+
+    const int32_t slots = subReadersLength + 1;
+    readerTermDocs = _CL_NEWARRAY(TermDocs*, slots);
+
+    CND_CONDITION(readerTermDocs != NULL,"No memory could be allocated for readerTermDocs");
+
+    //Entries are filled on demand by termDocs(i), so start with all NULLs
+    for ( int32_t k = 0; k < slots; ++k ){
+        readerTermDocs[k] = NULL;
+    }
+}
+
+MultiTermDocs::~MultiTermDocs(){
+//Func - Destructor
+//Pre  - true
+//Post - close() has released everything this instance owns
+
+    this->close();
+}
+
+
+TermPositions* MultiTermDocs::__asTermPositions(){
+    //A plain MultiTermDocs offers no TermPositions view of itself
+    TermPositions* none = NULL;
+    return none;
+}
+
+int32_t MultiTermDocs::doc() const {
+    //Returns the current document number, shifted by the start of the current segment
+    CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was called");
+    const int32_t localDoc = current->doc();
+    return base + localDoc;
+}
+int32_t MultiTermDocs::freq() const {
+    //Returns the frequency reported by the current segment's TermDocs
+    CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was called");
+    const int32_t frequency = current->freq();
+    return frequency;
+}
+
+void MultiTermDocs::seek(TermEnum* termEnum){
+    //term(false) is used here; seek(Term*) takes its own reference on the term
+    Term* enumTerm = termEnum->term(false);
+    seek(enumTerm);
+}
+
+void MultiTermDocs::seek( Term* tterm) {
+//Func - Resets the instance for a new search
+//Pre  - tterm != NULL
+//Post - term refers to tterm, and base, pointer and current are back at
+//       their initial values
+
+    CND_PRECONDITION(tterm != NULL, "tterm is NULL");
+
+    //The new reference is taken BEFORE the old one is released. That order
+    //keeps the reference count correct even when seek is called with
+    //this->term as its argument (seek(this->term)).
+    Term* incoming = _CL_POINTER(tterm);
+
+    //Release the reference to whatever term pointed to so far
+    _CLDECDELETE(term);
+
+    term = incoming;
+
+    //Start again at the first segment
+    current = NULL;
+    pointer = 0;
+    base = 0;
+}
+
+bool MultiTermDocs::next() {
+//Func - Moves to the next document of the current term
+//Post - Returns true when the current segment, or a later one, had another
+//       document; returns false once all segments are used up
+
+    for (;;) {
+        if (current != NULL && current->next())
+            return true;
+
+        if (pointer >= subReadersLength)
+            return false;
+
+        //switch to the next segment and try again
+        base = starts[pointer];
+        current = termDocs(pointer++);
+    }
+}
+
+int32_t MultiTermDocs::read(int32_t* docs, int32_t* freqs, int32_t length) {
+//Func - Reads a batch of documents and frequencies from one segment
+//Post - Returns the count reported by that segment's read(), with the
+//       document numbers in docs shifted by the segment's start;
+//       returns 0 when no segment is left
+
+    for (;;) {
+        //make sure there is a segment to read from
+        while (current == NULL) {
+            if (pointer >= subReadersLength)
+                return 0;
+            base = starts[pointer];
+            current = termDocs(pointer++);
+        }
+
+        const int32_t count = current->read(docs, freqs, length);
+        if (count != 0) {
+            //turn segment-local document numbers into global ones
+            const int32_t offset = base;
+            for (int32_t k = 0; k < count; ++k)
+                docs[k] += offset;
+            return count;
+        }
+
+        //this segment is used up, move on to the next one
+        current = NULL;
+    }
+}
+
+bool MultiTermDocs::skipTo(const int32_t target) {
+    //Always advances at least once, then keeps advancing until doc() >= target
+    while (next()) {
+        if (target <= doc())
+            return true;
+    }
+    return false;       // ran out of documents first
+}
+
+void MultiTermDocs::close() {
+//Func - Closes all TermDocs managed by this instance
+//Pre  - true
+//Post - Every TermDocs held in readerTermDocs has been closed and deleted,
+//       the array itself has been released, the reference on term has been
+//       dropped, and current, base and pointer have been reset
+
+    if (readerTermDocs != NULL){
+        int32_t idx = 0;
+        while (idx < subReadersLength) {
+            TermDocs* entry = readerTermDocs[idx];
+            ++idx;
+
+            //slots never requested through termDocs(i) are still NULL
+            if (entry == NULL)
+                continue;
+
+            entry->close();
+            _CLDELETE(entry);
+        }
+
+        _CLDELETE_ARRAY(readerTermDocs);
+    }
+
+    //current pointed to a member of readerTermDocs; make sure it does not
+    //point to released memory now.
+    current = NULL;
+    base = 0;
+    pointer = 0;
+
+    _CLDECDELETE(term);
+}
+
+TermDocs* MultiTermDocs::termDocs(const IndexReader* reader) const {
+    //Returns whatever the reader's own termDocs() produces
+    return reader->termDocs();
+}
+
+TermDocs* MultiTermDocs::termDocs(const int32_t i) const {
+//Func - Returns the TermDocs of segment i, positioned at the current term
+//Post - NULL when no term has been set. Otherwise the TermDocs kept in
+//       readerTermDocs[i], created on first use, after seek(term) was
+//       called on it.
+
+    if (term == NULL)
+        return NULL;
+
+    TermDocs*& slot = readerTermDocs[i];
+    if (slot == NULL)
+        slot = termDocs(subReaders[i]);     // first request for this segment
+
+    slot->seek(term);
+    return slot;
+}
+
+
+MultiTermEnum::MultiTermEnum(
+    IndexReader** subReaders, const int32_t *starts, const Term* t){
+//Func - Constructor
+//       Opens the term enumeration of every reader and collects them in a queue
+//Pre  - subReaders is NULL or an array of IndexReader instances terminated
+//       by a NULL entry
+//       starts != NULL and has an entry for every reader
+//       t may be NULL
+//Post - queue holds one SegmentMergeInfo for each reader whose enumeration
+//       has a term to offer. When t != NULL and the queue is not empty,
+//       next() has been called once.
+
+    //count readers up to the NULL terminator
+    int32_t readerCount = 0;
+    if ( subReaders != NULL ){
+        while ( subReaders[readerCount] != NULL )
+            ++readerCount;
+    }
+    CND_PRECONDITION(starts != NULL,"starts is NULL");
+
+    _docFreq = 0;
+    _term = NULL;
+    queue = _CLNEW SegmentMergeQueue(readerCount);
+
+    CND_CONDITION (queue != NULL, "Could not allocate memory for queue");
+
+    for ( int32_t idx = 0; idx < readerCount; ++idx ) {
+        IndexReader* segReader = subReaders[idx];
+
+        //With a start term the enumeration begins at or after t,
+        //without one it covers all terms of the reader
+        TermEnum* segTerms = (t != NULL) ? segReader->terms(t) : segReader->terms();
+
+        SegmentMergeInfo* info = _CLNEW SegmentMergeInfo(starts[idx], segTerms, segReader);
+
+        // term(false) is required below because otherwise a reference is
+        // leaked: false asks for an unowned reference. (Credits for DSR)
+        bool hasTerm;
+        if (t == NULL)
+            hasTerm = info->next();
+        else
+            hasTerm = (segTerms->term(false) != NULL);
+
+        if (hasTerm) {
+            queue->put(info);
+            continue;
+        }
+
+        //nothing to enumerate for this reader: close and delete its info
+        info->close();
+        _CLDELETE(info);
+    }
+
+    //Position on the first term when a start term was given
+    if (t != NULL && queue->size() > 0) {
+        next();
+    }
+}
+
+MultiTermEnum::~MultiTermEnum(){
+//Func - Destructor
+//Pre  - true
+//Post - The enumeration has been closed and the queue has been deleted
+
+    this->close();
+    _CLDELETE(queue);
+}
+
+bool MultiTermEnum::next(){
+//Func - Moves the current term to the next one in the set of enumerations
+//Pre  - true
+//Post - Returns true when there was a next term; _term refers to it and
+//       _docFreq is the sum of the document frequencies of all segments
+//       offering that term.
+//       Returns false when the queue is empty; _term is NULL then.
+
+    SegmentMergeInfo* head = queue->top();
+
+    //Whatever happens next, our reference to the previous term is dropped
+    if (head == NULL) {
+        _CLDECDELETE(_term);
+        _term = NULL;
+        return false;
+    }
+    _CLDECDELETE(_term);
+
+    //Take our own reference to the term at the top of the queue
+    _term = _CL_POINTER(head->term);
+    _docFreq = 0;
+
+    //Consume every queue entry that is positioned on this same term
+    for (;;) {
+        if (head == NULL)
+            break;
+        if (_term->compareTo(head->term) != 0)
+            break;
+
+        //pop() only removes the entry from the queue, head itself stays alive
+        queue->pop();
+        _docFreq += head->termEnum->docFreq();
+
+        if (head->next()){
+            //the segment has more terms: put it back into the queue
+            queue->put(head);
+        }else{
+            //the segment is used up
+            head->close();
+            _CLDELETE(head);
+        }
+        head = queue->top();
+    }
+
+    return true;
+}
+
+
+Term* MultiTermEnum::term() {
+//Func - Returns the current term of the set of enumerations
+//Pre  - next() must have been called once
+//Post - The term has been returned through _CL_POINTER, which is how this
+//       file takes an additional reference on a term
+
+    Term* shared = _CL_POINTER(_term);
+    return shared;
+}
+
+Term* MultiTermEnum::term(bool pointer) {
+    //pointer == false: hand out _term as it is
+    if ( !pointer )
+        return _term;
+    //pointer == true: hand out _term through _CL_POINTER, as term() does
+    return _CL_POINTER(_term);
+}
+
+int32_t MultiTermEnum::docFreq() const {
+//Func - Returns the document frequency of the current term in the set
+//Pre  - next() must have been called once
+//Post - The value summed up by the last call to next() has been returned
+
+    return this->_docFreq;
+}
+
+
+void MultiTermEnum::close() {
+//Func - Closes the set of enumerations in the queue
+//Pre  - queue holds a valid reference to a SegmentMergeQueue
+//Post - The reference on _term has been released and the queue has been closed
+
+    //Needed when this enumeration has not been exhausted yet
+    _CLDECDELETE(_term);
+
+    //Closing the queue destroys all SegmentMergeInfo instances still in it
+    this->queue->close();
+}
+
+
+
+
+
+MultiTermPositions::MultiTermPositions(IndexReader** r, const int32_t* s){
+//Func - Constructor
+//Pre  - r is NULL or an array of readers terminated by a NULL entry
+//       s != NULL
+//Post - The instance refers to r and s (neither is copied). When r holds at
+//       least one reader, readerTermDocs is an array of subReadersLength
+//       NULL entries; otherwise readerTermDocs is NULL.
+
+    subReaders = r;
+    subReadersLength = 0;
+    //count readers up to the NULL terminator
+    if ( subReaders != NULL ){
+        while ( subReaders[subReadersLength] != NULL )
+            subReadersLength ++ ;
+    }
+
+    CND_PRECONDITION(s != NULL, "s is NULL");
+
+    starts = s;
+    base = 0;
+    pointer = 0;
+    current = NULL;
+    term = NULL;
+
+    readerTermDocs = NULL;
+
+    //Check if there are readers
+    if(subReaders != NULL && subReadersLength > 0){
+        //The array stores TermDocs* values (see termDocs(i)) and is released
+        //through a TermDocs** in close(), so it is allocated with that element
+        //type rather than as SegmentTermPositions* reinterpreted by a cast.
+        readerTermDocs = _CL_NEWARRAY(TermDocs*,subReadersLength);
+
+        CND_CONDITION(readerTermDocs != NULL,"No memory could be allocated for readerTermDocs");
+
+        //Entries are filled on demand, so start with all NULLs
+        for ( int32_t i=0;i<subReadersLength;i++){
+            readerTermDocs[i]=NULL;
+        }
+    }
+}
+
+
+TermDocs* MultiTermPositions::__asTermDocs(){
+    //This object itself serves as the TermDocs view
+    TermDocs* self = (TermDocs*) this;
+    return self;
+}
+TermPositions* MultiTermPositions::__asTermPositions(){
+    //This object itself serves as the TermPositions view
+    TermPositions* self = (TermPositions*) this;
+    return self;
+}
+
+
+TermDocs* MultiTermPositions::termDocs(const IndexReader* reader) const {
+// In MultiTermPositions this->current must always be able to deliver
+// positions, not only documents and frequencies. This method therefore
+// obtains its result from the reader's termPositions() instead of its
+// termDocs(), and returns the TermDocs view of that object.
+
+    TermPositions* positions = reader->termPositions();
+    TermDocs* asDocs = positions->__asTermDocs();
+
+    CND_CONDITION(asDocs != NULL,
+        "Dynamic downcast in MultiTermPositions::termDocs from"
+        " TermPositions to TermDocs failed."
+        );
+    return asDocs;
+}
+
+int32_t MultiTermPositions::nextPosition() {
+    //Func - Returns the next position reported by the current segment
+    //Pre  - current != NULL
+    //Post - The result of nextPosition() on current's TermPositions view
+    //       has been returned
+    CND_PRECONDITION(current != NULL,"current is NULL");
+
+    TermPositions* positions = current->__asTermPositions();
+
+    CND_CONDITION(positions != NULL,
+        "Dynamic downcast in MultiTermPositions::nextPosition from"
+        " SegmentTermDocs to TermPositions failed."
+        )
+    return positions->nextPosition();
+}
+
+
+CL_NS_END