diff options
Diffstat (limited to '3rdparty/clucene/src/CLucene/index/DocumentWriter.cpp')
-rw-r--r-- | 3rdparty/clucene/src/CLucene/index/DocumentWriter.cpp | 571 |
1 files changed, 571 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/index/DocumentWriter.cpp b/3rdparty/clucene/src/CLucene/index/DocumentWriter.cpp new file mode 100644 index 000000000..dcbc31591 --- /dev/null +++ b/3rdparty/clucene/src/CLucene/index/DocumentWriter.cpp @@ -0,0 +1,571 @@ +/* + * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team + * + * Distributable under the terms of either the Apache License (Version 2.0) or + * the GNU Lesser General Public License, as specified in the COPYING file. + * + * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved. +*/ +#include "CLucene/StdHeader.h" + +#include "DocumentWriter.h" +#include "FieldInfos.h" +#include "IndexWriter.h" +#include "FieldsWriter.h" +#include "Term.h" +#include "TermInfo.h" +#include "TermInfosWriter.h" + +#include "CLucene/analysis/AnalysisHeader.h" + +#include "CLucene/search/Similarity.h" +#include "TermInfosWriter.h" +#include "FieldsWriter.h" + +CL_NS_USE(util) +CL_NS_USE(store) +CL_NS_USE(analysis) +CL_NS_USE(document) +CL_NS_DEF(index) + +/*Posting*/ + +DocumentWriter::Posting::Posting(Term* t, const int32_t position, + TermVectorOffsetInfo* offset) +{ + //Func - Constructor + //Pre - t contains a valid reference to a Term + //Post - Instance has been created + freq = 1; + + term = _CL_POINTER(t); + positions.values = (int32_t*)malloc(sizeof(int32_t)); + positions.values[0] = position; + positions.length = 1; + + if ( offset != NULL ){ + this->offsets.values =(TermVectorOffsetInfo*)malloc(sizeof(TermVectorOffsetInfo)); + this->offsets.values[0] = *offset; + this->offsets.length = 1; + } +} + +DocumentWriter::Posting::~Posting() +{ + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + + free(positions.values); + if ( this->offsets.values != NULL ) + free(this->offsets.values); + _CLDECDELETE(term); +} + +DocumentWriter::DocumentWriter(Directory* d, Analyzer* a, + CL_NS(search)::Similarity* sim, const int32_t mfl) + : analyzer(a) + , directory(d) + , maxFieldLength(mfl) + , fieldInfos(NULL) + , fieldLengths(NULL) + , similarity(sim) + , termIndexInterval(IndexWriter::DEFAULT_TERM_INDEX_INTERVAL) + , fieldPositions(NULL) + , fieldBoosts(NULL) + , termBuffer(_CLNEW Term) +{ + //Pre - d contains a valid reference to a Directory + // d contains a valid reference to a Analyzer + // mfl > 0 and contains the maximum field length + //Post - Instance has been created + + CND_PRECONDITION(((mfl > 0) || (mfl == IndexWriter::FIELD_TRUNC_POLICY__WARN)), + "mfl is 0 or smaller than IndexWriter::FIELD_TRUNC_POLICY__WARN") + + fieldInfos = NULL; + fieldLengths = NULL; +} + +DocumentWriter::DocumentWriter(CL_NS(store)::Directory* d, + CL_NS(analysis)::Analyzer* a, IndexWriter* writer) + : analyzer(a) + , directory(d) + , maxFieldLength(writer->getMaxFieldLength()) + , fieldInfos(NULL) + , fieldLengths(NULL) + , similarity(writer->getSimilarity()) + , termIndexInterval(writer->getTermIndexInterval()) + , fieldPositions(NULL) + , fieldBoosts(NULL) + , termBuffer(_CLNEW Term) +{ + //Pre - d contains a valid reference to a Directory + // d contains a valid reference to a Analyzer + // mfl > 0 and contains the maximum field length + //Post - Instance has been created + + CND_PRECONDITION(((maxFieldLength > 0) + || (maxFieldLength == IndexWriter::FIELD_TRUNC_POLICY__WARN)), + "mfl is 0 or smaller than IndexWriter::FIELD_TRUNC_POLICY__WARN") + + fieldInfos = NULL; + fieldLengths = NULL; + +} + +DocumentWriter::~DocumentWriter() +{ + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + clearPostingTable(); + _CLDELETE( fieldInfos ); + _CLDELETE_ARRAY(fieldLengths); + _CLDELETE_ARRAY(fieldPositions); + _CLDELETE_ARRAY(fieldBoosts); + _CLDELETE_ARRAY(fieldOffsets); + + _CLDECDELETE(termBuffer); +} + +void DocumentWriter::clearPostingTable() +{ + PostingTableType::iterator itr = postingTable.begin(); + while (itr != postingTable.end()){ + _CLDELETE(itr->second); + _CLLDECDELETE(itr->first); + ++itr; + } + postingTable.clear(); +} + +void DocumentWriter::addDocument(const QString& segment, Document* doc) +{ + CND_PRECONDITION(fieldInfos == NULL, "fieldInfos!=NULL") + + // write field names + fieldInfos = _CLNEW FieldInfos(); + fieldInfos->add(doc); + + QString buf = Misc::segmentname(segment, QLatin1String(".fnm")); + fieldInfos->write(directory, buf); + + // write field values + FieldsWriter fieldsWriter(directory, segment, fieldInfos); + try { + fieldsWriter.addDocument(doc); + } _CLFINALLY ( + fieldsWriter.close() + ); + + // clear postingTable + clearPostingTable(); + + int32_t fieldInfoSize = fieldInfos->size(); + fieldLengths = _CL_NEWARRAY(int32_t, fieldInfoSize); // init fieldLengths + fieldPositions = _CL_NEWARRAY(int32_t, fieldInfoSize); // init fieldPositions + fieldOffsets = _CL_NEWARRAY(int32_t, fieldInfoSize); // init fieldOffsets + fieldBoosts = _CL_NEWARRAY(qreal, fieldInfoSize); // init fieldBoosts + + qreal fbd = doc->getBoost(); + for (int32_t i = 0; i < fieldInfoSize; ++i) { + fieldLengths[i] = 0; + fieldPositions[i] = 0; + fieldOffsets[i] = 0; + //initialise fieldBoost array with default boost + fieldBoosts[i] = fbd; + } + + // invert doc into postingTable + invertDocument(doc); + + // sort postingTable into an array + Posting** postings = NULL; + int32_t postingsLength = 0; + sortPostingTable(postings, postingsLength); + + //DEBUG: + /*for (int32_t i = 0; i < postingsLength; i++) { + Posting* posting = postings[i]; + + TCHAR* b = posting->term->toString(); + _cout << b << " freq=" << posting->freq; + _CLDELETE(b); + + _cout << " pos=" << posting->positions[0]; + for (int32_t j = 1; j < posting->freq; j++) + _cout <<"," << posting->positions[j]; + + _cout << endl; + }*/ + + + // write postings + writePostings(postings, postingsLength, segment); + + // write norms of indexed fields + writeNorms(segment); + _CLDELETE_ARRAY(postings); +} + +void DocumentWriter::sortPostingTable(Posting**& array, int32_t& arraySize) +{ + // copy postingTable into an array + arraySize = postingTable.size(); + array = _CL_NEWARRAY(Posting*,arraySize); + PostingTableType::iterator postings = postingTable.begin(); + int32_t i=0; + while ( postings != postingTable.end() ){ + array[i] = (Posting*)postings->second; + postings++; + i++; + } + // sort the array + quickSort(array, 0, i - 1); +} + + +void DocumentWriter::invertDocument(const Document* doc) +{ + DocumentFieldEnumeration* fields = doc->fields(); + try { + while (fields->hasMoreElements()) { + Field* field = (Field*)fields->nextElement(); + const TCHAR* fieldName = field->name(); + const int32_t fieldNumber = fieldInfos->fieldNumber(fieldName); + + int32_t length = fieldLengths[fieldNumber]; // length of field + int32_t position = fieldPositions[fieldNumber]; // position in field + if (length>0) + position+=analyzer->getPositionIncrementGap(fieldName); + int32_t offset = fieldOffsets[fieldNumber]; // offset field + + if (field->isIndexed()) { + if (!field->isTokenized()) { // un-tokenized field + //FEATURE: this is bug in java: if using a Reader, then + //field value will not be added. With CLucene, an untokenized + //field with a reader will still be added (if it isn't stored, + //because if it's stored, then the reader has already been read. + const TCHAR* charBuf = NULL; + int64_t dataLen = 0; + + if (field->stringValue() == NULL && !field->isStored() ) { + CL_NS(util)::Reader* r = field->readerValue(); + // this call tries to read the entire stream + // this may invalidate the string for the further calls + // it may be better to do this via a FilterReader + // TODO make a better implementation of this + dataLen = r->read(charBuf, LUCENE_INT32_MAX_SHOULDBE); + if (dataLen == -1) + dataLen = 0; + //todo: would be better to pass the string length, in case + //a null char is passed, but then would need to test the output too. + } else { + charBuf = field->stringValue(); + dataLen = _tcslen(charBuf); + } + + if(field->isStoreOffsetWithTermVector()){ + TermVectorOffsetInfo tio; + tio.setStartOffset(offset); + tio.setEndOffset(offset + dataLen); + addPosition(fieldName, charBuf, position++, &tio ); + }else + addPosition(fieldName, charBuf, position++, NULL); + offset += dataLen; + length++; + } else { // field must be tokenized + CL_NS(util)::Reader* reader; // find or make Reader + bool delReader = false; + if (field->readerValue() != NULL) { + reader = field->readerValue(); + } else if (field->stringValue() != NULL) { + reader = _CLNEW CL_NS(util)::StringReader( + field->stringValue(),_tcslen(field->stringValue()), + false); + delReader = true; + } else { + _CLTHROWA(CL_ERR_IO,"field must have either String or Reader value"); + } + + try { + // Tokenize field and add to postingTable. + CL_NS(analysis)::TokenStream* stream = + analyzer->tokenStream(fieldName, reader); + + try { + CL_NS(analysis)::Token t; + int32_t lastTokenEndOffset = -1; + while (stream->next(&t)) { + position += (t.getPositionIncrement() - 1); + + if(field->isStoreOffsetWithTermVector()){ + TermVectorOffsetInfo tio; + tio.setStartOffset(offset + t.startOffset()); + tio.setEndOffset(offset + t.endOffset()); + addPosition(fieldName, t.termText(), position++, &tio); + } else + addPosition(fieldName, t.termText(), position++, NULL); + + lastTokenEndOffset = t.endOffset(); + length++; + // Apply field truncation policy. + if (maxFieldLength != IndexWriter::FIELD_TRUNC_POLICY__WARN) { + // The client programmer has explicitly authorized us to + // truncate the token stream after maxFieldLength tokens. + if ( length > maxFieldLength) + break; + } else if (length > IndexWriter::DEFAULT_MAX_FIELD_LENGTH) { + const TCHAR* errMsgBase = + _T("Indexing a huge number of tokens from a single") + _T(" field (\"%s\", in this case) can cause CLucene") + _T(" to use memory excessively.") + _T(" By default, CLucene will accept only %s tokens") + _T(" tokens from a single field before forcing the") + _T(" client programmer to specify a threshold at") + _T(" which to truncate the token stream.") + _T(" You should set this threshold via") + _T(" IndexReader::maxFieldLength (set to LUCENE_INT32_MAX") + _T(" to disable truncation, or a value to specify maximum number of fields)."); + + TCHAR defaultMaxAsChar[34]; + _i64tot(IndexWriter::DEFAULT_MAX_FIELD_LENGTH, + defaultMaxAsChar, 10 + ); + int32_t errMsgLen = _tcslen(errMsgBase) + + _tcslen(fieldName) + + _tcslen(defaultMaxAsChar); + TCHAR* errMsg = _CL_NEWARRAY(TCHAR,errMsgLen+1); + + _sntprintf(errMsg, errMsgLen,errMsgBase, fieldName, defaultMaxAsChar); + + _CLTHROWT_DEL(CL_ERR_Runtime,errMsg); + } + } // while token->next + + if(lastTokenEndOffset != -1 ) + offset += lastTokenEndOffset + 1; + } _CLFINALLY ( + stream->close(); + _CLDELETE(stream); + ); + } _CLFINALLY ( + if (delReader) { + _CLDELETE(reader); + } + ); + } // if/else field is to be tokenized + fieldLengths[fieldNumber] = length; // save field length + fieldPositions[fieldNumber] = position; // save field position + fieldBoosts[fieldNumber] *= field->getBoost(); + fieldOffsets[fieldNumber] = offset; + } // if field is to beindexed + } // while more fields available + } _CLFINALLY ( + _CLDELETE(fields); + ); +} + +void DocumentWriter::addPosition(const TCHAR* field, const TCHAR* text, + const int32_t position, TermVectorOffsetInfo* offset) +{ + termBuffer->set(field,text,false); + + Posting* ti = postingTable.get(termBuffer); + if (ti != NULL) { // word seen before + int32_t freq = ti->freq; + if (ti->positions.length == freq) { + // positions array is full, realloc its size + ti->positions.length = freq*2; + ti->positions.values = (int32_t*)realloc(ti->positions.values, ti->positions.length * sizeof(int32_t)); + } + ti->positions.values[freq] = position; // add new position + + if (offset != NULL) { + if (ti->offsets.length == freq){ + ti->offsets.length = freq*2; + ti->offsets.values = (TermVectorOffsetInfo*)realloc(ti->offsets.values, ti->offsets.length * sizeof(TermVectorOffsetInfo)); + } + ti->offsets[freq] = *offset; + } + + ti->freq = freq + 1; // update frequency + } else { // word not seen before + Term* term = _CLNEW Term( field, text, false); + postingTable.put(term, _CLNEW Posting(term, position, offset)); + } +} + +//static +void DocumentWriter::quickSort(Posting**& postings, const int32_t lo, const int32_t hi) +{ + if(lo >= hi) + return; + + int32_t mid = (lo + hi) / 2; + + if(postings[lo]->term->compareTo(postings[mid]->term) > 0) { + Posting* tmp = postings[lo]; + postings[lo] = postings[mid]; + postings[mid] = tmp; + } + + if(postings[mid]->term->compareTo(postings[hi]->term) > 0) { + Posting* tmp = postings[mid]; + postings[mid] = postings[hi]; + postings[hi] = tmp; + + if(postings[lo]->term->compareTo(postings[mid]->term) > 0) { + Posting* tmp2 = postings[lo]; + postings[lo] = postings[mid]; + postings[mid] = tmp2; + } + } + + int32_t left = lo + 1; + int32_t right = hi - 1; + + if (left >= right) + return; + + const Term* partition = postings[mid]->term; //not kept, so no need to finalize + + for( ;; ) { + while(postings[right]->term->compareTo(partition) > 0) + --right; + + while(left < right && postings[left]->term->compareTo(partition) <= 0) + ++left; + + if(left < right) { + Posting* tmp = postings[left]; + postings[left] = postings[right]; + postings[right] = tmp; + --right; + } else { + break; + } + } + + quickSort(postings, lo, left); + quickSort(postings, left + 1, hi); +} + +void DocumentWriter::writePostings(Posting** postings, + const int32_t postingsLength, const QString& segment) +{ + #define __DOCLOSE(obj) \ + if (obj!=NULL) { \ + try { \ + obj->close(); \ + _CLDELETE(obj); \ + } catch(CLuceneError &e) { \ + ierr = e.number(); \ + err = e.what(); \ + } catch(...) { \ + err = "Unknown error while closing posting tables"; \ + } \ + } + + IndexOutput* freq = NULL; + IndexOutput* prox = NULL; + TermInfosWriter* tis = NULL; + TermVectorsWriter* termVectorWriter = NULL; + try { + //open files for inverse index storage + QString buf = Misc::segmentname(segment, QLatin1String(".frq")); + freq = directory->createOutput(buf); + + buf = Misc::segmentname(segment, QLatin1String(".prx")); + prox = directory->createOutput(buf); + + tis = _CLNEW TermInfosWriter(directory, segment, fieldInfos, + termIndexInterval); + TermInfo* ti = _CLNEW TermInfo(); + const TCHAR* currentField = NULL; + for (int32_t i = 0; i < postingsLength; i++) { + Posting* posting = postings[i]; + + // add an entry to the dictionary with pointers to prox and freq files + ti->set(1, freq->getFilePointer(), prox->getFilePointer(), -1); + tis->add(posting->term, ti); + + // add an entry to the freq file + int32_t postingFreq = posting->freq; + if (postingFreq == 1) // optimize freq=1 + freq->writeVInt(1); // set low bit of doc num. + else { + freq->writeVInt(0); // the document number + freq->writeVInt(postingFreq); // frequency in doc + } + + int32_t lastPosition = 0; // write positions + for (int32_t j = 0; j < postingFreq; ++j) { // use delta-encoding + prox->writeVInt(posting->positions.values[j] - lastPosition); + lastPosition = posting->positions.values[j]; + } + + // check to see if we switched to a new field + const TCHAR* termField = posting->term->field(); + if ( currentField == NULL || _tcscmp(currentField,termField) != 0 ) { + //todo, can we do an intern'd check? + // changing field - see if there is something to save + currentField = termField; + FieldInfo* fi = fieldInfos->fieldInfo(currentField); + + if (fi->storeTermVector) { + if (termVectorWriter == NULL) { + termVectorWriter = _CLNEW TermVectorsWriter(directory, + segment, fieldInfos); + termVectorWriter->openDocument(); + } + termVectorWriter->openField(currentField); + } else if (termVectorWriter != NULL) { + termVectorWriter->closeField(); + } + } + if (termVectorWriter != NULL && termVectorWriter->isFieldOpen()) { + termVectorWriter->addTerm(posting->term->text(), postingFreq, + &posting->positions, &posting->offsets); + } + } + if (termVectorWriter != NULL) + termVectorWriter->closeDocument(); + _CLDELETE(ti); + } _CLFINALLY ( + const char* err = NULL; + int32_t ierr = 0; + + // make an effort to close all streams we can but remember and re-throw + // the first exception encountered in this process + __DOCLOSE(freq); + __DOCLOSE(prox); + __DOCLOSE(tis); + __DOCLOSE(termVectorWriter); + if (err != NULL) + _CLTHROWA(ierr,err); + ); +} + +void DocumentWriter::writeNorms(const QString& segment) +{ + for(int32_t n = 0; n < fieldInfos->size(); n++){ + FieldInfo* fi = fieldInfos->fieldInfo(n); + if(fi->isIndexed && !fi->omitNorms) { + qreal norm = fieldBoosts[n] * similarity->lengthNorm( + fi->name, fieldLengths[n]); + + QString fn(segment + QLatin1String(".f%1")); + IndexOutput* norms = directory->createOutput(fn.arg(n)); + try { + norms->writeByte(CL_NS(search)::Similarity::encodeNorm(norm)); + }_CLFINALLY ( + norms->close(); + _CLDELETE(norms); + ) + } + } +} + +CL_NS_END |