diff options
Diffstat (limited to '3rdparty/clucene/src/CLucene/index/TermVectorWriter.cpp')
-rw-r--r-- | 3rdparty/clucene/src/CLucene/index/TermVectorWriter.cpp | 349 |
1 files changed, 349 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/index/TermVectorWriter.cpp b/3rdparty/clucene/src/CLucene/index/TermVectorWriter.cpp new file mode 100644 index 000000000..276b1bbd0 --- /dev/null +++ b/3rdparty/clucene/src/CLucene/index/TermVectorWriter.cpp @@ -0,0 +1,349 @@ +/* + * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team + * + * Distributable under the terms of either the Apache License (Version 2.0) or + * the GNU Lesser General Public License, as specified in the COPYING file. + * + * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved. +*/ +#include "CLucene/StdHeader.h" +#include "TermVector.h" +#include "CLucene/util/Misc.h" + +CL_NS_USE(util) +CL_NS_DEF(index) + +const QLatin1String TermVectorsWriter::LUCENE_TVX_EXTENSION(".tvx"); +const QLatin1String TermVectorsWriter::LUCENE_TVD_EXTENSION(".tvd"); +const QLatin1String TermVectorsWriter::LUCENE_TVF_EXTENSION(".tvf"); + +TermVectorsWriter::TermVectorsWriter(CL_NS(store)::Directory* directory, + const QString& segment,FieldInfos* fieldInfos) +{ + // Open files for TermVector storage + tvx = directory->createOutput(segment + LUCENE_TVX_EXTENSION); + tvx->writeInt(FORMAT_VERSION); + + tvd = directory->createOutput(segment + LUCENE_TVD_EXTENSION); + tvd->writeInt(FORMAT_VERSION); + + tvf = directory->createOutput(segment + LUCENE_TVF_EXTENSION); + tvf->writeInt(FORMAT_VERSION); + + this->fieldInfos = fieldInfos; + + currentField = NULL; + currentDocPointer = -1; +} + +TermVectorsWriter::~TermVectorsWriter() +{ + if (tvx != NULL) { + tvx->close(); + _CLDELETE(tvx); + } + + if (tvd != NULL) { + tvd->close(); + _CLDELETE(tvd); + } + + if (tvf != NULL){ + tvf->close(); + _CLDELETE(tvf); + } +} + +void TermVectorsWriter::openDocument() +{ + closeDocument(); + currentDocPointer = tvd->getFilePointer(); +} + +void TermVectorsWriter::closeDocument() +{ + if (isDocumentOpen()) { + closeField(); + writeDoc(); + fields.clear(); + currentDocPointer = -1; + } +} + +bool TermVectorsWriter::isDocumentOpen() const +{ + return currentDocPointer != -1; +} + + +void TermVectorsWriter::openField(int32_t fieldNumber, + bool storePositionWithTermVector, bool storeOffsetWithTermVector) +{ + if (!isDocumentOpen()) + _CLTHROWA(CL_ERR_InvalidState,"Cannot open field when no document is open."); + + closeField(); + currentField = _CLNEW TVField(fieldNumber, storePositionWithTermVector, + storeOffsetWithTermVector); +} + +void TermVectorsWriter::openField(const TCHAR* field) +{ + FieldInfo* fieldInfo = fieldInfos->fieldInfo(field); + openField(fieldInfo->number, fieldInfo->storePositionWithTermVector, + fieldInfo->storeOffsetWithTermVector); +} + +void TermVectorsWriter::closeField() +{ + if (isFieldOpen()) { + /* DEBUG */ + //System.out.println("closeField()"); + /* DEBUG */ + + // save field and terms + writeField(); + fields.push_back(currentField); + terms.clear(); + currentField = NULL; + } +} + +bool TermVectorsWriter::isFieldOpen() const +{ + return currentField != NULL; +} + +void TermVectorsWriter::addTerm(const TCHAR* termText, int32_t freq, + Array<int32_t>* positions, Array<TermVectorOffsetInfo>* offsets) +{ + if (!isDocumentOpen()) + _CLTHROWA(CL_ERR_InvalidState, "Cannot add terms when document is not open"); + + if (!isFieldOpen()) + _CLTHROWA(CL_ERR_InvalidState, "Cannot add terms when field is not open"); + + addTermInternal(termText, freq, positions, offsets); +} + +void TermVectorsWriter::addTermInternal(const TCHAR* termText, int32_t freq, + Array<int32_t>* positions, Array<TermVectorOffsetInfo>* offsets) +{ + TVTerm* term = _CLNEW TVTerm(); + term->setTermText(termText); + term->freq = freq; + term->positions = positions; + term->offsets = offsets; + terms.push_back(term); +} + +void TermVectorsWriter::addAllDocVectors(Array<TermFreqVector*>& vectors) +{ + openDocument(); + + for (int32_t i = 0; i < vectors.length; ++i) { + bool storePositionWithTermVector = false; + bool storeOffsetWithTermVector = false; + + if ( vectors[i]->__asTermPositionVector() != NULL ) { + TermPositionVector* tpVector = vectors[i]->__asTermPositionVector(); + + if (tpVector->size() > 0 && tpVector->getTermPositions(0) != NULL) + storePositionWithTermVector = true; + if (tpVector->size() > 0 && tpVector->getOffsets(0) != NULL) + storeOffsetWithTermVector = true; + + FieldInfo* fieldInfo = fieldInfos->fieldInfo(tpVector->getField()); + openField(fieldInfo->number, storePositionWithTermVector, storeOffsetWithTermVector); + + for (int32_t j = 0; j < tpVector->size(); ++j) + addTermInternal(tpVector->getTerms()[j], + (*tpVector->getTermFrequencies())[j], + tpVector->getTermPositions(j), + tpVector->getOffsets(j)); + + closeField(); + + } else { + TermFreqVector* tfVector = vectors[i]; + + FieldInfo* fieldInfo = fieldInfos->fieldInfo(tfVector->getField()); + openField(fieldInfo->number, storePositionWithTermVector, storeOffsetWithTermVector); + + for (int32_t j = 0; j < tfVector->size(); ++j) + addTermInternal(tfVector->getTerms()[j], + (*tfVector->getTermFrequencies())[j], NULL, NULL); + + closeField(); + } + } + + closeDocument(); +} + + +void TermVectorsWriter::close() +{ + try { + closeDocument(); + + // make an effort to close all streams we can but remember and re-throw + // the first exception encountered in this process +#define _DOTVWCLOSE(x) \ + if (x != NULL) { \ + try { \ + x->close(); \ + _CLDELETE(x) \ + } catch (CLuceneError& e) { \ + if ( e.number() != CL_ERR_IO ) \ + throw e; \ + if (ikeep == 0) \ + ikeep = e.number(); \ + if (keep[0] == 0) \ + strcpy(keep,e.what()); \ + } catch (...) { \ + if (keep[0] == 0) \ + strcpy(keep, "Unknown error while closing " #x);\ + } \ + } + } _CLFINALLY ( \ + char keep[200]; + int32_t ikeep = 0; + keep[0] = 0; + _DOTVWCLOSE(tvx); + _DOTVWCLOSE(tvd); + _DOTVWCLOSE(tvf); + if (keep[0] != 0) + _CLTHROWA(ikeep, keep); + ); +} + +void TermVectorsWriter::writeField() +{ + // remember where this field is written + currentField->tvfPointer = tvf->getFilePointer(); + //System.out.println("Field Pointer: " + currentField.tvfPointer); + int32_t size = terms.size(); + + tvf->writeVInt(size); + + bool storePositions = currentField->storePositions; + bool storeOffsets = currentField->storeOffsets; + uint8_t bits = 0x0; + if (storePositions) + bits |= STORE_POSITIONS_WITH_TERMVECTOR; + if (storeOffsets) + bits |= STORE_OFFSET_WITH_TERMVECTOR; + tvf->writeByte(bits); + + const TCHAR* lastTermText = LUCENE_BLANK_STRING; + int32_t lastTermTextLen = 0; + + for (int32_t i = 0; i < size; ++i) { + TVTerm* term = terms[i]; + int32_t start = CL_NS(util)::Misc::stringDifference(lastTermText, + lastTermTextLen, term->getTermText(),term->getTermTextLen()); + int32_t length = term->getTermTextLen() - start; + tvf->writeVInt(start); // write shared prefix length + tvf->writeVInt(length); // write delta length + tvf->writeChars(term->getTermText(), start, length); // write delta chars + tvf->writeVInt(term->freq); + + lastTermText = term->getTermText(); + lastTermTextLen = term->getTermTextLen(); + + if (storePositions) { + if(term->positions == NULL) + _CLTHROWA(CL_ERR_IllegalState, "Trying to write positions that are NULL!"); + + // use delta encoding for positions + int32_t position = 0; + for (int32_t j = 0; j < term->freq; ++j){ + tvf->writeVInt((*term->positions)[j] - position); + position = (*term->positions)[j]; + } + } + + if (storeOffsets) { + if(term->offsets == NULL) + _CLTHROWA(CL_ERR_IllegalState, "Trying to write offsets that are NULL!"); + + // use delta encoding for offsets + int32_t position = 0; + for (int32_t j = 0; j < term->freq; ++j) { + tvf->writeVInt((*term->offsets)[j].getStartOffset() - position); + //Save the diff between the two. + tvf->writeVInt((*term->offsets)[j].getEndOffset() - + (*term->offsets)[j].getStartOffset()); + position = (*term->offsets)[j].getEndOffset(); + } + } + } +} + +void TermVectorsWriter::writeDoc() +{ + if (isFieldOpen()) { + _CLTHROWA(CL_ERR_InvalidState, + "Field is still open while writing document"); + } + + // write document index record + tvx->writeLong(currentDocPointer); + + // write document data record + int32_t size = fields.size(); + + // write the number of fields + tvd->writeVInt(size); + + // write field numbers + for (int32_t j = 0; j < size; ++j) { + tvd->writeVInt(fields[j]->number); + } + + // write field pointers + int64_t lastFieldPointer = 0; + for (int32_t i = 0; i < size; ++i) { + TVField* field = (TVField*) fields[i]; + tvd->writeVLong(field->tvfPointer - lastFieldPointer); + + lastFieldPointer = field->tvfPointer; + } +} + +const TCHAR* TermVectorsWriter::TVTerm::getTermText() const +{ + return termText; +} + +size_t TermVectorsWriter::TVTerm::getTermTextLen() +{ + if (termTextLen==-1) + termTextLen = _tcslen(termText); + return termTextLen; +} + +void TermVectorsWriter::TVTerm::setTermText(const TCHAR* val) +{ + _CLDELETE_CARRAY(termText); + termText = STRDUP_TtoT(val); + termTextLen = -1; + +} + +TermVectorsWriter::TVTerm::TVTerm() + : freq(0) + , positions(NULL) + , offsets(NULL) +{ + termText=NULL; + termTextLen=-1; +} + +TermVectorsWriter::TVTerm::~TVTerm() +{ + _CLDELETE_CARRAY(termText) +} + +CL_NS_END |