summaryrefslogtreecommitdiffstats
path: root/3rdparty/clucene/src/CLucene/index/DocumentWriter.cpp
diff options
context:
space:
mode:
Diffstat (limited to '3rdparty/clucene/src/CLucene/index/DocumentWriter.cpp')
-rw-r--r--3rdparty/clucene/src/CLucene/index/DocumentWriter.cpp571
1 files changed, 571 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/index/DocumentWriter.cpp b/3rdparty/clucene/src/CLucene/index/DocumentWriter.cpp
new file mode 100644
index 000000000..dcbc31591
--- /dev/null
+++ b/3rdparty/clucene/src/CLucene/index/DocumentWriter.cpp
@@ -0,0 +1,571 @@
+/*
+ * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+ *
+ * Distributable under the terms of either the Apache License (Version 2.0) or
+ * the GNU Lesser General Public License, as specified in the COPYING file.
+ *
+ * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
+*/
+#include "CLucene/StdHeader.h"
+
+#include "DocumentWriter.h"
+#include "FieldInfos.h"
+#include "IndexWriter.h"
+#include "FieldsWriter.h"
+#include "Term.h"
+#include "TermInfo.h"
+#include "TermInfosWriter.h"
+
+#include "CLucene/analysis/AnalysisHeader.h"
+
+#include "CLucene/search/Similarity.h"
+#include "TermInfosWriter.h"
+#include "FieldsWriter.h"
+
+CL_NS_USE(util)
+CL_NS_USE(store)
+CL_NS_USE(analysis)
+CL_NS_USE(document)
+CL_NS_DEF(index)
+
+/*Posting*/
+
+DocumentWriter::Posting::Posting(Term* t, const int32_t position,
+    TermVectorOffsetInfo* offset)
+{
+    //Func - Constructor
+    //Pre  - t contains a valid reference to a Term
+    //Post - Instance has been created, holding one occurrence of the term
+    //       at 'position' (and its offset, when term vectors store offsets)
+    freq = 1;
+
+    term = _CL_POINTER(t);
+    positions.values = (int32_t*)malloc(sizeof(int32_t));
+    positions.values[0] = position;
+    positions.length = 1;
+
+    if ( offset != NULL ){
+        this->offsets.values = (TermVectorOffsetInfo*)malloc(sizeof(TermVectorOffsetInfo));
+        this->offsets.values[0] = *offset;
+        this->offsets.length = 1;
+    } else {
+        // Explicitly reset the offsets array: the destructor frees
+        // offsets.values whenever it is non-NULL, so it must never be left
+        // indeterminate on this branch.
+        this->offsets.values = NULL;
+        this->offsets.length = 0;
+    }
+}
+
+DocumentWriter::Posting::~Posting()
+{
+    //Func - Destructor
+    //Pre  - true
+    //Post - The occurrence arrays have been released and the reference to
+    //       the term has been dropped
+
+    if ( this->offsets.values != NULL )
+        free(this->offsets.values);
+    free(positions.values);
+    _CLDECDELETE(term);
+}
+
+DocumentWriter::DocumentWriter(Directory* d, Analyzer* a,
+    CL_NS(search)::Similarity* sim, const int32_t mfl)
+    : analyzer(a)
+    , directory(d)
+    , maxFieldLength(mfl)
+    , fieldInfos(NULL)
+    , fieldLengths(NULL)
+    , similarity(sim)
+    , termIndexInterval(IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
+    , fieldPositions(NULL)
+    , fieldBoosts(NULL)
+    , termBuffer(_CLNEW Term)
+{
+    //Func - Constructor
+    //Pre  - d contains a valid reference to a Directory
+    //       a contains a valid reference to an Analyzer
+    //       mfl > 0 and contains the maximum field length
+    //Post - Instance has been created
+
+    CND_PRECONDITION(((mfl > 0) || (mfl == IndexWriter::FIELD_TRUNC_POLICY__WARN)),
+        "mfl is 0 or smaller than IndexWriter::FIELD_TRUNC_POLICY__WARN")
+
+    // The destructor unconditionally does _CLDELETE_ARRAY(fieldOffsets),
+    // but fieldOffsets was not initialized anywhere in this constructor:
+    // destroying a writer before addDocument() would free an indeterminate
+    // pointer. (The redundant NULL re-assignments of fieldInfos and
+    // fieldLengths, already done in the initializer list, were removed.)
+    fieldOffsets = NULL;
+}
+
+DocumentWriter::DocumentWriter(CL_NS(store)::Directory* d,
+    CL_NS(analysis)::Analyzer* a, IndexWriter* writer)
+    : analyzer(a)
+    , directory(d)
+    , maxFieldLength(writer->getMaxFieldLength())
+    , fieldInfos(NULL)
+    , fieldLengths(NULL)
+    , similarity(writer->getSimilarity())
+    , termIndexInterval(writer->getTermIndexInterval())
+    , fieldPositions(NULL)
+    , fieldBoosts(NULL)
+    , termBuffer(_CLNEW Term)
+{
+    //Func - Constructor taking its limits/similarity from an IndexWriter
+    //Pre  - d contains a valid reference to a Directory
+    //       a contains a valid reference to an Analyzer
+    //       writer contains a valid reference to an IndexWriter
+    //Post - Instance has been created
+
+    CND_PRECONDITION(((maxFieldLength > 0)
+        || (maxFieldLength == IndexWriter::FIELD_TRUNC_POLICY__WARN)),
+        "mfl is 0 or smaller than IndexWriter::FIELD_TRUNC_POLICY__WARN")
+
+    // The destructor unconditionally does _CLDELETE_ARRAY(fieldOffsets),
+    // but fieldOffsets was not initialized anywhere in this constructor:
+    // destroying a writer before addDocument() would free an indeterminate
+    // pointer. (The redundant NULL re-assignments of fieldInfos and
+    // fieldLengths, already done in the initializer list, were removed.)
+    fieldOffsets = NULL;
+}
+
+DocumentWriter::~DocumentWriter()
+{
+    //Func - Destructor
+    //Pre  - true
+    //Post - The instance has been destroyed
+
+    // Release leftover postings first: they hold Term references that must
+    // be dropped before the writer goes away.
+    clearPostingTable();
+    _CLDELETE( fieldInfos );
+    _CLDELETE_ARRAY(fieldLengths);
+    _CLDELETE_ARRAY(fieldPositions);
+    _CLDELETE_ARRAY(fieldBoosts);
+    // NOTE(review): fieldOffsets is only assigned in addDocument(); confirm
+    // it is NULL-initialized in the constructors or class declaration,
+    // otherwise this deletes an indeterminate pointer.
+    _CLDELETE_ARRAY(fieldOffsets);
+
+    _CLDECDELETE(termBuffer);
+}
+
+void DocumentWriter::clearPostingTable()
+{
+    //Func - Disposes of every term/posting pair accumulated for the
+    //       previous document, then empties the (now dangling) table.
+    for (PostingTableType::iterator entry = postingTable.begin();
+            entry != postingTable.end(); ++entry) {
+        _CLDELETE(entry->second);
+        _CLLDECDELETE(entry->first);
+    }
+    postingTable.clear();
+}
+
+void DocumentWriter::addDocument(const QString& segment, Document* doc)
+{
+    //Func - Inverts 'doc' and writes a complete one-document segment named
+    //       'segment': field names (.fnm), stored fields, postings
+    //       (.frq/.prx/term dictionary) and field norms (.f<n>).
+    //Pre  - fieldInfos == NULL (each DocumentWriter instance processes a
+    //       single document; the precondition guards against reuse)
+    //Post - All segment files have been written to the directory
+
+    CND_PRECONDITION(fieldInfos == NULL, "fieldInfos!=NULL")
+
+    // write field names
+    fieldInfos = _CLNEW FieldInfos();
+    fieldInfos->add(doc);
+
+    QString buf = Misc::segmentname(segment, QLatin1String(".fnm"));
+    fieldInfos->write(directory, buf);
+
+    // write field values
+    FieldsWriter fieldsWriter(directory, segment, fieldInfos);
+    try {
+        fieldsWriter.addDocument(doc);
+    } _CLFINALLY (
+        fieldsWriter.close()
+    );
+
+    // clear postingTable
+    clearPostingTable();
+
+    // Per-field bookkeeping used by invertDocument(), indexed by field number.
+    int32_t fieldInfoSize = fieldInfos->size();
+    fieldLengths = _CL_NEWARRAY(int32_t, fieldInfoSize);   // init fieldLengths
+    fieldPositions = _CL_NEWARRAY(int32_t, fieldInfoSize); // init fieldPositions
+    fieldOffsets = _CL_NEWARRAY(int32_t, fieldInfoSize);   // init fieldOffsets
+    fieldBoosts = _CL_NEWARRAY(qreal, fieldInfoSize);      // init fieldBoosts
+
+    qreal fbd = doc->getBoost();
+    for (int32_t i = 0; i < fieldInfoSize; ++i) {
+        fieldLengths[i] = 0;
+        fieldPositions[i] = 0;
+        fieldOffsets[i] = 0;
+        //initialise fieldBoost array with the document's boost as default
+        fieldBoosts[i] = fbd;
+    }
+
+    // invert doc into postingTable
+    invertDocument(doc);
+
+    // sort postingTable into an array
+    Posting** postings = NULL;
+    int32_t postingsLength = 0;
+    sortPostingTable(postings, postingsLength);
+
+    // Write postings and norms. The sorted postings array must be released
+    // even when one of the writers throws; the original code leaked it on
+    // exception because the delete only ran on the success path.
+    try {
+        // write postings
+        writePostings(postings, postingsLength, segment);
+
+        // write norms of indexed fields
+        writeNorms(segment);
+    } _CLFINALLY (
+        _CLDELETE_ARRAY(postings);
+    );
+}
+
+void DocumentWriter::sortPostingTable(Posting**& array, int32_t& arraySize)
+{
+    //Func - Flattens postingTable into a newly allocated array (returned
+    //       through 'array'/'arraySize') and sorts it by term so that the
+    //       dictionary can be written in order.
+    arraySize = postingTable.size();
+    array = _CL_NEWARRAY(Posting*, arraySize);
+
+    int32_t n = 0;
+    for (PostingTableType::iterator entry = postingTable.begin();
+            entry != postingTable.end(); ++entry, ++n) {
+        array[n] = (Posting*)entry->second;
+    }
+
+    // sort the array by ascending term
+    quickSort(array, 0, n - 1);
+}
+
+
+void DocumentWriter::invertDocument(const Document* doc)
+{
+    //Func - Tokenizes ("inverts") every indexed field of 'doc' and records
+    //       each term occurrence in postingTable via addPosition().
+    //Pre  - addDocument() has sized and zeroed fieldLengths, fieldPositions
+    //       and fieldOffsets, and initialized fieldBoosts, for every field
+    //       number known to fieldInfos
+    //Post - postingTable holds the postings of all indexed fields; the
+    //       per-field length/position/offset/boost arrays are updated
+    DocumentFieldEnumeration* fields = doc->fields();
+    try {
+        while (fields->hasMoreElements()) {
+            Field* field = (Field*)fields->nextElement();
+            const TCHAR* fieldName = field->name();
+            const int32_t fieldNumber = fieldInfos->fieldNumber(fieldName);
+
+            int32_t length = fieldLengths[fieldNumber];     // length of field
+            int32_t position = fieldPositions[fieldNumber]; // position in field
+            // Multiple fields with the same name are concatenated; insert
+            // the analyzer's position gap between their token streams.
+            if (length>0)
+                position+=analyzer->getPositionIncrementGap(fieldName);
+            int32_t offset = fieldOffsets[fieldNumber];     // offset field
+
+            if (field->isIndexed()) {
+                if (!field->isTokenized()) { // un-tokenized field
+                    //FEATURE: this is bug in java: if using a Reader, then
+                    //field value will not be added. With CLucene, an untokenized
+                    //field with a reader will still be added (if it isn't stored,
+                    //because if it's stored, then the reader has already been read.
+                    const TCHAR* charBuf = NULL;
+                    int64_t dataLen = 0;
+
+                    if (field->stringValue() == NULL && !field->isStored() ) {
+                        CL_NS(util)::Reader* r = field->readerValue();
+                        // this call tries to read the entire stream
+                        // this may invalidate the string for the further calls
+                        // it may be better to do this via a FilterReader
+                        // TODO make a better implementation of this
+                        dataLen = r->read(charBuf, LUCENE_INT32_MAX_SHOULDBE);
+                        if (dataLen == -1)
+                            dataLen = 0;
+                        //todo: would be better to pass the string length, in case
+                        //a null char is passed, but then would need to test the output too.
+                    } else {
+                        charBuf = field->stringValue();
+                        dataLen = _tcslen(charBuf);
+                    }
+
+                    // The whole value is one token occupying one position.
+                    if(field->isStoreOffsetWithTermVector()){
+                        TermVectorOffsetInfo tio;
+                        tio.setStartOffset(offset);
+                        tio.setEndOffset(offset + dataLen);
+                        addPosition(fieldName, charBuf, position++, &tio );
+                    }else
+                        addPosition(fieldName, charBuf, position++, NULL);
+                    offset += dataLen;
+                    length++;
+                } else { // field must be tokenized
+                    CL_NS(util)::Reader* reader; // find or make Reader
+                    bool delReader = false;
+                    if (field->readerValue() != NULL) {
+                        reader = field->readerValue();
+                    } else if (field->stringValue() != NULL) {
+                        // Wrap the string value; delReader marks that we own
+                        // this temporary reader and must delete it below.
+                        reader = _CLNEW CL_NS(util)::StringReader(
+                            field->stringValue(),_tcslen(field->stringValue()),
+                            false);
+                        delReader = true;
+                    } else {
+                        _CLTHROWA(CL_ERR_IO,"field must have either String or Reader value");
+                    }
+
+                    try {
+                        // Tokenize field and add to postingTable.
+                        CL_NS(analysis)::TokenStream* stream =
+                            analyzer->tokenStream(fieldName, reader);
+
+                        try {
+                            CL_NS(analysis)::Token t;
+                            int32_t lastTokenEndOffset = -1;
+                            while (stream->next(&t)) {
+                                // getPositionIncrement() is 1 for adjacent
+                                // tokens; apply any extra gap (or 0 for
+                                // same-position tokens) before the ++ below.
+                                position += (t.getPositionIncrement() - 1);
+
+                                if(field->isStoreOffsetWithTermVector()){
+                                    TermVectorOffsetInfo tio;
+                                    tio.setStartOffset(offset + t.startOffset());
+                                    tio.setEndOffset(offset + t.endOffset());
+                                    addPosition(fieldName, t.termText(), position++, &tio);
+                                } else
+                                    addPosition(fieldName, t.termText(), position++, NULL);
+
+                                lastTokenEndOffset = t.endOffset();
+                                length++;
+                                // Apply field truncation policy.
+                                if (maxFieldLength != IndexWriter::FIELD_TRUNC_POLICY__WARN) {
+                                    // The client programmer has explicitly authorized us to
+                                    // truncate the token stream after maxFieldLength tokens.
+                                    // NOTE(review): the token that pushes 'length' past the
+                                    // limit was already added above, so maxFieldLength+1
+                                    // tokens get indexed — confirm this matches the intended
+                                    // truncation semantics.
+                                    if ( length > maxFieldLength)
+                                        break;
+                                } else if (length > IndexWriter::DEFAULT_MAX_FIELD_LENGTH) {
+                                    const TCHAR* errMsgBase =
+                                        _T("Indexing a huge number of tokens from a single")
+                                        _T(" field (\"%s\", in this case) can cause CLucene")
+                                        _T(" to use memory excessively.")
+                                        _T(" By default, CLucene will accept only %s tokens")
+                                        _T(" tokens from a single field before forcing the")
+                                        _T(" client programmer to specify a threshold at")
+                                        _T(" which to truncate the token stream.")
+                                        _T(" You should set this threshold via")
+                                        _T(" IndexReader::maxFieldLength (set to LUCENE_INT32_MAX")
+                                        _T(" to disable truncation, or a value to specify maximum number of fields).");
+
+                                    TCHAR defaultMaxAsChar[34];
+                                    _i64tot(IndexWriter::DEFAULT_MAX_FIELD_LENGTH,
+                                        defaultMaxAsChar, 10
+                                    );
+                                    int32_t errMsgLen = _tcslen(errMsgBase)
+                                        + _tcslen(fieldName)
+                                        + _tcslen(defaultMaxAsChar);
+                                    TCHAR* errMsg = _CL_NEWARRAY(TCHAR,errMsgLen+1);
+
+                                    _sntprintf(errMsg, errMsgLen,errMsgBase, fieldName, defaultMaxAsChar);
+
+                                    _CLTHROWT_DEL(CL_ERR_Runtime,errMsg);
+                                }
+                            } // while token->next
+
+                            // Advance the field's running character offset
+                            // past the text consumed by this token stream.
+                            if(lastTokenEndOffset != -1 )
+                                offset += lastTokenEndOffset + 1;
+                        } _CLFINALLY (
+                            stream->close();
+                            _CLDELETE(stream);
+                        );
+                    } _CLFINALLY (
+                        if (delReader) {
+                            _CLDELETE(reader);
+                        }
+                    );
+                } // if/else field is to be tokenized
+                fieldLengths[fieldNumber] = length;     // save field length
+                fieldPositions[fieldNumber] = position; // save field position
+                fieldBoosts[fieldNumber] *= field->getBoost();
+                fieldOffsets[fieldNumber] = offset;
+            } // if field is to be indexed
+        } // while more fields available
+    } _CLFINALLY (
+        _CLDELETE(fields);
+    );
+}
+
+void DocumentWriter::addPosition(const TCHAR* field, const TCHAR* text,
+    const int32_t position, TermVectorOffsetInfo* offset)
+{
+    //Func - Records one occurrence of term (field,text) at 'position'
+    //       (and optionally its character offsets) in postingTable.
+    //Pre  - postingTable, termBuffer are valid; addDocument() initialized
+    //       the per-field arrays
+    //Post - The posting for the term exists and includes this occurrence
+    termBuffer->set(field,text,false);
+
+    Posting* ti = postingTable.get(termBuffer);
+    if (ti != NULL) { // word seen before
+        int32_t freq = ti->freq;
+        if (ti->positions.length == freq) {
+            // positions array is full: double its capacity
+            ti->positions.length = freq*2;
+            ti->positions.values = (int32_t*)realloc(ti->positions.values,
+                ti->positions.length * sizeof(int32_t));
+        }
+        ti->positions.values[freq] = position; // add new position
+
+        if (offset != NULL) {
+            // Use <= rather than ==: if the first occurrence of this term
+            // carried no offset, offsets.length is still 0 here while
+            // freq >= 1, so the old equality test skipped the grow step and
+            // wrote past the (NULL) buffer.
+            if (ti->offsets.length <= freq){
+                ti->offsets.length = freq*2;
+                ti->offsets.values = (TermVectorOffsetInfo*)realloc(ti->offsets.values,
+                    ti->offsets.length * sizeof(TermVectorOffsetInfo));
+            }
+            ti->offsets.values[freq] = *offset;
+        }
+
+        ti->freq = freq + 1; // update frequency
+    } else { // word not seen before
+        // The new Term is owned by postingTable (released in
+        // clearPostingTable); Posting takes its own reference.
+        Term* term = _CLNEW Term( field, text, false);
+        postingTable.put(term, _CLNEW Posting(term, position, offset));
+    }
+}
+
+//static
+void DocumentWriter::quickSort(Posting**& postings, const int32_t lo, const int32_t hi)
+{
+    //Func - Recursively sorts postings[lo..hi] in place by ascending term,
+    //       using a median-of-three quicksort.
+    if(lo >= hi)
+        return;
+
+    int32_t mid = (lo + hi) / 2;
+
+    // Order postings[lo], postings[mid], postings[hi] so that the middle
+    // element holds the median of the three; it becomes the partition value.
+    if(postings[lo]->term->compareTo(postings[mid]->term) > 0) {
+        Posting* tmp = postings[lo];
+        postings[lo] = postings[mid];
+        postings[mid] = tmp;
+    }
+
+    if(postings[mid]->term->compareTo(postings[hi]->term) > 0) {
+        Posting* tmp = postings[mid];
+        postings[mid] = postings[hi];
+        postings[hi] = tmp;
+
+        if(postings[lo]->term->compareTo(postings[mid]->term) > 0) {
+            Posting* tmp2 = postings[lo];
+            postings[lo] = postings[mid];
+            postings[mid] = tmp2;
+        }
+    }
+
+    int32_t left = lo + 1;
+    int32_t right = hi - 1;
+
+    // Ranges of three or fewer elements are fully sorted by the
+    // median-of-three pass above.
+    if (left >= right)
+        return;
+
+    const Term* partition = postings[mid]->term; //not kept, so no need to finalize
+
+    // Partition: entries <= partition end up on the left side, entries
+    // > partition on the right side.
+    for( ;; ) {
+        while(postings[right]->term->compareTo(partition) > 0)
+            --right;
+
+        while(left < right && postings[left]->term->compareTo(partition) <= 0)
+            ++left;
+
+        if(left < right) {
+            Posting* tmp = postings[left];
+            postings[left] = postings[right];
+            postings[right] = tmp;
+            --right;
+        } else {
+            break;
+        }
+    }
+
+    // Recurse on both halves; postings[lo..left] holds values <= partition.
+    quickSort(postings, lo, left);
+    quickSort(postings, left + 1, hi);
+}
+
+void DocumentWriter::writePostings(Posting** postings,
+    const int32_t postingsLength, const QString& segment)
+{
+    //Func - Writes the sorted postings to the segment's inverted-index
+    //       files: term dictionary (via TermInfosWriter), frequencies
+    //       (.frq), positions (.prx) and, for fields that request it,
+    //       term vectors.
+    //Pre  - postings contains postingsLength entries sorted by term
+    //Post - All output streams have been written and closed; on failure
+    //       every stream is still closed and an error is re-thrown
+
+    // Close 'obj', capturing any close error into err/ierr so that the
+    // remaining streams are still closed before re-throwing.
+    #define __DOCLOSE(obj) \
+    if (obj!=NULL) { \
+        try { \
+            obj->close(); \
+            _CLDELETE(obj); \
+        } catch(CLuceneError &e) { \
+            ierr = e.number(); \
+            err = e.what(); \
+        } catch(...) { \
+            err = "Unknown error while closing posting tables"; \
+        } \
+    }
+
+    IndexOutput* freq = NULL;
+    IndexOutput* prox = NULL;
+    TermInfosWriter* tis = NULL;
+    TermVectorsWriter* termVectorWriter = NULL;
+    try {
+        //open files for inverse index storage
+        QString buf = Misc::segmentname(segment, QLatin1String(".frq"));
+        freq = directory->createOutput(buf);
+
+        buf = Misc::segmentname(segment, QLatin1String(".prx"));
+        prox = directory->createOutput(buf);
+
+        tis = _CLNEW TermInfosWriter(directory, segment, fieldInfos,
+            termIndexInterval);
+        // NOTE(review): ti is only freed on the success path below; an
+        // exception inside this loop leaks it — confirm whether that is
+        // acceptable here.
+        TermInfo* ti = _CLNEW TermInfo();
+        const TCHAR* currentField = NULL;
+        for (int32_t i = 0; i < postingsLength; i++) {
+            Posting* posting = postings[i];
+
+            // add an entry to the dictionary with pointers to prox and freq files
+            ti->set(1, freq->getFilePointer(), prox->getFilePointer(), -1);
+            tis->add(posting->term, ti);
+
+            // add an entry to the freq file
+            int32_t postingFreq = posting->freq;
+            if (postingFreq == 1) // optimize freq=1
+                freq->writeVInt(1); // set low bit of doc num.
+            else {
+                freq->writeVInt(0); // the document number
+                freq->writeVInt(postingFreq); // frequency in doc
+            }
+
+            int32_t lastPosition = 0; // write positions
+            for (int32_t j = 0; j < postingFreq; ++j) { // use delta-encoding
+                prox->writeVInt(posting->positions.values[j] - lastPosition);
+                lastPosition = posting->positions.values[j];
+            }
+
+            // check to see if we switched to a new field (postings are
+            // sorted by term, so all postings of one field are contiguous)
+            const TCHAR* termField = posting->term->field();
+            if ( currentField == NULL || _tcscmp(currentField,termField) != 0 ) {
+                //todo, can we do an intern'd check?
+                // changing field - see if there is something to save
+                currentField = termField;
+                FieldInfo* fi = fieldInfos->fieldInfo(currentField);
+
+                if (fi->storeTermVector) {
+                    // lazily create the term-vectors writer on the first
+                    // field that actually stores vectors
+                    if (termVectorWriter == NULL) {
+                        termVectorWriter = _CLNEW TermVectorsWriter(directory,
+                            segment, fieldInfos);
+                        termVectorWriter->openDocument();
+                    }
+                    termVectorWriter->openField(currentField);
+                } else if (termVectorWriter != NULL) {
+                    termVectorWriter->closeField();
+                }
+            }
+            if (termVectorWriter != NULL && termVectorWriter->isFieldOpen()) {
+                termVectorWriter->addTerm(posting->term->text(), postingFreq,
+                    &posting->positions, &posting->offsets);
+            }
+        }
+        if (termVectorWriter != NULL)
+            termVectorWriter->closeDocument();
+        _CLDELETE(ti);
+    } _CLFINALLY (
+        const char* err = NULL;
+        int32_t ierr = 0;
+
+        // make an effort to close all streams we can but remember and re-throw
+        // the first exception encountered in this process
+        // NOTE(review): each failing __DOCLOSE overwrites err/ierr, so the
+        // LAST close error is actually the one re-thrown, not the first as
+        // the comment above claims — confirm intent.
+        __DOCLOSE(freq);
+        __DOCLOSE(prox);
+        __DOCLOSE(tis);
+        __DOCLOSE(termVectorWriter);
+        if (err != NULL)
+            _CLTHROWA(ierr,err);
+    );
+}
+
+void DocumentWriter::writeNorms(const QString& segment)
+{
+    //Func - Emits one "<segment>.f<n>" norm file per indexed field that
+    //       stores norms. Each file holds a single byte: the encoded
+    //       product of the field's boost and its length normalization.
+    const int32_t numFields = fieldInfos->size();
+    for (int32_t n = 0; n < numFields; ++n) {
+        FieldInfo* fi = fieldInfos->fieldInfo(n);
+        if (!fi->isIndexed || fi->omitNorms)
+            continue;
+
+        const qreal norm =
+            fieldBoosts[n] * similarity->lengthNorm(fi->name, fieldLengths[n]);
+
+        QString fn(segment + QLatin1String(".f%1"));
+        IndexOutput* norms = directory->createOutput(fn.arg(n));
+        try {
+            norms->writeByte(CL_NS(search)::Similarity::encodeNorm(norm));
+        }_CLFINALLY (
+            norms->close();
+            _CLDELETE(norms);
+        )
+    }
+}
+
+CL_NS_END