diff options
Diffstat (limited to '3rdparty/clucene/src/CLucene/index/IndexWriter.h')
-rw-r--r-- | 3rdparty/clucene/src/CLucene/index/IndexWriter.h | 425 |
1 files changed, 425 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/index/IndexWriter.h b/3rdparty/clucene/src/CLucene/index/IndexWriter.h new file mode 100644 index 000000000..80476c864 --- /dev/null +++ b/3rdparty/clucene/src/CLucene/index/IndexWriter.h @@ -0,0 +1,425 @@ +/* + * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team + * + * Distributable under the terms of either the Apache License (Version 2.0) or + * the GNU Lesser General Public License, as specified in the COPYING file. + * + * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved. +*/ +#ifndef _lucene_index_IndexWriter_ +#define _lucene_index_IndexWriter_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include <QtCore/QString> +#include <QtCore/QStringList> + +#include "CLucene/analysis/AnalysisHeader.h" +#include "CLucene/util/VoidList.h" +#include "CLucene/search/Similarity.h" +#include "CLucene/store/Lock.h" +#include "CLucene/store/TransactionalRAMDirectory.h" + +#include "SegmentHeader.h" + +CL_NS_DEF(index) + +/** +An IndexWriter creates and maintains an index. + +The third argument to the +<a href="#IndexWriter(org.apache.lucene.store.Directory, org.apache.lucene.analysis.Analyzer, boolean)"><b>constructor</b></a> +determines whether a new index is created, or whether an existing index is +opened for the addition of new documents. + +In either case, documents are added with the <a +href="#addDocument(org.apache.lucene.document.Document)"><b>addDocument</b></a> method. +When finished adding documents, <a href="#close()"><b>close</b></a> should be called. + +<p>If an index will not have more documents added for a while and optimal search +performance is desired, then the <a href="#optimize()"><b>optimize</b></a> +method should be called before the index is closed. + +<p>Opening an IndexWriter creates a lock file for the directory in use. Trying to open +another IndexWriter on the same directory will lead to an IOException. 
The IOException +is also thrown if an IndexReader on the same directory is used to delete documents +from the index. + +@see IndexModifier IndexModifier supports the important methods of IndexWriter plus deletion +*/ +class IndexWriter : LUCENE_BASE +{ + class LockWith2 : public CL_NS(store)::LuceneLockWith<void> + { + public: + LockWith2(CL_NS(store)::LuceneLock* lock, + int64_t lockWaitTimeout, + IndexWriter* wr, + CL_NS(util)::CLVector<SegmentReader*>* std, + bool create); + + ~LockWith2() {} + + void doBody(); + + private: + bool create; + IndexWriter* writer; + CL_NS(util)::CLVector<SegmentReader*>* segmentsToDelete; + }; + friend class LockWith2; + + class LockWithCFS : public CL_NS(store)::LuceneLockWith<void> + { + public: + LockWithCFS(CL_NS(store)::LuceneLock* lock, + int64_t lockWaitTimeout, + CL_NS(store)::Directory* dir, + IndexWriter* wr, + const QString& segName, + const QStringList& ftd); + + ~LockWithCFS() {} + + void doBody(); + + private: + QString segName; + IndexWriter* writer; + CL_NS(store)::Directory* directory; + QStringList filesToDelete; + }; + friend class IndexWriter::LockWithCFS; + + // indicates if the writer is open - this way close can be called multiple + // times + bool isOpen; + + // how to analyze text + CL_NS(analysis)::Analyzer* analyzer; + + CL_NS(search)::Similarity* similarity; // how to normalize + + /** Use compound file setting. Normally defaults to true, except when + * using a RAMDirectory. This minimizes the number of files used. + * Setting this to false may improve indexing performance, but + * may also cause file handle problems. 
+ */ + bool useCompoundFile; + bool closeDir; + + // for temp segs + CL_NS(store)::TransactionalRAMDirectory* ramDirectory; + + CL_NS(store)::LuceneLock* writeLock; + + void _IndexWriter(const bool create); + + void _finalize(); + + // where this index resides + CL_NS(store)::Directory* directory; + + + int32_t getSegmentsCounter() { return segmentInfos.counter; } + int32_t maxFieldLength; + int32_t mergeFactor; + int32_t minMergeDocs; + int32_t maxMergeDocs; + int32_t termIndexInterval; + + int64_t writeLockTimeout; + int64_t commitLockTimeout; +public: + DEFINE_MUTEX(THIS_LOCK) + + // The SegmentInfos for this index. getSegmentsCounter() reads its counter, + // and mergeSegments() is documented as treating it as a stack. + SegmentInfos segmentInfos; + + // Release the write lock, if needed. + ~IndexWriter(); + + /** + * The Java implementation of Lucene silently truncates any tokenized + * field if the number of tokens exceeds a certain threshold. Although + * that threshold is adjustable, it is easy for the client programmer + * to be unaware that such a threshold exists, and to become its + * unwitting victim. + * CLucene implements a less insidious truncation policy. Up to + * DEFAULT_MAX_FIELD_LENGTH tokens, CLucene behaves just as JLucene + * does. If the number of tokens exceeds that threshold without any + * indication of a truncation preference by the client programmer, + * CLucene raises an exception, prompting the client programmer to + * explicitly set a truncation policy by adjusting maxFieldLength. + */ + LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_FIELD_LENGTH = 10000); + LUCENE_STATIC_CONSTANT(int32_t, FIELD_TRUNC_POLICY__WARN = -1); + int32_t getMaxFieldLength() const{ return maxFieldLength; } + void setMaxFieldLength(int32_t val){ maxFieldLength = val; } + + /** + * Default value is 10. Change using {@link #setMaxBufferedDocs(int)}. + */ + LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_BUFFERED_DOCS = 10); + /** Determines the minimal number of documents required before the buffered + * in-memory documents are merged and a new Segment is created. 
+ * Since Documents are merged in a {@link RAMDirectory}, + * large value gives faster indexing. At the same time, mergeFactor limits + * the number of files open in a FSDirectory. + * + * <p> The default value is DEFAULT_MAX_BUFFERED_DOCS.*/ + void setMaxBufferedDocs(int32_t val){ minMergeDocs = val; } + /** + * @see #setMaxBufferedDocs + * Reads minMergeDocs, the same field returned by getMinMergeDocs(). + */ + int32_t getMaxBufferedDocs(){ return minMergeDocs; } + + /** + * Default value for the write lock timeout (1,000). + */ + LUCENE_STATIC_CONSTANT(int64_t, WRITE_LOCK_TIMEOUT = 1000); + /** + * Sets the maximum time to wait for a write lock (in milliseconds). + */ + void setWriteLockTimeout(int64_t writeLockTimeout) + { this->writeLockTimeout = writeLockTimeout; } + /** + * @see #setWriteLockTimeout + */ + int64_t getWriteLockTimeout() { return writeLockTimeout; } + + /** + * Default value for the commit lock timeout (10,000). + */ + LUCENE_STATIC_CONSTANT(int64_t, COMMIT_LOCK_TIMEOUT = 10000); + /** + * Sets the maximum time to wait for a commit lock (in milliseconds). + */ + void setCommitLockTimeout(int64_t commitLockTimeout) + { this->commitLockTimeout = commitLockTimeout; } + /** + * @see #setCommitLockTimeout + */ + int64_t getCommitLockTimeout() { return commitLockTimeout; } + + static const QLatin1String WRITE_LOCK_NAME; //"write.lock"; + static const QLatin1String COMMIT_LOCK_NAME; //"commit.lock"; + + /** + * Default value is 10. Change using {@link #setMergeFactor(int)}. + */ + LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MERGE_FACTOR = 10); + /** Determines how often segment indices are merged by addDocument(). With + * smaller values, less RAM is used while indexing, and searches on + * unoptimized indices are faster, but indexing speed is slower. With larger + * values more RAM is used while indexing and searches on unoptimized indices + * are slower, but indexing is faster. Thus larger values (> 10) are best + * for batched index creation, and smaller values (< 10) for indices that are + * interactively maintained. 
+ * + * <p>This must never be less than 2. The default value is 10. + */ + int32_t getMergeFactor() const{ return mergeFactor; } + void setMergeFactor(int32_t val){ mergeFactor = val; } + + + /** Expert: The fraction of terms in the "dictionary" which should be stored + * in RAM. Smaller values use more memory, but make searching slightly + * faster, while larger values use less memory and make searching slightly + * slower. Searching is typically not dominated by dictionary lookup, so + * tweaking this is rarely useful. + */ + LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_TERM_INDEX_INTERVAL = 128); + /** Expert: Set the interval between indexed terms. Large values cause less + * memory to be used by IndexReader, but slow random-access to terms. Small + * values cause more memory to be used by an IndexReader, and speed + * random-access to terms. + * + * This parameter determines the amount of computation required per query + * term, regardless of the number of documents that contain that term. In + * particular, it is the maximum number of other terms that must be + * scanned before a term is located and its frequency and position information + * may be processed. In a large index with user-entered query terms, query + * processing time is likely to be dominated not by term lookup but rather + * by the processing of frequency and positional data. In a small index + * or when many uncommon query terms are generated (e.g., by wildcard + * queries) term lookup may become a dominant cost. + * + * In particular, <code>numUniqueTerms/interval</code> terms are read into + * memory by an IndexReader, and, on average, <code>interval/2</code> terms + * must be scanned for each random term access. + * + * @see #DEFAULT_TERM_INDEX_INTERVAL + */ + void setTermIndexInterval(int32_t interval) { termIndexInterval = interval; } + /** Expert: Return the interval between indexed terms. 
+ * + * @see #setTermIndexInterval(int) + */ + int32_t getTermIndexInterval() { return termIndexInterval; } + + /** Determines the minimal number of documents required before the buffered + * in-memory documents are merged and a new Segment is created. + * Since Documents are merged in a {@link RAMDirectory}, + * a large value gives faster indexing. At the same time, mergeFactor limits + * the number of files open in a FSDirectory. + * + * <p> The default value is 10. + * <p> Reads and writes minMergeDocs, the same field used by + * getMaxBufferedDocs() and setMaxBufferedDocs().*/ + int32_t getMinMergeDocs() const{ return minMergeDocs; } + void setMinMergeDocs(int32_t val){ minMergeDocs = val; } + + /** Determines the largest number of documents ever merged by addDocument(). + * Small values (e.g., less than 10,000) are best for interactive indexing, + * as this limits the length of pauses while indexing to a few seconds. + * Larger values are best for batched indexing and speedier searches. + * + * <p>The default value is {@link #DEFAULT_MAX_MERGE_DOCS}. + */ + LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_MERGE_DOCS = LUCENE_INT32_MAX_SHOULDBE); + /**Determines the largest number of documents ever merged by addDocument(). + * Small values (e.g., less than 10,000) are best for interactive indexing, + * as this limits the length of pauses while indexing to a few seconds. + * Larger values are best for batched indexing and speedier searches. + * + * <p>The default value is {@link #DEFAULT_MAX_MERGE_DOCS}. + */ + int32_t getMaxMergeDocs() const{ return maxMergeDocs; } + void setMaxMergeDocs(int32_t val){ maxMergeDocs = val; } + + /** + * Constructs an IndexWriter for the index in <code>path</code>. + * Text will be analyzed with <code>a</code>. If <code>create</code> + * is true, then a new, empty index will be created in + * <code>path</code>, replacing the index already there, if any. 
+ * + * @param path the path to the index directory + * @param a the analyzer to use + * @param create <code>true</code> to create the index or overwrite + * the existing one; <code>false</code> to append to the existing + * index + * @throws IOException if the directory cannot be read/written to, or + * if it does not exist, and <code>create</code> is + * <code>false</code> + */ + IndexWriter(const QString& path, CL_NS(analysis)::Analyzer* a, + const bool create, const bool closeDir = true); + + + /**Constructs an IndexWriter for the index in <code>d</code>. Text will be + * analyzed with <code>a</code>. If <code>create</code> is true, then a new, + * empty index will be created in <code>d</code>, replacing the index already + * there, if any. Note that <code>closeDir</code> defaults to false here, + * whereas the path-based constructor defaults it to true. + */ + IndexWriter(CL_NS(store)::Directory* d, CL_NS(analysis)::Analyzer* a, + const bool create, const bool closeDir = false); + + // Flushes all changes to an index, closes all associated files, and closes + // the directory that the index is stored in. + void close(); + + // Returns the number of documents currently in this index. synchronized + int32_t docCount(); + + + // Adds a document to this index, using the provided analyzer instead of + // the value of {@link #getAnalyzer()}. If the document contains more than + // {@link #setMaxFieldLength(int)} terms for a given field, the remainder + // are discarded. + // NOTE(review): analyzer defaults to NULL; presumably that selects + // getAnalyzer() -- confirm against the implementation. + void addDocument(CL_NS(document)::Document* doc, + CL_NS(analysis)::Analyzer* analyzer = NULL); + + + // Merges all segments together into a single segment, optimizing an index + // for search. synchronized + void optimize(); + + + /**Merges all segments from an array of indices into this index. + * + * <p>This may be used to parallelize batch indexing. A large document + * collection can be broken into sub-collections. Each sub-collection can be + * indexed in parallel, on a different thread, process or machine. The + * complete index can then be created by merging sub-collection indices + * with this method. 
+ * + * <p>After this completes, the index is optimized. + *@synchronized + */ + void addIndexes(CL_NS(store)::Directory** dirs); + + /** Merges the provided indexes into this index. + * <p>After this completes, the index is optimized. </p> + * <p>The provided IndexReaders are not closed.</p> + * <p>NOTE(review): neither addIndexes overload takes a length; the arrays + * are presumably NULL-terminated -- confirm against the implementation.</p> + */ + void addIndexes(IndexReader** readers); + + + /** Returns the directory this index resides in. */ + CL_NS(store)::Directory* getDirectory() { return directory; } + + /** Get the current setting of whether to use the compound file format. + * Note that this just returns the value you set with setUseCompoundFile(boolean) + * or the default. You cannot use this to query the status of an existing index. + * @see #setUseCompoundFile(boolean) + */ + bool getUseCompoundFile() { return useCompoundFile; } + + /** Setting to turn on usage of a compound file. When on, multiple files + * for each segment are merged into a single file once the segment creation + * is finished. This is done regardless of what directory is in use. + */ + void setUseCompoundFile(bool value) { useCompoundFile = value; } + + + /** Expert: Set the Similarity implementation used by this IndexWriter. + * + * @see Similarity#setDefault(Similarity) + */ + void setSimilarity(CL_NS(search)::Similarity* similarity) + { this->similarity = similarity; } + + /** Expert: Return the Similarity implementation used by this IndexWriter. + * + * <p>This defaults to the current value of {@link Similarity#getDefault()}. + */ + CL_NS(search)::Similarity* getSimilarity() { return this->similarity; } + + /** Returns the analyzer used by this index. */ + CL_NS(analysis)::Analyzer* getAnalyzer() { return analyzer; } + +private: + /** Merges all RAM-resident segments. */ + void flushRamSegments(); + + /** Incremental segment merger. */ + void maybeMergeSegments(); + + // Pops segments off of segmentInfos stack down to minSegment, merges them, + // and pushes the merged index onto the top of the segmentInfos stack. 
+ void mergeSegments(const uint32_t minSegment); + + // Merges the named range of segments, replacing them in the stack with a + // single segment. + void mergeSegments(const uint32_t minSegment, const uint32_t end); + + // Some operating systems (e.g. Windows) don't permit a file to be deleted + // while it is opened for read (e.g. by another process or thread). So we + // assume that when a delete fails it is because the file is open in another + // process, and queue the file for subsequent deletion. + void deleteSegments(CL_NS(util)::CLVector<SegmentReader*>* segments); + + void deleteFiles(const QStringList& files); + void readDeleteableFiles(QStringList& files); + void deleteFiles(const QStringList& files, QStringList& deletable); + void deleteFiles(const QStringList& files, CL_NS(store)::Directory* directory); + void writeDeleteableFiles(const QStringList& files); + + // synchronized + QString newSegmentName(); +}; + +CL_NS_END + +#endif |