summaryrefslogtreecommitdiffstats
path: root/3rdparty/clucene/src/CLucene/index/IndexWriter.h
diff options
context:
space:
mode:
Diffstat (limited to '3rdparty/clucene/src/CLucene/index/IndexWriter.h')
-rw-r--r--3rdparty/clucene/src/CLucene/index/IndexWriter.h425
1 files changed, 425 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/index/IndexWriter.h b/3rdparty/clucene/src/CLucene/index/IndexWriter.h
new file mode 100644
index 000000000..80476c864
--- /dev/null
+++ b/3rdparty/clucene/src/CLucene/index/IndexWriter.h
@@ -0,0 +1,425 @@
+/*
+ * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+ *
+ * Distributable under the terms of either the Apache License (Version 2.0) or
+ * the GNU Lesser General Public License, as specified in the COPYING file.
+ *
+ * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
+*/
+#ifndef _lucene_index_IndexWriter_
+#define _lucene_index_IndexWriter_
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include <QtCore/QString>
+#include <QtCore/QStringList>
+
+#include "CLucene/analysis/AnalysisHeader.h"
+#include "CLucene/util/VoidList.h"
+#include "CLucene/search/Similarity.h"
+#include "CLucene/store/Lock.h"
+#include "CLucene/store/TransactionalRAMDirectory.h"
+
+#include "SegmentHeader.h"
+
+CL_NS_DEF(index)
+
+/**
+An IndexWriter creates and maintains an index.
+
+The third argument to the
+<a href="#IndexWriter(org.apache.lucene.store.Directory, org.apache.lucene.analysis.Analyzer, boolean)"><b>constructor</b></a>
+determines whether a new index is created, or whether an existing index is
+opened for the addition of new documents.
+
+In either case, documents are added with the <a
+href="#addDocument(org.apache.lucene.document.Document)"><b>addDocument</b></a> method.
+When finished adding documents, <a href="#close()"><b>close</b></a> should be called.
+
+<p>If an index will not have more documents added for a while and optimal search
+performance is desired, then the <a href="#optimize()"><b>optimize</b></a>
+method should be called before the index is closed.
+
+<p>Opening an IndexWriter creates a lock file for the directory in use. Trying to open
+another IndexWriter on the same directory will lead to an IOException. The IOException
+is also thrown if an IndexReader on the same directory is used to delete documents
+from the index.
+
+@see IndexModifier IndexModifier supports the important methods of IndexWriter plus deletion
+*/
+class IndexWriter : LUCENE_BASE
+{ // members declared before "public:" below are private (class default)
+ class LockWith2 : public CL_NS(store)::LuceneLockWith<void>
+ {
+ public:
+ LockWith2(CL_NS(store)::LuceneLock* lock,
+ int64_t lockWaitTimeout,
+ IndexWriter* wr,
+ CL_NS(util)::CLVector<SegmentReader*>* std,
+ bool create);
+
+ ~LockWith2() {}
+
+ void doBody(); // declared only; no inline body in this header
+
+ private:
+ bool create;
+ IndexWriter* writer;
+ CL_NS(util)::CLVector<SegmentReader*>* segmentsToDelete;
+ };
+ friend class LockWith2; // grants LockWith2 access to IndexWriter's private members
+
+ class LockWithCFS : public CL_NS(store)::LuceneLockWith<void>
+ {
+ public:
+ LockWithCFS(CL_NS(store)::LuceneLock* lock,
+ int64_t lockWaitTimeout,
+ CL_NS(store)::Directory* dir,
+ IndexWriter* wr,
+ const QString& segName,
+ const QStringList& ftd);
+
+ ~LockWithCFS() {}
+
+ void doBody(); // declared only; no inline body in this header
+
+ private:
+ QString segName;
+ IndexWriter* writer;
+ CL_NS(store)::Directory* directory;
+ QStringList filesToDelete;
+ };
+ friend class IndexWriter::LockWithCFS; // grants LockWithCFS access to IndexWriter's private members
+
+ // indicates whether the writer is open - this way close can be called
+ // multiple times
+ bool isOpen;
+
+ // how to analyze text
+ CL_NS(analysis)::Analyzer* analyzer;
+
+ CL_NS(search)::Similarity* similarity; // how to normalize
+
+ /** Use compound file setting. Normally defaults to true, except when
+ * using a RAMDirectory. This minimizes the number of files used.
+ * Setting this to false may improve indexing performance, but
+ * may also cause file handle problems.
+ */
+ bool useCompoundFile;
+ bool closeDir; // NOTE(review): presumably whether close() also closes 'directory' - confirm in the .cpp
+
+ // for temporary segments
+ CL_NS(store)::TransactionalRAMDirectory* ramDirectory;
+
+ CL_NS(store)::LuceneLock* writeLock;
+
+ void _IndexWriter(const bool create); // NOTE(review): presumably the shared constructor body - confirm in the .cpp
+
+ void _finalize();
+
+ // where this index resides
+ CL_NS(store)::Directory* directory;
+
+
+ int32_t getSegmentsCounter() { return segmentInfos.counter; } // current value of segmentInfos.counter
+ int32_t maxFieldLength;
+ int32_t mergeFactor;
+ int32_t minMergeDocs; // also read/written by getMaxBufferedDocs()/setMaxBufferedDocs()
+ int32_t maxMergeDocs;
+ int32_t termIndexInterval;
+
+ int64_t writeLockTimeout;
+ int64_t commitLockTimeout;
+public:
+ DEFINE_MUTEX(THIS_LOCK)
+
+ // The SegmentInfos of this index; getSegmentsCounter() reads its counter.
+ SegmentInfos segmentInfos;
+
+ // Release the write lock, if needed.
+ ~IndexWriter();
+
+ /**
+ * The Java implementation of Lucene silently truncates any tokenized
+ * field if the number of tokens exceeds a certain threshold. Although
+ * that threshold is adjustable, it is easy for the client programmer
+ * to be unaware that such a threshold exists, and to become its
+ * unwitting victim.
+ * CLucene implements a less insidious truncation policy. Up to
+ * DEFAULT_MAX_FIELD_LENGTH tokens, CLucene behaves just as JLucene
+ * does. If the number of tokens exceeds that threshold without any
+ * indication of a truncation preference by the client programmer,
+ * CLucene raises an exception, prompting the client programmer to
+ * explicitly set a truncation policy by adjusting maxFieldLength.
+ */
+ LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_FIELD_LENGTH = 10000);
+ LUCENE_STATIC_CONSTANT(int32_t, FIELD_TRUNC_POLICY__WARN = -1); // NOTE(review): presumably a special maxFieldLength value - confirm usage
+ int32_t getMaxFieldLength() const{ return maxFieldLength; }
+ void setMaxFieldLength(int32_t val){ maxFieldLength = val; }
+
+ /**
+ * Default value is 10. Change using {@link #setMaxBufferedDocs(int)}.
+ */
+ LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_BUFFERED_DOCS = 10);
+ /** Determines the minimal number of documents required before the buffered
+ * in-memory documents are merged and a new Segment is created.
+ * Since Documents are merged in a {@link RAMDirectory},
+ * large value gives faster indexing. At the same time, mergeFactor limits
+ * the number of files open in a FSDirectory.
+ *
+ * <p> The default value is DEFAULT_MAX_BUFFERED_DOCS.*/
+ void setMaxBufferedDocs(int32_t val){ minMergeDocs = val; }
+ /**
+ * @see #setMaxBufferedDocs
+ */
+ int32_t getMaxBufferedDocs(){ return minMergeDocs; }
+
+ /**
+ * Default value for the write lock timeout (1,000).
+ */
+ LUCENE_STATIC_CONSTANT(int64_t, WRITE_LOCK_TIMEOUT = 1000);
+ /**
+ * Sets the maximum time to wait for a write lock (in milliseconds).
+ */
+ void setWriteLockTimeout(int64_t writeLockTimeout)
+ { this->writeLockTimeout = writeLockTimeout; }
+ /**
+ * @see #setWriteLockTimeout
+ */
+ int64_t getWriteLockTimeout() { return writeLockTimeout; }
+
+ /**
+ * Default value for the commit lock timeout (10,000).
+ */
+ LUCENE_STATIC_CONSTANT(int64_t, COMMIT_LOCK_TIMEOUT = 10000);
+ /**
+ * Sets the maximum time to wait for a commit lock (in milliseconds).
+ */
+ void setCommitLockTimeout(int64_t commitLockTimeout)
+ { this->commitLockTimeout = commitLockTimeout; }
+ /**
+ * @see #setCommitLockTimeout
+ */
+ int64_t getCommitLockTimeout() { return commitLockTimeout; }
+
+ static const QLatin1String WRITE_LOCK_NAME; //"write.lock";
+ static const QLatin1String COMMIT_LOCK_NAME; //"commit.lock";
+
+ /**
+ * Default value is 10. Change using {@link #setMergeFactor(int)}.
+ */
+ LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MERGE_FACTOR = 10);
+ /** Determines how often segment indices are merged by addDocument(). With
+ * smaller values, less RAM is used while indexing, and searches on
+ * unoptimized indices are faster, but indexing speed is slower. With larger
+ * values more RAM is used while indexing and searches on unoptimized indices
+ * are slower, but indexing is faster. Thus larger values (> 10) are best
+ * for batched index creation, and smaller values (< 10) for indices that are
+ * interactively maintained.
+ *
+ * <p>This must never be less than 2. The default value is 10.
+ */
+ int32_t getMergeFactor() const{ return mergeFactor; }
+ void setMergeFactor(int32_t val){ mergeFactor = val; }
+
+
+ /** Expert: The fraction of terms in the "dictionary" which should be stored
+ * in RAM. Smaller values use more memory, but make searching slightly
+ * faster, while larger values use less memory and make searching slightly
+ * slower. Searching is typically not dominated by dictionary lookup, so
+ * tweaking this is rarely useful.
+ */
+ LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_TERM_INDEX_INTERVAL = 128);
+ /** Expert: Set the interval between indexed terms. Large values cause less
+ * memory to be used by IndexReader, but slow random-access to terms. Small
+ * values cause more memory to be used by an IndexReader, and speed
+ * random-access to terms.
+ *
+ * This parameter determines the amount of computation required per query
+ * term, regardless of the number of documents that contain that term. In
+ * particular, it is the maximum number of other terms that must be
+ * scanned before a term is located and its frequency and position information
+ * may be processed. In a large index with user-entered query terms, query
+ * processing time is likely to be dominated not by term lookup but rather
+ * by the processing of frequency and positional data. In a small index
+ * or when many uncommon query terms are generated (e.g., by wildcard
+ * queries) term lookup may become a dominant cost.
+ *
+ * In particular, <code>numUniqueTerms/interval</code> terms are read into
+ * memory by an IndexReader, and, on average, <code>interval/2</code> terms
+ * must be scanned for each random term access.
+ *
+ * @see #DEFAULT_TERM_INDEX_INTERVAL
+ */
+ void setTermIndexInterval(int32_t interval) { termIndexInterval = interval; }
+ /** Expert: Return the interval between indexed terms.
+ *
+ * @see #setTermIndexInterval(int)
+ */
+ int32_t getTermIndexInterval() { return termIndexInterval; }
+
+ /** Determines the minimal number of documents required before the buffered
+ * in-memory documents are merged and a new Segment is created.
+ * Since Documents are merged in a {@link RAMDirectory},
+ * large value gives faster indexing. At the same time, mergeFactor limits
+ * the number of files open in a FSDirectory.
+ * This is the same field that setMaxBufferedDocs()/getMaxBufferedDocs() use.
+ * <p> The default value is 10.*/
+ int32_t getMinMergeDocs() const{ return minMergeDocs; }
+ void setMinMergeDocs(int32_t val){ minMergeDocs = val; }
+
+ /** Determines the largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ * <p>The default value is {@link #DEFAULT_MAX_MERGE_DOCS}.
+ */
+ LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_MERGE_DOCS = LUCENE_INT32_MAX_SHOULDBE);
+ /**Determines the largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ * <p>The default value is {@link #DEFAULT_MAX_MERGE_DOCS}.
+ */
+ int32_t getMaxMergeDocs() const{ return maxMergeDocs; }
+ void setMaxMergeDocs(int32_t val){ maxMergeDocs = val; }
+
+ /**
+ * Constructs an IndexWriter for the index in <code>path</code>.
+ * Text will be analyzed with <code>a</code>. If <code>create</code>
+ * is true, then a new, empty index will be created in
+ * <code>path</code>, replacing the index already there, if any.
+ *
+ * @param path the path to the index directory
+ * @param a the analyzer to use
+ * @param create <code>true</code> to create the index or overwrite
+ * the existing one; <code>false</code> to append to the existing
+ * index
+ * @throws IOException if the directory cannot be read/written to, or
+ * if it does not exist, and <code>create</code> is
+ * <code>false</code>
+ */
+ IndexWriter(const QString& path, CL_NS(analysis)::Analyzer* a,
+ const bool create, const bool closeDir = true); // closeDir defaults to true for the path overload
+
+
+ /**Constructs an IndexWriter for the index in <code>d</code>. Text will be
+ * analyzed with <code>a</code>. If <code>create</code> is true, then a new,
+ * empty index will be created in <code>d</code>, replacing the index already
+ * there, if any.
+ */
+ IndexWriter(CL_NS(store)::Directory* d, CL_NS(analysis)::Analyzer* a,
+ const bool create, const bool closeDir = false); // closeDir defaults to false for the Directory overload
+
+ // Flushes all changes to an index, closes all associated files, and closes
+ // the directory that the index is stored in.
+ void close();
+
+ // Returns the number of documents currently in this index. synchronized
+ int32_t docCount();
+
+
+ // Adds a document to this index, using the provided analyzer instead of
+ // the value of {@link #getAnalyzer()}. If the document contains more than
+ // {@link #setMaxFieldLength(int)} terms for a given field, the remainder
+ // are discarded.
+ void addDocument(CL_NS(document)::Document* doc,
+ CL_NS(analysis)::Analyzer* analyzer = NULL); // NOTE(review): NULL presumably selects getAnalyzer() - confirm
+
+
+ // Merges all segments together into a single segment, optimizing an index
+ // for search. synchronized
+ void optimize();
+
+
+ /**Merges all segments from an array of indices into this index.
+ *
+ * <p>This may be used to parallelize batch indexing. A large document
+ * collection can be broken into sub-collections. Each sub-collection can be
+ * indexed in parallel, on a different thread, process or machine. The
+ * complete index can then be created by merging sub-collection indices
+ * with this method.
+ *
+ * <p>After this completes, the index is optimized.
+ *@synchronized
+ */
+ void addIndexes(CL_NS(store)::Directory** dirs); // NOTE(review): no length parameter; presumably NULL-terminated - confirm
+
+ /** Merges the provided indexes into this index.
+ * <p>After this completes, the index is optimized. </p>
+ * <p>The provided IndexReaders are not closed.</p>
+ */
+ void addIndexes(IndexReader** readers); // NOTE(review): no length parameter; presumably NULL-terminated - confirm
+
+
+ /** Returns the directory this index resides in. */
+ CL_NS(store)::Directory* getDirectory() { return directory; }
+
+ /** Get the current setting of whether to use the compound file format.
+ * Note that this just returns the value you set with setUseCompoundFile(boolean)
+ * or the default. You cannot use this to query the status of an existing index.
+ * @see #setUseCompoundFile(boolean)
+ */
+ bool getUseCompoundFile() { return useCompoundFile; }
+
+ /** Setting to turn on usage of a compound file. When on, multiple files
+ * for each segment are merged into a single file once the segment creation
+ * is finished. This is done regardless of what directory is in use.
+ */
+ void setUseCompoundFile(bool value) { useCompoundFile = value; }
+
+
+ /** Expert: Set the Similarity implementation used by this IndexWriter.
+ *
+ * @see Similarity#setDefault(Similarity)
+ */
+ void setSimilarity(CL_NS(search)::Similarity* similarity)
+ { this->similarity = similarity; }
+
+ /** Expert: Return the Similarity implementation used by this IndexWriter.
+ *
+ * <p>This defaults to the current value of {@link Similarity#getDefault()}.
+ */
+ CL_NS(search)::Similarity* getSimilarity() { return this->similarity; }
+
+ /** Returns the analyzer used by this index. */
+ CL_NS(analysis)::Analyzer* getAnalyzer() { return analyzer; }
+
+private:
+ /** Merges all RAM-resident segments. */
+ void flushRamSegments();
+
+ /** Incremental segment merger. */
+ void maybeMergeSegments();
+
+ // Pops segments off of segmentInfos stack down to minSegment, merges them,
+ // and pushes the merged index onto the top of the segmentInfos stack.
+ void mergeSegments(const uint32_t minSegment);
+
+ // Merges the named range of segments, replacing them in the stack with a
+ // single segment.
+ void mergeSegments(const uint32_t minSegment, const uint32_t end);
+
+ // Some operating systems (e.g. Windows) don't permit a file to be deleted
+ // while it is opened for read (e.g. by another process or thread). So we
+ // assume that when a delete fails it is because the file is open in another
+ // process, and queue the file for subsequent deletion.
+ void deleteSegments(CL_NS(util)::CLVector<SegmentReader*>* segments);
+
+ void deleteFiles(const QStringList& files);
+ void readDeleteableFiles(QStringList& files);
+ void deleteFiles(const QStringList& files, QStringList& deletable);
+ void deleteFiles(const QStringList& files, CL_NS(store)::Directory* directory);
+ void writeDeleteableFiles(const QStringList& files);
+
+ // synchronized
+ QString newSegmentName();
+};
+
+CL_NS_END
+
+#endif