summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/clucene/src/CLucene/index/IndexReader.h
blob: 0fe79d08309e27ea336c65ddbd73f0050e7ffa75 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
/*
 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
 *
 * Distributable under the terms of either the Apache License (Version 2.0) or 
 * the GNU Lesser General Public License, as specified in the COPYING file.
 *
 * Changes are Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies), all rights reserved.
*/
#ifndef _lucene_index_IndexReader_
#define _lucene_index_IndexReader_

#if defined(_LUCENE_PRAGMA_ONCE)
#   pragma once
#endif

#include <QtCore/QString>

#include "CLucene/store/Directory.h"
#include "CLucene/store/FSDirectory.h"
#include "CLucene/store/Lock.h"
#include "CLucene/document/Document.h"
#include "CLucene/index/TermVector.h"
#include "SegmentInfos.h"
#include "Terms.h"


CL_NS_DEF(index)


/** IndexReader is an abstract class, providing an interface for accessing an
 index.  Search of an index is done entirely through this abstract interface,
 so that any subclass which implements it is searchable.

 <p> Concrete subclasses of IndexReader are usually constructed with a call to
 one of the static <code>open()</code> methods, e.g. {@link #open(String)}.

 <p> For efficiency, in this API documents are often referred to via
 <i>document numbers</i>, non-negative integers which each name a unique
 document in the index.  These document numbers are ephemeral--they may change
 as documents are added to and deleted from an index.  Clients should thus not
 rely on a given document having the same number between sessions.
 
 <p> An IndexReader can be opened on a directory for which an IndexWriter is
 opened already, but it cannot be used to delete documents from the index then.
*/
class IndexReader : LUCENE_BASE
{
public:
	//Callback for classes that need to know if IndexReader is closing.
	typedef void (*CloseCallback)(IndexReader*, void*);

	class CloseCallbackCompare:public CL_NS(util)::Compare::_base{
	public:
		bool operator()( CloseCallback t1, CloseCallback t2 ) const{
			return t1 > t2;
		}
		static void doDelete(CloseCallback dummy){
		}
	};
	
	
	enum FieldOption {
		// all fields
		ALL = 1,
		// all indexed fields
		INDEXED = 2,
		// all fields which are not indexed
		UNINDEXED = 4,
		// all fields which are indexed with termvectors enables
		INDEXED_WITH_TERMVECTOR = 8,
		// all fields which are indexed but don't have termvectors enabled
		INDEXED_NO_TERMVECTOR = 16,
		// all fields where termvectors are enabled. Please note that only standard termvector fields are returned
		TERMVECTOR = 32,
		// all field with termvectors wiht positions enabled
		TERMVECTOR_WITH_POSITION = 64,
		// all fields where termvectors with offset position are set
		TERMVECTOR_WITH_OFFSET = 128,
		// all fields where termvectors with offset and position values set
		TERMVECTOR_WITH_POSITION_OFFSET = 256
	};


private:
    bool stale;
    bool hasChanges;
    bool closeDirectory;
    bool directoryOwner;
    
    SegmentInfos* segmentInfos;
    CL_NS(store)::Directory* directory;
	CL_NS(store)::LuceneLock* writeLock;

    typedef CL_NS(util)::CLSet<CloseCallback, void*, CloseCallbackCompare,
		CloseCallbackCompare> CloseCallbackMap;
    CloseCallbackMap closeCallbacks;
	
    /** Internal use. Implements commit */
    virtual void doCommit() = 0;
  
    /**
	* Tries to acquire the WriteLock on this directory.
	* this method is only valid if this IndexReader is directory owner.
	* 
	* @throws IOException If WriteLock cannot be acquired.
	*/
    void aquireWriteLock();
protected:
    /**
    * Constructor used if IndexReader is not owner of its directory. 
    * This is used for IndexReaders that are used within other IndexReaders that take care or locking directories.
    * 
    * @param directory Directory where IndexReader files reside.
    */
    IndexReader(CL_NS(store)::Directory* dir);

    /**
    * Constructor used if IndexReader is owner of its directory.
    * If IndexReader is owner of its directory, it locks its directory in case of write operations.
    * 
    * @param directory Directory where IndexReader files reside.
    * @param segmentInfos Used for write-l
    * @param closeDirectory
    */
	IndexReader(CL_NS(store)::Directory* directory, SegmentInfos* segmentInfos, bool closeDirectory);
	

	/// Implements close. 
	virtual void doClose() = 0;

    /** Implements setNorm in subclass.*/
    virtual void doSetNorm(int32_t doc, const TCHAR* field, uint8_t value) = 0;
      
    /** Implements actual undeleteAll() in subclass. */
    virtual void doUndeleteAll() = 0;


    /** Implements deletion of the document numbered <code>docNum</code>.
	* Applications should call {@link #deleteDocument(int32_t)} or {@link #deleteDocuments(Term*)}.
	*/
    virtual void doDelete(const int32_t docNum) = 0;

public:

	DEFINE_MUTEX(THIS_LOCK)
	
	///Do not access this directly, only public so that MultiReader can access it
	virtual void commit();
	
	
	/** Undeletes all documents currently marked as deleted in this index.*/
	void undeleteAll();

	/**
	* Get a list of unique field names that exist in this index and have the specified
	* field option information.
	* @param fldOption specifies which field option should be available for the returned fields
	* @return Collection of Strings indicating the names of the fields.
	* @see IndexReader.FieldOption
	*/
	virtual void getFieldNames(FieldOption fldOption, CL_NS(util)::StringArrayWithDeletor& retarray) = 0;

	_CL_DEPRECATED( getFieldNames(FieldOption, StringArrayWithDeletor&) ) virtual TCHAR** getFieldNames();
	_CL_DEPRECATED( getFieldNames(FieldOption, StringArrayWithDeletor&) ) virtual TCHAR** getFieldNames(bool indexed);

	/** Returns the byte-encoded normalization factor for the named field of
	* every document.  This is used by the search code to score documents.
	*
	* The number of bytes returned is the size of the IndexReader->maxDoc()
	* MEMORY: The values are cached, so don't delete the returned byte array.
	* @see Field#setBoost(qreal)
	*/
	virtual uint8_t* norms(const TCHAR* field) = 0;
	
	
	/** Reads the byte-encoded normalization factor for the named field of every
	*  document.  This is used by the search code to score documents.
	*
	* @see Field#setBoost(qreal)
	*/
	virtual void norms(const TCHAR* field, uint8_t* bytes) = 0;

    /** Expert: Resets the normalization factor for the named field of the named
    * document.
    *
    * @see #norms(TCHAR*)
    * @see Similarity#decodeNorm(uint8_t)
    */
    void setNorm(int32_t doc, const TCHAR* field, qreal value);
  
    /** Expert: Resets the normalization factor for the named field of the named
    * document.  The norm represents the product of the field's {@link
    * Field#setBoost(qreal) boost} and its {@link Similarity#lengthNorm(TCHAR*,
    * int32_t) length normalization}.  Thus, to preserve the length normalization
    * values when resetting this, one should base the new value upon the old.
    *
    * @see #norms(TCHAR*)
    * @see Similarity#decodeNorm(uint8_t)
    */
    void setNorm(int32_t doc, const TCHAR* field, uint8_t value);

	/// Release the write lock, if needed. 
    virtual ~IndexReader();

	/// Returns an IndexReader reading the index in an FSDirectory in the named path. 
	static IndexReader* open(const QString& path);

	/// Returns an IndexReader reading the index in the given Directory. 
	static IndexReader* open( CL_NS(store)::Directory* directory, bool closeDirectory=false);

	/** 
	* Returns the time the index in the named directory was last modified.
	* Do not use this to check whether the reader is still up-to-date, use
	* {@link #isCurrent()} instead. 
	*/
	static uint64_t lastModified(const QString& directory);

	/** 
	* Returns the time the index in the named directory was last modified. 
	* Do not use this to check whether the reader is still up-to-date, use
	* {@link #isCurrent()} instead. 
	*/
	static uint64_t lastModified(const CL_NS(store)::Directory* directory);

	
	/**
	* Reads version number from segments files. The version number is
	* initialized with a timestamp and then increased by one for each change of
	* the index.
	* 
	* @param directory where the index resides.
	* @return version number.
	* @throws IOException if segments file cannot be read
	*/
	static int64_t getCurrentVersion(CL_NS(store)::Directory* directory);
	
	/**
   * Reads version number from segments files. The version number is
   * initialized with a timestamp and then increased by one for each change of
   * the index.
   * 
   * @param directory where the index resides.
   * @return version number.
   * @throws IOException if segments file cannot be read
   */
	static int64_t getCurrentVersion(const QString& directory);
	
	/**
	* Version number when this IndexReader was opened.
	*/
	int64_t getVersion();
	
	/**
	* Check whether this IndexReader still works on a current version of the index.
	* If this is not the case you will need to re-open the IndexReader to
	* make sure you see the latest changes made to the index.
	* 
	* @throws IOException
	*/
	bool isCurrent();


	/**
	*  Return an array of term frequency vectors for the specified document.
	*  The array contains a vector for each vectorized field in the document.
	*  Each vector contains terms and frequencies for all terms in a given vectorized field.
	*  If no such fields existed, the method returns null. The term vectors that are
	* returned my either be of type TermFreqVector or of type TermPositionsVector if
	* positions or offsets have been stored.
	* 
	* @param docNumber document for which term frequency vectors are returned
	* @return array of term frequency vectors. May be null if no term vectors have been
	*  stored for the specified document.
	* @throws IOException if index cannot be accessed
	* @see org.apache.lucene.document.Field.TermVector
	*/
	virtual bool getTermFreqVectors(int32_t docNumber, Array<TermFreqVector*>& result) =0;
	
	/**
	*  Return a term frequency vector for the specified document and field. The
	*  returned vector contains terms and frequencies for the terms in
	*  the specified field of this document, if the field had the storeTermVector
	*  flag set. If termvectors had been stored with positions or offsets, a 
	*  TermPositionsVector is returned.
	* 
	* @param docNumber document for which the term frequency vector is returned
	* @param field field for which the term frequency vector is returned.
	* @return term frequency vector May be null if field does not exist in the specified
	* document or term vector was not stored.
	* @throws IOException if index cannot be accessed
	* @see org.apache.lucene.document.Field.TermVector
	*/
	virtual TermFreqVector* getTermFreqVector(int32_t docNumber, const TCHAR* field) = 0;
	
	/**
	* Returns <code>true</code> if an index exists at the specified directory.
	* If the directory does not exist or if there is no index in it.
	* @param  directory the directory to check for an index
	* @return <code>true</code> if an index exists; <code>false</code> otherwise
	*/
	static bool indexExists(const QString& directory);

    /**
	* Returns <code>true</code> if an index exists at the specified directory.
	* If the directory does not exist or if there is no index in it.
	* @param  directory the directory to check for an index
	* @return <code>true</code> if an index exists; <code>false</code> otherwise
	* @throws IOException if there is a problem with accessing the index
	*/
	static bool indexExists(const CL_NS(store)::Directory* directory);

	/** Returns the number of documents in this index. */
  	virtual int32_t numDocs() = 0;

	/** Returns one greater than the largest possible document number.
	* This may be used to, e.g., determine how big to allocate an array which
	* will have an element for every document number in an index.
	*/
	virtual int32_t maxDoc() const = 0;

	/** Gets the stored fields of the <code>n</code><sup>th</sup>
   	 * <code>Document</code> in this index. 
   	 * The fields are not cleared before retrieving the document, so the
   	 * object should be new or just cleared.
   	 */ 
	virtual bool document(int32_t n, CL_NS(document)::Document*) =0;
	
	_CL_DEPRECATED( document(i, document) ) CL_NS(document)::Document* document(const int32_t n);

	/** Returns true if document <i>n</i> has been deleted */
  	virtual bool isDeleted(const int32_t n) = 0;

	/** Returns true if any documents have been deleted */
	virtual bool hasDeletions() const = 0;

	/** Returns true if there are norms stored for this field. */
	virtual bool hasNorms(const TCHAR* field);

	/** Returns an enumeration of all the terms in the index.
	* The enumeration is ordered by Term.compareTo().  Each term
	* is greater than all that precede it in the enumeration.
	* @memory Caller must clean up
	*/
	virtual TermEnum* terms() const =0;

	/** Returns an enumeration of all terms after a given term.
	* The enumeration is ordered by Term.compareTo().  Each term
	* is greater than all that precede it in the enumeration.
	* @memory Caller must clean up
	*/
	virtual TermEnum* terms(const Term* t) const = 0;

	/** Returns the number of documents containing the term <code>t</code>. */
	virtual int32_t docFreq(const Term* t) const = 0;

	/* Returns an unpositioned TermPositions enumerator.
	 * @memory Caller must clean up
	 */
	virtual TermPositions* termPositions() const = 0;
	
    /** Returns an enumeration of all the documents which contain
	* <code>term</code>.  For each document, in addition to the document number
	* and frequency of the term in that document, a list of all of the ordinal
	* positions of the term in the document is available.  Thus, this method
	* implements the mapping:
	*
	* <p><ul>
	* Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq,
	* &lt;pos<sub>1</sub>, pos<sub>2</sub>, ...
	* pos<sub>freq-1</sub>&gt;
	* &gt;<sup>*</sup>
	* </ul>
	* <p> This positional information faciliates phrase and proximity searching.
	* <p>The enumeration is ordered by document number.  Each document number is
	* greater than all that precede it in the enumeration.
	 * @memory Caller must clean up
	*/
	TermPositions* termPositions(Term* term) const;

	/** Returns an unpositioned {@link TermDocs} enumerator. 
	 * @memory Caller must clean up
	 */
	virtual TermDocs* termDocs() const = 0;

	/** Returns an enumeration of all the documents which contain
	* <code>term</code>. For each document, the document number, the frequency of
	* the term in that document is also provided, for use in search scoring.
	* Thus, this method implements the mapping:
	* <p><ul>Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq&gt;<sup>*</sup></ul>
	* <p>The enumeration is ordered by document number.  Each document number
	* is greater than all that precede it in the enumeration.
	 * @memory Caller must clean up
	*/
	TermDocs* termDocs(Term* term) const;

	/** Deletes the document numbered <code>docNum</code>.  Once a document is
	* deleted it will not appear in TermDocs or TermPostitions enumerations.
	* Attempts to read its field with the {@link #document}
	* method will result in an error.  The presence of this document may still be
	* reflected in the {@link #docFreq} statistic, though
	* this will be corrected eventually as the index is further modified.
	*/
	void deleteDocument(const int32_t docNum);

	///@deprecated. Use deleteDocument instead.
	_CL_DEPRECATED( deleteDocument ) void deleteDoc(const int32_t docNum)
    { deleteDocument(docNum); }

	/** Deletes all documents containing <code>term</code>.
	* This is useful if one uses a document field to hold a unique ID string for
	* the document.  Then to delete such a document, one merely constructs a
	* term with the appropriate field and the unique ID string as its text and
	* passes it to this method.
	* See {@link #deleteDocument(int)} for information about when this deletion will 
	* become effective.
	* @return the number of documents deleted
	*/
	int32_t deleteDocuments(Term* term);

	///@deprecated. Use deleteDocuments instead.
	_CL_DEPRECATED( deleteDocuments ) int32_t deleteTerm(Term* term){ return deleteDocuments(term); }

	/** 
	* Closes files associated with this index and also saves any new deletions to disk.
    * No other methods should be called after this has been called.
    */
	void close();

	///Checks if the index in the named directory is currently locked.       
	static bool isLocked(CL_NS(store)::Directory* directory);
	
	///Checks if the index in the named directory is currently locked.       
	static bool isLocked(const QString& directory);


	///Forcibly unlocks the index in the named directory.
	///Caution: this should only be used by failure recovery code,
	///when it is known that no other process nor thread is in fact
	///currently accessing this index.
	static void unlock(CL_NS(store)::Directory* directory);
	static void unlock(const QString& path);

	 /** Returns the directory this index resides in. */
	CL_NS(store)::Directory* getDirectory() { return directory; }

	/** Returns true if the file is a lucene filename (based on extension or filename) */
	static bool isLuceneFile(const QString& filename);

	/**
	* For classes that need to know when the IndexReader closes (such as caches, etc),
	* should pass their callback function to this.
	*/
	void addCloseCallback(CloseCallback callback, void* parameter);

protected:
	class LockWith : public CL_NS(store)::LuceneLockWith<IndexReader*>
    {
	public:
		LockWith(CL_NS(store)::LuceneLock* lock, CL_NS(store)::Directory* dir);

		//Reads the segmentinfo file and depending on the number of segments found
		//it returns a MultiReader or a SegmentReader
		IndexReader* doBody();

    private:
		CL_NS(store)::Directory* directory;
	};
	friend class IndexReader::LockWith;

    class CommitLockWith : public CL_NS(store)::LuceneLockWith<void>
    {
	public:
	    CommitLockWith(CL_NS(store)::LuceneLock* lock, IndexReader* r);
	    void doBody();

    private:
	    IndexReader* reader;
    };
	friend class IndexReader::CommitLockWith;
};

CL_NS_END
#endif