// Source: root/src/3rdparty/clucene/src/CLucene/index/SegmentHeader.h
// (blob b0055364c16d2d09575b94b78d01c1a94e50b284)
/*
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
*
* Changes are Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies), all rights reserved.
*/
#ifndef _lucene_index_SegmentHeader_
#define _lucene_index_SegmentHeader_

#if defined(_LUCENE_PRAGMA_ONCE)
#   pragma once
#endif

#include <QtCore/QString>
#include <QtCore/QStringList>

#include "SegmentInfos.h"
#include "CLucene/util/BitSet.h"
#include "CLucene/util/VoidMap.h"
#include "Term.h"
#include "FieldInfos.h"
#include "FieldsReader.h"
#include "IndexReader.h"
#include "TermInfosReader.h"
#include "CompoundFile.h"
#include "CLucene/util/ThreadLocal.h"

CL_NS_DEF(index)

class SegmentReader;

// TermDocs implementation that is constructed from a SegmentReader (see the
// constructor below). Only the declaration lives in this header; apart from
// the two empty protected hooks at the end, member functions are defined
// elsewhere.
class SegmentTermDocs : public virtual TermDocs
{
    // Private bookkeeping used by the out-of-line implementation.
    // NOTE(review): the names suggest skip-data state (interval, counters,
    // a dedicated skip stream and file pointers) -- confirm against the
    // implementation file before relying on that reading.
    int32_t _doc;
    int32_t skipInterval;
    int32_t numSkips;
    int32_t skipCount;
    CL_NS(store)::IndexInput* skipStream;
    int32_t skipDoc;
    int64_t freqPointer;
    int64_t proxPointer;
    int64_t skipPointer;
    bool haveSkipped;

protected:
    // SegmentReader parent
    const SegmentReader* parent;
    // State shared with subclasses (SegmentTermPositions derives from this
    // class). NOTE(review): ownership of freqStream and deletedDocs is not
    // visible from this header -- confirm who closes/deletes them.
    CL_NS(store)::IndexInput* freqStream;
    int32_t count;
    int32_t df;
    int32_t _freq;
    CL_NS(util)::BitSet* deletedDocs;
public:
    virtual ~SegmentTermDocs();

    // Three seek overloads: by term enumerator, by term, and by TermInfo.
    virtual void seek(TermEnum* termEnum);
    virtual void seek(Term* term);
    virtual void seek(const TermInfo* ti);

    virtual void close();
    // Accessors for the current document number and frequency.
    // NOTE(review): presumably backed by _doc and _freq -- confirm.
    virtual int32_t doc()const;
    virtual int32_t freq()const;

    virtual bool next();

    /** Optimized implementation. */
    virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t length);

    /** Optimized implementation. */
    virtual bool skipTo(const int32_t target);

    virtual TermPositions* __asTermPositions();

    // \param Parent must be a segment reader
    SegmentTermDocs( const SegmentReader* Parent);
protected:
    // Hooks with empty bodies in this class; SegmentTermPositions declares
    // its own versions of both.
    virtual void skippingDoc(){}
    virtual void skipProx(int64_t proxPointer){}
};


// Combines SegmentTermDocs with the TermPositions interface. Adds a prox
// stream plus position counters and declares its own seek/close/next/read
// and the two skip hooks; the remaining members simply forward to
// SegmentTermDocs (see the inline definitions below).
class SegmentTermPositions : public SegmentTermDocs, public TermPositions
{
private:
    // NOTE(review): presumably the input over the segment's prox file and
    // the running position state for the current document -- confirm against
    // the implementation file.
    CL_NS(store)::IndexInput* proxStream;
    int32_t proxCount;
    int32_t position;

public:
    // \param Parent must be a segment reader
    SegmentTermPositions(const SegmentReader* Parent);
    ~SegmentTermPositions();

    void seek(const TermInfo* ti);
    void close();
    int32_t nextPosition();
    bool next();
    int32_t read(int32_t* docs, int32_t* freqs, int32_t length);
    virtual TermDocs* __asTermDocs();
    virtual TermPositions* __asTermPositions();

    // Resolve the SegmentTermDocs/TermPositions ambiguity: each of these
    // explicitly delegates to the SegmentTermDocs implementation.
    void seek(Term* term){ SegmentTermDocs::seek(term); }
    void seek(TermEnum* termEnum){ SegmentTermDocs::seek(termEnum); }
    int32_t doc() const{ return SegmentTermDocs::doc(); }
    int32_t freq() const{ return SegmentTermDocs::freq(); }
    bool skipTo(const int32_t target){ return SegmentTermDocs::skipTo(target); }

protected:
    // Replaces the empty hook declared in SegmentTermDocs.
    void skippingDoc();
    /** Called by super.skipTo(). */
    void skipProx(int64_t proxPointer);
};

// An IndexReader responsible for reading one segment of an index.
// Only declarations live in this header; member functions are defined
// elsewhere.
class SegmentReader : public IndexReader
{
    /**
    * The class Norm represents the normalizations for a field.
    * These normalizations are read from an IndexInput into an array of
    * bytes called bytes.
    */
    class Norm : LUCENE_BASE
    {
        int32_t number;
        SegmentReader* reader;
        QString segment; // segment name
    
    public:
        // Public state accessed directly by SegmentReader.
        // NOTE(review): ownership of `in` and `bytes` is not visible from
        // this header -- confirm in ~Norm().
        CL_NS(store)::IndexInput* in;
        uint8_t* bytes;
        bool dirty;
        //Constructor
        Norm(CL_NS(store)::IndexInput* instrm, int32_t number,
            SegmentReader* reader, const QString& segment);
        //Destructor
        ~Norm();

        // NOTE(review): presumably writes the norm bytes back to storage
        // when `dirty` is set -- confirm against the implementation.
        void reWrite();
    };
    friend class SegmentReader::Norm;

    //Holds the name of the segment that is being read
    QString segment;

    //Indicates if there are documents marked as deleted
    bool deletedDocsDirty;
    // NOTE(review): presumably pending-change flags consumed by doCommit()
    // -- confirm against the implementation.
    bool normsDirty;
    bool undeleteAll;

    //Holds all norms for all fields in the segment, keyed by field name
    //(const TCHAR*) and compared with the TChar comparators.
    typedef CL_NS(util)::CLHashtable<const TCHAR*,
        Norm*,CL_NS(util)::Compare::TChar, CL_NS(util)::Equals::TChar> NormsType;
    NormsType _norms; 

    // NOTE(review): `ones` is presumably the cached buffer handed out by
    // fakeNorms() (see norms() below, which mentions fake norms) -- confirm.
    uint8_t* ones;
    uint8_t* fakeNorms();

    // Compound File Reader when based on a compound file segment
    CompoundFileReader* cfsReader;
    // Reads the Field Info file
    FieldsReader* fieldsReader;
    // The initial TermVectorsReader and its per-thread storage; see
    // getTermVectorsReader() below.
    TermVectorsReader* termVectorsReaderOrig;
    CL_NS(util)::ThreadLocal<TermVectorsReader*,
        CL_NS(util)::Deletor::Object<TermVectorsReader> >termVectorsLocal;

    // Shared set-up routine taking the segment's SegmentInfo.
    // NOTE(review): presumably called by both constructors -- confirm.
    void initialize(SegmentInfo* si);

    // Create a clone from the initial TermVectorsReader and store it in the
    // ThreadLocal. @return TermVectorsReader
    TermVectorsReader* getTermVectorsReader();

protected:
    // Marks document docNum as deleted
    void doDelete(const int32_t docNum);
    void doUndeleteAll();
    void doCommit();
    void doSetNorm(int32_t doc, const TCHAR* field, uint8_t value);

    // can return null if norms aren't stored
    uint8_t* getNorms(const TCHAR* field);

public:
    /**
    Constructor.
    Opens all files of a segment
    .fnm     -> Field Info File
    Field names are stored in the field info file, with suffix .fnm.
    .frq     -> Frequency File
    The .frq file contains the lists of documents which contain
    each term, along with the frequency of the term in that document.
    .prx     -> Prox File
    The prox file contains the lists of positions that each term occurs
    at within documents.
    .tis     -> Term Info File
    This file is sorted by Term. Terms are ordered first lexicographically
    by the term's field name, and within that lexicographically by the term's text.
    .del     -> Deletion File
    The .del file is optional, and only exists when a segment contains deletions
    .f[0-9]* -> Norm File
    Contains, for each document, a byte that encodes a value that is
    multiplied into the score for hits on that field.
    */
    SegmentReader(SegmentInfo* si);

    // Overload that additionally receives the SegmentInfos collection.
    SegmentReader(SegmentInfos* sis, SegmentInfo* si);
    // Destructor.
    virtual ~SegmentReader();

    // Closes all streams to the files of a single segment
    void doClose();

    // Checks if a segment managed by SegmentInfo si has deletions
    static bool hasDeletions(const SegmentInfo* si);
    // Instance counterpart of the static check above, for this reader.
    bool hasDeletions() const;
    bool hasNorms(const TCHAR* field) const;

    // Returns all file names managed by this SegmentReader
    // (results are delivered through retarray).
    void files(QStringList& retarray);
    // Returns an enumeration of all the Terms and TermInfos in the set.
    TermEnum* terms() const;
    // Returns an enumeration of terms starting at or after the named term t
    TermEnum* terms(const Term* t) const;

    // Gets the document identified by n
    // NOTE(review): the meaning of the bool result is not visible here --
    // presumably success/failure; confirm.
    bool document(int32_t n, CL_NS(document)::Document* doc);

    // Checks if the n-th document has been marked deleted
    bool isDeleted(const int32_t n);

    // Returns an unpositioned TermDocs enumerator.
    TermDocs* termDocs() const;
    // Returns an unpositioned TermPositions enumerator.
    TermPositions* termPositions() const;

    // Returns the number of documents which contain the term t
    int32_t docFreq(const Term* t) const;

    // Returns the actual number of documents in the segment
    int32_t numDocs();
    // Returns the number of all the documents in the segment including the
    // ones that have been marked deleted
    int32_t maxDoc() const;

    // Returns the bytes array that holds the norms of a named field.
    // Returns fake norms if norms aren't available
    uint8_t* norms(const TCHAR* field);

    // Reads the Norms for field from disk
    // (output goes into the caller-supplied bytes buffer).
    void norms(const TCHAR* field, uint8_t* bytes);

    // Returns a filename built by concatenating segment with ext and x
    // (x defaults to -1).
    QString SegmentName(const QString& ext, const int32_t x = -1);
    // Creates a filename in buffer by concatenating segment with ext and x
    void SegmentName(QString& buffer, int32_t bufferLen, const QString& ext,
        const int32_t x = -1);

    /**
    * @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption)
    */
    void getFieldNames(FieldOption fldOption, CL_NS(util)::StringArrayWithDeletor& retarray);

    static bool usesCompoundFile(SegmentInfo* si);

    /** Return a term frequency vector for the specified document and field. The
    *  vector returned contains term numbers and frequencies for all terms in
    *  the specified field of this document, if the field had storeTermVector
    *  flag set.  If the flag was not set, the method returns null.
    * @throws IOException
    */
    TermFreqVector* getTermFreqVector(int32_t docNumber, const TCHAR* field = NULL);

    /** Return an array of term frequency vectors for the specified document.
    *  The array contains a vector for each vectorized field in the document.
    *  Each vector contains term numbers and frequencies for all terms
    *  in a given vectorized field.
    *  If no such fields existed, the method returns null.
    * @throws IOException
    */
    bool getTermFreqVectors(int32_t docNumber, Array<TermFreqVector*>& result);

private:
    //Open all norms files for all fields
    void openNorms(CL_NS(store)::Directory* cfsDir);
    //Closes all norms files
    void closeNorms();

    // a bitVector that manages which documents have been deleted
    CL_NS(util)::BitSet* deletedDocs;
    // an IndexInput to the frequency file
    CL_NS(store)::IndexInput* freqStream;
    // For reading the fieldInfos file
    FieldInfos* fieldInfos;
    // For reading the Term Dictionary .tis file
    TermInfosReader* tis;
    // an IndexInput to the prox file
    CL_NS(store)::IndexInput* proxStream;

    static bool hasSeparateNorms(SegmentInfo* si);
    // NOTE(review): presumably allocates the `size`-byte buffer used as
    // fake norms -- confirm contents and ownership in the implementation.
    static uint8_t* createFakeNorms(int32_t size);

    // Allow various classes to access the internals of this class. This
    // keeps access limited to the classes of this package.
    friend class IndexReader;
    friend class IndexWriter;
    friend class SegmentTermDocs;
    friend class SegmentTermPositions;
    friend class MultiReader;
};

CL_NS_END

#endif