summaryrefslogtreecommitdiffstats
path: root/3rdparty/clucene/src/CLucene/index/TermInfosReader.cpp
blob: 8f9e43dec8650f111f9abd7ba0eeeb9364d176c6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
/*
 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
 *
 * Distributable under the terms of either the Apache License (Version 2.0) or 
 * the GNU Lesser General Public License, as specified in the COPYING file.
 *
 * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
*/
#include "CLucene/StdHeader.h"
#include "TermInfosReader.h"

#include "CLucene/store/Directory.h"
#include "CLucene/util/Misc.h"
#include "FieldInfos.h"
#include "Term.h"
#include "Terms.h"
#include "TermInfo.h"
#include "TermInfosWriter.h"

CL_NS_USE(store)
CL_NS_USE(util)
CL_NS_DEF(index)

TermInfosReader::TermInfosReader(Directory* dir, const QString& seg,
    FieldInfos* fis)
    : directory(dir)
    , fieldInfos (fis)
{
    //Func - Constructor.
    //       Opens the Term Infos file (.tis) and the Term Info Index file
    //       (.tii) of a segment. The index itself is only loaded into memory
    //       later, by ensureIndexIsRead().
    //Pre  - dir points to a valid Directory
    //       fis points to a valid FieldInfos instance
    //       seg is not empty and holds the name of the segment
    //Post - An instance has been created with one enumerator per file and
    //       _size set to the size reported by the .tis enumerator.
    //       If opening either file throws, everything opened so far is
    //       released and the exception is propagated unchanged.

    CND_PRECONDITION(!seg.isEmpty(), "seg is NULL");

    //Remember the name of the segment
    segment = seg;

    //Nothing of the in-memory index exists yet
    indexTerms       = NULL;
    indexInfos       = NULL;
    indexPointers    = NULL;
    indexTermsLength = 0;

    //Start from a known state so that close() can be used for cleanup if
    //opening one of the files fails half way through
    origEnum  = NULL;
    indexEnum = NULL;

    //Build the file names of the Term Infos file and the Term Info Index file
    QString tisFile = Misc::segmentname(segment, QLatin1String(".tis"));
    QString tiiFile = Misc::segmentname(segment, QLatin1String(".tii"));

    try {
        //One enumerator over the terms themselves, one over the term index
        origEnum = _CLNEW SegmentTermEnum( directory->openInput( tisFile ), fieldInfos, false);
        indexEnum = _CLNEW SegmentTermEnum( directory->openInput( tiiFile ), fieldInfos, true);
    } catch (...) {
        //The destructor does not run for a partially constructed object, so
        //release whatever was opened before passing the error on
        close();
        throw;
    }

    //Check if the enumerators point to valid instances
    CND_CONDITION(origEnum != NULL, "No memory could be allocated for orig enumerator");
    CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator");

    //Cache the number of terms of the segment
    _size =  origEnum->size;
}

TermInfosReader::~TermInfosReader()
{
    //Func - Destructor
    //Pre  - true
    //Post - The instance has been destroyed

    //All cleanup is delegated to close(): it releases the enumerators and
    //the arrays indexTerms, indexInfos and indexPointers
    this->close();
}

void TermInfosReader::close()
{
    //Func - Releases everything owned by the reader: the in-memory term
    //       index and both enumerators together with their IndexInputs
    //Pre  - true
    //Post - indexTerms, indexInfos and indexPointers have been freed and
    //       origEnum and indexEnum have been closed and deleted. Members
    //       that were never allocated are skipped.

#ifdef _DEBUG
    //Debug-only sanity check: every index term is expected to be referenced
    //by this reader only when it is torn down
    if (indexTerms != NULL) {
        for (int32_t i = 0; i < indexTermsLength; ++i) {
            CND_PRECONDITION(indexTerms[i].__cl_refcount == 1,
                "TermInfosReader term was references more than internally");
        }
    }
#endif

    //Free each array on its own. The arrays are allocated one after another
    //in ensureIndexIsRead(), so after a failure there only some of them may
    //exist; freeing them independently avoids leaking the ones that do.
    _CLDELETE_ARRAY(indexTerms);
    _CLDELETE_ARRAY(indexInfos);
    _CLDELETE_ARRAY(indexPointers);

    if (origEnum != NULL) {
        origEnum->close();

        //The IndexInput was created in the constructor by
        //directory->openInput( tisFile ) and is deleted here separately from
        //the enumerator; fetch it before the enumerator goes away
        IndexInput *is = origEnum->input;

        //Delete the enumerator
        _CLDELETE(origEnum);

        //Delete the IndexInput
        _CLDELETE(is);
    }

    if (indexEnum != NULL){
        indexEnum->close();

        //Same as above for the input created by
        //directory->openInput( tiiFile )
        IndexInput *is = indexEnum->input;

        //Delete the enumerator
        _CLDELETE(indexEnum);

        //Delete the IndexInput
        _CLDELETE(is);
    }
}

int64_t TermInfosReader::size() const
{
    //Func - Reports how many terms the enumeration of TermInfos holds
    //Pre  - true
    //Post - The size cached by the constructor has been returned

    return this->_size;
}

Term* TermInfosReader::get(const int32_t position)
{
    //Func - Returns the nth term in the set
    //Pre  - position >= 0
    //Post - The term at the requested position has been returned, or NULL
    //       if the set is empty or position lies past the last term

    //Without terms there is nothing to return
    if (_size == 0)
        return NULL;

    SegmentTermEnum* enumerator = getEnum();

    //Sequential access: if the requested position lies within one index
    //interval ahead of the enumerator's current position, scanning forward
    //is enough and no seek is needed
    if (enumerator != NULL //an enumeration exists
        && enumerator->term(false) != NULL // term is at or past current
        && position >= enumerator->position
        && position < (enumerator->position + enumerator->indexInterval)) {
        return scanEnum(position);			  // can avoid seek
    }

    //Random access: seekEnum() reads indexPointers, indexTerms and
    //indexInfos, so the term index has to be loaded first. The call is a
    //no-op once the index is in memory.
    ensureIndexIsRead();

    //Jump to the index entry that precedes the requested position
    seekEnum(position / enumerator->indexInterval);

    //Scan forward to the Term at position
    return scanEnum(position);
}

// TODO: currently there is no way of cleaning up a thread, if the thread ends.
// we are stuck with the terminfosreader of that thread. Hopefully this won't
// be too big a problem... solutions anyone?
SegmentTermEnum* TermInfosReader::getEnum()
{
    //Func - Returns the enumerator cached in 'enumerators', creating and
    //       caching one through terms() on first use
    //Pre  - true
    //Post - The cached enumerator has been returned

    SegmentTermEnum* cached = enumerators.get();
    if (cached != NULL)
        return cached;

    //Nothing cached yet: create an enumerator and remember it
    SegmentTermEnum* fresh = terms();
    enumerators.set(fresh);
    return fresh;
}

TermInfo* TermInfosReader::get(const Term* term)
{
    //Func - Looks up the TermInfo that belongs to a term
    //Pre  - term points to a valid Term
    //Post - The TermInfo of term has been returned if term exists,
    //       otherwise NULL

    //An empty enumeration cannot contain the term
    if (_size == 0)
        return NULL;

    ensureIndexIsRead();

    SegmentTermEnum* cached = getEnum();

    //Sequential access optimisation: decide whether the cached enumerator
    //can simply be scanned forward instead of being repositioned
    bool scanWithoutSeek = false;

    //The enumerator must not be at its end
    if (cached->term(false) != NULL) {
        //term lies after the previously returned term ...
        const bool afterPrev =
            (cached->prev != NULL) && (term->compareTo(cached->prev) > 0);

        //... or at or after the enumerator's current term
        if (afterPrev || term->compareTo(cached->term(false)) >= 0) {
            //Index slot that follows the block the enumerator is in
            const int32_t nextSlot = (int32_t)
                (cached->position / cached->indexInterval) + 1;

            //Either the enumerator is already in the last block, or term
            //sorts before the first term of the next block
            if (indexTermsLength == nextSlot
                || term->compareTo(&indexTerms[nextSlot]) < 0) {
                scanWithoutSeek = true;
            }
        }
    }

    if (!scanWithoutSeek) {
        //Reposition the enumerator at the index entry preceding term
        seekEnum(getIndexOffset(term));
    }

    //Scan forward and return the TermInfo for term
    return scanEnum(term);
}

int64_t TermInfosReader::getPosition(const Term* term)
{
    //Func - Returns the position of a Term in the set
    //Pre  - term points to a valid Term
    //Post - If term was found its position has been returned, otherwise -1

    //An empty enumeration cannot contain the term
    if (_size == 0)
        return -1;

    ensureIndexIsRead();

    //Start at the greatest index entry that is not after term
    int32_t indexOffset = getIndexOffset(term);
    seekEnum(indexOffset);

    SegmentTermEnum* enumerator = getEnum();

    //Advance until the current term is no longer before term. The current
    //term may be NULL (get() and scanEnum() guard against that as well), so
    //it is checked before it is handed to compareTo().
    while (enumerator->term(false) != NULL
        && term->compareTo(enumerator->term(false)) > 0
        && enumerator->next()) {}

    //Only an existing current term can match; a NULL current term means
    //term was not found
    if (enumerator->term(false) != NULL
        && term->equals(enumerator->term(false)))
        return enumerator->position;

    return -1;
}

SegmentTermEnum* TermInfosReader::terms(const Term* term)
{
    //Func - Creates an enumeration of terms. With a term given, the
    //       enumeration is a clone of the cached enumerator after it has been
    //       positioned by looking term up; with NULL it is a clone of
    //       origEnum.
    //Pre  - term is NULL or points to a valid Term
    //Post - A newly cloned enumeration has been returned

    //Default: clone the enumerator over the .tis file as it is
    SegmentTermEnum* source = origEnum;

    if (term != NULL) {
        //get() positions the cached enumerator as a side effect; the
        //TermInfo it returns is not needed here and is discarded
        TermInfo* unused = get(term);
        _CLDELETE(unused);
        source = getEnum();
    }

    SegmentTermEnum* copy = source->clone();

    //Check if the clone points to a valid instance
    CND_CONDITION(copy != NULL, "cln is NULL");

    return copy;
}

void TermInfosReader::ensureIndexIsRead()
{
    //Func - Reads the term info index file (.tii file) into memory, once.
    //       This file contains every IndexInterval-th entry from the .tis file, 
    //       along with its location in the "tis" file. This is designed to be
    //       read entirely into memory and used to provide random access to the
    //       "tis" file.
    //Pre  - true; when indexTerms is already set the call returns immediately
    //Post - indexTerms, indexInfos and indexPointers hold the entries read
    //       from indexEnum, and indexEnum and its input have been deleted

    // Guard the load with the instance lock (macro defined elsewhere)
    SCOPED_LOCK_MUTEX(THIS_LOCK)

    // Already loaded: nothing to do
    if ( indexTerms != NULL )
        return;

    try {
        // Number of entries in the index file, as reported by the enumerator
        indexTermsLength = (size_t)indexEnum->size;

        // Instantiate an block of Term's,so that each one doesn't have to be new'd
        indexTerms    = _CL_NEWARRAY(Term,indexTermsLength);

        // Check if indexTerms is a valid array
        CND_CONDITION(indexTerms != NULL,
            "No memory could be allocated for indexTerms");

        // Instantiate an big block of TermInfo's, so that each one doesn't
        // have to be new'd
        indexInfos = _CL_NEWARRAY(TermInfo,indexTermsLength);

        // Check if indexInfos is a valid array
        CND_CONDITION(indexInfos != NULL,
            "No memory could be allocated for indexInfos");

        // Instantiate an array indexPointers that holds, per index entry,
        // the pointer value reported by indexEnum
        indexPointers = _CL_NEWARRAY(int64_t,indexTermsLength);

        // Check if indexPointers is a valid array
        CND_CONDITION(indexPointers != NULL,
            "No memory could be allocated for indexPointers");

        // Copy every entry of indexEnum into the three parallel arrays
        for (int32_t i = 0; indexEnum->next(); ++i) {
            indexTerms[i].set(indexEnum->term(false), indexEnum->term(false)->text());
            indexEnum->getTermInfo(&indexInfos[i]);
            indexPointers[i] = indexEnum->indexPointer;
        }
    } _CLFINALLY (
        // Cleanup passed to _CLFINALLY (macro defined elsewhere; presumably
        // run on both the normal and the exceptional path - TODO confirm):
        // the index enumerator is not needed once the index has been read
        indexEnum->close(); 
        // Delete the IndexInput first, then the enumerator that refers to it
        _CLDELETE( indexEnum->input );
        _CLDELETE( indexEnum ); 
    );
}

int32_t TermInfosReader::getIndexOffset(const Term* term)
{
    //Func - Binary search over indexTerms for the offset of the greatest
    //       index entry which is less than or equal to term.
    //Pre  - term points to a valid Term
    //       indexTerms != NULL
    //Post - The offset has been returned. If term compares less than every
    //       entry of indexTerms the result is -1, which seekEnum() does not
    //       accept as an offset.

    //Check if indexTerms is a valid array
    CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL");

    int32_t lo = 0;
    int32_t hi = indexTermsLength - 1;

    while (hi >= lo) {
        //Middle between lo and hi, computed from the distance between them
        //so that the sum of two large bounds cannot overflow
        const int32_t mid = lo + ((hi - lo) >> 1);

        CND_PRECONDITION(mid < indexTermsLength, "mid >= indexTermsLength");

        //Determine if term is before or after the entry at mid
        const int32_t delta = term->compareTo(&indexTerms[mid]);
        if (delta < 0) {
            //term sorts before mid: continue in the lower half
            hi = mid - 1;
        } else if (delta > 0) {
            //term sorts after mid: continue in the upper half
            lo = mid + 1;
        } else {
            //Exact match
            return mid;
        }
    }

    //No exact match: hi is now the last entry that sorts before term
    return hi;
}

void TermInfosReader::seekEnum(const int32_t indexOffset)
{
    //Func - Reposition the current Term and TermInfo to indexOffset
    //Pre  - indexOffset >= 0
    //       indexTerms    != NULL
    //       indexInfos    != NULL
    //       indexPointers != NULL
    //Post - The current Term and Terminfo have been repositioned to indexOffset

    CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number");
    CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL");
    CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL");
    CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL");

    SegmentTermEnum* enumerator =  getEnum();
    enumerator->seek(indexPointers[indexOffset],
        (indexOffset * enumerator->indexInterval) - 1,
        &indexTerms[indexOffset], &indexInfos[indexOffset]);
}

TermInfo* TermInfosReader::scanEnum(const Term* term)
{
    //Func - Scans the cached enumerator forward to term, starting at its
    //       current term, and returns the TermInfo when term is found
    //Pre  - term points to a valid Term
    //Post - The TermInfo of term has been returned if the enumerator ended
    //       up on a term equal to it, otherwise NULL

    SegmentTermEnum* cursor = getEnum();
    cursor->scanTo(term);

    //No current term: nothing to compare against
    if (cursor->term(false) == NULL)
        return NULL;

    //The scan stopped on a different term, so term does not exist
    if (!term->equals(cursor->term(false)))
        return NULL;

    //Found: hand out the TermInfo of the current term
    return cursor->getTermInfo();
}

Term* TermInfosReader::scanEnum(const int32_t position)
{
    //Func - Advances the cached enumerator until it reaches the requested
    //       position and returns the Term located there
    //Pre  - position >= 0
    //Post - The Term at the requested position has been returned, or NULL
    //       if the enumeration ended before the position was reached

    SegmentTermEnum* cursor = getEnum();

    //Step forward while the enumerator is still before the requested
    //position and has terms left
    bool hasMore = true;
    while (hasMore && cursor->position < position) {
        hasMore = cursor->next();
    }

    //Running out of terms means the requested position was too big
    if (!hasMore)
        return NULL;

    //Return the Term at the requested position
    return cursor->term();
}

CL_NS_END