summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/clucene/src/CLucene/index/FieldsWriter.cpp
blob: ceb6735cbafad6a6a2ca03695397d80c94b917c6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
/*
 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
 *
 * Distributable under the terms of either the Apache License (Version 2.0) or 
 * the GNU Lesser General Public License, as specified in the COPYING file.
 *
 * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
*/
#include "CLucene/StdHeader.h"
#include "FieldsWriter.h"

#include "CLucene/util/VoidMap.h"
#include "CLucene/util/Reader.h"
#include "CLucene/util/Misc.h"
#include "CLucene/store/Directory.h"
#include "CLucene/store/IndexOutput.h"
#include "CLucene/document/Document.h"
#include "CLucene/document/Field.h"
#include "FieldInfos.h"

CL_NS_USE(store)
CL_NS_USE(util)
CL_NS_USE(document)
CL_NS_DEF(index)
	
FieldsWriter::FieldsWriter(Directory* d, const QString& segment, FieldInfos* fn)
    : fieldInfos(fn)
{
    //Func - Constructor. Creates the two outputs used for the stored fields
    //       of a segment: "<segment>.fdt", which receives the field data, and
    //       "<segment>.fdx", which receives one offset into the data output
    //       per document (see addDocument)
    //Pre  - d contains a valid reference to a directory
    //       segment is not empty and contains the name of the segment
    //       fn contains a valid reference to a FieldInfos
    //Post - fieldsStream and indexStream have been created in d

    CND_PRECONDITION(d != NULL, "d is NULL");
    CND_PRECONDITION(!segment.isEmpty(), "segment is NULL");

    // Output for the field data
    fieldsStream = d->createOutput(
        Misc::segmentname(segment, QLatin1String(".fdt")));

    // Output for the per-document index into the field data
    indexStream = d->createOutput(
        Misc::segmentname(segment, QLatin1String(".fdx")));

    // addDocument requires both streams, so verify both of them here
    CND_CONDITION(fieldsStream != NULL, "fieldsStream is NULL");
    CND_CONDITION(indexStream != NULL, "indexStream is NULL");
}

FieldsWriter::~FieldsWriter()
{
    //Func - Destructor
    //Pre  - true
    //Post - The instance has been destroyed; the streams it still held were
    //       handed to close() for shutdown

    this->close();
}

void FieldsWriter::close()
{
    //Func - Closes all streams and frees all resources
    //Pre  - true
    //Post - All streams have been closed all resources have been freed

    //Check if fieldsStream is valid
    if (fieldsStream) {
        //Close fieldsStream
        fieldsStream->close();
        _CLDELETE(fieldsStream);
    }

    //Check if indexStream is valid
    if (indexStream) {
        //Close indexStream
        indexStream->close();
        _CLDELETE(indexStream);
    }
}

void FieldsWriter::addDocument(Document* doc)
{
    //Func - Adds a document: writes the current position of the field data
    //       stream to the index stream, then writes the number of stored
    //       fields followed by, for each stored field, its field number, a
    //       flag byte and its value to the field data stream
    //Pre  - doc contains a valid reference to a Document
    //       indexStream != NULL
    //       fieldsStream != NULL
    //Post - The document doc has been added
    //Throws - CL_ERR_Runtime if a stored field is compressed, has no value
    //         set, or has a reader value that is too long. Compressed and
    //         value-less fields are detected before anything is written, so
    //         in those two cases neither stream has been modified.

    CND_PRECONDITION(indexStream != NULL, "indexStream is NULL");
    CND_PRECONDITION(fieldsStream != NULL, "fieldsStream is NULL");

    // First pass: count the stored fields and reject the ones that cannot be
    // written. Doing this before the first write keeps a rejected document
    // from leaving a partial record behind. The enumeration is released
    // explicitly before each throw so it does not leak.
    int32_t storedCount = 0;
    DocumentFieldEnumeration* fields = doc->fields();
    while (fields->hasMoreElements()) {
        Field* field = fields->nextElement();
        if (!field->isStored())
            continue;

        if (field->isCompressed()) {
            _CLDELETE(fields);
            _CLTHROWA(CL_ERR_Runtime,
                "CLucene does not directly support compressed fields. "
                "Write a compressed byte array instead");
        }

        // A binary field takes its value from streamValue(); any other field
        // from stringValue() or, failing that, from readerValue().
        bool hasValue;
        if (field->isBinary())
            hasValue = field->streamValue() != NULL;
        else
            hasValue = field->stringValue() != NULL
                || field->readerValue() != NULL;

        if (!hasValue) {
            _CLDELETE(fields);
            _CLTHROWA(CL_ERR_Runtime, "No values are set for the field");
        }

        storedCount++;
    }
    _CLDELETE(fields);

    // Nothing has been written to fieldsStream yet, so its file pointer is
    // still the start of this document's record.
    indexStream->writeLong(fieldsStream->getFilePointer());
    fieldsStream->writeVInt(storedCount);

    // Second pass: write the stored fields. Any exception raised while
    // writing is rethrown after the enumeration has been released.
    fields = doc->fields();
    try {
        while (fields->hasMoreElements()) {
            Field* field = fields->nextElement();
            if (!field->isStored())
                continue;

            fieldsStream->writeVInt(fieldInfos->fieldNumber(field->name()));

            // FIELD_IS_COMPRESSED is never set here: compressed fields were
            // rejected in the first pass.
            uint8_t bits = 0;
            if (field->isTokenized())
                bits |= FieldsWriter::FIELD_IS_TOKENIZED;
            if (field->isBinary())
                bits |= FieldsWriter::FIELD_IS_BINARY;
            fieldsStream->writeByte(bits);

            // FEATURE: this problem in Java Lucene too, if using Reader,
            // data is not stored.
            //
            // TODO: this is a logic bug...
            // if the field is stored, and indexed, and is using a reader
            // the field wont get indexed
            //
            // If zero prefixed (static length) vints could be written, a
            // reader could be streamed directly to the output and the data
            // length patched in afterwards, which would remove the buffering
            // below. This is not supported in lucene yet.
            if (field->isBinary()) {
                // TODO: since static length vints are not supported, the
                // entire stream has to be read into memory first.
                jstreams::StreamBase<char>* stream = field->streamValue();
                const char* sd;
                // TODO: we need to have a max amount, and guarantee its all
                // in or throw an error...
                int32_t rl = stream->read(sd,10000000,0);

                if ( rl < 0 ) {
                    // TODO: could we detect this earlier and not actually
                    // write the field??
                    fieldsStream->writeVInt(0);
                } else {
                    fieldsStream->writeVInt(rl);
                    fieldsStream->writeBytes((uint8_t*)sd, rl);
                }
            } else if (field->stringValue() != NULL) {
                fieldsStream->writeString(field->stringValue(),
                    _tcslen(field->stringValue()));
            } else {
                // No string value, so the value comes from the reader; the
                // first pass verified that readerValue() is not NULL.
                CND_PRECONDITION(!field->isIndexed(),
                    "Cannot store reader if it is indexed too");
                Reader* r = field->readerValue();

                //read the entire string
                const TCHAR* rv;
                int64_t rl = r->read(rv, LUCENE_INT32_MAX_SHOULDBE);
                if ( rl > LUCENE_INT32_MAX_SHOULDBE )
                    _CLTHROWA(CL_ERR_Runtime, "Field length too long");
                else if ( rl < 0 )
                    rl = 0;

                fieldsStream->writeString( rv, (int32_t)rl);
            }
        }
    } catch (...) {
        _CLDELETE(fields);
        throw;
    }
    _CLDELETE(fields);
}

CL_NS_END