// Extracted from a repository web view.
// path: root/src/3rdparty/clucene/src/CLucene/index/DocumentWriter.h
// blob: 916d2f62d6aed0280977a5ca3374fd0c11c66aa2
/*
 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
 *
 * Distributable under the terms of either the Apache License (Version 2.0) or 
 * the GNU Lesser General Public License, as specified in the COPYING file.
 *
 * Changes are Copyright (C) 2012 Digia Plc and/or its subsidiary(-ies), all rights reserved.
*/
#ifndef _lucene_index_DocumentWriter_
#define _lucene_index_DocumentWriter_

#if defined(_LUCENE_PRAGMA_ONCE)
#   pragma once
#endif

#include <QtCore/QString>

#include "CLucene/analysis/AnalysisHeader.h"
#include "CLucene/document/Document.h"
#include "CLucene/store/Directory.h"
#include "FieldInfos.h"
#include "IndexWriter.h"
#include "CLucene/util/VoidMap.h"
#include "CLucene/document/Field.h"
#include "TermInfo.h"
#include "CLucene/search/Similarity.h"
#include "TermInfosWriter.h"
#include "FieldsWriter.h"
#include "Term.h"

CL_NS_DEF(index)

/**
 * Buffers the postings of a single document and writes them out under a
 * named segment.
 *
 * Only declarations are visible in this header; the definitions live
 * elsewhere. Going by those declarations, a document handed to
 * addDocument() is tokenized into Posting entries (invertDocument), the
 * entries are collected in postingTable, and the postings and norms are
 * then written (writePostings / writeNorms).
 *
 * NOTE(review): the class holds several raw pointers and declares a
 * destructor; which of them it owns cannot be determined from this
 * header - confirm against the implementation before copying instances.
 */
class DocumentWriter : LUCENE_BASE
{
public:	
    // info about a Term in a doc
    // One Posting exists per Term key in the posting table (see
    // PostingTableType below).
    class Posting : LUCENE_BASE
    {
	public:
		Term* term;					  // the Term
		int32_t freq;					  // its frequency in doc
		Array<int32_t> positions;				  // positions it occurs at
		// Term-vector offset entries recorded for this term.
		// NOTE(review): presumably parallel to `positions`; confirm in
		// the implementation.
		Array<TermVectorOffsetInfo> offsets;
		
		/**
		 * @param t        the Term this posting describes
		 * @param position a position at which the term occurs
		 * @param offset   term-vector offset info for that occurrence
		 *                 (NOTE(review): presumably may be NULL when offsets
		 *                 are not stored - confirm)
		 */
		Posting(Term* t, const int32_t position, TermVectorOffsetInfo* offset);
		~Posting();
	};

private:
	// Analyzer and target directory supplied to the constructors.
	CL_NS(analysis)::Analyzer* analyzer;
	CL_NS(store)::Directory* directory;
	// NOTE(review): tagged "array" below, but declared as a pointer to a
	// single FieldInfos type - confirm which is meant.
	FieldInfos* fieldInfos; //array
	// The maximum number of tokens a field may have (see the constructor
	// documentation below).
	const int32_t maxFieldLength;
	CL_NS(search)::Similarity* similarity;
	// NOTE(review): not set through any parameter of the test-only
	// constructor; presumably taken from the IndexWriter in the other
	// constructor - confirm.
	int32_t termIndexInterval;

	// Keys are Terms, values are Postings.
	// Used to buffer a document before it is written to the index.
	typedef CL_NS(util)::CLHashtable<Term*, Posting*, Term::Compare,
        Term::Equals> PostingTableType;
    PostingTableType postingTable;
	// Per-field bookkeeping arrays.
	// NOTE(review): presumably indexed by field number and sized from
	// fieldInfos - confirm in the implementation.
	int32_t* fieldLengths; //array
	int32_t* fieldPositions; //array
	int32_t* fieldOffsets; //array
	qreal* fieldBoosts; //array

	// Scratch Term.
	// NOTE(review): presumably reused as a lookup key into postingTable
	// to avoid allocating a Term per token - confirm.
	Term* termBuffer;
public:
	/** This ctor used by test code only.
	*
	* @param d The directory to write the document information to
	* @param a The analyzer to use for the document
	* @param similarity The Similarity function
	* @param maxFieldLength The maximum number of tokens a field may have
	*/
	DocumentWriter(CL_NS(store)::Directory* d, CL_NS(analysis)::Analyzer* a,
        CL_NS(search)::Similarity* similarity, const int32_t maxFieldLength);
	
	/**
	* @param directory The directory to write the document information to
	* @param analyzer The analyzer to use for the document
	* @param writer The IndexWriter this document writer works for
	*        (NOTE(review): presumably the source of the similarity,
	*        maxFieldLength and termIndexInterval settings - confirm)
	*/
	DocumentWriter(CL_NS(store)::Directory* directory,
        CL_NS(analysis)::Analyzer* analyzer, IndexWriter* writer);
	~DocumentWriter();

	/**
	* Adds a document under the given segment name.
	*
	* @param segment Name of the segment the document is written to
	* @param doc The document to add
	*/
	void addDocument(const QString& segment, CL_NS(document)::Document* doc);


private:
	// Tokenizes the fields of a document into Postings.
	void invertDocument(const CL_NS(document)::Document* doc);

	// Records one occurrence of the term (field, text) at `position`,
	// together with its term-vector offset info.
	// NOTE(review): presumably creates a new Posting or extends an
	// existing one in postingTable - confirm.
	void addPosition(const TCHAR* field, const TCHAR* text,
        const int32_t position, TermVectorOffsetInfo* offset);

	// Both parameters are non-const references, i.e. output parameters.
	// NOTE(review): presumably `array` receives the table's postings in
	// sorted order and `arraySize` their count; who releases `array` is
	// not visible here - confirm.
	void sortPostingTable(Posting**& array, int32_t& arraySize);

	// Sort helper over `postings` for the index range given by lo/hi.
	// NOTE(review): presumably both bounds are inclusive - confirm.
	static void quickSort(Posting**& postings, const int32_t lo, const int32_t hi);

	// Writes `postingsLength` postings for the named segment.
	void writePostings(Posting** postings, const int32_t postingsLength,
        const QString& segment);

	// Writes the norms for the named segment.
	// NOTE(review): presumably derived from fieldBoosts/fieldLengths via
	// the Similarity - confirm.
	void writeNorms(const QString& segment);

	// Empties postingTable.
	// NOTE(review): presumably also releases the buffered Postings -
	// confirm.
	void clearPostingTable();
};

CL_NS_END

#endif