diff options
Diffstat (limited to '3rdparty/clucene/src/CLucene/document/Field.h')
-rw-r--r-- | 3rdparty/clucene/src/CLucene/document/Field.h | 261 |
1 files changed, 261 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/document/Field.h b/3rdparty/clucene/src/CLucene/document/Field.h new file mode 100644 index 000000000..771a1382b --- /dev/null +++ b/3rdparty/clucene/src/CLucene/document/Field.h @@ -0,0 +1,261 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_document_Field_ +#define _lucene_document_Field_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/util/Reader.h" +#include "CLucene/util/streambase.h" + +CL_NS_DEF(document) +/** +A field is a section of a Document. Each field has two parts, a name and a +value. Values may be free text, provided as a String or as a Reader, or they +may be atomic keywords, which are not further processed. Such keywords may +be used to represent dates, urls, etc. Fields are optionally stored in the +index, so that they may be returned with hits on the document. + +PORTING: CLucene doesn't directly support compressed fields. However, it is easy +to reproduce this functionality by using the GZip streams in the contrib package. +Also note that binary fields are not read immediately in CLucene; a substream +is pointed directly to the field's data, in effect creating a lazy-load ability. +This means that large fields are best saved in binary format (even if they are +text), so that they can be loaded lazily. +*/ +class Field :LUCENE_BASE{ +private: + const TCHAR* _name; + TCHAR* _stringValue; + CL_NS(util)::Reader* _readerValue; + jstreams::StreamBase<char>* _streamValue; + + /** NOTE(review): presumably holds the OR-ed Store, Index and TermVector flags given to the constructors or setConfig() -- TODO confirm against Field.cpp. */ int config; + qreal boost; + bool omitNorms; +public: + enum Store{ + /** Store the original field value in the index. 
This is useful for short texts + * like a document's title which should be displayed with the results. The + * value is stored in its original form, i.e. no analyzer is used before it is + * stored. + */ + STORE_YES=1, + /** Do not store the field value in the index. */ + STORE_NO=2, + + /** Store the original field value in the index in a compressed form. This is + * useful for long documents and for binary valued fields. + * NOTE: CLucene does not directly support compressed fields; the class-level + * PORTING note suggests using the GZip streams in the contrib package instead. + * //TODO: need better documentation on how to add a compressed field + * //because actually we still need to write a GZipOutputStream... + */ + STORE_COMPRESS=4 + }; + + enum Index{ + /** Do not index the field value. This field can thus not be searched, + * but one can still access its contents provided it is + * {@link Field::Store stored}. */ + INDEX_NO=16, + /** Index the field's value so it can be searched. An Analyzer will be used + * to tokenize and possibly further normalize the text before its + * terms will be stored in the index. This is useful for common text. + */ + INDEX_TOKENIZED=32, + /** Index the field's value without using an Analyzer, so it can be searched. + * As no analyzer is used the value will be stored as a single term. This is + * useful for unique Ids like product numbers. + */ + INDEX_UNTOKENIZED=64, + /** Index the field's value without an Analyzer, and disable + * the storing of norms. No norms means that index-time boosting + * and field length normalization will be disabled. The benefit is + * less memory usage as norms take up one byte per indexed field + * for every document in the index. + */ + INDEX_NONORMS=128 + }; + + enum TermVector{ + /** Do not store term vectors. */ + TERMVECTOR_NO=256, + /** Store the term vectors of each document. A term vector is a list + * of the document's terms and their number of occurrences in that document. 
 */ + TERMVECTOR_YES=512, + /** + * Store the term vector + token position information + * + * @see #TERMVECTOR_YES + */ + TERMVECTOR_WITH_POSITIONS=1024, + /** + * Store the term vector + Token offset information + * + * @see #TERMVECTOR_YES + */ + TERMVECTOR_WITH_OFFSETS=2048 + }; + + _CL_DEPRECATED( another overload ) Field(const TCHAR* name, const TCHAR* value, bool store, bool index, bool token, const bool storeTermVector=false); + _CL_DEPRECATED( another overload ) Field(const TCHAR* name, CL_NS(util)::Reader* reader, bool store, bool index, bool token, const bool storeTermVector=false); + + /** Constructors taking a flag set: configs is a combination of Store, Index and TermVector values, e.g. Field::STORE_YES | Field::INDEX_TOKENIZED. */ Field(const TCHAR* name, const TCHAR* value, int configs); + Field(const TCHAR* name, CL_NS(util)::Reader* reader, int configs); + Field(const TCHAR* name, jstreams::StreamBase<char>* stream, int configs); + ~Field(); + + /** Constructs a String-valued Field that is not tokenized, but is indexed + * and stored. Useful for non-text fields, e.g. date or url. + * @deprecated Use new Field(name,value,Field::STORE_YES | Field::INDEX_UNTOKENIZED) + */ + _CL_DEPRECATED( new Field(*) ) static Field* Keyword(const TCHAR* name, const TCHAR* value); + + /** Constructs a String-valued Field that is not tokenized nor indexed, + * but is stored in the index, for return with hits. + * @deprecated Use new Field(name,value,Field::STORE_YES | Field::INDEX_NO) + */ + _CL_DEPRECATED( new Field(*) ) static Field* UnIndexed(const TCHAR* name, const TCHAR* value); + + /** Constructs a String-valued Field that is tokenized and indexed, + * and is stored in the index, for return with hits. Useful for short text + * fields, like "title" or "subject". + * @deprecated Use new Field(name,value,Field::STORE_YES | Field::INDEX_TOKENIZED) + */ + _CL_DEPRECATED( new Field(*) ) static Field* Text(const TCHAR* name, const TCHAR* value, const bool storeTermVector=false); + + /** Constructs a String-valued Field that is tokenized and indexed, + * but that is not stored in the index. 
+ * @deprecated Use new Field(name,value,Field::STORE_NO | Field::INDEX_TOKENIZED) + */ + _CL_DEPRECATED( new Field(*) ) static Field* UnStored(const TCHAR* name, const TCHAR* value, const bool storeTermVector=false); + + /** Constructs a Reader-valued Field that is tokenized and indexed, but is + * *not* stored in the index verbatim. Useful for longer text fields, like + * "body". + * @deprecated Use new Field(name,value, Field::INDEX_TOKENIZED) + */ + _CL_DEPRECATED( new Field(*) ) static Field* Text(const TCHAR* name, CL_NS(util)::Reader* value, const bool storeTermVector=false); + + /** The name of the field (e.g., "date", "subject", "title", "body", etc.) + * as an interned string. */ + const TCHAR* name(); ///<returns reference + + /** The value of the field as a String, or null. If null, the Reader value + * or stream value is used. Exactly one of stringValue(), readerValue() and + * streamValue() must be set. */ + TCHAR* stringValue(); ///<returns reference + + /** The value of the field as a reader, or null. If null, the String value + * or stream value is used. Exactly one of stringValue(), readerValue() and + * streamValue() must be set. */ + CL_NS(util)::Reader* readerValue(); + + /** The value of the field as a stream, or null. If null, the String value + * or Reader value is used. Exactly one of stringValue(), readerValue() and + * streamValue() must be set. */ + jstreams::StreamBase<char>* streamValue(); + + // True iff the value of the field is to be stored in the index for return + // with search hits. It is an error for this to be true if a field is + // Reader-valued. + bool isStored(); + + // True iff the value of the field is to be indexed, so that it may be + // searched on. + bool isIndexed(); + + // True iff the value of the field should be tokenized as text prior to + // indexing. Un-tokenized fields are indexed as a single word and may not be + // Reader-valued. 
+ bool isTokenized(); + + /** True if the value of the field is stored and compressed within the index. + * NOTE: CLucene does not actually support compressed fields. Instead, a reader + * will be returned with a pointer to a SubIndexInputStream. A GZipInputStream + * and a UTF8 reader must be used to actually read the content. This flag + * will only be set if the index was created by another lucene implementation. + */ + bool isCompressed(); + + //Set configs from flags combined with bitwise OR. This resets all the settings + //For example, to use term vectors with positions and offsets do: + //object->setConfig(TERMVECTOR_WITH_POSITIONS | TERMVECTOR_WITH_OFFSETS); + void setConfig(int termVector); + + /** True iff the term or terms used to index this field are stored as a term + * vector, available from {@link IndexReader#getTermFreqVector(int32_t,TCHAR*)}. + * These methods do not provide access to the original content of the field, + * only to terms used to index it. If the original content must be + * preserved, use the <code>stored</code> attribute instead. + * + * @see IndexReader#getTermFreqVector(int32_t, String) + */ + bool isTermVectorStored(); + + /** + * True iff terms are stored as term vector together with their offsets + * (start and end position in source text). + */ + bool isStoreOffsetWithTermVector(); + + /** + * True iff terms are stored as term vector together with their token positions. + */ + bool isStorePositionWithTermVector(); + + /** Returns the boost factor for hits for this field. + * + * <p>The default value is 1.0. + * + * <p>Note: this value is not stored directly with the document in the index. + * Documents returned from {@link IndexReader#document(int)} and + * {@link Hits#doc(int)} may thus not have the same value present as when + * this field was indexed. + * + * @see #setBoost(qreal) + */ + qreal getBoost(); + + /** Sets the boost factor for hits on this field. This value will be + * multiplied into the score of all hits on this field of this document. 
+ * + * <p>The boost is multiplied by {@link Document#getBoost()} of the document + * containing this field. If a document has multiple fields with the same + * name, all such values are multiplied together. This product is then + * multiplied by the value {@link Similarity#lengthNorm(String,int)}, and + * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the + * index. One should attempt to ensure that this product does not overflow + * the range of that encoding. + * + * @see Document#setBoost(float) + * @see Similarity#lengthNorm(String, int) + * @see Similarity#encodeNorm(float) + */ + void setBoost(qreal value); + + /** True iff the value of the field is stored as binary */ + bool isBinary(); + + /** True if norms are omitted for this indexed field */ + bool getOmitNorms(); + + /** Expert: + * + * If set, omit normalization factors associated with this indexed field. + * This effectively disables indexing boosts and length normalization for this field. + */ + void setOmitNorms(bool omitNorms); + + // Prints a Field for human consumption. + TCHAR* toString(); +}; +CL_NS_END +#endif |