summaryrefslogtreecommitdiffstats
path: root/3rdparty/clucene/src/CLucene/document/Field.h
diff options
context:
space:
mode:
Diffstat (limited to '3rdparty/clucene/src/CLucene/document/Field.h')
-rw-r--r--3rdparty/clucene/src/CLucene/document/Field.h261
1 files changed, 261 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/document/Field.h b/3rdparty/clucene/src/CLucene/document/Field.h
new file mode 100644
index 000000000..771a1382b
--- /dev/null
+++ b/3rdparty/clucene/src/CLucene/document/Field.h
@@ -0,0 +1,261 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_document_Field_
+#define _lucene_document_Field_
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "CLucene/util/Reader.h"
+#include "CLucene/util/streambase.h"
+
+CL_NS_DEF(document)
+/**
+A field is a section of a Document. Each field has two parts, a name and a
+value. Values may be free text, provided as a String or as a Reader, or they
+may be atomic keywords, which are not further processed. Such keywords may
+be used to represent dates, urls, etc. Fields are optionally stored in the
+index, so that they may be returned with hits on the document.
+
+PORTING: CLucene doesn't directly support compressed fields. However, it is easy
+to reproduce this functionality by using the GZip streams in the contrib package.
+Also note that binary fields are not read immediately in CLucene; a substream
+is pointed directly to the field's data, in effect creating a lazy load ability.
+This means that large fields are best saved in binary format (even if they are
+text), so that they can be loaded lazily.
+*/
+class Field :LUCENE_BASE{
+private:
+ const TCHAR* _name;
+ TCHAR* _stringValue;
+ CL_NS(util)::Reader* _readerValue;
+ jstreams::StreamBase<char>* _streamValue;
+
+ int config; // presumably a bitmask of Store | Index | TermVector flags -- TODO confirm in Field.cpp
+ qreal boost;
+ bool omitNorms;
+public:
+ enum Store{
+ /** Store the original field value in the index. This is useful for short texts
+ * like a document's title which should be displayed with the results. The
+ * value is stored in its original form, i.e. no analyzer is used before it is
+ * stored.
+ */
+ STORE_YES=1,
+ /** Do not store the field value in the index. */
+ STORE_NO=2,
+
+ /** Store the original field value in the index in a compressed form. This is
+ * useful for long documents and for binary valued fields.
+ * NOTE: CLucene does not directly support compressed fields; to store a
+ * compressed field the caller has to compress the data itself.
+ * //TODO: need better documentation on how to add a compressed field
+ * //because actually we still need to write a GZipOutputStream...
+ */
+ STORE_COMPRESS=4
+ };
+
+ enum Index{
+ /** Do not index the field value. This field can thus not be searched,
+ * but one can still access its contents provided it is
+ * {@link Field::Store stored}. */
+ INDEX_NO=16,
+ /** Index the field's value so it can be searched. An Analyzer will be used
+ * to tokenize and possibly further normalize the text before its
+ * terms will be stored in the index. This is useful for common text.
+ */
+ INDEX_TOKENIZED=32,
+ /** Index the field's value without using an Analyzer, so it can be searched.
+ * As no analyzer is used the value will be stored as a single term. This is
+ * useful for unique Ids like product numbers.
+ */
+ INDEX_UNTOKENIZED=64,
+ /** Index the field's value without an Analyzer, and disable
+ * the storing of norms. No norms means that index-time boosting
+ * and field length normalization will be disabled. The benefit is
+ * less memory usage as norms take up one byte per indexed field
+ * for every document in the index.
+ */
+ INDEX_NONORMS=128
+ };
+
+ enum TermVector{
+ /** Do not store term vectors. */
+ TERMVECTOR_NO=256,
+ /** Store the term vectors of each document. A term vector is a list
+ * of the document's terms and their number of occurrences in that document. */
+ TERMVECTOR_YES=512,
+ /**
+ * Store the term vector + token position information
+ *
+ * @see TERMVECTOR_YES
+ */
+ TERMVECTOR_WITH_POSITIONS=1024,
+ /**
+ * Store the term vector + token offset information
+ *
+ * @see TERMVECTOR_YES
+ */
+ TERMVECTOR_WITH_OFFSETS=2048
+ };
+
+ _CL_DEPRECATED( another overload ) Field(const TCHAR* name, const TCHAR* value, bool store, bool index, bool token, const bool storeTermVector=false);
+ _CL_DEPRECATED( another overload ) Field(const TCHAR* name, CL_NS(util)::Reader* reader, bool store, bool index, bool token, const bool storeTermVector=false);
+
+ Field(const TCHAR* name, const TCHAR* value, int configs);
+ Field(const TCHAR* name, CL_NS(util)::Reader* reader, int configs);
+ Field(const TCHAR* name, jstreams::StreamBase<char>* stream, int configs);
+ ~Field();
+
+ /** Constructs a String-valued Field that is not tokenized, but is indexed
+ * and stored. Useful for non-text fields, e.g. date or url.
+ * @deprecated Use new Field(name,value,Field::STORE_YES | Field::INDEX_UNTOKENIZED)
+ */
+ _CL_DEPRECATED( new Field(*) ) static Field* Keyword(const TCHAR* name, const TCHAR* value);
+
+ /** Constructs a String-valued Field that is not tokenized nor indexed,
+ * but is stored in the index, for return with hits.
+ * @deprecated Use new Field(name,value,Field::STORE_YES | Field::INDEX_NO)
+ */
+ _CL_DEPRECATED( new Field(*) ) static Field* UnIndexed(const TCHAR* name, const TCHAR* value);
+
+ /** Constructs a String-valued Field that is tokenized and indexed,
+ * and is stored in the index, for return with hits. Useful for short text
+ * fields, like "title" or "subject".
+ * @deprecated Use new Field(name,value,Field::STORE_YES | Field::INDEX_TOKENIZED)
+ */
+ _CL_DEPRECATED( new Field(*) ) static Field* Text(const TCHAR* name, const TCHAR* value, const bool storeTermVector=false);
+
+ /** Constructs a String-valued Field that is tokenized and indexed,
+ * but that is not stored in the index.
+ * @deprecated Use new Field(name,value,Field::STORE_NO | Field::INDEX_TOKENIZED)
+ */
+ _CL_DEPRECATED( new Field(*) ) static Field* UnStored(const TCHAR* name, const TCHAR* value, const bool storeTermVector=false);
+
+ /** Constructs a Reader-valued Field that is tokenized and indexed, but is
+ * *not* stored in the index verbatim. Useful for longer text fields, like
+ * "body".
+ * @deprecated Use new Field(name,value, Field::INDEX_TOKENIZED)
+ */
+ _CL_DEPRECATED( new Field(*) ) static Field* Text(const TCHAR* name, CL_NS(util)::Reader* value, const bool storeTermVector=false);
+
+ /** The name of the field (e.g., "date", "subject", "title", "body", etc.)
+ * as an interned string. */
+ const TCHAR* name(); ///<returns reference
+
+ /** The value of the field as a String, or null. If null, the Reader value
+ * or binary value is used. Exactly one of stringValue(), readerValue() and
+ * streamValue() must be set. */
+ TCHAR* stringValue(); ///<returns reference
+
+ /** The value of the field as a reader, or null. If null, the String value
+ * or stream value is used. Exactly one of stringValue(), readerValue() and
+ * streamValue() must be set. */
+ CL_NS(util)::Reader* readerValue();
+
+ /** The value of the field as a binary stream, or null. If null, the String
+ * value or Reader value is used. Exactly one of stringValue(), readerValue()
+ * and streamValue() must be set. */
+ jstreams::StreamBase<char>* streamValue();
+
+ // True iff the value of the field is to be stored in the index for return
+ // with search hits. It is an error for this to be true if a field is
+ // Reader-valued.
+ bool isStored();
+
+ // True iff the value of the field is to be indexed, so that it may be
+ // searched on.
+ bool isIndexed();
+
+ // True iff the value of the field should be tokenized as text prior to
+ // indexing. Un-tokenized fields are indexed as a single word and may not be
+ // Reader-valued.
+ bool isTokenized();
+
+ /** True if the value of the field is stored and compressed within the index.
+ * NOTE: CLucene does not actually support compressed fields. Instead, a reader
+ * will be returned with a pointer to a SubIndexInputStream. A GZipInputStream
+ * and a UTF8 reader must be used to actually read the content. This flag
+ * will only be set if the index was created by another lucene implementation.
+ */
+ bool isCompressed();
+
+ //Set configs by combining flags with bitwise OR. This resets all the settings
+ //For example, to use term vectors with positions and offsets do:
+ //object->setConfig(TERMVECTOR_WITH_POSITIONS | TERMVECTOR_WITH_OFFSETS);
+ void setConfig(int termVector);
+
+ /** True iff the term or terms used to index this field are stored as a term
+ * vector, available from {@link IndexReader#getTermFreqVector(int32_t,TCHAR*)}.
+ * These methods do not provide access to the original content of the field,
+ * only to terms used to index it. If the original content must be
+ * preserved, use the <code>stored</code> attribute instead.
+ *
+ * @see IndexReader#getTermFreqVector(int32_t, String)
+ */
+ bool isTermVectorStored();
+
+ /**
+ * True iff terms are stored as term vector together with their offsets
+ * (start and end position in source text).
+ */
+ bool isStoreOffsetWithTermVector();
+
+ /**
+ * True iff terms are stored as term vector together with their token positions.
+ */
+ bool isStorePositionWithTermVector();
+
+ /** Returns the boost factor for hits for this field.
+ *
+ * <p>The default value is 1.0.
+ *
+ * <p>Note: this value is not stored directly with the document in the index.
+ * Documents returned from {@link IndexReader#document(int)} and
+ * {@link Hits#doc(int)} may thus not have the same value present as when
+ * this field was indexed.
+ *
+ * @see #setBoost(float)
+ */
+ qreal getBoost();
+
+ /** Sets the boost factor hits on this field. This value will be
+ * multiplied into the score of all hits on this field of this document.
+ *
+ * <p>The boost is multiplied by {@link Document#getBoost()} of the document
+ * containing this field. If a document has multiple fields with the same
+ * name, all such values are multiplied together. This product is then
+ * multiplied by the value {@link Similarity#lengthNorm(String,int)}, and
+ * rounded by {@link Similarity#encodeNorm(float)} before it is stored in the
+ * index. One should attempt to ensure that this product does not overflow
+ * the range of that encoding.
+ *
+ * @see Document#setBoost(float)
+ * @see Similarity#lengthNorm(String, int)
+ * @see Similarity#encodeNorm(float)
+ */
+ void setBoost(qreal value);
+
+ /** True iff the value of the field is stored as binary */
+ bool isBinary();
+
+ /** True if norms are omitted for this indexed field */
+ bool getOmitNorms();
+
+ /** Expert:
+ *
+ * If set, omit normalization factors associated with this indexed field.
+ * This effectively disables indexing boosts and length normalization for this field.
+ */
+ void setOmitNorms(bool omitNorms);
+
+ // Prints a Field for human consumption.
+ TCHAR* toString();
+};
+CL_NS_END
+#endif