Diffstat (limited to '3rdparty/clucene/src/CLucene/analysis/AnalysisHeader.h')
-rw-r--r--  3rdparty/clucene/src/CLucene/analysis/AnalysisHeader.h  234
1 file changed, 234 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/analysis/AnalysisHeader.h b/3rdparty/clucene/src/CLucene/analysis/AnalysisHeader.h
new file mode 100644
index 000000000..0cfd9c684
--- /dev/null
+++ b/3rdparty/clucene/src/CLucene/analysis/AnalysisHeader.h
@@ -0,0 +1,234 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_AnalysisHeader_
+#define _lucene_analysis_AnalysisHeader_
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "CLucene/util/Reader.h"
+
+CL_NS_DEF(analysis)
+
+
+/** A Token is an occurrence of a term from the text of a field. It consists of
+* a term's text, the start and end offset of the term in the text of the field,
+* and a type string.
+*
+* The start and end offsets permit applications to re-associate a token with
+* its source text, e.g., to display highlighted query terms in a document
+* browser, or to show matching text fragments in a KWIC (KeyWord In Context)
+* display, etc.
+*
+* The type is an interned string, assigned by a lexical analyzer
+* (a.k.a. tokenizer), naming the lexical or syntactic class that the token
+* belongs to. For example, an end-of-sentence marker token might be implemented
+* with type "eos". The default token type is "word".
+*/
+class Token:LUCENE_BASE{
+private:
+ int32_t _startOffset; // start in source text
+ int32_t _endOffset; // end in source text
+ const TCHAR* _type; // lexical type
+ int32_t positionIncrement;
+ size_t bufferTextLen;
+
+public:
+ #ifndef LUCENE_TOKEN_WORD_LENGTH
+ TCHAR* _termText; // the text of the term
+ #else
+ TCHAR _termText[LUCENE_TOKEN_WORD_LENGTH+1]; // the text of the term
+ #endif
+ int32_t _termTextLen;
+ static const TCHAR* defaultType;
+
+ Token();
+ ~Token();
+ // Constructs a Token with the given text, start and end offsets, & type.
+ Token(const TCHAR* text, const int32_t start, const int32_t end, const TCHAR* typ=defaultType);
+ void set(const TCHAR* text, const int32_t start, const int32_t end, const TCHAR* typ=defaultType);
+
+ size_t bufferLength(){ return bufferTextLen; }
+ void growBuffer(size_t size);
+
+ /** Set the position increment. This determines the position of this
+ * token relative to the previous Token in a TokenStream, used in
+ * phrase searching.
+ *
+ * The default value is 1.
+ *
+ * Some common uses for this are:
+ *
+ * - Set it to zero to put multiple terms in the same position. This is
+ * useful if, e.g., a word has multiple stems. Searches for phrases
+ * including either stem will match. In this case, all but the first stem's
+ * increment should be set to zero: the increment of the first instance
+ * should be one. Repeating a token with an increment of zero can also be
+ * used to boost the scores of matches on that token.
+ *
+ * - Set it to values greater than one to inhibit exact phrase matches.
+ * If, for example, one does not want phrases to match across removed stop
+ * words, then one could build a stop word filter that removes stop words and
+ * also sets the increment to the number of stop words removed before each
+ * non-stop word. Then exact phrase queries will only match when the terms
+ * occur with no intervening stop words.
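+ *
+ * As an illustrative sketch (the synonym filter that would emit these tokens
+ * is hypothetical, not part of this header; _T() is CLucene's TCHAR literal
+ * macro), stacking "quick" on the same position as "fast" looks like:
+ *
+ *   Token fast( _T("fast"), 0, 4 );    // increment defaults to 1
+ *   Token quick( _T("quick"), 0, 4 );
+ *   quick.setPositionIncrement(0);     // occupies the same position as "fast"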
+ */
+ void setPositionIncrement(int32_t posIncr);
+ int32_t getPositionIncrement() const;
+ const TCHAR* termText() const;
+ size_t termTextLength();
+ void resetTermTextLen();
+ void setText(const TCHAR* txt);
+
+ /**
+ * Returns this Token's starting offset, the position of the first character
+ * corresponding to this token in the source text.
+ *
+ * Note that the difference between endOffset() and startOffset() may not be
+ * equal to termTextLength(), as the term text may have been altered by a
+ * stemmer or some other filter.
+ */
+ int32_t startOffset() const { return _startOffset; }
+ void setStartOffset(int32_t val){ _startOffset = val; }
+
+ /**
+ * Returns this Token's ending offset, one greater than the position of the
+ * last character corresponding to this token in the source text.
+ */
+ int32_t endOffset() const { return _endOffset; }
+ void setEndOffset(int32_t val){ _endOffset = val; }
+
+ // Returns this Token's lexical type. Defaults to "word".
+ const TCHAR* type() const { return _type; } ///<returns reference
+ void setType(const TCHAR* val) { _type = val; } ///<stores the given reference; the string is not copied
+
+ TCHAR* toString() const;
+
+ ///Compares two Tokens by their order
+ class OrderCompare:LUCENE_BASE, public CL_NS(util)::Compare::_base //<Token*>
+ {
+ public:
+ bool operator()( Token* t1, Token* t2 ) const;
+ };
+};
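+
+// Illustrative sketch (not part of the original header): a Token over the
+// source text "The quick brown fox", where "quick" spans offsets 4..9
+// (the end offset is one past the last character):
+//
+//   Token t( _T("quick"), 4, 9 );                      // type defaults to "word"
+//   int32_t width = t.endOffset() - t.startOffset();   // 5
+//   // width may differ from t.termTextLength() once a stemming
+//   // filter has rewritten the term text.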
+
+/**
+* A TokenStream enumerates the sequence of tokens, either from
+* fields of a document or from query text.
+* <p>
+* This is an abstract class. Concrete subclasses are:
+* <ul>
+* <li>{@link Tokenizer}, a TokenStream
+* whose input is a Reader; and
+* <li>{@link TokenFilter}, a TokenStream
+* whose input is another TokenStream.
+* </ul>
+*/
+class TokenStream:LUCENE_BASE {
+public:
+ /** Sets token to the next token in the stream; returns false at end of stream. */
+ virtual bool next(Token* token) = 0;
+
+ /** Releases resources associated with this stream. */
+ virtual void close() = 0;
+
+ virtual ~TokenStream(){
+ }
+
+ /** This is for backwards compatibility only. You should pass the token you
+ * want filled to next(Token*); this saves a great deal of object construction
+ * and destruction.
+ * @deprecated Use next(Token*). Kept only to avoid breaking existing code.
+ */
+ _CL_DEPRECATED(next(Token)) Token* next();
+};
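+
+// Typical consumption loop (illustrative sketch; `stream` is assumed to be
+// any concrete TokenStream). A single Token is reused across calls, which is
+// exactly the saving the reusable next(Token*) API is meant to provide:
+//
+//   Token t;
+//   while ( stream->next(&t) ){
+//       // inspect t.termText(), t.startOffset(), t.endOffset(), t.type()
+//   }
+//   stream->close();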
+
+
+/** An Analyzer builds TokenStreams, which analyze text. It thus represents a
+ * policy for extracting index terms from text.
+ * <p>
+ * Typical implementations first build a Tokenizer, which breaks the stream of
+ * characters from the Reader into raw Tokens. One or more TokenFilters may
+ * then be applied to the output of the Tokenizer.
+ * <p>
+ * WARNING: You must override one of the methods defined by this class in your
+ * subclass or the Analyzer will enter an infinite loop.
+ */
+class Analyzer:LUCENE_BASE{
+public:
+ /** Creates a TokenStream which tokenizes all the text in the provided
+ Reader. The default implementation forwards to tokenStream(Reader) for
+ compatibility with older versions. Override to allow the Analyzer to choose
+ strategy based on document and/or field. Must be able to handle null
+ field name for backward compatibility. */
+ virtual TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader)=0;
+
+ virtual ~Analyzer(){
+ }
+
+ /**
+ * Invoked before indexing a Field instance if
+ * terms have already been added to that field. This allows custom
+ * analyzers to place an automatic position increment gap between
+ * Field instances using the same field name. The default value
+ * position increment gap is 0. With a 0 position increment gap and
+ * the typical default token position increment of 1, all terms in a field,
+ * including across Field instances, are in successive positions, allowing
+ * exact PhraseQuery matches, for instance, across Field instance boundaries.
+ *
+ * @param fieldName Field name being indexed.
+ * @return position increment gap, added to the next token emitted from {@link #tokenStream(TCHAR*, Reader*)}
+ */
+ virtual int32_t getPositionIncrementGap(const TCHAR* fieldName);
+};
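+
+// Minimal sketch of a concrete Analyzer (hypothetical; WhitespaceTokenizer
+// stands in for any Tokenizer subclass and is assumed rather than declared
+// here, and _CLNEW is CLucene's allocation macro):
+//
+//   class SimpleAnalyzer: public Analyzer {
+//   public:
+//       TokenStream* tokenStream(const TCHAR* /*fieldName*/,
+//                                CL_NS(util)::Reader* reader){
+//           return _CLNEW WhitespaceTokenizer(reader);
+//       }
+//   };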
+
+
+/** A Tokenizer is a TokenStream whose input is a Reader.
+<p>
+This is an abstract class.
+*/
+class Tokenizer:public TokenStream {
+protected:
+ /** The text source for this Tokenizer. */
+ CL_NS(util)::Reader* input;
+
+public:
+ /** Construct a tokenizer with null input. */
+ Tokenizer();
+ /** Construct a token stream processing the given input. */
+ Tokenizer(CL_NS(util)::Reader* _input);
+
+ /** By default, closes the input Reader. */
+ virtual void close();
+ virtual ~Tokenizer();
+};
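+
+// Sketch of the subclassing contract (MyTokenizer, buffer, start, end and
+// atEnd are placeholders, not CLucene API): a concrete Tokenizer implements
+// next(Token*), filling the caller's token from `input` and returning false
+// once the Reader is exhausted:
+//
+//   bool MyTokenizer::next(Token* token){
+//       // ... scan characters from `input` into buffer, tracking
+//       // start/end offsets in the source text ...
+//       if ( atEnd )
+//           return false;
+//       token->set( buffer, start, end );   // type defaults to "word"
+//       return true;
+//   }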
+
+/** A TokenFilter is a TokenStream whose input is another token stream.
+<p>
+This is an abstract class.
+*/
+class TokenFilter:public TokenStream {
+protected:
+ /** The source of tokens for this filter. */
+ TokenStream* input;
+ /** If true then input will be deleted in the destructor */
+ bool deleteTokenStream;
+
+ /** Construct a token stream filtering the given input.
+ *
+ * @param in The TokenStream to filter from
+ * @param deleteTS If true, input will be deleted in the destructor
+ */
+ TokenFilter(TokenStream* in, bool deleteTS=false);
+ virtual ~TokenFilter();
+public:
+ /** Close the input TokenStream. */
+ void close();
+};
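+
+// Sketch of a concrete filter (MinLengthFilter is hypothetical, not part of
+// CLucene): it pulls tokens from `input` and silently drops any term shorter
+// than a given length. Note it leaves position increments untouched; a
+// stricter implementation would accumulate the increments of dropped tokens.
+//
+//   class MinLengthFilter: public TokenFilter {
+//       size_t minLen;
+//   public:
+//       MinLengthFilter(TokenStream* in, size_t len, bool deleteTS=false):
+//           TokenFilter(in, deleteTS), minLen(len) {}
+//       bool next(Token* t){
+//           while ( input->next(t) ){
+//               if ( t->termTextLength() >= minLen )
+//                   return true;    // keep this token
+//           }                       // else skip it and pull the next one
+//           return false;           // input exhausted
+//       }
+//   };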
+
+CL_NS_END
+#endif