summaryrefslogtreecommitdiffstats
path: root/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h
diff options
context:
space:
mode:
Diffstat (limited to '3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h')
-rw-r--r--3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h88
1 files changed, 88 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h b/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h
new file mode 100644
index 000000000..d4195be81
--- /dev/null
+++ b/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.h
@@ -0,0 +1,88 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_standard_StandardTokenizer
+#define _lucene_analysis_standard_StandardTokenizer
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "../AnalysisHeader.h"
+#include "../Analyzers.h"
+#include "StandardTokenizerConstants.h"
+#include "CLucene/util/StringBuffer.h"
+#include "CLucene/util/FastCharStream.h"
+#include "CLucene/util/Reader.h"
+
+
+CL_NS_DEF2(analysis,standard)
+
+/** A grammar-based tokenizer constructed with JavaCC.
+ *
+ * <p> This should be a good tokenizer for most European-language documents:
+ *
+ * <ul>
+ * <li>Splits words at punctuation characters, removing punctuation. However, a
+ * dot that's not followed by whitespace is considered part of a token.
+ * <li>Splits words at hyphens, unless there's a number in the token, in which case
+ * the whole token is interpreted as a product number and is not split.
+ * <li>Recognizes email addresses and internet hostnames as one token.
+ * </ul>
+ *
+ * <p>Many applications have specific tokenizer needs. If this tokenizer does
+ * not suit your application, please consider copying this source code
+ * directory to your project and maintaining your own grammar-based tokenizer.
+ */
+ class StandardTokenizer: public Tokenizer {
+ private:
+ int32_t rdPos;
+ int32_t tokenStart;
+
+ // Advance by one character, incrementing rdPos and returning the character.
+ int readChar();
+ // Retreat by one character, decrementing rdPos.
+ void unReadChar();
+
+ // createToken centralizes token creation for auditing purposes.
+ //Token* createToken(CL_NS(util)::StringBuffer* sb, TokenTypes tokenCode);
+ inline bool setToken(Token* t, CL_NS(util)::StringBuffer* sb, TokenTypes tokenCode);
+
+ bool ReadDotted(CL_NS(util)::StringBuffer* str, TokenTypes forcedType,Token* t);
+
+ public:
+ CL_NS(util)::FastCharStream* rd;
+
+ // Constructs a tokenizer for this Reader.
+ StandardTokenizer(CL_NS(util)::Reader* reader);
+
+ ~StandardTokenizer();
+
+ /** Returns the next token in the stream, or false at end-of-stream.
+ * The returned token's type is set to an element of
+ * StandardTokenizerConstants::tokenImage. */
+ bool next(Token* token);
+
+ // Reads for number like "1"/"1234.567", or IP address like "192.168.1.2".
+ bool ReadNumber(const TCHAR* previousNumber, const TCHAR prev, Token* t);
+
+ bool ReadAlphaNum(const TCHAR prev, Token* t);
+
+ // Reads for apostrophe-containing word.
+ bool ReadApostrophe(CL_NS(util)::StringBuffer* str, Token* t);
+
+ // Reads for something@... it may be a COMPANY name or a EMAIL address
+ bool ReadAt(CL_NS(util)::StringBuffer* str, Token* t);
+
+ // Reads for COMPANY name like AT&T.
+ bool ReadCompany(CL_NS(util)::StringBuffer* str, Token* t);
+
+ // Reads CJK characters
+ bool ReadCJK(const TCHAR prev, Token* t);
+ };
+
+CL_NS_END2
+#endif