summaryrefslogtreecommitdiffstats
path: root/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to '3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp')
-rw-r--r--3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp446
1 files changed, 446 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp b/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp
new file mode 100644
index 000000000..60f9a449c
--- /dev/null
+++ b/3rdparty/clucene/src/CLucene/analysis/standard/StandardTokenizer.cpp
@@ -0,0 +1,446 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/StdHeader.h"
+#include "StandardTokenizer.h"
+
+CL_NS_USE(analysis)
+CL_NS_USE(util)
+CL_NS_DEF2(analysis,standard)
+
+ const static TCHAR* tokenImageArray[] = {
+ _T("<EOF>"),
+ _T("<UNKNOWN>"),
+ _T("<ALPHANUM>"),
+ _T("<APOSTROPHE>"),
+ _T("<ACRONYM>"),
+ _T("<COMPANY>"),
+ _T("<EMAIL>"),
+ _T("<HOST>"),
+ _T("<NUM>"),
+ _T("<CJK>")
+ };
+ const TCHAR** tokenImage = tokenImageArray;
+
+ /* A bunch of shortcut macros, many of which make assumptions about variable
+ ** names. These macros enhance readability, not just convenience! */
+ #define EOS (ch==-1 || rd->Eos())
+ #define SPACE (_istspace((TCHAR)ch) != 0)
+ #define ALPHA (_istalpha((TCHAR)ch) != 0)
+ #define ALNUM (_istalnum(ch) != 0)
+ #define DIGIT (_istdigit(ch) != 0)
+ #define UNDERSCORE (ch == '_')
+
+ #define _CJK ( (ch>=0x3040 && ch<=0x318f) || \
+ (ch>=0x3300 && ch<=0x337f) || \
+ (ch>=0x3400 && ch<=0x3d2d) || \
+ (ch>=0x4e00 && ch<=0x9fff) || \
+ (ch>=0xf900 && ch<=0xfaff) || \
+ (ch>=0xac00 && ch<=0xd7af) ) //korean
+
+
+ #define DASH (ch == '-')
+ #define NEGATIVE_SIGN_ DASH
+ //#define POSITIVE_SIGN_ (ch == '+')
+ //#define SIGN (NEGATIVE_SIGN_ || POSITIVE_SIGN_)
+
+ #define DOT (ch == '.')
+ #define DECIMAL DOT
+
+
+ //freebsd seems to have a problem with defines over multiple lines, so this has to be one long line
+ #define _CONSUME_AS_LONG_AS(conditionFails) while (true) { ch = readChar(); if (ch==-1 || (!(conditionFails) || str.len >= LUCENE_MAX_WORD_LEN)) { break; } str.appendChar(ch);}
+
+ #define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA)
+
+ #define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT)
+
+ /* otherMatches is a condition (possibly compound) under which a character
+ ** that's not an ALNUM or UNDERSCORE can be considered not to break the
+ ** span. Callers should pass false if only ALNUM/UNDERSCORE are acceptable. */
+ #define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE)
+
+ /*
+ ** Consume CJK characters
+ */
+ #define CONSUME_CJK _CONSUME_AS_LONG_AS(_CJK)
+
+
+ /* It is considered that "nothing of value" has been read if:
+ ** a) The "read head" hasn't moved since specialCharPos was established.
+ ** or
+ ** b) The "read head" has moved by one character, but that character was
+ ** either whitespace or not among the characters found in the body of
+ ** a token (deliberately doesn't include the likes of '@'/'&'). */
+ #define CONSUMED_NOTHING_OF_VALUE (rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) )))
+
+ #define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1])
+ #define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c)
+ /* To discard the last character in a StringBuffer, we decrement the buffer's
+ ** length indicator and move the terminator back by one character. */
+ #define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0')
+
+ //#define REMOVE_TRAILING_CHARS(sb, charMatchesCondition) { TCHAR* sbBuf = sb.getBuffer(); for (int32_t i = sb.len-1; i >= 0; i--) { TCHAR c = sbBuf[i]; if (charMatchesCondition) { sbBuf[--sb.len] = '\0'; } else {break;}}}
+
+ /* Does StringBuffer sb contain any of the characters in string ofThese? */
+ #define CONTAINS_ANY(sb, ofThese) (_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len))
+
+
+ StandardTokenizer::StandardTokenizer(Reader* reader):
+ rd(_CLNEW FastCharStream(reader)),
+ /* rdPos is zero-based. It starts at -1, and will advance to the first
+ ** position when readChar() is first called. */
+ rdPos(-1),
+ tokenStart(-1)
+ {
+ }
+
+ StandardTokenizer::~StandardTokenizer() {
+ _CLDELETE(rd);
+ }
+
+ int StandardTokenizer::readChar() {
+ /* Increment by 1 because we're speaking in terms of characters, not
+ ** necessarily bytes: */
+ rdPos++;
+ return rd->GetNext();
+ }
+
+ void StandardTokenizer::unReadChar() {
+ rd->UnGet();
+ rdPos--;
+ }
+
+ inline bool StandardTokenizer::setToken(Token* t, StringBuffer* sb, TokenTypes tokenCode) {
+ t->setStartOffset(tokenStart);
+ t->setEndOffset(tokenStart+sb->length());
+ t->setType(tokenImage[tokenCode]);
+ sb->getBuffer(); //null terminates the buffer
+ t->resetTermTextLen();
+ return true;
+ }
+
+ bool StandardTokenizer::next(Token* t) {
+ int ch=0;
+ while (!EOS) {
+ ch = readChar();
+
+ if ( ch == 0 || ch == -1 ){
+ continue;
+ } else if (SPACE) {
+ continue;
+ } else if (ALPHA || UNDERSCORE) {
+ tokenStart = rdPos;
+ return ReadAlphaNum(ch,t);
+ } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
+ tokenStart = rdPos;
+ /* ReadNumber returns NULL if it fails to extract a valid number; in
+ ** that case, we just continue. */
+ if (ReadNumber(NULL, ch,t))
+ return true;
+ } else if ( _CJK ){
+ if ( ReadCJK(ch,t) )
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool StandardTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) {
+ /* previousNumber is only non-NULL if this function already read a complete
+ ** number in a previous recursion, yet has been asked to read additional
+ ** numeric segments. For example, in the HOST "192.168.1.3", "192.168" is
+ ** a complete number, but this function will recurse to read the "1.3",
+ ** generating a single HOST token "192.168.1.3". */
+ t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
+ StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
+ TokenTypes tokenType;
+ bool decExhausted;
+ if (previousNumber != NULL) {
+ str.prepend(previousNumber);
+ tokenType = CL_NS2(analysis,standard)::HOST;
+ decExhausted = false;
+ } else {
+ tokenType = CL_NS2(analysis,standard)::NUM;
+ decExhausted = (prev == '.');
+ }
+ if ( str.len >= LUCENE_MAX_WORD_LEN ){
+ //if a number is too long, i would say there is no point
+ //storing it, because its going to be the wrong number anyway?
+ //what do people think?
+ return false;
+ }
+ str.appendChar(prev);
+
+ const bool signExhausted = (prev == '-');
+ int ch = prev;
+
+ CONSUME_DIGITS;
+
+ if (str.len < 2 /* CONSUME_DIGITS didn't find any digits. */
+ && (
+ (signExhausted && !DECIMAL)
+ || (decExhausted /* && !DIGIT is implied, since CONSUME_DIGITS stopped on a non-digit. */)
+ )
+ )
+ {
+ /* We have either:
+ ** a) a negative sign that's not followed by either digit(s) or a decimal
+ ** b) a decimal that's not followed by digit(s)
+ ** so this is not a valid number. */
+ if (!EOS) {
+ /* Unread the character that stopped CONSUME_DIGITS: */
+ unReadChar();
+ }
+ return false;
+ }
+
+ /* We just read a group of digits. Is it followed by a decimal symbol,
+ ** implying that there might be another group of digits available? */
+ if (!EOS) {
+ if (DECIMAL) {
+ if ( str.len >= LUCENE_MAX_WORD_LEN )
+ return false; //read above for rationale
+ str.appendChar(ch);
+ } else {
+ unReadChar();
+ goto SUCCESSFULLY_EXTRACTED_NUMBER;
+ }
+
+ CONSUME_DIGITS;
+ if (!DIGIT && !DECIMAL) {
+ unReadChar();
+ } else if (!EOS && DECIMAL && _istdigit(rd->Peek())) {
+ /* We just read the fractional digit group, but it's also followed by
+ ** a decimal symbol and at least one more digit, so this must be a
+ ** HOST rather than a real number. */
+ return ReadNumber(str.getBuffer(), '.',t);
+ }
+ }
+
+ SUCCESSFULLY_EXTRACTED_NUMBER:
+ TCHAR rightmost = RIGHTMOST(str);
+ /* Don't including a trailing decimal point. */
+ if (rightmost == '.') {
+ SHAVE_RIGHTMOST(str);
+ unReadChar();
+ rightmost = RIGHTMOST(str);
+ }
+ /* If all we have left is a negative sign, it's not a valid number. */
+ if (rightmost == '-') {
+ CND_PRECONDITION (str.len == 1, "Number is invalid");
+ return false;
+ }
+
+ return setToken(t,&str,tokenType);
+ }
+
+ bool StandardTokenizer::ReadAlphaNum(const TCHAR prev, Token* t) {
+ t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
+ StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
+ if ( str.len < LUCENE_MAX_WORD_LEN ){
+ str.appendChar(prev);
+ int ch = prev;
+
+ CONSUME_WORD;
+ if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { //still have space for 1 more character?
+ switch(ch) { /* What follows the first alphanum segment? */
+ case '.':
+ str.appendChar('.');
+ return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t);
+ case '\'':
+ str.appendChar('\'');
+ return ReadApostrophe(&str,t);
+ case '@':
+ str.appendChar('@');
+ return ReadAt(&str,t);
+ case '&':
+ str.appendChar('&');
+ return ReadCompany(&str,t);
+ /* default: fall through to end of this function. */
+ }
+ }
+ }
+ return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
+ }
+
+ bool StandardTokenizer::ReadCJK(const TCHAR prev, Token* t) {
+ t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
+ StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
+ if ( str.len < LUCENE_MAX_WORD_LEN ){
+ str.appendChar(prev);
+ int ch = prev;
+
+ CONSUME_CJK;
+ }
+ return setToken(t,&str,CL_NS2(analysis,standard)::CJK);
+ }
+
+
+ bool StandardTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, Token* t) {
+ const int32_t specialCharPos = rdPos;
+ StringBuffer& str=*_str;
+
+ /* A segment of a "dotted" is not allowed to begin with another dot or a dash.
+ ** Even though hosts, e-mail addresses, etc., could have a dotted-segment
+ ** that begins with a dot or a dash, it's far more common in source text
+ ** for a pattern like "abc.--def" to be intended as two tokens. */
+ int ch = rd->Peek();
+ if (!(DOT || DASH)) {
+ bool prevWasDot;
+ bool prevWasDash;
+ if (str.len == 0) {
+ prevWasDot = false;
+ prevWasDash = false;
+ } else {
+ prevWasDot = RIGHTMOST(str) == '.';
+ prevWasDash = RIGHTMOST(str) == '-';
+ }
+ while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
+ ch = readChar();
+ const bool dot = ch == '.';
+ const bool dash = ch == '-';
+
+ if (!(ALNUM || UNDERSCORE || dot || dash)) {
+ break;
+ }
+ /* Multiple dots or dashes in succession end the token.
+ ** Consider the following inputs:
+ ** "Visit windowsupdate.microsoft.com--update today!"
+ ** "In the U.S.A.--yes, even there!" */
+ if ((dot || dash) && (prevWasDot || prevWasDash)) {
+ /* We're not going to append the character we just read, in any case.
+ ** As to the character before it (which is currently RIGHTMOST(str)):
+ ** Unless RIGHTMOST(str) is a dot, in which we need to save it so the
+ ** acronym-versus-host detection can work, we want to get rid of it. */
+ if (!prevWasDot) {
+ SHAVE_RIGHTMOST(str);
+ }
+ break;
+ }
+
+ str.appendChar(ch);
+
+ prevWasDot = dot;
+ prevWasDash = dash;
+ }
+ }
+
+ /* There's a potential StringBuffer.append call in the code above, which
+ ** could cause str to reallocate its internal buffer. We must wait to
+ ** obtain the optimization-oriented strBuf pointer until after the initial
+ ** potentially realloc-triggering operations on str.
+ ** Because there can be other such ops much later in this function, strBuf
+ ** is guarded within a block to prevent its use during or after the calls
+ ** that would potentially invalidate it. */
+ { /* Begin block-guard of strBuf */
+ TCHAR* strBuf = str.getBuffer();
+
+ bool rightmostIsDot = RIGHTMOST_IS(str, '.');
+ if (CONSUMED_NOTHING_OF_VALUE) {
+ /* No more alphanums available for this token; shave trailing dot, if any. */
+ if (rightmostIsDot) {
+ SHAVE_RIGHTMOST(str);
+ }
+ /* If there are no dots remaining, this is a generic ALPHANUM. */
+ if (_tcschr(strBuf, '.') == NULL) {
+ forcedType = CL_NS2(analysis,standard)::ALPHANUM;
+ }
+
+ /* Check the token to see if it's an acronym. An acronym must have a
+ ** letter in every even slot and a dot in every odd slot, including the
+ ** last slot (for example, "U.S.A."). */
+ } else if (rightmostIsDot) {
+ bool isAcronym = true;
+ const int32_t upperCheckLimit = str.len - 1; /* -1 b/c we already checked the last slot. */
+
+ for (int32_t i = 0; i < upperCheckLimit; i++) {
+ const bool even = (i % 2 == 0);
+ ch = strBuf[i];
+ if ( (even && !ALPHA) || (!even && !DOT) ) {
+ isAcronym = false;
+ break;
+ }
+ }
+ if (isAcronym) {
+ forcedType = CL_NS2(analysis,standard)::ACRONYM;
+ } else {
+ /* If it's not an acronym, we don't want the trailing dot. */
+ SHAVE_RIGHTMOST(str);
+ /* If there are no dots remaining, this is a generic ALPHANUM. */
+ if (_tcschr(strBuf, '.') == NULL) {
+ forcedType = CL_NS2(analysis,standard)::ALPHANUM;
+ }
+ }
+ }
+ } /* End block-guard of strBuf */
+
+ if (!EOS) {
+ if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) {
+ str.appendChar('@');
+ return ReadAt(&str,t);
+ } else {
+ unReadChar();
+ }
+ }
+
+ return setToken(t,&str,CL_NS2(analysis,standard)::UNKNOWN
+ ? forcedType : CL_NS2(analysis,standard)::HOST);
+ }
+
+ bool StandardTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) {
+ StringBuffer& str=*_str;
+
+ TokenTypes tokenType = CL_NS2(analysis,standard)::APOSTROPHE;
+ const int32_t specialCharPos = rdPos;
+ int ch=0;
+
+ CONSUME_ALPHAS;
+ if (RIGHTMOST_IS(str, '\'') || CONSUMED_NOTHING_OF_VALUE) {
+ /* After the apostrophe, no more alphanums were available within this
+ ** token; shave trailing apostrophe and revert to generic ALPHANUM. */
+ SHAVE_RIGHTMOST(str);
+ tokenType = CL_NS2(analysis,standard)::ALPHANUM;
+ }
+ if (!EOS) {
+ unReadChar();
+ }
+
+ return setToken(t,&str,tokenType);
+ }
+
+ bool StandardTokenizer::ReadAt(StringBuffer* str, Token* t) {
+ ReadDotted(str, CL_NS2(analysis,standard)::EMAIL,t);
+ /* JLucene grammar indicates dots/digits not allowed in company name: */
+ if (!CONTAINS_ANY((*str), ".0123456789")) {
+ setToken(t,str,CL_NS2(analysis,standard)::COMPANY);
+ }
+ return true;
+ }
+
+ bool StandardTokenizer::ReadCompany(StringBuffer* _str, Token* t) {
+ StringBuffer& str = *_str;
+ const int32_t specialCharPos = rdPos;
+ int ch=0;
+
+ CONSUME_WORD;
+ if (CONSUMED_NOTHING_OF_VALUE) {
+ /* After the ampersand, no more alphanums were available within this
+ ** token; shave trailing ampersand and revert to ALPHANUM. */
+ CND_PRECONDITION(RIGHTMOST_IS(str, '&'),"ReadCompany failed");
+ SHAVE_RIGHTMOST(str);
+
+
+ return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
+ }
+ if (!EOS) {
+ unReadChar();
+ }
+
+ return setToken(t,&str,CL_NS2(analysis,standard)::COMPANY);
+ }
+
+CL_NS_END2