summaryrefslogtreecommitdiffstats
path: root/3rdparty/clucene/src/CLucene/queryParser/Lexer.cpp
diff options
context:
space:
mode:
Diffstat (limited to '3rdparty/clucene/src/CLucene/queryParser/Lexer.cpp')
-rw-r--r--3rdparty/clucene/src/CLucene/queryParser/Lexer.cpp371
1 files changed, 371 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/queryParser/Lexer.cpp b/3rdparty/clucene/src/CLucene/queryParser/Lexer.cpp
new file mode 100644
index 000000000..861c5d3cb
--- /dev/null
+++ b/3rdparty/clucene/src/CLucene/queryParser/Lexer.cpp
@@ -0,0 +1,371 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/StdHeader.h"
+#include "Lexer.h"
+
+#include "CLucene/util/FastCharStream.h"
+#include "CLucene/util/Reader.h"
+#include "CLucene/util/StringBuffer.h"
+#include "TokenList.h"
+#include "QueryToken.h"
+#include "QueryParserBase.h"
+
+CL_NS_USE(util)
+
+CL_NS_DEF(queryParser)
+Lexer::Lexer(QueryParserBase* queryparser, const TCHAR* query) {
+ //Func - Constructor
+ //Pre - query != NULL and contains the query string
+ //Post - An instance of Lexer has been created
+
+ this->queryparser = queryparser;
+
+ CND_PRECONDITION(query != NULL, "query is NULL");
+
+ //The InputStream of Reader must be destroyed in the destructor
+ delSR = true;
+
+ StringReader *r = _CLNEW StringReader(query);
+
+ //Check to see if r has been created properly
+ CND_CONDITION(r != NULL, "Could not allocate memory for StringReader r");
+
+ //Instantie a FastCharStream instance using r and assign it to reader
+ reader = _CLNEW FastCharStream(r);
+
+ //Check to see if reader has been created properly
+ CND_CONDITION(reader != NULL, "Could not allocate memory for FastCharStream reader");
+
+ //The InputStream of Reader must be destroyed in the destructor
+ delSR = true;
+
+}
+
+
+Lexer::Lexer(QueryParserBase* queryparser, Reader* source) {
+ //Func - Constructor
+ // Initializes a new instance of the Lexer class with the specified
+ // TextReader to lex.
+ //Pre - Source contains a valid reference to a Reader
+ //Post - An instance of Lexer has been created using source as the reader
+
+ this->queryparser = queryparser;
+
+ //Instantie a FastCharStream instance using r and assign it to reader
+ reader = _CLNEW FastCharStream(source);
+
+ //Check to see if reader has been created properly
+ CND_CONDITION(reader != NULL, "Could not allocate memory for FastCharStream reader");
+
+ //The InputStream of Reader must not be destroyed in the destructor
+ delSR = false;
+}
+
+
+Lexer::~Lexer() {
+ //Func - Destructor
+ //Pre - true
+ //Post - if delSR was true the InputStream input of reader has been deleted
+ // The instance of Lexer has been destroyed
+
+ if (delSR) {
+ _CLDELETE(reader->input);
+ }
+
+ _CLDELETE(reader);
+}
+
+
+void Lexer::Lex(TokenList *tokenList) {
+ //Func - Breaks the input stream onto the tokens list tokens
+ //Pre - tokens != NULL and contains a TokenList in which the tokens can be stored
+ //Post - The tokens have been added to the TokenList tokens
+
+ CND_PRECONDITION(tokenList != NULL, "tokens is NULL");
+
+ //Get all the tokens
+ while(true) {
+ //Add the token to the tokens list
+
+ //Get the next token
+ QueryToken* token = _CLNEW QueryToken;
+ if ( !GetNextToken(token) ){
+ _CLDELETE(token);
+ break;
+ }
+ tokenList->add(token);
+ }
+
+ //The end has been reached so create an EOF_ token
+ //Add the final token to the TokenList _tokens
+ tokenList->add(_CLNEW QueryToken( QueryToken::EOF_));
+}
+
+
+bool Lexer::GetNextToken(QueryToken* token) {
+ while(!reader->Eos()) {
+ int ch = reader->GetNext();
+
+ if ( ch == -1 )
+ break;
+
+ // skipping whitespaces
+ if( _istspace(ch)!=0 ) {
+ continue;
+ }
+ TCHAR buf[2] = {ch,'\0'};
+ switch(ch) {
+ case '+':
+ token->set(buf, QueryToken::PLUS);
+ return true;
+ case '-':
+ token->set(buf, QueryToken::MINUS);
+ return true;
+ case '(':
+ token->set(buf, QueryToken::LPAREN);
+ return true;
+ case ')':
+ token->set(buf, QueryToken::RPAREN);
+ return true;
+ case ':':
+ token->set(buf, QueryToken::COLON);
+ return true;
+ case '!':
+ token->set(buf, QueryToken::NOT);
+ return true;
+ case '^':
+ token->set(buf, QueryToken::CARAT);
+ return true;
+ case '~':
+ if( _istdigit( reader->Peek() )!=0 ) {
+ TCHAR number[LUCENE_MAX_FIELD_LEN];
+ ReadIntegerNumber(ch, number,LUCENE_MAX_FIELD_LEN);
+ token->set(number, QueryToken::SLOP);
+ return true;
+ }else{
+ token->set(buf, QueryToken::FUZZY);
+ return true;
+ }
+ break;
+ case '"':
+ return ReadQuoted(ch, token);
+ case '[':
+ return ReadInclusiveRange(ch, token);
+ case '{':
+ return ReadExclusiveRange(ch, token);
+ case ']':
+ case '}':
+ case '*':
+ queryparser->throwParserException( _T("Unrecognized TCHAR %d at %d::%d."),
+ ch, reader->Column(), reader->Line() );
+ return false;
+ default:
+ return ReadTerm(ch, token);
+
+ // end of swith
+ }
+
+ }
+ return false;
+}
+
+
+void Lexer::ReadIntegerNumber(const TCHAR ch, TCHAR* buf, int buflen) {
+ int bp=0;
+ buf[bp++] = ch;
+
+ int c = reader->Peek();
+ while( c!=-1 && _istdigit(c)!=0 && bp<buflen-1 ) {
+ buf[bp++] = reader->GetNext();
+ c = reader->Peek();
+ }
+ buf[bp++] = 0;
+}
+
+
+bool Lexer::ReadInclusiveRange(const TCHAR prev, QueryToken* token) {
+ int ch = prev;
+ StringBuffer range;
+ range.appendChar(ch);
+
+ while(!reader->Eos()) {
+ ch = reader->GetNext();
+ if ( ch == -1 )
+ break;
+ range.appendChar(ch);
+
+ if(ch == ']'){
+ token->set(range.getBuffer(), QueryToken::RANGEIN);
+ return true;
+ }
+ }
+ queryparser->throwParserException(_T("Unterminated inclusive range! %d %d::%d"),' ',
+ reader->Column(),reader->Column());
+ return false;
+}
+
+
+bool Lexer::ReadExclusiveRange(const TCHAR prev, QueryToken* token) {
+ int ch = prev;
+ StringBuffer range;
+ range.appendChar(ch);
+
+ while(!reader->Eos()) {
+ ch = reader->GetNext();
+
+ if (ch==-1)
+ break;
+ range.appendChar(ch);
+
+ if(ch == '}'){
+ token->set(range.getBuffer(), QueryToken::RANGEEX);
+ return true;
+ }
+ }
+ queryparser->throwParserException(_T("Unterminated exclusive range! %d %d::%d"),' ',
+ reader->Column(),reader->Column() );
+ return false;
+}
+
+bool Lexer::ReadQuoted(const TCHAR prev, QueryToken* token) {
+ int ch = prev;
+ StringBuffer quoted;
+ quoted.appendChar(ch);
+
+ while(!reader->Eos()) {
+ ch = reader->GetNext();
+
+ if (ch==-1)
+ break;
+
+ quoted.appendChar(ch);
+
+ if(ch == '"'){
+ token->set(quoted.getBuffer(), QueryToken::QUOTED);
+ return true;
+ }
+ }
+ queryparser->throwParserException(_T("Unterminated string! %d %d::%d"),' ',
+ reader->Column(),reader->Column());
+ return false;
+}
+
+
+bool Lexer::ReadTerm(const TCHAR prev, QueryToken* token) {
+ int ch = prev;
+ bool completed = false;
+ int32_t asteriskCount = 0;
+ bool hasQuestion = false;
+
+ StringBuffer val;
+ TCHAR buf[3]; //used for readescaped
+
+ while(true) {
+ switch(ch) {
+ case -1:
+ break;
+ case '\\':
+ {
+ if ( ReadEscape(ch, buf) )
+ val.append( buf );
+ else
+ return false;
+ }
+ break;
+
+ case LUCENE_WILDCARDTERMENUM_WILDCARD_STRING:
+ asteriskCount++;
+ val.appendChar(ch);
+ break;
+ case LUCENE_WILDCARDTERMENUM_WILDCARD_CHAR:
+ hasQuestion = true;
+ val.appendChar(ch);
+ break;
+ case '\n':
+ case '\t':
+ case ' ':
+ case '+':
+ case '-':
+ case '!':
+ case '(':
+ case ')':
+ case ':':
+ case '^':
+ case '[':
+ case ']':
+ case '{':
+ case '}':
+ case '~':
+ case '"':
+ // create new QueryToken
+ reader->UnGet();
+ completed = true;
+ break;
+ default:
+ val.appendChar(ch);
+ break;
+ // end of switch
+ }
+
+ if(completed || ch==-1 || reader->Eos() )
+ break;
+ else
+ ch = reader->GetNext();
+ }
+
+ // create new QueryToken
+ if(hasQuestion)
+ token->set(val.getBuffer(), QueryToken::WILDTERM);
+ else if(asteriskCount == 1 && val.getBuffer()[val.length() - 1] == '*')
+ token->set(val.getBuffer(), QueryToken::PREFIXTERM);
+ else if(asteriskCount > 0)
+ token->set(val.getBuffer(), QueryToken::WILDTERM);
+ else if( _tcsicmp(val.getBuffer(), _T("AND"))==0 || _tcscmp(val.getBuffer(), _T("&&"))==0 )
+ token->set(val.getBuffer(), QueryToken::AND_);
+ else if( _tcsicmp(val.getBuffer(), _T("OR"))==0 || _tcscmp(val.getBuffer(), _T("||"))==0)
+ token->set(val.getBuffer(), QueryToken::OR);
+ else if( _tcsicmp(val.getBuffer(), _T("NOT"))==0 )
+ token->set(val.getBuffer(), QueryToken::NOT);
+ else {
+ bool isnum = true;
+ int32_t nlen=val.length();
+ for (int32_t i=0;i<nlen;++i) {
+ TCHAR ch=val.getBuffer()[i];
+ if ( _istalpha(ch) ) {
+ isnum=false;
+ break;
+ }
+ }
+
+ if ( isnum )
+ token->set(val.getBuffer(), QueryToken::NUMBER);
+ else
+ token->set(val.getBuffer(), QueryToken::TERM);
+ }
+ return true;
+}
+
+
+bool Lexer::ReadEscape(TCHAR prev, TCHAR* buf) {
+ TCHAR ch = prev;
+ int bp=0;
+ buf[bp++] = ch;
+
+ ch = reader->GetNext();
+ int32_t idx = _tcscspn( buf, _T("\\+-!():^[]{}\"~*") );
+ if(idx == 0) {
+ buf[bp++] = ch;
+ buf[bp++]=0;
+ return true;
+ }
+ queryparser->throwParserException(_T("Unrecognized escape sequence at %d %d::%d"), ' ',
+ reader->Column(),reader->Line());
+ return false;
+}
+
+
+CL_NS_END