summaryrefslogtreecommitdiffstats
path: root/3rdparty/clucene/src/CLucene/config/gunichartables.cpp
diff options
context:
space:
mode:
Diffstat (limited to '3rdparty/clucene/src/CLucene/config/gunichartables.cpp')
-rw-r--r--3rdparty/clucene/src/CLucene/config/gunichartables.cpp386
1 files changed, 386 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/config/gunichartables.cpp b/3rdparty/clucene/src/CLucene/config/gunichartables.cpp
new file mode 100644
index 000000000..5463936f6
--- /dev/null
+++ b/3rdparty/clucene/src/CLucene/config/gunichartables.cpp
@@ -0,0 +1,386 @@
+/*
+ * Copyright (C) 1999 Tom Tromey
+ * Copyright (C) 2000 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ *
+ ************************************************
+ * Also licensed with permission from Tom Tromey
+ * and Owen Taylor under the Apache license.
+ * Original location:
+ * http://cvs.gnome.org/viewcvs/glib/glib/guniprop.c?view=log
+ ************************************************
+ *
+ * Copyright 2003-2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Changes are Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
+*/
+
+#include "CLucene/StdHeader.h"
+
+typedef unsigned long gunichar;
+typedef unsigned short guint16;
+typedef short gint16;
+typedef char gchar;
+typedef unsigned char guchar;
+
+/* These are the possible character classifications.
+ * See http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
+ or http://www.unicode.org/Public/UNIDATA/UCD.html.
+
+ todo: i think there is a new version of the unicode, which we should use.
+ data is licensed like this: http://www.unicode.org/copyright.html... not sure but looks apache compatible
+ */
+typedef enum
+{
+ G_UNICODE_CONTROL,
+ G_UNICODE_FORMAT,
+ G_UNICODE_UNASSIGNED,
+ G_UNICODE_PRIVATE_USE,
+ G_UNICODE_SURROGATE,
+ G_UNICODE_LOWERCASE_LETTER,
+ G_UNICODE_MODIFIER_LETTER,
+ G_UNICODE_OTHER_LETTER,
+ G_UNICODE_TITLECASE_LETTER,
+ G_UNICODE_UPPERCASE_LETTER,
+ G_UNICODE_COMBINING_MARK,
+ G_UNICODE_ENCLOSING_MARK,
+ G_UNICODE_NON_SPACING_MARK,
+ G_UNICODE_DECIMAL_NUMBER,
+ G_UNICODE_LETTER_NUMBER,
+ G_UNICODE_OTHER_NUMBER,
+ G_UNICODE_CONNECT_PUNCTUATION,
+ G_UNICODE_DASH_PUNCTUATION,
+ G_UNICODE_CLOSE_PUNCTUATION,
+ G_UNICODE_FINAL_PUNCTUATION,
+ G_UNICODE_INITIAL_PUNCTUATION,
+ G_UNICODE_OTHER_PUNCTUATION,
+ G_UNICODE_OPEN_PUNCTUATION,
+ G_UNICODE_CURRENCY_SYMBOL,
+ G_UNICODE_MODIFIER_SYMBOL,
+ G_UNICODE_MATH_SYMBOL,
+ G_UNICODE_OTHER_SYMBOL,
+ G_UNICODE_LINE_SEPARATOR,
+ G_UNICODE_PARAGRAPH_SEPARATOR,
+ G_UNICODE_SPACE_SEPARATOR
+} GUnicodeType;
+
+
+#include "gunichartables.h"
+
+#define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
+ ? attr_table_part1[Page] \
+ : attr_table_part2[(Page) - 0xe00])
+
+#define ATTTABLE(Page, Char) \
+ ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char]))
+
+
+#define TTYPE_PART1(Page, Char) \
+ ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+ ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+ : (type_data[type_table_part1[Page]][Char]))
+
+#define TTYPE_PART2(Page, Char) \
+ ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+ ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+ : (type_data[type_table_part2[Page]][Char]))
+
+#define TYPE(Char) \
+ (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
+ ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
+ : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
+ ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
+ : G_UNICODE_UNASSIGNED))
+
+/* Count the number of elements in an array. The array must be defined
+ * as such; using this with a dynamically allocated array will give
+ * incorrect results.
+ */
+#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
+
+
+
+
+#if defined(LUCENE_USE_INTERNAL_CHAR_FUNCTIONS)
+#ifdef _LUCENE_PRAGMA_WARNINGS
+ #pragma message ("===== Using internal character function =====")
+#else
+#if !(defined(Q_OS_SOLARIS) || defined(Q_CC_MIPS))
+#warning "===== Using internal character function ====="
+#endif
+#endif
+
+bool cl_isletter(gunichar c)
+{
+ int t = TYPE (c);
+ switch(t)
+ {
+ case G_UNICODE_LOWERCASE_LETTER: return true;
+ case G_UNICODE_TITLECASE_LETTER: return true;
+ case G_UNICODE_UPPERCASE_LETTER: return true;
+ case G_UNICODE_MODIFIER_LETTER: return true;
+ case G_UNICODE_OTHER_LETTER: return true;
+ default: return false;
+ }
+}
+
+bool cl_isalnum(gunichar c)
+{
+ int t = TYPE (c);
+ switch(t)
+ {
+ case G_UNICODE_LOWERCASE_LETTER: return true;
+ case G_UNICODE_TITLECASE_LETTER: return true;
+ case G_UNICODE_UPPERCASE_LETTER: return true;
+ case G_UNICODE_MODIFIER_LETTER: return true;
+ case G_UNICODE_OTHER_LETTER: return true;
+ case G_UNICODE_DECIMAL_NUMBER: return true;
+ case G_UNICODE_LETTER_NUMBER: return true;
+ case G_UNICODE_OTHER_NUMBER: return true;
+ default: return false;
+ }
+}
+
+bool cl_isdigit(gunichar c)
+{
+ int t = TYPE (c);
+ switch(t)
+ {
+ case G_UNICODE_DECIMAL_NUMBER: return true;
+ case G_UNICODE_LETTER_NUMBER: return true;
+ case G_UNICODE_OTHER_NUMBER: return true;
+ default: return false;
+ }
+}
+
+/**
+ * cl_isspace:
+ * @c: a Unicode character
+ *
+ * Determines whether a character is a space, tab, or line separator
+ * (newline, carriage return, etc.). Given some UTF-8 text, obtain a
+ * character value with lucene_utf8towc().
+ *
+ * (Note: don't use this to do word breaking; you have to use
+ * Pango or equivalent to get word breaking right, the algorithm
+ * is fairly complex.)
+ *
+ * Return value: %TRUE if @c is a punctuation character
+ **/
+bool cl_isspace (gunichar c)
+{
+ switch (c)
+ {
+ /* special-case these since Unicode thinks they are not spaces */
+ case '\t':
+ case '\n':
+ case '\r':
+ case '\f':
+ return true;
+
+ default:
+ {
+ int t = TYPE ((gunichar)c);
+ return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
+ || t == G_UNICODE_PARAGRAPH_SEPARATOR);
+ }
+ }
+}
+
+
+
+/**
+ * cl_tolower:
+ * @c: a Unicode character.
+ *
+ * Converts a character to lower case.
+ *
+ * Return value: the result of converting @c to lower case.
+ * If @c is not an upperlower or titlecase character,
+ * or has no lowercase equivalent @c is returned unchanged.
+ **/
+TCHAR cl_tolower (TCHAR ch)
+{
+ gunichar c=ch;
+ int t = TYPE ((gunichar)c);
+ if (t == G_UNICODE_UPPERCASE_LETTER)
+ {
+ gunichar val = ATTTABLE (c >> 8, c & 0xff);
+ if (val >= 0x1000000)
+ {
+ const gchar *p = special_case_table + val - 0x1000000;
+ int len=0;
+ wchar_t ret=0;
+ lucene_utf8towc(&ret,p,6);
+#ifdef _UCS2
+ return ret;
+#else
+ return LUCENE_OOR_CHAR(ret);
+#endif
+ //return cl_utf8_get_char (p, &len);
+ }else
+ return val ? val : c;
+ }else if (t == G_UNICODE_TITLECASE_LETTER){
+ unsigned int i;
+ for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
+ {
+ if (title_table[i][0] == c)
+ return title_table[i][2];
+ }
+ }
+ return c;
+}
+
+/**
+ * cl_toupper:
+ * @c: a Unicode character
+ *
+ * Converts a character to uppercase.
+ *
+ * Return value: the result of converting @c to uppercase.
+ * If @c is not an lowercase or titlecase character,
+ * or has no upper case equivalent @c is returned unchanged.
+ **/
+TCHAR cl_toupper (TCHAR ch)
+{
+ gunichar c=ch;
+ int t = TYPE (c);
+ if (t == G_UNICODE_LOWERCASE_LETTER)
+ {
+ gunichar val = ATTTABLE (c >> 8, c & 0xff);
+ if (val >= 0x1000000)
+ {
+ const gchar *p = special_case_table + val - 0x1000000;
+
+ wchar_t ret=0;
+ lucene_utf8towc(&ret,p,6);
+#ifdef _UCS2
+ return ret;
+#else
+ return LUCENE_OOR_CHAR(ret);
+#endif
+ //return lucene_utf8towc (p);
+ }
+ else
+ return val ? val : c;
+ }
+ else if (t == G_UNICODE_TITLECASE_LETTER)
+ {
+ unsigned int i;
+ for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
+ {
+ if (title_table[i][0] == c)
+ return title_table[i][1];
+ }
+ }
+ return c;
+}
+
+
+
+/**
+ * cl_tcasefold:
+ * @str: a unicode string
+ *
+ * Converts a string into a form that is independent of case. The
+ * result will not correspond to any particular case, but can be
+ * compared for equality or ordered with the results of calling
+ * cl_tcasefold() on other strings.
+ *
+ * Note that calling cl_tcasefold() followed by g_utf8_collate() is
+ * only an approximation to the correct linguistic case insensitive
+ * ordering, though it is a fairly good one. Getting this exactly
+ * right would require a more sophisticated collation function that
+ * takes case sensitivity into account. GLib does not currently
+ * provide such a function.
+ *
+ * Return value: a newly allocated string, that is a
+ * case independent form of @str.
+ **/
+TCHAR cl_tcasefold(const TCHAR ch){
+ int start = 0;
+ int end = G_N_ELEMENTS (casefold_table);
+
+ if (ch >= casefold_table[start].ch &&
+ ch <= casefold_table[end - 1].ch)
+ {
+ while (1)
+ {
+ int half = (start + end) / 2;
+ if (ch == casefold_table[half].ch)
+ {
+ wchar_t ret=0;
+ lucene_utf8towc(&ret,casefold_table[half].data,6);
+
+ #ifdef _UCS2
+ return ret;
+ #else
+ LUCENE_OOR_CHAR(ret)
+ #endif
+ }else if (half == start){
+ break;
+ }else if (ch > casefold_table[half].ch){
+ start = half;
+ }else{
+ end = half;
+ }
+ }
+ }
+ return cl_tolower(ch);
+
+}
+
+
+//this function was not taken from gnome
+TCHAR* cl_tcscasefold( TCHAR * str, int len ) //len default is -1
+{
+ TCHAR *p = str;
+ while ((len < 0 || p < str + len) && *p)
+ {
+ *p = cl_tcasefold(*p);
+ p++;
+ }
+ return str;
+}
+//this function was not taken from gnome
+int cl_tcscasefoldcmp(const TCHAR * dst, const TCHAR * src){
+ TCHAR f,l;
+
+ do{
+ f = cl_tcasefold( (*(dst++)) );
+ l = cl_tcasefold( (*(src++)) );
+ } while ( (f) && (f == l) );
+
+ return (int)(f - l);
+}
+
+#endif