summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/clucene/src/CLucene/config/gunichartables.cpp
blob: 5463936f611d9e9db860fe86f80b603b5a15f15a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
/*
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 *
 ************************************************
 * Also licensed with permission from Tom Tromey 
 * and Owen Taylor under the Apache license.
 * Original location:
 * http://cvs.gnome.org/viewcvs/glib/glib/guniprop.c?view=log
 ************************************************
 * 
 * Copyright 2003-2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 
/*
 * Changes are Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
*/

#include "CLucene/StdHeader.h"

/* Local definitions of the GLib-style scalar type names used by the code
 * below (this file derives from glib's guniprop.c).
 *   gunichar - holds one Unicode character value
 *   guint16  - unsigned short
 *   gint16   - signed short
 *   gchar    - plain char
 *   guchar   - unsigned char
 */
typedef unsigned long  gunichar;
typedef unsigned short guint16;
typedef          short gint16;
typedef          char  gchar;
typedef unsigned char  guchar;

/* These are the possible character classifications.
 * See http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 * or http://www.unicode.org/Public/UNIDATA/UCD.html.
 *
 * The enumerators have no explicit values, so their numeric codes are fixed
 * by declaration order.  The result of TYPE() is compared against these
 * constants, so the order presumably has to match the generated tables in
 * gunichartables.h -- do not reorder.
 *
 * TODO: there may be a newer version of the Unicode data, which we should
 * use.  The data is licensed under http://www.unicode.org/copyright.html;
 * this looks Apache compatible, but that has not been confirmed.
 */
typedef enum
{
  G_UNICODE_CONTROL,
  G_UNICODE_FORMAT,
  G_UNICODE_UNASSIGNED,
  G_UNICODE_PRIVATE_USE,
  G_UNICODE_SURROGATE,
  G_UNICODE_LOWERCASE_LETTER,
  G_UNICODE_MODIFIER_LETTER,
  G_UNICODE_OTHER_LETTER,
  G_UNICODE_TITLECASE_LETTER,
  G_UNICODE_UPPERCASE_LETTER,
  G_UNICODE_COMBINING_MARK,
  G_UNICODE_ENCLOSING_MARK,
  G_UNICODE_NON_SPACING_MARK,
  G_UNICODE_DECIMAL_NUMBER,
  G_UNICODE_LETTER_NUMBER,
  G_UNICODE_OTHER_NUMBER,
  G_UNICODE_CONNECT_PUNCTUATION,
  G_UNICODE_DASH_PUNCTUATION,
  G_UNICODE_CLOSE_PUNCTUATION,
  G_UNICODE_FINAL_PUNCTUATION,
  G_UNICODE_INITIAL_PUNCTUATION,
  G_UNICODE_OTHER_PUNCTUATION,
  G_UNICODE_OPEN_PUNCTUATION,
  G_UNICODE_CURRENCY_SYMBOL,
  G_UNICODE_MODIFIER_SYMBOL,
  G_UNICODE_MATH_SYMBOL,
  G_UNICODE_OTHER_SYMBOL,
  G_UNICODE_LINE_SEPARATOR,
  G_UNICODE_PARAGRAPH_SEPARATOR,
  G_UNICODE_SPACE_SEPARATOR
} GUnicodeType;


#include "gunichartables.h"

/* NOTE: every macro in this group expands its arguments more than once, so
 * only pass expressions without side effects.
 */

/* Page-index entry of the attribute table for a 256-character page: pages up
 * to G_UNICODE_LAST_PAGE_PART1 are read from attr_table_part1, later pages
 * from attr_table_part2 after subtracting 0xe00 from the page number.
 */
#define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
                          ? attr_table_part1[Page] \
                          : attr_table_part2[(Page) - 0xe00])

/* Attribute value of character Char (0..0xff) within page Page.  A page whose
 * index entry equals G_UNICODE_MAX_TABLE_INDEX yields 0 for every character;
 * otherwise the value is read from attr_data.
 */
#define ATTTABLE(Page, Char) \
  ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char]))


/* Type of character Char within page Page of the first table part.  An index
 * entry >= G_UNICODE_MAX_TABLE_INDEX encodes one type for the whole page
 * (entry minus G_UNICODE_MAX_TABLE_INDEX); any smaller entry selects a row of
 * type_data that is then indexed by Char.
 */
#define TTYPE_PART1(Page, Char) \
  ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part1[Page]][Char]))

/* Same lookup as TTYPE_PART1, but through type_table_part2. */
#define TTYPE_PART2(Page, Char) \
  ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part2[Page]][Char]))

/* Classification of the character value Char, to be compared against the
 * GUnicodeType constants.  Values up to G_UNICODE_LAST_CHAR_PART1 use the
 * first table part (page = Char >> 8, offset = Char & 0xff); values from
 * 0xe0000 through G_UNICODE_LAST_CHAR use the second part with the page
 * counted from 0xe0000; everything else is G_UNICODE_UNASSIGNED.
 */
#define TYPE(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : G_UNICODE_UNASSIGNED))

/* Yields the element count of an array object.  This is only valid for
 * something actually declared as an array; applied to a pointer (for
 * example dynamically allocated storage) the result is meaningless.
 */
#define G_N_ELEMENTS(arr)		(sizeof (arr) / sizeof (*(arr)))




/* The functions from here down to the matching #endif (after
 * cl_tcscasefoldcmp) are compiled only when
 * LUCENE_USE_INTERNAL_CHAR_FUNCTIONS is defined.  In that case a build-time
 * notice is emitted: via #pragma message when _LUCENE_PRAGMA_WARNINGS is
 * defined, otherwise via #warning -- which is skipped when Q_OS_SOLARIS or
 * Q_CC_MIPS is defined.
 */
#if defined(LUCENE_USE_INTERNAL_CHAR_FUNCTIONS)
#ifdef _LUCENE_PRAGMA_WARNINGS
 #pragma message ("===== Using internal character function =====")
#else
#if !(defined(Q_OS_SOLARIS) || defined(Q_CC_MIPS))
#warning "===== Using internal character function ====="
#endif
#endif

/**
 * cl_isletter:
 * @c: a Unicode character
 *
 * Determines whether a character is a letter, i.e. whether its type is
 * one of the lowercase, titlecase, uppercase, modifier or other letter
 * classifications.
 *
 * Return value: true if @c is a letter
 **/
bool cl_isletter(gunichar c)
{
    const int category = TYPE (c);

    return category == G_UNICODE_LOWERCASE_LETTER
        || category == G_UNICODE_TITLECASE_LETTER
        || category == G_UNICODE_UPPERCASE_LETTER
        || category == G_UNICODE_MODIFIER_LETTER
        || category == G_UNICODE_OTHER_LETTER;
}

/**
 * cl_isalnum:
 * @c: a Unicode character
 *
 * Determines whether a character is alphanumeric: any of the five letter
 * classifications, or a decimal, letter or other number.
 *
 * Return value: true if @c is a letter or a number
 **/
bool cl_isalnum(gunichar c)
{
    const int category = TYPE (c);

    switch (category)
    {
      /* letters */
      case G_UNICODE_LOWERCASE_LETTER:
      case G_UNICODE_TITLECASE_LETTER:
      case G_UNICODE_UPPERCASE_LETTER:
      case G_UNICODE_MODIFIER_LETTER:
      case G_UNICODE_OTHER_LETTER:
      /* numbers */
      case G_UNICODE_DECIMAL_NUMBER:
      case G_UNICODE_LETTER_NUMBER:
      case G_UNICODE_OTHER_NUMBER:
        return true;

      default:
        return false;
    }
}

/**
 * cl_isdigit:
 * @c: a Unicode character
 *
 * Determines whether a character is numeric.  Note that this accepts
 * letter numbers and other numbers as well as decimal numbers.
 *
 * Return value: true if @c has one of the three number classifications
 **/
bool cl_isdigit(gunichar c)
{
    const int category = TYPE (c);

    if (category == G_UNICODE_DECIMAL_NUMBER)
        return true;
    if (category == G_UNICODE_LETTER_NUMBER)
        return true;
    return category == G_UNICODE_OTHER_NUMBER;
}

/**
 * cl_isspace:
 * @c: a Unicode character
 *
 * Determines whether a character is white space: a tab, newline, carriage
 * return or form feed, or any character classified as a space, line or
 * paragraph separator.  Given some UTF-8 text, obtain a character value
 * with lucene_utf8towc().
 *
 * (Note: don't use this to do word breaking; that needs Pango or an
 * equivalent, because the word-breaking algorithm is fairly complex.)
 *
 * Return value: true if @c is a white-space character
 **/
bool cl_isspace (gunichar c)
{
  /* Unicode does not treat these four as spaces, so accept them up front. */
  if (c == '\t' || c == '\n' || c == '\r' || c == '\f')
    return true;

  const int category = TYPE (c);
  if (category == G_UNICODE_SPACE_SEPARATOR)
    return true;
  if (category == G_UNICODE_LINE_SEPARATOR)
    return true;
  return category == G_UNICODE_PARAGRAPH_SEPARATOR;
}



/**
 * cl_tolower:
 * @ch: a Unicode character.
 *
 * Converts a character to lower case.
 *
 * Return value: the result of converting @ch to lower case.
 *               If @ch is not an uppercase or titlecase character,
 *               or has no lowercase equivalent, @ch is returned unchanged.
 **/
TCHAR cl_tolower (TCHAR ch)
{
  const gunichar code = ch;
  const int category = TYPE (code);

  if (category == G_UNICODE_TITLECASE_LETTER)
  {
      /* title_table rows are matched on column 0; column 2 is the
       * lowercase form. */
      for (unsigned int row = 0; row < G_N_ELEMENTS (title_table); ++row)
      {
        if (title_table[row][0] == code)
          return title_table[row][2];
      }
      return code;
  }

  if (category != G_UNICODE_UPPERCASE_LETTER)
      return code;

  const gunichar mapped = ATTTABLE (code >> 8, code & 0xff);

  /* Below 0x1000000 the attribute is the lowercase character itself,
   * with 0 meaning "no mapping". */
  if (mapped < 0x1000000)
      return mapped ? mapped : code;

  /* Otherwise it is an offset (biased by 0x1000000) into
   * special_case_table, where the replacement is stored as UTF-8. */
  const gchar *utf8 = special_case_table + mapped - 0x1000000;
  wchar_t decoded = 0;
  lucene_utf8towc(&decoded, utf8, 6);
#ifdef _UCS2
  return decoded;
#else
  return LUCENE_OOR_CHAR(decoded);
#endif
}

/**
 * cl_toupper:
 * @ch: a Unicode character
 *
 * Converts a character to uppercase.
 *
 * Return value: the result of converting @ch to uppercase.
 *               If @ch is not a lowercase or titlecase character,
 *               or has no uppercase equivalent, @ch is returned unchanged.
 **/
TCHAR cl_toupper (TCHAR ch)
{
  const gunichar code = ch;
  const int category = TYPE (code);

  switch (category)
    {
    case G_UNICODE_LOWERCASE_LETTER:
      {
        const gunichar mapped = ATTTABLE (code >> 8, code & 0xff);

        /* Below 0x1000000 the attribute is the uppercase character itself,
         * with 0 meaning "no mapping". */
        if (mapped < 0x1000000)
          return mapped ? mapped : code;

        /* Otherwise it is an offset (biased by 0x1000000) into
         * special_case_table, where the replacement is stored as UTF-8. */
        const gchar *utf8 = special_case_table + mapped - 0x1000000;
        wchar_t decoded = 0;
        lucene_utf8towc(&decoded, utf8, 6);
#ifdef _UCS2
        return decoded;
#else
        return LUCENE_OOR_CHAR(decoded);
#endif
      }

    case G_UNICODE_TITLECASE_LETTER:
      /* title_table rows are matched on column 0; column 1 is the
       * uppercase form. */
      for (unsigned int row = 0; row < G_N_ELEMENTS (title_table); ++row)
        {
          if (title_table[row][0] == code)
            return title_table[row][1];
        }
      break;

    default:
      break;
    }

  return code;
}



/**
 * cl_tcasefold:
 * @ch: a character
 *
 * Converts a single character into a form that is independent of case.
 * The result will not correspond to any particular case, but can be
 * compared for equality or ordered with the results of calling
 * cl_tcasefold() on other characters.
 *
 * @ch is looked up in casefold_table with a binary search (which relies on
 * the table being ordered by ascending character value).  On a hit, the
 * character that lucene_utf8towc() decodes from the entry's data is
 * returned; otherwise the result of cl_tolower() is returned.
 *
 * Note that comparing case-folded text is only an approximation to the
 * correct linguistic case-insensitive ordering, though a fairly good one.
 *
 * Return value: the case-independent form of @ch.
 **/
TCHAR cl_tcasefold(const TCHAR ch){
    int start = 0;
    int end = G_N_ELEMENTS (casefold_table);

    /* Only search when ch lies within the range covered by the table. */
    if (ch >= casefold_table[start].ch &&
        ch <= casefold_table[end - 1].ch)
    {
        while (1)
        {
            int half = (start + end) / 2;
            if (ch == casefold_table[half].ch)
            {
                wchar_t ret=0;
                lucene_utf8towc(&ret,casefold_table[half].data,6);

#ifdef _UCS2
                return ret;
#else
                /* This branch previously lacked both the return and the
                 * terminating semicolon; return the folded character the
                 * same way cl_tolower() and cl_toupper() do. */
                return LUCENE_OOR_CHAR(ret);
#endif
            }else if (half == start){
                /* Interval cannot shrink any further: ch is not in the table. */
                break;
            }else if (ch > casefold_table[half].ch){
                start = half;
            }else{
                end = half;
            }
        }
    }
    return cl_tolower(ch);

}


//this function was not taken from gnome
//Case-folds str in place, one character at a time via cl_tcasefold().
//Stops at the terminating zero character, or after len characters when
//len is not negative. Returns str.
TCHAR* cl_tcscasefold( TCHAR * str, int len ) //len default is -1
{
    for (TCHAR *cursor = str;
         (len < 0 || cursor < str + len) && *cursor;
         ++cursor)
    {
        *cursor = cl_tcasefold(*cursor);
    }
    return str;
}
//this function was not taken from gnome
int cl_tcscasefoldcmp(const TCHAR * dst, const TCHAR * src){
    TCHAR f,l;
    
    do{
        f = cl_tcasefold( (*(dst++)) );
        l = cl_tcasefold( (*(src++)) );
    } while ( (f) && (f == l) );
    
    return (int)(f - l);
}

#endif