diff options
-rw-r--r-- | src/corelib/text/qunicodetools.cpp | 1529 | ||||
-rw-r--r-- | src/corelib/text/qunicodetools_p.h | 4 | ||||
-rw-r--r-- | src/gui/text/qtextengine.cpp | 2 |
3 files changed, 1495 insertions, 40 deletions
diff --git a/src/corelib/text/qunicodetools.cpp b/src/corelib/text/qunicodetools.cpp index 76072f8282..819d8a9c3b 100644 --- a/src/corelib/text/qunicodetools.cpp +++ b/src/corelib/text/qunicodetools.cpp @@ -1,6 +1,6 @@ /**************************************************************************** ** -** Copyright (C) 2016 The Qt Company Ltd. +** Copyright (C) 2020 The Qt Company Ltd. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of the QtCore module of the Qt Toolkit. @@ -41,8 +41,7 @@ #include "qunicodetables_p.h" #include "qvarlengtharray.h" - -#include "qharfbuzz_p.h" +#include "qlibrary.h" #define FLAG(x) (1 << (x)) @@ -724,6 +723,1493 @@ static void getWhiteSpaces(const ushort *string, quint32 len, QCharAttributes *a } } +namespace Tailored { + +using CharAttributeFunction = void (*)(QChar::Script script, const ushort *text, uint from, uint len, QCharAttributes *attributes); + + +enum Form { + Invalid = 0x0, + UnknownForm = Invalid, + Consonant, + Nukta, + Halant, + Matra, + VowelMark, + StressMark, + IndependentVowel, + LengthMark, + Control, + Other +}; + +static const unsigned char indicForms[0xe00-0x900] = { + // Devangari + Invalid, VowelMark, VowelMark, VowelMark, + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, UnknownForm, UnknownForm, + Nukta, Other, Matra, Matra, + + Matra, Matra, Matra, Matra, + Matra, Matra, Matra, Matra, + Matra, Matra, Matra, Matra, + Matra, Halant, UnknownForm, UnknownForm, + + Other, StressMark, StressMark, StressMark, + StressMark, UnknownForm, UnknownForm, UnknownForm, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + IndependentVowel, IndependentVowel, VowelMark, VowelMark, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Consonant, + Consonant, Consonant /* ??? */, Consonant, Consonant, + + // Bengali + Invalid, VowelMark, VowelMark, VowelMark, + Invalid, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, Invalid, Invalid, IndependentVowel, + + IndependentVowel, Invalid, Invalid, IndependentVowel, + IndependentVowel, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Invalid, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Invalid, Consonant, Invalid, + Invalid, Invalid, Consonant, Consonant, + Consonant, Consonant, UnknownForm, UnknownForm, + Nukta, Other, Matra, Matra, + + Matra, Matra, Matra, Matra, + Matra, Invalid, Invalid, Matra, + Matra, Invalid, Invalid, Matra, + Matra, Halant, Consonant, UnknownForm, + + Invalid, Invalid, Invalid, Invalid, + Invalid, Invalid, Invalid, VowelMark, + Invalid, Invalid, Invalid, Invalid, + Consonant, Consonant, Invalid, Consonant, + + IndependentVowel, IndependentVowel, VowelMark, VowelMark, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + Consonant, Consonant, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + // Gurmukhi + Invalid, VowelMark, VowelMark, VowelMark, + Invalid, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, Invalid, + Invalid, Invalid, Invalid, IndependentVowel, + + IndependentVowel, Invalid, Invalid, IndependentVowel, + IndependentVowel, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Invalid, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Invalid, Consonant, Consonant, + Invalid, Consonant, Consonant, Invalid, + Consonant, Consonant, UnknownForm, UnknownForm, + Nukta, Other, Matra, Matra, + + Matra, Matra, Matra, Invalid, + Invalid, Invalid, Invalid, Matra, + Matra, Invalid, Invalid, Matra, + Matra, Halant, UnknownForm, UnknownForm, + + Invalid, Invalid, Invalid, Invalid, + Invalid, UnknownForm, UnknownForm, UnknownForm, + Invalid, Consonant, Consonant, Consonant, + Consonant, Invalid, Consonant, Invalid, + + Other, Other, Invalid, Invalid, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + StressMark, StressMark, Consonant, Consonant, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + // Gujarati + Invalid, VowelMark, VowelMark, VowelMark, + Invalid, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, Invalid, IndependentVowel, + + IndependentVowel, IndependentVowel, Invalid, IndependentVowel, + IndependentVowel, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Invalid, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Invalid, Consonant, Consonant, + Invalid, Consonant, Consonant, Consonant, + Consonant, Consonant, UnknownForm, UnknownForm, + Nukta, Other, Matra, Matra, + + Matra, Matra, Matra, Matra, + Matra, Matra, Invalid, Matra, + Matra, Matra, Invalid, Matra, + Matra, Halant, UnknownForm, UnknownForm, + + Other, UnknownForm, UnknownForm, UnknownForm, + UnknownForm, UnknownForm, UnknownForm, UnknownForm, + UnknownForm, UnknownForm, UnknownForm, UnknownForm, + UnknownForm, UnknownForm, UnknownForm, UnknownForm, + + IndependentVowel, IndependentVowel, VowelMark, VowelMark, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + // Oriya + Invalid, VowelMark, VowelMark, VowelMark, + Invalid, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, Invalid, Invalid, IndependentVowel, + + IndependentVowel, Invalid, Invalid, IndependentVowel, + IndependentVowel, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Invalid, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Invalid, Consonant, Consonant, + Invalid, Consonant, Consonant, Consonant, + Consonant, Consonant, UnknownForm, UnknownForm, + Nukta, Other, Matra, Matra, + + Matra, Matra, Matra, Matra, + Invalid, Invalid, Invalid, Matra, + Matra, Invalid, Invalid, Matra, + Matra, Halant, UnknownForm, UnknownForm, + + Other, Invalid, Invalid, Invalid, + Invalid, UnknownForm, LengthMark, LengthMark, + Invalid, Invalid, Invalid, Invalid, + Consonant, Consonant, Invalid, Consonant, + + IndependentVowel, IndependentVowel, Invalid, Invalid, + Invalid, Invalid, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + Other, Consonant, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + //Tamil + Invalid, Invalid, VowelMark, Other, + Invalid, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, Invalid, + Invalid, Invalid, IndependentVowel, IndependentVowel, + + IndependentVowel, Invalid, IndependentVowel, IndependentVowel, + IndependentVowel, Consonant, Invalid, Invalid, + Invalid, Consonant, Consonant, Invalid, + Consonant, Invalid, Consonant, Consonant, + + Invalid, Invalid, Invalid, Consonant, + Consonant, Invalid, Invalid, Invalid, + Consonant, Consonant, Consonant, Invalid, + Invalid, Invalid, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, UnknownForm, UnknownForm, + Invalid, Invalid, Matra, Matra, + + Matra, Matra, Matra, Invalid, + Invalid, Invalid, Matra, Matra, + Matra, Invalid, Matra, Matra, + Matra, Halant, Invalid, Invalid, + + Invalid, Invalid, Invalid, Invalid, + Invalid, Invalid, Invalid, LengthMark, + Invalid, Invalid, Invalid, Invalid, + Invalid, Invalid, Invalid, Invalid, + + Invalid, Invalid, Invalid, Invalid, + Invalid, Invalid, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + // Telugu + Invalid, VowelMark, VowelMark, VowelMark, + Invalid, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, Invalid, IndependentVowel, IndependentVowel, + + IndependentVowel, Invalid, IndependentVowel, IndependentVowel, + IndependentVowel, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Invalid, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Invalid, Consonant, Consonant, Consonant, + Consonant, Consonant, UnknownForm, UnknownForm, + Invalid, Invalid, Matra, Matra, + + Matra, Matra, Matra, Matra, + Matra, Invalid, Matra, Matra, + Matra, Invalid, Matra, Matra, + Matra, Halant, Invalid, Invalid, + + Invalid, Invalid, Invalid, Invalid, + Invalid, LengthMark, Matra, Invalid, + Invalid, Invalid, Invalid, Invalid, + Invalid, Invalid, Invalid, Invalid, + + IndependentVowel, IndependentVowel, Invalid, Invalid, + Invalid, Invalid, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + // Kannada + Invalid, Invalid, VowelMark, VowelMark, + Invalid, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, Invalid, IndependentVowel, IndependentVowel, + + IndependentVowel, Invalid, IndependentVowel, IndependentVowel, + IndependentVowel, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Invalid, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Invalid, Consonant, Consonant, Consonant, + Consonant, Consonant, UnknownForm, UnknownForm, + Nukta, Other, Matra, Matra, + + Matra, Matra, Matra, Matra, + Matra, Invalid, Matra, Matra, + Matra, Invalid, Matra, Matra, + Matra, Halant, Invalid, Invalid, + + Invalid, Invalid, Invalid, Invalid, + Invalid, LengthMark, LengthMark, Invalid, + Invalid, Invalid, Invalid, Invalid, + Invalid, Invalid, Consonant, Invalid, + + IndependentVowel, IndependentVowel, VowelMark, VowelMark, + Invalid, Invalid, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + // Malayalam + Invalid, Invalid, VowelMark, VowelMark, + Invalid, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, Invalid, IndependentVowel, IndependentVowel, + + IndependentVowel, Invalid, IndependentVowel, IndependentVowel, + IndependentVowel, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Invalid, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, UnknownForm, UnknownForm, + Invalid, Invalid, Matra, Matra, + + Matra, Matra, Matra, Matra, + Invalid, Invalid, Matra, Matra, + Matra, Invalid, Matra, Matra, + Matra, Halant, Invalid, Invalid, + + Invalid, Invalid, Invalid, Invalid, + Invalid, Invalid, Invalid, Matra, + Invalid, Invalid, Invalid, Invalid, + Invalid, Invalid, Invalid, Invalid, + + IndependentVowel, IndependentVowel, Invalid, Invalid, + Invalid, Invalid, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, + + // Sinhala + Invalid, Invalid, VowelMark, VowelMark, + Invalid, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + + IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel, + IndependentVowel, IndependentVowel, IndependentVowel, Invalid, + Invalid, Invalid, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + + Consonant, Consonant, Invalid, Consonant, + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Consonant, + Invalid, Consonant, Invalid, Invalid, + + Consonant, Consonant, Consonant, Consonant, + Consonant, Consonant, Consonant, Invalid, + Invalid, Invalid, Halant, Invalid, + Invalid, Invalid, Invalid, Matra, + + Matra, Matra, Matra, Matra, + Matra, Invalid, Matra, Invalid, + Matra, Matra, Matra, Matra, + Matra, Matra, Matra, Matra, + + Invalid, Invalid, Invalid, Invalid, + Invalid, Invalid, Invalid, Invalid, + Invalid, Invalid, Invalid, Invalid, + Invalid, Invalid, Invalid, Invalid, + + Invalid, Invalid, Matra, Matra, + Other, Other, Other, Other, + Other, Other, Other, Other, + Other, Other, Other, Other, +}; + +static inline Form form(unsigned short uc) { + if (uc < 0x900 || uc > 0xdff) { + if (uc == 0x25cc) + return Consonant; + if (uc == 0x200c || uc == 0x200d) + return Control; + return Other; + } + return (Form)indicForms[uc-0x900]; +} + +// #define INDIC_DEBUG +#ifdef INDIC_DEBUG +#define IDEBUG qDebug +#else +#define IDEBUG if constexpr (1) ; else qDebug +#endif + +/* syllables are of the form: + + (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark? + (Consonant Nukta? Halant)* Consonant Halant + IndependentVowel VowelMark? StressMark? + + We return syllable boundaries on invalid combinations aswell +*/ +static int indic_nextSyllableBoundary(QChar::Script script, const ushort *s, int start, int end, bool *invalid) +{ + *invalid = false; + IDEBUG("indic_nextSyllableBoundary: start=%d, end=%d", start, end); + const ushort *uc = s+start; + + int pos = 0; + Form state = form(uc[pos]); + IDEBUG("state[%d]=%d (uc=%4x)", pos, state, uc[pos]); + pos++; + + if (state != Consonant && state != IndependentVowel) { + if (state != Other) + *invalid = true; + goto finish; + } + + while (pos < end - start) { + Form newState = form(uc[pos]); + IDEBUG("state[%d]=%d (uc=%4x)", pos, newState, uc[pos]); + switch (newState) { + case Control: + newState = state; + if (state == Halant && uc[pos] == 0x200d /* ZWJ */) + break; + // the control character should be the last char in the item + if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */) + break; + if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */) + break; + // Bengali and Kannada has a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15 + ++pos; + goto finish; + case Consonant: + if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */)) + break; + goto finish; + case Halant: + if (state == Nukta || state == Consonant) + break; + // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya + if (script == QChar::Script_Bengali && pos == 1 && + (uc[0] == 0x0985 || uc[0] == 0x098f)) + break; + // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra. + if (script == QChar::Script_Sinhala && state == Matra) { + ++pos; + continue; + } + if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) { + ++pos; + continue; + } + goto finish; + case Nukta: + if (state == Consonant) + break; + goto finish; + case StressMark: + if (state == VowelMark) + break; + // fall through + case VowelMark: + if (state == Matra || state == LengthMark || state == IndependentVowel) + break; + // fall through + case Matra: + if (state == Consonant || state == Nukta) + break; + if (state == Matra) { + // ### needs proper testing for correct two/three part matras + break; + } + // ### not sure if this is correct. If it is, does it apply only to Bengali or should + // it work for all Indic languages? + // the combination Independent_A + Vowel Sign AA is allowed. + if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985) + break; + if (script == QChar::Script_Tamil && state == Matra) { + if (uc[pos-1] == 0x0bc6 && + (uc[pos] == 0xbbe || uc[pos] == 0xbd7)) + break; + if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe) + break; + } + goto finish; + + case LengthMark: + if (state == Matra) { + // ### needs proper testing for correct two/three part matras + break; + } + case IndependentVowel: + case Invalid: + case Other: + goto finish; + } + state = newState; + pos++; + } + finish: + return pos+start; +} + +static void indicAttributes(QChar::Script script, const ushort *text, uint from, uint len, QCharAttributes *attributes) +{ + int end = from + len; + const ushort *uc = text + from; + attributes += from; + uint i = 0; + while (i < len) { + bool invalid; + uint boundary = indic_nextSyllableBoundary(script, text, from+i, end, &invalid) - from; + attributes[i].graphemeBoundary = true; + + if (boundary > len-1) boundary = len; + i++; + while (i < boundary) { + attributes[i].graphemeBoundary = false; + ++uc; + ++i; + } + assert(i == boundary); + } + + +} + +#define LIBTHAI_MAJOR 0 + +/* + * if libthai changed please update these codes too. + */ +struct thcell_t { + unsigned char base; /**< base character */ + unsigned char hilo; /**< upper/lower vowel/diacritic */ + unsigned char top; /**< top-level mark */ +}; +typedef int (*th_brk_def) (const unsigned char*, int*, size_t); +typedef size_t (*th_next_cell_def) (const unsigned char *, size_t, struct thcell_t *, int); + +/* libthai related function handles */ +static th_brk_def th_brk = 0; +static th_next_cell_def th_next_cell = 0; + +static int init_libthai() { + static bool initialized = false; + if (!initialized && (!th_brk || !th_next_cell)) { + th_brk = (th_brk_def) QLibrary::resolve(QLatin1String("thai"), (int)LIBTHAI_MAJOR, "th_brk"); + th_next_cell = (th_next_cell_def)QLibrary::resolve(QLatin1String("thai"), LIBTHAI_MAJOR, "th_next_cell"); + initialized = true; + } + if (th_brk && th_next_cell) + return 1; + else + return 0; +} + +static void to_tis620(const ushort *string, uint len, char *cstr) +{ + uint i; + unsigned char *result = (unsigned char *)cstr; + + for (i = 0; i < len; ++i) { + if (string[i] <= 0xa0) + result[i] = (unsigned char)string[i]; + else if (string[i] >= 0xe01 && string[i] <= 0xe5b) + result[i] = (unsigned char)(string[i] - 0xe00 + 0xa0); + else + result[i] = (unsigned char)~0; // Same encoding as libthai uses for invalid chars + } + + result[len] = 0; +} + +/* + * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI. + */ +static void thaiAssignAttributes(const ushort *string, uint len, QCharAttributes *attributes) +{ + char s[128]; + char *cstr = s; + int *break_positions = 0; + int brp[128]; + int brp_size = 0; + uint numbreaks, i, j, cell_length; + struct thcell_t tis_cell; + + if (!init_libthai()) + return ; + + if (len >= 128) + cstr = (char *)malloc(len*sizeof(char) + 1); + + to_tis620(string, len, cstr); + + for (i = 0; i < len; ++i) { + attributes[i].wordBreak = false; + attributes[i].wordStart = false; + attributes[i].wordEnd = false; + attributes[i].lineBreak = false; + } + + if (len > 128) { + break_positions = (int*) malloc (sizeof(int) * len); + memset (break_positions, 0, sizeof(int) * len); + brp_size = len; + } + else { + break_positions = brp; + brp_size = 128; + } + + if (break_positions) { + attributes[0].wordBreak = true; + attributes[0].wordStart = true; + attributes[0].wordEnd = false; + numbreaks = th_brk((const unsigned char *)cstr, break_positions, brp_size); + for (i = 0; i < numbreaks; ++i) { + attributes[break_positions[i]].wordBreak = true; + attributes[break_positions[i]].wordStart = true; + attributes[break_positions[i]].wordEnd = true; + attributes[break_positions[i]].lineBreak = true; + } + if (numbreaks > 0) + attributes[break_positions[numbreaks - 1]].wordStart = false; + + if (break_positions != brp) + free(break_positions); + } + + /* manage grapheme boundaries */ + i = 0; + while (i < len) { + cell_length = (uint)(th_next_cell((const unsigned char *)cstr + i, len - i, &tis_cell, true)); + + attributes[i].graphemeBoundary = true; + for (j = 1; j < cell_length; j++) + attributes[i + j].graphemeBoundary = false; + + /* Set graphemeBoundary for SARA AM */ + if (cstr[i + cell_length - 1] == (char)0xd3) + attributes[i + cell_length - 1].graphemeBoundary = true; + + i += cell_length; + } + + if (len >= 128) + free(cstr); +} + +static void thaiAttributes(QChar::Script script, const ushort *text, uint from, uint len, QCharAttributes *attributes) +{ + assert(script == QChar::Script_Thai); + const ushort *uc = text + from; + attributes += from; + Q_UNUSED(script); + thaiAssignAttributes(uc, len, attributes); +} + +/* + tibetan syllables are of the form: + head position consonant + first sub-joined consonant + ....intermediate sub-joined consonants (if any) + last sub-joined consonant + sub-joined vowel (a-chung U+0F71) + standard or compound vowel sign (or 'virama' for devanagari transliteration) +*/ + +typedef enum { + TibetanOther, + TibetanHeadConsonant, + TibetanSubjoinedConsonant, + TibetanSubjoinedVowel, + TibetanVowel +} TibetanForm; + +/* this table starts at U+0f40 */ +static const unsigned char tibetanForm[0x80] = { + TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, + TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, + TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, + TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, + + TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, + TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, + TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, + TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, + + TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, + TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, + TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, + TibetanOther, TibetanOther, TibetanOther, TibetanOther, + + TibetanOther, TibetanVowel, TibetanVowel, TibetanVowel, + TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel, + TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel, + TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel, + + TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel, + TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel, + TibetanOther, TibetanOther, TibetanOther, TibetanOther, + TibetanOther, TibetanOther, TibetanOther, TibetanOther, + + TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, + TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, + TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, + TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, + + TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, + TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, + TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, + TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, + + TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, + TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, + TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, + TibetanSubjoinedConsonant, TibetanOther, TibetanOther, TibetanOther +}; + +#define tibetan_form(c) \ + ((c) >= 0x0f40 && (c) < 0x0fc0 ? (TibetanForm)tibetanForm[(c) - 0x0f40] : TibetanOther) + +static int tibetan_nextSyllableBoundary(const ushort *s, int start, int end, bool *invalid) +{ + const ushort *uc = s + start; + + int pos = 0; + TibetanForm state = tibetan_form(*uc); + +/* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/ + pos++; + + if (state != TibetanHeadConsonant) { + if (state != TibetanOther) + *invalid = true; + goto finish; + } + + while (pos < end - start) { + TibetanForm newState = tibetan_form(uc[pos]); + switch (newState) { + case TibetanSubjoinedConsonant: + case TibetanSubjoinedVowel: + if (state != TibetanHeadConsonant && + state != TibetanSubjoinedConsonant) + goto finish; + state = newState; + break; + case TibetanVowel: + if (state != TibetanHeadConsonant && + state != TibetanSubjoinedConsonant && + state != TibetanSubjoinedVowel) + goto finish; + break; + case TibetanOther: + case TibetanHeadConsonant: + goto finish; + } + pos++; + } + +finish: + *invalid = false; + return start+pos; +} + +static void tibetanAttributes(QChar::Script script, const ushort *text, uint from, uint len, QCharAttributes *attributes) +{ + int end = from + len; + const ushort *uc = text + from; + uint i = 0; + Q_UNUSED(script); + attributes += from; + while (i < len) { + bool invalid; + uint boundary = tibetan_nextSyllableBoundary(text, from+i, end, &invalid) - from; + + attributes[i].graphemeBoundary = true; + + if (boundary > len-1) boundary = len; + i++; + while (i < boundary) { + attributes[i].graphemeBoundary = false; + ++uc; + ++i; + } + assert(i == boundary); + } +} + +enum MymrCharClassValues { + Mymr_CC_RESERVED = 0, + Mymr_CC_CONSONANT = 1, /* Consonant of type 1, that has subscript form */ + Mymr_CC_CONSONANT2 = 2, /* Consonant of type 2, that has no subscript form */ + Mymr_CC_NGA = 3, /* Consonant NGA */ + Mymr_CC_YA = 4, /* Consonant YA */ + Mymr_CC_RA = 5, /* Consonant RA */ + Mymr_CC_WA = 6, /* Consonant WA */ + Mymr_CC_HA = 7, /* Consonant HA */ + Mymr_CC_IND_VOWEL = 8, /* Independent vowel */ + Mymr_CC_ZERO_WIDTH_NJ_MARK = 9, /* Zero Width non joiner character (0x200C) */ + Mymr_CC_VIRAMA = 10, /* Subscript consonant combining character */ + Mymr_CC_PRE_VOWEL = 11, /* Dependent vowel, prebase (Vowel e) */ + Mymr_CC_BELOW_VOWEL = 12, /* Dependent vowel, prebase (Vowel u, uu) */ + Mymr_CC_ABOVE_VOWEL = 13, /* Dependent vowel, prebase (Vowel i, ii, ai) */ + Mymr_CC_POST_VOWEL = 14, /* Dependent vowel, prebase (Vowel aa) */ + Mymr_CC_SIGN_ABOVE = 15, + Mymr_CC_SIGN_BELOW = 16, + Mymr_CC_SIGN_AFTER = 17, + Mymr_CC_ZERO_WIDTH_J_MARK = 18, /* Zero width joiner character */ + Mymr_CC_COUNT = 19 /* This is the number of character classes */ +}; + +enum MymrCharClassFlags { + Mymr_CF_CLASS_MASK = 0x0000FFFF, + + Mymr_CF_CONSONANT = 0x01000000, /* flag to speed up comparing */ + Mymr_CF_MEDIAL = 0x02000000, /* flag to speed up comparing */ + Mymr_CF_IND_VOWEL = 0x04000000, /* flag to speed up comparing */ + Mymr_CF_DEP_VOWEL = 0x08000000, /* flag to speed up comparing */ + Mymr_CF_DOTTED_CIRCLE = 0x10000000, /* add a dotted circle if a character with this flag is the + first in a syllable */ + Mymr_CF_VIRAMA = 0x20000000, /* flag to speed up comparing */ + + /* position flags */ + Mymr_CF_POS_BEFORE = 0x00080000, + Mymr_CF_POS_BELOW = 0x00040000, + Mymr_CF_POS_ABOVE = 0x00020000, + Mymr_CF_POS_AFTER = 0x00010000, + Mymr_CF_POS_MASK = 0x000f0000, + + Mymr_CF_AFTER_KINZI = 0x00100000 +}; + +/* Characters that get refrered to by name */ +enum MymrChar +{ + Mymr_C_SIGN_ZWNJ = 0x200C, + Mymr_C_SIGN_ZWJ = 0x200D, + Mymr_C_DOTTED_CIRCLE = 0x25CC, + Mymr_C_RA = 0x101B, + Mymr_C_YA = 0x101A, + Mymr_C_NGA = 0x1004, + Mymr_C_VOWEL_E = 0x1031, + Mymr_C_VIRAMA = 0x1039 +}; + +enum +{ + Mymr_xx = Mymr_CC_RESERVED, + Mymr_c1 = Mymr_CC_CONSONANT | Mymr_CF_CONSONANT | Mymr_CF_POS_BELOW, + Mymr_c2 = Mymr_CC_CONSONANT2 | Mymr_CF_CONSONANT, + Mymr_ng = Mymr_CC_NGA | Mymr_CF_CONSONANT | Mymr_CF_POS_ABOVE, + Mymr_ya = Mymr_CC_YA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_AFTER | Mymr_CF_AFTER_KINZI, + Mymr_ra = Mymr_CC_RA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BEFORE, + Mymr_wa = Mymr_CC_WA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BELOW, + Mymr_ha = Mymr_CC_HA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BELOW, + Mymr_id = Mymr_CC_IND_VOWEL | Mymr_CF_IND_VOWEL, + Mymr_vi = Mymr_CC_VIRAMA | Mymr_CF_VIRAMA | Mymr_CF_POS_ABOVE | Mymr_CF_DOTTED_CIRCLE, + Mymr_dl = Mymr_CC_PRE_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_BEFORE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI, + Mymr_db = Mymr_CC_BELOW_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_BELOW | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI, + Mymr_da = Mymr_CC_ABOVE_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_ABOVE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI, + Mymr_dr = Mymr_CC_POST_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_AFTER | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI, + Mymr_sa = Mymr_CC_SIGN_ABOVE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_POS_ABOVE | Mymr_CF_AFTER_KINZI, + Mymr_sb = Mymr_CC_SIGN_BELOW | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_POS_BELOW | Mymr_CF_AFTER_KINZI, + Mymr_sp = Mymr_CC_SIGN_AFTER | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI +}; + + +typedef int MymrCharClass; + + +static const MymrCharClass mymrCharClasses[] = +{ + Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_ng, Mymr_c1, Mymr_c1, Mymr_c1, + Mymr_c1, Mymr_c1, Mymr_c2, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, /* 1000 - 100F */ + Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, + Mymr_c1, Mymr_c1, Mymr_ya, Mymr_ra, Mymr_c1, Mymr_wa, Mymr_c1, Mymr_ha, /* 1010 - 101F */ + Mymr_c2, Mymr_c2, Mymr_xx, Mymr_id, Mymr_id, Mymr_id, Mymr_id, Mymr_id, + Mymr_xx, Mymr_id, Mymr_id, Mymr_xx, Mymr_dr, Mymr_da, Mymr_da, Mymr_db, /* 1020 - 102F */ + Mymr_db, Mymr_dl, Mymr_da, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_sa, Mymr_sb, + Mymr_sp, Mymr_vi, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1030 - 103F */ + Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, + Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1040 - 104F */ + Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, + Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1050 - 105F */ +}; + +static MymrCharClass +getMyanmarCharClass (ushort ch) +{ + if (ch == Mymr_C_SIGN_ZWJ) + return Mymr_CC_ZERO_WIDTH_J_MARK; + + if (ch == Mymr_C_SIGN_ZWNJ) + return Mymr_CC_ZERO_WIDTH_NJ_MARK; + + if (ch < 0x1000 || ch > 0x105f) + return Mymr_CC_RESERVED; + + return mymrCharClasses[ch - 0x1000]; +} + +static const signed char mymrStateTable[][Mymr_CC_COUNT] = +{ +/* xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj */ + { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */ + {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */ + {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */ + {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */ + {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */ + {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */ + {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */ + {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */ +/* exit state -2 is for invalid order of medials and combination of invalids + with virama where virama should treat as start of next syllable + */ +}; + +/*#define MYANMAR_DEBUG */ +#ifdef MYANMAR_DEBUG +#define MMDEBUG qDebug +#else +# define MMDEBUG \ + if (0) \ + printf +#endif + +/* +// Given an input string of characters and a location in which to start looking +// calculate, using the state table, which one is the last character of the syllable +// that starts in the starting position. +*/ +static int myanmar_nextSyllableBoundary(const ushort *s, int start, int end, bool *invalid) +{ + const ushort *uc = s + start; + int state = 0; + int pos = start; + *invalid = false; + + while (pos < end) { + MymrCharClass charClass = getMyanmarCharClass(*uc); + state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK]; + if (pos == start) + *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE); + + MMDEBUG("state[%d]=%d class=%8x (uc=%4x)", pos - start, state, charClass, *uc); + + if (state < 0) { + if (state < -1) + --pos; + break; + } + ++uc; + ++pos; + } + return pos; +} + +static void myanmarAttributes(QChar::Script script, const ushort *text, uint from, uint len, QCharAttributes *attributes) +{ + int end = from + len; + const ushort *uc = text + from; + uint i = 0; + Q_UNUSED(script); + attributes += from; + while (i < len) { + bool invalid; + uint boundary = myanmar_nextSyllableBoundary(text, from+i, end, &invalid) - from; + + attributes[i].graphemeBoundary = true; + attributes[i].lineBreak = true; + + if (boundary > len-1) + boundary = len; + i++; + while (i < boundary) { + attributes[i].graphemeBoundary = false; + ++uc; + ++i; + } + assert(i == boundary); + } +} + +/* +// Vocabulary +// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the +// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels, +// split vowels, signs... but there is only one base in a syllable, it has to be coded as +// the first character of the syllable. +// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant). +// Khmer language has five of them. Khmer split vowels either have one part before the +// base and one after the base or they have a part before the base and a part above the base. +// The first part of all Khmer split vowels is the same character, identical to +// the glyph of Khmer dependent vowel SRA EI +// coeng --> modifier used in Khmer to construct coeng (subscript) consonants +// Differently than indian languages, the coeng modifies the consonant that follows it, +// not the one preceding it Each consonant has two forms, the base form and the subscript form +// the base form is the normal one (using the consonants code-point), the subscript form is +// displayed when the combination coeng + consonant is encountered. +// Consonant of type 1 -> A consonant which has subscript for that only occupies space under a base consonant +// Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO) +// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA) +// Consonant shifter -> Khmer has to series of consonants. The same dependent vowel has different sounds +// if it is attached to a consonant of the first series or a consonant of the second series +// Most consonants have an equivalent in the other series, but some of theme exist only in +// one series (for example SA). If we want to use the consonant SA with a vowel sound that +// can only be done with a vowel sound that corresponds to a vowel accompanying a consonant +// of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN +// x17C9 y x17CA. TRIISAP changes a first series consonant to second series sound and +// MUSIKATOAN a second series consonant to have a first series vowel sound. +// Consonant shifter are both normally supercript marks, but, when they are followed by a +// superscript, they change shape and take the form of subscript dependent vowel SRA U. +// If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they +// should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should +// be placed after the coeng consonant. +// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base +// Each vowel has its own position. Only one vowel per syllable is allowed. +// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are +// Allowed in a syllable. +// +// +// order is important here! This order must be the same that is found in each horizontal +// line in the statetable for Khmer (see khmerStateTable) . +*/ +enum KhmerCharClassValues { + CC_RESERVED = 0, + CC_CONSONANT = 1, /* Consonant of type 1 or independent vowel */ + CC_CONSONANT2 = 2, /* Consonant of type 2 */ + CC_CONSONANT3 = 3, /* Consonant of type 3 */ + CC_ZERO_WIDTH_NJ_MARK = 4, /* Zero Width non joiner character (0x200C) */ + CC_CONSONANT_SHIFTER = 5, + CC_ROBAT = 6, /* Khmer special diacritic accent -treated differently in state table */ + CC_COENG = 7, /* Subscript consonant combining character */ + CC_DEPENDENT_VOWEL = 8, + CC_SIGN_ABOVE = 9, + CC_SIGN_AFTER = 10, + CC_ZERO_WIDTH_J_MARK = 11, /* Zero width joiner character */ + CC_COUNT = 12 /* This is the number of character classes */ +}; + + +enum KhmerCharClassFlags { + CF_CLASS_MASK = 0x0000FFFF, + + CF_CONSONANT = 0x01000000, /* flag to speed up comparing */ + CF_SPLIT_VOWEL = 0x02000000, /* flag for a split vowel -> the first part is added in front of the syllable */ + CF_DOTTED_CIRCLE = 0x04000000, /* add a dotted circle if a character with this flag is the first in a syllable */ + CF_COENG = 0x08000000, /* flag to speed up comparing */ + CF_SHIFTER = 0x10000000, /* flag to speed up comparing */ + CF_ABOVE_VOWEL = 0x20000000, /* flag to speed up comparing */ + + /* position flags */ + CF_POS_BEFORE = 0x00080000, + CF_POS_BELOW = 0x00040000, + CF_POS_ABOVE = 0x00020000, + CF_POS_AFTER = 0x00010000, + CF_POS_MASK = 0x000f0000 +}; + + +/* Characters that get referred to by name */ +enum KhmerChar { + C_SIGN_ZWNJ = 0x200C, + C_SIGN_ZWJ = 0x200D, + C_RO = 0x179A, + C_VOWEL_AA = 0x17B6, + C_SIGN_NIKAHIT = 0x17C6, + C_VOWEL_E = 0x17C1, + C_COENG = 0x17D2 +}; + + +/* +// simple classes, they are used in the statetable (in this file) to control the length of a syllable +// they are also used to know where a character should be placed (location in reference to the base character) +// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to +// indicate error in syllable construction +*/ +enum { + _xx = CC_RESERVED, + _sa = CC_SIGN_ABOVE | CF_DOTTED_CIRCLE | CF_POS_ABOVE, + _sp = CC_SIGN_AFTER | CF_DOTTED_CIRCLE| CF_POS_AFTER, + _c1 = CC_CONSONANT | CF_CONSONANT, + _c2 = CC_CONSONANT2 | CF_CONSONANT, + _c3 = CC_CONSONANT3 | CF_CONSONANT, + _rb = CC_ROBAT | CF_POS_ABOVE | CF_DOTTED_CIRCLE, + _cs = CC_CONSONANT_SHIFTER | CF_DOTTED_CIRCLE | CF_SHIFTER, + _dl = CC_DEPENDENT_VOWEL | CF_POS_BEFORE | CF_DOTTED_CIRCLE, + _db = CC_DEPENDENT_VOWEL | CF_POS_BELOW | CF_DOTTED_CIRCLE, + _da = CC_DEPENDENT_VOWEL | CF_POS_ABOVE | CF_DOTTED_CIRCLE | CF_ABOVE_VOWEL, + _dr = CC_DEPENDENT_VOWEL | CF_POS_AFTER | CF_DOTTED_CIRCLE, + _co = CC_COENG | CF_COENG | CF_DOTTED_CIRCLE, + + /* split vowel */ + _va = _da | CF_SPLIT_VOWEL, + _vr = _dr | CF_SPLIT_VOWEL +}; + + +/* +// Character class: a character class value +// ORed with character class flags. +*/ +typedef unsigned long KhmerCharClass; + + +/* +// Character class tables +// _xx character does not combine into syllable, such as numbers, puntuation marks, non-Khmer signs... +// _sa Sign placed above the base +// _sp Sign placed after the base +// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants) +// _c2 Consonant of type 2 (only RO) +// _c3 Consonant of type 3 +// _rb Khmer sign robat u17CC. combining mark for subscript consonants +// _cd Consonant-shifter +// _dl Dependent vowel placed before the base (left of the base) +// _db Dependent vowel placed below the base +// _da Dependent vowel placed above the base +// _dr Dependent vowel placed behind the base (right of the base) +// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following +// it to create a subscript consonant or independent vowel +// _va Khmer split vowel in which the first part is before the base and the second one above the base +// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base +*/ +static const KhmerCharClass khmerCharClasses[] = { + _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */ + _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */ + _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */ + _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */ + _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */ + _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */ +}; + +/* this enum must reflect the range of khmerCharClasses */ +enum KhmerCharClassesRange { + KhmerFirstChar = 0x1780, + KhmerLastChar = 0x17df +}; + +/* +// Below we define how a character in the input string is either in the khmerCharClasses table +// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear +// within the syllable, but are not in the table) we also get their type back, or an unknown object +// in which case we get _xx (CC_RESERVED) back +*/ +static KhmerCharClass getKhmerCharClass(ushort uc) +{ + if (uc == C_SIGN_ZWJ) { + return CC_ZERO_WIDTH_J_MARK; + } + + if (uc == C_SIGN_ZWNJ) { + return CC_ZERO_WIDTH_NJ_MARK; + } + + if (uc < KhmerFirstChar || uc > KhmerLastChar) { + return CC_RESERVED; + } + + return khmerCharClasses[uc - KhmerFirstChar]; +} + + +/* +// The stateTable is used to calculate the end (the length) of a well +// formed Khmer Syllable. +// +// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable +// CharClassValues. This coincidence of values allows the follow up of the table. +// +// Each line corresponds to a state, which does not necessarily need to be a type +// of component... for example, state 2 is a base, with is always a first character +// in the syllable, but the state could be produced a consonant of any type when +// it is the first character that is analysed (in ground state). +// +// Differentiating 3 types of consonants is necessary in order to +// forbid the use of certain combinations, such as having a second +// coeng after a coeng RO, +// The inexistent possibility of having a type 3 after another type 3 is permitted, +// eliminating it would very much complicate the table, and it does not create typing +// problems, as the case above. +// +// The table is quite complex, in order to limit the number of coeng consonants +// to 2 (by means of the table). +// +// There a peculiarity, as far as Unicode is concerned: +// - The consonant-shifter is considered in two possible different +// locations, the one considered in Unicode 3.0 and the one considered in +// Unicode 4.0. (there is a backwards compatibility problem in this standard). +// +// +// xx independent character, such as a number, punctuation sign or non-khmer char +// +// c1 Khmer consonant of type 1 or an independent vowel +// that is, a letter in which the subscript for is only under the +// base, not taking any space to the right or to the left +// +// c2 Khmer consonant of type 2, the coeng form takes space under +// and to the left of the base (only RO is of this type) +// +// c3 Khmer consonant of type 3. Its subscript form takes space under +// and to the right of the base. +// +// cs Khmer consonant shifter +// +// rb Khmer robat +// +// co coeng character (u17D2) +// +// dv dependent vowel (including split vowels, they are treated in the same way). +// even if dv is not defined above, the component that is really tested for is +// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels +// +// zwj Zero Width joiner +// +// zwnj Zero width non joiner +// +// sa above sign +// +// sp post sign +// +// there are lines with equal content but for an easier understanding +// (and maybe change in the future) we did not join them +*/ +static const signed char khmerStateTable[][CC_COUNT] = +{ + /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */ + { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */ + {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */ + {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter It can only be followed by a shifter or a vowel */ + {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */ + {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */ + {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */ + {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */ + {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */ + {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant or type 3 after ceong */ + {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */ + {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */ + {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */ + {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */ + {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */ + {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */ + {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */ + {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */ + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */ +}; + + +/* #define KHMER_DEBUG */ +#ifdef KHMER_DEBUG +#define KHDEBUG qDebug +#else +# define KHDEBUG \ + if (0) \ + printf +#endif + +/* +// Given an input string of characters and a location in which to start looking +// calculate, using the state table, which one is the last character of the syllable +// that starts in the starting position. +*/ +static int khmer_nextSyllableBoundary(const ushort *s, int start, int end, bool *invalid) +{ + const ushort *uc = s + start; + int state = 0; + int pos = start; + *invalid = false; + + while (pos < end) { + KhmerCharClass charClass = getKhmerCharClass(*uc); + if (pos == start) { + *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT); + } + state = khmerStateTable[state][charClass & CF_CLASS_MASK]; + + KHDEBUG("state[%d]=%d class=%8lx (uc=%4x)", pos - start, state, + charClass, *uc ); + + if (state < 0) { + break; + } + ++uc; + ++pos; + } + return pos; +} + +static void khmerAttributes(QChar::Script script, const ushort *text, uint from, uint len, QCharAttributes *attributes) +{ + int end = from + len; + const ushort *uc = text + from; + uint i = 0; + Q_UNUSED(script); + attributes += from; + while ( i < len ) { + bool invalid; + uint boundary = khmer_nextSyllableBoundary( text, from+i, end, &invalid ) - from; + + attributes[i].graphemeBoundary = true; + + if ( boundary > len-1 ) boundary = len; + i++; + while ( i < boundary ) { + attributes[i].graphemeBoundary = false; + ++uc; + ++i; + } + assert( i == boundary ); + } +} + + +const CharAttributeFunction charAttributeFunction[] = { +// Script_Unknown, + nullptr, +// Script_Inherited, + nullptr, +// Script_Common, + nullptr, +// Script_Latin, + nullptr, +// Script_Greek, + nullptr, +// Script_Cyrillic, + nullptr, +// Script_Armenian, + nullptr, +// Script_Hebrew, + nullptr, +// Script_Arabic, + nullptr, +// Script_Syriac, + nullptr, +// Script_Thaana, + nullptr, +// Script_Devanagari, + indicAttributes, +// Script_Bengali, + indicAttributes, +// Script_Gurmukhi, + indicAttributes, +// Script_Gujarati, + indicAttributes, +// Script_Oriya, + indicAttributes, +// Script_Tamil, + indicAttributes, +// Script_Telugu, + indicAttributes, +// Script_Kannada, + indicAttributes, +// Script_Malayalam, + indicAttributes, +// Script_Sinhala, + indicAttributes, +// Script_Thai, + thaiAttributes, +// Script_Lao, + nullptr, +// Script_Tibetan, + tibetanAttributes, +// Script_Myanmar, + myanmarAttributes, +// Script_Georgian, + nullptr, +// Script_Hangul, + nullptr, +// Script_Ethiopic, + nullptr, +// Script_Cherokee, + nullptr, +// Script_CanadianAboriginal, + nullptr, +// Script_Ogham, + nullptr, +// Script_Runic, + nullptr, +// Script_Khmer, + khmerAttributes +}; + +static void getCharAttributes(const ushort *string, uint stringLength, + const QUnicodeTools::ScriptItem *items, uint numItems, + QCharAttributes *attributes) +{ + if (stringLength == 0) + return; + for (uint i = 0; i < numItems; ++i) { + QChar::Script script = items[i].script; + if (script > QChar::Script_Khmer) + script = QChar::Script_Common; + CharAttributeFunction attributeFunction = charAttributeFunction[script]; + if (!attributeFunction) + continue; + int end = i < numItems - 1 ? items[i + 1].position : stringLength; + attributeFunction(script, string, items[i].position, end - items[i].position, attributes); + } +} + +} Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length, const ScriptItem *items, int numItems, @@ -750,38 +2236,7 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length, if (!items || numItems <= 0) return; - QVarLengthArray<HB_ScriptItem, 64> scriptItems; - scriptItems.reserve(numItems); - int start = 0; - HB_Script startScript = script_to_hbscript(items[start].script); - if (Q_UNLIKELY(startScript == HB_Script_Inherited)) - startScript = HB_Script_Common; - for (int i = start + 1; i < numItems; ++i) { - HB_Script script = script_to_hbscript(items[i].script); - if (Q_LIKELY(script == startScript || script == HB_Script_Inherited)) - continue; - Q_ASSERT(items[i].position > items[start].position); - HB_ScriptItem item; - item.pos = items[start].position; - item.length = items[i].position - items[start].position; - item.script = startScript; - item.bidiLevel = 0; // unused - scriptItems.append(item); - start = i; - startScript = script; - } - if (items[start].position + 1 < length) { - HB_ScriptItem item; - item.pos = items[start].position; - item.length = length - items[start].position; - item.script = startScript; - item.bidiLevel = 0; // unused - scriptItems.append(item); - } - Q_STATIC_ASSERT(sizeof(QCharAttributes) == sizeof(HB_CharAttributes)); - HB_GetTailoredCharAttributes(string, length, - scriptItems.constData(), scriptItems.size(), - reinterpret_cast<HB_CharAttributes *>(attributes)); + Tailored::getCharAttributes(string, length, items, numItems, attributes); } } @@ -796,7 +2251,7 @@ Q_CORE_EXPORT void initScripts(const ushort *string, int length, ScriptItemArray { int sor = 0; int eor = 0; - uchar script = QChar::Script_Common; + QChar::Script script = QChar::Script_Common; for (int i = 0; i < length; ++i, eor = i) { uint ucs4 = string[i]; @@ -810,7 +2265,7 @@ Q_CORE_EXPORT void initScripts(const ushort *string, int length, ScriptItemArray const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4); - uchar nscript = prop->script; + QChar::Script nscript = QChar::Script(prop->script); if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common)) continue; diff --git a/src/corelib/text/qunicodetools_p.h b/src/corelib/text/qunicodetools_p.h index 6294d9ceb4..5715444025 100644 --- a/src/corelib/text/qunicodetools_p.h +++ b/src/corelib/text/qunicodetools_p.h @@ -1,6 +1,6 @@ /**************************************************************************** ** -** Copyright (C) 2016 The Qt Company Ltd. +** Copyright (C) 2020 The Qt Company Ltd. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of the QtCore module of the Qt Toolkit. @@ -75,7 +75,7 @@ namespace QUnicodeTools { struct ScriptItem { int position; - int script; + QChar::Script script; }; using ScriptItemArray = QVarLengthArray<ScriptItem, 64>; diff --git a/src/gui/text/qtextengine.cpp b/src/gui/text/qtextengine.cpp index 2deae6f4ba..fce3e519d4 100644 --- a/src/gui/text/qtextengine.cpp +++ b/src/gui/text/qtextengine.cpp @@ -1985,7 +1985,7 @@ const QCharAttributes *QTextEngine::attributes() const for (int i = 0; i < layoutData->items.size(); ++i) { const QScriptItem &si = layoutData->items.at(i); scriptItems[i].position = si.position; - scriptItems[i].script = si.analysis.script; + scriptItems[i].script = QChar::Script(si.analysis.script); } QUnicodeTools::initCharAttributes(reinterpret_cast<const ushort *>(layoutData->string.constData()), |