From edfce46a6c0406af749ca7ef659df6315e36cd5d Mon Sep 17 00:00:00 2001 From: Konstantin Ritt Date: Sun, 12 Jan 2014 21:14:25 +0200 Subject: Update the Unicode Data and Algorithms up to Unicode 6.3.0 * Mongolian and Phags-pa characters have been given a Joining_Type classification for contextual shaping. As a part of these additions, one Phags-pa character has the Joining_Type value of L (Left Joining), which no character had been assigned before. * The unassigned code points in the Currency Symbols block have been given the Bidi_Class property value ET and the Line_Break property value PR, to help implementations support new currency symbols, when they are encoded. * Hebrew letters and basic punctuation marks have been assigned the newly introduced Word_Break property values Hebrew_Letter, Single_Quote, and Double_Quote. * The Bidi_Class property has been extended with four new values for directional isolates. For more details, see http://www.unicode.org/versions/Unicode6.3.0/ Change-Id: Iad62d02edc58a8497898dcd6d6c70d5aece317ea Reviewed-by: Lars Knoll --- src/corelib/tools/qunicodetools.cpp | 80 +++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 29 deletions(-) (limited to 'src/corelib/tools/qunicodetools.cpp') diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp index b3e55a5abc..fac795051a 100644 --- a/src/corelib/tools/qunicodetools.cpp +++ b/src/corelib/tools/qunicodetools.cpp @@ -57,7 +57,7 @@ namespace QUnicodeTools { // ----------------------------------------------------------------------------------------------------- // // The text boundaries determination algorithm. -// See http://www.unicode.org/reports/tr29/tr29-21.html +// See http://www.unicode.org/reports/tr29/tr29-23.html // // ----------------------------------------------------------------------------------------------------- @@ -112,26 +112,30 @@ static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes namespace WB { enum Action { - NoBreak = 0, - Break = 1, - Lookup = 2 + NoBreak, + Break, + Lookup, + LookupW }; static const uchar breakTable[QUnicodeTables::WordBreak_ExtendNumLet + 1][QUnicodeTables::WordBreak_ExtendNumLet + 1] = { -// Other CR LF Newline Extend RI Katakana ALetter MidNumLet MidLetter MidNum Numeric ExtendNumLet - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // Other - { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR - { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF - { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // Extend - { Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator - { Break , Break , Break , Break , NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , NoBreak }, // Katakana - { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, Lookup , Lookup , Break , NoBreak, NoBreak }, // ALetter - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum - { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric - { Break , Break , Break , Break , NoBreak, Break , NoBreak, NoBreak, Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet +// Other CR LF Newline Extend RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtendNumLet + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Other + { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend + { Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator + { Break , Break , Break , Break , NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // Katakana + { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak }, // HebrewLetter + { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak }, // ALetter + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum + { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric + { Break , Break , Break , Break , NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet }; } // namespace WB @@ -160,8 +164,8 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at if (qt_initcharattributes_default_algorithm_only) { // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet // which caused "hi.there" to be treated like if it were just a single word; - // by remapping those characters in the Unicode tables generator. - // this code is needed to pass the coverage tests; remove once the issue is fixed. + // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator + // and this code is needed to pass the coverage tests; remove once the issue is fixed. if (ucs4 == 0x002E) // FULL STOP ncls = QUnicodeTables::WordBreak_MidNumLet; else if (ucs4 == 0x003A) // COLON @@ -170,8 +174,17 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at #endif uchar action = WB::breakTable[cls][ncls]; - if (Q_UNLIKELY(action == WB::Lookup)) { - action = WB::Break; + switch (action) { + case WB::Break: + break; + case WB::NoBreak: + if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend)) { + // WB4: X(Extend|Format)* -> X + continue; + } + break; + case WB::Lookup: + case WB::LookupW: for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) { ucs4 = string[lookahead]; if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) { @@ -184,20 +197,28 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at prop = QUnicodeTables::properties(ucs4); QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass; - if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend)) + + if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend)) { + // WB4: X(Extend|Format)* -> X continue; - if (Q_LIKELY(tcls == cls)) { + } + + if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter + || tcls == QUnicodeTables::WordBreak_ALetter)))) { i = lookahead; ncls = tcls; action = WB::NoBreak; } break; } - } else if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend)) { - // WB4: X(Extend|Format)* -> X - if (Q_LIKELY(action != WB::Break)) - continue; + if (action != WB::NoBreak) { + action = WB::Break; + if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter)) + action = WB::NoBreak; // WB7a + } + break; } + cls = ncls; if (action == WB::Break) { attributes[pos].wordBreak = true; @@ -208,6 +229,7 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at currentWordType = WordTypeHiraganaKatakana; attributes[pos].wordStart = true; break; + case QUnicodeTables::WordBreak_HebrewLetter: case QUnicodeTables::WordBreak_ALetter: case QUnicodeTables::WordBreak_Numeric: currentWordType = WordTypeAlphaNumeric; @@ -327,7 +349,7 @@ static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes // ----------------------------------------------------------------------------------------------------- // // The line breaking algorithm. -// See http://www.unicode.org/reports/tr14/tr14-30.html +// See http://www.unicode.org/reports/tr14/tr14-32.html // // ----------------------------------------------------------------------------------------------------- -- cgit v1.2.3