From edfce46a6c0406af749ca7ef659df6315e36cd5d Mon Sep 17 00:00:00 2001 From: Konstantin Ritt Date: Sun, 12 Jan 2014 21:14:25 +0200 Subject: Update the Unicode Data and Algorithms up to Unicode 6.3.0 * Mongolian and Phags-pa characters have been given a Joining_Type classification for contextual shaping. As a part of these additions, one Phags-pa character has the Joining_Type value of L (Left Joining), which no character had been assigned before. * The unassigned code points in the Currency Symbols block have been given the Bidi_Class property value ET and the Line_Break property value PR, to help implementations support new currency symbols, when they are encoded. * Hebrew letters and basic punctuation marks have been assigned the newly introduced Word_Break property values Hebrew_Letter, Single_Quote, and Double_Quote. * The Bidi_Class property has been extended with four new values for directional isolates. For more details, see http://www.unicode.org/versions/Unicode6.3.0/ Change-Id: Iad62d02edc58a8497898dcd6d6c70d5aece317ea Reviewed-by: Lars Knoll --- src/corelib/tools/qchar.cpp | 7 +++- src/corelib/tools/qchar.h | 6 ++- src/corelib/tools/qunicodetools.cpp | 80 +++++++++++++++++++++++-------------- src/gui/text/qtextengine.cpp | 16 ++++---- 4 files changed, 68 insertions(+), 41 deletions(-) (limited to 'src') diff --git a/src/corelib/tools/qchar.cpp b/src/corelib/tools/qchar.cpp index 4ed0cd5eea..f7f425d594 100644 --- a/src/corelib/tools/qchar.cpp +++ b/src/corelib/tools/qchar.cpp @@ -185,8 +185,9 @@ QT_BEGIN_NAMESPACE \value Unicode_6_0 Version 6.0 \value Unicode_6_1 Version 6.1 \value Unicode_6_2 Version 6.2 + \value Unicode_6_3 Version 6.3 Since Qt 5.3 \value Unicode_Unassigned The value is not assigned to any character - in version 6.2 of Unicode. + in version 6.3 of Unicode. 
\sa unicodeVersion(), currentUnicodeVersion() */ @@ -408,14 +409,18 @@ QT_BEGIN_NAMESPACE \value DirEN \value DirES \value DirET + \value DirFSI Since Qt 5.3 \value DirL \value DirLRE + \value DirLRI Since Qt 5.3 \value DirLRO \value DirNSM \value DirON \value DirPDF + \value DirPDI Since Qt 5.3 \value DirR \value DirRLE + \value DirRLI Since Qt 5.3 \value DirRLO \value DirS \value DirWS diff --git a/src/corelib/tools/qchar.h b/src/corelib/tools/qchar.h index 8afa05bb00..82ff337341 100644 --- a/src/corelib/tools/qchar.h +++ b/src/corelib/tools/qchar.h @@ -262,7 +262,8 @@ public: enum Direction { DirL, DirR, DirEN, DirES, DirET, DirAN, DirCS, DirB, DirS, DirWS, DirON, - DirLRE, DirLRO, DirAL, DirRLE, DirRLO, DirPDF, DirNSM, DirBN + DirLRE, DirLRO, DirAL, DirRLE, DirRLO, DirPDF, DirNSM, DirBN, + DirLRI, DirRLI, DirFSI, DirPDI }; enum Decomposition @@ -332,7 +333,8 @@ public: Unicode_5_2, Unicode_6_0, Unicode_6_1, - Unicode_6_2 + Unicode_6_2, + Unicode_6_3 }; // ****** WHEN ADDING FUNCTIONS, CONSIDER ADDING TO QCharRef TOO diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp index b3e55a5abc..fac795051a 100644 --- a/src/corelib/tools/qunicodetools.cpp +++ b/src/corelib/tools/qunicodetools.cpp @@ -57,7 +57,7 @@ namespace QUnicodeTools { // ----------------------------------------------------------------------------------------------------- // // The text boundaries determination algorithm. 
-// See http://www.unicode.org/reports/tr29/tr29-21.html +// See http://www.unicode.org/reports/tr29/tr29-23.html // // ----------------------------------------------------------------------------------------------------- @@ -112,26 +112,30 @@ static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes namespace WB { enum Action { - NoBreak = 0, - Break = 1, - Lookup = 2 + NoBreak, + Break, + Lookup, + LookupW }; static const uchar breakTable[QUnicodeTables::WordBreak_ExtendNumLet + 1][QUnicodeTables::WordBreak_ExtendNumLet + 1] = { -// Other CR LF Newline Extend RI Katakana ALetter MidNumLet MidLetter MidNum Numeric ExtendNumLet - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // Other - { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR - { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF - { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // Extend - { Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator - { Break , Break , Break , Break , NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , NoBreak }, // Katakana - { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, Lookup , Lookup , Break , NoBreak, NoBreak }, // ALetter - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum 
- { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric - { Break , Break , Break , Break , NoBreak, Break , NoBreak, NoBreak, Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet +// Other CR LF Newline Extend RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtendNumLet + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Other + { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend + { Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator + { Break , Break , Break , Break , NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // Katakana + { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak }, // HebrewLetter + { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak }, // ALetter + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // 
DoubleQuote + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum + { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric + { Break , Break , Break , Break , NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet }; } // namespace WB @@ -160,8 +164,8 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at if (qt_initcharattributes_default_algorithm_only) { // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet // which caused "hi.there" to be treated like if it were just a single word; - // by remapping those characters in the Unicode tables generator. - // this code is needed to pass the coverage tests; remove once the issue is fixed. + // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator + // and this code is needed to pass the coverage tests; remove once the issue is fixed. 
if (ucs4 == 0x002E) // FULL STOP ncls = QUnicodeTables::WordBreak_MidNumLet; else if (ucs4 == 0x003A) // COLON @@ -170,8 +174,17 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at #endif uchar action = WB::breakTable[cls][ncls]; - if (Q_UNLIKELY(action == WB::Lookup)) { - action = WB::Break; + switch (action) { + case WB::Break: + break; + case WB::NoBreak: + if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend)) { + // WB4: X(Extend|Format)* -> X + continue; + } + break; + case WB::Lookup: + case WB::LookupW: for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) { ucs4 = string[lookahead]; if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) { @@ -184,20 +197,28 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at prop = QUnicodeTables::properties(ucs4); QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass; - if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend)) + + if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend)) { + // WB4: X(Extend|Format)* -> X continue; - if (Q_LIKELY(tcls == cls)) { + } + + if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter + || tcls == QUnicodeTables::WordBreak_ALetter)))) { i = lookahead; ncls = tcls; action = WB::NoBreak; } break; } - } else if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend)) { - // WB4: X(Extend|Format)* -> X - if (Q_LIKELY(action != WB::Break)) - continue; + if (action != WB::NoBreak) { + action = WB::Break; + if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter)) + action = WB::NoBreak; // WB7a + } + break; } + cls = ncls; if (action == WB::Break) { attributes[pos].wordBreak = true; @@ -208,6 +229,7 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at currentWordType = WordTypeHiraganaKatakana; attributes[pos].wordStart = true; break; + case 
QUnicodeTables::WordBreak_HebrewLetter: case QUnicodeTables::WordBreak_ALetter: case QUnicodeTables::WordBreak_Numeric: currentWordType = WordTypeAlphaNumeric; @@ -327,7 +349,7 @@ static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes // ----------------------------------------------------------------------------------------------------- // // The line breaking algorithm. -// See http://www.unicode.org/reports/tr14/tr14-30.html +// See http://www.unicode.org/reports/tr14/tr14-32.html // // ----------------------------------------------------------------------------------------------------- diff --git a/src/gui/text/qtextengine.cpp b/src/gui/text/qtextengine.cpp index 06c5e24920..109b7e600f 100644 --- a/src/gui/text/qtextengine.cpp +++ b/src/gui/text/qtextengine.cpp @@ -241,7 +241,8 @@ using namespace std; static const char *directions[] = { "DirL", "DirR", "DirEN", "DirES", "DirET", "DirAN", "DirCS", "DirB", "DirS", "DirWS", "DirON", - "DirLRE", "DirLRO", "DirAL", "DirRLE", "DirRLO", "DirPDF", "DirNSM", "DirBN" + "DirLRE", "DirLRO", "DirAL", "DirRLE", "DirRLO", "DirPDF", "DirNSM", "DirBN", + "DirLRI", "DirRLI", "DirFSI", "DirPDI" }; #endif @@ -2536,7 +2537,8 @@ static inline bool nextCharJoins(const QString &string, int pos) ++pos; if (pos == string.length()) return false; - return string.at(pos).joining() != QChar::OtherJoining; + // ### U+A872 has joining type L + return string.at(pos) == QChar(0xA872) || string.at(pos).joining() != QChar::OtherJoining; } static inline bool prevCharJoins(const QString &string, int pos) @@ -2551,13 +2553,9 @@ static inline bool prevCharJoins(const QString &string, int pos) static inline bool isRetainableControlCode(QChar c) { - return (c.unicode() == 0x202a // LRE - || c.unicode() == 0x202b // LRE - || c.unicode() == 0x202c // PDF - || c.unicode() == 0x202d // LRO - || c.unicode() == 0x202e // RLO - || c.unicode() == 0x200e // LRM - || c.unicode() == 0x200f); // RLM + return (c.unicode() >= 0x202a && 
c.unicode() <= 0x202e) // LRE, RLE, PDF, LRO, RLO + || (c.unicode() >= 0x200e && c.unicode() <= 0x200f) // LRM, RLM + || (c.unicode() >= 0x2066 && c.unicode() <= 0x2069); // LRI, RLI, FSI, PDI } static QString stringMidRetainingBidiCC(const QString &string, -- cgit v1.2.3