diff options
Diffstat (limited to 'src/corelib/tools/qunicodetools.cpp')
-rw-r--r-- | src/corelib/tools/qunicodetools.cpp | 266 |
1 files changed, 175 insertions, 91 deletions
diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp index ac19d6b6d1..c9d0868fef 100644 --- a/src/corelib/tools/qunicodetools.cpp +++ b/src/corelib/tools/qunicodetools.cpp @@ -55,27 +55,45 @@ namespace QUnicodeTools { // ----------------------------------------------------------------------------------------------------- // // The text boundaries determination algorithm. -// See http://www.unicode.org/reports/tr29/tr29-27.html +// See http://www.unicode.org/reports/tr29/tr29-31.html // // ----------------------------------------------------------------------------------------------------- namespace GB { -static const uchar breakTable[QUnicodeTables::GraphemeBreak_LVT + 1][QUnicodeTables::GraphemeBreak_LVT + 1] = { -// Other CR LF Control Extend RI Prepend S-Mark L V T LV LVT - { true , true , true , true , false, true , true , false, true , true , true , true , true }, // Other - { true , true , false, true , true , true , true , true , true , true , true , true , true }, // CR - { true , true , true , true , true , true , true , true , true , true , true , true , true }, // LF - { true , true , true , true , true , true , true , true , true , true , true , true , true }, // Control - { true , true , true , true , false, true , true , false, true , true , true , true , true }, // Extend - { true , true , true , true , false, false, true , false, true , true , true , true , true }, // RegionalIndicator - { false, true , true , true , false, false, false, false, false, false, false, false, false }, // Prepend - { true , true , true , true , false, true , true , false, true , true , true , true , true }, // SpacingMark - { true , true , true , true , false, true , true , false, false, false, true , false, false }, // L - { true , true , true , true , false, true , true , false, true , false, false, true , true }, // V - { true , true , true , true , false, true , true , false, true , true , false, true , true }, // T - { true , true , true , true , false, true , true , false, true , false, false, true , true }, // LV - { true , true , true , true , false, true , true , false, true , true , false, true , true }, // LVT +/* + * Most grapheme break rules can be implemented table driven, but rules GB10, GB12 and GB13 need a bit + * of special treatment. + */ +enum State : uchar { + Break, + Inside, + GB10, + GB10_2, + GB10_3, + GB13, // also covers GB12 +}; + +static const State breakTable[QUnicodeTables::NumGraphemeBreakClasses][QUnicodeTables::NumGraphemeBreakClasses] = { +// Any CR LF Control Extend ZWJ RI Prepend S-Mark L V T LV LVT E_B E_M GAZ EBG + { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any + { Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Control + { Break , Break , Break , Break , GB10_2, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , GB10_3, Break , Break }, // Extend + { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Inside, Inside }, // ZWJ + { Break , Break , Break , Break , Inside, Inside, GB13 , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator + { Inside, Break , Break , Break , Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside }, // Prepend + { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SpacingMark + { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Inside, Inside, Break , Inside, Inside, Break , Break , Break , Break }, // L + { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Inside, Inside, Break , Break , Break , Break , Break , Break }, // V + { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break }, // T + { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Inside, Inside, Break , Break , Break , Break , Break , Break }, // LV + { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break }, // LVT + { Break , Break , Break , Break , GB10 , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Inside, Break , Break }, // E_B + { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // E_M + { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // GAZ + { Break , Break , Break , Break , GB10 , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Inside, Break , Break }, // EBG }; } // namespace GB @@ -83,6 +101,7 @@ static const uchar breakTable[QUnicodeTables::GraphemeBreak_LVT + 1][QUnicodeTab static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes *attributes) { QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1 + GB::State state = GB::Break; // only required to track some of the rules for (quint32 i = 0; i != len; ++i) { quint32 pos = i; uint ucs4 = string[i]; @@ -97,8 +116,36 @@ static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4); QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass; - if (Q_LIKELY(GB::breakTable[lcls][cls])) + switch (GB::breakTable[lcls][cls]) { + case GB::Break: attributes[pos].graphemeBoundary = true; + state = GB::Break; + break; + case GB::Inside: + state = GB::Break; + break; + case GB::GB10: + state = GB::GB10; + break; + case GB::GB10_2: + if (state == GB::GB10 || state == GB::GB10_2) + state = GB::GB10_2; + else + state = GB::Break; + break; + case GB::GB10_3: + if (state != GB::GB10 && state != GB::GB10_2) + attributes[pos].graphemeBoundary = true; + state = GB::Break; + break; + case GB::GB13: + if (state != GB::GB13) { + state = GB::GB13; + } else { + attributes[pos].graphemeBoundary = true; + state = GB::Break; + } + } lcls = cls; } @@ -116,24 +163,30 @@ enum Action { LookupW }; -static const uchar breakTable[QUnicodeTables::WordBreak_ExtendNumLet + 1][QUnicodeTables::WordBreak_ExtendNumLet + 1] = { -// Other CR LF Newline Extend RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtendNumLet - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Other - { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR - { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF - { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend - { Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator - { Break , Break , Break , Break , NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // Katakana - { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak }, // HebrewLetter - { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak }, // ALetter - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter - { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum - { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric - { Break , Break , Break , Break , NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet +static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = { +// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet E_Base E_Mod GAZ EBG + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any + { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, NoBreak }, // ZWJ + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break , Break }, // Katakana + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break }, // HebrewLetter + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break }, // ALetter + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break , Break , Break , Break }, // Numeric + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break }, // ExtendNumLet + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break }, // E_Base + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // E_Mod + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // GAZ + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break }, // EBG }; } // namespace WB @@ -176,9 +229,14 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at case WB::Break: break; case WB::NoBreak: - if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend)) { + if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend || ncls == QUnicodeTables::WordBreak_ZWJ || ncls == QUnicodeTables::WordBreak_Format)) { // WB4: X(Extend|Format)* -> X - continue; + if (cls != QUnicodeTables::WordBreak_ZWJ) // WB3c + continue; + } + if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) { + // WB15/WB16: break between pairs of Regional indicator + ncls = QUnicodeTables::WordBreak_Any; } break; case WB::Lookup: @@ -196,7 +254,7 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at prop = QUnicodeTables::properties(ucs4); QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass; - if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend)) { + if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend || tcls == QUnicodeTables::WordBreak_ZWJ || tcls == QUnicodeTables::WordBreak_Format)) { // WB4: X(Extend|Format)* -> X continue; } @@ -265,8 +323,8 @@ enum State { Lookup }; -static const uchar breakTable[BAfter + 1][QUnicodeTables::SentenceBreak_Close + 1] = { -// Other CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close +static const uchar breakTable[BAfter + 1][QUnicodeTables::NumSentenceBreakClasses] = { +// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close { Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower , Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial { Initial, BAfterC, BAfter , BAfter , Lower , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm , Initial }, // Lower { Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, LUATerm, STerm , STerm , Initial }, // Upper @@ -319,7 +377,7 @@ static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes prop = QUnicodeTables::properties(ucs4); QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass; switch (tcls) { - case QUnicodeTables::SentenceBreak_Other: + case QUnicodeTables::SentenceBreak_Any: case QUnicodeTables::SentenceBreak_Extend: case QUnicodeTables::SentenceBreak_Sp: case QUnicodeTables::SentenceBreak_Numeric: @@ -349,7 +407,7 @@ static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes // ----------------------------------------------------------------------------------------------------- // // The line breaking algorithm. -// See http://www.unicode.org/reports/tr14/tr14-35.html +// See http://www.unicode.org/reports/tr14/tr14-39.html // // ----------------------------------------------------------------------------------------------------- @@ -441,38 +499,41 @@ enum Action { ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen }; -static const uchar breakTable[QUnicodeTables::LineBreak_CB + 1][QUnicodeTables::LineBreak_CB + 1] = { -/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB */ -/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB }, -/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB }, -/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB }, -/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB }, -/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB }, -/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB }, -/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB }, -/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }, -/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB }, -/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB }, -/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB }, -/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB }, -/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB }, -/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB }, -/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB }, -/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB } +static const uchar breakTable[QUnicodeTables::LineBreak_SA][QUnicodeTables::LineBreak_SA] = { +/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM ZWJ*/ +/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB }, +/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, +/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, +/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, IB }, +/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB }, +/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB }, +/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB }, +/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, +/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB }, +/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB }, +/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, IB }, +/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB }, +/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB }, +/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, IB }, +/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* EB */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB }, +/* EM */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* ZWJ*/ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, IB } }; // The following line break classes are not treated by the pair table @@ -501,6 +562,7 @@ static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *at const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4); QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass; + QUnicodeTables::LineBreakClass tcls; if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) { // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM @@ -508,14 +570,39 @@ static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *at if (FLAG(prop->category) & test) ncls = QUnicodeTables::LineBreak_CM; } - if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM)) { - // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL - if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP) - ncls = QUnicodeTables::LineBreak_AL; + + if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) { + // LB4: BK!, LB5: (CRxLF|CR|LF|NL)! + if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF) + attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true; + if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) { + cls = QUnicodeTables::LineBreak_AL; + goto next_no_cls_update; + } + goto next; } - if (Q_LIKELY(ncls != QUnicodeTables::LineBreak_CM)) { - // LB25: do not break lines inside numbers + if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) { + if (ncls > QUnicodeTables::LineBreak_SP) + goto next; // LB6: x(BK|CR|LF|NL) + goto next_no_cls_update; // LB7: xSP + } + + if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) { + // LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X + if (lcls != QUnicodeTables::LineBreak_ZW && lcls < QUnicodeTables::LineBreak_SP) + // don't update anything + goto next_no_cls_update; + } + + if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) { + // LB8a: ZWJ x (ID | EB | EM) + if (ncls == QUnicodeTables::LineBreak_ID || ncls == QUnicodeTables::LineBreak_EB || ncls == QUnicodeTables::LineBreak_EM) + goto next; + } + + // LB25: do not break lines inside numbers + { LB::NS::Class necur = LB::NS::toClass(ncls, (QChar::Category)prop->category); switch (LB::NS::actionTable[nelast][necur]) { case LB::NS::Break: @@ -535,25 +622,22 @@ static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *at } } - if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) { - // LB4: BK!, LB5: (CRxLF|CR|LF|NL)! - if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF) - attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true; + if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) { + // LB30a + ncls = QUnicodeTables::LineBreak_SP; goto next; } - if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) { - if (ncls > QUnicodeTables::LineBreak_SP) - goto next; // LB6: x(BK|CR|LF|NL) - goto next_no_cls_update; // LB7: xSP - } - // for South East Asian chars that require a complex analysis, the Unicode // standard recommends to treat them as AL. tailoring that do dictionary analysis can override if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA)) cls = QUnicodeTables::LineBreak_AL; - switch (LB::breakTable[cls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) { + tcls = cls; + if (tcls == QUnicodeTables::LineBreak_CM) + // LB10 + tcls = QUnicodeTables::LineBreak_AL; + switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) { case LB::DirectBreak: attributes[pos].lineBreak = true; break; |