summaryrefslogtreecommitdiffstats
path: root/src/corelib/tools/qunicodetools.cpp
diff options
context:
space:
mode:
authorLars Knoll <lars.knoll@qt.io>2017-12-12 10:14:28 +0100
committerLars Knoll <lars.knoll@qt.io>2018-01-03 07:47:26 +0000
commit41b4e154d617a820cd7f7f732838647425a58227 (patch)
tree27e9300e3fc275bf4e50de8fb2c5e1f8aeb40fab /src/corelib/tools/qunicodetools.cpp
parent8bfabb34dec8a437a08b5a6e0ecac4a9dd3ae18c (diff)
Update Text segmentation and line break data to Unicode 10.0
Also adjusted the text segmentation and line break algorithms so that they can handle the new data, and pass the test suite. Change-Id: Ib727fd80003e34e96458d7a681996de3fa3691e7 Reviewed-by: Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@qt.io>
Diffstat (limited to 'src/corelib/tools/qunicodetools.cpp')
-rw-r--r--src/corelib/tools/qunicodetools.cpp266
1 files changed, 175 insertions, 91 deletions
diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp
index ac19d6b6d1..c9d0868fef 100644
--- a/src/corelib/tools/qunicodetools.cpp
+++ b/src/corelib/tools/qunicodetools.cpp
@@ -55,27 +55,45 @@ namespace QUnicodeTools {
// -----------------------------------------------------------------------------------------------------
//
// The text boundaries determination algorithm.
-// See http://www.unicode.org/reports/tr29/tr29-27.html
+// See http://www.unicode.org/reports/tr29/tr29-31.html
//
// -----------------------------------------------------------------------------------------------------
namespace GB {
-static const uchar breakTable[QUnicodeTables::GraphemeBreak_LVT + 1][QUnicodeTables::GraphemeBreak_LVT + 1] = {
-// Other CR LF Control Extend RI Prepend S-Mark L V T LV LVT
- { true , true , true , true , false, true , true , false, true , true , true , true , true }, // Other
- { true , true , false, true , true , true , true , true , true , true , true , true , true }, // CR
- { true , true , true , true , true , true , true , true , true , true , true , true , true }, // LF
- { true , true , true , true , true , true , true , true , true , true , true , true , true }, // Control
- { true , true , true , true , false, true , true , false, true , true , true , true , true }, // Extend
- { true , true , true , true , false, false, true , false, true , true , true , true , true }, // RegionalIndicator
- { false, true , true , true , false, false, false, false, false, false, false, false, false }, // Prepend
- { true , true , true , true , false, true , true , false, true , true , true , true , true }, // SpacingMark
- { true , true , true , true , false, true , true , false, false, false, true , false, false }, // L
- { true , true , true , true , false, true , true , false, true , false, false, true , true }, // V
- { true , true , true , true , false, true , true , false, true , true , false, true , true }, // T
- { true , true , true , true , false, true , true , false, true , false, false, true , true }, // LV
- { true , true , true , true , false, true , true , false, true , true , false, true , true }, // LVT
+/*
+ * Most grapheme break rules can be implemented table driven, but rules GB10, GB12 and GB13 need a bit
+ * of special treatment.
+ */
+enum State : uchar {
+ Break,
+ Inside,
+ GB10,
+ GB10_2,
+ GB10_3,
+ GB13, // also covers GB12
+};
+
+static const State breakTable[QUnicodeTables::NumGraphemeBreakClasses][QUnicodeTables::NumGraphemeBreakClasses] = {
+// Any CR LF Control Extend ZWJ RI Prepend S-Mark L V T LV LVT E_B E_M GAZ EBG
+ { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
+ { Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
+ { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
+ { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Control
+ { Break , Break , Break , Break , GB10_2, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , GB10_3, Break , Break }, // Extend
+ { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Inside, Inside }, // ZWJ
+ { Break , Break , Break , Break , Inside, Inside, GB13 , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
+ { Inside, Break , Break , Break , Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside }, // Prepend
+ { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SpacingMark
+ { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Inside, Inside, Break , Inside, Inside, Break , Break , Break , Break }, // L
+ { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Inside, Inside, Break , Break , Break , Break , Break , Break }, // V
+ { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break }, // T
+ { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Inside, Inside, Break , Break , Break , Break , Break , Break }, // LV
+ { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break }, // LVT
+ { Break , Break , Break , Break , GB10 , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Inside, Break , Break }, // E_B
+ { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // E_M
+ { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // GAZ
+ { Break , Break , Break , Break , GB10 , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Inside, Break , Break }, // EBG
};
} // namespace GB
@@ -83,6 +101,7 @@ static const uchar breakTable[QUnicodeTables::GraphemeBreak_LVT + 1][QUnicodeTab
static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
{
QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
+ GB::State state = GB::Break; // only required to track some of the rules
for (quint32 i = 0; i != len; ++i) {
quint32 pos = i;
uint ucs4 = string[i];
@@ -97,8 +116,36 @@ static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
- if (Q_LIKELY(GB::breakTable[lcls][cls]))
+ switch (GB::breakTable[lcls][cls]) {
+ case GB::Break:
attributes[pos].graphemeBoundary = true;
+ state = GB::Break;
+ break;
+ case GB::Inside:
+ state = GB::Break;
+ break;
+ case GB::GB10:
+ state = GB::GB10;
+ break;
+ case GB::GB10_2:
+ if (state == GB::GB10 || state == GB::GB10_2)
+ state = GB::GB10_2;
+ else
+ state = GB::Break;
+ break;
+ case GB::GB10_3:
+ if (state != GB::GB10 && state != GB::GB10_2)
+ attributes[pos].graphemeBoundary = true;
+ state = GB::Break;
+ break;
+ case GB::GB13:
+ if (state != GB::GB13) {
+ state = GB::GB13;
+ } else {
+ attributes[pos].graphemeBoundary = true;
+ state = GB::Break;
+ }
+ }
lcls = cls;
}
@@ -116,24 +163,30 @@ enum Action {
LookupW
};
-static const uchar breakTable[QUnicodeTables::WordBreak_ExtendNumLet + 1][QUnicodeTables::WordBreak_ExtendNumLet + 1] = {
-// Other CR LF Newline Extend RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtendNumLet
- { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Other
- { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
- { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
- { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
- { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
- { Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
- { Break , Break , Break , Break , NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // Katakana
- { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak }, // HebrewLetter
- { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak }, // ALetter
- { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
- { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
- { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
- { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
- { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
- { Break , Break , Break , Break , NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric
- { Break , Break , Break , Break , NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet
+static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = {
+// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet E_Base E_Mod GAZ EBG
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
+ { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
+ { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
+ { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, NoBreak }, // ZWJ
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break , Break }, // Katakana
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break }, // HebrewLetter
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break }, // ALetter
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break , Break , Break , Break }, // Numeric
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break }, // ExtendNumLet
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break }, // E_Base
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // E_Mod
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // GAZ
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break }, // EBG
};
} // namespace WB
@@ -176,9 +229,14 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
case WB::Break:
break;
case WB::NoBreak:
- if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend)) {
+ if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend || ncls == QUnicodeTables::WordBreak_ZWJ || ncls == QUnicodeTables::WordBreak_Format)) {
// WB4: X(Extend|Format)* -> X
- continue;
+ if (cls != QUnicodeTables::WordBreak_ZWJ) // WB3c
+ continue;
+ }
+ if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
+ // WB15/WB16: break between pairs of Regional indicator
+ ncls = QUnicodeTables::WordBreak_Any;
}
break;
case WB::Lookup:
@@ -196,7 +254,7 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
- if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend)) {
+ if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend || tcls == QUnicodeTables::WordBreak_ZWJ || tcls == QUnicodeTables::WordBreak_Format)) {
// WB4: X(Extend|Format)* -> X
continue;
}
@@ -265,8 +323,8 @@ enum State {
Lookup
};
-static const uchar breakTable[BAfter + 1][QUnicodeTables::SentenceBreak_Close + 1] = {
-// Other CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
+static const uchar breakTable[BAfter + 1][QUnicodeTables::NumSentenceBreakClasses] = {
+// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
{ Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower , Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
{ Initial, BAfterC, BAfter , BAfter , Lower , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm , Initial }, // Lower
{ Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, LUATerm, STerm , STerm , Initial }, // Upper
@@ -319,7 +377,7 @@ static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes
prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
switch (tcls) {
- case QUnicodeTables::SentenceBreak_Other:
+ case QUnicodeTables::SentenceBreak_Any:
case QUnicodeTables::SentenceBreak_Extend:
case QUnicodeTables::SentenceBreak_Sp:
case QUnicodeTables::SentenceBreak_Numeric:
@@ -349,7 +407,7 @@ static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes
// -----------------------------------------------------------------------------------------------------
//
// The line breaking algorithm.
-// See http://www.unicode.org/reports/tr14/tr14-35.html
+// See http://www.unicode.org/reports/tr14/tr14-39.html
//
// -----------------------------------------------------------------------------------------------------
@@ -441,38 +499,41 @@ enum Action {
ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen
};
-static const uchar breakTable[QUnicodeTables::LineBreak_CB + 1][QUnicodeTables::LineBreak_CB + 1] = {
-/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB */
-/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB },
-/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB },
-/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB },
-/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB },
-/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB },
-/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB },
-/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB },
-/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB },
-/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB },
-/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB },
-/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB },
-/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB },
-/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB },
-/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB },
-/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB },
-/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB }
+static const uchar breakTable[QUnicodeTables::LineBreak_SA][QUnicodeTables::LineBreak_SA] = {
+/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM ZWJ*/
+/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
+/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
+/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
+/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, IB },
+/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB },
+/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB },
+/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB },
+/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
+/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB },
+/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB },
+/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, IB },
+/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB },
+/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB },
+/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, IB },
+/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* EB */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
+/* EM */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* ZWJ*/ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, IB }
};
// The following line break classes are not treated by the pair table
@@ -501,6 +562,7 @@ static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *at
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
+ QUnicodeTables::LineBreakClass tcls;
if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
// LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
@@ -508,14 +570,39 @@ static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *at
if (FLAG(prop->category) & test)
ncls = QUnicodeTables::LineBreak_CM;
}
- if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM)) {
- // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
- if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP)
- ncls = QUnicodeTables::LineBreak_AL;
+
+ if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
+ // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
+ if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
+ attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
+ if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
+ cls = QUnicodeTables::LineBreak_AL;
+ goto next_no_cls_update;
+ }
+ goto next;
}
- if (Q_LIKELY(ncls != QUnicodeTables::LineBreak_CM)) {
- // LB25: do not break lines inside numbers
+ if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
+ if (ncls > QUnicodeTables::LineBreak_SP)
+ goto next; // LB6: x(BK|CR|LF|NL)
+ goto next_no_cls_update; // LB7: xSP
+ }
+
+ if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
+ // LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X
+ if (lcls != QUnicodeTables::LineBreak_ZW && lcls < QUnicodeTables::LineBreak_SP)
+ // don't update anything
+ goto next_no_cls_update;
+ }
+
+ if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
+ // LB8a: ZWJ x (ID | EB | EM)
+ if (ncls == QUnicodeTables::LineBreak_ID || ncls == QUnicodeTables::LineBreak_EB || ncls == QUnicodeTables::LineBreak_EM)
+ goto next;
+ }
+
+ // LB25: do not break lines inside numbers
+ {
LB::NS::Class necur = LB::NS::toClass(ncls, (QChar::Category)prop->category);
switch (LB::NS::actionTable[nelast][necur]) {
case LB::NS::Break:
@@ -535,25 +622,22 @@ static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *at
}
}
- if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
- // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
- if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
- attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
+ if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
+ // LB30a
+ ncls = QUnicodeTables::LineBreak_SP;
goto next;
}
- if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
- if (ncls > QUnicodeTables::LineBreak_SP)
- goto next; // LB6: x(BK|CR|LF|NL)
- goto next_no_cls_update; // LB7: xSP
- }
-
// for South East Asian chars that require a complex analysis, the Unicode
// standard recommends to treat them as AL. tailoring that do dictionary analysis can override
if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
cls = QUnicodeTables::LineBreak_AL;
- switch (LB::breakTable[cls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) {
+ tcls = cls;
+ if (tcls == QUnicodeTables::LineBreak_CM)
+ // LB10
+ tcls = QUnicodeTables::LineBreak_AL;
+ switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) {
case LB::DirectBreak:
attributes[pos].lineBreak = true;
break;