From d64cb5f7073f8a219c12fce605eae35e3887f65b Mon Sep 17 00:00:00 2001 From: Konstantin Ritt Date: Fri, 8 Jun 2012 01:30:04 +0300 Subject: Update the Unicode Text Breaking Algorithm implementation to make it conformant to the Unicode 6.1 specifications #14 and #29. The most important changes are: * The implementation has been reworked from scratch to fix all known bugs; * Separate-out the grapheme and the line breaking implementation to eliminate an overhead due to calculating unnecessary breaks; * Stop using deprecated SG class in favor of resolving pairs of surrogates; * A proper support for SMP code points; * Support for extended grapheme clusters (a drop-in replacement for the legacy grapheme clusters as of Unicode 5.1); * The hardcoded tailoring of UBA has been eliminated which breaks the 7 years-old lineBreaking test. Some later, we'll investigate if such a tailoring is still needed. Change-Id: I9f5867b3cec753b4fc120bc5a7e20f9a73d89370 Reviewed-by: Lars Knoll --- src/corelib/tools/qtextboundaryfinder.cpp | 15 +- src/corelib/tools/qunicodetools.cpp | 607 ++++++++++++++++-------------- 2 files changed, 336 insertions(+), 286 deletions(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qtextboundaryfinder.cpp b/src/corelib/tools/qtextboundaryfinder.cpp index 042f92f1e7..fd011377eb 100644 --- a/src/corelib/tools/qtextboundaryfinder.cpp +++ b/src/corelib/tools/qtextboundaryfinder.cpp @@ -116,8 +116,9 @@ static void init(QTextBoundaryFinder::BoundaryType type, const QChar *chars, int \reentrant QTextBoundaryFinder allows to find Unicode text boundaries in a - string, similar to the Unicode text boundary specification (see - http://www.unicode.org/reports/tr29/tr29-11.html). + string, accordingly to the Unicode text boundary specification (see + \l{http://www.unicode.org/reports/tr14/}{Unicode Standard Annex #14} and + \l{http://www.unicode.org/reports/tr29/}{Unicode Standard Annex #29}). QTextBoundaryFinder can operate on a QString in four possible modes depending on the value of \a BoundaryType. @@ -127,14 +128,18 @@ static void init(QTextBoundaryFinder::BoundaryType type, const QChar *chars, int Grapheme clusters. The two unicode characters 'A' + diaeresis do for example form one grapheme cluster as the user thinks of them as one character, yet it is in this case represented by two - unicode code points. + unicode code points + (see \l{http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}). Word boundaries are there to locate the start and end of what a - language considers to be a word. + language considers to be a word + (see \l{http://www.unicode.org/reports/tr29/#Word_Boundaries}). Line break boundaries give possible places where a line break might happen and sentence boundaries will show the beginning and - end of whole sentences. + end of whole sentences + (see \l{http://www.unicode.org/reports/tr29/#Sentence_Boundaries} and + \l{http://www.unicode.org/reports/tr14/}). The first position in a string is always a valid boundary and refers to the position before the first character. The last diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp index 5845765e46..53e7d99733 100644 --- a/src/corelib/tools/qunicodetools.cpp +++ b/src/corelib/tools/qunicodetools.cpp @@ -43,6 +43,8 @@ #include "qunicodetables_p.h" +#define FLAG(x) (1 << (x)) + QT_BEGIN_NAMESPACE Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0; @@ -51,327 +53,371 @@ namespace QUnicodeTools { // ----------------------------------------------------------------------------------------------------- // -// The line breaking algorithm. See http://www.unicode.org/reports/tr14/tr14-19.html -// -// ----------------------------------------------------------------------------------------------------- -// -// The text boundaries determination algorithm. See http://www.unicode.org/reports/tr29/tr29-11.html +// The text boundaries determination algorithm. See http://www.unicode.org/reports/tr29/tr29-19.html // // ----------------------------------------------------------------------------------------------------- -/* The Unicode algorithm does in our opinion allow line breaks at some - places they shouldn't be allowed. The following changes were thus - made in comparison to the Unicode reference: - - EX->AL from DB to IB - SY->AL from DB to IB - SY->PO from DB to IB - SY->PR from DB to IB - SY->OP from DB to IB - AL->PR from DB to IB - AL->PO from DB to IB - PR->PR from DB to IB - PO->PO from DB to IB - PR->PO from DB to IB - PO->PR from DB to IB - HY->PO from DB to IB - HY->PR from DB to IB - HY->OP from DB to IB - NU->EX from PB to IB - EX->PO from DB to IB -*/ - -// The following line break classes are not treated by the table: -// AI, BK, CB, CR, LF, NL, SA, SG, SP, XX - -enum LineBreakRule { - ProhibitedBreak, // PB in table - DirectBreak, // DB in table - IndirectBreak, // IB in table - CombiningIndirectBreak, // CI in table - CombiningProhibitedBreak // CP in table +namespace GB { + +static const uchar breakTable[QUnicodeTables::GraphemeBreakLVT + 1][QUnicodeTables::GraphemeBreakLVT + 1] = { +// Other CR LF Control Extend Prepend S-Mark L V T LV LVT + { true , true , true , true , false, true , false, true , true , true , true , true }, // Other + { true , true , false, true , true , true , true , true , true , true , true , true }, // CR + { true , true , true , true , true , true , true , true , true , true , true , true }, // LF + { true , true , true , true , true , true , true , true , true , true , true , true }, // Control + { true , true , true , true , false, true , false, true , true , true , true , true }, // Extend + { false, true , true , true , false, false, false, false, false, false, false, false }, // Prepend + { true , true , true , true , false, true , false, true , true , true , true , true }, // SpacingMark + { true , true , true , true , false, true , false, false, false, true , false, false }, // L + { true , true , true , true , false, true , false, true , false, false, true , true }, // V + { true , true , true , true , false, true , false, true , true , false, true , true }, // T + { true , true , true , true , false, true , false, true , false, false, true , true }, // LV + { true , true , true , true , false, true , false, true , true , false, true , true }, // LVT }; -#define DB DirectBreak -#define IB IndirectBreak -#define CI CombiningIndirectBreak -#define CP CombiningProhibitedBreak -#define PB ProhibitedBreak -static const uchar lineBreakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::LineBreak_JT + 1] = { -/* OP CL QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */ -/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB }, -/* CL */ { DB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* QU */ { PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, -/* GL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, -/* NS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* EX */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* SY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* IS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* PR */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB }, -/* PO */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* NU */ { IB, PB, IB, IB, IB, IB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* AL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* ID */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* IN */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* HY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* BA */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* BB */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, -/* B2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB }, -/* CM */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* WJ */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, -/* H2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB }, -/* H3 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }, -/* JL */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB }, -/* JV */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB }, -/* JT */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB } -}; -#undef DB -#undef IB -#undef CI -#undef CP -#undef PB - -static const uchar graphemeBreakTable[QUnicodeTables::GraphemeBreakLVT + 1][QUnicodeTables::GraphemeBreakLVT + 1] = { -// Other, CR, LF, Control, Extend, L, V, T, LV, LVT - { true , true , true , true , true , true , true , true , true , true }, // Other, - { true , true , true , true , true , true , true , true , true , true }, // CR, - { true , false, true , true , true , true , true , true , true , true }, // LF, - { true , true , true , true , true , true , true , true , true , true }, // Control, - { false, true , true , true , false, false, false, false, false, false }, // Extend, - { true , true , true , true , true , false, true , true , true , true }, // L, - { true , true , true , true , true , false, false, true , false, true }, // V, - { true , true , true , true , true , true , false, false, false, false }, // T, - { true , true , true , true , true , false, true , true , true , true }, // LV, - { true , true , true , true , true , false, true , true , true , true }, // LVT -}; - -static void calcGraphemeAndLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes) -{ - // ##### can this fail if the first char is a surrogate? - const QUnicodeTables::Properties *prop = QUnicodeTables::properties(string[0]); - QUnicodeTables::GraphemeBreak grapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak; - QUnicodeTables::LineBreakClass cls = (QUnicodeTables::LineBreakClass) prop->line_break_class; - // handle case where input starts with an LF - if (cls == QUnicodeTables::LineBreak_LF) - cls = QUnicodeTables::LineBreak_BK; - attributes[0].charStop = true; - - int lcls = cls; - for (quint32 i = 1; i < len; ++i) { - attributes[i].charStop = true; +} // namespace GB +static void getGraphemeBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes) +{ + QUnicodeTables::GraphemeBreak lcls = QUnicodeTables::GraphemeBreakLF; // to meet GB1 + for (quint32 i = 0; i != len; ++i) { + quint32 pos = i; uint ucs4 = string[i]; - prop = QUnicodeTables::properties(ucs4); - QUnicodeTables::GraphemeBreak ngrapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak; - QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class; - attributes[i].charStop = graphemeBreakTable[ngrapheme][grapheme]; - // handle surrogates - if (ncls == QUnicodeTables::LineBreak_SG) { - if (QChar::isHighSurrogate(string[i]) && i < len - 1 && QChar::isLowSurrogate(string[i+1])) { - continue; - } else if (QChar::isLowSurrogate(string[i]) && QChar::isHighSurrogate(string[i-1])) { - ucs4 = QChar::surrogateToUcs4(string[i-1], string[i]); - prop = QUnicodeTables::properties(ucs4); - ngrapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak; - ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class; - attributes[i].charStop = false; - } else { - ncls = QUnicodeTables::LineBreak_AL; + if (QChar::isHighSurrogate(ucs4) && i + 1 != len) { + ushort low = string[i + 1]; + if (QChar::isLowSurrogate(low)) { + ucs4 = QChar::surrogateToUcs4(ucs4, low); + ++i; } } - HB_LineBreakType lineBreakType = HB_NoBreak; + const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4); + QUnicodeTables::GraphemeBreak cls = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak; - if (cls >= QUnicodeTables::LineBreak_CR) { - if (cls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF) - lineBreakType = HB_ForcedBreak; - goto next; + attributes[pos].charStop = GB::breakTable[lcls][cls]; + + lcls = cls; + } +} + + +namespace WB { + +enum Action { + NoBreak = 0, + Break = 1, + Lookup = 2 +}; + +static const uchar breakTable[QUnicodeTables::WordBreakExtendNumLet + 1][QUnicodeTables::WordBreakExtendNumLet + 1] = { +// Other CR LF Newline Format Katakana ALetter MidNumLet MidLetter MidNum Numeric ExtendNumLet + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // Other + { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // Format + { Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak }, // Katakana + { Break , Break , Break , Break , NoBreak, Break , NoBreak, Lookup , Lookup , Break , NoBreak, NoBreak }, // ALetter + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // MidNumLet + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // MidLetter + { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // MidNum + { Break , Break , Break , Break , NoBreak, Break , NoBreak, Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet +}; + +} // namespace WB + +static void getWordBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes) +{ + QUnicodeTables::WordBreak cls = QUnicodeTables::WordBreakLF; // to meet WB1 + for (quint32 i = 0; i != len; ++i) { + quint32 pos = i; + uint ucs4 = string[i]; + if (QChar::isHighSurrogate(ucs4) && i + 1 != len) { + ushort low = string[i + 1]; + if (QChar::isLowSurrogate(low)) { + ucs4 = QChar::surrogateToUcs4(ucs4, low); + ++i; + } } - if (ncls == QUnicodeTables::LineBreak_SP) - goto next_no_cls_update; - if (ncls >= QUnicodeTables::LineBreak_CR) - goto next; + const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4); + QUnicodeTables::WordBreak ncls = (QUnicodeTables::WordBreak) prop->wordBreak; - { - int tcls = ncls; - // for south east asian chars that require a complex (dictionary analysis), the unicode - // standard recommends to treat them as AL. thai_attributes and other attribute methods that - // do dictionary analysis can override - if (tcls >= QUnicodeTables::LineBreak_SA) - tcls = QUnicodeTables::LineBreak_AL; - if (cls >= QUnicodeTables::LineBreak_SA) - cls = QUnicodeTables::LineBreak_AL; - - int brk = lineBreakTable[cls][tcls]; - switch (brk) { - case DirectBreak: - lineBreakType = HB_Break; - if (string[i-1] == 0xad) // soft hyphen - lineBreakType = HB_SoftHyphen; - break; - case IndirectBreak: - lineBreakType = (lcls == QUnicodeTables::LineBreak_SP) ? HB_Break : HB_NoBreak; - break; - case CombiningIndirectBreak: - lineBreakType = HB_NoBreak; - if (lcls == QUnicodeTables::LineBreak_SP){ - if (i > 1) - attributes[i-2].lineBreakType = HB_Break; - } else { - goto next_no_cls_update; + uchar action = WB::breakTable[cls][ncls]; + if (ncls == QUnicodeTables::WordBreakFormat) { + // WB4: X(Extend|Format)* -> X + if (action != WB::Break) + continue; + } else { + if (action == WB::Lookup) { + action = WB::Break; + for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) { + ucs4 = string[lookahead]; + if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) { + ushort low = string[lookahead + 1]; + if (QChar::isLowSurrogate(low)) { + ucs4 = QChar::surrogateToUcs4(ucs4, low); + ++lookahead; + } + } + + prop = QUnicodeTables::properties(ucs4); + QUnicodeTables::WordBreak tcls = (QUnicodeTables::WordBreak) prop->wordBreak; + if (tcls == QUnicodeTables::WordBreakFormat) + continue; + if (tcls == cls) { + i = lookahead; + ncls = tcls; + action = WB::NoBreak; + } + break; } - break; - case CombiningProhibitedBreak: - lineBreakType = HB_NoBreak; - if (lcls != QUnicodeTables::LineBreak_SP) - goto next_no_cls_update; - case ProhibitedBreak: - default: - break; } } - next: cls = ncls; - next_no_cls_update: - lcls = ncls; - grapheme = ngrapheme; - attributes[i-1].lineBreakType = lineBreakType; + if (action == WB::Break) + attributes[pos].wordBoundary = true; } - - for (quint32 i = len - 1; i > 0; --i) - attributes[i].lineBreakType = attributes[i - 1].lineBreakType; - attributes[0].lineBreakType = HB_NoBreak; // LB2 } -enum WordBreakRule { NoBreak = 0, Break = 1, Middle = 2 }; - -static const uchar wordBreakTable[QUnicodeTables::WordBreakExtendNumLet + 1][QUnicodeTables::WordBreakExtendNumLet + 1] = { -// Other Format Katakana ALetter MidLetter MidNum Numeric ExtendNumLet - { Break , Break , Break , Break , Break , Break , Break , Break }, // Other - { Break , Break , Break , Break , Break , Break , Break , Break }, // Format - { Break , Break , NoBreak, Break , Break , Break , Break , NoBreak }, // Katakana - { Break , Break , Break , NoBreak, Middle , Break , NoBreak, NoBreak }, // ALetter - { Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter - { Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum - { Break , Break , Break , NoBreak, Break , Middle , NoBreak, NoBreak }, // Numeric - { Break , Break , NoBreak, NoBreak, Break , Break , NoBreak, NoBreak }, // ExtendNumLet +namespace SB { + +enum State { + Initial, + Upper, + UpATerm, + ATerm, + ATermC, + ACS, + STerm, + STermC, + SCS, + BAfterC, + BAfter, + Break, + Lookup }; -static void calcWordBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes) -{ - quint32 brk = QUnicodeTables::wordBreakClass(string[0]); +static const uchar breakTable[BAfter + 1][QUnicodeTables::SentenceBreakClose + 1] = { +// Other CR LF Sep Format Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close + { Initial, BAfterC, BAfter , BAfter , Initial, Initial, Initial, Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial + { Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, UpATerm, STerm , STerm , Initial }, // Upper + + { Lookup , BAfterC, BAfter , BAfter , UpATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // UpATerm + { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm + { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC + { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS + + { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm, + { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC + { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS + { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter +}; - attributes[0].wordBoundary = true; +} // namespace SB - for (quint32 i = 1; i < len; ++i) { - if (!attributes[i].charStop) { - attributes[i].wordBoundary = false; - continue; +static void getSentenceBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes) +{ + uchar state = SB::BAfter; // to meet SB1 + for (quint32 i = 0; i != len; ++i) { + quint32 pos = i; + uint ucs4 = string[i]; + if (QChar::isHighSurrogate(ucs4) && i + 1 != len) { + ushort low = string[i + 1]; + if (QChar::isLowSurrogate(low)) { + ucs4 = QChar::surrogateToUcs4(ucs4, low); + ++i; + } } - quint32 nbrk = QUnicodeTables::wordBreakClass(string[i]); - if (nbrk == QUnicodeTables::WordBreakFormat) { - attributes[i].wordBoundary = (QUnicodeTables::sentenceBreakClass(string[i-1]) == QUnicodeTables::SentenceBreakSep); - continue; - } + const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4); + QUnicodeTables::SentenceBreak ncls = (QUnicodeTables::SentenceBreak) prop->sentenceBreak; + + Q_ASSERT(state <= SB::BAfter); + state = SB::breakTable[state][ncls]; + if (state == SB::Lookup) { // SB8 + state = SB::Break; + for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) { + ucs4 = string[lookahead]; + if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) { + ushort low = string[lookahead + 1]; + if (QChar::isLowSurrogate(low)) { + ucs4 = QChar::surrogateToUcs4(ucs4, low); + ++lookahead; + } + } - WordBreakRule rule = (WordBreakRule)wordBreakTable[brk][nbrk]; - if (rule == Middle) { - rule = Break; - quint32 lookahead = i + 1; - while (lookahead < len) { - quint32 testbrk = QUnicodeTables::wordBreakClass(string[lookahead]); - if (testbrk == QUnicodeTables::WordBreakFormat - && QUnicodeTables::sentenceBreakClass(string[lookahead]) != QUnicodeTables::SentenceBreakSep) { - ++lookahead; + prop = QUnicodeTables::properties(ucs4); + QUnicodeTables::SentenceBreak tcls = (QUnicodeTables::SentenceBreak) prop->sentenceBreak; + switch (tcls) { + case QUnicodeTables::SentenceBreakOther: + case QUnicodeTables::SentenceBreakFormat: + case QUnicodeTables::SentenceBreakSp: + case QUnicodeTables::SentenceBreakNumeric: + case QUnicodeTables::SentenceBreakSContinue: + case QUnicodeTables::SentenceBreakClose: continue; - } - if (testbrk == brk) { - rule = NoBreak; - while (i < lookahead) - attributes[i++].wordBoundary = false; - nbrk = testbrk; + case QUnicodeTables::SentenceBreakLower: + i = lookahead; + state = SB::Initial; + break; + default: + break; } break; } } - attributes[i].wordBoundary = (rule == Break); - brk = nbrk; + if (state == SB::Break) { + attributes[pos].sentenceBoundary = true; + state = SB::breakTable[SB::Initial][ncls]; + } } } -enum SentenceBreakState { - SB_Initial, - SB_Upper, - SB_UpATerm, - SB_ATerm, - SB_ATermC, - SB_ACS, - SB_STerm, - SB_STermC, - SB_SCS, - SB_BAfter, - SB_Break, - SB_Lookup -}; +// ----------------------------------------------------------------------------------------------------- +// +// The line breaking algorithm. See http://www.unicode.org/reports/tr14/tr14-28.html +// +// ----------------------------------------------------------------------------------------------------- + +namespace LB { -static const uchar sentenceBreakTable[SB_Lookup + 1][QUnicodeTables::SentenceBreakClose + 1] = { -// Other Sep Format Sp Lower Upper OLetter Numeric ATerm STerm Close - { SB_Initial, SB_BAfter , SB_Initial, SB_Initial, SB_Initial, SB_Upper , SB_Initial, SB_Initial, SB_ATerm , SB_STerm , SB_Initial }, // SB_Initial, - { SB_Initial, SB_BAfter , SB_Upper , SB_Initial, SB_Initial, SB_Upper , SB_Initial, SB_Initial, SB_UpATerm, SB_STerm , SB_Initial }, // SB_Upper +// The following line break classes are not treated by the pair table +// and must be resolved outside: +// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX - { SB_Lookup , SB_BAfter , SB_UpATerm, SB_ACS , SB_Initial, SB_Upper , SB_Break , SB_Initial, SB_ATerm , SB_STerm , SB_ATermC }, // SB_UpATerm - { SB_Lookup , SB_BAfter , SB_ATerm , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Initial, SB_ATerm , SB_STerm , SB_ATermC }, // SB_ATerm - { SB_Lookup , SB_BAfter , SB_ATermC , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Lookup , SB_ATerm , SB_STerm , SB_ATermC }, // SB_ATermC, - { SB_Lookup , SB_BAfter , SB_ACS , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Lookup , SB_ATerm , SB_STerm , SB_Lookup }, // SB_ACS, +enum Action { + ProhibitedBreak, PB = ProhibitedBreak, + DirectBreak, DB = DirectBreak, + IndirectBreak, IB = IndirectBreak, + CombiningIndirectBreak, CI = CombiningIndirectBreak, + CombiningProhibitedBreak, CP = CombiningProhibitedBreak +}; - { SB_Break , SB_BAfter , SB_STerm , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_STermC }, // SB_STerm, - { SB_Break , SB_BAfter , SB_STermC , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_STermC }, // SB_STermC, - { SB_Break , SB_BAfter , SB_SCS , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_Break }, // SB_SCS, - { SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break }, // SB_BAfter, +static const uchar breakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::LineBreak_JT + 1] = { +/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */ +/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB }, +/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, IB, IB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, +/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, +/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* PR */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB }, +/* PO */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* HY */ { DB, PB, PB, IB, DB, IB, PB, PB, PB, DB, DB, IB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* BA */ { DB, PB, PB, IB, DB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, +/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB }, +/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, +/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, +/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB }, +/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }, +/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB }, +/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB }, +/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB } }; -static void calcSentenceBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes) +} // namespace LB + +static void getLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes) { - quint32 brk = sentenceBreakTable[SB_Initial][QUnicodeTables::sentenceBreakClass(string[0])]; - attributes[0].sentenceBoundary = true; - for (quint32 i = 1; i < len; ++i) { - if (!attributes[i].charStop) { - attributes[i].sentenceBoundary = false; - continue; - } - brk = sentenceBreakTable[brk][QUnicodeTables::sentenceBreakClass(string[i])]; - if (brk == SB_Lookup) { - brk = SB_Break; - quint32 lookahead = i + 1; - while (lookahead < len) { - quint32 sbrk = QUnicodeTables::sentenceBreakClass(string[lookahead]); - if (sbrk != QUnicodeTables::SentenceBreakOther - && sbrk != QUnicodeTables::SentenceBreakNumeric - && sbrk != QUnicodeTables::SentenceBreakClose) { - break; - } else if (sbrk == QUnicodeTables::SentenceBreakLower) { - brk = SB_Initial; - break; - } - ++lookahead; - } - if (brk == SB_Initial) { - while (i < lookahead) - attributes[i++].sentenceBoundary = false; + uint lucs4 = 0; + QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10 + QUnicodeTables::LineBreakClass cls = lcls; + for (quint32 i = 0; i != len; ++i) { + quint32 pos = i; + uint ucs4 = string[i]; + if (QChar::isHighSurrogate(ucs4) && i + 1 != len) { + ushort low = string[i + 1]; + if (QChar::isLowSurrogate(low)) { + ucs4 = QChar::surrogateToUcs4(ucs4, low); + ++i; } } - if (brk == SB_Break) { - attributes[i].sentenceBoundary = true; - brk = sentenceBreakTable[SB_Initial][QUnicodeTables::sentenceBreakClass(string[i])]; - } else { - attributes[i].sentenceBoundary = false; + + const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4); + QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class; + + if (ncls == QUnicodeTables::LineBreak_SA) { + // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM + static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining); + if (FLAG(prop->category) & test) + ncls = QUnicodeTables::LineBreak_CM; + } + if (ncls == QUnicodeTables::LineBreak_CM) { + // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL + if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP) + ncls = QUnicodeTables::LineBreak_AL; + } + + HB_LineBreakType lineBreakType = HB_NoBreak; + + if (lcls >= QUnicodeTables::LineBreak_CR) { + // LB4: BK!, LB5: (CRxLF|CR|LF|NL)! + if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF) + lineBreakType = HB_ForcedBreak; + goto next; + } + + if (ncls >= QUnicodeTables::LineBreak_SP) { + if (ncls > QUnicodeTables::LineBreak_SP) + goto next; // LB6: x(BK|CR|LF|NL) + goto next_no_cls_update; // LB7: xSP } + + // for South East Asian chars that require a complex analysis, the Unicode + // standard recommends to treat them as AL. tailoring that do dictionary analysis can override + if (cls >= QUnicodeTables::LineBreak_SA) + cls = QUnicodeTables::LineBreak_AL; + + switch (LB::breakTable[cls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) { + case LB::DirectBreak: + lineBreakType = HB_Break; + if (lucs4 == 0x00ad) // soft hyphen + lineBreakType = HB_SoftHyphen; + break; + case LB::IndirectBreak: + if (lcls == QUnicodeTables::LineBreak_SP) + lineBreakType = HB_Break; + break; + case LB::CombiningIndirectBreak: + if (lcls != QUnicodeTables::LineBreak_SP) + goto next_no_cls_update; + lineBreakType = HB_Break; + break; + case LB::CombiningProhibitedBreak: + if (lcls != QUnicodeTables::LineBreak_SP) + goto next_no_cls_update; + break; + case LB::ProhibitedBreak: + // nothing to do + default: + break; + } + + next: + cls = ncls; + lucs4 = ucs4; + next_no_cls_update: + lcls = ncls; + if (lineBreakType != HB_NoBreak) + attributes[pos].lineBreakType = lineBreakType; } + + attributes[0].lineBreakType = HB_NoBreak; // LB2 } @@ -400,18 +446,17 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length, if (length <= 0) return; - if (!(options & DontClearAttributes)) { + if (!(options & DontClearAttributes)) ::memset(attributes, 0, length * sizeof(HB_CharAttributes)); - if (options & (WordBreaks | SentenceBreaks)) - options |= GraphemeBreaks; - } - if (options & (GraphemeBreaks | LineBreaks)) - calcGraphemeAndLineBreaks(string, length, attributes); + if (options & GraphemeBreaks) + getGraphemeBreaks(string, length, attributes); if (options & WordBreaks) - calcWordBreaks(string, length, attributes); + getWordBreaks(string, length, attributes); if (options & SentenceBreaks) - calcSentenceBreaks(string, length, attributes); + getSentenceBreaks(string, length, attributes); + if (options & LineBreaks) + getLineBreaks(string, length, attributes); if (options & WhiteSpaces) getWhiteSpaces(string, length, attributes); -- cgit v1.2.3