summaryrefslogtreecommitdiffstats
path: root/src/corelib/tools/qunicodetools.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/tools/qunicodetools.cpp')
-rw-r--r--src/corelib/tools/qunicodetools.cpp607
1 files changed, 326 insertions, 281 deletions
diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp
index 5845765e46..53e7d99733 100644
--- a/src/corelib/tools/qunicodetools.cpp
+++ b/src/corelib/tools/qunicodetools.cpp
@@ -43,6 +43,8 @@
#include "qunicodetables_p.h"
+#define FLAG(x) (1 << (x))
+
QT_BEGIN_NAMESPACE
Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0;
@@ -51,327 +53,371 @@ namespace QUnicodeTools {
// -----------------------------------------------------------------------------------------------------
//
-// The line breaking algorithm. See http://www.unicode.org/reports/tr14/tr14-19.html
-//
-// -----------------------------------------------------------------------------------------------------
-//
-// The text boundaries determination algorithm. See http://www.unicode.org/reports/tr29/tr29-11.html
+// The text boundaries determination algorithm. See http://www.unicode.org/reports/tr29/tr29-19.html
//
// -----------------------------------------------------------------------------------------------------
-/* The Unicode algorithm does in our opinion allow line breaks at some
- places they shouldn't be allowed. The following changes were thus
- made in comparison to the Unicode reference:
-
- EX->AL from DB to IB
- SY->AL from DB to IB
- SY->PO from DB to IB
- SY->PR from DB to IB
- SY->OP from DB to IB
- AL->PR from DB to IB
- AL->PO from DB to IB
- PR->PR from DB to IB
- PO->PO from DB to IB
- PR->PO from DB to IB
- PO->PR from DB to IB
- HY->PO from DB to IB
- HY->PR from DB to IB
- HY->OP from DB to IB
- NU->EX from PB to IB
- EX->PO from DB to IB
-*/
-
-// The following line break classes are not treated by the table:
-// AI, BK, CB, CR, LF, NL, SA, SG, SP, XX
-
-enum LineBreakRule {
- ProhibitedBreak, // PB in table
- DirectBreak, // DB in table
- IndirectBreak, // IB in table
- CombiningIndirectBreak, // CI in table
- CombiningProhibitedBreak // CP in table
+namespace GB {
+
+static const uchar breakTable[QUnicodeTables::GraphemeBreakLVT + 1][QUnicodeTables::GraphemeBreakLVT + 1] = {
+// Other CR LF Control Extend Prepend S-Mark L V T LV LVT
+ { true , true , true , true , false, true , false, true , true , true , true , true }, // Other
+ { true , true , false, true , true , true , true , true , true , true , true , true }, // CR
+ { true , true , true , true , true , true , true , true , true , true , true , true }, // LF
+ { true , true , true , true , true , true , true , true , true , true , true , true }, // Control
+ { true , true , true , true , false, true , false, true , true , true , true , true }, // Extend
+ { false, true , true , true , false, false, false, false, false, false, false, false }, // Prepend
+ { true , true , true , true , false, true , false, true , true , true , true , true }, // SpacingMark
+ { true , true , true , true , false, true , false, false, false, true , false, false }, // L
+ { true , true , true , true , false, true , false, true , false, false, true , true }, // V
+ { true , true , true , true , false, true , false, true , true , false, true , true }, // T
+ { true , true , true , true , false, true , false, true , false, false, true , true }, // LV
+ { true , true , true , true , false, true , false, true , true , false, true , true }, // LVT
};
-#define DB DirectBreak
-#define IB IndirectBreak
-#define CI CombiningIndirectBreak
-#define CP CombiningProhibitedBreak
-#define PB ProhibitedBreak
-static const uchar lineBreakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::LineBreak_JT + 1] = {
-/* OP CL QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */
-/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB },
-/* CL */ { DB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* QU */ { PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
-/* GL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
-/* NS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* EX */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* SY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* IS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* PR */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB },
-/* PO */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* NU */ { IB, PB, IB, IB, IB, IB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* AL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* ID */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* IN */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* HY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* BA */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* BB */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
-/* B2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB },
-/* CM */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* WJ */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
-/* H2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
-/* H3 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB },
-/* JL */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB },
-/* JV */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
-/* JT */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }
-};
-#undef DB
-#undef IB
-#undef CI
-#undef CP
-#undef PB
-
-static const uchar graphemeBreakTable[QUnicodeTables::GraphemeBreakLVT + 1][QUnicodeTables::GraphemeBreakLVT + 1] = {
-// Other, CR, LF, Control, Extend, L, V, T, LV, LVT
- { true , true , true , true , true , true , true , true , true , true }, // Other,
- { true , true , true , true , true , true , true , true , true , true }, // CR,
- { true , false, true , true , true , true , true , true , true , true }, // LF,
- { true , true , true , true , true , true , true , true , true , true }, // Control,
- { false, true , true , true , false, false, false, false, false, false }, // Extend,
- { true , true , true , true , true , false, true , true , true , true }, // L,
- { true , true , true , true , true , false, false, true , false, true }, // V,
- { true , true , true , true , true , true , false, false, false, false }, // T,
- { true , true , true , true , true , false, true , true , true , true }, // LV,
- { true , true , true , true , true , false, true , true , true , true }, // LVT
-};
-
-static void calcGraphemeAndLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
-{
- // ##### can this fail if the first char is a surrogate?
- const QUnicodeTables::Properties *prop = QUnicodeTables::properties(string[0]);
- QUnicodeTables::GraphemeBreak grapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
- QUnicodeTables::LineBreakClass cls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
- // handle case where input starts with an LF
- if (cls == QUnicodeTables::LineBreak_LF)
- cls = QUnicodeTables::LineBreak_BK;
- attributes[0].charStop = true;
-
- int lcls = cls;
- for (quint32 i = 1; i < len; ++i) {
- attributes[i].charStop = true;
+} // namespace GB
+static void getGraphemeBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
+{
+ QUnicodeTables::GraphemeBreak lcls = QUnicodeTables::GraphemeBreakLF; // to meet GB1
+ for (quint32 i = 0; i != len; ++i) {
+ quint32 pos = i;
uint ucs4 = string[i];
- prop = QUnicodeTables::properties(ucs4);
- QUnicodeTables::GraphemeBreak ngrapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
- QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
- attributes[i].charStop = graphemeBreakTable[ngrapheme][grapheme];
- // handle surrogates
- if (ncls == QUnicodeTables::LineBreak_SG) {
- if (QChar::isHighSurrogate(string[i]) && i < len - 1 && QChar::isLowSurrogate(string[i+1])) {
- continue;
- } else if (QChar::isLowSurrogate(string[i]) && QChar::isHighSurrogate(string[i-1])) {
- ucs4 = QChar::surrogateToUcs4(string[i-1], string[i]);
- prop = QUnicodeTables::properties(ucs4);
- ngrapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
- ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
- attributes[i].charStop = false;
- } else {
- ncls = QUnicodeTables::LineBreak_AL;
+ if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
+ ushort low = string[i + 1];
+ if (QChar::isLowSurrogate(low)) {
+ ucs4 = QChar::surrogateToUcs4(ucs4, low);
+ ++i;
}
}
- HB_LineBreakType lineBreakType = HB_NoBreak;
+ const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
+ QUnicodeTables::GraphemeBreak cls = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
- if (cls >= QUnicodeTables::LineBreak_CR) {
- if (cls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
- lineBreakType = HB_ForcedBreak;
- goto next;
+ attributes[pos].charStop = GB::breakTable[lcls][cls];
+
+ lcls = cls;
+ }
+}
+
+
+namespace WB {
+
+enum Action {
+ NoBreak = 0,
+ Break = 1,
+ Lookup = 2
+};
+
+static const uchar breakTable[QUnicodeTables::WordBreakExtendNumLet + 1][QUnicodeTables::WordBreakExtendNumLet + 1] = {
+// Other CR LF Newline Format Katakana ALetter MidNumLet MidLetter MidNum Numeric ExtendNumLet
+ { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // Other
+ { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
+ { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
+ { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
+ { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // Format
+ { Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak }, // Katakana
+ { Break , Break , Break , Break , NoBreak, Break , NoBreak, Lookup , Lookup , Break , NoBreak, NoBreak }, // ALetter
+ { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
+ { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // MidLetter
+ { Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // MidNum
+ { Break , Break , Break , Break , NoBreak, Break , NoBreak, Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet
+};
+
+} // namespace WB
+
+static void getWordBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
+{
+ QUnicodeTables::WordBreak cls = QUnicodeTables::WordBreakLF; // to meet WB1
+ for (quint32 i = 0; i != len; ++i) {
+ quint32 pos = i;
+ uint ucs4 = string[i];
+ if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
+ ushort low = string[i + 1];
+ if (QChar::isLowSurrogate(low)) {
+ ucs4 = QChar::surrogateToUcs4(ucs4, low);
+ ++i;
+ }
}
- if (ncls == QUnicodeTables::LineBreak_SP)
- goto next_no_cls_update;
- if (ncls >= QUnicodeTables::LineBreak_CR)
- goto next;
+ const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
+ QUnicodeTables::WordBreak ncls = (QUnicodeTables::WordBreak) prop->wordBreak;
- {
- int tcls = ncls;
- // for south east asian chars that require a complex (dictionary analysis), the unicode
- // standard recommends to treat them as AL. thai_attributes and other attribute methods that
- // do dictionary analysis can override
- if (tcls >= QUnicodeTables::LineBreak_SA)
- tcls = QUnicodeTables::LineBreak_AL;
- if (cls >= QUnicodeTables::LineBreak_SA)
- cls = QUnicodeTables::LineBreak_AL;
-
- int brk = lineBreakTable[cls][tcls];
- switch (brk) {
- case DirectBreak:
- lineBreakType = HB_Break;
- if (string[i-1] == 0xad) // soft hyphen
- lineBreakType = HB_SoftHyphen;
- break;
- case IndirectBreak:
- lineBreakType = (lcls == QUnicodeTables::LineBreak_SP) ? HB_Break : HB_NoBreak;
- break;
- case CombiningIndirectBreak:
- lineBreakType = HB_NoBreak;
- if (lcls == QUnicodeTables::LineBreak_SP){
- if (i > 1)
- attributes[i-2].lineBreakType = HB_Break;
- } else {
- goto next_no_cls_update;
+ uchar action = WB::breakTable[cls][ncls];
+ if (ncls == QUnicodeTables::WordBreakFormat) {
+ // WB4: X(Extend|Format)* -> X
+ if (action != WB::Break)
+ continue;
+ } else {
+ if (action == WB::Lookup) {
+ action = WB::Break;
+ for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
+ ucs4 = string[lookahead];
+ if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
+ ushort low = string[lookahead + 1];
+ if (QChar::isLowSurrogate(low)) {
+ ucs4 = QChar::surrogateToUcs4(ucs4, low);
+ ++lookahead;
+ }
+ }
+
+ prop = QUnicodeTables::properties(ucs4);
+ QUnicodeTables::WordBreak tcls = (QUnicodeTables::WordBreak) prop->wordBreak;
+ if (tcls == QUnicodeTables::WordBreakFormat)
+ continue;
+ if (tcls == cls) {
+ i = lookahead;
+ ncls = tcls;
+ action = WB::NoBreak;
+ }
+ break;
}
- break;
- case CombiningProhibitedBreak:
- lineBreakType = HB_NoBreak;
- if (lcls != QUnicodeTables::LineBreak_SP)
- goto next_no_cls_update;
- case ProhibitedBreak:
- default:
- break;
}
}
- next:
cls = ncls;
- next_no_cls_update:
- lcls = ncls;
- grapheme = ngrapheme;
- attributes[i-1].lineBreakType = lineBreakType;
+ if (action == WB::Break)
+ attributes[pos].wordBoundary = true;
}
-
- for (quint32 i = len - 1; i > 0; --i)
- attributes[i].lineBreakType = attributes[i - 1].lineBreakType;
- attributes[0].lineBreakType = HB_NoBreak; // LB2
}
-enum WordBreakRule { NoBreak = 0, Break = 1, Middle = 2 };
-
-static const uchar wordBreakTable[QUnicodeTables::WordBreakExtendNumLet + 1][QUnicodeTables::WordBreakExtendNumLet + 1] = {
-// Other Format Katakana ALetter MidLetter MidNum Numeric ExtendNumLet
- { Break , Break , Break , Break , Break , Break , Break , Break }, // Other
- { Break , Break , Break , Break , Break , Break , Break , Break }, // Format
- { Break , Break , NoBreak, Break , Break , Break , Break , NoBreak }, // Katakana
- { Break , Break , Break , NoBreak, Middle , Break , NoBreak, NoBreak }, // ALetter
- { Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
- { Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
- { Break , Break , Break , NoBreak, Break , Middle , NoBreak, NoBreak }, // Numeric
- { Break , Break , NoBreak, NoBreak, Break , Break , NoBreak, NoBreak }, // ExtendNumLet
+namespace SB {
+
+enum State {
+ Initial,
+ Upper,
+ UpATerm,
+ ATerm,
+ ATermC,
+ ACS,
+ STerm,
+ STermC,
+ SCS,
+ BAfterC,
+ BAfter,
+ Break,
+ Lookup
};
-static void calcWordBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
-{
- quint32 brk = QUnicodeTables::wordBreakClass(string[0]);
+static const uchar breakTable[BAfter + 1][QUnicodeTables::SentenceBreakClose + 1] = {
+// Other CR LF Sep Format Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
+ { Initial, BAfterC, BAfter , BAfter , Initial, Initial, Initial, Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
+ { Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, UpATerm, STerm , STerm , Initial }, // Upper
+
+ { Lookup , BAfterC, BAfter , BAfter , UpATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // UpATerm
+ { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
+ { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
+ { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
+
+ { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
+ { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
+ { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
+ { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
+ { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
+};
- attributes[0].wordBoundary = true;
+} // namespace SB
- for (quint32 i = 1; i < len; ++i) {
- if (!attributes[i].charStop) {
- attributes[i].wordBoundary = false;
- continue;
+static void getSentenceBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
+{
+ uchar state = SB::BAfter; // to meet SB1
+ for (quint32 i = 0; i != len; ++i) {
+ quint32 pos = i;
+ uint ucs4 = string[i];
+ if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
+ ushort low = string[i + 1];
+ if (QChar::isLowSurrogate(low)) {
+ ucs4 = QChar::surrogateToUcs4(ucs4, low);
+ ++i;
+ }
}
- quint32 nbrk = QUnicodeTables::wordBreakClass(string[i]);
- if (nbrk == QUnicodeTables::WordBreakFormat) {
- attributes[i].wordBoundary = (QUnicodeTables::sentenceBreakClass(string[i-1]) == QUnicodeTables::SentenceBreakSep);
- continue;
- }
+ const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
+ QUnicodeTables::SentenceBreak ncls = (QUnicodeTables::SentenceBreak) prop->sentenceBreak;
+
+ Q_ASSERT(state <= SB::BAfter);
+ state = SB::breakTable[state][ncls];
+ if (state == SB::Lookup) { // SB8
+ state = SB::Break;
+ for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
+ ucs4 = string[lookahead];
+ if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
+ ushort low = string[lookahead + 1];
+ if (QChar::isLowSurrogate(low)) {
+ ucs4 = QChar::surrogateToUcs4(ucs4, low);
+ ++lookahead;
+ }
+ }
- WordBreakRule rule = (WordBreakRule)wordBreakTable[brk][nbrk];
- if (rule == Middle) {
- rule = Break;
- quint32 lookahead = i + 1;
- while (lookahead < len) {
- quint32 testbrk = QUnicodeTables::wordBreakClass(string[lookahead]);
- if (testbrk == QUnicodeTables::WordBreakFormat
- && QUnicodeTables::sentenceBreakClass(string[lookahead]) != QUnicodeTables::SentenceBreakSep) {
- ++lookahead;
+ prop = QUnicodeTables::properties(ucs4);
+ QUnicodeTables::SentenceBreak tcls = (QUnicodeTables::SentenceBreak) prop->sentenceBreak;
+ switch (tcls) {
+ case QUnicodeTables::SentenceBreakOther:
+ case QUnicodeTables::SentenceBreakFormat:
+ case QUnicodeTables::SentenceBreakSp:
+ case QUnicodeTables::SentenceBreakNumeric:
+ case QUnicodeTables::SentenceBreakSContinue:
+ case QUnicodeTables::SentenceBreakClose:
continue;
- }
- if (testbrk == brk) {
- rule = NoBreak;
- while (i < lookahead)
- attributes[i++].wordBoundary = false;
- nbrk = testbrk;
+ case QUnicodeTables::SentenceBreakLower:
+ i = lookahead;
+ state = SB::Initial;
+ break;
+ default:
+ break;
}
break;
}
}
- attributes[i].wordBoundary = (rule == Break);
- brk = nbrk;
+ if (state == SB::Break) {
+ attributes[pos].sentenceBoundary = true;
+ state = SB::breakTable[SB::Initial][ncls];
+ }
}
}
-enum SentenceBreakState {
- SB_Initial,
- SB_Upper,
- SB_UpATerm,
- SB_ATerm,
- SB_ATermC,
- SB_ACS,
- SB_STerm,
- SB_STermC,
- SB_SCS,
- SB_BAfter,
- SB_Break,
- SB_Lookup
-};
+// -----------------------------------------------------------------------------------------------------
+//
+// The line breaking algorithm. See http://www.unicode.org/reports/tr14/tr14-28.html
+//
+// -----------------------------------------------------------------------------------------------------
+
+namespace LB {
-static const uchar sentenceBreakTable[SB_Lookup + 1][QUnicodeTables::SentenceBreakClose + 1] = {
-// Other Sep Format Sp Lower Upper OLetter Numeric ATerm STerm Close
- { SB_Initial, SB_BAfter , SB_Initial, SB_Initial, SB_Initial, SB_Upper , SB_Initial, SB_Initial, SB_ATerm , SB_STerm , SB_Initial }, // SB_Initial,
- { SB_Initial, SB_BAfter , SB_Upper , SB_Initial, SB_Initial, SB_Upper , SB_Initial, SB_Initial, SB_UpATerm, SB_STerm , SB_Initial }, // SB_Upper
+// The following line break classes are not treated by the pair table
+// and must be resolved outside:
+// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
- { SB_Lookup , SB_BAfter , SB_UpATerm, SB_ACS , SB_Initial, SB_Upper , SB_Break , SB_Initial, SB_ATerm , SB_STerm , SB_ATermC }, // SB_UpATerm
- { SB_Lookup , SB_BAfter , SB_ATerm , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Initial, SB_ATerm , SB_STerm , SB_ATermC }, // SB_ATerm
- { SB_Lookup , SB_BAfter , SB_ATermC , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Lookup , SB_ATerm , SB_STerm , SB_ATermC }, // SB_ATermC,
- { SB_Lookup , SB_BAfter , SB_ACS , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Lookup , SB_ATerm , SB_STerm , SB_Lookup }, // SB_ACS,
+enum Action {
+ ProhibitedBreak, PB = ProhibitedBreak,
+ DirectBreak, DB = DirectBreak,
+ IndirectBreak, IB = IndirectBreak,
+ CombiningIndirectBreak, CI = CombiningIndirectBreak,
+ CombiningProhibitedBreak, CP = CombiningProhibitedBreak
+};
- { SB_Break , SB_BAfter , SB_STerm , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_STermC }, // SB_STerm,
- { SB_Break , SB_BAfter , SB_STermC , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_STermC }, // SB_STermC,
- { SB_Break , SB_BAfter , SB_SCS , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_Break }, // SB_SCS,
- { SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break }, // SB_BAfter,
+static const uchar breakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::LineBreak_JT + 1] = {
+/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */
+/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB },
+/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, IB, IB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
+/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
+/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* PR */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB },
+/* PO */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* HY */ { DB, PB, PB, IB, DB, IB, PB, PB, PB, DB, DB, IB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* BA */ { DB, PB, PB, IB, DB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
+/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB },
+/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
+/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
+/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
+/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB },
+/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB },
+/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
+/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }
};
-static void calcSentenceBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
+} // namespace LB
+
+static void getLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
{
- quint32 brk = sentenceBreakTable[SB_Initial][QUnicodeTables::sentenceBreakClass(string[0])];
- attributes[0].sentenceBoundary = true;
- for (quint32 i = 1; i < len; ++i) {
- if (!attributes[i].charStop) {
- attributes[i].sentenceBoundary = false;
- continue;
- }
- brk = sentenceBreakTable[brk][QUnicodeTables::sentenceBreakClass(string[i])];
- if (brk == SB_Lookup) {
- brk = SB_Break;
- quint32 lookahead = i + 1;
- while (lookahead < len) {
- quint32 sbrk = QUnicodeTables::sentenceBreakClass(string[lookahead]);
- if (sbrk != QUnicodeTables::SentenceBreakOther
- && sbrk != QUnicodeTables::SentenceBreakNumeric
- && sbrk != QUnicodeTables::SentenceBreakClose) {
- break;
- } else if (sbrk == QUnicodeTables::SentenceBreakLower) {
- brk = SB_Initial;
- break;
- }
- ++lookahead;
- }
- if (brk == SB_Initial) {
- while (i < lookahead)
- attributes[i++].sentenceBoundary = false;
+ uint lucs4 = 0;
+ QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
+ QUnicodeTables::LineBreakClass cls = lcls;
+ for (quint32 i = 0; i != len; ++i) {
+ quint32 pos = i;
+ uint ucs4 = string[i];
+ if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
+ ushort low = string[i + 1];
+ if (QChar::isLowSurrogate(low)) {
+ ucs4 = QChar::surrogateToUcs4(ucs4, low);
+ ++i;
}
}
- if (brk == SB_Break) {
- attributes[i].sentenceBoundary = true;
- brk = sentenceBreakTable[SB_Initial][QUnicodeTables::sentenceBreakClass(string[i])];
- } else {
- attributes[i].sentenceBoundary = false;
+
+ const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
+ QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
+
+ if (ncls == QUnicodeTables::LineBreak_SA) {
+ // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
+ static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
+ if (FLAG(prop->category) & test)
+ ncls = QUnicodeTables::LineBreak_CM;
+ }
+ if (ncls == QUnicodeTables::LineBreak_CM) {
+ // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
+ if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP)
+ ncls = QUnicodeTables::LineBreak_AL;
+ }
+
+ HB_LineBreakType lineBreakType = HB_NoBreak;
+
+ if (lcls >= QUnicodeTables::LineBreak_CR) {
+ // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
+ if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
+ lineBreakType = HB_ForcedBreak;
+ goto next;
+ }
+
+ if (ncls >= QUnicodeTables::LineBreak_SP) {
+ if (ncls > QUnicodeTables::LineBreak_SP)
+ goto next; // LB6: x(BK|CR|LF|NL)
+ goto next_no_cls_update; // LB7: xSP
}
+
+ // for South East Asian chars that require a complex analysis, the Unicode
+ // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
+ if (cls >= QUnicodeTables::LineBreak_SA)
+ cls = QUnicodeTables::LineBreak_AL;
+
+ switch (LB::breakTable[cls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) {
+ case LB::DirectBreak:
+ lineBreakType = HB_Break;
+ if (lucs4 == 0x00ad) // soft hyphen
+ lineBreakType = HB_SoftHyphen;
+ break;
+ case LB::IndirectBreak:
+ if (lcls == QUnicodeTables::LineBreak_SP)
+ lineBreakType = HB_Break;
+ break;
+ case LB::CombiningIndirectBreak:
+ if (lcls != QUnicodeTables::LineBreak_SP)
+ goto next_no_cls_update;
+ lineBreakType = HB_Break;
+ break;
+ case LB::CombiningProhibitedBreak:
+ if (lcls != QUnicodeTables::LineBreak_SP)
+ goto next_no_cls_update;
+ break;
+ case LB::ProhibitedBreak:
+ // nothing to do
+ default:
+ break;
+ }
+
+ next:
+ cls = ncls;
+ lucs4 = ucs4;
+ next_no_cls_update:
+ lcls = ncls;
+ if (lineBreakType != HB_NoBreak)
+ attributes[pos].lineBreakType = lineBreakType;
}
+
+ attributes[0].lineBreakType = HB_NoBreak; // LB2
}
@@ -400,18 +446,17 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
if (length <= 0)
return;
- if (!(options & DontClearAttributes)) {
+ if (!(options & DontClearAttributes))
::memset(attributes, 0, length * sizeof(HB_CharAttributes));
- if (options & (WordBreaks | SentenceBreaks))
- options |= GraphemeBreaks;
- }
- if (options & (GraphemeBreaks | LineBreaks))
- calcGraphemeAndLineBreaks(string, length, attributes);
+ if (options & GraphemeBreaks)
+ getGraphemeBreaks(string, length, attributes);
if (options & WordBreaks)
- calcWordBreaks(string, length, attributes);
+ getWordBreaks(string, length, attributes);
if (options & SentenceBreaks)
- calcSentenceBreaks(string, length, attributes);
+ getSentenceBreaks(string, length, attributes);
+ if (options & LineBreaks)
+ getLineBreaks(string, length, attributes);
if (options & WhiteSpaces)
getWhiteSpaces(string, length, attributes);