diff options
Diffstat (limited to 'src/corelib/text/qunicodetools.cpp')
-rw-r--r-- | src/corelib/text/qunicodetools.cpp | 311 |
1 files changed, 207 insertions, 104 deletions
diff --git a/src/corelib/text/qunicodetools.cpp b/src/corelib/text/qunicodetools.cpp index beef159daa..2917804830 100644 --- a/src/corelib/text/qunicodetools.cpp +++ b/src/corelib/text/qunicodetools.cpp @@ -17,7 +17,12 @@ QT_BEGIN_NAMESPACE using namespace Qt::StringLiterals; -Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0; +#ifdef QT_BUILD_INTERNAL +Q_CONSTINIT Q_AUTOTEST_EXPORT +#else +constexpr +#endif +int qt_initcharattributes_default_algorithm_only = 0; namespace QUnicodeTools { @@ -254,7 +259,6 @@ static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4); QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass; -#ifdef QT_BUILD_INTERNAL if (qt_initcharattributes_default_algorithm_only) { // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet // which caused "hi.there" to be treated like if it were just a single word; @@ -265,7 +269,6 @@ static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes else if (ucs4 == 0x003A) // COLON ncls = QUnicodeTables::WordBreak_MidLetter; } -#endif uchar action = WB::breakTable[cls][ncls]; switch (action) { @@ -556,45 +559,49 @@ enum Action { IndirectBreakIfNarrow, IN = IndirectBreakIfNarrow, // For LB30 }; +// See https://www.unicode.org/reports/tr14/tr14-37.html for the information +// about the table. It was removed in the later versions of the standard. static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ] = { -/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM*/ -/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB }, -/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, -/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, -/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB }, -/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* NU */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* AL */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* HL */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB }, -/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB }, -/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB }, -/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, -/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB }, -/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB }, -/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB }, -/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB }, -/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB }, -/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB }, -/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* EB */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* EM */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* 1↓ 2→ OP CL CP QU +Pi +Pf GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM*/ +/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB }, +/* CL */ { DB, PB, PB, IB, IB, PB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* CP */ { DB, PB, PB, IB, IB, PB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* QU */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, +/* +Pi*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB }, +/* +Pf*/ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, +/* GL */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, +/* NS */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* EX */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* SY */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* IS */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* PR */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB }, +/* PO */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* NU */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* AL */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* HL */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* ID */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* IN */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* HY */ { HH, PB, PB, IB, IB, PB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB }, +/* BA */ { HH, PB, PB, IB, IB, PB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB }, +/* BB */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB }, +/* B2 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* CM */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* WJ */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, +/* H2 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB }, +/* H3 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB }, +/* JL */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB }, +/* JV */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB }, +/* JT */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB }, +/* RI */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB }, +/* CB */ { DB, PB, PB, IB, IB, PB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* EB */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* EM */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, }; // The following line break classes are not treated by the pair table // and must be resolved outside: -// AI, BK, CB, CJ, CR, LF, NL, ZWJ, SA, SG, SP, XX +// AI, AK, AP, AS, BK, CB, CJ, CR, LF, NL, SA, SG, SP, VF, VI, XX, ZWJ } // namespace LB @@ -654,6 +661,61 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes ncls = QUnicodeTables::LineBreak_CM; } + if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU)) { + if (prop->category == QChar::Punctuation_InitialQuote) { + // LB15a: Do not break after an unresolved initial punctuation + // that lies at the start of the line, after a space, after + // opening punctuation, or after an unresolved quotation mark, + // even after spaces. + // (sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW) + // [\p{Pi}&QU] SP* × + // Note: sot is treated as LF here due to initial loop setup. + constexpr QUnicodeTables::LineBreakClass lb15a[] = { + QUnicodeTables::LineBreak_BK, QUnicodeTables::LineBreak_CR, + QUnicodeTables::LineBreak_LF, QUnicodeTables::LineBreak_OP, + QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi, + QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_GL, + QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_ZW}; + if (std::any_of(std::begin(lb15a), std::end(lb15a), + [lcls](auto x) { return x == lcls; })) { + ncls = QUnicodeTables::LineBreak_QU_Pi; + } + } else if (prop->category == QChar::Punctuation_FinalQuote) { + // LB15b: Do not break before an unresolved final punctuation + // that lies at the end of the line, before a space, before + // a prohibited break, or before an unresolved quotation mark, + // even after spaces. + // × [\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS + // | SY | BK | CR | LF | NL | ZW | eot) + auto nncls = QUnicodeTables::LineBreak_LF; + + if (i + 1 < len) { + char32_t c = string[i + 1]; + if (QChar::isHighSurrogate(c) && i + 2 != len) { + ushort low = string[i + 2]; + if (QChar::isLowSurrogate(low)) + c = QChar::surrogateToUcs4(c, low); + } + nncls = QUnicodeTables::LineBreakClass( + QUnicodeTables::properties(c)->lineBreakClass); + } + + constexpr QUnicodeTables::LineBreakClass lb15b[] = { + QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_GL, + QUnicodeTables::LineBreak_WJ, QUnicodeTables::LineBreak_CL, + QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi, + QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_CP, + QUnicodeTables::LineBreak_EX, QUnicodeTables::LineBreak_IS, + QUnicodeTables::LineBreak_SY, QUnicodeTables::LineBreak_BK, + QUnicodeTables::LineBreak_CR, QUnicodeTables::LineBreak_LF, + QUnicodeTables::LineBreak_ZW}; + if (std::any_of(std::begin(lb15b), std::end(lb15b), + [nncls](auto x) { return x == nncls; })) { + ncls = QUnicodeTables::LineBreak_QU_Pf; + } + } + } + if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) { // LB4: BK!, LB5: (CRxLF|CR|LF|NL)! if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF) @@ -1263,12 +1325,12 @@ static inline Form form(unsigned short uc) { static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid) { *invalid = false; - IDEBUG("indic_nextSyllableBoundary: start=%d, end=%d", int(start), int(end)); + IDEBUG("indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end)); const char16_t *uc = s+start; qsizetype pos = 0; Form state = form(uc[pos]); - IDEBUG("state[%d]=%d (uc=%4x)", int(pos), state, uc[pos]); + IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]); pos++; if (state != Consonant && state != IndependentVowel) { @@ -1279,7 +1341,7 @@ static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t while (pos < end - start) { Form newState = form(uc[pos]); - IDEBUG("state[%d]=%d (uc=%4x)", int(pos), newState, uc[pos]); + IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]); switch (newState) { case Control: newState = state; @@ -1352,6 +1414,7 @@ static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t // ### needs proper testing for correct two/three part matras break; } + Q_FALLTHROUGH(); case IndependentVowel: case Invalid: case Other: @@ -1386,6 +1449,8 @@ static void indicAttributes(QChar::Script script, const char16_t *text, qsizetyp } +#if QT_CONFIG(library) + #define LIBTHAI_MAJOR 0 /* @@ -1396,27 +1461,74 @@ struct thcell_t { unsigned char hilo; /**< upper/lower vowel/diacritic */ unsigned char top; /**< top-level mark */ }; -typedef int (*th_brk_def) (const unsigned char*, int*, size_t); -typedef size_t (*th_next_cell_def) (const unsigned char *, size_t, struct thcell_t *, int); -/* libthai related function handles */ -Q_CONSTINIT static th_brk_def th_brk = nullptr; -Q_CONSTINIT static th_next_cell_def th_next_cell = nullptr; +using ThBrk = struct _ThBrk; -static int init_libthai() { -#if QT_CONFIG(library) - Q_CONSTINIT static bool initialized = false; - if (!initialized && (!th_brk || !th_next_cell)) { - th_brk = reinterpret_cast<th_brk_def>(QLibrary::resolve("thai"_L1, static_cast<int>(LIBTHAI_MAJOR), "th_brk")); - th_next_cell = (th_next_cell_def)QLibrary::resolve("thai"_L1, LIBTHAI_MAJOR, "th_next_cell"); - initialized = true; +namespace { + +class LibThai final +{ + Q_DISABLE_COPY_MOVE(LibThai) + + using th_brk_new_def = ThBrk *(*)(const char *); + using th_brk_delete_def = void (*)(ThBrk *); + using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t); + using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int); + +public: + LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR) + { + m_th_brk_find_breaks = + reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve("th_brk_find_breaks")); + m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve("th_next_cell")); + + auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve("th_brk_new")); + if (th_brk_new) { + m_state = th_brk_new(nullptr); + m_th_brk_delete = + reinterpret_cast<th_brk_delete_def>(m_library.resolve("th_brk_delete")); + } } - if (th_brk && th_next_cell) - return 1; - else -#endif - return 0; -} + + ~LibThai() + { + if (m_state && m_th_brk_delete) + m_th_brk_delete(m_state); + m_library.unload(); + } + + bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; } + + int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const + { + Q_ASSERT(m_state); + Q_ASSERT(m_th_brk_find_breaks); + return m_th_brk_find_breaks(m_state, s, pos, pos_sz); + } + + size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am) + { + Q_ASSERT(m_th_next_cell); + return m_th_next_cell(s, len, cell, is_decomp_am); + } + +private: + QLibrary m_library; + + // Global state for th_brk_find_breaks(). + // Note: even if signature for th_brk_find_breaks() suggests otherwise, the + // state is read-only, and so it is safe to use it from multiple threads after + // initialization. This is also stated in the libthai documentation. + ThBrk *m_state = nullptr; + + th_brk_find_breaks_def m_th_brk_find_breaks = nullptr; + th_next_cell_def m_th_next_cell = nullptr; + th_brk_delete_def m_th_brk_delete = nullptr; +}; + +} // unnamed namespace + +Q_GLOBAL_STATIC(LibThai, g_libThai) static void to_tis620(const char16_t *string, qsizetype len, char *cstr) { @@ -1440,21 +1552,17 @@ static void to_tis620(const char16_t *string, qsizetype len, char *cstr) */ static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes) { - char s[128]; - char *cstr = s; - int *break_positions = nullptr; - int brp[128]; - int brp_size = 0; - qsizetype numbreaks, i, j, cell_length; + constexpr qsizetype Prealloc = 128; + QVarLengthArray<char, Prealloc + 1> s(len + 1); + QVarLengthArray<int, Prealloc> break_positions(len); + qsizetype numbreaks, i; struct thcell_t tis_cell; - if (!init_libthai()) - return ; - - if (len >= 128) - cstr = static_cast<char *>(malloc (len * sizeof(char) + 1)); + LibThai *libThai = g_libThai; + if (!libThai || !libThai->isInitialized()) + return; - to_tis620(string, len, cstr); + to_tis620(string, len, s.data()); for (i = 0; i < len; ++i) { attributes[i].wordBreak = false; @@ -1463,58 +1571,53 @@ static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAtt attributes[i].lineBreak = false; } - if (len > 128) { - break_positions = static_cast<int *>(malloc (sizeof(int) * len)); - memset (break_positions, 0, sizeof(int) * len); - brp_size = len; - } - else { - break_positions = brp; - brp_size = 128; - } - - if (break_positions) { - attributes[0].wordBreak = true; - attributes[0].wordStart = true; - attributes[0].wordEnd = false; - numbreaks = th_brk(reinterpret_cast<const unsigned char *>(cstr), break_positions, brp_size); - for (i = 0; i < numbreaks; ++i) { - attributes[break_positions[i]].wordBreak = true; - attributes[break_positions[i]].wordStart = true; - attributes[break_positions[i]].wordEnd = true; - attributes[break_positions[i]].lineBreak = true; - } - if (numbreaks > 0) - attributes[break_positions[numbreaks - 1]].wordStart = false; - - if (break_positions != brp) - free(break_positions); + attributes[0].wordBreak = true; + attributes[0].wordStart = true; + attributes[0].wordEnd = false; + numbreaks = libThai->brk_find_breaks(reinterpret_cast<const unsigned char *>(s.data()), + break_positions.data(), + static_cast<size_t>(break_positions.size())); + for (i = 0; i < numbreaks; ++i) { + attributes[break_positions[i]].wordBreak = true; + attributes[break_positions[i]].wordStart = true; + attributes[break_positions[i]].wordEnd = true; + attributes[break_positions[i]].lineBreak = true; } + if (numbreaks > 0) + attributes[break_positions[numbreaks - 1]].wordStart = false; /* manage grapheme boundaries */ i = 0; while (i < len) { - cell_length = static_cast<uint>(th_next_cell(reinterpret_cast<const unsigned char *>(cstr) + i, len - i, &tis_cell, true)); - + size_t cell_length = + libThai->next_cell(reinterpret_cast<const unsigned char *>(s.data()) + i, + size_t(len - i), &tis_cell, true); attributes[i].graphemeBoundary = true; - for (j = 1; j < cell_length; j++) + for (size_t j = 1; j < cell_length; ++j) attributes[i + j].graphemeBoundary = false; i += cell_length; } - - if (len >= 128) - free(cstr); } +#endif // QT_CONFIG(library) + static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes) { assert(script == QChar::Script_Thai); +#if QT_CONFIG(library) const char16_t *uc = text + from; attributes += from; Q_UNUSED(script); thaiAssignAttributes(uc, len, attributes); +#else + Q_UNUSED(script); + Q_UNUSED(text); + Q_UNUSED(from); + Q_UNUSED(len); + Q_UNUSED(attributes); +#endif } /* @@ -1825,7 +1928,7 @@ static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start if (pos == start) *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE); - MMDEBUG("state[%d]=%d class=%8x (uc=%4x)", int(pos - start), state, charClass, *uc); + MMDEBUG("state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc); if (state < 0) { if (state < -1) @@ -2160,7 +2263,7 @@ static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, } state = khmerStateTable[state][charClass & CF_CLASS_MASK]; - KHDEBUG("state[%d]=%d class=%8lx (uc=%4x)", int(pos - start), state, + KHDEBUG("state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state, charClass, *uc ); if (state < 0) { |