summaryrefslogtreecommitdiffstats
path: root/src/corelib/text/qunicodetools.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/text/qunicodetools.cpp')
-rw-r--r--src/corelib/text/qunicodetools.cpp311
1 files changed, 207 insertions, 104 deletions
diff --git a/src/corelib/text/qunicodetools.cpp b/src/corelib/text/qunicodetools.cpp
index beef159daa..2917804830 100644
--- a/src/corelib/text/qunicodetools.cpp
+++ b/src/corelib/text/qunicodetools.cpp
@@ -17,7 +17,12 @@ QT_BEGIN_NAMESPACE
using namespace Qt::StringLiterals;
-Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0;
+#ifdef QT_BUILD_INTERNAL
+Q_CONSTINIT Q_AUTOTEST_EXPORT
+#else
+constexpr
+#endif
+int qt_initcharattributes_default_algorithm_only = 0;
namespace QUnicodeTools {
@@ -254,7 +259,6 @@ static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
-#ifdef QT_BUILD_INTERNAL
if (qt_initcharattributes_default_algorithm_only) {
// as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
// which caused "hi.there" to be treated like if it were just a single word;
@@ -265,7 +269,6 @@ static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes
else if (ucs4 == 0x003A) // COLON
ncls = QUnicodeTables::WordBreak_MidLetter;
}
-#endif
uchar action = WB::breakTable[cls][ncls];
switch (action) {
@@ -556,45 +559,49 @@ enum Action {
IndirectBreakIfNarrow, IN = IndirectBreakIfNarrow, // For LB30
};
+// See https://www.unicode.org/reports/tr14/tr14-37.html for the information
+// about the table. It was removed in the later versions of the standard.
static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ] = {
-/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM*/
-/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
-/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
-/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
-/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB },
-/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* NU */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* AL */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* HL */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
-/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
-/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB },
-/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
-/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
-/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
-/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB },
-/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
-/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
-/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB },
-/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* EB */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* EM */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* 1↓ 2→ OP CL CP QU +Pi +Pf GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM*/
+/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
+/* CL */ { DB, PB, PB, IB, IB, PB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* CP */ { DB, PB, PB, IB, IB, PB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* QU */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
+/* +Pi*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
+/* +Pf*/ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
+/* GL */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
+/* NS */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* EX */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* SY */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* IS */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* PR */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB },
+/* PO */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* NU */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* AL */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* HL */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* ID */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* IN */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* HY */ { HH, PB, PB, IB, IB, PB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
+/* BA */ { HH, PB, PB, IB, IB, PB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
+/* BB */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB },
+/* B2 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* CM */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* WJ */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
+/* H2 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
+/* H3 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
+/* JL */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB },
+/* JV */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
+/* JT */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
+/* RI */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB },
+/* CB */ { DB, PB, PB, IB, IB, PB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* EB */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* EM */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
};
// The following line break classes are not treated by the pair table
// and must be resolved outside:
-// AI, BK, CB, CJ, CR, LF, NL, ZWJ, SA, SG, SP, XX
+// AI, AK, AP, AS, BK, CB, CJ, CR, LF, NL, SA, SG, SP, VF, VI, XX, ZWJ
} // namespace LB
@@ -654,6 +661,61 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes
ncls = QUnicodeTables::LineBreak_CM;
}
+ if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU)) {
+ if (prop->category == QChar::Punctuation_InitialQuote) {
+ // LB15a: Do not break after an unresolved initial punctuation
+ // that lies at the start of the line, after a space, after
+ // opening punctuation, or after an unresolved quotation mark,
+ // even after spaces.
+ // (sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW)
+ // [\p{Pi}&QU] SP* ×
+ // Note: sot is treated as LF here due to initial loop setup.
+ constexpr QUnicodeTables::LineBreakClass lb15a[] = {
+ QUnicodeTables::LineBreak_BK, QUnicodeTables::LineBreak_CR,
+ QUnicodeTables::LineBreak_LF, QUnicodeTables::LineBreak_OP,
+ QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi,
+ QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_GL,
+ QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_ZW};
+ if (std::any_of(std::begin(lb15a), std::end(lb15a),
+ [lcls](auto x) { return x == lcls; })) {
+ ncls = QUnicodeTables::LineBreak_QU_Pi;
+ }
+ } else if (prop->category == QChar::Punctuation_FinalQuote) {
+ // LB15b: Do not break before an unresolved final punctuation
+ // that lies at the end of the line, before a space, before
+ // a prohibited break, or before an unresolved quotation mark,
+ // even after spaces.
+ // × [\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS
+ // | SY | BK | CR | LF | NL | ZW | eot)
+ auto nncls = QUnicodeTables::LineBreak_LF;
+
+ if (i + 1 < len) {
+ char32_t c = string[i + 1];
+ if (QChar::isHighSurrogate(c) && i + 2 != len) {
+ ushort low = string[i + 2];
+ if (QChar::isLowSurrogate(low))
+ c = QChar::surrogateToUcs4(c, low);
+ }
+ nncls = QUnicodeTables::LineBreakClass(
+ QUnicodeTables::properties(c)->lineBreakClass);
+ }
+
+ constexpr QUnicodeTables::LineBreakClass lb15b[] = {
+ QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_GL,
+ QUnicodeTables::LineBreak_WJ, QUnicodeTables::LineBreak_CL,
+ QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi,
+ QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_CP,
+ QUnicodeTables::LineBreak_EX, QUnicodeTables::LineBreak_IS,
+ QUnicodeTables::LineBreak_SY, QUnicodeTables::LineBreak_BK,
+ QUnicodeTables::LineBreak_CR, QUnicodeTables::LineBreak_LF,
+ QUnicodeTables::LineBreak_ZW};
+ if (std::any_of(std::begin(lb15b), std::end(lb15b),
+ [nncls](auto x) { return x == nncls; })) {
+ ncls = QUnicodeTables::LineBreak_QU_Pf;
+ }
+ }
+ }
+
if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
// LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
@@ -1263,12 +1325,12 @@ static inline Form form(unsigned short uc) {
static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
{
*invalid = false;
- IDEBUG("indic_nextSyllableBoundary: start=%d, end=%d", int(start), int(end));
+ IDEBUG("indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
const char16_t *uc = s+start;
qsizetype pos = 0;
Form state = form(uc[pos]);
- IDEBUG("state[%d]=%d (uc=%4x)", int(pos), state, uc[pos]);
+ IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
pos++;
if (state != Consonant && state != IndependentVowel) {
@@ -1279,7 +1341,7 @@ static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t
while (pos < end - start) {
Form newState = form(uc[pos]);
- IDEBUG("state[%d]=%d (uc=%4x)", int(pos), newState, uc[pos]);
+ IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
switch (newState) {
case Control:
newState = state;
@@ -1352,6 +1414,7 @@ static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t
// ### needs proper testing for correct two/three part matras
break;
}
+ Q_FALLTHROUGH();
case IndependentVowel:
case Invalid:
case Other:
@@ -1386,6 +1449,8 @@ static void indicAttributes(QChar::Script script, const char16_t *text, qsizetyp
}
+#if QT_CONFIG(library)
+
#define LIBTHAI_MAJOR 0
/*
@@ -1396,27 +1461,74 @@ struct thcell_t {
unsigned char hilo; /**< upper/lower vowel/diacritic */
unsigned char top; /**< top-level mark */
};
-typedef int (*th_brk_def) (const unsigned char*, int*, size_t);
-typedef size_t (*th_next_cell_def) (const unsigned char *, size_t, struct thcell_t *, int);
-/* libthai related function handles */
-Q_CONSTINIT static th_brk_def th_brk = nullptr;
-Q_CONSTINIT static th_next_cell_def th_next_cell = nullptr;
+using ThBrk = struct _ThBrk;
-static int init_libthai() {
-#if QT_CONFIG(library)
- Q_CONSTINIT static bool initialized = false;
- if (!initialized && (!th_brk || !th_next_cell)) {
- th_brk = reinterpret_cast<th_brk_def>(QLibrary::resolve("thai"_L1, static_cast<int>(LIBTHAI_MAJOR), "th_brk"));
- th_next_cell = (th_next_cell_def)QLibrary::resolve("thai"_L1, LIBTHAI_MAJOR, "th_next_cell");
- initialized = true;
+namespace {
+
+class LibThai final
+{
+ Q_DISABLE_COPY_MOVE(LibThai)
+
+ using th_brk_new_def = ThBrk *(*)(const char *);
+ using th_brk_delete_def = void (*)(ThBrk *);
+ using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t);
+ using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);
+
+public:
+ LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR)
+ {
+ m_th_brk_find_breaks =
+ reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve("th_brk_find_breaks"));
+ m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve("th_next_cell"));
+
+ auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve("th_brk_new"));
+ if (th_brk_new) {
+ m_state = th_brk_new(nullptr);
+ m_th_brk_delete =
+ reinterpret_cast<th_brk_delete_def>(m_library.resolve("th_brk_delete"));
+ }
}
- if (th_brk && th_next_cell)
- return 1;
- else
-#endif
- return 0;
-}
+
+ ~LibThai()
+ {
+ if (m_state && m_th_brk_delete)
+ m_th_brk_delete(m_state);
+ m_library.unload();
+ }
+
+ bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
+
+ int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const
+ {
+ Q_ASSERT(m_state);
+ Q_ASSERT(m_th_brk_find_breaks);
+ return m_th_brk_find_breaks(m_state, s, pos, pos_sz);
+ }
+
+ size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am)
+ {
+ Q_ASSERT(m_th_next_cell);
+ return m_th_next_cell(s, len, cell, is_decomp_am);
+ }
+
+private:
+ QLibrary m_library;
+
+ // Global state for th_brk_find_breaks().
+ // Note: even if signature for th_brk_find_breaks() suggests otherwise, the
+ // state is read-only, and so it is safe to use it from multiple threads after
+ // initialization. This is also stated in the libthai documentation.
+ ThBrk *m_state = nullptr;
+
+ th_brk_find_breaks_def m_th_brk_find_breaks = nullptr;
+ th_next_cell_def m_th_next_cell = nullptr;
+ th_brk_delete_def m_th_brk_delete = nullptr;
+};
+
+} // unnamed namespace
+
+Q_GLOBAL_STATIC(LibThai, g_libThai)
static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
{
@@ -1440,21 +1552,17 @@ static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
*/
static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
{
- char s[128];
- char *cstr = s;
- int *break_positions = nullptr;
- int brp[128];
- int brp_size = 0;
- qsizetype numbreaks, i, j, cell_length;
+ constexpr qsizetype Prealloc = 128;
+ QVarLengthArray<char, Prealloc + 1> s(len + 1);
+ QVarLengthArray<int, Prealloc> break_positions(len);
+ qsizetype numbreaks, i;
struct thcell_t tis_cell;
- if (!init_libthai())
- return ;
-
- if (len >= 128)
- cstr = static_cast<char *>(malloc (len * sizeof(char) + 1));
+ LibThai *libThai = g_libThai;
+ if (!libThai || !libThai->isInitialized())
+ return;
- to_tis620(string, len, cstr);
+ to_tis620(string, len, s.data());
for (i = 0; i < len; ++i) {
attributes[i].wordBreak = false;
@@ -1463,58 +1571,53 @@ static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAtt
attributes[i].lineBreak = false;
}
- if (len > 128) {
- break_positions = static_cast<int *>(malloc (sizeof(int) * len));
- memset (break_positions, 0, sizeof(int) * len);
- brp_size = len;
- }
- else {
- break_positions = brp;
- brp_size = 128;
- }
-
- if (break_positions) {
- attributes[0].wordBreak = true;
- attributes[0].wordStart = true;
- attributes[0].wordEnd = false;
- numbreaks = th_brk(reinterpret_cast<const unsigned char *>(cstr), break_positions, brp_size);
- for (i = 0; i < numbreaks; ++i) {
- attributes[break_positions[i]].wordBreak = true;
- attributes[break_positions[i]].wordStart = true;
- attributes[break_positions[i]].wordEnd = true;
- attributes[break_positions[i]].lineBreak = true;
- }
- if (numbreaks > 0)
- attributes[break_positions[numbreaks - 1]].wordStart = false;
-
- if (break_positions != brp)
- free(break_positions);
+ attributes[0].wordBreak = true;
+ attributes[0].wordStart = true;
+ attributes[0].wordEnd = false;
+ numbreaks = libThai->brk_find_breaks(reinterpret_cast<const unsigned char *>(s.data()),
+ break_positions.data(),
+ static_cast<size_t>(break_positions.size()));
+ for (i = 0; i < numbreaks; ++i) {
+ attributes[break_positions[i]].wordBreak = true;
+ attributes[break_positions[i]].wordStart = true;
+ attributes[break_positions[i]].wordEnd = true;
+ attributes[break_positions[i]].lineBreak = true;
}
+ if (numbreaks > 0)
+ attributes[break_positions[numbreaks - 1]].wordStart = false;
/* manage grapheme boundaries */
i = 0;
while (i < len) {
- cell_length = static_cast<uint>(th_next_cell(reinterpret_cast<const unsigned char *>(cstr) + i, len - i, &tis_cell, true));
-
+ size_t cell_length =
+ libThai->next_cell(reinterpret_cast<const unsigned char *>(s.data()) + i,
+ size_t(len - i), &tis_cell, true);
attributes[i].graphemeBoundary = true;
- for (j = 1; j < cell_length; j++)
+ for (size_t j = 1; j < cell_length; ++j)
attributes[i + j].graphemeBoundary = false;
i += cell_length;
}
-
- if (len >= 128)
- free(cstr);
}
+#endif // QT_CONFIG(library)
+
static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
{
assert(script == QChar::Script_Thai);
+#if QT_CONFIG(library)
const char16_t *uc = text + from;
attributes += from;
Q_UNUSED(script);
thaiAssignAttributes(uc, len, attributes);
+#else
+ Q_UNUSED(script);
+ Q_UNUSED(text);
+ Q_UNUSED(from);
+ Q_UNUSED(len);
+ Q_UNUSED(attributes);
+#endif
}
/*
@@ -1825,7 +1928,7 @@ static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start
if (pos == start)
*invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
- MMDEBUG("state[%d]=%d class=%8x (uc=%4x)", int(pos - start), state, charClass, *uc);
+ MMDEBUG("state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
if (state < 0) {
if (state < -1)
@@ -2160,7 +2263,7 @@ static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start,
}
state = khmerStateTable[state][charClass & CF_CLASS_MASK];
- KHDEBUG("state[%d]=%d class=%8lx (uc=%4x)", int(pos - start), state,
+ KHDEBUG("state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
charClass, *uc );
if (state < 0) {