summary | refs | log | tree | commit | diff | stats
path: root/src/corelib/text/qunicodetools.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/text/qunicodetools.cpp')
-rw-r--r-- src/corelib/text/qunicodetools.cpp 520
1 files changed, 316 insertions, 204 deletions
diff --git a/src/corelib/text/qunicodetools.cpp b/src/corelib/text/qunicodetools.cpp
index 31680932f1..2917804830 100644
--- a/src/corelib/text/qunicodetools.cpp
+++ b/src/corelib/text/qunicodetools.cpp
@@ -1,53 +1,28 @@
-/****************************************************************************
-**
-** Copyright (C) 2020 The Qt Company Ltd.
-** Contact: https://www.qt.io/licensing/
-**
-** This file is part of the QtCore module of the Qt Toolkit.
-**
-** $QT_BEGIN_LICENSE:LGPL$
-** Commercial License Usage
-** Licensees holding valid commercial Qt licenses may use this file in
-** accordance with the commercial license agreement provided with the
-** Software or, alternatively, in accordance with the terms contained in
-** a written agreement between you and The Qt Company. For licensing terms
-** and conditions see https://www.qt.io/terms-conditions. For further
-** information use the contact form at https://www.qt.io/contact-us.
-**
-** GNU Lesser General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU Lesser
-** General Public License version 3 as published by the Free Software
-** Foundation and appearing in the file LICENSE.LGPL3 included in the
-** packaging of this file. Please review the following information to
-** ensure the GNU Lesser General Public License version 3 requirements
-** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
-**
-** GNU General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU
-** General Public License version 2.0 or (at your option) the GNU General
-** Public license version 3 or any later version approved by the KDE Free
-** Qt Foundation. The licenses are as published by the Free Software
-** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
-** included in the packaging of this file. Please review the following
-** information to ensure the GNU General Public License requirements will
-** be met: https://www.gnu.org/licenses/gpl-2.0.html and
-** https://www.gnu.org/licenses/gpl-3.0.html.
-**
-** $QT_END_LICENSE$
-**
-****************************************************************************/
+// Copyright (C) 2020 The Qt Company Ltd.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#include "qunicodetools_p.h"
#include "qunicodetables_p.h"
#include "qvarlengtharray.h"
+#if QT_CONFIG(library)
#include "qlibrary.h"
+#endif
+
+#include <limits.h>
#define FLAG(x) (1 << (x))
QT_BEGIN_NAMESPACE
-Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0;
+using namespace Qt::StringLiterals;
+
+#ifdef QT_BUILD_INTERNAL
+Q_CONSTINIT Q_AUTOTEST_EXPORT
+#else
+constexpr
+#endif
+int qt_initcharattributes_default_algorithm_only = 0;
namespace QUnicodeTools {
@@ -161,51 +136,63 @@ static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttrib
QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
bool shouldBreak = GB::shouldBreakBetweenClasses(lcls, cls);
+ bool handled = false;
switch (state) {
case GB::State::Normal:
- if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11
- if (cls == QUnicodeTables::GraphemeBreak_Extend) {
- state = GB::State::GB11_ExtPicExt;
- Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
- } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
- state = GB::State::GB11_ExtPicExtZWJ;
- Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
- }
- } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
- state = GB::State::GB12_13_RI;
- }
+ break; // will deal with it below
- break;
case GB::State::GB11_ExtPicExt:
Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
if (cls == QUnicodeTables::GraphemeBreak_Extend) {
// keep going in the current state
Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
+ handled = true;
} else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
state = GB::State::GB11_ExtPicExtZWJ;
Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
+ handled = true;
+ } else {
+ state = GB::State::Normal;
}
-
break;
case GB::State::GB11_ExtPicExtZWJ:
Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
- if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic)
+ if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) {
shouldBreak = false;
+ handled = true;
+ }
state = GB::State::Normal;
break;
case GB::State::GB12_13_RI:
Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator);
- if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator)
+ if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) {
shouldBreak = false;
+ handled = true;
+ }
state = GB::State::Normal;
break;
}
+ if (!handled) {
+ Q_ASSERT(state == GB::State::Normal);
+ if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11
+ if (cls == QUnicodeTables::GraphemeBreak_Extend) {
+ state = GB::State::GB11_ExtPicExt;
+ Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
+ } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
+ state = GB::State::GB11_ExtPicExtZWJ;
+ Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
+ }
+ } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
+ state = GB::State::GB12_13_RI;
+ }
+ }
+
if (shouldBreak)
attributes[pos].graphemeBoundary = true;
@@ -226,30 +213,26 @@ enum Action {
};
static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = {
-// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet E_Base E_Mod GAZ EBG WSeg
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
- { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
- { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
- { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, NoBreak, Break }, // ZWJ
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break }, // Katakana
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // HebrewLetter
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // ALetter
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // Numeric
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // ExtendNumLet
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break }, // E_Base
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // E_Mod
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // GAZ
- { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break }, // EBG
- { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // WSeg
+// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
+ { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
+ { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
+ { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break }, // HebrewLetter
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break }, // ALetter
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break }, // Numeric
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break }, // ExtendNumLet
+ { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace
};
} // namespace WB
@@ -261,6 +244,8 @@ static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes
} currentWordType = WordTypeNone;
QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
+ auto real_cls = cls; // Unaffected by WB4
+
for (qsizetype i = 0; i != len; ++i) {
qsizetype pos = i;
char32_t ucs4 = string[i];
@@ -274,7 +259,6 @@ static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
-#ifdef QT_BUILD_INTERNAL
if (qt_initcharattributes_default_algorithm_only) {
// as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
// which caused "hi.there" to be treated like if it were just a single word;
@@ -285,22 +269,32 @@ static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes
else if (ucs4 == 0x003A) // COLON
ncls = QUnicodeTables::WordBreak_MidLetter;
}
-#endif
uchar action = WB::breakTable[cls][ncls];
switch (action) {
case WB::Break:
+ if (Q_UNLIKELY(real_cls == QUnicodeTables::WordBreak_ZWJ
+ && prop->graphemeBreakClass
+ == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
+ // WB3c: ZWJ × \p{Extended_Pictographic}
+ action = WB::NoBreak;
+ }
break;
case WB::NoBreak:
if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend || ncls == QUnicodeTables::WordBreak_ZWJ || ncls == QUnicodeTables::WordBreak_Format)) {
// WB4: X(Extend|Format)* -> X
- if (cls != QUnicodeTables::WordBreak_ZWJ) // WB3c
- continue;
+ real_cls = ncls;
+ continue;
}
if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
// WB15/WB16: break between pairs of Regional indicator
ncls = QUnicodeTables::WordBreak_Any;
}
+ if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_WSegSpace
+ && real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
+ // WB3d should not be affected by WB4
+ action = WB::Break;
+ }
break;
case WB::Lookup:
case WB::LookupW:
@@ -339,6 +333,8 @@ static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes
}
cls = ncls;
+ real_cls = ncls;
+
if (action == WB::Break) {
attributes[pos].wordBreak = true;
if (currentWordType != WordTypeNone)
@@ -559,49 +555,53 @@ enum Action {
IndirectBreak, IB = IndirectBreak,
CombiningIndirectBreak, CI = CombiningIndirectBreak,
CombiningProhibitedBreak, CP = CombiningProhibitedBreak,
- ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen
+ ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen,
+ IndirectBreakIfNarrow, IN = IndirectBreakIfNarrow, // For LB30
};
-static const uchar breakTable[QUnicodeTables::LineBreak_SA][QUnicodeTables::LineBreak_SA] = {
-/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM ZWJ*/
-/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
-/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
-/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
-/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, IB },
-/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB },
-/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB },
-/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB },
-/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
-/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
-/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB },
-/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB },
-/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, IB },
-/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB },
-/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB },
-/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, IB },
-/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* EB */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
-/* EM */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
-/* ZWJ*/ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, IB }
+// See https://www.unicode.org/reports/tr14/tr14-37.html for the information
+// about the table. It was removed in the later versions of the standard.
+static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ] = {
+/* 1↓ 2→ OP CL CP QU +Pi +Pf GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM*/
+/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
+/* CL */ { DB, PB, PB, IB, IB, PB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* CP */ { DB, PB, PB, IB, IB, PB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* QU */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
+/* +Pi*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
+/* +Pf*/ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
+/* GL */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
+/* NS */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* EX */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* SY */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* IS */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* PR */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB },
+/* PO */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* NU */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* AL */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* HL */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* ID */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* IN */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* HY */ { HH, PB, PB, IB, IB, PB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
+/* BA */ { HH, PB, PB, IB, IB, PB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
+/* BB */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB },
+/* B2 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* CM */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* WJ */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
+/* H2 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
+/* H3 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
+/* JL */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB },
+/* JV */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
+/* JT */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
+/* RI */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB },
+/* CB */ { DB, PB, PB, IB, IB, PB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
+/* EB */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
+/* EM */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
};
// The following line break classes are not treated by the pair table
// and must be resolved outside:
-// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
+// AI, AK, AP, AS, BK, CB, CJ, CR, LF, NL, SA, SG, SP, VF, VI, XX, ZWJ
} // namespace LB
@@ -612,6 +612,8 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes
QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
QUnicodeTables::LineBreakClass cls = lcls;
+ const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(U'\n');
+
for (qsizetype i = 0; i != len; ++i) {
qsizetype pos = i;
char32_t ucs4 = string[i];
@@ -659,6 +661,61 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes
ncls = QUnicodeTables::LineBreak_CM;
}
+ if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU)) {
+ if (prop->category == QChar::Punctuation_InitialQuote) {
+ // LB15a: Do not break after an unresolved initial punctuation
+ // that lies at the start of the line, after a space, after
+ // opening punctuation, or after an unresolved quotation mark,
+ // even after spaces.
+ // (sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW)
+ // [\p{Pi}&QU] SP* ×
+ // Note: sot is treated as LF here due to initial loop setup.
+ constexpr QUnicodeTables::LineBreakClass lb15a[] = {
+ QUnicodeTables::LineBreak_BK, QUnicodeTables::LineBreak_CR,
+ QUnicodeTables::LineBreak_LF, QUnicodeTables::LineBreak_OP,
+ QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi,
+ QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_GL,
+ QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_ZW};
+ if (std::any_of(std::begin(lb15a), std::end(lb15a),
+ [lcls](auto x) { return x == lcls; })) {
+ ncls = QUnicodeTables::LineBreak_QU_Pi;
+ }
+ } else if (prop->category == QChar::Punctuation_FinalQuote) {
+ // LB15b: Do not break before an unresolved final punctuation
+ // that lies at the end of the line, before a space, before
+ // a prohibited break, or before an unresolved quotation mark,
+ // even after spaces.
+ // × [\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS
+ // | SY | BK | CR | LF | NL | ZW | eot)
+ auto nncls = QUnicodeTables::LineBreak_LF;
+
+ if (i + 1 < len) {
+ char32_t c = string[i + 1];
+ if (QChar::isHighSurrogate(c) && i + 2 != len) {
+ ushort low = string[i + 2];
+ if (QChar::isLowSurrogate(low))
+ c = QChar::surrogateToUcs4(c, low);
+ }
+ nncls = QUnicodeTables::LineBreakClass(
+ QUnicodeTables::properties(c)->lineBreakClass);
+ }
+
+ constexpr QUnicodeTables::LineBreakClass lb15b[] = {
+ QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_GL,
+ QUnicodeTables::LineBreak_WJ, QUnicodeTables::LineBreak_CL,
+ QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi,
+ QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_CP,
+ QUnicodeTables::LineBreak_EX, QUnicodeTables::LineBreak_IS,
+ QUnicodeTables::LineBreak_SY, QUnicodeTables::LineBreak_BK,
+ QUnicodeTables::LineBreak_CR, QUnicodeTables::LineBreak_LF,
+ QUnicodeTables::LineBreak_ZW};
+ if (std::any_of(std::begin(lb15b), std::end(lb15b),
+ [nncls](auto x) { return x == nncls; })) {
+ ncls = QUnicodeTables::LineBreak_QU_Pf;
+ }
+ }
+ }
+
if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
// LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
@@ -684,9 +741,8 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes
}
if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
- // LB8a: ZWJ x (ID | EB | EM)
- if (ncls == QUnicodeTables::LineBreak_ID || ncls == QUnicodeTables::LineBreak_EB || ncls == QUnicodeTables::LineBreak_EM)
- goto next;
+ // LB8a: ZWJ x
+ goto next;
}
// LB25: do not break lines inside numbers
@@ -716,16 +772,24 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes
goto next;
}
+ if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM
+ && lastProp->category == QChar::Other_NotAssigned
+ && lastProp->graphemeBreakClass
+ == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
+ // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
+ goto next;
+ }
+
// for South East Asian chars that require a complex analysis, the Unicode
// standard recommends to treat them as AL. tailoring that do dictionary analysis can override
if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
cls = QUnicodeTables::LineBreak_AL;
tcls = cls;
- if (tcls == QUnicodeTables::LineBreak_CM)
+ if (tcls == QUnicodeTables::LineBreak_CM || tcls == QUnicodeTables::LineBreak_ZWJ)
// LB10
tcls = QUnicodeTables::LineBreak_AL;
- switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) {
+ switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_ZWJ ? ncls : QUnicodeTables::LineBreak_AL]) {
case LB::DirectBreak:
attributes[pos].lineBreak = true;
break;
@@ -746,6 +810,19 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes
if (lcls != QUnicodeTables::LineBreak_HL)
attributes[pos].lineBreak = true;
break;
+ case LB::IndirectBreakIfNarrow:
+ switch (static_cast<QUnicodeTables::EastAsianWidth>(prop->eastAsianWidth)) {
+ default:
+ if (lcls != QUnicodeTables::LineBreak_SP)
+ break;
+ Q_FALLTHROUGH();
+ case QUnicodeTables::EastAsianWidth::F:
+ case QUnicodeTables::EastAsianWidth::W:
+ case QUnicodeTables::EastAsianWidth::H:
+ attributes[pos].lineBreak = true;
+ break;
+ }
+ break;
case LB::ProhibitedBreak:
// nothing to do
default:
@@ -754,6 +831,7 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes
next:
cls = ncls;
+ lastProp = prop;
next_no_cls_update:
lcls = ncls;
}
@@ -1242,17 +1320,17 @@ static inline Form form(unsigned short uc) {
(Consonant Nukta? Halant)* Consonant Halant
IndependentVowel VowelMark? StressMark?
- We return syllable boundaries on invalid combinations aswell
+ We return syllable boundaries on invalid combinations as well
*/
static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
{
*invalid = false;
- IDEBUG("indic_nextSyllableBoundary: start=%d, end=%d", int(start), int(end));
+ IDEBUG("indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
const char16_t *uc = s+start;
qsizetype pos = 0;
Form state = form(uc[pos]);
- IDEBUG("state[%d]=%d (uc=%4x)", int(pos), state, uc[pos]);
+ IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
pos++;
if (state != Consonant && state != IndependentVowel) {
@@ -1263,7 +1341,7 @@ static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t
while (pos < end - start) {
Form newState = form(uc[pos]);
- IDEBUG("state[%d]=%d (uc=%4x)", int(pos), newState, uc[pos]);
+ IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
switch (newState) {
case Control:
newState = state;
@@ -1336,6 +1414,7 @@ static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t
// ### needs proper testing for correct two/three part matras
break;
}
+ Q_FALLTHROUGH();
case IndependentVowel:
case Invalid:
case Other:
@@ -1351,7 +1430,6 @@ static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t
static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
{
qsizetype end = from + len;
- const char16_t *uc = text + from;
attributes += from;
qsizetype i = 0;
while (i < len) {
@@ -1363,7 +1441,6 @@ static void indicAttributes(QChar::Script script, const char16_t *text, qsizetyp
i++;
while (i < boundary) {
attributes[i].graphemeBoundary = false;
- ++uc;
++i;
}
assert(i == boundary);
@@ -1372,6 +1449,8 @@ static void indicAttributes(QChar::Script script, const char16_t *text, qsizetyp
}
+#if QT_CONFIG(library)
+
#define LIBTHAI_MAJOR 0
/*
@@ -1382,25 +1461,74 @@ struct thcell_t {
unsigned char hilo; /**< upper/lower vowel/diacritic */
unsigned char top; /**< top-level mark */
};
-typedef int (*th_brk_def) (const unsigned char*, int*, size_t);
-typedef size_t (*th_next_cell_def) (const unsigned char *, size_t, struct thcell_t *, int);
-
-/* libthai related function handles */
-static th_brk_def th_brk = nullptr;
-static th_next_cell_def th_next_cell = nullptr;
-
-static int init_libthai() {
- static bool initialized = false;
- if (!initialized && (!th_brk || !th_next_cell)) {
- th_brk = reinterpret_cast<th_brk_def>(QLibrary::resolve(QLatin1String("thai"), static_cast<int>(LIBTHAI_MAJOR), "th_brk"));
- th_next_cell = (th_next_cell_def)QLibrary::resolve(QLatin1String("thai"), LIBTHAI_MAJOR, "th_next_cell");
- initialized = true;
+
+using ThBrk = struct _ThBrk; // opaque handle type; used below only as ThBrk *
+
+namespace {
+
+class LibThai final // owns the QLibrary for "thai", the entry points resolved from it, and the ThBrk state
+{
+ Q_DISABLE_COPY_MOVE(LibThai)
+
+ using th_brk_new_def = ThBrk *(*)(const char *); // signatures of the entry points resolved by name below
+ using th_brk_delete_def = void (*)(ThBrk *);
+ using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t);
+ using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);
+
+public:
+ LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR) // resolves the entry points once, at construction
+ {
+ m_th_brk_find_breaks =
+ reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve("th_brk_find_breaks"));
+ m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve("th_next_cell"));
+
+ auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve("th_brk_new"));
+ if (th_brk_new) { // the state and its deleter are only set up when th_brk_new was resolved
+ m_state = th_brk_new(nullptr); // NOTE(review): nullptr presumably selects libthai's default dictionary - confirm
+ m_th_brk_delete =
+ reinterpret_cast<th_brk_delete_def>(m_library.resolve("th_brk_delete"));
+ }
+ }
+
+ ~LibThai()
+ {
+ if (m_state && m_th_brk_delete) // release the state only when both it and its deleter exist
+ m_th_brk_delete(m_state);
+ m_library.unload(); // explicit unload, after the state has been released
+ }
+
+ bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; } // true only if both functions and the state are available
+
+ int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const // forwards to th_brk_find_breaks() with the stored state
+ {
+ Q_ASSERT(m_state); // callers are expected to check isInitialized() first
+ Q_ASSERT(m_th_brk_find_breaks);
+ return m_th_brk_find_breaks(m_state, s, pos, pos_sz);
+ }
+
+ size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am) // forwards to th_next_cell(); takes no ThBrk state
+ {
+ Q_ASSERT(m_th_next_cell);
+ return m_th_next_cell(s, len, cell, is_decomp_am);
}
- if (th_brk && th_next_cell)
- return 1;
- else
- return 0;
-}
+
+private:
+ QLibrary m_library;
+
+ // Global state for th_brk_find_breaks().
+ // Note: even though the signature of th_brk_find_breaks() suggests otherwise,
+ // the state is read-only, so it is safe to use it from multiple threads after
+ // initialization. This is also stated in the libthai documentation.
+ ThBrk *m_state = nullptr;
+
+ th_brk_find_breaks_def m_th_brk_find_breaks = nullptr;
+ th_next_cell_def m_th_next_cell = nullptr;
+ th_brk_delete_def m_th_brk_delete = nullptr; // only resolved when th_brk_new was found (see constructor)
+};
+
+} // unnamed namespace
+
+Q_GLOBAL_STATIC(LibThai, g_libThai) // shared LibThai instance, accessed through g_libThai
static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
{
@@ -1424,21 +1552,17 @@ static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
*/
static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
{
- char s[128];
- char *cstr = s;
- int *break_positions = nullptr;
- int brp[128];
- int brp_size = 0;
- qsizetype numbreaks, i, j, cell_length;
+ constexpr qsizetype Prealloc = 128;
+ QVarLengthArray<char, Prealloc + 1> s(len + 1);
+ QVarLengthArray<int, Prealloc> break_positions(len);
+ qsizetype numbreaks, i;
struct thcell_t tis_cell;
- if (!init_libthai())
- return ;
-
- if (len >= 128)
- cstr = static_cast<char *>(malloc (len * sizeof(char) + 1));
+ LibThai *libThai = g_libThai;
+ if (!libThai || !libThai->isInitialized())
+ return;
- to_tis620(string, len, cstr);
+ to_tis620(string, len, s.data());
for (i = 0; i < len; ++i) {
attributes[i].wordBreak = false;
@@ -1447,62 +1571,53 @@ static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAtt
attributes[i].lineBreak = false;
}
- if (len > 128) {
- break_positions = static_cast<int *>(malloc (sizeof(int) * len));
- memset (break_positions, 0, sizeof(int) * len);
- brp_size = len;
- }
- else {
- break_positions = brp;
- brp_size = 128;
- }
-
- if (break_positions) {
- attributes[0].wordBreak = true;
- attributes[0].wordStart = true;
- attributes[0].wordEnd = false;
- numbreaks = th_brk(reinterpret_cast<const unsigned char *>(cstr), break_positions, brp_size);
- for (i = 0; i < numbreaks; ++i) {
- attributes[break_positions[i]].wordBreak = true;
- attributes[break_positions[i]].wordStart = true;
- attributes[break_positions[i]].wordEnd = true;
- attributes[break_positions[i]].lineBreak = true;
- }
- if (numbreaks > 0)
- attributes[break_positions[numbreaks - 1]].wordStart = false;
-
- if (break_positions != brp)
- free(break_positions);
+ attributes[0].wordBreak = true;
+ attributes[0].wordStart = true;
+ attributes[0].wordEnd = false;
+ numbreaks = libThai->brk_find_breaks(reinterpret_cast<const unsigned char *>(s.data()),
+ break_positions.data(),
+ static_cast<size_t>(break_positions.size()));
+ for (i = 0; i < numbreaks; ++i) {
+ attributes[break_positions[i]].wordBreak = true;
+ attributes[break_positions[i]].wordStart = true;
+ attributes[break_positions[i]].wordEnd = true;
+ attributes[break_positions[i]].lineBreak = true;
}
+ if (numbreaks > 0)
+ attributes[break_positions[numbreaks - 1]].wordStart = false;
/* manage grapheme boundaries */
i = 0;
while (i < len) {
- cell_length = static_cast<uint>(th_next_cell(reinterpret_cast<const unsigned char *>(cstr) + i, len - i, &tis_cell, true));
-
+ size_t cell_length =
+ libThai->next_cell(reinterpret_cast<const unsigned char *>(s.data()) + i,
+ size_t(len - i), &tis_cell, true);
attributes[i].graphemeBoundary = true;
- for (j = 1; j < cell_length; j++)
+ for (size_t j = 1; j < cell_length; ++j)
attributes[i + j].graphemeBoundary = false;
- /* Set graphemeBoundary for SARA AM */
- if (cstr[i + cell_length - 1] == static_cast<char>(0xd3))
- attributes[i + cell_length - 1].graphemeBoundary = true;
-
i += cell_length;
}
-
- if (len >= 128)
- free(cstr);
}
+#endif // QT_CONFIG(library)
+
static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
{ // Thai handler: passes the run starting at `from` (length `len`) to thaiAssignAttributes() when it is compiled in
assert(script == QChar::Script_Thai); // callers must pass Script_Thai
+#if QT_CONFIG(library)
const char16_t *uc = text + from; // start of the run inside text
attributes += from; // attributes is offset by the same amount as text
Q_UNUSED(script); // script is otherwise only read by the assert above
thaiAssignAttributes(uc, len, attributes);
+#else // without QT_CONFIG(library) thaiAssignAttributes() is not compiled; attributes are left untouched
+ Q_UNUSED(script);
+ Q_UNUSED(text);
+ Q_UNUSED(from);
+ Q_UNUSED(len);
+ Q_UNUSED(attributes);
+#endif // QT_CONFIG(library)
}
/*
@@ -1616,7 +1731,6 @@ finish:
static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
{
qsizetype end = from + len;
- const char16_t *uc = text + from;
qsizetype i = 0;
Q_UNUSED(script);
attributes += from;
@@ -1630,7 +1744,6 @@ static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizet
i++;
while (i < boundary) {
attributes[i].graphemeBoundary = false;
- ++uc;
++i;
}
assert(i == boundary);
@@ -1681,6 +1794,8 @@ enum MymrCharClassFlags {
Mymr_CF_AFTER_KINZI = 0x00100000
};
+Q_DECLARE_MIXED_ENUM_OPERATORS(int, MymrCharClassValues, MymrCharClassFlags)
+
/* Characters that get referred to by name */
enum MymrChar
{
@@ -1813,7 +1928,7 @@ static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start
if (pos == start)
*invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
- MMDEBUG("state[%d]=%d class=%8x (uc=%4x)", int(pos - start), state, charClass, *uc);
+ MMDEBUG("state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
if (state < 0) {
if (state < -1)
@@ -1829,7 +1944,6 @@ static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start
static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
{
qsizetype end = from + len;
- const char16_t *uc = text + from;
qsizetype i = 0;
Q_UNUSED(script);
attributes += from;
@@ -1845,7 +1959,6 @@ static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizet
i++;
while (i < boundary) {
attributes[i].graphemeBoundary = false;
- ++uc;
++i;
}
assert(i == boundary);
@@ -1928,6 +2041,7 @@ enum KhmerCharClassFlags {
CF_POS_MASK = 0x000f0000
};
+Q_DECLARE_MIXED_ENUM_OPERATORS(int, KhmerCharClassValues, KhmerCharClassFlags)
/* Characters that get referred to by name */
enum KhmerChar {
@@ -2149,7 +2263,7 @@ static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start,
}
state = khmerStateTable[state][charClass & CF_CLASS_MASK];
- KHDEBUG("state[%d]=%d class=%8lx (uc=%4x)", int(pos - start), state,
+ KHDEBUG("state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
charClass, *uc );
if (state < 0) {
@@ -2164,7 +2278,6 @@ static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start,
static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
{
qsizetype end = from + len;
- const char16_t *uc = text + from;
qsizetype i = 0;
Q_UNUSED(script);
attributes += from;
@@ -2178,7 +2291,6 @@ static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetyp
i++;
while ( i < boundary ) {
attributes[i].graphemeBoundary = false;
- ++uc;
++i;
}
assert( i == boundary );