diff options
Diffstat (limited to 'src/corelib/text/qunicodetools.cpp')
-rw-r--r-- | src/corelib/text/qunicodetools.cpp | 466 |
1 files changed, 282 insertions, 184 deletions
diff --git a/src/corelib/text/qunicodetools.cpp b/src/corelib/text/qunicodetools.cpp index a924addda0..2917804830 100644 --- a/src/corelib/text/qunicodetools.cpp +++ b/src/corelib/text/qunicodetools.cpp @@ -1,41 +1,5 @@ -/**************************************************************************** -** -** Copyright (C) 2020 The Qt Company Ltd. -** Contact: https://www.qt.io/licensing/ -** -** This file is part of the QtCore module of the Qt Toolkit. -** -** $QT_BEGIN_LICENSE:LGPL$ -** Commercial License Usage -** Licensees holding valid commercial Qt licenses may use this file in -** accordance with the commercial license agreement provided with the -** Software or, alternatively, in accordance with the terms contained in -** a written agreement between you and The Qt Company. For licensing terms -** and conditions see https://www.qt.io/terms-conditions. For further -** information use the contact form at https://www.qt.io/contact-us. -** -** GNU Lesser General Public License Usage -** Alternatively, this file may be used under the terms of the GNU Lesser -** General Public License version 3 as published by the Free Software -** Foundation and appearing in the file LICENSE.LGPL3 included in the -** packaging of this file. Please review the following information to -** ensure the GNU Lesser General Public License version 3 requirements -** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. -** -** GNU General Public License Usage -** Alternatively, this file may be used under the terms of the GNU -** General Public License version 2.0 or (at your option) the GNU General -** Public license version 3 or any later version approved by the KDE Free -** Qt Foundation. The licenses are as published by the Free Software -** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 -** included in the packaging of this file. Please review the following -** information to ensure the GNU General Public License requirements will -** be met: https://www.gnu.org/licenses/gpl-2.0.html and -** https://www.gnu.org/licenses/gpl-3.0.html. -** -** $QT_END_LICENSE$ -** -****************************************************************************/ +// Copyright (C) 2020 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only #include "qunicodetools_p.h" @@ -51,7 +15,14 @@ QT_BEGIN_NAMESPACE -Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0; +using namespace Qt::StringLiterals; + +#ifdef QT_BUILD_INTERNAL +Q_CONSTINIT Q_AUTOTEST_EXPORT +#else +constexpr +#endif +int qt_initcharattributes_default_algorithm_only = 0; namespace QUnicodeTools { @@ -242,30 +213,26 @@ enum Action { }; static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = { -// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet E_Base E_Mod GAZ EBG WSeg - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any - { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR - { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF - { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, NoBreak, Break }, // ZWJ - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break }, // Katakana - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // HebrewLetter - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // ALetter - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // Numeric - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // ExtendNumLet - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break }, // E_Base - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // E_Mod - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // GAZ - { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break }, // EBG - { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // WSeg +// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any + { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF + { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break }, // HebrewLetter + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break }, // ALetter + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break }, // Numeric + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break }, // ExtendNumLet + { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace }; } // namespace WB @@ -277,6 +244,8 @@ static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes } currentWordType = WordTypeNone; QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1 + auto real_cls = cls; // Unaffected by WB4 + for (qsizetype i = 0; i != len; ++i) { qsizetype pos = i; char32_t ucs4 = string[i]; @@ -290,7 +259,6 @@ static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4); QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass; -#ifdef QT_BUILD_INTERNAL if (qt_initcharattributes_default_algorithm_only) { // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet // which caused "hi.there" to be treated like if it were just a single word; @@ -301,22 +269,32 @@ static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes else if (ucs4 == 0x003A) // COLON ncls = QUnicodeTables::WordBreak_MidLetter; } -#endif uchar action = WB::breakTable[cls][ncls]; switch (action) { case WB::Break: + if (Q_UNLIKELY(real_cls == QUnicodeTables::WordBreak_ZWJ + && prop->graphemeBreakClass + == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) { + // WB3c: ZWJ × \p{Extended_Pictographic} + action = WB::NoBreak; + } break; case WB::NoBreak: if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend || ncls == QUnicodeTables::WordBreak_ZWJ || ncls == QUnicodeTables::WordBreak_Format)) { // WB4: X(Extend|Format)* -> X - if (cls != QUnicodeTables::WordBreak_ZWJ) // WB3c - continue; + real_cls = ncls; + continue; } if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) { // WB15/WB16: break between pairs of Regional indicator ncls = QUnicodeTables::WordBreak_Any; } + if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_WSegSpace + && real_cls != QUnicodeTables::WordBreak_WSegSpace)) { + // WB3d should not be affected by WB4 + action = WB::Break; + } break; case WB::Lookup: case WB::LookupW: @@ -355,6 +333,8 @@ static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes } cls = ncls; + real_cls = ncls; + if (action == WB::Break) { attributes[pos].wordBreak = true; if (currentWordType != WordTypeNone) @@ -575,49 +555,53 @@ enum Action { IndirectBreak, IB = IndirectBreak, CombiningIndirectBreak, CI = CombiningIndirectBreak, CombiningProhibitedBreak, CP = CombiningProhibitedBreak, - ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen + ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen, + IndirectBreakIfNarrow, IN = IndirectBreakIfNarrow, // For LB30 }; -static const uchar breakTable[QUnicodeTables::LineBreak_SA][QUnicodeTables::LineBreak_SA] = { -/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM ZWJ*/ -/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB }, -/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, -/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, -/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, IB }, -/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB }, -/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB }, -/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB }, -/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, -/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, -/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB }, -/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB }, -/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, IB }, -/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB }, -/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB }, -/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, IB }, -/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* EB */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB }, -/* EM */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, -/* ZWJ*/ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, IB } +// See https://www.unicode.org/reports/tr14/tr14-37.html for the information +// about the table. It was removed in the later versions of the standard. +static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ] = { +/* 1↓ 2→ OP CL CP QU +Pi +Pf GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM*/ +/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB }, +/* CL */ { DB, PB, PB, IB, IB, PB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* CP */ { DB, PB, PB, IB, IB, PB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* QU */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, +/* +Pi*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB }, +/* +Pf*/ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, +/* GL */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, +/* NS */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* EX */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* SY */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* IS */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* PR */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB }, +/* PO */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* NU */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* AL */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* HL */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* ID */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* IN */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* HY */ { HH, PB, PB, IB, IB, PB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB }, +/* BA */ { HH, PB, PB, IB, IB, PB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB }, +/* BB */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB }, +/* B2 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* CM */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* WJ */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB }, +/* H2 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB }, +/* H3 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB }, +/* JL */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB }, +/* JV */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB }, +/* JT */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB }, +/* RI */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB }, +/* CB */ { DB, PB, PB, IB, IB, PB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, +/* EB */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB }, +/* EM */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB }, }; // The following line break classes are not treated by the pair table // and must be resolved outside: -// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX +// AI, AK, AP, AS, BK, CB, CJ, CR, LF, NL, SA, SG, SP, VF, VI, XX, ZWJ } // namespace LB @@ -628,6 +612,8 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10 QUnicodeTables::LineBreakClass cls = lcls; + const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(U'\n'); + for (qsizetype i = 0; i != len; ++i) { qsizetype pos = i; char32_t ucs4 = string[i]; @@ -675,6 +661,61 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes ncls = QUnicodeTables::LineBreak_CM; } + if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU)) { + if (prop->category == QChar::Punctuation_InitialQuote) { + // LB15a: Do not break after an unresolved initial punctuation + // that lies at the start of the line, after a space, after + // opening punctuation, or after an unresolved quotation mark, + // even after spaces. + // (sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW) + // [\p{Pi}&QU] SP* × + // Note: sot is treated as LF here due to initial loop setup. + constexpr QUnicodeTables::LineBreakClass lb15a[] = { + QUnicodeTables::LineBreak_BK, QUnicodeTables::LineBreak_CR, + QUnicodeTables::LineBreak_LF, QUnicodeTables::LineBreak_OP, + QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi, + QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_GL, + QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_ZW}; + if (std::any_of(std::begin(lb15a), std::end(lb15a), + [lcls](auto x) { return x == lcls; })) { + ncls = QUnicodeTables::LineBreak_QU_Pi; + } + } else if (prop->category == QChar::Punctuation_FinalQuote) { + // LB15b: Do not break before an unresolved final punctuation + // that lies at the end of the line, before a space, before + // a prohibited break, or before an unresolved quotation mark, + // even after spaces. + // × [\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS + // | SY | BK | CR | LF | NL | ZW | eot) + auto nncls = QUnicodeTables::LineBreak_LF; + + if (i + 1 < len) { + char32_t c = string[i + 1]; + if (QChar::isHighSurrogate(c) && i + 2 != len) { + ushort low = string[i + 2]; + if (QChar::isLowSurrogate(low)) + c = QChar::surrogateToUcs4(c, low); + } + nncls = QUnicodeTables::LineBreakClass( + QUnicodeTables::properties(c)->lineBreakClass); + } + + constexpr QUnicodeTables::LineBreakClass lb15b[] = { + QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_GL, + QUnicodeTables::LineBreak_WJ, QUnicodeTables::LineBreak_CL, + QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi, + QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_CP, + QUnicodeTables::LineBreak_EX, QUnicodeTables::LineBreak_IS, + QUnicodeTables::LineBreak_SY, QUnicodeTables::LineBreak_BK, + QUnicodeTables::LineBreak_CR, QUnicodeTables::LineBreak_LF, + QUnicodeTables::LineBreak_ZW}; + if (std::any_of(std::begin(lb15b), std::end(lb15b), + [nncls](auto x) { return x == nncls; })) { + ncls = QUnicodeTables::LineBreak_QU_Pf; + } + } + } + if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) { // LB4: BK!, LB5: (CRxLF|CR|LF|NL)! if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF) @@ -700,9 +741,8 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes } if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) { - // LB8a: ZWJ x (ID | EB | EM) - if (ncls == QUnicodeTables::LineBreak_ID || ncls == QUnicodeTables::LineBreak_EB || ncls == QUnicodeTables::LineBreak_EM) - goto next; + // LB8a: ZWJ x + goto next; } // LB25: do not break lines inside numbers @@ -732,16 +772,24 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes goto next; } + if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM + && lastProp->category == QChar::Other_NotAssigned + && lastProp->graphemeBreakClass + == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) { + // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM + goto next; + } + // for South East Asian chars that require a complex analysis, the Unicode // standard recommends to treat them as AL. tailoring that do dictionary analysis can override if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA)) cls = QUnicodeTables::LineBreak_AL; tcls = cls; - if (tcls == QUnicodeTables::LineBreak_CM) + if (tcls == QUnicodeTables::LineBreak_CM || tcls == QUnicodeTables::LineBreak_ZWJ) // LB10 tcls = QUnicodeTables::LineBreak_AL; - switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) { + switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_ZWJ ? ncls : QUnicodeTables::LineBreak_AL]) { case LB::DirectBreak: attributes[pos].lineBreak = true; break; @@ -762,6 +810,19 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes if (lcls != QUnicodeTables::LineBreak_HL) attributes[pos].lineBreak = true; break; + case LB::IndirectBreakIfNarrow: + switch (static_cast<QUnicodeTables::EastAsianWidth>(prop->eastAsianWidth)) { + default: + if (lcls != QUnicodeTables::LineBreak_SP) + break; + Q_FALLTHROUGH(); + case QUnicodeTables::EastAsianWidth::F: + case QUnicodeTables::EastAsianWidth::W: + case QUnicodeTables::EastAsianWidth::H: + attributes[pos].lineBreak = true; + break; + } + break; case LB::ProhibitedBreak: // nothing to do default: @@ -770,6 +831,7 @@ static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes next: cls = ncls; + lastProp = prop; next_no_cls_update: lcls = ncls; } @@ -1263,12 +1325,12 @@ static inline Form form(unsigned short uc) { static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid) { *invalid = false; - IDEBUG("indic_nextSyllableBoundary: start=%d, end=%d", int(start), int(end)); + IDEBUG("indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end)); const char16_t *uc = s+start; qsizetype pos = 0; Form state = form(uc[pos]); - IDEBUG("state[%d]=%d (uc=%4x)", int(pos), state, uc[pos]); + IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]); pos++; if (state != Consonant && state != IndependentVowel) { @@ -1279,7 +1341,7 @@ static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t while (pos < end - start) { Form newState = form(uc[pos]); - IDEBUG("state[%d]=%d (uc=%4x)", int(pos), newState, uc[pos]); + IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]); switch (newState) { case Control: newState = state; @@ -1352,6 +1414,7 @@ static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t // ### needs proper testing for correct two/three part matras break; } + Q_FALLTHROUGH(); case IndependentVowel: case Invalid: case Other: @@ -1367,7 +1430,6 @@ static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes) { qsizetype end = from + len; - const char16_t *uc = text + from; attributes += from; qsizetype i = 0; while (i < len) { @@ -1379,7 +1441,6 @@ static void indicAttributes(QChar::Script script, const char16_t *text, qsizetyp i++; while (i < boundary) { attributes[i].graphemeBoundary = false; - ++uc; ++i; } assert(i == boundary); @@ -1388,6 +1449,8 @@ static void indicAttributes(QChar::Script script, const char16_t *text, qsizetyp } +#if QT_CONFIG(library) + #define LIBTHAI_MAJOR 0 /* @@ -1398,27 +1461,74 @@ struct thcell_t { unsigned char hilo; /**< upper/lower vowel/diacritic */ unsigned char top; /**< top-level mark */ }; -typedef int (*th_brk_def) (const unsigned char*, int*, size_t); -typedef size_t (*th_next_cell_def) (const unsigned char *, size_t, struct thcell_t *, int); -/* libthai related function handles */ -static th_brk_def th_brk = nullptr; -static th_next_cell_def th_next_cell = nullptr; +using ThBrk = struct _ThBrk; -static int init_libthai() { -#if QT_CONFIG(library) - static bool initialized = false; - if (!initialized && (!th_brk || !th_next_cell)) { - th_brk = reinterpret_cast<th_brk_def>(QLibrary::resolve(QLatin1String("thai"), static_cast<int>(LIBTHAI_MAJOR), "th_brk")); - th_next_cell = (th_next_cell_def)QLibrary::resolve(QLatin1String("thai"), LIBTHAI_MAJOR, "th_next_cell"); - initialized = true; +namespace { + +class LibThai final +{ + Q_DISABLE_COPY_MOVE(LibThai) + + using th_brk_new_def = ThBrk *(*)(const char *); + using th_brk_delete_def = void (*)(ThBrk *); + using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t); + using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int); + +public: + LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR) + { + m_th_brk_find_breaks = + reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve("th_brk_find_breaks")); + m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve("th_next_cell")); + + auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve("th_brk_new")); + if (th_brk_new) { + m_state = th_brk_new(nullptr); + m_th_brk_delete = + reinterpret_cast<th_brk_delete_def>(m_library.resolve("th_brk_delete")); + } + } + + ~LibThai() + { + if (m_state && m_th_brk_delete) + m_th_brk_delete(m_state); + m_library.unload(); + } + + bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; } + + int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const + { + Q_ASSERT(m_state); + Q_ASSERT(m_th_brk_find_breaks); + return m_th_brk_find_breaks(m_state, s, pos, pos_sz); } - if (th_brk && th_next_cell) - return 1; - else -#endif - return 0; -} + + size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am) + { + Q_ASSERT(m_th_next_cell); + return m_th_next_cell(s, len, cell, is_decomp_am); + } + +private: + QLibrary m_library; + + // Global state for th_brk_find_breaks(). + // Note: even if signature for th_brk_find_breaks() suggests otherwise, the + // state is read-only, and so it is safe to use it from multiple threads after + // initialization. This is also stated in the libthai documentation. + ThBrk *m_state = nullptr; + + th_brk_find_breaks_def m_th_brk_find_breaks = nullptr; + th_next_cell_def m_th_next_cell = nullptr; + th_brk_delete_def m_th_brk_delete = nullptr; +}; + +} // unnamed namespace + +Q_GLOBAL_STATIC(LibThai, g_libThai) static void to_tis620(const char16_t *string, qsizetype len, char *cstr) { @@ -1442,21 +1552,17 @@ static void to_tis620(const char16_t *string, qsizetype len, char *cstr) */ static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes) { - char s[128]; - char *cstr = s; - int *break_positions = nullptr; - int brp[128]; - int brp_size = 0; - qsizetype numbreaks, i, j, cell_length; + constexpr qsizetype Prealloc = 128; + QVarLengthArray<char, Prealloc + 1> s(len + 1); + QVarLengthArray<int, Prealloc> break_positions(len); + qsizetype numbreaks, i; struct thcell_t tis_cell; - if (!init_libthai()) - return ; - - if (len >= 128) - cstr = static_cast<char *>(malloc (len * sizeof(char) + 1)); + LibThai *libThai = g_libThai; + if (!libThai || !libThai->isInitialized()) + return; - to_tis620(string, len, cstr); + to_tis620(string, len, s.data()); for (i = 0; i < len; ++i) { attributes[i].wordBreak = false; @@ -1465,58 +1571,53 @@ static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAtt attributes[i].lineBreak = false; } - if (len > 128) { - break_positions = static_cast<int *>(malloc (sizeof(int) * len)); - memset (break_positions, 0, sizeof(int) * len); - brp_size = len; - } - else { - break_positions = brp; - brp_size = 128; - } - - if (break_positions) { - attributes[0].wordBreak = true; - attributes[0].wordStart = true; - attributes[0].wordEnd = false; - numbreaks = th_brk(reinterpret_cast<const unsigned char *>(cstr), break_positions, brp_size); - for (i = 0; i < numbreaks; ++i) { - attributes[break_positions[i]].wordBreak = true; - attributes[break_positions[i]].wordStart = true; - attributes[break_positions[i]].wordEnd = true; - attributes[break_positions[i]].lineBreak = true; - } - if (numbreaks > 0) - attributes[break_positions[numbreaks - 1]].wordStart = false; - - if (break_positions != brp) - free(break_positions); + attributes[0].wordBreak = true; + attributes[0].wordStart = true; + attributes[0].wordEnd = false; + numbreaks = libThai->brk_find_breaks(reinterpret_cast<const unsigned char *>(s.data()), + break_positions.data(), + static_cast<size_t>(break_positions.size())); + for (i = 0; i < numbreaks; ++i) { + attributes[break_positions[i]].wordBreak = true; + attributes[break_positions[i]].wordStart = true; + attributes[break_positions[i]].wordEnd = true; + attributes[break_positions[i]].lineBreak = true; } + if (numbreaks > 0) + attributes[break_positions[numbreaks - 1]].wordStart = false; /* manage grapheme boundaries */ i = 0; while (i < len) { - cell_length = static_cast<uint>(th_next_cell(reinterpret_cast<const unsigned char *>(cstr) + i, len - i, &tis_cell, true)); - + size_t cell_length = + libThai->next_cell(reinterpret_cast<const unsigned char *>(s.data()) + i, + size_t(len - i), &tis_cell, true); attributes[i].graphemeBoundary = true; - for (j = 1; j < cell_length; j++) + for (size_t j = 1; j < cell_length; ++j) attributes[i + j].graphemeBoundary = false; i += cell_length; } - - if (len >= 128) - free(cstr); } +#endif // QT_CONFIG(library) + static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes) { assert(script == QChar::Script_Thai); +#if QT_CONFIG(library) const char16_t *uc = text + from; attributes += from; Q_UNUSED(script); thaiAssignAttributes(uc, len, attributes); +#else + Q_UNUSED(script); + Q_UNUSED(text); + Q_UNUSED(from); + Q_UNUSED(len); + Q_UNUSED(attributes); +#endif } /* @@ -1630,7 +1731,6 @@ finish: static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes) { qsizetype end = from + len; - const char16_t *uc = text + from; qsizetype i = 0; Q_UNUSED(script); attributes += from; @@ -1644,7 +1744,6 @@ static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizet i++; while (i < boundary) { attributes[i].graphemeBoundary = false; - ++uc; ++i; } assert(i == boundary); @@ -1695,6 +1794,8 @@ enum MymrCharClassFlags { Mymr_CF_AFTER_KINZI = 0x00100000 }; +Q_DECLARE_MIXED_ENUM_OPERATORS(int, MymrCharClassValues, MymrCharClassFlags) + /* Characters that get refrered to by name */ enum MymrChar { @@ -1827,7 +1928,7 @@ static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start if (pos == start) *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE); - MMDEBUG("state[%d]=%d class=%8x (uc=%4x)", int(pos - start), state, charClass, *uc); + MMDEBUG("state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc); if (state < 0) { if (state < -1) @@ -1843,7 +1944,6 @@ static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes) { qsizetype end = from + len; - const char16_t *uc = text + from; qsizetype i = 0; Q_UNUSED(script); attributes += from; @@ -1859,7 +1959,6 @@ static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizet i++; while (i < boundary) { attributes[i].graphemeBoundary = false; - ++uc; ++i; } assert(i == boundary); @@ -1942,6 +2041,7 @@ enum KhmerCharClassFlags { CF_POS_MASK = 0x000f0000 }; +Q_DECLARE_MIXED_ENUM_OPERATORS(int, KhmerCharClassValues, KhmerCharClassFlags) /* Characters that get referred to by name */ enum KhmerChar { @@ -2163,7 +2263,7 @@ static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, } state = khmerStateTable[state][charClass & CF_CLASS_MASK]; - KHDEBUG("state[%d]=%d class=%8lx (uc=%4x)", int(pos - start), state, + KHDEBUG("state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state, charClass, *uc ); if (state < 0) { @@ -2178,7 +2278,6 @@ static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes) { qsizetype end = from + len; - const char16_t *uc = text + from; qsizetype i = 0; Q_UNUSED(script); attributes += from; @@ -2192,7 +2291,6 @@ static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetyp i++; while ( i < boundary ) { attributes[i].graphemeBoundary = false; - ++uc; ++i; } assert( i == boundary ); |