diff options
Diffstat (limited to 'util/unicode/main.cpp')
-rw-r--r-- | util/unicode/main.cpp | 246 |
1 files changed, 151 insertions, 95 deletions
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp index 91015411a0..fb308b7dc0 100644 --- a/util/unicode/main.cpp +++ b/util/unicode/main.cpp @@ -1,30 +1,5 @@ -/**************************************************************************** -** -** Copyright (C) 2020 The Qt Company Ltd. -** Contact: https://www.qt.io/licensing/ -** -** This file is part of the utils of the Qt Toolkit. -** -** $QT_BEGIN_LICENSE:GPL-EXCEPT$ -** Commercial License Usage -** Licensees holding valid commercial Qt licenses may use this file in -** accordance with the commercial license agreement provided with the -** Software or, alternatively, in accordance with the terms contained in -** a written agreement between you and The Qt Company. For licensing terms -** and conditions see https://www.qt.io/terms-conditions. For further -** information use the contact form at https://www.qt.io/contact-us. -** -** GNU General Public License Usage -** Alternatively, this file may be used under the terms of the GNU -** General Public License version 3 as published by the Free Software -** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT -** included in the packaging of this file. Please review the following -** information to ensure the GNU General Public License requirements will -** be met: https://www.gnu.org/licenses/gpl-3.0.html. -** -** $QT_END_LICENSE$ -** -****************************************************************************/ +// Copyright (C) 2020 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 #include <qbytearray.h> #include <qchar.h> @@ -39,8 +14,8 @@ #include <private/qunicodetables_p.h> #endif -#define DATA_VERSION_S "13.0" -#define DATA_VERSION_STR "QChar::Unicode_13_0" +#define DATA_VERSION_S "15.1" +#define DATA_VERSION_STR "QChar::Unicode_15_1" static QHash<QByteArray, QChar::UnicodeVersion> age_map; @@ -74,6 +49,9 @@ static void initAgeMap() { QChar::Unicode_12_0, "12.0" }, { QChar::Unicode_12_1, "12.1" }, // UCD Revision 24 { QChar::Unicode_13_0, "13.0" }, // UCD Revision 26 + { QChar::Unicode_14_0, "14.0" }, // UCD Revision 28 + { QChar::Unicode_15_0, "15.0" }, // UCD Revision 30 + { QChar::Unicode_15_1, "15.1" }, // UCD Revision 32 { QChar::Unicode_Unassigned, 0 } }; AgeMap *d = ageMap; @@ -83,6 +61,47 @@ static void initAgeMap() } } +static const char *east_asian_width_string = +R"(enum class EastAsianWidth : unsigned int { + A, + F, + H, + N, + Na, + W, +}; + +)"; + +enum class EastAsianWidth : unsigned int { + A, + F, + H, + N, + Na, + W, +}; + +static QHash<QByteArray, EastAsianWidth> eastAsianWidthMap; + +static void initEastAsianWidthMap() +{ + constexpr struct W { + EastAsianWidth width; + const char *name; + } widths[] = { + { EastAsianWidth::A, "A" }, + { EastAsianWidth::F, "F" }, + { EastAsianWidth::H, "H" }, + { EastAsianWidth::N, "N" }, + { EastAsianWidth::Na, "Na" }, + { EastAsianWidth::W, "W" }, + }; + + for (auto &w : widths) + eastAsianWidthMap.insert(w.name, w.width); +} + static QHash<QByteArray, QChar::Category> categoryMap; static void initCategoryMap() @@ -370,10 +389,6 @@ static const char *word_break_class_string = " WordBreak_MidNum,\n" " WordBreak_Numeric,\n" " WordBreak_ExtendNumLet,\n" - " WordBreak_E_Base,\n" - " WordBreak_E_Modifier,\n" - " WordBreak_Glue_After_Zwj,\n" - " WordBreak_E_Base_GAZ,\n" " WordBreak_WSegSpace,\n" "\n" " NumWordBreakClasses\n" @@ -398,10 +413,6 @@ enum WordBreakClass { WordBreak_MidNum, WordBreak_Numeric, WordBreak_ExtendNumLet, - WordBreak_E_Base, - WordBreak_E_Modifier, - WordBreak_Glue_After_Zwj, - WordBreak_E_Base_GAZ, WordBreak_WSegSpace, WordBreak_Unassigned @@ -433,10 +444,6 @@ static void initWordBreak() { WordBreak_MidNum, "MidNum" }, { WordBreak_Numeric, "Numeric" }, { WordBreak_ExtendNumLet, "ExtendNumLet" }, - { WordBreak_E_Base, "E_Base" }, - { WordBreak_E_Modifier, "E_Modifier" }, - { WordBreak_Glue_After_Zwj, "Glue_After_Zwj" }, - { WordBreak_E_Base_GAZ, "E_Base_GAZ" }, { WordBreak_WSegSpace, "WSegSpace" }, { WordBreak_Unassigned, 0 } }; @@ -522,9 +529,11 @@ static void initSentenceBreak() static const char *line_break_class_string = "// see http://www.unicode.org/reports/tr14/tr14-30.html\n" - "// we don't use the XX and AI classes and map them to AL instead.\n" + "// we don't use the XX, AK, AP, AS and AI classes and map them to AL instead.\n" + "// VI and VF classes are mapped to CM.\n" "enum LineBreakClass {\n" - " LineBreak_OP, LineBreak_CL, LineBreak_CP, LineBreak_QU, LineBreak_GL,\n" + " LineBreak_OP, LineBreak_CL, LineBreak_CP,\n" + " LineBreak_QU, LineBreak_QU_Pi, LineBreak_QU_Pf, LineBreak_GL,\n" " LineBreak_NS, LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR,\n" " LineBreak_PO, LineBreak_NU, LineBreak_AL, LineBreak_HL, LineBreak_ID,\n" " LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,\n" @@ -538,7 +547,8 @@ static const char *line_break_class_string = "};\n\n"; enum LineBreakClass { - LineBreak_OP, LineBreak_CL, LineBreak_CP, LineBreak_QU, LineBreak_GL, + LineBreak_OP, LineBreak_CL, LineBreak_CP, + LineBreak_QU, LineBreak_QU_Pi, LineBreak_QU_Pf, LineBreak_GL, LineBreak_NS, LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO, LineBreak_NU, LineBreak_AL, LineBreak_HL, LineBreak_ID, LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2, @@ -606,6 +616,11 @@ static void initLineBreak() { LineBreak_EB, "EB" }, { LineBreak_EM, "EM" }, { LineBreak_ZWJ, "ZWJ" }, + { LineBreak_AL, "AK" }, + { LineBreak_AL, "AP" }, + { LineBreak_AL, "AS" }, + { LineBreak_CM, "VI" }, + { LineBreak_CM, "VF" }, { LineBreak_Unassigned, 0 } }; LineBreakList *d = breaks; @@ -797,6 +812,17 @@ static void initScriptMap() { QChar::Script_KhitanSmallScript, "KhitanSmallScript" }, { QChar::Script_Yezidi, "Yezidi" }, + // 14.0 + { QChar::Script_CyproMinoan, "CyproMinoan"}, + { QChar::Script_OldUyghur, "OldUyghur"}, + { QChar::Script_Tangsa, "Tangsa"}, + { QChar::Script_Toto, "Toto"}, + { QChar::Script_Vithkuqi, "Vithkuqi"}, + + // 15.0 + { QChar::Script_Kawi, "Kawi"}, + { QChar::Script_NagMundari, "NagMundari"}, + // unhandled { QChar::Script_Unknown, 0 } }; @@ -878,7 +904,8 @@ static const char *property_string = " ushort joining : 3;\n" " signed short digitValue : 5;\n" " signed short mirrorDiff : 16;\n" - " ushort unicodeVersion : 8; /* 5 used */\n" + " ushort unicodeVersion : 5; /* 5 used */\n" + " ushort eastAsianWidth : 3; /* 3 used */\n" " ushort nfQuickCheck : 8;\n" // could be narrowed "#ifdef Q_OS_WASM\n" " unsigned char : 0; //wasm 64 packing trick\n" @@ -925,6 +952,10 @@ static const char *methods = "Q_CORE_EXPORT QStringView QT_FASTCALL idnaMapping(char32_t usc4) noexcept;\n" "inline QStringView idnaMapping(QChar ch) noexcept\n" "{ return idnaMapping(ch.unicode()); }\n" + "\n" + "Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept;\n" + "inline EastAsianWidth eastAsianWidth(QChar ch) noexcept\n" + "{ return eastAsianWidth(ch.unicode()); }\n" "\n"; static const int SizeOfPropertiesStruct = 20; @@ -947,6 +978,7 @@ struct PropertyFlags { && direction == o.direction && joining == o.joining && age == o.age + && eastAsianWidth == o.eastAsianWidth && digitValue == o.digitValue && mirrorDiff == o.mirrorDiff && lowerCaseDiff == o.lowerCaseDiff @@ -974,6 +1006,8 @@ struct PropertyFlags { QChar::JoiningType joining : 3; // from DerivedAge.txt QChar::UnicodeVersion age : 5; + // From EastAsianWidth.txt + EastAsianWidth eastAsianWidth = EastAsianWidth::N; int digitValue = -1; int mirrorDiff : 16; @@ -1512,6 +1546,52 @@ static void readDerivedAge() } } +static void readEastAsianWidth() +{ + qDebug("Reading EastAsianWidth.txt"); + + QFile f("data/EastAsianWidth.txt"); + if (!f.exists() || !f.open(QFile::ReadOnly)) + qFatal("Couldn't find or read EastAsianWidth.txt"); + + while (!f.atEnd()) { + QByteArray line = f.readLine().trimmed(); + + int comment = line.indexOf('#'); + line = (comment < 0 ? line : line.left(comment)).simplified(); + + if (line.isEmpty()) + continue; + + QList<QByteArray> fields = line.split(';'); + Q_ASSERT(fields.size() == 2); + + // That would be split(".."), but that API does not exist. + const QByteArray codePoints = fields[0].trimmed().replace("..", "."); + QList<QByteArray> cl = codePoints.split('.'); + Q_ASSERT(cl.size() >= 1 && cl.size() <= 2); + + const QByteArray widthString = fields[1].trimmed(); + if (!eastAsianWidthMap.contains(widthString)) { + qFatal("Unhandled EastAsianWidth property value for %s: %s", + qPrintable(codePoints), qPrintable(widthString)); + } + auto width = eastAsianWidthMap.value(widthString); + + bool ok; + const int first = cl[0].toInt(&ok, 16); + const int last = ok && cl.size() == 2 ? cl[1].toInt(&ok, 16) : first; + Q_ASSERT(ok); + + for (int codepoint = first; codepoint <= last; ++codepoint) { + UnicodeData &ud = UnicodeData::valueRef(codepoint); + // Ensure that ranges don't overlap. + Q_ASSERT(ud.p.eastAsianWidth == EastAsianWidth::N); + ud.p.eastAsianWidth = width; + } + } +} + static void readDerivedNormalizationProps() { qDebug("Reading DerivedNormalizationProps.txt"); @@ -1639,7 +1719,7 @@ static QByteArray createNormalizationCorrections() " int version;\n" "};\n\n" - "static const NormalizationCorrection uc_normalization_corrections[] = {\n"; + "static constexpr NormalizationCorrection uc_normalization_corrections[] = {\n"; int maxVersion = 0; int numCorrections = 0; @@ -2655,7 +2735,7 @@ static QByteArray createIdnaMapping() qsizetype memoryUsage = 0; QByteArray out = - "static const char16_t idnaMappingData[] = {"; + "static constexpr char16_t idnaMappingData[] = {"; int col = 0; for (auto c : idnaMappingData) { @@ -2680,7 +2760,7 @@ static QByteArray createIdnaMapping() " char16_t ucs[2]; // ucs[0] is offset if size > 2\n" "};\n" "static_assert(sizeof(IdnaMapEntry) == 8);\n\n" - "static const IdnaMapEntry idnaMap[] = {\n"; + "static constexpr IdnaMapEntry idnaMap[] = {\n"; for (auto i = idnaMappingTable.keyValueBegin(); i != idnaMappingTable.keyValueEnd(); i++) { const QString &mapping = i->second; @@ -2849,7 +2929,7 @@ static QByteArray createPropertyInfo() Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE +(SMP_END-BMP_END)/SMP_BLOCKSIZE); // 0x1870 Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8))); - QByteArray out = "static const unsigned short uc_property_trie[] = {\n"; + QByteArray out = "static constexpr unsigned short uc_property_trie[] = {\n"; // First write the map from blockId to indices of unique blocks: out += " // [0x0..0x" + QByteArray::number(BMP_END, 16) + ")"; for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) { @@ -2902,7 +2982,7 @@ static QByteArray createPropertyInfo() out.chop(2); out += "\n};\n\n"; - out += "static const Properties uc_properties[] = {"; + out += "static constexpr Properties uc_properties[] = {"; // keep in sync with the property declaration for (int i = 0; i < uniqueProperties.size(); ++i) { const PropertyFlags &p = uniqueProperties.at(i); @@ -2925,9 +3005,12 @@ static QByteArray createPropertyInfo() // " signed short mirrorDiff : 16;\n" out += QByteArray::number( p.mirrorDiff ); out += ", "; -// " ushort unicodeVersion : 8; /* 5 used */\n" +// " ushort unicodeVersion : 5; /* 5 used */\n" out += QByteArray::number( p.age ); out += ", "; +// " ushort eastAsianWidth : 3;" /* 3 used */\n" + out += QByteArray::number( static_cast<unsigned int>(p.eastAsianWidth) ); + out += ", "; // " ushort nfQuickCheck : 8;\n" out += QByteArray::number( p.nfQuickCheck ); out += ", "; @@ -3032,6 +3115,11 @@ static QByteArray createPropertyInfo() "{\n" " return static_cast<IdnaStatus>(qGetProp(ucs4)->idnaStatus);\n" "}\n" + "\n" + "Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept\n" + "{\n" + " return static_cast<EastAsianWidth>(qGetProp(ucs4)->eastAsianWidth);\n" + "}\n" "\n"; return out; @@ -3042,7 +3130,7 @@ static QByteArray createSpecialCaseMap() qDebug("createSpecialCaseMap:"); QByteArray out - = "static const unsigned short specialCaseMap[] = {\n" + = "static constexpr unsigned short specialCaseMap[] = {\n" " 0x0, // placeholder"; int i = 1; @@ -3058,7 +3146,7 @@ static QByteArray createSpecialCaseMap() maxN = std::max(maxN, n); } out.chop(1); - out += "\n};\n\nconst unsigned int MaxSpecialCaseLength = "; + out += "\n};\n\nconstexpr unsigned int MaxSpecialCaseLength = "; out += QByteArray::number(maxN); out += ";\n\n"; @@ -3190,7 +3278,7 @@ static QByteArray createCompositionInfo() Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8))); - QByteArray out = "static const unsigned short uc_decomposition_trie[] = {\n"; + QByteArray out = "static constexpr unsigned short uc_decomposition_trie[] = {\n"; // first write the map out += " // 0 - 0x" + QByteArray::number(BMP_END, 16); for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) { @@ -3254,7 +3342,7 @@ static QByteArray createCompositionInfo() + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")] \\\n" " : 0xffff)\n\n"; - out += "static const unsigned short uc_decomposition_map[] = {"; + out += "static constexpr unsigned short uc_decomposition_map[] = {"; for (int i = 0; i < decompositions.size(); ++i) { if (!(i % 8)) { if (out.endsWith(' ')) @@ -3390,7 +3478,7 @@ static QByteArray createLigatureInfo() Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8))); - QByteArray out = "static const unsigned short uc_ligature_trie[] = {\n"; + QByteArray out = "static constexpr unsigned short uc_ligature_trie[] = {\n"; // first write the map out += " // 0 - 0x" + QByteArray::number(BMP_END, 16); for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) { @@ -3454,7 +3542,7 @@ static QByteArray createLigatureInfo() + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")] \\\n" " : 0xffff)\n\n"; - out += "static const unsigned short uc_ligature_map[] = {"; + out += "static constexpr unsigned short uc_ligature_map[] = {"; for (int i = 0; i < ligatures.size(); ++i) { if (!(i % 8)) { if (out.endsWith(' ')) @@ -3487,6 +3575,7 @@ QByteArray createCasingInfo() int main(int, char **) { initAgeMap(); + initEastAsianWidthMap(); initCategoryMap(); initDecompositionMap(); initDirectionMap(); @@ -3502,6 +3591,7 @@ int main(int, char **) readBidiMirroring(); readArabicShaping(); readDerivedAge(); + readEastAsianWidth(); readDerivedNormalizationProps(); readSpecialCasing(); readCaseFolding(); @@ -3525,44 +3615,9 @@ int main(int, char **) QByteArray idnaMapping = createIdnaMapping(); QByteArray header = - "/****************************************************************************\n" - "**\n" - "** Copyright (C) 2020 The Qt Company Ltd.\n" - "** Contact: https://www.qt.io/licensing/\n" - "**\n" - "** This file is part of the QtCore module of the Qt Toolkit.\n" - "**\n" - "** $QT_BEGIN_LICENSE:LGPL$\n" - "** Commercial License Usage\n" - "** Licensees holding valid commercial Qt licenses may use this file in\n" - "** accordance with the commercial license agreement provided with the\n" - "** Software or, alternatively, in accordance with the terms contained in\n" - "** a written agreement between you and The Qt Company. For licensing terms\n" - "** and conditions see https://www.qt.io/terms-conditions. For further\n" - "** information use the contact form at https://www.qt.io/contact-us.\n" - "**\n" - "** GNU Lesser General Public License Usage\n" - "** Alternatively, this file may be used under the terms of the GNU Lesser\n" - "** General Public License version 3 as published by the Free Software\n" - "** Foundation and appearing in the file LICENSE.LGPL3 included in the\n" - "** packaging of this file. Please review the following information to\n" - "** ensure the GNU Lesser General Public License version 3 requirements\n" - "** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.\n" - "**\n" - "** GNU General Public License Usage\n" - "** Alternatively, this file may be used under the terms of the GNU\n" - "** General Public License version 2.0 or (at your option) the GNU General\n" - "** Public license version 3 or any later version approved by the KDE Free\n" - "** Qt Foundation. The licenses are as published by the Free Software\n" - "** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3\n" - "** included in the packaging of this file. Please review the following\n" - "** information to ensure the GNU General Public License requirements will\n" - "** be met: https://www.gnu.org/licenses/gpl-2.0.html and\n" - "** https://www.gnu.org/licenses/gpl-3.0.html.\n" - "**\n" - "** $QT_END_LICENSE$\n" - "**\n" - "****************************************************************************/\n\n"; + "// Copyright (C) 2020 The Qt Company Ltd.\n" + "// SPDX-License-Identifier: Unicode-3.0\n" + "\n"; QByteArray note = "/* This file is autogenerated from the Unicode " DATA_VERSION_S " database. Do not edit */\n\n"; @@ -3612,6 +3667,7 @@ int main(int, char **) f.write("namespace QUnicodeTables {\n\n"); f.write(property_string); f.write(sizeOfPropertiesStructCheck); + f.write(east_asian_width_string); f.write(grapheme_break_class_string); f.write(word_break_class_string); f.write(sentence_break_class_string); |