summaryrefslogtreecommitdiffstats
path: root/util/unicode/main.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'util/unicode/main.cpp')
-rw-r--r--util/unicode/main.cpp761
1 files changed, 653 insertions, 108 deletions
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index df806eff0b..fb308b7dc0 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -1,30 +1,5 @@
-/****************************************************************************
-**
-** Copyright (C) 2020 The Qt Company Ltd.
-** Contact: https://www.qt.io/licensing/
-**
-** This file is part of the utils of the Qt Toolkit.
-**
-** $QT_BEGIN_LICENSE:GPL-EXCEPT$
-** Commercial License Usage
-** Licensees holding valid commercial Qt licenses may use this file in
-** accordance with the commercial license agreement provided with the
-** Software or, alternatively, in accordance with the terms contained in
-** a written agreement between you and The Qt Company. For licensing terms
-** and conditions see https://www.qt.io/terms-conditions. For further
-** information use the contact form at https://www.qt.io/contact-us.
-**
-** GNU General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU
-** General Public License version 3 as published by the Free Software
-** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
-** included in the packaging of this file. Please review the following
-** information to ensure the GNU General Public License requirements will
-** be met: https://www.gnu.org/licenses/gpl-3.0.html.
-**
-** $QT_END_LICENSE$
-**
-****************************************************************************/
+// Copyright (C) 2020 The Qt Company Ltd.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#include <qbytearray.h>
#include <qchar.h>
@@ -33,12 +8,14 @@
#include <qhash.h>
#include <qlist.h>
#include <qstring.h>
+#include <qbitarray.h>
+#include <private/qstringiterator_p.h>
#if 0
#include <private/qunicodetables_p.h>
#endif
-#define DATA_VERSION_S "13.0"
-#define DATA_VERSION_STR "QChar::Unicode_13_0"
+#define DATA_VERSION_S "15.1"
+#define DATA_VERSION_STR "QChar::Unicode_15_1"
static QHash<QByteArray, QChar::UnicodeVersion> age_map;
@@ -72,6 +49,9 @@ static void initAgeMap()
{ QChar::Unicode_12_0, "12.0" },
{ QChar::Unicode_12_1, "12.1" }, // UCD Revision 24
{ QChar::Unicode_13_0, "13.0" }, // UCD Revision 26
+ { QChar::Unicode_14_0, "14.0" }, // UCD Revision 28
+ { QChar::Unicode_15_0, "15.0" }, // UCD Revision 30
+ { QChar::Unicode_15_1, "15.1" }, // UCD Revision 32
{ QChar::Unicode_Unassigned, 0 }
};
AgeMap *d = ageMap;
@@ -81,6 +61,47 @@ static void initAgeMap()
}
}
+// Declaration of EastAsianWidth as written verbatim into the generated output
+// by main(). It must list the same enumerators, in the same order, as the
+// generator's own EastAsianWidth enum below: the property table stores the
+// numeric value of that enum and the generated code casts it back.
+static const char *east_asian_width_string =
+R"(enum class EastAsianWidth : unsigned int {
+ A,
+ F,
+ H,
+ N,
+ Na,
+ W,
+};
+
+)";
+
+// East_Asian_Width property value of a code point, named after the
+// abbreviations used in EastAsianWidth.txt (per UAX #11: A = Ambiguous,
+// F = Fullwidth, H = Halfwidth, N = Neutral, Na = Narrow, W = Wide).
+enum class EastAsianWidth : unsigned int {
+ A,
+ F,
+ H,
+ N,
+ Na,
+ W,
+};
+
+// Maps the property value text read from EastAsianWidth.txt to the enum;
+// filled by initEastAsianWidthMap().
+static QHash<QByteArray, EastAsianWidth> eastAsianWidthMap;
+
+// Fills eastAsianWidthMap with one entry per EastAsianWidth enumerator, keyed
+// by the abbreviation used for it in EastAsianWidth.txt.
+static void initEastAsianWidthMap()
+{
+    struct Entry {
+        const char *abbreviation;
+        EastAsianWidth value;
+    };
+    constexpr Entry table[] = {
+        { "A", EastAsianWidth::A },
+        { "F", EastAsianWidth::F },
+        { "H", EastAsianWidth::H },
+        { "N", EastAsianWidth::N },
+        { "Na", EastAsianWidth::Na },
+        { "W", EastAsianWidth::W },
+    };
+
+    for (const Entry &entry : table)
+        eastAsianWidthMap.insert(QByteArray(entry.abbreviation), entry.value);
+}
+
static QHash<QByteArray, QChar::Category> categoryMap;
static void initCategoryMap()
@@ -290,10 +311,7 @@ static const char *grapheme_break_class_string =
" GraphemeBreak_T,\n"
" GraphemeBreak_LV,\n"
" GraphemeBreak_LVT,\n"
- " Graphemebreak_E_Base,\n"
- " Graphemebreak_E_Modifier,\n"
- " Graphemebreak_Glue_After_Zwj,\n"
- " Graphemebreak_E_Base_GAZ,\n"
+ " GraphemeBreak_Extended_Pictographic,\n"
"\n"
" NumGraphemeBreakClasses\n"
"};\n\n";
@@ -313,10 +331,7 @@ enum GraphemeBreakClass {
GraphemeBreak_T,
GraphemeBreak_LV,
GraphemeBreak_LVT,
- Graphemebreak_E_Base,
- Graphemebreak_E_Modifier,
- Graphemebreak_Glue_After_Zwj,
- Graphemebreak_E_Base_GAZ,
+ GraphemeBreak_Extended_Pictographic,
GraphemeBreak_Unassigned
};
@@ -343,11 +358,8 @@ static void initGraphemeBreak()
{ GraphemeBreak_T, "T" },
{ GraphemeBreak_LV, "LV" },
{ GraphemeBreak_LVT, "LVT" },
- { Graphemebreak_E_Base, "E_Base" },
- { Graphemebreak_E_Modifier, "E_Modifier" },
- { Graphemebreak_Glue_After_Zwj, "Glue_After_Zwj" },
- { Graphemebreak_E_Base_GAZ, "E_Base_GAZ" },
- { GraphemeBreak_Unassigned, 0 }
+ { GraphemeBreak_Extended_Pictographic, "Extended_Pictographic" },
+ { GraphemeBreak_Unassigned, nullptr }
};
GraphemeBreakList *d = breaks;
while (d->name) {
@@ -377,10 +389,6 @@ static const char *word_break_class_string =
" WordBreak_MidNum,\n"
" WordBreak_Numeric,\n"
" WordBreak_ExtendNumLet,\n"
- " WordBreak_E_Base,\n"
- " WordBreak_E_Modifier,\n"
- " WordBreak_Glue_After_Zwj,\n"
- " WordBreak_E_Base_GAZ,\n"
" WordBreak_WSegSpace,\n"
"\n"
" NumWordBreakClasses\n"
@@ -405,10 +413,6 @@ enum WordBreakClass {
WordBreak_MidNum,
WordBreak_Numeric,
WordBreak_ExtendNumLet,
- WordBreak_E_Base,
- WordBreak_E_Modifier,
- WordBreak_Glue_After_Zwj,
- WordBreak_E_Base_GAZ,
WordBreak_WSegSpace,
WordBreak_Unassigned
@@ -440,10 +444,6 @@ static void initWordBreak()
{ WordBreak_MidNum, "MidNum" },
{ WordBreak_Numeric, "Numeric" },
{ WordBreak_ExtendNumLet, "ExtendNumLet" },
- { WordBreak_E_Base, "E_Base" },
- { WordBreak_E_Modifier, "E_Modifier" },
- { WordBreak_Glue_After_Zwj, "Glue_After_Zwj" },
- { WordBreak_E_Base_GAZ, "E_Base_GAZ" },
{ WordBreak_WSegSpace, "WSegSpace" },
{ WordBreak_Unassigned, 0 }
};
@@ -529,9 +529,11 @@ static void initSentenceBreak()
static const char *line_break_class_string =
"// see http://www.unicode.org/reports/tr14/tr14-30.html\n"
- "// we don't use the XX and AI classes and map them to AL instead.\n"
+ "// we don't use the XX, AK, AP, AS and AI classes and map them to AL instead.\n"
+ "// VI and VF classes are mapped to CM.\n"
"enum LineBreakClass {\n"
- " LineBreak_OP, LineBreak_CL, LineBreak_CP, LineBreak_QU, LineBreak_GL,\n"
+ " LineBreak_OP, LineBreak_CL, LineBreak_CP,\n"
+ " LineBreak_QU, LineBreak_QU_Pi, LineBreak_QU_Pf, LineBreak_GL,\n"
" LineBreak_NS, LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR,\n"
" LineBreak_PO, LineBreak_NU, LineBreak_AL, LineBreak_HL, LineBreak_ID,\n"
" LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,\n"
@@ -545,7 +547,8 @@ static const char *line_break_class_string =
"};\n\n";
enum LineBreakClass {
- LineBreak_OP, LineBreak_CL, LineBreak_CP, LineBreak_QU, LineBreak_GL,
+ LineBreak_OP, LineBreak_CL, LineBreak_CP,
+ LineBreak_QU, LineBreak_QU_Pi, LineBreak_QU_Pf, LineBreak_GL,
LineBreak_NS, LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR,
LineBreak_PO, LineBreak_NU, LineBreak_AL, LineBreak_HL, LineBreak_ID,
LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,
@@ -613,6 +616,11 @@ static void initLineBreak()
{ LineBreak_EB, "EB" },
{ LineBreak_EM, "EM" },
{ LineBreak_ZWJ, "ZWJ" },
+ { LineBreak_AL, "AK" },
+ { LineBreak_AL, "AP" },
+ { LineBreak_AL, "AS" },
+ { LineBreak_CM, "VI" },
+ { LineBreak_CM, "VF" },
{ LineBreak_Unassigned, 0 }
};
LineBreakList *d = breaks;
@@ -804,6 +812,17 @@ static void initScriptMap()
{ QChar::Script_KhitanSmallScript, "KhitanSmallScript" },
{ QChar::Script_Yezidi, "Yezidi" },
+ // 14.0
+ { QChar::Script_CyproMinoan, "CyproMinoan"},
+ { QChar::Script_OldUyghur, "OldUyghur"},
+ { QChar::Script_Tangsa, "Tangsa"},
+ { QChar::Script_Toto, "Toto"},
+ { QChar::Script_Vithkuqi, "Vithkuqi"},
+
+ // 15.0
+ { QChar::Script_Kawi, "Kawi"},
+ { QChar::Script_NagMundari, "NagMundari"},
+
// unhandled
{ QChar::Script_Unknown, 0 }
};
@@ -814,6 +833,59 @@ static void initScriptMap()
}
}
+// IDNA status as present in the data file (IdnaMappingTable.txt).
+// The first five enumerators are shared with IdnaStatus; the two STD3
+// variants exist only here and are turned into one of the other values
+// by resolveIdnaStatus().
+enum class IdnaRawStatus : unsigned int {
+ Disallowed,
+ Valid,
+ Ignored,
+ Mapped,
+ Deviation,
+ DisallowedStd3Valid,
+ DisallowedStd3Mapped,
+};
+
+// Maps the status text read from the data file to its IdnaRawStatus value;
+// filled by initIdnaStatusMap().
+static QHash<QByteArray, IdnaRawStatus> idnaStatusMap;
+
+// Fills idnaStatusMap with the status keywords that can appear in the second
+// field of IdnaMappingTable.txt.
+static void initIdnaStatusMap()
+{
+    struct Entry {
+        const char *keyword;
+        IdnaRawStatus value;
+    };
+    const Entry table[] = {
+        { "disallowed", IdnaRawStatus::Disallowed },
+        { "valid", IdnaRawStatus::Valid },
+        { "ignored", IdnaRawStatus::Ignored },
+        { "mapped", IdnaRawStatus::Mapped },
+        { "deviation", IdnaRawStatus::Deviation },
+        { "disallowed_STD3_valid", IdnaRawStatus::DisallowedStd3Valid },
+        { "disallowed_STD3_mapped", IdnaRawStatus::DisallowedStd3Mapped },
+    };
+
+    for (const Entry &entry : table)
+        idnaStatusMap.insert(QByteArray(entry.keyword), entry.value);
+}
+
+// Declaration of IdnaStatus as written verbatim into the generated output by
+// main(). It must list the same enumerators, in the same order, as the
+// generator's own IdnaStatus enum below, because the property table stores
+// the numeric value of that enum.
+static const char *idna_status_string =
+ "enum class IdnaStatus : unsigned int {\n"
+ " Disallowed,\n"
+ " Valid,\n"
+ " Ignored,\n"
+ " Mapped,\n"
+ " Deviation\n"
+ "};\n\n";
+
+// Resolved IDNA status as it goes into the database.
+// Qt extends host name validity rules to allow underscores
+// NOTE: The members here should come in the same order and have the same values
+// as in IdnaRawStatus, since resolveIdnaStatus() converts between the two
+// with a plain static_cast.
+enum class IdnaStatus : unsigned int {
+ Disallowed,
+ Valid,
+ Ignored,
+ Mapped,
+ Deviation,
+};
+
// Keep this one in sync with the code in createPropertyInfo
static const char *property_string =
"enum Case {\n"
@@ -832,7 +904,8 @@ static const char *property_string =
" ushort joining : 3;\n"
" signed short digitValue : 5;\n"
" signed short mirrorDiff : 16;\n"
- " ushort unicodeVersion : 8; /* 5 used */\n"
+ " ushort unicodeVersion : 5; /* 5 used */\n"
+ " ushort eastAsianWidth : 3; /* 3 used */\n"
" ushort nfQuickCheck : 8;\n" // could be narrowed
"#ifdef Q_OS_WASM\n"
" unsigned char : 0; //wasm 64 packing trick\n"
@@ -847,7 +920,8 @@ static const char *property_string =
" ushort graphemeBreakClass : 5; /* 5 used */\n"
" ushort wordBreakClass : 5; /* 5 used */\n"
" ushort lineBreakClass : 6; /* 6 used */\n"
- " ushort sentenceBreakClass : 8; /* 4 used */\n"
+ " ushort sentenceBreakClass : 4; /* 4 used */\n"
+ " ushort idnaStatus : 4; /* 3 used */\n"
" ushort script : 8;\n"
"};\n\n"
"Q_CORE_EXPORT const Properties * QT_FASTCALL properties(char32_t ucs4) noexcept;\n"
@@ -870,6 +944,18 @@ static const char *methods =
"Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(char32_t ucs4) noexcept;\n"
"inline LineBreakClass lineBreakClass(QChar ch) noexcept\n"
"{ return lineBreakClass(ch.unicode()); }\n"
+ "\n"
+ "Q_CORE_EXPORT IdnaStatus QT_FASTCALL idnaStatus(char32_t ucs4) noexcept;\n"
+ "inline IdnaStatus idnaStatus(QChar ch) noexcept\n"
+ "{ return idnaStatus(ch.unicode()); }\n"
+ "\n"
+ "Q_CORE_EXPORT QStringView QT_FASTCALL idnaMapping(char32_t usc4) noexcept;\n"
+ "inline QStringView idnaMapping(QChar ch) noexcept\n"
+ "{ return idnaMapping(ch.unicode()); }\n"
+ "\n"
+ "Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept;\n"
+ "inline EastAsianWidth eastAsianWidth(QChar ch) noexcept\n"
+ "{ return eastAsianWidth(ch.unicode()); }\n"
"\n";
static const int SizeOfPropertiesStruct = 20;
@@ -892,6 +978,7 @@ struct PropertyFlags {
&& direction == o.direction
&& joining == o.joining
&& age == o.age
+ && eastAsianWidth == o.eastAsianWidth
&& digitValue == o.digitValue
&& mirrorDiff == o.mirrorDiff
&& lowerCaseDiff == o.lowerCaseDiff
@@ -908,6 +995,7 @@ struct PropertyFlags {
&& lineBreakClass == o.lineBreakClass
&& script == o.script
&& nfQuickCheck == o.nfQuickCheck
+ && idnaStatus == o.idnaStatus
);
}
// from UnicodeData.txt
@@ -918,6 +1006,8 @@ struct PropertyFlags {
QChar::JoiningType joining : 3;
// from DerivedAge.txt
QChar::UnicodeVersion age : 5;
+ // From EastAsianWidth.txt
+ EastAsianWidth eastAsianWidth = EastAsianWidth::N;
int digitValue = -1;
int mirrorDiff : 16;
@@ -937,6 +1027,7 @@ struct PropertyFlags {
int script = QChar::Script_Unknown;
// from DerivedNormalizationProps.txt
uchar nfQuickCheck = 0;
+ IdnaStatus idnaStatus = IdnaStatus::Disallowed;
};
@@ -1091,6 +1182,8 @@ struct UnicodeData {
// computed position of unicode property set
int propertyIndex = -1;
+
+ IdnaRawStatus idnaRawStatus = IdnaRawStatus::Disallowed;
};
static QList<UnicodeData> unicodeData;
@@ -1453,6 +1546,52 @@ static void readDerivedAge()
}
}
+// Parses data/EastAsianWidth.txt and stores the width of every listed code
+// point (or code point range) in its UnicodeData entry.
+static void readEastAsianWidth()
+{
+    qDebug("Reading EastAsianWidth.txt");
+
+    QFile file("data/EastAsianWidth.txt");
+    if (!file.exists() || !file.open(QFile::ReadOnly))
+        qFatal("Couldn't find or read EastAsianWidth.txt");
+
+    while (!file.atEnd()) {
+        // Drop the trailing comment, if any, and normalize the whitespace.
+        QByteArray line = file.readLine().trimmed();
+        const int commentStart = line.indexOf('#');
+        if (commentStart >= 0)
+            line = line.left(commentStart);
+        line = line.simplified();
+        if (line.isEmpty())
+            continue;
+
+        const QList<QByteArray> fields = line.split(';');
+        Q_ASSERT(fields.size() == 2);
+
+        // Collapse ".." into "." so a range can be split on a single character.
+        const QByteArray codePoints = fields[0].trimmed().replace("..", ".");
+        const QList<QByteArray> bounds = codePoints.split('.');
+        Q_ASSERT(bounds.size() >= 1 && bounds.size() <= 2);
+
+        const QByteArray widthString = fields[1].trimmed();
+        if (!eastAsianWidthMap.contains(widthString)) {
+            qFatal("Unhandled EastAsianWidth property value for %s: %s",
+                   qPrintable(codePoints), qPrintable(widthString));
+        }
+        const EastAsianWidth width = eastAsianWidthMap.value(widthString);
+
+        // A single code point is treated as a range of length one.
+        bool ok = false;
+        const int first = bounds[0].toInt(&ok, 16);
+        int last = first;
+        if (ok && bounds.size() == 2)
+            last = bounds[1].toInt(&ok, 16);
+        Q_ASSERT(ok);
+
+        for (int codepoint = first; codepoint <= last; ++codepoint) {
+            UnicodeData &data = UnicodeData::valueRef(codepoint);
+            // Still at the default value, i.e. no earlier range covered it.
+            Q_ASSERT(data.p.eastAsianWidth == EastAsianWidth::N);
+            data.p.eastAsianWidth = width;
+        }
+    }
+}
+
static void readDerivedNormalizationProps()
{
qDebug("Reading DerivedNormalizationProps.txt");
@@ -1580,7 +1719,7 @@ static QByteArray createNormalizationCorrections()
" int version;\n"
"};\n\n"
- "static const NormalizationCorrection uc_normalization_corrections[] = {\n";
+ "static constexpr NormalizationCorrection uc_normalization_corrections[] = {\n";
int maxVersion = 0;
int numCorrections = 0;
@@ -1893,6 +2032,59 @@ static void readGraphemeBreak()
}
}
+// Parses data/emoji-data.txt and marks every code point that has the
+// Extended_Pictographic property with GraphemeBreak_Extended_Pictographic.
+// All other properties listed in the file are ignored.
+static void readEmojiData()
+{
+    qDebug("Reading emoji-data.txt");
+
+    QFile f("data/emoji-data.txt");
+    if (!f.open(QFile::ReadOnly))
+        qFatal("Couldn't find emoji-data.txt");
+
+    while (!f.atEnd()) {
+        // Read the whole line into a QByteArray rather than a fixed-size
+        // buffer: this copes with a read error, with long lines and with a
+        // final line that lacks a newline, none of which the former
+        // "resize(len - 1)" handled.
+        QByteArray line = f.readLine().trimmed();
+
+        int comment = line.indexOf('#');
+        if (comment >= 0)
+            line = line.left(comment);
+        line.replace(" ", "");
+
+        if (line.isEmpty())
+            continue;
+
+        QList<QByteArray> l = line.split(';');
+        Q_ASSERT(l.size() == 2);
+
+        // NOTE: for the moment we process emoji_data only to extract
+        // the code points with Extended_Pictographic. This is needed by
+        // extended grapheme clustering (cf. the GB11 rule in UAX #29).
+        if (l[1] != "Extended_Pictographic")
+            continue;
+
+        // That would be split(".."), but that API does not exist.
+        QByteArray codes = l[0];
+        codes.replace("..", ".");
+        QList<QByteArray> cl = codes.split('.');
+        Q_ASSERT(cl.size() >= 1 && cl.size() <= 2);
+
+        bool ok;
+        int from = cl[0].toInt(&ok, 16);
+        Q_ASSERT(ok);
+        int to = from;
+        if (cl.size() == 2) {
+            to = cl[1].toInt(&ok, 16);
+            Q_ASSERT(ok);
+        }
+
+        for (int codepoint = from; codepoint <= to; ++codepoint) {
+            UnicodeData &ud = UnicodeData::valueRef(codepoint);
+            // Check we're not overwriting the data from GraphemeBreakProperty.txt...
+            Q_ASSERT(ud.p.graphemeBreakClass == GraphemeBreak_Any);
+            ud.p.graphemeBreakClass = GraphemeBreak_Extended_Pictographic;
+        }
+    }
+}
+
static void readWordBreak()
{
qDebug("Reading WordBreakProperty.txt");
@@ -2248,6 +2440,367 @@ static void readScripts()
}
}
+// Mapping targets of the non-ASCII code points that have a non-empty mapping,
+// keyed by code point; filled by readIdnaMappingTable().
+static QMap<char32_t, QString> idnaMappingTable;
+
+/*
+    Parses data/IdnaMappingTable.txt.
+
+    Stores the raw IDNA status of every listed code point in its UnicodeData
+    entry and records the mapping targets in idnaMappingTable.
+*/
+static void readIdnaMappingTable()
+{
+    qDebug("Reading IdnaMappingTable.txt");
+
+    QFile file("data/IdnaMappingTable.txt");
+    if (!file.exists() || !file.open(QFile::ReadOnly))
+        qFatal("Couldn't find or read IdnaMappingTable.txt");
+
+    while (!file.atEnd()) {
+        // Drop the trailing comment, if any, and normalize the whitespace.
+        QByteArray line = file.readLine().trimmed();
+        const int commentStart = line.indexOf('#');
+        if (commentStart >= 0)
+            line = line.left(commentStart);
+        line = line.simplified();
+        if (line.isEmpty())
+            continue;
+
+        const QList<QByteArray> fields = line.split(';');
+        Q_ASSERT(fields.size() >= 2);
+
+        // Collapse ".." into "." so a range can be split on a single character.
+        const QByteArray codePoints = fields[0].trimmed().replace("..", ".");
+        const QList<QByteArray> bounds = codePoints.split('.');
+        Q_ASSERT(bounds.size() >= 1 && bounds.size() <= 2);
+
+        const QByteArray statusString = fields[1].trimmed();
+        if (!idnaStatusMap.contains(statusString)) {
+            qFatal("Unhandled IDNA status property value for %s: %s",
+                   qPrintable(codePoints), qPrintable(statusString));
+        }
+        const IdnaRawStatus rawStatus = idnaStatusMap.value(statusString);
+
+        // A single code point is treated as a range of length one.
+        bool rangeOk = false;
+        const int first = bounds[0].toInt(&rangeOk, 16);
+        int last = first;
+        if (rangeOk && bounds.size() == 2)
+            last = bounds[1].toInt(&rangeOk, 16);
+        Q_ASSERT(rangeOk);
+
+        // Only these three statuses carry a third field with the mapping.
+        const bool carriesMapping = rawStatus == IdnaRawStatus::Mapped
+                || rawStatus == IdnaRawStatus::Deviation
+                || rawStatus == IdnaRawStatus::DisallowedStd3Mapped;
+
+        QString mapping;
+        if (carriesMapping) {
+            Q_ASSERT(fields.size() >= 3);
+
+            const QList<QByteArray> targets = fields[2].trimmed().split(' ');
+            for (const QByteArray &target : targets) {
+                if (target.isEmpty())
+                    continue;
+                bool targetOk = false;
+                const int value = target.toInt(&targetOk, 16);
+                Q_ASSERT_X(targetOk, "readIdnaMappingTable", qPrintable(line));
+                for (auto c : QChar::fromUcs4(value))
+                    mapping.append(c);
+            }
+
+            // Some deviations have empty mappings, others should not...
+            if (mapping.isEmpty()) {
+                Q_ASSERT(rawStatus == IdnaRawStatus::Deviation);
+                qDebug() << " Empty IDNA mapping for" << codePoints;
+            }
+        }
+
+        // ASCII codepoints are not entered into the table because they are
+        // processed in separate optimized code paths that do not use it.
+        const bool storeMapping = !mapping.isEmpty();
+        for (int codepoint = first; codepoint <= last; ++codepoint) {
+            UnicodeData &data = UnicodeData::valueRef(codepoint);
+            // Still at the default value, i.e. no earlier range covered it.
+            Q_ASSERT(data.idnaRawStatus == IdnaRawStatus::Disallowed);
+            data.idnaRawStatus = rawStatus;
+
+            if (storeMapping && codepoint >= 0x80)
+                idnaMappingTable[codepoint] = mapping;
+        }
+    }
+}
+
+/*
+    Turns the raw IDNA status of every code point into the final IdnaStatus,
+    deciding where STD3 violations are tolerated.
+
+    STD3 rules normally prohibit underscores, but Qt allows them inside URLs
+    (see QTBUG-7434 for example): they are required by the DNS-SD protocol and
+    permitted by the SMB protocol. The underscore is therefore made Valid, and
+    a code point whose mapping consists only of Valid code points (underscore
+    included) becomes Mapped. All other STD3-restricted code points end up
+    Disallowed.
+*/
+static void resolveIdnaStatus()
+{
+    qDebug("resolveIdnaStatus:");
+
+    UnicodeData::valueRef(u'_').idnaRawStatus = IdnaRawStatus::Valid;
+
+    // True if every code point of the mapping has the raw status Valid.
+    const auto mapsToValidOnly = [](const QString &mapping) {
+        QStringIterator it(mapping);
+        while (it.hasNext()) {
+            if (UnicodeData::valueRef(it.next()).idnaRawStatus != IdnaRawStatus::Valid)
+                return false;
+        }
+        return true;
+    };
+
+    for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
+        UnicodeData &data = UnicodeData::valueRef(codepoint);
+        const IdnaRawStatus raw = data.idnaRawStatus;
+
+        if (raw == IdnaRawStatus::DisallowedStd3Valid) {
+            data.p.idnaStatus = IdnaStatus::Disallowed;
+        } else if (raw == IdnaRawStatus::DisallowedStd3Mapped) {
+            Q_ASSERT(idnaMappingTable.contains(codepoint));
+            if (mapsToValidOnly(idnaMappingTable[codepoint])) {
+                qDebug() << " Allowing" << Qt::hex << codepoint;
+                data.p.idnaStatus = IdnaStatus::Mapped;
+            } else {
+                // A disallowed code point needs no entry in the mapping table.
+                data.p.idnaStatus = IdnaStatus::Disallowed;
+                idnaMappingTable.remove(codepoint);
+            }
+        } else {
+            // Disallowed, Valid, Ignored, Mapped and Deviation have the same
+            // numeric value in both enums.
+            data.p.idnaStatus = static_cast<IdnaStatus>(raw);
+        }
+    }
+}
+
+/*
+    Returns the length of the longest suffix of left that is also a prefix of
+    right, or 0 if there is none.
+
+    Neither input may be a substring of the other, so only lengths shorter
+    than both strings are tried.
+*/
+static qsizetype overlap(const QString &left, const QString &right)
+{
+    qsizetype length = std::min(left.size(), right.size()) - 1;
+    while (length > 0) {
+        if (left.last(length) == right.first(length))
+            return length;
+        --length;
+    }
+    return 0;
+}
+
+// Node of the overlap graph: the index of a string in the list passed to
+// buildSuperstring().
+using GraphNode = unsigned int;
+
+// Directed edge of the overlap graph built by buildSuperstring().
+struct OverlapGraphEdge
+{
+ GraphNode start; // node whose string comes first
+ GraphNode end; // node whose string follows
+ qsizetype overlap; // overlap(inputs[start], inputs[end])
+};
+
+/*
+    Returns a common superstring of all inputs, or an empty string if there
+    are no inputs.
+
+    Ideally this function would return the superstring of the smallest
+    possible size, but the shortest common superstring problem is known to be
+    NP-hard so an approximation must be used here.
+
+    This function implements the greedy algorithm for building the superstring.
+
+    As an optimization this function is allowed to destroy its inputs.
+*/
+static QString buildSuperstring(QList<QString> &inputs)
+{
+    // Nothing to combine. Without this guard the code below would form an
+    // iterator past the end of the list and index into an empty list.
+    if (inputs.isEmpty())
+        return QString();
+
+    // Ensure that the inputs don't contain substrings.
+    // First, sort the array by length to make substring removal easier.
+    std::sort(inputs.begin(), inputs.end(), [](const QString &a, const QString &b) {
+        return a.size() == b.size() ? a > b : a.size() > b.size();
+    });
+
+    // Remove duplicates and other substrings. Thanks to the sort order, a
+    // string can only be contained in one that comes before it.
+    for (auto i = inputs.begin() + 1; i != inputs.end();) {
+        bool isSubstring = std::any_of(inputs.begin(), i, [i](const QString &s) {
+            return s.contains(*i);
+        });
+        i = isSubstring ? inputs.erase(i) : i + 1;
+    }
+
+    // Build overlap graph for the remaining inputs. It is fully-connected.
+    QList<OverlapGraphEdge> graphEdges;
+    graphEdges.reserve(inputs.size() * (inputs.size() - 1));
+
+    for (GraphNode i = 0; i < inputs.size(); i++) {
+        for (GraphNode j = 0; j < inputs.size(); j++) {
+            if (i != j)
+                graphEdges.append(OverlapGraphEdge {i, j, overlap(inputs[i], inputs[j])});
+        }
+    }
+
+    // Build a Hamiltonian path through the overlap graph, taking nodes with highest overlap
+    // first. Ties are broken by node indices to keep the output deterministic.
+    std::sort(graphEdges.begin(), graphEdges.end(), [](const auto &a, const auto &b) {
+        return a.overlap == b.overlap
+                ? a.start == b.start ? a.end < b.end : a.start < b.start
+                : a.overlap > b.overlap;
+    });
+
+    QBitArray starts(inputs.size()); // nodes that already have an outgoing path edge
+    QBitArray ends(inputs.size()); // nodes that already have an incoming path edge
+    QMap<GraphNode, OverlapGraphEdge> pathEdges; // chosen edges, keyed by start node
+
+    auto createsCycle = [&](const OverlapGraphEdge &edge) {
+        // A cycle can only be closed by joining the tail of a partial path
+        // (edge.start) to the head of a partial path (edge.end).
+        if (!starts[edge.end] || !ends[edge.start])
+            return false;
+        Q_ASSERT(!pathEdges.contains(edge.start)); // Caller checks it's not yet a start.
+
+        // Follow the partial path from edge.end and see where it stops.
+        GraphNode node = edge.end;
+        while (pathEdges.contains(node))
+            node = pathEdges[node].end;
+
+        return node == edge.start;
+    };
+
+    for (const auto &edge : graphEdges) {
+        if (!starts[edge.start] && !ends[edge.end] && !createsCycle(edge)) {
+            starts.setBit(edge.start);
+            ends.setBit(edge.end);
+            pathEdges[edge.start] = edge;
+            // A path through all nodes has one edge fewer than there are nodes.
+            if (pathEdges.size() == inputs.size() - 1)
+                break;
+        }
+    }
+
+    Q_ASSERT(ends.count(false) == 1);
+    Q_ASSERT(starts.count(false) == 1);
+
+    // Find the start node of the path: the only node without an incoming edge.
+    GraphNode node = 0;
+    while (node < ends.size() && ends[node])
+        node++;
+    Q_ASSERT(node < ends.size());
+
+    QString superstring = inputs[node];
+    qsizetype pathNodes = 1; // Count path nodes for sanity check
+
+    while (pathEdges.contains(node)) {
+        const auto &edge = pathEdges[node];
+        Q_ASSERT(edge.start == node);
+
+        // Append only the part of the next string not covered by the overlap.
+        superstring.append(QStringView { inputs[edge.end] }.sliced(edge.overlap));
+
+        node = edge.end;
+        pathNodes++;
+    }
+    Q_ASSERT(pathNodes == inputs.size());
+
+    return superstring;
+}
+
+/*
+    Generates the IDNA mapping tables and the idnaMapping() lookup function.
+
+    The mapping table is an array of IdnaMapEntry instances sorted by
+    codePoint. A mapping of at most two UTF-16 code units is stored inline in
+    the entry's ucs[] array. A longer mapping is stored in the idnaMappingData
+    array and ucs[0] holds its offset there.
+
+    idnaMappingData is a common superstring of all mappings longer than two
+    code units, as built by buildSuperstring(), so that mappings which overlap
+    or contain one another share storage.
+
+    Returns the generated C++ source text.
+*/
+static QByteArray createIdnaMapping()
+{
+    qDebug("createIdnaMapping:");
+
+    // Read-only view of the table: keeps all accesses below on const overloads.
+    const QMap<char32_t, QString> &table = idnaMappingTable;
+
+    QList<QString> values;
+    values.reserve(table.size());
+    qsizetype uncompressedSize = 0;
+
+    // Iterate over the map itself; values() would copy every mapping into a
+    // temporary list first.
+    for (const QString &v : table) {
+        if (v.size() > 2) {
+            values.append(v);
+            uncompressedSize += v.size();
+        }
+    }
+
+    QString idnaMappingData = buildSuperstring(values);
+    qDebug() << " uncompressed size:" << uncompressedSize << "characters";
+    qDebug() << " consolidated size:" << idnaMappingData.size() << "characters";
+
+    qsizetype memoryUsage = 0;
+
+    QByteArray out =
+        "static constexpr char16_t idnaMappingData[] = {";
+
+    int col = 0;
+    for (auto c : idnaMappingData) {
+        if (col == 0)
+            out += "\n ";
+        out += " 0x" + QByteArray::number(c.unicode(), 16) + ",";
+        col = (col + 1) % 12;
+        memoryUsage += 2;
+    }
+    out += "\n};\n\n";
+
+    // Check if the offsets fit into the char16_t of IdnaMapEntry below.
+    Q_ASSERT(idnaMappingData.size() < (1 << 16));
+
+    // This could be written more elegantly with a union and designated initializers,
+    // but designated initializers is a C++20 feature
+    out +=
+        "struct IdnaMapEntry {\n"
+        " // 21 bits suffice for any valid code-point (LastValidCodePoint = 0x10ffff)\n"
+        " unsigned codePoint : 24;\n"
+        " unsigned size : 8;\n"
+        " char16_t ucs[2]; // ucs[0] is offset if size > 2\n"
+        "};\n"
+        "static_assert(sizeof(IdnaMapEntry) == 8);\n\n"
+        "static constexpr IdnaMapEntry idnaMap[] = {\n";
+
+    for (auto i = table.keyValueBegin(); i != table.keyValueEnd(); ++i) {
+        const QString &mapping = i->second;
+        Q_ASSERT(!mapping.isEmpty());
+        // The length has to fit into the 8-bit size field of IdnaMapEntry.
+        Q_ASSERT(mapping.size() < (1 << 8));
+
+        qsizetype mappingIndex = idnaMappingData.indexOf(mapping);
+        Q_ASSERT(mappingIndex >= 0 || mapping.size() <= 2);
+
+        out += " { 0x" + QByteArray::number(i->first, 16) +
+               ", " + QByteArray::number(mapping.size());
+        if (mapping.size() <= 2) {
+            // Short mapping: stored inline, padded with 0.
+            out += ", { 0x" + QByteArray::number(mapping[0].unicode(), 16);
+            if (mapping.size() == 2)
+                out += ", 0x" + QByteArray::number(mapping[1].unicode(), 16);
+            else
+                out += ", 0";
+        } else {
+            // Long mapping: only its offset into idnaMappingData is stored.
+            out += ", { " + QByteArray::number(mappingIndex);
+            out += ", 0";
+        }
+        out += " } },\n";
+        memoryUsage += 8;
+    }
+
+    qDebug() << " memory usage:" << memoryUsage << "bytes";
+
+    out +=
+        "};\n\n"
+        "Q_CORE_EXPORT QStringView QT_FASTCALL idnaMapping(char32_t ucs4) noexcept\n"
+        "{\n"
+        " auto i = std::lower_bound(std::begin(idnaMap), std::end(idnaMap), ucs4,\n"
+        " [](const auto &p, char32_t c) { return p.codePoint < c; });\n"
+        " if (i == std::end(idnaMap) || i->codePoint != ucs4)\n"
+        " return {};\n\n"
+        " return QStringView(i->size > 2 ? idnaMappingData + i->ucs[0] : i->ucs, i->size);\n"
+        "}\n\n";
+
+    return out;
+}
+
#if 0
static void dump(int from, int to)
{
@@ -2376,7 +2929,7 @@ static QByteArray createPropertyInfo()
Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE +(SMP_END-BMP_END)/SMP_BLOCKSIZE); // 0x1870
Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
- QByteArray out = "static const unsigned short uc_property_trie[] = {\n";
+ QByteArray out = "static constexpr unsigned short uc_property_trie[] = {\n";
// First write the map from blockId to indices of unique blocks:
out += " // [0x0..0x" + QByteArray::number(BMP_END, 16) + ")";
for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
@@ -2429,7 +2982,7 @@ static QByteArray createPropertyInfo()
out.chop(2);
out += "\n};\n\n";
- out += "static const Properties uc_properties[] = {";
+ out += "static constexpr Properties uc_properties[] = {";
// keep in sync with the property declaration
for (int i = 0; i < uniqueProperties.size(); ++i) {
const PropertyFlags &p = uniqueProperties.at(i);
@@ -2452,9 +3005,12 @@ static QByteArray createPropertyInfo()
// " signed short mirrorDiff : 16;\n"
out += QByteArray::number( p.mirrorDiff );
out += ", ";
-// " ushort unicodeVersion : 8; /* 5 used */\n"
+// " ushort unicodeVersion : 5; /* 5 used */\n"
out += QByteArray::number( p.age );
out += ", ";
+// " ushort eastAsianWidth : 3; /* 3 used */\n"
+ out += QByteArray::number( static_cast<unsigned int>(p.eastAsianWidth) );
+ out += ", ";
// " ushort nfQuickCheck : 8;\n"
out += QByteArray::number( p.nfQuickCheck );
out += ", ";
@@ -2488,9 +3044,12 @@ static QByteArray createPropertyInfo()
out += ", ";
out += QByteArray::number( p.lineBreakClass );
out += ", ";
-// " ushort sentenceBreakClass : 8; /* 4 used */\n"
+// " ushort sentenceBreakClass : 4; /* 4 used */\n"
out += QByteArray::number( p.sentenceBreakClass );
out += ", ";
+// " ushort idnaStatus : 4; /* 3 used */\n"
+ out += QByteArray::number( static_cast<unsigned int>(p.idnaStatus) );
+ out += ", ";
// " ushort script : 8;\n"
out += QByteArray::number( p.script );
out += " },";
@@ -2551,6 +3110,16 @@ static QByteArray createPropertyInfo()
"{\n"
" return static_cast<LineBreakClass>(qGetProp(ucs4)->lineBreakClass);\n"
"}\n"
+ "\n"
+ "Q_CORE_EXPORT IdnaStatus QT_FASTCALL idnaStatus(char32_t ucs4) noexcept\n"
+ "{\n"
+ " return static_cast<IdnaStatus>(qGetProp(ucs4)->idnaStatus);\n"
+ "}\n"
+ "\n"
+ "Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept\n"
+ "{\n"
+ " return static_cast<EastAsianWidth>(qGetProp(ucs4)->eastAsianWidth);\n"
+ "}\n"
"\n";
return out;
@@ -2561,7 +3130,7 @@ static QByteArray createSpecialCaseMap()
qDebug("createSpecialCaseMap:");
QByteArray out
- = "static const unsigned short specialCaseMap[] = {\n"
+ = "static constexpr unsigned short specialCaseMap[] = {\n"
" 0x0, // placeholder";
int i = 1;
@@ -2577,7 +3146,7 @@ static QByteArray createSpecialCaseMap()
maxN = std::max(maxN, n);
}
out.chop(1);
- out += "\n};\n\nconst unsigned int MaxSpecialCaseLength = ";
+ out += "\n};\n\nconstexpr unsigned int MaxSpecialCaseLength = ";
out += QByteArray::number(maxN);
out += ";\n\n";
@@ -2709,7 +3278,7 @@ static QByteArray createCompositionInfo()
Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
- QByteArray out = "static const unsigned short uc_decomposition_trie[] = {\n";
+ QByteArray out = "static constexpr unsigned short uc_decomposition_trie[] = {\n";
// first write the map
out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
@@ -2773,7 +3342,7 @@ static QByteArray createCompositionInfo()
+ QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")] \\\n"
" : 0xffff)\n\n";
- out += "static const unsigned short uc_decomposition_map[] = {";
+ out += "static constexpr unsigned short uc_decomposition_map[] = {";
for (int i = 0; i < decompositions.size(); ++i) {
if (!(i % 8)) {
if (out.endsWith(' '))
@@ -2909,7 +3478,7 @@ static QByteArray createLigatureInfo()
Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
- QByteArray out = "static const unsigned short uc_ligature_trie[] = {\n";
+ QByteArray out = "static constexpr unsigned short uc_ligature_trie[] = {\n";
// first write the map
out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
@@ -2973,7 +3542,7 @@ static QByteArray createLigatureInfo()
+ QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")] \\\n"
" : 0xffff)\n\n";
- out += "static const unsigned short uc_ligature_map[] = {";
+ out += "static constexpr unsigned short uc_ligature_map[] = {";
for (int i = 0; i < ligatures.size(); ++i) {
if (!(i % 8)) {
if (out.endsWith(' '))
@@ -3006,6 +3575,7 @@ QByteArray createCasingInfo()
int main(int, char **)
{
initAgeMap();
+ initEastAsianWidthMap();
initCategoryMap();
initDecompositionMap();
initDirectionMap();
@@ -3015,20 +3585,26 @@ int main(int, char **)
initSentenceBreak();
initLineBreak();
initScriptMap();
+ initIdnaStatusMap();
readUnicodeData();
readBidiMirroring();
readArabicShaping();
readDerivedAge();
+ readEastAsianWidth();
readDerivedNormalizationProps();
readSpecialCasing();
readCaseFolding();
// readBlocks();
readScripts();
readGraphemeBreak();
+ readEmojiData();
readWordBreak();
readSentenceBreak();
readLineBreak();
+ readIdnaMappingTable();
+
+ resolveIdnaStatus();
computeUniqueProperties();
QByteArray properties = createPropertyInfo();
@@ -3036,46 +3612,12 @@ int main(int, char **)
QByteArray compositions = createCompositionInfo();
QByteArray ligatures = createLigatureInfo();
QByteArray normalizationCorrections = createNormalizationCorrections();
+ QByteArray idnaMapping = createIdnaMapping();
QByteArray header =
- "/****************************************************************************\n"
- "**\n"
- "** Copyright (C) 2020 The Qt Company Ltd.\n"
- "** Contact: https://www.qt.io/licensing/\n"
- "**\n"
- "** This file is part of the QtCore module of the Qt Toolkit.\n"
- "**\n"
- "** $QT_BEGIN_LICENSE:LGPL$\n"
- "** Commercial License Usage\n"
- "** Licensees holding valid commercial Qt licenses may use this file in\n"
- "** accordance with the commercial license agreement provided with the\n"
- "** Software or, alternatively, in accordance with the terms contained in\n"
- "** a written agreement between you and The Qt Company. For licensing terms\n"
- "** and conditions see https://www.qt.io/terms-conditions. For further\n"
- "** information use the contact form at https://www.qt.io/contact-us.\n"
- "**\n"
- "** GNU Lesser General Public License Usage\n"
- "** Alternatively, this file may be used under the terms of the GNU Lesser\n"
- "** General Public License version 3 as published by the Free Software\n"
- "** Foundation and appearing in the file LICENSE.LGPL3 included in the\n"
- "** packaging of this file. Please review the following information to\n"
- "** ensure the GNU Lesser General Public License version 3 requirements\n"
- "** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.\n"
- "**\n"
- "** GNU General Public License Usage\n"
- "** Alternatively, this file may be used under the terms of the GNU\n"
- "** General Public License version 2.0 or (at your option) the GNU General\n"
- "** Public license version 3 or any later version approved by the KDE Free\n"
- "** Qt Foundation. The licenses are as published by the Free Software\n"
- "** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3\n"
- "** included in the packaging of this file. Please review the following\n"
- "** information to ensure the GNU General Public License requirements will\n"
- "** be met: https://www.gnu.org/licenses/gpl-2.0.html and\n"
- "** https://www.gnu.org/licenses/gpl-3.0.html.\n"
- "**\n"
- "** $QT_END_LICENSE$\n"
- "**\n"
- "****************************************************************************/\n\n";
+ "// Copyright (C) 2020 The Qt Company Ltd.\n"
+ "// SPDX-License-Identifier: Unicode-3.0\n"
+ "\n";
QByteArray note =
"/* This file is autogenerated from the Unicode " DATA_VERSION_S " database. Do not edit */\n\n";
@@ -3105,6 +3647,7 @@ int main(int, char **)
f.write(ligatures);
f.write("\n");
f.write(normalizationCorrections);
+ f.write(idnaMapping);
f.write("} // namespace QUnicodeTables\n\n");
f.write("using namespace QUnicodeTables;\n\n");
f.write("QT_END_NAMESPACE\n");
@@ -3124,10 +3667,12 @@ int main(int, char **)
f.write("namespace QUnicodeTables {\n\n");
f.write(property_string);
f.write(sizeOfPropertiesStructCheck);
+ f.write(east_asian_width_string);
f.write(grapheme_break_class_string);
f.write(word_break_class_string);
f.write(sentence_break_class_string);
f.write(line_break_class_string);
+ f.write(idna_status_string);
f.write(methods);
f.write("} // namespace QUnicodeTables\n\n"
"QT_END_NAMESPACE\n\n"