summaryrefslogtreecommitdiffstats
path: root/util/unicode/main.cpp
diff options
context:
space:
mode:
authorIevgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io>2021-07-30 12:09:46 +0200
committerIevgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io>2021-08-26 16:55:05 +0200
commit2afe1a3c19239da0a7bf4dd578b334d8ff6903b3 (patch)
treef02522a7064a14b3d000ba24f06b0a1fcdfe8ef3 /util/unicode/main.cpp
parent0dbf73e3deb9727da0376abf131d764251969114 (diff)
unicode: Generate tables for IDNA/UTS #46
Update the Unicode data processing tool to generate properties and mapping tables needed to implement UTS #46 (https://unicode.org/reports/tr46/). The implementation extends the standard to allow usage of underscores in URLs. This is done for compatibility with DNS-SD and SMB protocols. The data file needed to generate the new properties was taken from https://www.unicode.org/Public/idna/13.0.0/IdnaMappingTable.txt Task-number: QTBUG-85323 Change-Id: I2c303bf8a08aefb18a7491fb9b55385563bfa219 Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
Diffstat (limited to 'util/unicode/main.cpp')
-rw-r--r--util/unicode/main.cpp273
1 files changed, 271 insertions, 2 deletions
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index e40acffd5c..15db12e242 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -805,6 +805,59 @@ static void initScriptMap()
}
}
+// IDNA status as present in the data file
+enum class IdnaRawStatus : unsigned int { // the first five members must keep the values of IdnaStatus: resolveIdnaStatus() casts between the two
+ Disallowed, // "disallowed" in the data file; also the initial value of UnicodeData::idnaRawStatus
+ Valid, // "valid"
+ Ignored, // "ignored"
+ Mapped, // "mapped"
+ Deviation, // "deviation"
+ DisallowedStd3Valid, // "disallowed_STD3_valid"; resolveIdnaStatus() turns it into IdnaStatus::Disallowed
+ DisallowedStd3Mapped, // "disallowed_STD3_mapped"; resolveIdnaStatus() turns it into Mapped or Disallowed
+};
+
+static QHash<QByteArray, IdnaRawStatus> idnaStatusMap;
+
+/*
+    Fills idnaStatusMap with the status keywords that readIdnaMappingTable()
+    looks up, each one keyed to its IdnaRawStatus value.
+*/
+static void initIdnaStatusMap()
+{
+    struct StatusKeyword {
+        const char *keyword;
+        IdnaRawStatus value;
+    };
+
+    static const StatusKeyword keywords[] = {
+        {"disallowed", IdnaRawStatus::Disallowed},
+        {"valid", IdnaRawStatus::Valid},
+        {"ignored", IdnaRawStatus::Ignored},
+        {"mapped", IdnaRawStatus::Mapped},
+        {"deviation", IdnaRawStatus::Deviation},
+        {"disallowed_STD3_valid", IdnaRawStatus::DisallowedStd3Valid},
+        {"disallowed_STD3_mapped", IdnaRawStatus::DisallowedStd3Mapped},
+    };
+
+    for (const StatusKeyword &k : keywords)
+        idnaStatusMap.insert(QByteArray(k.keyword), k.value);
+}
+
+static const char *idna_status_string = // C++ text of enum IdnaStatus that main() writes out with f.write(); lists the same members as the IdnaStatus enum declared in this tool
+ "enum class IdnaStatus : unsigned int {\n"
+ " Disallowed,\n"
+ " Valid,\n"
+ " Ignored,\n"
+ " Mapped,\n"
+ " Deviation\n"
+ "};\n\n";
+
+// Resolved IDNA status as it goes into the database.
+// Qt extends host name validity rules to allow underscores
+// NOTE: The members here should come in the same order and have the same values
+// as in IdnaRawStatus. The same member list is also spelled out in
+// idna_status_string, which is what ends up in the generated code.
+enum class IdnaStatus : unsigned int {
+    Disallowed,
+    Valid,
+    Ignored,
+    Mapped,
+    Deviation,
+};
+
+// resolveIdnaStatus() converts IdnaRawStatus to IdnaStatus with a plain
+// static_cast, so the requirement from the note above is checked at compile
+// time instead of being left to the reader.
+static constexpr bool idnaStatusMatchesRaw(IdnaStatus resolved, IdnaRawStatus raw)
+{
+    return static_cast<unsigned int>(resolved) == static_cast<unsigned int>(raw);
+}
+
+static_assert(idnaStatusMatchesRaw(IdnaStatus::Disallowed, IdnaRawStatus::Disallowed)
+                  && idnaStatusMatchesRaw(IdnaStatus::Valid, IdnaRawStatus::Valid)
+                  && idnaStatusMatchesRaw(IdnaStatus::Ignored, IdnaRawStatus::Ignored)
+                  && idnaStatusMatchesRaw(IdnaStatus::Mapped, IdnaRawStatus::Mapped)
+                  && idnaStatusMatchesRaw(IdnaStatus::Deviation, IdnaRawStatus::Deviation),
+              "IdnaStatus members must have the same values as their IdnaRawStatus counterparts");
+
// Keep this one in sync with the code in createPropertyInfo
static const char *property_string =
"enum Case {\n"
@@ -838,7 +891,8 @@ static const char *property_string =
" ushort graphemeBreakClass : 5; /* 5 used */\n"
" ushort wordBreakClass : 5; /* 5 used */\n"
" ushort lineBreakClass : 6; /* 6 used */\n"
- " ushort sentenceBreakClass : 8; /* 4 used */\n"
+ " ushort sentenceBreakClass : 4; /* 4 used */\n"
+ " ushort idnaStatus : 4; /* 3 used */\n"
" ushort script : 8;\n"
"};\n\n"
"Q_CORE_EXPORT const Properties * QT_FASTCALL properties(char32_t ucs4) noexcept;\n"
@@ -861,6 +915,14 @@ static const char *methods =
"Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(char32_t ucs4) noexcept;\n"
"inline LineBreakClass lineBreakClass(QChar ch) noexcept\n"
"{ return lineBreakClass(ch.unicode()); }\n"
+ "\n"
+ "Q_CORE_EXPORT IdnaStatus QT_FASTCALL idnaStatus(char32_t ucs4) noexcept;\n"
+ "inline IdnaStatus idnaStatus(QChar ch) noexcept\n"
+ "{ return idnaStatus(ch.unicode()); }\n"
+ "\n"
+ "Q_CORE_EXPORT const char16_t * QT_FASTCALL idnaMapping(char32_t usc4) noexcept;\n"
+ "inline const char16_t *idnaMapping(QChar ch) noexcept\n"
+ "{ return idnaMapping(ch.unicode()); }\n"
"\n";
static const int SizeOfPropertiesStruct = 20;
@@ -899,6 +961,7 @@ struct PropertyFlags {
&& lineBreakClass == o.lineBreakClass
&& script == o.script
&& nfQuickCheck == o.nfQuickCheck
+ && idnaStatus == o.idnaStatus
);
}
// from UnicodeData.txt
@@ -928,6 +991,7 @@ struct PropertyFlags {
int script = QChar::Script_Unknown;
// from DerivedNormalizationProps.txt
uchar nfQuickCheck = 0;
+ IdnaStatus idnaStatus = IdnaStatus::Disallowed;
};
@@ -1082,6 +1146,8 @@ struct UnicodeData {
// computed position of unicode property set
int propertyIndex = -1;
+
+ IdnaRawStatus idnaRawStatus = IdnaRawStatus::Disallowed;
};
static QList<UnicodeData> unicodeData;
@@ -2292,6 +2358,194 @@ static void readScripts()
}
}
+static QMap<char32_t, QList<char32_t>> idnaMappingTable;
+
+/*
+    Reads data/IdnaMappingTable.txt.
+
+    Every line that is not empty after removing its '#' comment is split on
+    ';' into: a code point or a "first..last" range (hexadecimal), a status
+    keyword known to idnaStatusMap and, for the mapped, deviation and
+    disallowed_STD3_mapped statuses, a space separated list of hexadecimal
+    code points to map to.
+
+    The status is stored in UnicodeData::idnaRawStatus for each code point of
+    the range; non-empty mappings of non-ASCII code points are stored in
+    idnaMappingTable.
+
+    Malformed input is reported with qFatal() rather than Q_ASSERT(), so that
+    it is also rejected when assertions are compiled out; otherwise a short
+    line would lead to out-of-range accesses to the field list.
+*/
+static void readIdnaMappingTable()
+{
+    qDebug("Reading IdnaMappingTable.txt");
+
+    QFile f("data/IdnaMappingTable.txt");
+    if (!f.exists() || !f.open(QFile::ReadOnly))
+        qFatal("Couldn't find or read IdnaMappingTable.txt");
+
+    while (!f.atEnd()) {
+        QByteArray line = f.readLine().trimmed();
+
+        // Drop the trailing comment, if any, and collapse runs of whitespace.
+        const int comment = line.indexOf('#');
+        line = (comment < 0 ? line : line.left(comment)).simplified();
+
+        if (line.isEmpty())
+            continue;
+
+        const QList<QByteArray> fields = line.split(';');
+        if (fields.size() < 2)
+            qFatal("Malformed line in IdnaMappingTable.txt: %s", line.constData());
+
+        // That would be split(".."), but that API does not exist.
+        const QByteArray codePoints = fields[0].trimmed().replace("..", ".");
+        const QList<QByteArray> cl = codePoints.split('.');
+        if (cl.size() < 1 || cl.size() > 2)
+            qFatal("Malformed code point range in IdnaMappingTable.txt: %s", line.constData());
+
+        const QByteArray statusString = fields[1].trimmed();
+        if (!idnaStatusMap.contains(statusString))
+            qFatal("Unhandled IDNA status property value for %s: %s",
+                   qPrintable(codePoints), qPrintable(statusString));
+        const IdnaRawStatus rawStatus = idnaStatusMap.value(statusString);
+
+        // A single flag is reused for all hexadecimal conversions of this line.
+        bool ok = false;
+        const int first = cl[0].toInt(&ok, 16);
+        const int last = ok && cl.size() == 2 ? cl[1].toInt(&ok, 16) : first;
+        if (!ok || first < 0 || first > last || last > QChar::LastValidCodePoint)
+            qFatal("Invalid code point range in IdnaMappingTable.txt: %s", line.constData());
+
+        QList<char32_t> mapping;
+
+        switch (rawStatus) {
+        case IdnaRawStatus::Disallowed:
+        case IdnaRawStatus::Valid:
+        case IdnaRawStatus::Ignored:
+        case IdnaRawStatus::DisallowedStd3Valid:
+            // These statuses carry no mapping.
+            break;
+
+        case IdnaRawStatus::Mapped:
+        case IdnaRawStatus::Deviation:
+        case IdnaRawStatus::DisallowedStd3Mapped:
+            if (fields.size() < 3)
+                qFatal("Missing mapping field in IdnaMappingTable.txt: %s", line.constData());
+
+            for (const auto &s : fields[2].trimmed().split(' ')) {
+                if (s.isEmpty())
+                    continue;
+                const int val = s.toInt(&ok, 16);
+                if (!ok)
+                    qFatal("Invalid mapping value in IdnaMappingTable.txt: %s", line.constData());
+                mapping.append(static_cast<char32_t>(val));
+            }
+
+            // Some deviations have empty mappings, others should not...
+            if (mapping.isEmpty()) {
+                if (rawStatus != IdnaRawStatus::Deviation)
+                    qFatal("Unexpected empty mapping in IdnaMappingTable.txt: %s", line.constData());
+                qDebug() << " Empty IDNA mapping for" << codePoints;
+            }
+
+            break;
+        }
+
+        for (int codepoint = first; codepoint <= last; ++codepoint) {
+            UnicodeData &ud = UnicodeData::valueRef(codepoint);
+            // Ensure that ranges don't overlap: a code point that already has
+            // a non-default status was set by an earlier line.
+            if (ud.idnaRawStatus != IdnaRawStatus::Disallowed) {
+                qFatal("Overlapping ranges in IdnaMappingTable.txt at U+%04X",
+                       static_cast<unsigned int>(codepoint));
+            }
+            ud.idnaRawStatus = rawStatus;
+
+            // ASCII codepoints are skipped here because they are processed in separate
+            // optimized code paths that do not use this mapping table.
+            if (codepoint >= 0x80 && !mapping.isEmpty())
+                idnaMappingTable[codepoint] = mapping;
+        }
+    }
+}
+
+/*
+    Resolve IDNA status by deciding whether to allow STD3 violations
+
+    Underscores are normally prohibited by STD3 rules but Qt allows underscores
+    to be used inside URLs (see QTBUG-7434 for example). This code changes the
+    underscore status to Valid. The same is done to mapped codepoints that
+    map to underscores combined with other Valid codepoints.
+
+    Underscores in domain names are required when using DNS-SD protocol and they
+    are also allowed by the SMB protocol.
+
+    For every code point the raw status read from the data file is turned into
+    the IdnaStatus stored in its properties:
+    - statuses that exist in both enums are taken over unchanged;
+    - DisallowedStd3Valid becomes Disallowed;
+    - DisallowedStd3Mapped becomes Mapped when every code point of its mapping
+      is Valid, otherwise it becomes Disallowed and its entry is removed from
+      idnaMappingTable.
+*/
+static void resolveIdnaStatus()
+{
+    qDebug("resolveIdnaStatus:");
+
+    UnicodeData::valueRef(u'_').idnaRawStatus = IdnaRawStatus::Valid;
+
+    for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
+        UnicodeData &ud = UnicodeData::valueRef(codepoint);
+        switch (ud.idnaRawStatus) {
+        case IdnaRawStatus::Disallowed:
+        case IdnaRawStatus::Valid:
+        case IdnaRawStatus::Ignored:
+        case IdnaRawStatus::Deviation:
+        case IdnaRawStatus::Mapped:
+            // Relies on both enums using the same values for these members.
+            ud.p.idnaStatus = static_cast<IdnaStatus>(ud.idnaRawStatus);
+            break;
+        case IdnaRawStatus::DisallowedStd3Valid:
+            ud.p.idnaStatus = IdnaStatus::Disallowed;
+            break;
+        case IdnaRawStatus::DisallowedStd3Mapped: {
+            // Look the mapping up without operator[]: on a missing key that
+            // would insert an empty mapping, and an empty mapping would then
+            // pass the all_of() test below and be resolved to Mapped.
+            const auto entry = idnaMappingTable.constFind(codepoint);
+            if (entry == idnaMappingTable.constEnd()) {
+                qFatal("Missing IDNA mapping for disallowed_STD3_mapped code point U+%04X",
+                       static_cast<unsigned int>(codepoint));
+            }
+
+            const bool allow = std::all_of(entry->cbegin(), entry->cend(), [](auto c) {
+                return UnicodeData::valueRef(c).idnaRawStatus == IdnaRawStatus::Valid;
+            });
+
+            if (allow) {
+                qDebug() << " Allowing" << Qt::hex << codepoint;
+                ud.p.idnaStatus = IdnaStatus::Mapped;
+            } else {
+                ud.p.idnaStatus = IdnaStatus::Disallowed;
+                // 'entry' is not used past this point.
+                idnaMappingTable.remove(codepoint);
+            }
+            break;
+        }
+        }
+    }
+}
+
+/*
+    Generates the source text for the IDNA mapping data: the IdnaMapEntry
+    struct, the idnaMap array with one entry per key of idnaMappingTable (in
+    key order, as iterated from the QMap) and the idnaMapping() function that
+    searches the array with std::lower_bound.
+
+    Mappings are written as UTF-16 code units. All entries share one array
+    size, the longest mapping plus one, so every mapping is followed by at
+    least one zero.
+
+    Returns the generated text.
+*/
+static QByteArray createIdnaMapping()
+{
+    qDebug("createIdnaMapping:");
+
+    // Length of the longest mapping, in UTF-16 code units.
+    size_t maxMappingLength = 0;
+
+    for (const auto &entry : idnaMappingTable) {
+        size_t length = 0;
+        for (char32_t c : entry)
+            length += QChar::requiresSurrogates(c) ? 2 : 1;
+        maxMappingLength = qMax(maxMappingLength, length);
+    }
+
+    qDebug() << " max mapping length:" << maxMappingLength;
+
+    // One extra element for the terminating zero.
+    const size_t mappingCapacity = maxMappingLength + 1;
+
+    // Size of one IdnaMapEntry: a char32_t followed by the char16_t array,
+    // rounded up to the alignment of char32_t. Without the rounding the
+    // estimate is too small whenever the array has an odd number of elements.
+    // NOTE(review): computed with the host's sizes and alignment; assumes the
+    // compiler of the generated code lays the struct out the same way.
+    const size_t unpaddedEntrySize = sizeof(char32_t) + sizeof(char16_t) * mappingCapacity;
+    const size_t entrySize =
+            (unpaddedEntrySize + alignof(char32_t) - 1) / alignof(char32_t) * alignof(char32_t);
+
+    QByteArray out =
+        "struct IdnaMapEntry {\n"
+        " char32_t codePoint;\n"
+        " char16_t mapping[" + QByteArray::number(mappingCapacity) + "];\n"
+        "};\n\n"
+        "static const IdnaMapEntry idnaMap[] = {\n";
+
+    for (auto i = idnaMappingTable.keyValueBegin(); i != idnaMappingTable.keyValueEnd(); ++i) {
+        out += " { 0x" + QByteArray::number(i->first, 16) + ", {";
+        size_t n = 0;
+        for (char32_t c : i->second) {
+            for (auto qc : QChar::fromUcs4(c)) {
+                out += "0x" + QByteArray::number(qc, 16) + ", ";
+                ++n;
+            }
+        }
+        // Pad with zeros up to the common length, then add the terminator.
+        for (; n < maxMappingLength; ++n)
+            out += "0, ";
+        out += "0 }},\n";
+    }
+
+    const qsizetype memoryUsage = idnaMappingTable.size() * static_cast<qsizetype>(entrySize);
+    qDebug() << " memory usage:" << memoryUsage << "bytes";
+
+    out +=
+        "};\n\n"
+        "Q_CORE_EXPORT const char16_t * QT_FASTCALL idnaMapping(char32_t ucs4) noexcept\n"
+        "{\n"
+        " auto i = std::lower_bound(std::begin(idnaMap), std::end(idnaMap), ucs4,\n"
+        " [](const auto &p, char32_t c) { return p.codePoint < c; });\n"
+        " if (i != std::end(idnaMap) && i->codePoint == ucs4)\n"
+        " return i->mapping;\n"
+        " return nullptr;\n"
+        "}\n\n";
+
+    return out;
+}
+
#if 0
static void dump(int from, int to)
{
@@ -2532,9 +2786,12 @@ static QByteArray createPropertyInfo()
out += ", ";
out += QByteArray::number( p.lineBreakClass );
out += ", ";
-// " ushort sentenceBreakClass : 8; /* 4 used */\n"
+// " ushort sentenceBreakClass : 4; /* 4 used */\n"
out += QByteArray::number( p.sentenceBreakClass );
out += ", ";
+// " ushort idnaStatus : 4; /* 3 used */\n"
+ out += QByteArray::number( static_cast<unsigned int>(p.idnaStatus) );
+ out += ", ";
// " ushort script : 8;\n"
out += QByteArray::number( p.script );
out += " },";
@@ -2595,6 +2852,11 @@ static QByteArray createPropertyInfo()
"{\n"
" return static_cast<LineBreakClass>(qGetProp(ucs4)->lineBreakClass);\n"
"}\n"
+ "\n"
+ "Q_CORE_EXPORT IdnaStatus QT_FASTCALL idnaStatus(char32_t ucs4) noexcept\n"
+ "{\n"
+ " return static_cast<IdnaStatus>(qGetProp(ucs4)->idnaStatus);\n"
+ "}\n"
"\n";
return out;
@@ -3059,6 +3321,7 @@ int main(int, char **)
initSentenceBreak();
initLineBreak();
initScriptMap();
+ initIdnaStatusMap();
readUnicodeData();
readBidiMirroring();
@@ -3074,6 +3337,9 @@ int main(int, char **)
readWordBreak();
readSentenceBreak();
readLineBreak();
+ readIdnaMappingTable();
+
+ resolveIdnaStatus();
computeUniqueProperties();
QByteArray properties = createPropertyInfo();
@@ -3081,6 +3347,7 @@ int main(int, char **)
QByteArray compositions = createCompositionInfo();
QByteArray ligatures = createLigatureInfo();
QByteArray normalizationCorrections = createNormalizationCorrections();
+ QByteArray idnaMapping = createIdnaMapping();
QByteArray header =
"/****************************************************************************\n"
@@ -3150,6 +3417,7 @@ int main(int, char **)
f.write(ligatures);
f.write("\n");
f.write(normalizationCorrections);
+ f.write(idnaMapping);
f.write("} // namespace QUnicodeTables\n\n");
f.write("using namespace QUnicodeTables;\n\n");
f.write("QT_END_NAMESPACE\n");
@@ -3173,6 +3441,7 @@ int main(int, char **)
f.write(word_break_class_string);
f.write(sentence_break_class_string);
f.write(line_break_class_string);
+ f.write(idna_status_string);
f.write(methods);
f.write("} // namespace QUnicodeTables\n\n"
"QT_END_NAMESPACE\n\n"