summaryrefslogtreecommitdiffstats
path: root/util/unicode/main.cpp
diff options
context:
space:
mode:
author Ievgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io> 2022-05-06 13:44:58 +0200
committer Ievgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io> 2022-05-24 23:07:43 +0200
commit 838a7a01f388673c45af2dd60bc38992de1a2a05 (patch)
tree e11fbffd90b986623b025eb4953265d070da2791 /util/unicode/main.cpp
parent 40b4ad1866b4c48fa7a64bc2f07c27125398fdba (diff)
Unicode: Extract EastAsianWidth property
This property is needed to properly implement the line breaking algorithm from UAX #14.

Task-number: QTBUG-97537
Pick-to: 6.3
Change-Id: Ia83cc553c9ef19fae33560721630849d2a95af84
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
Diffstat (limited to 'util/unicode/main.cpp')
-rw-r--r-- util/unicode/main.cpp 110
1 files changed, 108 insertions, 2 deletions
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index 9fdda4a0c3..3a6771fa37 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -59,6 +59,47 @@ static void initAgeMap()
}
}
+static const char *east_asian_width_string = // Source text of the EastAsianWidth enum that main() writes into the generated file inside namespace QUnicodeTables; it repeats, enumerator for enumerator, the tool-side enum declared right after it.
+R"(enum class EastAsianWidth : unsigned int {
+ A,
+ F,
+ H,
+ N,
+ Na,
+ W,
+};
+
+)";
+
+enum class EastAsianWidth : unsigned int { // Tool-side twin of the enum emitted via east_asian_width_string; values are written out as plain integers and cast back by the generated eastAsianWidth(), so both lists must keep the same order.
+ A, // matched against the string "A"; long names here and below come from UAX #11, not from this file -- presumably Ambiguous, verify
+ F, // "F"; presumably Fullwidth
+ H, // "H"; presumably Halfwidth
+ N, // "N"; presumably Neutral. Also the default initializer of PropertyFlags::eastAsianWidth.
+ Na, // "Na"; presumably Narrow
+ W, // "W"; presumably Wide
+};
+
+static QHash<QByteArray, EastAsianWidth> eastAsianWidthMap; // Lookup from a width string ("A", "F", "H", "N", "Na", "W") to its enumerator; filled by initEastAsianWidthMap(), queried by readEastAsianWidth().
+
+static void initEastAsianWidthMap() // Populates eastAsianWidthMap with one entry per EastAsianWidth enumerator; main() calls this before any data file is read.
+{
+ constexpr struct W { // Local helper type pairing an enumerator with the string it is keyed by.
+ EastAsianWidth width;
+ const char *name;
+ } widths[] = {
+ { EastAsianWidth::A, "A" },
+ { EastAsianWidth::F, "F" },
+ { EastAsianWidth::H, "H" },
+ { EastAsianWidth::N, "N" },
+ { EastAsianWidth::Na, "Na" },
+ { EastAsianWidth::W, "W" },
+ };
+
+ for (auto &w : widths)
+ eastAsianWidthMap.insert(w.name, w.width); // key: the name string, value: the enumerator
+}
+
static QHash<QByteArray, QChar::Category> categoryMap;
static void initCategoryMap()
@@ -849,7 +890,8 @@ static const char *property_string =
" ushort joining : 3;\n"
" signed short digitValue : 5;\n"
" signed short mirrorDiff : 16;\n"
- " ushort unicodeVersion : 8; /* 5 used */\n"
+ " ushort unicodeVersion : 5; /* 5 used */\n"
+ " ushort eastAsianWidth : 3; /* 3 used */\n"
" ushort nfQuickCheck : 8;\n" // could be narrowed
"#ifdef Q_OS_WASM\n"
" unsigned char : 0; //wasm 64 packing trick\n"
@@ -896,6 +938,10 @@ static const char *methods =
"Q_CORE_EXPORT QStringView QT_FASTCALL idnaMapping(char32_t usc4) noexcept;\n"
"inline QStringView idnaMapping(QChar ch) noexcept\n"
"{ return idnaMapping(ch.unicode()); }\n"
+ "\n"
+ "Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept;\n"
+ "inline EastAsianWidth eastAsianWidth(QChar ch) noexcept\n"
+ "{ return eastAsianWidth(ch.unicode()); }\n"
"\n";
static const int SizeOfPropertiesStruct = 20;
@@ -918,6 +964,7 @@ struct PropertyFlags {
&& direction == o.direction
&& joining == o.joining
&& age == o.age
+ && eastAsianWidth == o.eastAsianWidth
&& digitValue == o.digitValue
&& mirrorDiff == o.mirrorDiff
&& lowerCaseDiff == o.lowerCaseDiff
@@ -945,6 +992,8 @@ struct PropertyFlags {
QChar::JoiningType joining : 3;
// from DerivedAge.txt
QChar::UnicodeVersion age : 5;
+ // From EastAsianWidth.txt
+ EastAsianWidth eastAsianWidth = EastAsianWidth::N;
int digitValue = -1;
int mirrorDiff : 16;
@@ -1483,6 +1532,52 @@ static void readDerivedAge()
}
}
+static void readEastAsianWidth() // Parses data/EastAsianWidth.txt and records the width of every listed code point in UnicodeData::p.eastAsianWidth.
+{
+ qDebug("Reading EastAsianWidth.txt");
+
+ QFile f("data/EastAsianWidth.txt");
+ if (!f.exists() || !f.open(QFile::ReadOnly))
+ qFatal("Couldn't find or read EastAsianWidth.txt"); // a missing or unreadable data file is treated as fatal
+
+ while (!f.atEnd()) {
+ QByteArray line = f.readLine().trimmed();
+
+ int comment = line.indexOf('#'); // everything from '#' onwards is dropped below
+ line = (comment < 0 ? line : line.left(comment)).simplified();
+
+ if (line.isEmpty()) // blank or comment-only line
+ continue;
+
+ QList<QByteArray> fields = line.split(';'); // expected shape: "<code point or range>;<width>"
+ Q_ASSERT(fields.size() == 2);
+
+ // There is no split("..") API, so ".." is first collapsed to "." and the result split on the single character.
+ const QByteArray codePoints = fields[0].trimmed().replace("..", ".");
+ QList<QByteArray> cl = codePoints.split('.');
+ Q_ASSERT(cl.size() >= 1 && cl.size() <= 2); // one entry: single code point; two entries: first and last of a range
+
+ const QByteArray widthString = fields[1].trimmed();
+ if (!eastAsianWidthMap.contains(widthString)) { // a value with no entry in eastAsianWidthMap aborts the run
+ qFatal("Unhandled EastAsianWidth property value for %s: %s",
+ qPrintable(codePoints), qPrintable(widthString));
+ }
+ auto width = eastAsianWidthMap.value(widthString);
+
+ bool ok;
+ const int first = cl[0].toInt(&ok, 16); // code points are parsed as hexadecimal
+ const int last = ok && cl.size() == 2 ? cl[1].toInt(&ok, 16) : first; // single code point: last == first; if 'first' failed to parse, 'ok' stays false
+ Q_ASSERT(ok);
+
+ for (int codepoint = first; codepoint <= last; ++codepoint) { // the range is inclusive at both ends
+ UnicodeData &ud = UnicodeData::valueRef(codepoint);
+ // Ensure that ranges don't overlap: N is the default of PropertyFlags::eastAsianWidth, so any other value means an earlier line already set this code point.
+ Q_ASSERT(ud.p.eastAsianWidth == EastAsianWidth::N);
+ ud.p.eastAsianWidth = width;
+ }
+ }
+}
+
static void readDerivedNormalizationProps()
{
qDebug("Reading DerivedNormalizationProps.txt");
@@ -2896,9 +2991,12 @@ static QByteArray createPropertyInfo()
// " signed short mirrorDiff : 16;\n"
out += QByteArray::number( p.mirrorDiff );
out += ", ";
-// " ushort unicodeVersion : 8; /* 5 used */\n"
+// " ushort unicodeVersion : 5; /* 5 used */\n"
out += QByteArray::number( p.age );
out += ", ";
+// " ushort eastAsianWidth : 3; /* 3 used */\n"
+ out += QByteArray::number( static_cast<unsigned int>(p.eastAsianWidth) );
+ out += ", ";
// " ushort nfQuickCheck : 8;\n"
out += QByteArray::number( p.nfQuickCheck );
out += ", ";
@@ -3003,6 +3101,11 @@ static QByteArray createPropertyInfo()
"{\n"
" return static_cast<IdnaStatus>(qGetProp(ucs4)->idnaStatus);\n"
"}\n"
+ "\n"
+ "Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept\n"
+ "{\n"
+ " return static_cast<EastAsianWidth>(qGetProp(ucs4)->eastAsianWidth);\n"
+ "}\n"
"\n";
return out;
@@ -3458,6 +3561,7 @@ QByteArray createCasingInfo()
int main(int, char **)
{
initAgeMap();
+ initEastAsianWidthMap();
initCategoryMap();
initDecompositionMap();
initDirectionMap();
@@ -3473,6 +3577,7 @@ int main(int, char **)
readBidiMirroring();
readArabicShaping();
readDerivedAge();
+ readEastAsianWidth();
readDerivedNormalizationProps();
readSpecialCasing();
readCaseFolding();
@@ -3548,6 +3653,7 @@ int main(int, char **)
f.write("namespace QUnicodeTables {\n\n");
f.write(property_string);
f.write(sizeOfPropertiesStructCheck);
+ f.write(east_asian_width_string);
f.write(grapheme_break_class_string);
f.write(word_break_class_string);
f.write(sentence_break_class_string);