diff options
field | value | date |
---|---|---|
author | Ievgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io> | 2022-05-06 13:44:58 +0200 |
committer | Ievgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io> | 2022-05-24 23:07:43 +0200 |
commit | 838a7a01f388673c45af2dd60bc38992de1a2a05 (patch) | |
tree | e11fbffd90b986623b025eb4953265d070da2791 /util/unicode/main.cpp | |
parent | 40b4ad1866b4c48fa7a64bc2f07c27125398fdba (diff) | |
Unicode: Extract EastAsianWidth property
This property (East_Asian_Width, defined in UAX #11) is needed to properly
implement the line breaking algorithm from UAX #14.
Task-number: QTBUG-97537
Pick-to: 6.3
Change-Id: Ia83cc553c9ef19fae33560721630849d2a95af84
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
Diffstat (limited to 'util/unicode/main.cpp')
-rw-r--r-- | util/unicode/main.cpp | 110 |
1 file changed, 108 insertions, 2 deletions
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp index 9fdda4a0c3..3a6771fa37 100644 --- a/util/unicode/main.cpp +++ b/util/unicode/main.cpp @@ -59,6 +59,47 @@ static void initAgeMap() } } +static const char *east_asian_width_string = +R"(enum class EastAsianWidth : unsigned int { + A, + F, + H, + N, + Na, + W, +}; + +)"; + +enum class EastAsianWidth : unsigned int { + A, + F, + H, + N, + Na, + W, +}; + +static QHash<QByteArray, EastAsianWidth> eastAsianWidthMap; + +static void initEastAsianWidthMap() +{ + constexpr struct W { + EastAsianWidth width; + const char *name; + } widths[] = { + { EastAsianWidth::A, "A" }, + { EastAsianWidth::F, "F" }, + { EastAsianWidth::H, "H" }, + { EastAsianWidth::N, "N" }, + { EastAsianWidth::Na, "Na" }, + { EastAsianWidth::W, "W" }, + }; + + for (auto &w : widths) + eastAsianWidthMap.insert(w.name, w.width); +} + static QHash<QByteArray, QChar::Category> categoryMap; static void initCategoryMap() @@ -849,7 +890,8 @@ static const char *property_string = " ushort joining : 3;\n" " signed short digitValue : 5;\n" " signed short mirrorDiff : 16;\n" - " ushort unicodeVersion : 8; /* 5 used */\n" + " ushort unicodeVersion : 5; /* 5 used */\n" + " ushort eastAsianWidth : 3; /* 3 used */\n" " ushort nfQuickCheck : 8;\n" // could be narrowed "#ifdef Q_OS_WASM\n" " unsigned char : 0; //wasm 64 packing trick\n" @@ -896,6 +938,10 @@ static const char *methods = "Q_CORE_EXPORT QStringView QT_FASTCALL idnaMapping(char32_t usc4) noexcept;\n" "inline QStringView idnaMapping(QChar ch) noexcept\n" "{ return idnaMapping(ch.unicode()); }\n" + "\n" + "Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept;\n" + "inline EastAsianWidth eastAsianWidth(QChar ch) noexcept\n" + "{ return eastAsianWidth(ch.unicode()); }\n" "\n"; static const int SizeOfPropertiesStruct = 20; @@ -918,6 +964,7 @@ struct PropertyFlags { && direction == o.direction && joining == o.joining && age == o.age + && eastAsianWidth == o.eastAsianWidth && 
digitValue == o.digitValue && mirrorDiff == o.mirrorDiff && lowerCaseDiff == o.lowerCaseDiff @@ -945,6 +992,8 @@ struct PropertyFlags { QChar::JoiningType joining : 3; // from DerivedAge.txt QChar::UnicodeVersion age : 5; + // From EastAsianWidth.txt + EastAsianWidth eastAsianWidth = EastAsianWidth::N; int digitValue = -1; int mirrorDiff : 16; @@ -1483,6 +1532,52 @@ static void readDerivedAge() } } +static void readEastAsianWidth() +{ + qDebug("Reading EastAsianWidth.txt"); + + QFile f("data/EastAsianWidth.txt"); + if (!f.exists() || !f.open(QFile::ReadOnly)) + qFatal("Couldn't find or read EastAsianWidth.txt"); + + while (!f.atEnd()) { + QByteArray line = f.readLine().trimmed(); + + int comment = line.indexOf('#'); + line = (comment < 0 ? line : line.left(comment)).simplified(); + + if (line.isEmpty()) + continue; + + QList<QByteArray> fields = line.split(';'); + Q_ASSERT(fields.size() == 2); + + // That would be split(".."), but that API does not exist. + const QByteArray codePoints = fields[0].trimmed().replace("..", "."); + QList<QByteArray> cl = codePoints.split('.'); + Q_ASSERT(cl.size() >= 1 && cl.size() <= 2); + + const QByteArray widthString = fields[1].trimmed(); + if (!eastAsianWidthMap.contains(widthString)) { + qFatal("Unhandled EastAsianWidth property value for %s: %s", + qPrintable(codePoints), qPrintable(widthString)); + } + auto width = eastAsianWidthMap.value(widthString); + + bool ok; + const int first = cl[0].toInt(&ok, 16); + const int last = ok && cl.size() == 2 ? cl[1].toInt(&ok, 16) : first; + Q_ASSERT(ok); + + for (int codepoint = first; codepoint <= last; ++codepoint) { + UnicodeData &ud = UnicodeData::valueRef(codepoint); + // Ensure that ranges don't overlap. 
+ Q_ASSERT(ud.p.eastAsianWidth == EastAsianWidth::N); + ud.p.eastAsianWidth = width; + } + } +} + static void readDerivedNormalizationProps() { qDebug("Reading DerivedNormalizationProps.txt"); @@ -2896,9 +2991,12 @@ static QByteArray createPropertyInfo() // " signed short mirrorDiff : 16;\n" out += QByteArray::number( p.mirrorDiff ); out += ", "; -// " ushort unicodeVersion : 8; /* 5 used */\n" +// " ushort unicodeVersion : 5; /* 5 used */\n" out += QByteArray::number( p.age ); out += ", "; +// " ushort eastAsianWidth : 3;" /* 3 used */\n" + out += QByteArray::number( static_cast<unsigned int>(p.eastAsianWidth) ); + out += ", "; // " ushort nfQuickCheck : 8;\n" out += QByteArray::number( p.nfQuickCheck ); out += ", "; @@ -3003,6 +3101,11 @@ static QByteArray createPropertyInfo() "{\n" " return static_cast<IdnaStatus>(qGetProp(ucs4)->idnaStatus);\n" "}\n" + "\n" + "Q_CORE_EXPORT EastAsianWidth QT_FASTCALL eastAsianWidth(char32_t ucs4) noexcept\n" + "{\n" + " return static_cast<EastAsianWidth>(qGetProp(ucs4)->eastAsianWidth);\n" + "}\n" "\n"; return out; @@ -3458,6 +3561,7 @@ QByteArray createCasingInfo() int main(int, char **) { initAgeMap(); + initEastAsianWidthMap(); initCategoryMap(); initDecompositionMap(); initDirectionMap(); @@ -3473,6 +3577,7 @@ int main(int, char **) readBidiMirroring(); readArabicShaping(); readDerivedAge(); + readEastAsianWidth(); readDerivedNormalizationProps(); readSpecialCasing(); readCaseFolding(); @@ -3548,6 +3653,7 @@ int main(int, char **) f.write("namespace QUnicodeTables {\n\n"); f.write(property_string); f.write(sizeOfPropertiesStructCheck); + f.write(east_asian_width_string); f.write(grapheme_break_class_string); f.write(word_break_class_string); f.write(sentence_break_class_string); |