diff options
author | Tarja Sundqvist <tarja.sundqvist@qt.io> | 2022-05-16 18:16:04 +0300 |
---|---|---|
committer | Tarja Sundqvist <tarja.sundqvist@qt.io> | 2022-05-16 18:16:04 +0300 |
commit | 231d3670981a33ec42b91ad1cb33c1fc50551066 (patch) | |
tree | b1ef1096f4e279baaa2ea0d2daf14b5c4185a82f /util/unicode/main.cpp | |
parent | ffdd372c7bbda62e9d937f406319f38e3e982774 (diff) | |
parent | 8fc1a885d19a2dfb1a3a684aea1cfa41967e041f (diff) |
Merge remote-tracking branch 'origin/tqtc/lts-5.15.5' into tqtc/lts-5.15-opensource (tag: v5.15.5-lts-lgpl)
Change-Id: I0cdb390124e783dc9cd832a9954baa76a0e9eb6b
Diffstat (limited to 'util/unicode/main.cpp')
-rw-r--r-- | util/unicode/main.cpp | 71 |
1 files changed, 58 insertions, 13 deletions
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index cde9f40a48..f22876c34c 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -291,10 +291,7 @@ static const char *grapheme_break_class_string =
     " GraphemeBreak_T,\n"
     " GraphemeBreak_LV,\n"
     " GraphemeBreak_LVT,\n"
-    " Graphemebreak_E_Base,\n"
-    " Graphemebreak_E_Modifier,\n"
-    " Graphemebreak_Glue_After_Zwj,\n"
-    " Graphemebreak_E_Base_GAZ,\n"
+    " GraphemeBreak_Extended_Pictographic,\n"
     "\n"
     " NumGraphemeBreakClasses\n"
     "};\n\n";
@@ -314,10 +311,7 @@ enum GraphemeBreakClass {
     GraphemeBreak_T,
     GraphemeBreak_LV,
     GraphemeBreak_LVT,
-    Graphemebreak_E_Base,
-    Graphemebreak_E_Modifier,
-    Graphemebreak_Glue_After_Zwj,
-    Graphemebreak_E_Base_GAZ,
+    GraphemeBreak_Extended_Pictographic,
 
     GraphemeBreak_Unassigned
 };
@@ -344,11 +338,8 @@ static void initGraphemeBreak()
         { GraphemeBreak_T, "T" },
         { GraphemeBreak_LV, "LV" },
         { GraphemeBreak_LVT, "LVT" },
-        { Graphemebreak_E_Base, "E_Base" },
-        { Graphemebreak_E_Modifier, "E_Modifier" },
-        { Graphemebreak_Glue_After_Zwj, "Glue_After_Zwj" },
-        { Graphemebreak_E_Base_GAZ, "E_Base_GAZ" },
-        { GraphemeBreak_Unassigned, 0 }
+        { GraphemeBreak_Extended_Pictographic, "Extended_Pictographic" },
+        { GraphemeBreak_Unassigned, nullptr }
     };
     GraphemeBreakList *d = breaks;
     while (d->name) {
@@ -1915,6 +1906,59 @@ static void readGraphemeBreak()
     }
 }
 
+static void readEmojiData()
+{
+    qDebug("Reading emoji-data.txt");
+
+    QFile f("data/emoji-data.txt");
+    if (!f.open(QFile::ReadOnly))
+        qFatal("Couldn't find emoji-data.txt");
+
+    while (!f.atEnd()) {
+        QByteArray line;
+        line.resize(1024);
+        int len = f.readLine(line.data(), 1024);
+        line.resize(len-1);
+
+        int comment = line.indexOf('#');
+        if (comment >= 0)
+            line = line.left(comment);
+        line.replace(" ", "");
+
+        if (line.isEmpty())
+            continue;
+
+        QList<QByteArray> l = line.split(';');
+        Q_ASSERT(l.size() == 2);
+
+        // NOTE: for the moment we process emoji_data only to extract
+        // the code points with Extended_Pictographic. This is needed by
+        // extended grapheme clustering (cf. the GB11 rule in UAX #29).
+        if (l[1] != "Extended_Pictographic")
+            continue;
+
+        QByteArray codes = l[0];
+        codes.replace("..", ".");
+        QList<QByteArray> cl = codes.split('.');
+
+        bool ok;
+        int from = cl[0].toInt(&ok, 16);
+        Q_ASSERT(ok);
+        int to = from;
+        if (cl.size() == 2) {
+            to = cl[1].toInt(&ok, 16);
+            Q_ASSERT(ok);
+        }
+
+        for (int codepoint = from; codepoint <= to; ++codepoint) {
+            UnicodeData &ud = UnicodeData::valueRef(codepoint);
+            // Check we're not overwriting the data from GraphemeBreakProperty.txt...
+            Q_ASSERT(ud.p.graphemeBreakClass == GraphemeBreak_Any);
+            ud.p.graphemeBreakClass = GraphemeBreak_Extended_Pictographic;
+        }
+    }
+}
+
 static void readWordBreak()
 {
     qDebug("Reading WordBreakProperty.txt");
@@ -3037,6 +3081,7 @@ int main(int, char **)
 // readBlocks();
     readScripts();
     readGraphemeBreak();
+    readEmojiData();
     readWordBreak();
     readSentenceBreak();
     readLineBreak();