diff options
Diffstat (limited to 'util/unicode/main.cpp')
-rw-r--r-- | util/unicode/main.cpp | 71 |
1 files changed, 58 insertions, 13 deletions
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp index cde9f40a48..f22876c34c 100644 --- a/util/unicode/main.cpp +++ b/util/unicode/main.cpp @@ -291,10 +291,7 @@ static const char *grapheme_break_class_string = " GraphemeBreak_T,\n" " GraphemeBreak_LV,\n" " GraphemeBreak_LVT,\n" - " Graphemebreak_E_Base,\n" - " Graphemebreak_E_Modifier,\n" - " Graphemebreak_Glue_After_Zwj,\n" - " Graphemebreak_E_Base_GAZ,\n" + " GraphemeBreak_Extended_Pictographic,\n" "\n" " NumGraphemeBreakClasses\n" "};\n\n"; @@ -314,10 +311,7 @@ enum GraphemeBreakClass { GraphemeBreak_T, GraphemeBreak_LV, GraphemeBreak_LVT, - Graphemebreak_E_Base, - Graphemebreak_E_Modifier, - Graphemebreak_Glue_After_Zwj, - Graphemebreak_E_Base_GAZ, + GraphemeBreak_Extended_Pictographic, GraphemeBreak_Unassigned }; @@ -344,11 +338,8 @@ static void initGraphemeBreak() { GraphemeBreak_T, "T" }, { GraphemeBreak_LV, "LV" }, { GraphemeBreak_LVT, "LVT" }, - { Graphemebreak_E_Base, "E_Base" }, - { Graphemebreak_E_Modifier, "E_Modifier" }, - { Graphemebreak_Glue_After_Zwj, "Glue_After_Zwj" }, - { Graphemebreak_E_Base_GAZ, "E_Base_GAZ" }, - { GraphemeBreak_Unassigned, 0 } + { GraphemeBreak_Extended_Pictographic, "Extended_Pictographic" }, + { GraphemeBreak_Unassigned, nullptr } }; GraphemeBreakList *d = breaks; while (d->name) { @@ -1915,6 +1906,59 @@ static void readGraphemeBreak() } } +static void readEmojiData() +{ + qDebug("Reading emoji-data.txt"); + + QFile f("data/emoji-data.txt"); + if (!f.open(QFile::ReadOnly)) + qFatal("Couldn't find emoji-data.txt"); + + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.resize(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + line.replace(" ", ""); + + if (line.isEmpty()) + continue; + + QList<QByteArray> l = line.split(';'); + Q_ASSERT(l.size() == 2); + + // NOTE: for the moment we process emoji_data only to extract + // the code points with Extended_Pictographic. This is needed by + // extended grapheme clustering (cf. the GB11 rule in UAX #29). + if (l[1] != "Extended_Pictographic") + continue; + + QByteArray codes = l[0]; + codes.replace("..", "."); + QList<QByteArray> cl = codes.split('.'); + + bool ok; + int from = cl[0].toInt(&ok, 16); + Q_ASSERT(ok); + int to = from; + if (cl.size() == 2) { + to = cl[1].toInt(&ok, 16); + Q_ASSERT(ok); + } + + for (int codepoint = from; codepoint <= to; ++codepoint) { + UnicodeData &ud = UnicodeData::valueRef(codepoint); + // Check we're not overwriting the data from GraphemeBreakProperty.txt... + Q_ASSERT(ud.p.graphemeBreakClass == GraphemeBreak_Any); + ud.p.graphemeBreakClass = GraphemeBreak_Extended_Pictographic; + } + } +} + static void readWordBreak() { qDebug("Reading WordBreakProperty.txt"); @@ -3037,6 +3081,7 @@ int main(int, char **) // readBlocks(); readScripts(); readGraphemeBreak(); + readEmojiData(); readWordBreak(); readSentenceBreak(); readLineBreak(); |