diff options
author | Lars Knoll <lars.knoll@qt.io> | 2017-12-12 10:14:28 +0100 |
---|---|---|
committer | Lars Knoll <lars.knoll@qt.io> | 2018-01-03 07:47:26 +0000 |
commit | 41b4e154d617a820cd7f7f732838647425a58227 (patch) | |
tree | 27e9300e3fc275bf4e50de8fb2c5e1f8aeb40fab /util/unicode/main.cpp | |
parent | 8bfabb34dec8a437a08b5a6e0ecac4a9dd3ae18c (diff) |
Update text segmentation and line break data to Unicode 10.0
Also adjust the text segmentation and line break algorithms
so that they can handle the new data and pass the test suite.
Change-Id: Ib727fd80003e34e96458d7a681996de3fa3691e7
Reviewed-by: Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@qt.io>
Diffstat (limited to 'util/unicode/main.cpp')
-rw-r--r-- | util/unicode/main.cpp | 105 |
1 files changed, 73 insertions, 32 deletions
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp index c51995499d..0f3c28137d 100644 --- a/util/unicode/main.cpp +++ b/util/unicode/main.cpp @@ -274,11 +274,12 @@ static void initJoiningMap() static const char *grapheme_break_class_string = "enum GraphemeBreakClass {\n" - " GraphemeBreak_Other,\n" + " GraphemeBreak_Any,\n" " GraphemeBreak_CR,\n" " GraphemeBreak_LF,\n" " GraphemeBreak_Control,\n" " GraphemeBreak_Extend,\n" + " GraphemeBreak_ZWJ,\n" " GraphemeBreak_RegionalIndicator,\n" " GraphemeBreak_Prepend,\n" " GraphemeBreak_SpacingMark,\n" @@ -286,15 +287,21 @@ static const char *grapheme_break_class_string = " GraphemeBreak_V,\n" " GraphemeBreak_T,\n" " GraphemeBreak_LV,\n" - " GraphemeBreak_LVT\n" + " GraphemeBreak_LVT,\n" + " Graphemebreak_E_Base,\n" + " Graphemebreak_E_Modifier,\n" + " Graphemebreak_Glue_After_Zwj,\n" + " Graphemebreak_E_Base_GAZ,\n" + " NumGraphemeBreakClasses,\n" "};\n\n"; enum GraphemeBreakClass { - GraphemeBreak_Other, + GraphemeBreak_Any, GraphemeBreak_CR, GraphemeBreak_LF, GraphemeBreak_Control, GraphemeBreak_Extend, + GraphemeBreak_ZWJ, GraphemeBreak_RegionalIndicator, GraphemeBreak_Prepend, GraphemeBreak_SpacingMark, @@ -302,9 +309,13 @@ enum GraphemeBreakClass { GraphemeBreak_V, GraphemeBreak_T, GraphemeBreak_LV, - GraphemeBreak_LVT + GraphemeBreak_LVT, + Graphemebreak_E_Base, + Graphemebreak_E_Modifier, + Graphemebreak_Glue_After_Zwj, + Graphemebreak_E_Base_GAZ, - , GraphemeBreak_Unassigned + GraphemeBreak_Unassigned }; static QHash<QByteArray, GraphemeBreakClass> grapheme_break_map; @@ -315,11 +326,12 @@ static void initGraphemeBreak() GraphemeBreakClass brk; const char *name; } breaks[] = { - { GraphemeBreak_Other, "Other" }, + { GraphemeBreak_Any, "Any" }, { GraphemeBreak_CR, "CR" }, { GraphemeBreak_LF, "LF" }, { GraphemeBreak_Control, "Control" }, { GraphemeBreak_Extend, "Extend" }, + { GraphemeBreak_ZWJ, "ZWJ" }, { GraphemeBreak_RegionalIndicator, "Regional_Indicator" }, { GraphemeBreak_Prepend, "Prepend" }, { 
GraphemeBreak_SpacingMark, "SpacingMark" }, @@ -328,6 +340,10 @@ static void initGraphemeBreak() { GraphemeBreak_T, "T" }, { GraphemeBreak_LV, "LV" }, { GraphemeBreak_LVT, "LVT" }, + { Graphemebreak_E_Base, "E_Base" }, + { Graphemebreak_E_Modifier, "E_Modifier" }, + { Graphemebreak_Glue_After_Zwj, "Glue_After_Zwj" }, + { Graphemebreak_E_Base_GAZ, "E_Base_GAZ" }, { GraphemeBreak_Unassigned, 0 } }; GraphemeBreakList *d = breaks; @@ -340,11 +356,13 @@ static void initGraphemeBreak() static const char *word_break_class_string = "enum WordBreakClass {\n" - " WordBreak_Other,\n" + " WordBreak_Any,\n" " WordBreak_CR,\n" " WordBreak_LF,\n" " WordBreak_Newline,\n" " WordBreak_Extend,\n" + " WordBreak_ZWJ,\n" + " WordBreak_Format,\n" " WordBreak_RegionalIndicator,\n" " WordBreak_Katakana,\n" " WordBreak_HebrewLetter,\n" @@ -355,15 +373,22 @@ static const char *word_break_class_string = " WordBreak_MidLetter,\n" " WordBreak_MidNum,\n" " WordBreak_Numeric,\n" - " WordBreak_ExtendNumLet\n" + " WordBreak_ExtendNumLet,\n" + " WordBreak_E_Base,\n" + " WordBreak_E_Modifier,\n" + " WordBreak_Glue_After_Zwj,\n" + " WordBreak_E_Base_GAZ,\n" + " NumWordBreakClasses,\n" "};\n\n"; enum WordBreakClass { - WordBreak_Other, + WordBreak_Any, WordBreak_CR, WordBreak_LF, WordBreak_Newline, WordBreak_Extend, + WordBreak_ZWJ, + WordBreak_Format, WordBreak_RegionalIndicator, WordBreak_Katakana, WordBreak_HebrewLetter, @@ -374,9 +399,13 @@ enum WordBreakClass { WordBreak_MidLetter, WordBreak_MidNum, WordBreak_Numeric, - WordBreak_ExtendNumLet + WordBreak_ExtendNumLet, + WordBreak_E_Base, + WordBreak_E_Modifier, + WordBreak_Glue_After_Zwj, + WordBreak_E_Base_GAZ, - , WordBreak_Unassigned + WordBreak_Unassigned }; static QHash<QByteArray, WordBreakClass> word_break_map; @@ -387,12 +416,13 @@ static void initWordBreak() WordBreakClass brk; const char *name; } breaks[] = { - { WordBreak_Other, "Other" }, + { WordBreak_Any, "Any" }, { WordBreak_CR, "CR" }, { WordBreak_LF, "LF" }, { WordBreak_Newline, 
"Newline" }, { WordBreak_Extend, "Extend" }, - { WordBreak_Extend, "Format" }, + { WordBreak_ZWJ, "ZWJ" }, + { WordBreak_Format, "Format" }, { WordBreak_RegionalIndicator, "Regional_Indicator" }, { WordBreak_Katakana, "Katakana" }, { WordBreak_HebrewLetter, "Hebrew_Letter" }, @@ -404,6 +434,10 @@ static void initWordBreak() { WordBreak_MidNum, "MidNum" }, { WordBreak_Numeric, "Numeric" }, { WordBreak_ExtendNumLet, "ExtendNumLet" }, + { WordBreak_E_Base, "E_Base" }, + { WordBreak_E_Modifier, "E_Modifier" }, + { WordBreak_Glue_After_Zwj, "Glue_After_Zwj" }, + { WordBreak_E_Base_GAZ, "E_Base_GAZ" }, { WordBreak_Unassigned, 0 } }; WordBreakList *d = breaks; @@ -416,7 +450,7 @@ static void initWordBreak() static const char *sentence_break_class_string = "enum SentenceBreakClass {\n" - " SentenceBreak_Other,\n" + " SentenceBreak_Any,\n" " SentenceBreak_CR,\n" " SentenceBreak_LF,\n" " SentenceBreak_Sep,\n" @@ -429,11 +463,12 @@ static const char *sentence_break_class_string = " SentenceBreak_ATerm,\n" " SentenceBreak_SContinue,\n" " SentenceBreak_STerm,\n" - " SentenceBreak_Close\n" + " SentenceBreak_Close,\n" + " NumSentenceBreakClasses\n" "};\n\n"; enum SentenceBreakClass { - SentenceBreak_Other, + SentenceBreak_Any, SentenceBreak_CR, SentenceBreak_LF, SentenceBreak_Sep, @@ -459,7 +494,7 @@ static void initSentenceBreak() SentenceBreakClass brk; const char *name; } breaks[] = { - { SentenceBreak_Other, "Other" }, + { SentenceBreak_Any, "Any" }, { SentenceBreak_CR, "CR" }, { SentenceBreak_LF, "LF" }, { SentenceBreak_Sep, "Sep" }, @@ -494,8 +529,10 @@ static const char *line_break_class_string = " LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,\n" " LineBreak_ZW, LineBreak_CM, LineBreak_WJ, LineBreak_H2, LineBreak_H3,\n" " LineBreak_JL, LineBreak_JV, LineBreak_JT, LineBreak_RI, LineBreak_CB,\n" - " LineBreak_SA, LineBreak_SG, LineBreak_SP, LineBreak_CR, LineBreak_LF,\n" - " LineBreak_BK\n" + " LineBreak_EB, LineBreak_EM, LineBreak_ZWJ,\n" + " 
LineBreak_SA, LineBreak_SG, LineBreak_SP,\n" + " LineBreak_CR, LineBreak_LF, LineBreak_BK,\n" + " NumLineBreakClasses\n" "};\n\n"; enum LineBreakClass { @@ -505,10 +542,11 @@ enum LineBreakClass { LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM, LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV, LineBreak_JT, LineBreak_RI, LineBreak_CB, - LineBreak_SA, LineBreak_SG, LineBreak_SP, LineBreak_CR, LineBreak_LF, - LineBreak_BK + LineBreak_EB, LineBreak_EM, LineBreak_ZWJ, + LineBreak_SA, LineBreak_SG, LineBreak_SP, + LineBreak_CR, LineBreak_LF, LineBreak_BK, - , LineBreak_Unassigned + LineBreak_Unassigned }; static QHash<QByteArray, LineBreakClass> line_break_map; @@ -563,6 +601,9 @@ static void initLineBreak() { LineBreak_RI, "RI" }, { LineBreak_SA, "SA" }, { LineBreak_AL, "XX" }, + { LineBreak_EB, "EB" }, + { LineBreak_EM, "EM" }, + { LineBreak_ZWJ, "ZWJ" }, { LineBreak_Unassigned, 0 } }; LineBreakList *d = breaks; @@ -768,10 +809,10 @@ static const char *property_string = " signed short caseFoldDiff : 15;\n" " ushort unicodeVersion : 8; /* 5 used */\n" " ushort nfQuickCheck : 8;\n" // could be narrowed - " ushort graphemeBreakClass : 4; /* 4 used */\n" - " ushort wordBreakClass : 4; /* 4 used */\n" + " ushort graphemeBreakClass : 5; /* 5 used */\n" + " ushort wordBreakClass : 5; /* 5 used */\n" " ushort sentenceBreakClass : 8; /* 4 used */\n" - " ushort lineBreakClass : 8; /* 6 used */\n" + " ushort lineBreakClass : 6; /* 6 used */\n" " ushort script : 8;\n" "};\n\n" "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4) Q_DECL_NOTHROW;\n" @@ -1034,9 +1075,9 @@ struct UnicodeData { p.upperCaseSpecial = 0; p.titleCaseSpecial = 0; p.caseFoldSpecial = 0; - p.graphemeBreakClass = GraphemeBreak_Other; - p.wordBreakClass = WordBreak_Other; - p.sentenceBreakClass = SentenceBreak_Other; + p.graphemeBreakClass = GraphemeBreak_Any; + p.wordBreakClass = WordBreak_Any; + p.sentenceBreakClass = 
SentenceBreak_Any; p.script = QChar::Script_Unknown; p.nfQuickCheck = 0; propertyIndex = -1; @@ -1913,7 +1954,7 @@ static void readWordBreak() if (codepoint == 0x002E) // FULL STOP brk = WordBreak_MidNum; else if (codepoint == 0x003A) // COLON - brk = WordBreak_Other; + brk = WordBreak_Any; // ] ### UnicodeData &ud = UnicodeData::valueRef(codepoint); ud.p.wordBreakClass = brk; @@ -2456,10 +2497,10 @@ static QByteArray createPropertyInfo() // " ushort nfQuickCheck : 8;\n" out += QByteArray::number( p.nfQuickCheck ); out += ", "; -// " ushort graphemeBreakClass : 4; /* 4 used */\n" -// " ushort wordBreakClass : 4; /* 4 used */\n" +// " ushort graphemeBreakClass : 5; /* 5 used */\n" +// " ushort wordBreakClass : 5; /* 5 used */\n" // " ushort sentenceBreakClass : 8; /* 4 used */\n" -// " ushort lineBreakClass : 8; /* 6 used */\n" +// " ushort lineBreakClass : 6; /* 6 used */\n" out += QByteArray::number( p.graphemeBreakClass ); out += ", "; out += QByteArray::number( p.wordBreakClass ); |