summary | refs | log | tree | commit | diff | stats
path: root/util/unicode/main.cpp
diff options
context:
space:
mode:
author: Lars Knoll <lars.knoll@qt.io>, 2017-12-12 10:14:28 +0100
committer: Lars Knoll <lars.knoll@qt.io>, 2018-01-03 07:47:26 +0000
commit: 41b4e154d617a820cd7f7f732838647425a58227 (patch)
tree: 27e9300e3fc275bf4e50de8fb2c5e1f8aeb40fab /util/unicode/main.cpp
parent: 8bfabb34dec8a437a08b5a6e0ecac4a9dd3ae18c (diff)
Update Text segmentation and line break data to Unicode 10.0
Also adjusted the text segmentation and line break algorithms so that they can handle the new data and pass the test suite.

Change-Id: Ib727fd80003e34e96458d7a681996de3fa3691e7
Reviewed-by: Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@qt.io>
Diffstat (limited to 'util/unicode/main.cpp')
-rw-r--r-- util/unicode/main.cpp | 105
1 file changed, 73 insertions, 32 deletions
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index c51995499d..0f3c28137d 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -274,11 +274,12 @@ static void initJoiningMap()
static const char *grapheme_break_class_string =
"enum GraphemeBreakClass {\n"
- " GraphemeBreak_Other,\n"
+ " GraphemeBreak_Any,\n"
" GraphemeBreak_CR,\n"
" GraphemeBreak_LF,\n"
" GraphemeBreak_Control,\n"
" GraphemeBreak_Extend,\n"
+ " GraphemeBreak_ZWJ,\n"
" GraphemeBreak_RegionalIndicator,\n"
" GraphemeBreak_Prepend,\n"
" GraphemeBreak_SpacingMark,\n"
@@ -286,15 +287,21 @@ static const char *grapheme_break_class_string =
" GraphemeBreak_V,\n"
" GraphemeBreak_T,\n"
" GraphemeBreak_LV,\n"
- " GraphemeBreak_LVT\n"
+ " GraphemeBreak_LVT,\n"
+ " Graphemebreak_E_Base,\n"
+ " Graphemebreak_E_Modifier,\n"
+ " Graphemebreak_Glue_After_Zwj,\n"
+ " Graphemebreak_E_Base_GAZ,\n"
+ " NumGraphemeBreakClasses,\n"
"};\n\n";
enum GraphemeBreakClass {
- GraphemeBreak_Other,
+ GraphemeBreak_Any,
GraphemeBreak_CR,
GraphemeBreak_LF,
GraphemeBreak_Control,
GraphemeBreak_Extend,
+ GraphemeBreak_ZWJ,
GraphemeBreak_RegionalIndicator,
GraphemeBreak_Prepend,
GraphemeBreak_SpacingMark,
@@ -302,9 +309,13 @@ enum GraphemeBreakClass {
GraphemeBreak_V,
GraphemeBreak_T,
GraphemeBreak_LV,
- GraphemeBreak_LVT
+ GraphemeBreak_LVT,
+ Graphemebreak_E_Base,
+ Graphemebreak_E_Modifier,
+ Graphemebreak_Glue_After_Zwj,
+ Graphemebreak_E_Base_GAZ,
- , GraphemeBreak_Unassigned
+ GraphemeBreak_Unassigned
};
static QHash<QByteArray, GraphemeBreakClass> grapheme_break_map;
@@ -315,11 +326,12 @@ static void initGraphemeBreak()
GraphemeBreakClass brk;
const char *name;
} breaks[] = {
- { GraphemeBreak_Other, "Other" },
+ { GraphemeBreak_Any, "Any" },
{ GraphemeBreak_CR, "CR" },
{ GraphemeBreak_LF, "LF" },
{ GraphemeBreak_Control, "Control" },
{ GraphemeBreak_Extend, "Extend" },
+ { GraphemeBreak_ZWJ, "ZWJ" },
{ GraphemeBreak_RegionalIndicator, "Regional_Indicator" },
{ GraphemeBreak_Prepend, "Prepend" },
{ GraphemeBreak_SpacingMark, "SpacingMark" },
@@ -328,6 +340,10 @@ static void initGraphemeBreak()
{ GraphemeBreak_T, "T" },
{ GraphemeBreak_LV, "LV" },
{ GraphemeBreak_LVT, "LVT" },
+ { Graphemebreak_E_Base, "E_Base" },
+ { Graphemebreak_E_Modifier, "E_Modifier" },
+ { Graphemebreak_Glue_After_Zwj, "Glue_After_Zwj" },
+ { Graphemebreak_E_Base_GAZ, "E_Base_GAZ" },
{ GraphemeBreak_Unassigned, 0 }
};
GraphemeBreakList *d = breaks;
@@ -340,11 +356,13 @@ static void initGraphemeBreak()
static const char *word_break_class_string =
"enum WordBreakClass {\n"
- " WordBreak_Other,\n"
+ " WordBreak_Any,\n"
" WordBreak_CR,\n"
" WordBreak_LF,\n"
" WordBreak_Newline,\n"
" WordBreak_Extend,\n"
+ " WordBreak_ZWJ,\n"
+ " WordBreak_Format,\n"
" WordBreak_RegionalIndicator,\n"
" WordBreak_Katakana,\n"
" WordBreak_HebrewLetter,\n"
@@ -355,15 +373,22 @@ static const char *word_break_class_string =
" WordBreak_MidLetter,\n"
" WordBreak_MidNum,\n"
" WordBreak_Numeric,\n"
- " WordBreak_ExtendNumLet\n"
+ " WordBreak_ExtendNumLet,\n"
+ " WordBreak_E_Base,\n"
+ " WordBreak_E_Modifier,\n"
+ " WordBreak_Glue_After_Zwj,\n"
+ " WordBreak_E_Base_GAZ,\n"
+ " NumWordBreakClasses,\n"
"};\n\n";
enum WordBreakClass {
- WordBreak_Other,
+ WordBreak_Any,
WordBreak_CR,
WordBreak_LF,
WordBreak_Newline,
WordBreak_Extend,
+ WordBreak_ZWJ,
+ WordBreak_Format,
WordBreak_RegionalIndicator,
WordBreak_Katakana,
WordBreak_HebrewLetter,
@@ -374,9 +399,13 @@ enum WordBreakClass {
WordBreak_MidLetter,
WordBreak_MidNum,
WordBreak_Numeric,
- WordBreak_ExtendNumLet
+ WordBreak_ExtendNumLet,
+ WordBreak_E_Base,
+ WordBreak_E_Modifier,
+ WordBreak_Glue_After_Zwj,
+ WordBreak_E_Base_GAZ,
- , WordBreak_Unassigned
+ WordBreak_Unassigned
};
static QHash<QByteArray, WordBreakClass> word_break_map;
@@ -387,12 +416,13 @@ static void initWordBreak()
WordBreakClass brk;
const char *name;
} breaks[] = {
- { WordBreak_Other, "Other" },
+ { WordBreak_Any, "Any" },
{ WordBreak_CR, "CR" },
{ WordBreak_LF, "LF" },
{ WordBreak_Newline, "Newline" },
{ WordBreak_Extend, "Extend" },
- { WordBreak_Extend, "Format" },
+ { WordBreak_ZWJ, "ZWJ" },
+ { WordBreak_Format, "Format" },
{ WordBreak_RegionalIndicator, "Regional_Indicator" },
{ WordBreak_Katakana, "Katakana" },
{ WordBreak_HebrewLetter, "Hebrew_Letter" },
@@ -404,6 +434,10 @@ static void initWordBreak()
{ WordBreak_MidNum, "MidNum" },
{ WordBreak_Numeric, "Numeric" },
{ WordBreak_ExtendNumLet, "ExtendNumLet" },
+ { WordBreak_E_Base, "E_Base" },
+ { WordBreak_E_Modifier, "E_Modifier" },
+ { WordBreak_Glue_After_Zwj, "Glue_After_Zwj" },
+ { WordBreak_E_Base_GAZ, "E_Base_GAZ" },
{ WordBreak_Unassigned, 0 }
};
WordBreakList *d = breaks;
@@ -416,7 +450,7 @@ static void initWordBreak()
static const char *sentence_break_class_string =
"enum SentenceBreakClass {\n"
- " SentenceBreak_Other,\n"
+ " SentenceBreak_Any,\n"
" SentenceBreak_CR,\n"
" SentenceBreak_LF,\n"
" SentenceBreak_Sep,\n"
@@ -429,11 +463,12 @@ static const char *sentence_break_class_string =
" SentenceBreak_ATerm,\n"
" SentenceBreak_SContinue,\n"
" SentenceBreak_STerm,\n"
- " SentenceBreak_Close\n"
+ " SentenceBreak_Close,\n"
+ " NumSentenceBreakClasses\n"
"};\n\n";
enum SentenceBreakClass {
- SentenceBreak_Other,
+ SentenceBreak_Any,
SentenceBreak_CR,
SentenceBreak_LF,
SentenceBreak_Sep,
@@ -459,7 +494,7 @@ static void initSentenceBreak()
SentenceBreakClass brk;
const char *name;
} breaks[] = {
- { SentenceBreak_Other, "Other" },
+ { SentenceBreak_Any, "Any" },
{ SentenceBreak_CR, "CR" },
{ SentenceBreak_LF, "LF" },
{ SentenceBreak_Sep, "Sep" },
@@ -494,8 +529,10 @@ static const char *line_break_class_string =
" LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,\n"
" LineBreak_ZW, LineBreak_CM, LineBreak_WJ, LineBreak_H2, LineBreak_H3,\n"
" LineBreak_JL, LineBreak_JV, LineBreak_JT, LineBreak_RI, LineBreak_CB,\n"
- " LineBreak_SA, LineBreak_SG, LineBreak_SP, LineBreak_CR, LineBreak_LF,\n"
- " LineBreak_BK\n"
+ " LineBreak_EB, LineBreak_EM, LineBreak_ZWJ,\n"
+ " LineBreak_SA, LineBreak_SG, LineBreak_SP,\n"
+ " LineBreak_CR, LineBreak_LF, LineBreak_BK,\n"
+ " NumLineBreakClasses\n"
"};\n\n";
enum LineBreakClass {
@@ -505,10 +542,11 @@ enum LineBreakClass {
LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,
LineBreak_ZW, LineBreak_CM, LineBreak_WJ, LineBreak_H2, LineBreak_H3,
LineBreak_JL, LineBreak_JV, LineBreak_JT, LineBreak_RI, LineBreak_CB,
- LineBreak_SA, LineBreak_SG, LineBreak_SP, LineBreak_CR, LineBreak_LF,
- LineBreak_BK
+ LineBreak_EB, LineBreak_EM, LineBreak_ZWJ,
+ LineBreak_SA, LineBreak_SG, LineBreak_SP,
+ LineBreak_CR, LineBreak_LF, LineBreak_BK,
- , LineBreak_Unassigned
+ LineBreak_Unassigned
};
static QHash<QByteArray, LineBreakClass> line_break_map;
@@ -563,6 +601,9 @@ static void initLineBreak()
{ LineBreak_RI, "RI" },
{ LineBreak_SA, "SA" },
{ LineBreak_AL, "XX" },
+ { LineBreak_EB, "EB" },
+ { LineBreak_EM, "EM" },
+ { LineBreak_ZWJ, "ZWJ" },
{ LineBreak_Unassigned, 0 }
};
LineBreakList *d = breaks;
@@ -768,10 +809,10 @@ static const char *property_string =
" signed short caseFoldDiff : 15;\n"
" ushort unicodeVersion : 8; /* 5 used */\n"
" ushort nfQuickCheck : 8;\n" // could be narrowed
- " ushort graphemeBreakClass : 4; /* 4 used */\n"
- " ushort wordBreakClass : 4; /* 4 used */\n"
+ " ushort graphemeBreakClass : 5; /* 5 used */\n"
+ " ushort wordBreakClass : 5; /* 5 used */\n"
" ushort sentenceBreakClass : 8; /* 4 used */\n"
- " ushort lineBreakClass : 8; /* 6 used */\n"
+ " ushort lineBreakClass : 6; /* 6 used */\n"
" ushort script : 8;\n"
"};\n\n"
"Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4) Q_DECL_NOTHROW;\n"
@@ -1034,9 +1075,9 @@ struct UnicodeData {
p.upperCaseSpecial = 0;
p.titleCaseSpecial = 0;
p.caseFoldSpecial = 0;
- p.graphemeBreakClass = GraphemeBreak_Other;
- p.wordBreakClass = WordBreak_Other;
- p.sentenceBreakClass = SentenceBreak_Other;
+ p.graphemeBreakClass = GraphemeBreak_Any;
+ p.wordBreakClass = WordBreak_Any;
+ p.sentenceBreakClass = SentenceBreak_Any;
p.script = QChar::Script_Unknown;
p.nfQuickCheck = 0;
propertyIndex = -1;
@@ -1913,7 +1954,7 @@ static void readWordBreak()
if (codepoint == 0x002E) // FULL STOP
brk = WordBreak_MidNum;
else if (codepoint == 0x003A) // COLON
- brk = WordBreak_Other;
+ brk = WordBreak_Any;
// ] ###
UnicodeData &ud = UnicodeData::valueRef(codepoint);
ud.p.wordBreakClass = brk;
@@ -2456,10 +2497,10 @@ static QByteArray createPropertyInfo()
// " ushort nfQuickCheck : 8;\n"
out += QByteArray::number( p.nfQuickCheck );
out += ", ";
-// " ushort graphemeBreakClass : 4; /* 4 used */\n"
-// " ushort wordBreakClass : 4; /* 4 used */\n"
+// " ushort graphemeBreakClass : 5; /* 5 used */\n"
+// " ushort wordBreakClass : 5; /* 5 used */\n"
// " ushort sentenceBreakClass : 8; /* 4 used */\n"
-// " ushort lineBreakClass : 8; /* 6 used */\n"
+// " ushort lineBreakClass : 6; /* 6 used */\n"
out += QByteArray::number( p.graphemeBreakClass );
out += ", ";
out += QByteArray::number( p.wordBreakClass );