From edfce46a6c0406af749ca7ef659df6315e36cd5d Mon Sep 17 00:00:00 2001
From: Konstantin Ritt <ritt.ks@gmail.com>
Date: Sun, 12 Jan 2014 21:14:25 +0200
Subject: Update the Unicode Data and Algorithms up to Unicode 6.3.0

* Mongolian and Phags-pa characters have been given a Joining_Type
  classification for contextual shaping. As a part of these additions,
  one Phags-pa character has the Joining_Type value L (Left_Joining),
  a value that had not been assigned to any character before.
* The unassigned code points in the Currency Symbols block have been
  given the Bidi_Class property value ET and the Line_Break property
  value PR, to help implementations support new currency symbols
  once they are encoded.
* Hebrew letters and basic punctuation marks have been assigned
  the newly introduced Word_Break property values Hebrew_Letter,
  Single_Quote, and Double_Quote.
* The Bidi_Class property has been extended with four new values
  for directional isolates.
For more details, see http://www.unicode.org/versions/Unicode6.3.0/

Change-Id: Iad62d02edc58a8497898dcd6d6c70d5aece317ea
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
---
 util/unicode/main.cpp | 136 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 111 insertions(+), 25 deletions(-)

(limited to 'util/unicode')

diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index a4d3e0f377..59b95ad924 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -77,6 +77,7 @@ static void initAgeMap()
         { QChar::Unicode_6_0,   "6.0" },
         { QChar::Unicode_6_1,   "6.1" },
         { QChar::Unicode_6_2,   "6.2" },
+        { QChar::Unicode_6_3,   "6.3" },
         { QChar::Unicode_Unassigned, 0 }
     };
     AgeMap *d = ageMap;
@@ -176,34 +177,66 @@ static void initDecompositionMap()
 }
 
 
-static QHash<QByteArray, QChar::Direction> directionMap;
+enum Direction {
+    DirL = QChar::DirL,
+    DirR = QChar::DirR,
+    DirEN = QChar::DirEN,
+    DirES = QChar::DirES,
+    DirET = QChar::DirET,
+    DirAN = QChar::DirAN,
+    DirCS = QChar::DirCS,
+    DirB = QChar::DirB,
+    DirS = QChar::DirS,
+    DirWS = QChar::DirWS,
+    DirON = QChar::DirON,
+    DirLRE = QChar::DirLRE,
+    DirLRO = QChar::DirLRO,
+    DirAL = QChar::DirAL,
+    DirRLE = QChar::DirRLE,
+    DirRLO = QChar::DirRLO,
+    DirPDF = QChar::DirPDF,
+    DirNSM = QChar::DirNSM,
+    DirBN = QChar::DirBN,
+    DirLRI = QChar::DirLRI, // LRI, RLI, FSI, PDI: the four directional-isolate values new in Unicode 6.3
+    DirRLI = QChar::DirRLI,
+    DirFSI = QChar::DirFSI,
+    DirPDI = QChar::DirPDI
+    // Tool-local marker, not taken from QChar: used for the terminating entry of the name table and as the lookup default for unrecognized names.
+    , Dir_Unassigned
+};
+
+static QHash<QByteArray, Direction> directionMap;
 
 static void initDirectionMap()
 {
     struct Dir {
-        QChar::Direction dir;
+        Direction dir;
         const char *name;
     } directions[] = {
-        { QChar::DirL, "L" },
-        { QChar::DirR, "R" },
-        { QChar::DirEN, "EN" },
-        { QChar::DirES, "ES" },
-        { QChar::DirET, "ET" },
-        { QChar::DirAN, "AN" },
-        { QChar::DirCS, "CS" },
-        { QChar::DirB, "B" },
-        { QChar::DirS, "S" },
-        { QChar::DirWS, "WS" },
-        { QChar::DirON, "ON" },
-        { QChar::DirLRE, "LRE" },
-        { QChar::DirLRO, "LRO" },
-        { QChar::DirAL, "AL" },
-        { QChar::DirRLE, "RLE" },
-        { QChar::DirRLO, "RLO" },
-        { QChar::DirPDF, "PDF" },
-        { QChar::DirNSM, "NSM" },
-        { QChar::DirBN, "BN" },
-        { QChar::DirL, 0 }
+        { DirL, "L" },
+        { DirR, "R" },
+        { DirEN, "EN" },
+        { DirES, "ES" },
+        { DirET, "ET" },
+        { DirAN, "AN" },
+        { DirCS, "CS" },
+        { DirB, "B" },
+        { DirS, "S" },
+        { DirWS, "WS" },
+        { DirON, "ON" },
+        { DirLRE, "LRE" },
+        { DirLRO, "LRO" },
+        { DirAL, "AL" },
+        { DirRLE, "RLE" },
+        { DirRLO, "RLO" },
+        { DirPDF, "PDF" },
+        { DirNSM, "NSM" },
+        { DirBN, "BN" },
+        { DirLRI, "LRI" },
+        { DirRLI, "RLI" },
+        { DirFSI, "FSI" },
+        { DirPDI, "PDI" },
+        { Dir_Unassigned, 0 }
     };
     Dir *d = directions;
     while (d->name) {
@@ -323,7 +356,10 @@ static const char *word_break_class_string =
     "    WordBreak_Extend,\n"
     "    WordBreak_RegionalIndicator,\n"
     "    WordBreak_Katakana,\n"
+    "    WordBreak_HebrewLetter,\n"
     "    WordBreak_ALetter,\n"
+    "    WordBreak_SingleQuote,\n"
+    "    WordBreak_DoubleQuote,\n"
     "    WordBreak_MidNumLet,\n"
     "    WordBreak_MidLetter,\n"
     "    WordBreak_MidNum,\n"
@@ -339,7 +375,10 @@ enum WordBreakClass {
     WordBreak_Extend,
     WordBreak_RegionalIndicator,
     WordBreak_Katakana,
+    WordBreak_HebrewLetter,
     WordBreak_ALetter,
+    WordBreak_SingleQuote,
+    WordBreak_DoubleQuote,
     WordBreak_MidNumLet,
     WordBreak_MidLetter,
     WordBreak_MidNum,
@@ -365,7 +404,10 @@ static void initWordBreak()
         { WordBreak_Extend, "Format" },
         { WordBreak_RegionalIndicator, "Regional_Indicator" },
         { WordBreak_Katakana, "Katakana" },
+        { WordBreak_HebrewLetter, "Hebrew_Letter" },
         { WordBreak_ALetter, "ALetter" },
+        { WordBreak_SingleQuote, "Single_Quote" },
+        { WordBreak_DoubleQuote, "Double_Quote" },
         { WordBreak_MidNumLet, "MidNumLet" },
         { WordBreak_MidLetter, "MidLetter" },
         { WordBreak_MidNum, "MidNum" },
@@ -815,6 +857,31 @@ static int appendToSpecialCaseMap(const QList<int> &map)
     return pos;
 }
 
+static inline bool isDefaultIgnorable(uint ucs4)
+{
+    // Returns true if ucs4 is in the hard-coded Default_Ignorable_Code_Point set below.
+    // The property is generated from
+    //    Other_Default_Ignorable_Code_Point + Cf + Variation_Selector
+    //    - White_Space - FFF9..FFFB (Annotation Characters)
+    //    - 0600..0604, 06DD, 070F, 110BD (exceptional Cf characters that should be visible)
+    if (ucs4 <= 0xff) // below U+0100 only U+00AD matches
+        return ucs4 == 0xad;
+
+    return ucs4 == 0x034f
+            || (ucs4 >= 0x115f && ucs4 <= 0x1160)
+            || (ucs4 >= 0x17b4 && ucs4 <= 0x17b5)
+            || (ucs4 >= 0x180b && ucs4 <= 0x180d)
+            || (ucs4 >= 0x200b && ucs4 <= 0x200f)
+            || (ucs4 >= 0x202a && ucs4 <= 0x202e)
+            || (ucs4 >= 0x2060 && ucs4 <= 0x206f)
+            || ucs4 == 0x3164
+            || (ucs4 >= 0xfe00 && ucs4 <= 0xfe0f)
+            || ucs4 == 0xfeff
+            || ucs4 == 0xffa0
+            || (ucs4 >= 0xfff0 && ucs4 <= 0xfff8)
+            || (ucs4 >= 0x1d173 && ucs4 <= 0xe0fff && (ucs4 <= 0x1d17a || ucs4 >= 0xe0000)); // i.e. 1D173..1D17A or E0000..E0FFF
+}
+
 struct UnicodeData {
     UnicodeData(int codepoint = 0) {
         p.category = QChar::Other_NotAssigned; // Cn
@@ -842,6 +909,17 @@ struct UnicodeData {
             || (codepoint >= 0x1EF00 && codepoint <= 0x1EFFF)) {
             p.direction = QChar::DirR;
         }
+        // The unassigned code points that default to ET are in the range:
+        //     [U+20A0..U+20CF]
+        else if (codepoint >= 0x20A0 && codepoint <= 0x20CF) {
+            p.direction = QChar::DirET;
+        }
+        // The unassigned code points that default to BN have one of the following properties:
+        //     Default_Ignorable_Code_Point
+        //     Noncharacter_Code_Point
+        else if (QChar::isNonCharacter(codepoint) || isDefaultIgnorable(codepoint)) {
+            p.direction = QChar::DirBN;
+        }
 
         p.lineBreakClass = LineBreak_AL; // XX -> AL
         // LineBreak.txt
@@ -858,6 +936,11 @@ struct UnicodeData {
             || (codepoint >= 0x30000 && codepoint <= 0x3FFFD)) {
             p.lineBreakClass = LineBreak_ID;
         }
+        // The unassigned code points that default to "PR" comprise a range in the following block:
+        //     [U+20A0..U+20CF]
+        else if (codepoint >= 0x20A0 && codepoint <= 0x20CF) {
+            p.lineBreakClass = LineBreak_PR;
+        }
 
         mirroredChar = 0;
         decompositionType = QChar::NoDecomposition;
@@ -1008,7 +1091,10 @@ static void readUnicodeData()
         else
             ++combiningClassUsage[data.p.combiningClass];
 
-        data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction);
+        Direction dir = directionMap.value(properties[UD_BidiCategory], Dir_Unassigned);
+        if (dir == Dir_Unassigned)
+            qFatal("unhandled direction value: %s", properties[UD_BidiCategory].constData());
+        data.p.direction = QChar::Direction(dir);
 
         if (!properties[UD_UpperCase].isEmpty()) {
             int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
@@ -1180,8 +1266,8 @@ static void readArabicShaping()
             qFatal("unassigned or unhandled joining value: %s", l[2].constData());
 
         if (joining == Joining_Left) {
-            // There are currently no characters of joining type Left_Joining defined in Unicode.
-            qFatal("%x: joining type '%s' was met; the current implementation needs to be revised!", codepoint, l[2].constData());
+            qWarning("ACHTUNG!!! joining type '%s' has been met for U+%X; the current implementation needs to be revised!",
+                     l[2].trimmed().constData(), codepoint);
         }
 
         UnicodeData &d = UnicodeData::valueRef(codepoint);
-- 
cgit v1.2.3