From 0ec07b68ad34e135451dd5291732bf73d297ba0c Mon Sep 17 00:00:00 2001
From: Konstantin Ritt <ritt.ks@gmail.com>
Date: Thu, 10 Apr 2014 13:50:53 +0300
Subject: Improve the Unicode script itemization implementation

Make it closer to the Unicode specs (UAX#24):
* Common now inherits the preceding character's script, if any;
* In a combining character sequence, if the base character is
  of Common script, the entire sequence is treated as if it were
  of the first non-Inherited, non-Common script in the sequence.
See http://www.unicode.org/reports/tr24/tr24-21.html for more details.

[ChangeLog][QtGui] Fixed regression in Arabic text rendering.

Task-number: QTBUG-28813
Task-number: QTBUG-29930 (related)
Task-number: QTBUG-35836
Change-Id: Id85761965b08ca94c674d5f3613fe58b82b2ce9c
Reviewed-by: Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@digia.com>
Reviewed-by: Ahmed Saidi <justroftest@gmail.com>
---
 src/corelib/tools/qunicodetools.cpp                | 38 +++++++++++++++++++---
 src/gui/text/qtextengine.cpp                       | 17 +---------
 .../qtextscriptengine/tst_qtextscriptengine.cpp    | 22 ++++---------
 3 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp
index fac795051a..fc36d07a4a 100644
--- a/src/corelib/tools/qunicodetools.cpp
+++ b/src/corelib/tools/qunicodetools.cpp
@@ -667,7 +667,7 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
 
 // ----------------------------------------------------------------------------
 //
-// The Unicode script property. See http://www.unicode.org/reports/tr24/ (some very old version)
+// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-21.html
 //
 // ----------------------------------------------------------------------------
 
@@ -689,15 +689,36 @@ Q_CORE_EXPORT void initScripts(const ushort *string, int length, uchar *scripts)
 
         const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
 
-        if (Q_LIKELY(prop->script == script || prop->script == QChar::Script_Inherited))
+        if (Q_LIKELY(prop->script == script || prop->script <= QChar::Script_Inherited))
             continue;
 
         // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
         // Thus, a combining mark — whatever its script property value is — should inherit
         // the script property value of its base character.
         static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
-        if (Q_UNLIKELY(FLAG(prop->category) & test))
-            continue;
+        if (Q_UNLIKELY(FLAG(prop->category) & test)) {
+            // In cases where the base character itself has the Common script property value,
+            // and it is followed by one or more combining marks with a specific script property value,
+            // it may be even better for processing to let the base acquire the script property value
+            // from the first mark. This approach can be generalized by treating all the characters
+            // of a combining character sequence as having the script property value
+            // of the first non-Inherited, non-Common character in the sequence if there is one,
+            // and otherwise treating all the characters as having the Common script property value.
+            if (Q_LIKELY(script > QChar::Script_Common || prop->script <= QChar::Script_Common))
+                continue;
+
+            script = QChar::Script(prop->script);
+        }
+
+        if (Q_LIKELY(script != QChar::Script_Common)) {
+            // override preceding Common-s
+            while (sor > 0 && scripts[sor - 1] == QChar::Script_Common)
+                --sor;
+        } else {
+            // see if we are inheriting preceding run
+            if (sor > 0)
+                script = scripts[sor - 1];
+        }
 
         while (sor < eor)
             scripts[sor++] = script;
@@ -705,6 +726,15 @@ Q_CORE_EXPORT void initScripts(const ushort *string, int length, uchar *scripts)
         script = prop->script;
     }
     eor = length;
+    if (Q_LIKELY(script != QChar::Script_Common)) {
+        // override preceding Common-s
+        while (sor > 0 && scripts[sor - 1] == QChar::Script_Common)
+            --sor;
+    } else {
+        // see if we are inheriting preceding run
+        if (sor > 0)
+            script = scripts[sor - 1];
+    }
     while (sor < eor)
         scripts[sor++] = script;
 }
diff --git a/src/gui/text/qtextengine.cpp b/src/gui/text/qtextengine.cpp
index 967ba24fcf..34788dc4dc 100644
--- a/src/gui/text/qtextengine.cpp
+++ b/src/gui/text/qtextengine.cpp
@@ -122,20 +122,9 @@ private:
             return;
         const int end = start + length;
         for (int i = start + 1; i < end; ++i) {
-            // According to the unicode spec we should be treating characters in the Common script
-            // (punctuation, spaces, etc) as being the same script as the surrounding text for the
-            // purpose of splitting up text. This is important because, for example, a fullstop
-            // (0x2E) can be used to indicate an abbreviation and so must be treated as part of a
-            // word.  Thus it must be passed along with the word in languages that have to calculate
-            // word breaks.  For example the thai word "ครม." has no word breaks but the word "ครม"
-            // does.
-            // Unfortuntely because we split up the strings for both wordwrapping and for setting
-            // the font and because Japanese and Chinese are also aliases of the script "Common",
-            // doing this would break too many things.  So instead we only pass the full stop
-            // along, and nothing else.
             if (m_analysis[i].bidiLevel == m_analysis[start].bidiLevel
                 && m_analysis[i].flags == m_analysis[start].flags
-                && (m_analysis[i].script == m_analysis[start].script || m_string[i] == QLatin1Char('.'))
+                && m_analysis[i].script == m_analysis[start].script
                 && m_analysis[i].flags < QScriptAnalysis::SpaceTabOrObject
                 && i - start < MaxItemLength)
                 continue;
@@ -1515,26 +1504,22 @@ void QTextEngine::itemize() const
     while (uc < e) {
         switch (*uc) {
         case QChar::ObjectReplacementCharacter:
-            analysis->script = QChar::Script_Common;
             analysis->flags = QScriptAnalysis::Object;
             break;
         case QChar::LineSeparator:
             if (analysis->bidiLevel % 2)
                 --analysis->bidiLevel;
-            analysis->script = QChar::Script_Common;
             analysis->flags = QScriptAnalysis::LineOrParagraphSeparator;
             if (option.flags() & QTextOption::ShowLineAndParagraphSeparators)
                 *const_cast<ushort*>(uc) = 0x21B5; // visual line separator
             break;
         case QChar::Tabulation:
-            analysis->script = QChar::Script_Common;
             analysis->flags = QScriptAnalysis::Tab;
             analysis->bidiLevel = control.baseLevel();
             break;
         case QChar::Space:
         case QChar::Nbsp:
             if (option.flags() & QTextOption::ShowTabsAndSpaces) {
-                analysis->script = QChar::Script_Common;
                 analysis->flags = QScriptAnalysis::Space;
                 analysis->bidiLevel = control.baseLevel();
                 break;
diff --git a/tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp b/tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp
index 5dfb025510..74eb58670b 100644
--- a/tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp
+++ b/tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp
@@ -1258,29 +1258,21 @@ void tst_QTextScriptEngine::thaiWithZWJ()
     QTextLayout layout(s, font);
     QTextEngine *e = layout.engine();
     e->itemize();
-    QCOMPARE(e->layoutData->items.size(), 11);
+    QCOMPARE(e->layoutData->items.size(), 3);
 
     for (int item = 0; item < e->layoutData->items.size(); ++item)
         e->shape(item);
 
-    QCOMPARE(e->layoutData->items[0].num_glyphs, ushort(7));  // Thai: The ZWJ and ZWNJ characters are inherited, so should be part of the thai script
-    QCOMPARE(e->layoutData->items[1].num_glyphs, ushort(1));  // Common: The smart quotes cannot be handled by thai, so should be a separate item
-    QCOMPARE(e->layoutData->items[2].num_glyphs, ushort(1));  // Thai: Thai character
-    QCOMPARE(e->layoutData->items[3].num_glyphs, ushort(1));  // Common: Ellipsis
-    QCOMPARE(e->layoutData->items[4].num_glyphs, ushort(1));  // Thai: Thai character
-    QCOMPARE(e->layoutData->items[5].num_glyphs, ushort(1));  // Common: Smart quote
-    QCOMPARE(e->layoutData->items[6].num_glyphs, ushort(1));  // Thai: Thai character
-    QCOMPARE(e->layoutData->items[7].num_glyphs, ushort(1));  // Common: \xA0 = non-breaking space. Could be useful to have in thai, but not currently implemented
-    QCOMPARE(e->layoutData->items[8].num_glyphs, ushort(1));  // Thai: Thai character
-    QCOMPARE(e->layoutData->items[9].num_glyphs, ushort(1));  // Japanese: Kanji for tree
-    QCOMPARE(e->layoutData->items[10].num_glyphs, ushort(2)); // Thai: Thai character followed by superscript "a" which is of inherited type
+    QCOMPARE(e->layoutData->items[0].num_glyphs, ushort(15)); // Thai: The ZWJ and ZWNJ characters are inherited, so should be part of the thai script
+    QCOMPARE(e->layoutData->items[1].num_glyphs, ushort(1));  // Han: Kanji for tree
+    QCOMPARE(e->layoutData->items[2].num_glyphs, ushort(2));  // Thai: Thai character followed by superscript "a" which is of inherited type
 
     //A quick sanity check - check all the characters are individual clusters
     unsigned short *logClusters = e->layoutData->logClustersPtr;
-    for (int i = 0; i < 7; i++)
+    for (int i = 0; i <= 14; i++)
         QCOMPARE(logClusters[i], ushort(i));
-    for (int i = 0; i < 10; i++)
-        QCOMPARE(logClusters[i+7], ushort(0));
+    QCOMPARE(logClusters[15], ushort(0));
+    QCOMPARE(logClusters[16], ushort(0));
 #ifndef Q_OS_MAC
     // ### Result differs for HarfBuzz-NG
     QCOMPARE(logClusters[17], ushort(1));
-- 
cgit v1.2.3