From 0ec07b68ad34e135451dd5291732bf73d297ba0c Mon Sep 17 00:00:00 2001 From: Konstantin Ritt Date: Thu, 10 Apr 2014 13:50:53 +0300 Subject: Improve the Unicode script itemization implementation Make it closer to the Unicode specs (UAX#24): * Common now inherits the preceding character's script, if any; * In a combining character sequence, if the base character is of Common script, the entire sequence is treated like if it were of the first non-Inherited, non-Common script in the sequence. See http://www.unicode.org/reports/tr24/tr24-21.html for more details. [ChangeLog][QtGui] Fixed regression in arabic text rendering. Task-number: QTBUG-28813 Task-number: QTBUG-29930 (related) Task-number: QTBUG-35836 Change-Id: Id85761965b08ca94c674d5f3613fe58b82b2ce9c Reviewed-by: Eskil Abrahamsen Blomfeldt Reviewed-by: Ahmed Saidi --- src/corelib/tools/qunicodetools.cpp | 38 +++++++++++++++++++--- src/gui/text/qtextengine.cpp | 17 +--------- .../qtextscriptengine/tst_qtextscriptengine.cpp | 22 ++++--------- 3 files changed, 42 insertions(+), 35 deletions(-) diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp index fac795051a..fc36d07a4a 100644 --- a/src/corelib/tools/qunicodetools.cpp +++ b/src/corelib/tools/qunicodetools.cpp @@ -667,7 +667,7 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length, // ---------------------------------------------------------------------------- // -// The Unicode script property. See http://www.unicode.org/reports/tr24/ (some very old version) +// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-21.html // // ---------------------------------------------------------------------------- @@ -689,15 +689,36 @@ Q_CORE_EXPORT void initScripts(const ushort *string, int length, uchar *scripts) const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4); - if (Q_LIKELY(prop->script == script || prop->script == QChar::Script_Inherited)) + if (Q_LIKELY(prop->script == script || prop->script <= QChar::Script_Inherited)) continue; // Never break between a combining mark (gc= Mc, Mn or Me) and its base character. // Thus, a combining mark — whatever its script property value is — should inherit // the script property value of its base character. static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing)); - if (Q_UNLIKELY(FLAG(prop->category) & test)) - continue; + if (Q_UNLIKELY(FLAG(prop->category) & test)) { + // In cases where the base character itself has the Common script property value, + // and it is followed by one or more combining marks with a specific script property value, + // it may be even better for processing to let the base acquire the script property value + // from the first mark. This approach can be generalized by treating all the characters + // of a combining character sequence as having the script property value + // of the first non-Inherited, non-Common character in the sequence if there is one, + // and otherwise treating all the characters as having the Common script property value. + if (Q_LIKELY(script > QChar::Script_Common || prop->script <= QChar::Script_Common)) + continue; + + script = QChar::Script(prop->script); + } + + if (Q_LIKELY(script != QChar::Script_Common)) { + // override preceding Common-s + while (sor > 0 && scripts[sor - 1] == QChar::Script_Common) + --sor; + } else { + // see if we are inheriting preceding run + if (sor > 0) + script = scripts[sor - 1]; + } while (sor < eor) scripts[sor++] = script; @@ -705,6 +726,15 @@ Q_CORE_EXPORT void initScripts(const ushort *string, int length, uchar *scripts) script = prop->script; } eor = length; + if (Q_LIKELY(script != QChar::Script_Common)) { + // override preceding Common-s + while (sor > 0 && scripts[sor - 1] == QChar::Script_Common) + --sor; + } else { + // see if we are inheriting preceding run + if (sor > 0) + script = scripts[sor - 1]; + } while (sor < eor) scripts[sor++] = script; } diff --git a/src/gui/text/qtextengine.cpp b/src/gui/text/qtextengine.cpp index 967ba24fcf..34788dc4dc 100644 --- a/src/gui/text/qtextengine.cpp +++ b/src/gui/text/qtextengine.cpp @@ -122,20 +122,9 @@ private: return; const int end = start + length; for (int i = start + 1; i < end; ++i) { - // According to the unicode spec we should be treating characters in the Common script - // (punctuation, spaces, etc) as being the same script as the surrounding text for the - // purpose of splitting up text. This is important because, for example, a fullstop - // (0x2E) can be used to indicate an abbreviation and so must be treated as part of a - // word. Thus it must be passed along with the word in languages that have to calculate - // word breaks. For example the thai word "ครม." has no word breaks but the word "ครม" - // does. - // Unfortuntely because we split up the strings for both wordwrapping and for setting - // the font and because Japanese and Chinese are also aliases of the script "Common", - // doing this would break too many things. So instead we only pass the full stop - // along, and nothing else. if (m_analysis[i].bidiLevel == m_analysis[start].bidiLevel && m_analysis[i].flags == m_analysis[start].flags - && (m_analysis[i].script == m_analysis[start].script || m_string[i] == QLatin1Char('.')) + && m_analysis[i].script == m_analysis[start].script && m_analysis[i].flags < QScriptAnalysis::SpaceTabOrObject && i - start < MaxItemLength) continue; @@ -1515,26 +1504,22 @@ void QTextEngine::itemize() const while (uc < e) { switch (*uc) { case QChar::ObjectReplacementCharacter: - analysis->script = QChar::Script_Common; analysis->flags = QScriptAnalysis::Object; break; case QChar::LineSeparator: if (analysis->bidiLevel % 2) --analysis->bidiLevel; - analysis->script = QChar::Script_Common; analysis->flags = QScriptAnalysis::LineOrParagraphSeparator; if (option.flags() & QTextOption::ShowLineAndParagraphSeparators) *const_cast(uc) = 0x21B5; // visual line separator break; case QChar::Tabulation: - analysis->script = QChar::Script_Common; analysis->flags = QScriptAnalysis::Tab; analysis->bidiLevel = control.baseLevel(); break; case QChar::Space: case QChar::Nbsp: if (option.flags() & QTextOption::ShowTabsAndSpaces) { - analysis->script = QChar::Script_Common; analysis->flags = QScriptAnalysis::Space; analysis->bidiLevel = control.baseLevel(); break; diff --git a/tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp b/tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp index 5dfb025510..74eb58670b 100644 --- a/tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp +++ b/tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp @@ -1258,29 +1258,21 @@ void tst_QTextScriptEngine::thaiWithZWJ() QTextLayout layout(s, font); QTextEngine *e = layout.engine(); e->itemize(); - QCOMPARE(e->layoutData->items.size(), 11); + QCOMPARE(e->layoutData->items.size(), 3); for (int item = 0; item < e->layoutData->items.size(); ++item) e->shape(item); - QCOMPARE(e->layoutData->items[0].num_glyphs, ushort(7)); // Thai: The ZWJ and ZWNJ characters are inherited, so should be part of the thai script - QCOMPARE(e->layoutData->items[1].num_glyphs, ushort(1)); // Common: The smart quotes cannot be handled by thai, so should be a separate item - QCOMPARE(e->layoutData->items[2].num_glyphs, ushort(1)); // Thai: Thai character - QCOMPARE(e->layoutData->items[3].num_glyphs, ushort(1)); // Common: Ellipsis - QCOMPARE(e->layoutData->items[4].num_glyphs, ushort(1)); // Thai: Thai character - QCOMPARE(e->layoutData->items[5].num_glyphs, ushort(1)); // Common: Smart quote - QCOMPARE(e->layoutData->items[6].num_glyphs, ushort(1)); // Thai: Thai character - QCOMPARE(e->layoutData->items[7].num_glyphs, ushort(1)); // Common: \xA0 = non-breaking space. Could be useful to have in thai, but not currently implemented - QCOMPARE(e->layoutData->items[8].num_glyphs, ushort(1)); // Thai: Thai character - QCOMPARE(e->layoutData->items[9].num_glyphs, ushort(1)); // Japanese: Kanji for tree - QCOMPARE(e->layoutData->items[10].num_glyphs, ushort(2)); // Thai: Thai character followed by superscript "a" which is of inherited type + QCOMPARE(e->layoutData->items[0].num_glyphs, ushort(15)); // Thai: The ZWJ and ZWNJ characters are inherited, so should be part of the thai script + QCOMPARE(e->layoutData->items[1].num_glyphs, ushort(1)); // Han: Kanji for tree + QCOMPARE(e->layoutData->items[2].num_glyphs, ushort(2)); // Thai: Thai character followed by superscript "a" which is of inherited type //A quick sanity check - check all the characters are individual clusters unsigned short *logClusters = e->layoutData->logClustersPtr; - for (int i = 0; i < 7; i++) + for (int i = 0; i <= 14; i++) QCOMPARE(logClusters[i], ushort(i)); - for (int i = 0; i < 10; i++) - QCOMPARE(logClusters[i+7], ushort(0)); + QCOMPARE(logClusters[15], ushort(0)); + QCOMPARE(logClusters[16], ushort(0)); #ifndef Q_OS_MAC // ### Result differs for HarfBuzz-NG QCOMPARE(logClusters[17], ushort(1)); -- cgit v1.2.3