summaryrefslogtreecommitdiffstats
path: root/src/corelib
diff options
context:
space:
mode:
author Konstantin Ritt <ritt.ks@gmail.com> 2013-03-12 18:37:07 +0200
committer The Qt Project <gerrit-noreply@qt-project.org> 2013-03-14 19:49:38 +0100
commit c20422af13fb30751eaa58e8755c7a9a7fd20a50 (patch)
tree a2a08cb439700d6b1dfc916d9d499ee501a183f4 /src/corelib
parent a8e933a74ce44283182b1dea1ec5a73afe6e85a8 (diff)
Move Unicode script itemization code from text engine to UnicodeTools
This is still the same trivial implementation, the only difference being that it properly handles surrogate pairs and combining marks. This temporarily makes QTextEngine::itemize() insignificantly slower due to the use of an intermediate buffer, until the refactoring is done.
Change-Id: I7987d6306b0b5cdb21b837968e292dd70abfe223
Reviewed-by: Eskil Abrahamsen Blomfeldt <eskil.abrahamsen-blomfeldt@digia.com>
Diffstat (limited to 'src/corelib')
-rw-r--r-- src/corelib/tools/qtextboundaryfinder.cpp 39
-rw-r--r-- src/corelib/tools/qunicodetools.cpp 45
-rw-r--r-- src/corelib/tools/qunicodetools_p.h 3
3 files changed, 60 insertions, 27 deletions
diff --git a/src/corelib/tools/qtextboundaryfinder.cpp b/src/corelib/tools/qtextboundaryfinder.cpp
index 51b4ece4b1..5e8aed579d 100644
--- a/src/corelib/tools/qtextboundaryfinder.cpp
+++ b/src/corelib/tools/qtextboundaryfinder.cpp
@@ -53,39 +53,24 @@ public:
static void init(QTextBoundaryFinder::BoundaryType type, const QChar *chars, int length, QCharAttributes *attributes)
{
+ const ushort *string = reinterpret_cast<const ushort *>(chars);
+
QVarLengthArray<QUnicodeTools::ScriptItem> scriptItems;
+ {
+ QVarLengthArray<uchar> scripts(length);
- const ushort *string = reinterpret_cast<const ushort *>(chars);
- const ushort *unicode = string;
- // correctly assign script, isTab and isObject to the script analysis
- const ushort *uc = unicode;
- const ushort *e = uc + length;
- uchar script = QChar::Script_Common;
- uchar lastScript = QChar::Script_Common;
- const ushort *start = uc;
- while (uc < e) {
- int s = QChar::script(*uc);
- if (s != QChar::Script_Inherited)
- script = s;
- if (*uc == QChar::ObjectReplacementCharacter || *uc == QChar::LineSeparator || *uc == 9)
- script = QChar::Script_Common;
- if (script != lastScript) {
- if (uc != start) {
+ QUnicodeTools::initScripts(string, length, scripts.data());
+
+ int start = 0;
+ for (int i = start + 1; i <= length; ++i) {
+ if (i == length || scripts[i] != scripts[start]) {
QUnicodeTools::ScriptItem item;
- item.position = start - string;
- item.script = lastScript;
+ item.position = start;
+ item.script = scripts[start];
scriptItems.append(item);
- start = uc;
+ start = i;
}
- lastScript = script;
}
- ++uc;
- }
- if (uc != start) {
- QUnicodeTools::ScriptItem item;
- item.position = start - string;
- item.script = lastScript;
- scriptItems.append(item);
}
QUnicodeTools::CharAttributeOptions options = 0;
diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp
index 3102035684..4d5c978fd5 100644
--- a/src/corelib/tools/qunicodetools.cpp
+++ b/src/corelib/tools/qunicodetools.cpp
@@ -635,6 +635,51 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
}
}
+
+// ----------------------------------------------------------------------------
+//
+// The Unicode script property. See http://www.unicode.org/reports/tr24/ (some very old version)
+//
+// ----------------------------------------------------------------------------
+
+Q_CORE_EXPORT void initScripts(const ushort *string, int length, uchar *scripts)
+{
+ int sor = 0;
+ int eor = -1;
+ uchar script = QChar::Script_Common;
+ for (int i = 0; i < length; ++i) {
+ eor = i;
+ uint ucs4 = string[i];
+ if (QChar::isHighSurrogate(ucs4) && i + 1 < length) {
+ ushort low = string[i + 1];
+ if (QChar::isLowSurrogate(low)) {
+ ucs4 = QChar::surrogateToUcs4(ucs4, low);
+ ++i;
+ }
+ }
+
+ const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
+
+ if (Q_LIKELY(prop->script == script || prop->script == QChar::Script_Inherited))
+ continue;
+
+ // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
+ // Thus, a combining mark — whatever its script property value is — should inherit
+ // the script property value of its base character.
+ static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
+ if (Q_UNLIKELY(FLAG(prop->category) & test))
+ continue;
+
+ while (sor < eor)
+ scripts[sor++] = script;
+
+ script = prop->script;
+ }
+ eor = length;
+ while (sor < eor)
+ scripts[sor++] = script;
+}
+
} // namespace QUnicodeTools
QT_END_NAMESPACE
diff --git a/src/corelib/tools/qunicodetools_p.h b/src/corelib/tools/qunicodetools_p.h
index 5a4f1659c4..5db3126159 100644
--- a/src/corelib/tools/qunicodetools_p.h
+++ b/src/corelib/tools/qunicodetools_p.h
@@ -96,6 +96,9 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
const ScriptItem *items, int numItems,
QCharAttributes *attributes, CharAttributeOptions options = DefaultOptionsCompat);
+
+Q_CORE_EXPORT void initScripts(const ushort *string, int length, uchar *scripts);
+
} // namespace QUnicodeTools
QT_END_NAMESPACE