QTBF: Fix issue with not splitting words at "." (FULL STOP)

As of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet for better URL and abbreviation handling, which caused "hi.there" to be treated as if it were just a single word; until we have the Unicode Text Segmentation tailoring mechanism, retain the old behavior by remapping (some of) those characters back to their old values. Change-Id: I49dea6064f2ea40a82fc0b1bc3c4f0b4e803919f Reviewed-by: David Faure <david.faure@kdab.com> Reviewed-by: Lars Knoll <lars.knoll@digia.com>
author: Konstantin Ritt <ritt.ks@gmail.com> 2012-11-22 03:25:05 +0200
committer: The Qt Project <gerrit-noreply@qt-project.org> 2012-11-23 11:59:50 +0100
commit: 2fbb69a09361bf1ecf517516f76405c3be494d6d (patch)
tree: 9330626943971c1feb5570532963236f8a9935d6
parent: 44b1c5dde2dfbb69a29cbd4ad8d1f0ac0203b482 (diff)
4 files changed, 64 insertions, 4 deletions
diff --git a/src/corelib/tools/qunicodetables.cpp b/src/corelib/tools/qunicodetables.cpp
index 9a3d6c7069..60754968ac 100644
--- a/src/corelib/tools/qunicodetables.cpp
+++ b/src/corelib/tools/qunicodetables.cpp
@@ -4796,7 +4796,7 @@ static const Properties uc_properties[] = {
     { 26, 3, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 9, 0 },
     { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 11, 8, 0 },
     { 20, 3, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 11, 16, 0 },
-    { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 8, 10, 8, 0 },
+    { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 10, 8, 0 },
     { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0 },
     { 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
     { 3, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
@@ -4808,7 +4808,7 @@ static const Properties uc_properties[] = {
     { 3, 2, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
     { 3, 2, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
     { 3, 2, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
-    { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 9, 11, 8, 0 },
+    { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 11, 8, 0 },
     { 25, 10, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 0, 8, 0 },
     { 26, 10, 0, 0, -1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 12, 0 },
     { 26, 10, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 12, 0 },
diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp
index 3d58f16d19..e86fef61e7 100644
--- a/src/corelib/tools/qunicodetools.cpp
+++ b/src/corelib/tools/qunicodetools.cpp
@@ -156,6 +156,18 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
 
         const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
         QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
+#ifdef QT_BUILD_INTERNAL
+        if (qt_initcharattributes_default_algorithm_only) {
+            // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
+            // which caused "hi.there" to be treated as if it were just a single word;
+            // we work around this by remapping those characters in the Unicode tables generator.
+            // this code restores the standard classes to pass the coverage tests; remove once the issue is fixed.
+            if (ucs4 == 0x002E) // FULL STOP
+                ncls = QUnicodeTables::WordBreak_MidNumLet;
+            else if (ucs4 == 0x003A) // COLON
+                ncls = QUnicodeTables::WordBreak_MidLetter;
+        }
+#endif
 
         uchar action = WB::breakTable[cls][ncls];
         if (Q_UNLIKELY(action == WB::Lookup)) {
diff --git a/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp b/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp
index eeb22f4c21..b126c62c72 100644
--- a/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp
+++ b/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp
@@ -330,8 +330,40 @@ void tst_QTextBoundaryFinder::wordBoundaries_manual_data()
         expectedStartPositions << 0 << 4 << 8  << 14 << 18 << 22;
         expectedEndPositions   << 3 << 7 << 11 << 17 << 21 << 25;
 
-        QTest::newRow("data1") << testString << expectedBreakPositions
-                               << expectedStartPositions << expectedEndPositions;
+        QTest::newRow("words1") << testString << expectedBreakPositions
+                                << expectedStartPositions << expectedEndPositions;
+    }
+    {
+        QString testString(QString::fromUtf8("Hello (sad) world !"));
+        QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions;
+        expectedBreakPositions << 0 << 5 << 6 << 7 << 10 << 11 << 12 << 17 << 18 << 19;
+        expectedStartPositions << 0 << 7  << 12;
+        expectedEndPositions   << 5 << 10 << 17;
+
+        QTest::newRow("words2") << testString << expectedBreakPositions
+                                << expectedStartPositions << expectedEndPositions;
+    }
+    {
+        QString testString(QString::fromUtf8("mr.Hamster"));
+        QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions;
+        expectedBreakPositions << 0 << 2 << 3 << 10;
+        expectedStartPositions << 0 << 3;
+        expectedEndPositions   << 2 << 10;
+
+        QTest::newRow("words3") << testString << expectedBreakPositions
+                                << expectedStartPositions << expectedEndPositions;
+    }
+    {
+        QString testString(QString::fromUtf8("This is     a sample buffer.Please test me .     He's don't Le'Clerk."));
+        QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions;
+        expectedBreakPositions << 0 << 4 << 5 << 7 << 8 << 9 << 10 << 11 << 12 << 13 << 14 << 20 << 21 << 27
+                               << 28 << 34 << 35 << 39 << 40 << 42 << 43 << 44 << 45 << 46 << 47 << 48
+                               << 49 << 53 << 54 << 59 << 60 << 68 << 69;
+        expectedStartPositions << 0 << 5 << 12 << 14 << 21 << 28 << 35 << 40 << 49 << 54 << 60;
+        expectedEndPositions   << 4 << 7 << 13 << 20 << 27 << 34 << 39 << 42 << 53 << 59 << 68;
+
+        QTest::newRow("words4") << testString << expectedBreakPositions
+                                << expectedStartPositions << expectedEndPositions;
     }
     {
         // text with trailing space
@@ -512,6 +544,13 @@ void tst_QTextBoundaryFinder::sentenceBoundaries_manual_data()
 
         QTest::newRow("data2") << testString << expectedBreakPositions;
     }
+    {
+        QString testString(QString::fromUtf8("mr.Hamster"));
+        QList<int> expectedBreakPositions;
+        expectedBreakPositions << 0 << 3 << 10;
+
+        QTest::newRow("data3") << testString << expectedBreakPositions;
+    }
 }
 
 void tst_QTextBoundaryFinder::sentenceBoundaries_manual()
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index 7bc667ca14..8e612f0b03 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -1572,6 +1572,15 @@ static void readWordBreak()
             qFatal("unassigned word break class: %s", l[1].constData());
 
         for (int codepoint = from; codepoint <= to; ++codepoint) {
+            // ### [
+            // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
+            // which caused "hi.there" to be treated as if it were just a single word;
+            // until we have a tailoring mechanism, retain the old behavior by remapping those characters here.
+            if (codepoint == 0x002E) // FULL STOP
+                brk = WordBreak_MidNum;
+            else if (codepoint == 0x003A) // COLON
+                brk = WordBreak_Other;
+            // ] ###
             UnicodeData &ud = UnicodeData::valueRef(codepoint);
             ud.p.wordBreakClass = brk;
         }
author	Konstantin Ritt <ritt.ks@gmail.com>	2012-11-22 03:25:05 +0200
committer	The Qt Project <gerrit-noreply@qt-project.org>	2012-11-23 11:59:50 +0100
commit	2fbb69a09361bf1ecf517516f76405c3be494d6d (patch)
tree	9330626943971c1feb5570532963236f8a9935d6
parent	44b1c5dde2dfbb69a29cbd4ad8d1f0ac0203b482 (diff)