4 files changed, 64 insertions, 4 deletions
diff --git a/src/corelib/tools/qunicodetables.cpp b/src/corelib/tools/qunicodetables.cpp
index 9a3d6c7069..60754968ac 100644
--- a/src/corelib/tools/qunicodetables.cpp
+++ b/src/corelib/tools/qunicodetables.cpp
@@ -4796,7 +4796,7 @@ static const Properties uc_properties[] = {
     { 26, 3, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 9, 0 },
     { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 11, 8, 0 },
     { 20, 3, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 11, 16, 0 },
-    { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 8, 10, 8, 0 },
+    { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 10, 8, 0 },
     { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0 },
     { 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
     { 3, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
@@ -4808,7 +4808,7 @@ static const Properties uc_properties[] = {
     { 3, 2, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
     { 3, 2, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
     { 3, 2, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
-    { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 9, 11, 8, 0 },
+    { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 11, 8, 0 },
     { 25, 10, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 0, 8, 0 },
     { 26, 10, 0, 0, -1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 12, 0 },
     { 26, 10, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 12, 0 },
diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp
index 3d58f16d19..e86fef61e7 100644
--- a/src/corelib/tools/qunicodetools.cpp
+++ b/src/corelib/tools/qunicodetools.cpp
@@ -156,6 +156,18 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
 
         const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
         QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
+#ifdef QT_BUILD_INTERNAL
+        if (qt_initcharattributes_default_algorithm_only) {
+            // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
+            // which caused "hi.there" to be treated like if it were just a single word;
+            // by remapping those characters in the Unicode tables generator.
+            // this code is needed to pass the coverage tests; remove once the issue is fixed.
+            if (ucs4 == 0x002E) // FULL STOP
+                ncls = QUnicodeTables::WordBreak_MidNumLet;
+            else if (ucs4 == 0x003A) // COLON
+                ncls = QUnicodeTables::WordBreak_MidLetter;
+        }
+#endif
 
         uchar action = WB::breakTable[cls][ncls];
         if (Q_UNLIKELY(action == WB::Lookup)) {
diff --git a/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp b/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp
index eeb22f4c21..b126c62c72 100644
--- a/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp
+++ b/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp
@@ -330,8 +330,40 @@ void tst_QTextBoundaryFinder::wordBoundaries_manual_data()
         expectedStartPositions << 0 << 4 << 8  << 14 << 18 << 22;
         expectedEndPositions   << 3 << 7 << 11 << 17 << 21 << 25;
 
-        QTest::newRow("data1") << testString << expectedBreakPositions
-                               << expectedStartPositions << expectedEndPositions;
+        QTest::newRow("words1") << testString << expectedBreakPositions
+                                << expectedStartPositions << expectedEndPositions;
+    }
+    {
+        QString testString(QString::fromUtf8("Hello (sad) world !"));
+        QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions;
+        expectedBreakPositions << 0 << 5 << 6 << 7 << 10 << 11 << 12 << 17 << 18 << 19;
+        expectedStartPositions << 0 << 7  << 12;
+        expectedEndPositions   << 5 << 10 << 17;
+
+        QTest::newRow("words2") << testString << expectedBreakPositions
+                                << expectedStartPositions << expectedEndPositions;
+    }
+    {
+        QString testString(QString::fromUtf8("mr.Hamster"));
+        QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions;
+        expectedBreakPositions << 0 << 2 << 3 << 10;
+        expectedStartPositions << 0 << 3;
+        expectedEndPositions   << 2 << 10;
+
+        QTest::newRow("words3") << testString << expectedBreakPositions
+                                << expectedStartPositions << expectedEndPositions;
+    }
+    {
+        QString testString(QString::fromUtf8("This is     a sample buffer.Please test me .     He's don't Le'Clerk."));
+        QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions;
+        expectedBreakPositions << 0 << 4 << 5 << 7 << 8 << 9 << 10 << 11 << 12 << 13 << 14 << 20 << 21 << 27
+                               << 28 << 34 << 35 << 39 << 40 << 42 << 43 << 44 << 45 << 46 << 47 << 48
+                               << 49 << 53 << 54 << 59 << 60 << 68 << 69;
+        expectedStartPositions << 0 << 5 << 12 << 14 << 21 << 28 << 35 << 40 << 49 << 54 << 60;
+        expectedEndPositions   << 4 << 7 << 13 << 20 << 27 << 34 << 39 << 42 << 53 << 59 << 68;
+
+        QTest::newRow("words4") << testString << expectedBreakPositions
+                                << expectedStartPositions << expectedEndPositions;
     }
     {
         // text with trailing space
@@ -512,6 +544,13 @@ void tst_QTextBoundaryFinder::sentenceBoundaries_manual_data()
 
         QTest::newRow("data2") << testString << expectedBreakPositions;
     }
+    {
+        QString testString(QString::fromUtf8("mr.Hamster"));
+        QList<int> expectedBreakPositions;
+        expectedBreakPositions << 0 << 3 << 10;
+
+        QTest::newRow("data3") << testString << expectedBreakPositions;
+    }
 }
 
 void tst_QTextBoundaryFinder::sentenceBoundaries_manual()
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index 7bc667ca14..8e612f0b03 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -1572,6 +1572,15 @@ static void readWordBreak()
             qFatal("unassigned word break class: %s", l[1].constData());
 
         for (int codepoint = from; codepoint <= to; ++codepoint) {
+            // ### [
+            // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
+            // which caused "hi.there" to be treated like if it were just a single word;
+            // until we have a tailoring mechanism, retain the old behavior by remapping those characters here.
+            if (codepoint == 0x002E) // FULL STOP
+                brk = WordBreak_MidNum;
+            else if (codepoint == 0x003A) // COLON
+                brk = WordBreak_Other;
+            // ] ###
             UnicodeData &ud = UnicodeData::valueRef(codepoint);
             ud.p.wordBreakClass = brk;
         }