summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/corelib/tools/qunicodetables.cpp4
-rw-r--r--src/corelib/tools/qunicodetools.cpp12
-rw-r--r--tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp43
-rw-r--r--util/unicode/main.cpp9
4 files changed, 64 insertions, 4 deletions
diff --git a/src/corelib/tools/qunicodetables.cpp b/src/corelib/tools/qunicodetables.cpp
index 9a3d6c7069..60754968ac 100644
--- a/src/corelib/tools/qunicodetables.cpp
+++ b/src/corelib/tools/qunicodetables.cpp
@@ -4796,7 +4796,7 @@ static const Properties uc_properties[] = {
{ 26, 3, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 9, 0 },
{ 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 11, 8, 0 },
{ 20, 3, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 11, 16, 0 },
- { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 8, 10, 8, 0 },
+ { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 10, 8, 0 },
{ 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0 },
{ 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
{ 3, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
@@ -4808,7 +4808,7 @@ static const Properties uc_properties[] = {
{ 3, 2, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
{ 3, 2, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
{ 3, 2, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
- { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 9, 11, 8, 0 },
+ { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 11, 8, 0 },
{ 25, 10, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 0, 8, 0 },
{ 26, 10, 0, 0, -1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 12, 0 },
{ 26, 10, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 12, 0 },
diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp
index 3d58f16d19..e86fef61e7 100644
--- a/src/corelib/tools/qunicodetools.cpp
+++ b/src/corelib/tools/qunicodetools.cpp
@@ -156,6 +156,18 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
+#ifdef QT_BUILD_INTERNAL
+ if (qt_initcharattributes_default_algorithm_only) {
+ // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
+ // which caused "hi.there" to be treated like if it were just a single word;
+ // by remapping those characters in the Unicode tables generator.
+ // this code is needed to pass the coverage tests; remove once the issue is fixed.
+ if (ucs4 == 0x002E) // FULL STOP
+ ncls = QUnicodeTables::WordBreak_MidNumLet;
+ else if (ucs4 == 0x003A) // COLON
+ ncls = QUnicodeTables::WordBreak_MidLetter;
+ }
+#endif
uchar action = WB::breakTable[cls][ncls];
if (Q_UNLIKELY(action == WB::Lookup)) {
diff --git a/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp b/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp
index eeb22f4c21..b126c62c72 100644
--- a/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp
+++ b/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp
@@ -330,8 +330,40 @@ void tst_QTextBoundaryFinder::wordBoundaries_manual_data()
expectedStartPositions << 0 << 4 << 8 << 14 << 18 << 22;
expectedEndPositions << 3 << 7 << 11 << 17 << 21 << 25;
- QTest::newRow("data1") << testString << expectedBreakPositions
- << expectedStartPositions << expectedEndPositions;
+ QTest::newRow("words1") << testString << expectedBreakPositions
+ << expectedStartPositions << expectedEndPositions;
+ }
+ {
+ QString testString(QString::fromUtf8("Hello (sad) world !"));
+ QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions;
+ expectedBreakPositions << 0 << 5 << 6 << 7 << 10 << 11 << 12 << 17 << 18 << 19;
+ expectedStartPositions << 0 << 7 << 12;
+ expectedEndPositions << 5 << 10 << 17;
+
+ QTest::newRow("words2") << testString << expectedBreakPositions
+ << expectedStartPositions << expectedEndPositions;
+ }
+ {
+ QString testString(QString::fromUtf8("mr.Hamster"));
+ QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions;
+ expectedBreakPositions << 0 << 2 << 3 << 10;
+ expectedStartPositions << 0 << 3;
+ expectedEndPositions << 2 << 10;
+
+ QTest::newRow("words3") << testString << expectedBreakPositions
+ << expectedStartPositions << expectedEndPositions;
+ }
+ {
+ QString testString(QString::fromUtf8("This is a sample buffer.Please test me . He's don't Le'Clerk."));
+ QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions;
+ expectedBreakPositions << 0 << 4 << 5 << 7 << 8 << 9 << 10 << 11 << 12 << 13 << 14 << 20 << 21 << 27
+ << 28 << 34 << 35 << 39 << 40 << 42 << 43 << 44 << 45 << 46 << 47 << 48
+ << 49 << 53 << 54 << 59 << 60 << 68 << 69;
+ expectedStartPositions << 0 << 5 << 12 << 14 << 21 << 28 << 35 << 40 << 49 << 54 << 60;
+ expectedEndPositions << 4 << 7 << 13 << 20 << 27 << 34 << 39 << 42 << 53 << 59 << 68;
+
+ QTest::newRow("words4") << testString << expectedBreakPositions
+ << expectedStartPositions << expectedEndPositions;
}
{
// text with trailing space
@@ -512,6 +544,13 @@ void tst_QTextBoundaryFinder::sentenceBoundaries_manual_data()
QTest::newRow("data2") << testString << expectedBreakPositions;
}
+ {
+ QString testString(QString::fromUtf8("mr.Hamster"));
+ QList<int> expectedBreakPositions;
+ expectedBreakPositions << 0 << 3 << 10;
+
+ QTest::newRow("data3") << testString << expectedBreakPositions;
+ }
}
void tst_QTextBoundaryFinder::sentenceBoundaries_manual()
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index 7bc667ca14..8e612f0b03 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -1572,6 +1572,15 @@ static void readWordBreak()
qFatal("unassigned word break class: %s", l[1].constData());
for (int codepoint = from; codepoint <= to; ++codepoint) {
+ // ### [
+ // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
+ // which caused "hi.there" to be treated like if it were just a single word;
+ // until we have a tailoring mechanism, retain the old behavior by remapping those characters here.
+ if (codepoint == 0x002E) // FULL STOP
+ brk = WordBreak_MidNum;
+ else if (codepoint == 0x003A) // COLON
+ brk = WordBreak_Other;
+ // ] ###
UnicodeData &ud = UnicodeData::valueRef(codepoint);
ud.p.wordBreakClass = brk;
}