diff options
-rw-r--r-- | src/corelib/tools/qunicodetables.cpp | 4 | ||||
-rw-r--r-- | src/corelib/tools/qunicodetools.cpp | 12 | ||||
-rw-r--r-- | tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp | 43 | ||||
-rw-r--r-- | util/unicode/main.cpp | 9 |
4 files changed, 64 insertions, 4 deletions
diff --git a/src/corelib/tools/qunicodetables.cpp b/src/corelib/tools/qunicodetables.cpp index 9a3d6c7069..60754968ac 100644 --- a/src/corelib/tools/qunicodetables.cpp +++ b/src/corelib/tools/qunicodetables.cpp @@ -4796,7 +4796,7 @@ static const Properties uc_properties[] = { { 26, 3, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 9, 0 }, { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 11, 8, 0 }, { 20, 3, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 11, 16, 0 }, - { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 8, 10, 8, 0 }, + { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 10, 8, 0 }, { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0 }, { 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 }, { 3, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 }, @@ -4808,7 +4808,7 @@ static const Properties uc_properties[] = { { 3, 2, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 }, { 3, 2, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 }, { 3, 2, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 }, - { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 9, 11, 8, 0 }, + { 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 11, 8, 0 }, { 25, 10, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 0, 8, 0 }, { 26, 10, 0, 0, -1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 12, 0 }, { 26, 10, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 12, 0 }, diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp index 3d58f16d19..e86fef61e7 100644 --- a/src/corelib/tools/qunicodetools.cpp +++ b/src/corelib/tools/qunicodetools.cpp @@ -156,6 +156,18 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4); QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass; +#ifdef QT_BUILD_INTERNAL + if (qt_initcharattributes_default_algorithm_only) { + // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet + // which caused "hi.there" to be treated like if it were just a single word; + // by remapping those characters in the Unicode tables generator. + // this code is needed to pass the coverage tests; remove once the issue is fixed. + if (ucs4 == 0x002E) // FULL STOP + ncls = QUnicodeTables::WordBreak_MidNumLet; + else if (ucs4 == 0x003A) // COLON + ncls = QUnicodeTables::WordBreak_MidLetter; + } +#endif uchar action = WB::breakTable[cls][ncls]; if (Q_UNLIKELY(action == WB::Lookup)) { diff --git a/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp b/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp index eeb22f4c21..b126c62c72 100644 --- a/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp +++ b/tests/auto/corelib/tools/qtextboundaryfinder/tst_qtextboundaryfinder.cpp @@ -330,8 +330,40 @@ void tst_QTextBoundaryFinder::wordBoundaries_manual_data() expectedStartPositions << 0 << 4 << 8 << 14 << 18 << 22; expectedEndPositions << 3 << 7 << 11 << 17 << 21 << 25; - QTest::newRow("data1") << testString << expectedBreakPositions - << expectedStartPositions << expectedEndPositions; + QTest::newRow("words1") << testString << expectedBreakPositions + << expectedStartPositions << expectedEndPositions; + } + { + QString testString(QString::fromUtf8("Hello (sad) world !")); + QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions; + expectedBreakPositions << 0 << 5 << 6 << 7 << 10 << 11 << 12 << 17 << 18 << 19; + expectedStartPositions << 0 << 7 << 12; + expectedEndPositions << 5 << 10 << 17; + + QTest::newRow("words2") << testString << expectedBreakPositions + << expectedStartPositions << expectedEndPositions; + } + { + QString testString(QString::fromUtf8("mr.Hamster")); + QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions; + expectedBreakPositions << 0 << 2 << 3 << 10; + expectedStartPositions << 0 << 3; + expectedEndPositions << 2 << 10; + + QTest::newRow("words3") << testString << expectedBreakPositions + << expectedStartPositions << expectedEndPositions; + } + { + QString testString(QString::fromUtf8("This is a sample buffer.Please test me . He's don't Le'Clerk.")); + QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions; + expectedBreakPositions << 0 << 4 << 5 << 7 << 8 << 9 << 10 << 11 << 12 << 13 << 14 << 20 << 21 << 27 + << 28 << 34 << 35 << 39 << 40 << 42 << 43 << 44 << 45 << 46 << 47 << 48 + << 49 << 53 << 54 << 59 << 60 << 68 << 69; + expectedStartPositions << 0 << 5 << 12 << 14 << 21 << 28 << 35 << 40 << 49 << 54 << 60; + expectedEndPositions << 4 << 7 << 13 << 20 << 27 << 34 << 39 << 42 << 53 << 59 << 68; + + QTest::newRow("words4") << testString << expectedBreakPositions + << expectedStartPositions << expectedEndPositions; } { // text with trailing space @@ -512,6 +544,13 @@ void tst_QTextBoundaryFinder::sentenceBoundaries_manual_data() QTest::newRow("data2") << testString << expectedBreakPositions; } + { + QString testString(QString::fromUtf8("mr.Hamster")); + QList<int> expectedBreakPositions; + expectedBreakPositions << 0 << 3 << 10; + + QTest::newRow("data3") << testString << expectedBreakPositions; + } } void tst_QTextBoundaryFinder::sentenceBoundaries_manual() diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp index 7bc667ca14..8e612f0b03 100644 --- a/util/unicode/main.cpp +++ b/util/unicode/main.cpp @@ -1572,6 +1572,15 @@ static void readWordBreak() qFatal("unassigned word break class: %s", l[1].constData()); for (int codepoint = from; codepoint <= to; ++codepoint) { + // ### [ + // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet + // which caused "hi.there" to be treated like if it were just a single word; + // until we have a tailoring mechanism, retain the old behavior by remapping those characters here. + if (codepoint == 0x002E) // FULL STOP + brk = WordBreak_MidNum; + else if (codepoint == 0x003A) // COLON + brk = WordBreak_Other; + // ] ### UnicodeData &ud = UnicodeData::valueRef(codepoint); ud.p.wordBreakClass = brk; } |