diff options
author | Giuseppe D'Angelo <giuseppe.dangelo@kdab.com> | 2021-08-22 19:36:49 +0200 |
---|---|---|
committer | Giuseppe D'Angelo <giuseppe.dangelo@kdab.com> | 2021-08-24 12:58:20 +0200 |
commit | d48058f1970a795afb4cedaae54dde7ca69cb252 (patch) | |
tree | ac304fba5ca1bf3ff3ec21952850d4b51d2540c8 /src | |
parent | ca604964f651b71f2b2a45a65e741167f520b714 (diff) |
Unicode: fix the grapheme clustering algorithm
An oversight in the code kept the algorithm in the GB11 state even when
the code point being processed did not allow for that (for instance, in
a sequence of ExtPic, Ext and Any).
Refactor the code of GB11/GB12/GB13 to deal with code points that break
the sequences (falling back to "normal" handling).
Add some manual tests; interestingly enough, the failing cases are not
covered by Unicode's tests, as we now pass the entire test suite.
Amends a794c5e287381bd056008b20ae55f9b1e0acf138.
Fixes: QTBUG-94951
Pick-to: 6.1 5.15
Change-Id: If987d5ccf7c6b13de36d049b1b3d88a3c4b6dd00
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/corelib/text/qunicodetools.cpp | 42 |
1 file changed, 27 insertions, 15 deletions
diff --git a/src/corelib/text/qunicodetools.cpp b/src/corelib/text/qunicodetools.cpp index f160c980cb..45538e5d9e 100644 --- a/src/corelib/text/qunicodetools.cpp +++ b/src/corelib/text/qunicodetools.cpp @@ -165,51 +165,63 @@ static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttrib QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass; bool shouldBreak = GB::shouldBreakBetweenClasses(lcls, cls); + bool handled = false; switch (state) { case GB::State::Normal: - if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11 - if (cls == QUnicodeTables::GraphemeBreak_Extend) { - state = GB::State::GB11_ExtPicExt; - Q_ASSERT(!shouldBreak); // GB9, do not break before Extend - } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) { - state = GB::State::GB11_ExtPicExtZWJ; - Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ - } - } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13 - state = GB::State::GB12_13_RI; - } + break; // will deal with it below - break; case GB::State::GB11_ExtPicExt: Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend); if (cls == QUnicodeTables::GraphemeBreak_Extend) { // keep going in the current state Q_ASSERT(!shouldBreak); // GB9, do not break before Extend + handled = true; } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) { state = GB::State::GB11_ExtPicExtZWJ; Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ + handled = true; + } else { + state = GB::State::Normal; } - break; case GB::State::GB11_ExtPicExtZWJ: Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ); - if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) + if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { shouldBreak = false; + handled = true; + } state = GB::State::Normal; break; case GB::State::GB12_13_RI: Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator); - if (cls == 
QUnicodeTables::GraphemeBreak_RegionalIndicator) + if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { shouldBreak = false; + handled = true; + } state = GB::State::Normal; break; } + if (!handled) { + Q_ASSERT(state == GB::State::Normal); + if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11 + if (cls == QUnicodeTables::GraphemeBreak_Extend) { + state = GB::State::GB11_ExtPicExt; + Q_ASSERT(!shouldBreak); // GB9, do not break before Extend + } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) { + state = GB::State::GB11_ExtPicExtZWJ; + Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ + } + } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13 + state = GB::State::GB12_13_RI; + } + } + if (shouldBreak) attributes[pos].graphemeBoundary = true; |