diff options
author | Thiago Macieira <thiago.macieira@intel.com> | 2024-04-09 19:50:15 -0500 |
---|---|---|
committer | Thiago Macieira <thiago.macieira@intel.com> | 2024-04-12 21:41:16 -0700 |
commit | d4c7da9a07dc1434692fe08a61ba22c794574c4f (patch) | |
tree | 38e25743f08918b4bc9709edca6283dc966ad92d /tests/auto/corelib/serialization/qcborvalue | |
parent | 8e5ce9cd369230256045864d6fad38dbd8bee413 (diff) |
CBOR: fix sorting of UTF16-to-UTF16 strings
This amends commit 394788c68efacdec2676988b4b4ff207b20557f2 (its
ChangeLog applies to this commit too). That fixed sorting of UTF8-to-
UTF16, but when adding more unit tests, I've discovered that some UTF-16
strings also sorted incorrectly. There were two problems:
First, we were assuming that we could rely on the UTF-16 length as a
proxy for the UTF-8 one, but that's not true for some cases:
* both 1-, 2- and 3-codepoint UTF-8 sequences are 1 codepoint
in UTF-16, so some strings would have identical UTF-16 length
* 4-codepoint UTF-8 sequences shrink to 2-codepoint UTF-16 ones
(2:1) but 3-codepoint UTF-8 sequences shrink to 1 (3:1), so
some strings would be longer in UTF-16 but shorter in UTF-8.
Second, QtPrivate::compareStrings performs UTF-16 codepoint comparisons
not Unicode character ones, so surrogate pairs were sorting before
U+E000 to U+FFFF.
To fix all of this, we need to decode the UTF-16 string into UTF-32 and
calculate the length of that in UTF-8 to be sure we have the sorting
order right.
Since this is a slight behavior change with a performance penalty, I am
choosing to backport only to 6.7. The penalty mostly does not apply to
6.8 due to commit 61556627f25e7c7acbfcc5e54127a392b5239977.
Pick-to: 6.7
Change-Id: If1bf59ecbe014b569ba1fffd17c4c4ddcc874aac
Reviewed-by: Ivan Solovev <ivan.solovev@qt.io>
Diffstat (limited to 'tests/auto/corelib/serialization/qcborvalue')
-rw-r--r-- | tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp | 123 |
1 files changed, 93 insertions, 30 deletions
diff --git a/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp b/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp index 45c3799c19..23b25834b9 100644 --- a/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp +++ b/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp @@ -1870,6 +1870,18 @@ void tst_QCborValue::mapNested() void tst_QCborValue::sorting_data() { + // CBOR data comparisons are done as if we were comparing their canonically + // (deterministic) encoded forms in the byte stream, including the Major + // Type. That has a few surprises noted below: + // 1) because the length of a string precedes it, effectively strings are + // sorted by their UTF-8 length before their contents + // 2) because negative integers are stored in negated form, they sort in + // descending order (i.e. by absolute value) + // 3) negative integers (Major Type 1) sort after all positive integers + // (Major Type 0) + // Effectively, this means integers are sorted as sign+magnitude. + // 4) floating point types (Major Type 7) sort after all integers + QTest::addColumn<QCborValue>("lhs"); QTest::addColumn<QCborValue>("rhs"); QTest::addColumn<Qt::strong_ordering>("expectedOrdering"); @@ -1954,11 +1966,6 @@ void tst_QCborValue::sorting_data() addRow(vdouble1, vdouble2, Qt::strong_ordering::less); addRow(vdouble2, vdouble3, Qt::strong_ordering::less); // surprise: NaNs do compare addRow(vndouble1, vndouble2, Qt::strong_ordering::less); // surprise: -1.5 < -inf - // note: shorter length sorts first - addRow(vba1, vba2, Qt::strong_ordering::less); - addRow(vba2, vba3, Qt::strong_ordering::less); - addRow(vs1, vs2, Qt::strong_ordering::less); - addRow(vs2, vs3, Qt::strong_ordering::less); addRow(va1, va2, Qt::strong_ordering::less); addRow(va2, va3, Qt::strong_ordering::less); addRow(vm1, vm2, Qt::strong_ordering::less); @@ -1971,10 +1978,16 @@ void tst_QCborValue::sorting_data() addRow(vurl1, vurl2, Qt::strong_ordering::less); addRow(vuuid1, vuuid2, Qt::strong_ordering::less); - // surprise 1: CBOR sorts integrals by absolute value + // surprise 1: CBOR sorts strings by length first + addRow(vba1, vba2, Qt::strong_ordering::less); + addRow(vba2, vba3, Qt::strong_ordering::less); + addRow(vs1, vs2, Qt::strong_ordering::less); + addRow(vs2, vs3, Qt::strong_ordering::less); + + // surprise 2: CBOR sorts integrals by absolute value addRow(vneg1, vneg2, Qt::strong_ordering::less); - // surprise 2: CBOR sorts negatives after positives (sign+magnitude) + // surprise 3: CBOR sorts negatives after positives (sign+magnitude) addRow(vint2, vneg1, Qt::strong_ordering::less); addRow(vdouble2, vndouble1, Qt::strong_ordering::less); @@ -2008,29 +2021,79 @@ void tst_QCborValue::sorting_data() str.prepend(char(QCborValue::String) + str.size()); return QCborValue::fromCbor(str); }; - // 5 code units in UTF-8 - QCborValue vs4_utf16(u"Mørk"_s); - QCborValue vs4_utf8 = utf8string("Mørk"); - QTest::newRow("self-utf8_5b") << vs4_utf8 << vs4_utf8 << Qt::strong_ordering::equal; - QTest::newRow("self-utf16_5b") << vs4_utf16 << vs4_utf16 << Qt::strong_ordering::equal; - QTest::newRow("utf16_5b-cmp-utf8_5b") << vs4_utf16 << vs4_utf8 << Qt::strong_ordering::equal; - - // 5 code units in UTF-16 - QCborValue vs5_utf16(u"Først"_s); - QCborValue vs5_utf8 = utf8string("Først"); - QTest::newRow("self-utf8_5c") << vs5_utf8 << vs5_utf8 << Qt::strong_ordering::equal; - QTest::newRow("self-utf16_5c") << vs5_utf16 << vs5_utf16 << Qt::strong_ordering::equal; - QTest::newRow("utf16_5c-cmp-utf8_5c") << vs5_utf16 << vs5_utf8 << Qt::strong_ordering::equal; - - // sorted by UTF-8 length first, so "Mørk" < "World" < "Først" (!!) - QTest::newRow("ascii-cmp-utf8_5b") << vs3 << vs4_utf8 << Qt::strong_ordering::greater; - QTest::newRow("ascii-cmp-utf16_5b") << vs3 << vs4_utf16 << Qt::strong_ordering::greater; - QTest::newRow("ascii-cmp-utf8_5c") << vs3 << vs5_utf8 << Qt::strong_ordering::less; - QTest::newRow("ascii-cmp-utf16_5c") << vs3 << vs5_utf16 << Qt::strong_ordering::less; - QTest::newRow("utf8_5b-cmp-utf8_5c") << vs4_utf8 << vs5_utf8 << Qt::strong_ordering::less; - QTest::newRow("utf8_5b-cmp-utf16_5c") << vs4_utf8 << vs5_utf16 << Qt::strong_ordering::less; - QTest::newRow("utf16_5b-cmp-utf8_5c") << vs4_utf16 << vs5_utf8 << Qt::strong_ordering::less; - QTest::newRow("utf16_5b-cmp-utf16_5c") << vs4_utf16 << vs5_utf16 << Qt::strong_ordering::less; + + auto addStringCmp = [&](const char *prefix, const char *tag, QUtf8StringView lhs, + QUtf8StringView rhs) { + // CBOR orders strings by UTF-8 length + auto order = Qt::compareThreeWay(lhs.size(), rhs.size()); + if (is_eq(order)) + order = compareThreeWay(lhs, rhs); + Q_ASSERT(is_eq(order) || is_lt(order)); // please keep lhs <= rhs! + + QCborValue lhs_utf8 = utf8string(QByteArrayView(lhs).toByteArray()); + QCborValue rhs_utf8 = utf8string(QByteArrayView(rhs).toByteArray()); + QCborValue lhs_utf16 = QString::fromUtf8(lhs); + QCborValue rhs_utf16 = QString::fromUtf8(rhs); + + QTest::addRow("string-%s%s:utf8-utf8", prefix, tag) << lhs_utf8 << rhs_utf8 << order; + QTest::addRow("string-%s%s:utf8-utf16", prefix, tag) << lhs_utf8 << rhs_utf16 << order; + QTest::addRow("string-%s%s:utf16-utf8", prefix, tag) << lhs_utf16 << rhs_utf8 << order; + QTest::addRow("string-%s%s:utf16-utf16", prefix, tag) << lhs_utf16 << rhs_utf16 << order; + }; + auto addStringCmpSameLength = [&](const char *tag, QUtf8StringView lhs, QUtf8StringView rhs) { + Q_ASSERT(lhs.size() == rhs.size()); + addStringCmp("samelength-", tag, lhs, rhs); + }; + auto addStringCmpShorter = [&](const char *tag, QUtf8StringView lhs, QUtf8StringView rhs) { + Q_ASSERT(lhs.size() < rhs.size()); + addStringCmp("shorter-", tag, lhs, rhs); + }; + + // ascii-only is already tested + addStringCmp("equal-", "1continuation", "ab\u00A0c", "ab\u00A0c"); + addStringCmp("equal-", "2continuation", "ab\u0800", "ab\u0800"); + addStringCmp("equal-", "3continuation", "a\U00010000", "a\U00010000"); + + // these strings all have the same UTF-8 length (5 bytes) + addStringCmpSameLength("less-ascii", "abcde", "ab\u00A0c"); + addStringCmpSameLength("less-1continuation", "ab\u00A0c", "ab\u07FFc"); + addStringCmpSameLength("less-2continuation", "ab\u0800", "ab\uFFFC"); + addStringCmpSameLength("less-3continuation", "a\U00010000", "a\U0010FFFC"); + addStringCmpSameLength("less-0-vs-1continuation", "abcde", "ab\u00A0c"); + addStringCmpSameLength("less-0-vs-2continuation", "abcde", "ab\u0800"); + addStringCmpSameLength("less-0-vs-3continuation", "abcde", "a\U00010000"); + addStringCmpSameLength("less-1-vs-2continuation", "ab\u00A0c", "ab\uFFFC"); + addStringCmpSameLength("less-1-vs-3continuation", "ab\u00A0c", "a\U00010000"); + addStringCmpSameLength("less-2-vs-3continuation", "ab\u0800", "a\U00010000"); + addStringCmpSameLength("less-2-vs-3continuation_surrogate", "a\uFFFCz", "a\U00010000"); // even though U+D800 < U+FFFC + + // these strings have different lengths in UTF-8 + // (0continuation already tested) + addStringCmpShorter("1continuation", "ab\u00A0", "ab\u00A0c"); + addStringCmpShorter("2continuation", "ab\u0800", "ab\u0800c"); + addStringCmpShorter("3continuation", "ab\U00010000", "ab\U00010000c"); + // most of these have the same length in UTF-16! + addStringCmpShorter("0-vs-1continuation", "abc", "ab\u00A0"); + addStringCmpShorter("0-vs-2continuation", "abcd", "ab\u0800"); + addStringCmpShorter("0-vs-3continuation", "abcde", "ab\U00010000"); + addStringCmpShorter("1-vs-2continuation", "ab\u00A0", "ab\u0800"); + addStringCmpShorter("1-vs-3continuation", "abc\u00A0", "ab\U00010000"); + addStringCmpShorter("2-vs-3continuation", "ab\u0800", "ab\U00010000"); + + // lhs is 4xUTF-16 and 8xUTF-8; rhs is 3xUTF-16 but 9xUTF-8 + addStringCmpShorter("3x2-vs-2x3continuation", "\U00010000\U00010000", "\u0800\u0800\u0800"); + + // slight surprising because normally rhs would sort first ("aa" vs "ab" prefix) + // (0continuation_surprise already tested) + addStringCmpShorter("1continuation_surprise", "ab\u00A0", "aa\u00A0c"); + addStringCmpShorter("2continuation_surprise", "ab\u0800", "aa\u0800c"); + addStringCmpShorter("3continuation_surprise", "ab\U00010000", "aa\U00010000c"); + addStringCmpShorter("0-vs-1continuation_surprise", "abc", "aa\u00A0"); + addStringCmpShorter("0-vs-2continuation_surprise", "abcd", "aa\u0800"); + addStringCmpShorter("0-vs-3continuation_surprise", "abcde", "aa\U00010000"); + addStringCmpShorter("1-vs-2continuation_surprise", "ab\u00A0", "aa\u0800"); + addStringCmpShorter("1-vs-3continuation_surprise", "abc\u00A0", "aa\U00010000"); + addStringCmpShorter("2-vs-3continuation_surprise", "ab\u0800", "aa\U00010000"); } void tst_QCborValue::sorting() |