summaryrefslogtreecommitdiffstats
path: root/tests/auto/corelib/serialization/qcborvalue
diff options
context:
space:
mode:
authorThiago Macieira <thiago.macieira@intel.com>2024-04-09 19:50:15 -0500
committerThiago Macieira <thiago.macieira@intel.com>2024-04-12 21:41:16 -0700
commitd4c7da9a07dc1434692fe08a61ba22c794574c4f (patch)
tree38e25743f08918b4bc9709edca6283dc966ad92d /tests/auto/corelib/serialization/qcborvalue
parent8e5ce9cd369230256045864d6fad38dbd8bee413 (diff)
CBOR: fix sorting of UTF16-to-UTF16 strings
This amends commit 394788c68efacdec2676988b4b4ff207b20557f2 (its ChangeLog applies to this commit too). That fixed sorting of UTF8-to- UTF16, but when adding more unit tests, I've discovered that some UTF-16 strings also sorted incorrectly. There were two problems: First, we were assuming that we could rely on the UTF-16 length as a proxy for the UTF-8 one, but that's not true for some cases: * both 1-, 2- and 3-codepoint UTF-8 sequences are 1 codepoint in UTF-16, so some strings would have identical UTF-16 length * 4-codepoint UTF-8 sequences shrink to 2-codepoint UTF-16 ones (2:1) but 3-codepoint UTF-8 sequences shrink to 1 (3:1), so some strings would be longer in UTF-16 but shorter in UTF-8. Second, QtPrivate::compareStrings performs UTF-16 codepoint comparisons not Unicode character ones, so surrogate pairs were sorting before U+E000 to U+FFFF. To fix all of this, we need to decode the UTF-16 string into UTF-32 and calculate the length of that in UTF-8 to be sure we have the sorting order right. Since this is a slight behavior change with a performance penalty, I am choosing to backport only to 6.7. The penalty mostly does not apply to 6.8 due to commit 61556627f25e7c7acbfcc5e54127a392b5239977. Pick-to: 6.7 Change-Id: If1bf59ecbe014b569ba1fffd17c4c4ddcc874aac Reviewed-by: Ivan Solovev <ivan.solovev@qt.io>
Diffstat (limited to 'tests/auto/corelib/serialization/qcborvalue')
-rw-r--r--tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp123
1 files changed, 93 insertions, 30 deletions
diff --git a/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp b/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp
index 45c3799c19..23b25834b9 100644
--- a/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp
+++ b/tests/auto/corelib/serialization/qcborvalue/tst_qcborvalue.cpp
@@ -1870,6 +1870,18 @@ void tst_QCborValue::mapNested()
void tst_QCborValue::sorting_data()
{
+ // CBOR data comparisons are done as if we were comparing their canonically
+ // (deterministic) encoded forms in the byte stream, including the Major
+ // Type. That has a few surprises noted below:
+ // 1) because the length of a string precedes it, effectively strings are
+ // sorted by their UTF-8 length before their contents
+ // 2) because negative integers are stored in negated form, they sort in
+ // descending order (i.e. by absolute value)
+ // 3) negative integers (Major Type 1) sort after all positive integers
+ // (Major Type 0)
+ // Effectively, this means integers are sorted as sign+magnitude.
+ // 4) floating point types (Major Type 7) sort after all integers
+
QTest::addColumn<QCborValue>("lhs");
QTest::addColumn<QCborValue>("rhs");
QTest::addColumn<Qt::strong_ordering>("expectedOrdering");
@@ -1954,11 +1966,6 @@ void tst_QCborValue::sorting_data()
addRow(vdouble1, vdouble2, Qt::strong_ordering::less);
addRow(vdouble2, vdouble3, Qt::strong_ordering::less); // surprise: NaNs do compare
addRow(vndouble1, vndouble2, Qt::strong_ordering::less); // surprise: -1.5 < -inf
- // note: shorter length sorts first
- addRow(vba1, vba2, Qt::strong_ordering::less);
- addRow(vba2, vba3, Qt::strong_ordering::less);
- addRow(vs1, vs2, Qt::strong_ordering::less);
- addRow(vs2, vs3, Qt::strong_ordering::less);
addRow(va1, va2, Qt::strong_ordering::less);
addRow(va2, va3, Qt::strong_ordering::less);
addRow(vm1, vm2, Qt::strong_ordering::less);
@@ -1971,10 +1978,16 @@ void tst_QCborValue::sorting_data()
addRow(vurl1, vurl2, Qt::strong_ordering::less);
addRow(vuuid1, vuuid2, Qt::strong_ordering::less);
- // surprise 1: CBOR sorts integrals by absolute value
+ // surprise 1: CBOR sorts strings by length first
+ addRow(vba1, vba2, Qt::strong_ordering::less);
+ addRow(vba2, vba3, Qt::strong_ordering::less);
+ addRow(vs1, vs2, Qt::strong_ordering::less);
+ addRow(vs2, vs3, Qt::strong_ordering::less);
+
+ // surprise 2: CBOR sorts integrals by absolute value
addRow(vneg1, vneg2, Qt::strong_ordering::less);
- // surprise 2: CBOR sorts negatives after positives (sign+magnitude)
+ // surprise 3: CBOR sorts negatives after positives (sign+magnitude)
addRow(vint2, vneg1, Qt::strong_ordering::less);
addRow(vdouble2, vndouble1, Qt::strong_ordering::less);
@@ -2008,29 +2021,79 @@ void tst_QCborValue::sorting_data()
str.prepend(char(QCborValue::String) + str.size());
return QCborValue::fromCbor(str);
};
- // 5 code units in UTF-8
- QCborValue vs4_utf16(u"Mørk"_s);
- QCborValue vs4_utf8 = utf8string("Mørk");
- QTest::newRow("self-utf8_5b") << vs4_utf8 << vs4_utf8 << Qt::strong_ordering::equal;
- QTest::newRow("self-utf16_5b") << vs4_utf16 << vs4_utf16 << Qt::strong_ordering::equal;
- QTest::newRow("utf16_5b-cmp-utf8_5b") << vs4_utf16 << vs4_utf8 << Qt::strong_ordering::equal;
-
- // 5 code units in UTF-16
- QCborValue vs5_utf16(u"Først"_s);
- QCborValue vs5_utf8 = utf8string("Først");
- QTest::newRow("self-utf8_5c") << vs5_utf8 << vs5_utf8 << Qt::strong_ordering::equal;
- QTest::newRow("self-utf16_5c") << vs5_utf16 << vs5_utf16 << Qt::strong_ordering::equal;
- QTest::newRow("utf16_5c-cmp-utf8_5c") << vs5_utf16 << vs5_utf8 << Qt::strong_ordering::equal;
-
- // sorted by UTF-8 length first, so "Mørk" < "World" < "Først" (!!)
- QTest::newRow("ascii-cmp-utf8_5b") << vs3 << vs4_utf8 << Qt::strong_ordering::greater;
- QTest::newRow("ascii-cmp-utf16_5b") << vs3 << vs4_utf16 << Qt::strong_ordering::greater;
- QTest::newRow("ascii-cmp-utf8_5c") << vs3 << vs5_utf8 << Qt::strong_ordering::less;
- QTest::newRow("ascii-cmp-utf16_5c") << vs3 << vs5_utf16 << Qt::strong_ordering::less;
- QTest::newRow("utf8_5b-cmp-utf8_5c") << vs4_utf8 << vs5_utf8 << Qt::strong_ordering::less;
- QTest::newRow("utf8_5b-cmp-utf16_5c") << vs4_utf8 << vs5_utf16 << Qt::strong_ordering::less;
- QTest::newRow("utf16_5b-cmp-utf8_5c") << vs4_utf16 << vs5_utf8 << Qt::strong_ordering::less;
- QTest::newRow("utf16_5b-cmp-utf16_5c") << vs4_utf16 << vs5_utf16 << Qt::strong_ordering::less;
+
+ auto addStringCmp = [&](const char *prefix, const char *tag, QUtf8StringView lhs,
+ QUtf8StringView rhs) {
+ // CBOR orders strings by UTF-8 length
+ auto order = Qt::compareThreeWay(lhs.size(), rhs.size());
+ if (is_eq(order))
+ order = compareThreeWay(lhs, rhs);
+ Q_ASSERT(is_eq(order) || is_lt(order)); // please keep lhs <= rhs!
+
+ QCborValue lhs_utf8 = utf8string(QByteArrayView(lhs).toByteArray());
+ QCborValue rhs_utf8 = utf8string(QByteArrayView(rhs).toByteArray());
+ QCborValue lhs_utf16 = QString::fromUtf8(lhs);
+ QCborValue rhs_utf16 = QString::fromUtf8(rhs);
+
+ QTest::addRow("string-%s%s:utf8-utf8", prefix, tag) << lhs_utf8 << rhs_utf8 << order;
+ QTest::addRow("string-%s%s:utf8-utf16", prefix, tag) << lhs_utf8 << rhs_utf16 << order;
+ QTest::addRow("string-%s%s:utf16-utf8", prefix, tag) << lhs_utf16 << rhs_utf8 << order;
+ QTest::addRow("string-%s%s:utf16-utf16", prefix, tag) << lhs_utf16 << rhs_utf16 << order;
+ };
+ auto addStringCmpSameLength = [&](const char *tag, QUtf8StringView lhs, QUtf8StringView rhs) {
+ Q_ASSERT(lhs.size() == rhs.size());
+ addStringCmp("samelength-", tag, lhs, rhs);
+ };
+ auto addStringCmpShorter = [&](const char *tag, QUtf8StringView lhs, QUtf8StringView rhs) {
+ Q_ASSERT(lhs.size() < rhs.size());
+ addStringCmp("shorter-", tag, lhs, rhs);
+ };
+
+ // ascii-only is already tested
+ addStringCmp("equal-", "1continuation", "ab\u00A0c", "ab\u00A0c");
+ addStringCmp("equal-", "2continuation", "ab\u0800", "ab\u0800");
+ addStringCmp("equal-", "3continuation", "a\U00010000", "a\U00010000");
+
+ // these strings all have the same UTF-8 length (5 bytes)
+ addStringCmpSameLength("less-ascii", "abcde", "ab\u00A0c");
+ addStringCmpSameLength("less-1continuation", "ab\u00A0c", "ab\u07FFc");
+ addStringCmpSameLength("less-2continuation", "ab\u0800", "ab\uFFFC");
+ addStringCmpSameLength("less-3continuation", "a\U00010000", "a\U0010FFFC");
+ addStringCmpSameLength("less-0-vs-1continuation", "abcde", "ab\u00A0c");
+ addStringCmpSameLength("less-0-vs-2continuation", "abcde", "ab\u0800");
+ addStringCmpSameLength("less-0-vs-3continuation", "abcde", "a\U00010000");
+ addStringCmpSameLength("less-1-vs-2continuation", "ab\u00A0c", "ab\uFFFC");
+ addStringCmpSameLength("less-1-vs-3continuation", "ab\u00A0c", "a\U00010000");
+ addStringCmpSameLength("less-2-vs-3continuation", "ab\u0800", "a\U00010000");
+ addStringCmpSameLength("less-2-vs-3continuation_surrogate", "a\uFFFCz", "a\U00010000"); // even though U+D800 < U+FFFC
+
+ // these strings have different lengths in UTF-8
+ // (0continuation already tested)
+ addStringCmpShorter("1continuation", "ab\u00A0", "ab\u00A0c");
+ addStringCmpShorter("2continuation", "ab\u0800", "ab\u0800c");
+ addStringCmpShorter("3continuation", "ab\U00010000", "ab\U00010000c");
+ // most of these have the same length in UTF-16!
+ addStringCmpShorter("0-vs-1continuation", "abc", "ab\u00A0");
+ addStringCmpShorter("0-vs-2continuation", "abcd", "ab\u0800");
+ addStringCmpShorter("0-vs-3continuation", "abcde", "ab\U00010000");
+ addStringCmpShorter("1-vs-2continuation", "ab\u00A0", "ab\u0800");
+ addStringCmpShorter("1-vs-3continuation", "abc\u00A0", "ab\U00010000");
+ addStringCmpShorter("2-vs-3continuation", "ab\u0800", "ab\U00010000");
+
+ // lhs is 4xUTF-16 and 8xUTF-8; rhs is 3xUTF-16 but 9xUTF-8
+ addStringCmpShorter("3x2-vs-2x3continuation", "\U00010000\U00010000", "\u0800\u0800\u0800");
+
+ // slight surprising because normally rhs would sort first ("aa" vs "ab" prefix)
+ // (0continuation_surprise already tested)
+ addStringCmpShorter("1continuation_surprise", "ab\u00A0", "aa\u00A0c");
+ addStringCmpShorter("2continuation_surprise", "ab\u0800", "aa\u0800c");
+ addStringCmpShorter("3continuation_surprise", "ab\U00010000", "aa\U00010000c");
+ addStringCmpShorter("0-vs-1continuation_surprise", "abc", "aa\u00A0");
+ addStringCmpShorter("0-vs-2continuation_surprise", "abcd", "aa\u0800");
+ addStringCmpShorter("0-vs-3continuation_surprise", "abcde", "aa\U00010000");
+ addStringCmpShorter("1-vs-2continuation_surprise", "ab\u00A0", "aa\u0800");
+ addStringCmpShorter("1-vs-3continuation_surprise", "abc\u00A0", "aa\U00010000");
+ addStringCmpShorter("2-vs-3continuation_surprise", "ab\u0800", "aa\U00010000");
}
void tst_QCborValue::sorting()