From 45838673df6e64a6fd42570c4e8874c5181f7717 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Fri, 22 May 2020 11:25:36 -0700 Subject: Implement UTF-16 to UTF-8 case-insensitive compare and make public Change-Id: Ied637aece2a7427b8a2dfffd16116cf3645c6359 Reviewed-by: Lars Knoll --- src/corelib/serialization/qcborvalue.cpp | 1 + src/corelib/text/qstring.cpp | 55 +++++++++++++++++++++++++++++++- src/corelib/text/qstringalgorithms.h | 1 + src/corelib/text/qstringconverter.cpp | 2 +- src/corelib/text/qstringconverter_p.h | 2 +- 5 files changed, 58 insertions(+), 3 deletions(-) (limited to 'src/corelib') diff --git a/src/corelib/serialization/qcborvalue.cpp b/src/corelib/serialization/qcborvalue.cpp index 7e2d8003c7..ea2d092a1f 100644 --- a/src/corelib/serialization/qcborvalue.cpp +++ b/src/corelib/serialization/qcborvalue.cpp @@ -1168,6 +1168,7 @@ static int compareElementRecursive(const QCborContainerPrivate *c1, const Elemen if (!(e1.flags & Element::StringIsAscii) || !(e2.flags & Element::StringIsAscii)) { // Case 2: one of them is UTF-8 and the other is UTF-16, so lengths // are NOT comparable. We need to convert to UTF-16 first... + // (we can't use QUtf8::compareUtf8 because we need to compare lengths) auto string = [](const Element &e, const ByteData *b) { return e.flags & Element::StringIsUtf16 ? b->asQStringRaw() : b->toUtf8String(); }; diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index 82cdeec1a3..f068c5e94a 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -1,7 +1,7 @@ /**************************************************************************** ** ** Copyright (C) 2020 The Qt Company Ltd. -** Copyright (C) 2018 Intel Corporation. +** Copyright (C) 2020 Intel Corporation. ** Copyright (C) 2019 Mail.ru Group. 
** Contact: https://www.qt.io/licensing/ ** @@ -871,6 +871,35 @@ static int ucstricmp(const QChar *a, const QChar *ae, const char *b, const char return 1; } +// Case-insensitive comparison between a Unicode string and a UTF-8 string +static int ucstricmp8(const char *utf8, const char *utf8end, const QChar *utf16, const QChar *utf16end) +{ + auto src1 = reinterpret_cast<const uchar *>(utf8); + auto end1 = reinterpret_cast<const uchar *>(utf8end); + QStringIterator src2(utf16, utf16end); + + while (src1 < end1 && src2.hasNext()) { + uint uc1; + uint *output = &uc1; + uchar b = *src1++; + int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1); + if (res < 0) { + // decoding error + uc1 = QChar::ReplacementCharacter; + } else { + uc1 = QChar::toCaseFolded(uc1); + } + + uint uc2 = QChar::toCaseFolded(src2.next()); + int diff = uc1 - uc2; // can't underflow + if (diff) + return diff; + } + + // the shorter string sorts first + return (end1 > src1) - int(src2.hasNext()); +} + #if defined(__mips_dsp) // From qstring_mips_dsp_asm.S extern "C" int qt_ucstrncmp_mips_dsp_asm(const char16_t *a, @@ -1334,6 +1363,30 @@ int QtPrivate::compareStrings(QLatin1String lhs, QLatin1String rhs, Qt::CaseSens return qt_compare_strings(lhs, rhs, cs); } +/*! + \relates QStringView + \internal + \since 6.0 + \overload + + Returns an integer that compares to 0 as \a lhs compares to \a rhs. + + If \a cs is Qt::CaseSensitive (the default), the comparison is case-sensitive; + otherwise the comparison is case-insensitive. + + Case-sensitive comparison is based exclusively on the numeric values of the + decoded Unicode code points and is very fast, but is not what a human would + expect. Consider sorting user-visible strings with + QString::localeAwareCompare(). 
+*/ +int QtPrivate::compareStringsUtf8(const char *u8str, qsizetype u8len, QStringView rhs, Qt::CaseSensitivity cs) noexcept +{ + if (cs == Qt::CaseSensitive) + return QUtf8::compareUtf8(u8str, u8len, rhs.data(), rhs.size()); + else + return ucstricmp8(u8str, u8str + u8len, rhs.begin(), rhs.end()); +} + #define REHASH(a) \ if (sl_minus_1 < sizeof(std::size_t) * CHAR_BIT) \ hashHaystack -= std::size_t(a) << sl_minus_1; \ diff --git a/src/corelib/text/qstringalgorithms.h b/src/corelib/text/qstringalgorithms.h index 4a0f7dce9a..c407c54268 100644 --- a/src/corelib/text/qstringalgorithms.h +++ b/src/corelib/text/qstringalgorithms.h @@ -62,6 +62,7 @@ Q_REQUIRED_RESULT Q_CORE_EXPORT Q_DECL_PURE_FUNCTION int compareStrings(QStringV Q_REQUIRED_RESULT Q_CORE_EXPORT Q_DECL_PURE_FUNCTION int compareStrings(QStringView lhs, QLatin1String rhs, Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; Q_REQUIRED_RESULT Q_CORE_EXPORT Q_DECL_PURE_FUNCTION int compareStrings(QLatin1String lhs, QStringView rhs, Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; Q_REQUIRED_RESULT Q_CORE_EXPORT Q_DECL_PURE_FUNCTION int compareStrings(QLatin1String lhs, QLatin1String rhs, Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; +Q_REQUIRED_RESULT Q_CORE_EXPORT Q_DECL_PURE_FUNCTION int compareStringsUtf8(const char *, qsizetype, QStringView rhs, Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; Q_REQUIRED_RESULT Q_CORE_EXPORT Q_DECL_PURE_FUNCTION bool startsWith(QStringView haystack, QStringView needle, Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index 8e314ffd34..2111d22b2f 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -713,7 +713,7 @@ QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len) return { true, isValidAscii }; } -int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, qsizetype 
u16len) +int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, qsizetype u16len) noexcept { uint uc1, uc2; auto src1 = reinterpret_cast<const uchar *>(utf8); diff --git a/src/corelib/text/qstringconverter_p.h b/src/corelib/text/qstringconverter_p.h index 4e1efd3731..3a99a4191a 100644 --- a/src/corelib/text/qstringconverter_p.h +++ b/src/corelib/text/qstringconverter_p.h @@ -338,7 +338,7 @@ struct QUtf8 bool isValidAscii; }; static ValidUtf8Result isValidUtf8(const char *, qsizetype); - static int compareUtf8(const char *, qsizetype, const QChar *, qsizetype); + static int compareUtf8(const char *, qsizetype, const QChar *, qsizetype) noexcept; static int compareUtf8(const char *, qsizetype, QLatin1String s); }; -- cgit v1.2.3