From b977ae371a753a82e1d0bb32c5b62099da663721 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=98ystein=20Heskestad?= Date: Tue, 22 Nov 2022 17:48:45 +0100 Subject: Add In-place utf-8 case-insensitive comparisons Also add optimizations for more string comparisons and add tests and benchmarks. [ChangeLog][QtCore][QString] Added utf-8 case-insensitive comparisons Fixes: QTBUG-100235 Change-Id: I7c0809c6d80c00e9a5d0e8ac3ebb045cf7004a30 Reviewed-by: Thiago Macieira --- src/corelib/text/qstring.cpp | 13 ++----- src/corelib/text/qstring.h | 13 +++++++ src/corelib/text/qstringconverter.cpp | 65 +++++++++++++++++++++++++++++++++-- src/corelib/text/qstringconverter_p.h | 8 +++-- src/corelib/text/qstringview.h | 16 +++++++++ src/corelib/text/qutf8stringview.h | 11 ++++++ 6 files changed, 111 insertions(+), 15 deletions(-) (limited to 'src/corelib') diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index ca949b9b32..3ae6404ed4 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -1482,8 +1482,7 @@ bool QtPrivate::equalStrings(QStringView lhs, QBasicUtf8StringView rhs) n bool QtPrivate::equalStrings(QLatin1StringView lhs, QBasicUtf8StringView rhs) noexcept { - QString r = rhs.toString(); - return QtPrivate::equalStrings(lhs, r); // ### optimize! + return QUtf8::compareUtf8(QByteArrayView(rhs), lhs) == 0; } bool QtPrivate::equalStrings(QBasicUtf8StringView lhs, QLatin1StringView rhs) noexcept @@ -1612,7 +1611,7 @@ int QtPrivate::compareStrings(QLatin1StringView lhs, QLatin1StringView rhs, Qt:: */ int QtPrivate::compareStrings(QLatin1StringView lhs, QBasicUtf8StringView rhs, Qt::CaseSensitivity cs) noexcept { - return compareStrings(lhs, rhs.toString(), cs); // ### optimize! + return -QUtf8::compareUtf8(QByteArrayView(rhs), lhs, cs); } /*! @@ -1647,13 +1646,7 @@ int QtPrivate::compareStrings(QBasicUtf8StringView lhs, QLatin1StringView */ int QtPrivate::compareStrings(QBasicUtf8StringView lhs, QBasicUtf8StringView rhs, Qt::CaseSensitivity cs) noexcept { - if (lhs.isEmpty()) - return lencmp(0, rhs.size()); - if (cs == Qt::CaseInsensitive) - return compareStrings(lhs.toString(), rhs.toString(), cs); // ### optimize! - const auto l = std::min(lhs.size(), rhs.size()); - int r = memcmp(lhs.data(), rhs.data(), l); - return r ? r : lencmp(lhs.size(), rhs.size()); + return QUtf8::compareUtf8(QByteArrayView(lhs), QByteArrayView(rhs), cs); } int QAnyStringView::compare(QAnyStringView lhs, QAnyStringView rhs, Qt::CaseSensitivity cs) noexcept diff --git a/src/corelib/text/qstring.h b/src/corelib/text/qstring.h index d8d8cd3425..32847e30f6 100644 --- a/src/corelib/text/qstring.h +++ b/src/corelib/text/qstring.h @@ -119,6 +119,12 @@ public: { return isEmpty() ? -1 : front() == c ? int(size() > 1) : uchar(m_data[0]) - c.unicode(); } [[nodiscard]] int compare(QChar c, Qt::CaseSensitivity cs) const noexcept { return QtPrivate::compareStrings(*this, QStringView(&c, 1), cs); } + template + [[nodiscard]] int compare(QBasicUtf8StringView other, + Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept + { + return QtPrivate::compareStrings(*this, other, cs); + } [[nodiscard]] bool startsWith(QStringView s, Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept { return QtPrivate::startsWith(*this, s, cs); } @@ -1230,6 +1236,13 @@ QString QBasicUtf8StringView::toString() const return QString::fromUtf8(data(), size()); } +template +[[nodiscard]] int QBasicUtf8StringView::compare(QLatin1StringView other, + Qt::CaseSensitivity cs) const noexcept +{ + return QtPrivate::compareStrings(*this, other, cs); +} + // // QAnyStringView inline members that require QString: // diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index bdba20de3a..04bd8e9733 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -820,7 +820,7 @@ QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in) return { true, isValidAscii }; } -int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept +int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs) noexcept { auto src1 = reinterpret_cast(utf8.data()); auto end1 = src1 + utf8.size(); @@ -847,7 +847,10 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2)) uc2 = QChar::surrogateToUcs4(uc2, *src2++); } - + if (cs == Qt::CaseInsensitive) { + uc1 = QChar::toCaseFolded(uc1); + uc2 = QChar::toCaseFolded(uc2); + } if (uc1 != uc2) return int(uc1) - int(uc2); } @@ -857,7 +860,7 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept return (end1 > src1) - int(end2 > src2); } -int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s) +int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, Qt::CaseSensitivity cs) { char32_t uc1 = QChar::Null; auto src1 = reinterpret_cast(utf8.data()); @@ -875,6 +878,62 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s) } char32_t uc2 = *src2++; + if (cs == Qt::CaseInsensitive) { + uc1 = QChar::toCaseFolded(uc1); + uc2 = QChar::toCaseFolded(uc2); + } + if (uc1 != uc2) + return int(uc1) - int(uc2); + } + + // the shorter string sorts first + return (end1 > src1) - (end2 > src2); +} + +static inline int lencmp(qsizetype lhs, qsizetype rhs) noexcept +{ + return lhs == rhs ? 0 : + lhs > rhs ? 1 : + /* else */ -1 ; +} + +int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs) noexcept +{ + if (lhs.isEmpty()) + return lencmp(0, rhs.size()); + + if (cs == Qt::CaseSensitive) { + const auto l = std::min(lhs.size(), rhs.size()); + int r = memcmp(lhs.data(), rhs.data(), l); + return r ? r : lencmp(lhs.size(), rhs.size()); + } + + char32_t uc1 = QChar::Null; + auto src1 = reinterpret_cast(lhs.data()); + auto end1 = src1 + lhs.size(); + char32_t uc2 = QChar::Null; + auto src2 = reinterpret_cast(rhs.data()); + auto end2 = src2 + rhs.size(); + + while (src1 < end1 && src2 < end2) { + uchar b = *src1++; + char32_t *output = &uc1; + int res = QUtf8Functions::fromUtf8(b, output, src1, end1); + if (res < 0) { + // decoding error + uc1 = QChar::ReplacementCharacter; + } + + b = *src2++; + output = &uc2; + res = QUtf8Functions::fromUtf8(b, output, src2, end2); + if (res < 0) { + // decoding error + uc2 = QChar::ReplacementCharacter; + } + + uc1 = QChar::toCaseFolded(uc1); + uc2 = QChar::toCaseFolded(uc2); if (uc1 != uc2) return int(uc1) - int(uc2); } diff --git a/src/corelib/text/qstringconverter_p.h b/src/corelib/text/qstringconverter_p.h index 26be8b713c..95d1eea072 100644 --- a/src/corelib/text/qstringconverter_p.h +++ b/src/corelib/text/qstringconverter_p.h @@ -276,8 +276,12 @@ struct QUtf8 bool isValidAscii; }; static ValidUtf8Result isValidUtf8(QByteArrayView in); - static int compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept; - static int compareUtf8(QByteArrayView utf8, QLatin1StringView s); + static int compareUtf8(QByteArrayView utf8, QStringView utf16, + Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; + static int compareUtf8(QByteArrayView utf8, QLatin1StringView s, + Qt::CaseSensitivity cs = Qt::CaseSensitive); + static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs, + Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; }; struct QUtf16 diff --git a/src/corelib/text/qstringview.h b/src/corelib/text/qstringview.h index 2015927f19..95b089d565 100644 --- a/src/corelib/text/qstringview.h +++ b/src/corelib/text/qstringview.h @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -254,6 +255,12 @@ public: [[nodiscard]] int compare(QStringView other, Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept { return QtPrivate::compareStrings(*this, other, cs); } [[nodiscard]] inline int compare(QLatin1StringView other, Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept; + template + [[nodiscard]] int compare(QBasicUtf8StringView other, + Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept + { + return QtPrivate::compareStrings(*this, other, cs); + } [[nodiscard]] constexpr int compare(QChar c) const noexcept { return size() >= 1 ? compare_single_char_helper(*utf16() - c.unicode()) : -1; } [[nodiscard]] int compare(QChar c, Qt::CaseSensitivity cs) const noexcept @@ -449,6 +456,15 @@ inline QStringView qToStringViewIgnoringNull(const QStringLike &s) noexcept R{{char16_t(c), u'\0'}} ; } +// QBasicUtf8StringView functions: + +template +[[nodiscard]] int QBasicUtf8StringView::compare(QStringView other, + Qt::CaseSensitivity cs) const noexcept +{ + return QtPrivate::compareStrings(*this, other, cs); +} + QT_END_NAMESPACE #endif /* QSTRINGVIEW_H */ diff --git a/src/corelib/text/qutf8stringview.h b/src/corelib/text/qutf8stringview.h index 3baac9c885..14a8a16e62 100644 --- a/src/corelib/text/qutf8stringview.h +++ b/src/corelib/text/qutf8stringview.h @@ -280,6 +280,17 @@ public: [[nodiscard]] constexpr qsizetype length() const noexcept { return size(); } + [[nodiscard]] int compare(QBasicUtf8StringView other, + Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept + { + return QtPrivate::compareStrings(*this, other, cs); + } + + [[nodiscard]] int compare(QStringView other, + Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept; + [[nodiscard]] int compare(QLatin1StringView other, + Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept; + private: [[nodiscard]] static inline int compare(QBasicUtf8StringView lhs, QBasicUtf8StringView rhs) noexcept { -- cgit v1.2.3