diff options
author | Øystein Heskestad <oystein.heskestad@qt.io> | 2022-11-22 17:48:45 +0100 |
---|---|---|
committer | Øystein Heskestad <oystein.heskestad@qt.io> | 2022-12-02 11:35:49 +0100 |
commit | b977ae371a753a82e1d0bb32c5b62099da663721 (patch) | |
tree | 895d517f35c5a961781c156cf8c23857d688de6c /src/corelib | |
parent | 724329b79ea8bf00cc4b393fa33d91d477b35497 (diff) |
Add in-place UTF-8 case-insensitive comparisons
Also add optimizations for more string comparisons and add tests and
benchmarks.
[ChangeLog][QtCore][QString] Added UTF-8 case-insensitive comparisons
Fixes: QTBUG-100235
Change-Id: I7c0809c6d80c00e9a5d0e8ac3ebb045cf7004a30
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src/corelib')
-rw-r--r-- | src/corelib/text/qstring.cpp | 13 | ||||
-rw-r--r-- | src/corelib/text/qstring.h | 13 | ||||
-rw-r--r-- | src/corelib/text/qstringconverter.cpp | 65 | ||||
-rw-r--r-- | src/corelib/text/qstringconverter_p.h | 8 | ||||
-rw-r--r-- | src/corelib/text/qstringview.h | 16 | ||||
-rw-r--r-- | src/corelib/text/qutf8stringview.h | 11 |
6 files changed, 111 insertions, 15 deletions
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index ca949b9b32..3ae6404ed4 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -1482,8 +1482,7 @@ bool QtPrivate::equalStrings(QStringView lhs, QBasicUtf8StringView<false> rhs) n bool QtPrivate::equalStrings(QLatin1StringView lhs, QBasicUtf8StringView<false> rhs) noexcept { - QString r = rhs.toString(); - return QtPrivate::equalStrings(lhs, r); // ### optimize! + return QUtf8::compareUtf8(QByteArrayView(rhs), lhs) == 0; } bool QtPrivate::equalStrings(QBasicUtf8StringView<false> lhs, QLatin1StringView rhs) noexcept @@ -1612,7 +1611,7 @@ int QtPrivate::compareStrings(QLatin1StringView lhs, QLatin1StringView rhs, Qt:: */ int QtPrivate::compareStrings(QLatin1StringView lhs, QBasicUtf8StringView<false> rhs, Qt::CaseSensitivity cs) noexcept { - return compareStrings(lhs, rhs.toString(), cs); // ### optimize! + return -QUtf8::compareUtf8(QByteArrayView(rhs), lhs, cs); } /*! @@ -1647,13 +1646,7 @@ int QtPrivate::compareStrings(QBasicUtf8StringView<false> lhs, QLatin1StringView */ int QtPrivate::compareStrings(QBasicUtf8StringView<false> lhs, QBasicUtf8StringView<false> rhs, Qt::CaseSensitivity cs) noexcept { - if (lhs.isEmpty()) - return lencmp(0, rhs.size()); - if (cs == Qt::CaseInsensitive) - return compareStrings(lhs.toString(), rhs.toString(), cs); // ### optimize! - const auto l = std::min(lhs.size(), rhs.size()); - int r = memcmp(lhs.data(), rhs.data(), l); - return r ? r : lencmp(lhs.size(), rhs.size()); + return QUtf8::compareUtf8(QByteArrayView(lhs), QByteArrayView(rhs), cs); } int QAnyStringView::compare(QAnyStringView lhs, QAnyStringView rhs, Qt::CaseSensitivity cs) noexcept diff --git a/src/corelib/text/qstring.h b/src/corelib/text/qstring.h index d8d8cd3425..32847e30f6 100644 --- a/src/corelib/text/qstring.h +++ b/src/corelib/text/qstring.h @@ -119,6 +119,12 @@ public: { return isEmpty() ? -1 : front() == c ? 
int(size() > 1) : uchar(m_data[0]) - c.unicode(); } [[nodiscard]] int compare(QChar c, Qt::CaseSensitivity cs) const noexcept { return QtPrivate::compareStrings(*this, QStringView(&c, 1), cs); } + template<bool UseChar8T> + [[nodiscard]] int compare(QBasicUtf8StringView<UseChar8T> other, + Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept + { + return QtPrivate::compareStrings(*this, other, cs); + } [[nodiscard]] bool startsWith(QStringView s, Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept { return QtPrivate::startsWith(*this, s, cs); } @@ -1230,6 +1236,13 @@ QString QBasicUtf8StringView<UseChar8T>::toString() const return QString::fromUtf8(data(), size()); } +template<bool UseChar8T> +[[nodiscard]] int QBasicUtf8StringView<UseChar8T>::compare(QLatin1StringView other, + Qt::CaseSensitivity cs) const noexcept +{ + return QtPrivate::compareStrings(*this, other, cs); +} + // // QAnyStringView inline members that require QString: // diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index bdba20de3a..04bd8e9733 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -820,7 +820,7 @@ QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in) return { true, isValidAscii }; } -int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept +int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs) noexcept { auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data()); auto end1 = src1 + utf8.size(); @@ -847,7 +847,10 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2)) uc2 = QChar::surrogateToUcs4(uc2, *src2++); } - + if (cs == Qt::CaseInsensitive) { + uc1 = QChar::toCaseFolded(uc1); + uc2 = QChar::toCaseFolded(uc2); + } if (uc1 != uc2) return int(uc1) - int(uc2); } @@ -857,7 +860,7 @@ int QUtf8::compareUtf8(QByteArrayView utf8, 
QStringView utf16) noexcept return (end1 > src1) - int(end2 > src2); } -int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s) +int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, Qt::CaseSensitivity cs) { char32_t uc1 = QChar::Null; auto src1 = reinterpret_cast<const uchar *>(utf8.data()); @@ -875,6 +878,62 @@ int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s) } char32_t uc2 = *src2++; + if (cs == Qt::CaseInsensitive) { + uc1 = QChar::toCaseFolded(uc1); + uc2 = QChar::toCaseFolded(uc2); + } + if (uc1 != uc2) + return int(uc1) - int(uc2); + } + + // the shorter string sorts first + return (end1 > src1) - (end2 > src2); +} + +static inline int lencmp(qsizetype lhs, qsizetype rhs) noexcept +{ + return lhs == rhs ? 0 : + lhs > rhs ? 1 : + /* else */ -1 ; +} + +int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs) noexcept +{ + if (lhs.isEmpty()) + return lencmp(0, rhs.size()); + + if (cs == Qt::CaseSensitive) { + const auto l = std::min(lhs.size(), rhs.size()); + int r = memcmp(lhs.data(), rhs.data(), l); + return r ? 
r : lencmp(lhs.size(), rhs.size()); + } + + char32_t uc1 = QChar::Null; + auto src1 = reinterpret_cast<const uchar *>(lhs.data()); + auto end1 = src1 + lhs.size(); + char32_t uc2 = QChar::Null; + auto src2 = reinterpret_cast<const uchar *>(rhs.data()); + auto end2 = src2 + rhs.size(); + + while (src1 < end1 && src2 < end2) { + uchar b = *src1++; + char32_t *output = &uc1; + int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1); + if (res < 0) { + // decoding error + uc1 = QChar::ReplacementCharacter; + } + + b = *src2++; + output = &uc2; + res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src2, end2); + if (res < 0) { + // decoding error + uc2 = QChar::ReplacementCharacter; + } + + uc1 = QChar::toCaseFolded(uc1); + uc2 = QChar::toCaseFolded(uc2); if (uc1 != uc2) return int(uc1) - int(uc2); } diff --git a/src/corelib/text/qstringconverter_p.h b/src/corelib/text/qstringconverter_p.h index 26be8b713c..95d1eea072 100644 --- a/src/corelib/text/qstringconverter_p.h +++ b/src/corelib/text/qstringconverter_p.h @@ -276,8 +276,12 @@ struct QUtf8 bool isValidAscii; }; static ValidUtf8Result isValidUtf8(QByteArrayView in); - static int compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept; - static int compareUtf8(QByteArrayView utf8, QLatin1StringView s); + static int compareUtf8(QByteArrayView utf8, QStringView utf16, + Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; + static int compareUtf8(QByteArrayView utf8, QLatin1StringView s, + Qt::CaseSensitivity cs = Qt::CaseSensitive); + static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs, + Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; }; struct QUtf16 diff --git a/src/corelib/text/qstringview.h b/src/corelib/text/qstringview.h index 2015927f19..95b089d565 100644 --- a/src/corelib/text/qstringview.h +++ b/src/corelib/text/qstringview.h @@ -8,6 +8,7 @@ #include <QtCore/qbytearray.h> #include <QtCore/qstringliteral.h> #include <QtCore/qstringalgorithms.h> +#include 
<QtCore/qutf8stringview.h> #include <string> @@ -254,6 +255,12 @@ public: [[nodiscard]] int compare(QStringView other, Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept { return QtPrivate::compareStrings(*this, other, cs); } [[nodiscard]] inline int compare(QLatin1StringView other, Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept; + template<bool UseChar8T> + [[nodiscard]] int compare(QBasicUtf8StringView<UseChar8T> other, + Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept + { + return QtPrivate::compareStrings(*this, other, cs); + } [[nodiscard]] constexpr int compare(QChar c) const noexcept { return size() >= 1 ? compare_single_char_helper(*utf16() - c.unicode()) : -1; } [[nodiscard]] int compare(QChar c, Qt::CaseSensitivity cs) const noexcept @@ -449,6 +456,15 @@ inline QStringView qToStringViewIgnoringNull(const QStringLike &s) noexcept R{{char16_t(c), u'\0'}} ; } +// QBasicUtf8StringView functions: + +template<bool UseChar8T> +[[nodiscard]] int QBasicUtf8StringView<UseChar8T>::compare(QStringView other, + Qt::CaseSensitivity cs) const noexcept +{ + return QtPrivate::compareStrings(*this, other, cs); +} + QT_END_NAMESPACE #endif /* QSTRINGVIEW_H */ diff --git a/src/corelib/text/qutf8stringview.h b/src/corelib/text/qutf8stringview.h index 3baac9c885..14a8a16e62 100644 --- a/src/corelib/text/qutf8stringview.h +++ b/src/corelib/text/qutf8stringview.h @@ -280,6 +280,17 @@ public: [[nodiscard]] constexpr qsizetype length() const noexcept { return size(); } + [[nodiscard]] int compare(QBasicUtf8StringView other, + Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept + { + return QtPrivate::compareStrings(*this, other, cs); + } + + [[nodiscard]] int compare(QStringView other, + Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept; + [[nodiscard]] int compare(QLatin1StringView other, + Qt::CaseSensitivity cs = Qt::CaseSensitive) const noexcept; + private: [[nodiscard]] static inline int compare(QBasicUtf8StringView lhs, 
QBasicUtf8StringView rhs) noexcept { |