diff options
author | Øystein Heskestad <oystein.heskestad@qt.io> | 2022-10-04 17:24:17 +0200 |
---|---|---|
committer | Øystein Heskestad <oystein.heskestad@qt.io> | 2022-12-08 17:56:47 +0100 |
commit | 3fedcd4e4aeb79e432276aaac128910f8de05f25 (patch) | |
tree | a6b7e9dfaa44df39bdc171e516024ad64de23364 /src | |
parent | bb5d4094e040359d2f74c75b8a25732c1ea89b87 (diff) |
Add Boyer-Moore Latin-1 string searcher with optional case sensitivity
[ChangeLog][QtCore][QString] Added Boyer-Moore Latin-1 string searcher with optional case sensitivity
Task-number: QTBUG-100236
Change-Id: I200a0dac7c8012add1ee02511dba791d233115e0
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/corelib/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/corelib/text/qlatin1stringmatcher.cpp | 198 | ||||
-rw-r--r-- | src/corelib/text/qlatin1stringmatcher.h | 160 | ||||
-rw-r--r-- | src/corelib/text/qstring.cpp | 116 | ||||
-rw-r--r-- | src/tools/bootstrap/CMakeLists.txt | 1 |
5 files changed, 378 insertions, 98 deletions
diff --git a/src/corelib/CMakeLists.txt b/src/corelib/CMakeLists.txt index fdaef27e5f..02854a710a 100644 --- a/src/corelib/CMakeLists.txt +++ b/src/corelib/CMakeLists.txt @@ -218,6 +218,7 @@ qt_internal_add_module(Core text/qchar.h text/qcollator.cpp text/qcollator.h text/qcollator_p.h text/qdoublescanprint_p.h + text/qlatin1stringmatcher.cpp text/qlatin1stringmatcher.h text/qlocale.cpp text/qlocale.h text/qlocale_p.h text/qlocale_data_p.h text/qlocale_tools.cpp text/qlocale_tools_p.h diff --git a/src/corelib/text/qlatin1stringmatcher.cpp b/src/corelib/text/qlatin1stringmatcher.cpp new file mode 100644 index 0000000000..e23b4f20f3 --- /dev/null +++ b/src/corelib/text/qlatin1stringmatcher.cpp @@ -0,0 +1,198 @@ +// Copyright (C) 2022 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only + +#include "qlatin1stringmatcher.h" +#include <limits.h> + +QT_BEGIN_NAMESPACE + +/*! \class QLatin1StringMatcher + \inmodule QtCore + \brief Optimized search for substring in Latin-1 text. + + A QLatin1StringMatcher can search for one QLatin1StringView + as a substring of another, either ignoring case or taking it into + account. + + \since 6.5 + \ingroup tools + \ingroup string-processing + + This class is useful when you have a Latin-1 encoded string that + you want to repeatedly search for in some QLatin1StringViews + (perhaps in a loop), or when you want to search for all + instances of it in a given QLatin1StringView. Using a matcher + object and indexIn() is faster than matching a plain + QLatin1StringView with QLatin1StringView::indexOf() if repeated + matching takes place. This class offers no benefit if you are + doing one-off matches. The string to be searched for must not + be destroyed or changed before the matcher object is destroyed, + as the matcher accesses the string when searching for it. + + Create a QLatin1StringMatcher for the QLatin1StringView + you want to search for and the case sensitivity. Then call + indexIn() with the QLatin1StringView that you want to search + within. + + \sa QLatin1StringView, QStringMatcher, QByteArrayMatcher +*/ + +/*! + Construct an empty Latin-1 string matcher. + This will match at each position in any string. + \sa setPattern(), setCaseSensitivity(), indexIn() +*/ +QLatin1StringMatcher::QLatin1StringMatcher() noexcept + : m_pattern(), + m_cs(Qt::CaseSensitive), + m_caseSensitiveSearcher(m_pattern.data(), m_pattern.data()) +{ +} + +/*! + Constructs a Latin-1 string matcher that searches for the given \a pattern + with given case sensitivity \a cs. The \a pattern argument must + not be destroyed before this matcher object. Call indexIn() + to find the \a pattern in the given QLatin1StringView. +*/ +QLatin1StringMatcher::QLatin1StringMatcher(QLatin1StringView pattern, + Qt::CaseSensitivity cs) noexcept + : m_pattern(pattern), m_cs(cs) +{ + setSearcher(); +} + +/*! + Destroys the Latin-1 string matcher. +*/ +QLatin1StringMatcher::~QLatin1StringMatcher() noexcept +{ + freeSearcher(); +} + +/*! + \internal +*/ +void QLatin1StringMatcher::setSearcher() noexcept +{ + if (m_cs == Qt::CaseSensitive) { + new (&m_caseSensitiveSearcher) CaseSensitiveSearcher(m_pattern.data(), m_pattern.end()); + } else { + QtPrivate::QCaseInsensitiveLatin1Hash foldCase; + qsizetype bufferSize = std::min(m_pattern.size(), qsizetype(sizeof m_foldBuffer)); + for (qsizetype i = 0; i < bufferSize; ++i) + m_foldBuffer[i] = static_cast<char>(foldCase(m_pattern[i].toLatin1())); + + new (&m_caseInsensitiveSearcher) + CaseInsensitiveSearcher(m_foldBuffer, &m_foldBuffer[bufferSize]); + } +} + +/*! + \internal +*/ +void QLatin1StringMatcher::freeSearcher() noexcept +{ + if (m_cs == Qt::CaseSensitive) + m_caseSensitiveSearcher.~CaseSensitiveSearcher(); + else + m_caseInsensitiveSearcher.~CaseInsensitiveSearcher(); +} + +/*! + Sets the \a pattern to search for. The string pointed to by the + QLatin1StringView must not be destroyed before the matcher is + destroyed, unless it is set to point to a different \a pattern + with longer lifetime first. + + \sa pattern(), indexIn() +*/ +void QLatin1StringMatcher::setPattern(QLatin1StringView pattern) noexcept +{ + if (m_pattern.latin1() == pattern.latin1() && m_pattern.size() == pattern.size()) + return; // Same address and size + + freeSearcher(); + m_pattern = pattern; + setSearcher(); +} + +/*! + Returns the Latin-1 pattern that the matcher searches for. + + \sa setPattern(), indexIn() +*/ +QLatin1StringView QLatin1StringMatcher::pattern() const noexcept +{ + return m_pattern; +} + +/*! + Sets the case sensitivity to \a cs. + + \sa caseSensitivity(), indexIn() +*/ +void QLatin1StringMatcher::setCaseSensitivity(Qt::CaseSensitivity cs) noexcept +{ + if (m_cs == cs) + return; + + freeSearcher(); + m_cs = cs; + setSearcher(); +} + +/*! + Returns the case sensitivity the matcher uses. + + \sa setCaseSensitivity(), indexIn() +*/ +Qt::CaseSensitivity QLatin1StringMatcher::caseSensitivity() const noexcept +{ + return m_cs; +} + +/*! + Searches for the pattern in the given \a haystack starting from + \a from. + + \sa caseSensitivity(), pattern() +*/ +qsizetype QLatin1StringMatcher::indexIn(QLatin1StringView haystack, qsizetype from) const noexcept +{ + if (m_pattern.isEmpty() && from == haystack.size()) + return from; + if (from >= haystack.size()) + return -1; + auto begin = haystack.begin() + from; + auto end = haystack.end(); + auto found = begin; + if (m_cs == Qt::CaseSensitive) { + found = m_caseSensitiveSearcher(begin, end, m_pattern.begin(), m_pattern.end()).begin; + if (found == end) + return -1; + } else { + const qsizetype bufferSize = std::min(m_pattern.size(), qsizetype(sizeof m_foldBuffer)); + const QLatin1StringView restNeedle = m_pattern.sliced(bufferSize); + const bool needleLongerThanBuffer = restNeedle.size() > 0; + QLatin1StringView restHaystack = haystack; + do { + found = m_caseInsensitiveSearcher(found, end, m_foldBuffer, &m_foldBuffer[bufferSize]) + .begin; + if (found == end) { + return -1; + } else if (!needleLongerThanBuffer) { + break; + } + restHaystack = haystack.sliced( + qMin(haystack.size(), + bufferSize + qsizetype(std::distance(haystack.begin(), found)))); + if (restHaystack.startsWith(restNeedle, Qt::CaseInsensitive)) + break; + ++found; + } while (true); + } + return std::distance(haystack.begin(), found); +} + +QT_END_NAMESPACE diff --git a/src/corelib/text/qlatin1stringmatcher.h b/src/corelib/text/qlatin1stringmatcher.h new file mode 100644 index 0000000000..a256250e9c --- /dev/null +++ b/src/corelib/text/qlatin1stringmatcher.h @@ -0,0 +1,160 @@ +// Copyright (C) 2022 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only + +#ifndef QLATIN1STRINGMATCHER_H +#define QLATIN1STRINGMATCHER_H + +#include <functional> +#include <iterator> +#include <limits> + +#include <QtCore/q20algorithm.h> +#include <QtCore/qstring.h> + +QT_BEGIN_NAMESPACE + +namespace QtPrivate { +template<class RandomIt1, + class Hash = std::hash<typename std::iterator_traits<RandomIt1>::value_type>, + class BinaryPredicate = std::equal_to<>> +class q_boyer_moore_searcher_hashed_needle +{ +public: + constexpr q_boyer_moore_searcher_hashed_needle(RandomIt1 pat_first, RandomIt1 pat_last) + : m_skiptable{} + { + const size_t n = std::distance(pat_first, pat_last); + constexpr auto uchar_max = (std::numeric_limits<uchar>::max)(); + uchar max = n > uchar_max ? uchar_max : uchar(n); + q20::fill(std::begin(m_skiptable), std::end(m_skiptable), max); + + RandomIt1 pattern = pat_first; + pattern += n - max; + while (max--) + m_skiptable[uchar(*pattern++)] = max; + } + + template<class RandomIt2> + constexpr auto operator()(RandomIt2 first, RandomIt2 last, RandomIt1 pat_first, + RandomIt1 pat_last) const + { + struct R + { + RandomIt2 begin, end; + }; + Hash hf; + BinaryPredicate pred; + auto pat_length = std::distance(pat_first, pat_last); + if (pat_length == 0) + return R{ first, first }; + + const qsizetype pl_minus_one = qsizetype(pat_length - 1); + RandomIt2 current = first + pl_minus_one; + + while (current < last) { + qsizetype skip = m_skiptable[hf(*current)]; + if (!skip) { + // possible match + while (skip < pat_length) { + if (!pred(hf(*(current - skip)), uchar(pat_first[pl_minus_one - skip]))) + break; + skip++; + } + if (skip > pl_minus_one) { // we have a match + auto match = current - skip + 1; + return R{ match, match + pat_length }; + } + + // If we don't have a match we are a bit inefficient as we only skip by one + // when we have the non matching char in the string. + if (m_skiptable[hf(*(current - skip))] == pat_length) + skip = pat_length - skip; + else + skip = 1; + } + current += skip; + } + + return R{ last, last }; + } + +private: + alignas(16) uchar m_skiptable[256]; +}; + +struct QCaseSensitiveLatin1Hash +{ + constexpr QCaseSensitiveLatin1Hash() = default; + + constexpr std::size_t operator()(char c) const noexcept { return std::size_t(uchar(c)); } +}; + +struct QCaseInsensitiveLatin1Hash +{ + constexpr QCaseInsensitiveLatin1Hash() = default; + + constexpr std::size_t operator()(char c) const noexcept + { + return std::size_t(latin1Lower[uchar(c)]); + } + +private: + static constexpr uchar latin1Lower[256] = { + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, + 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f, + 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f, + 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f, + 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f, + 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x5b,0x5c,0x5d,0x5e,0x5f, + 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f, + 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f, + 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f, + 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f, + 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf, + 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf, + // 0xd7 (multiplication sign) and 0xdf (sz ligature) complicate life + 0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef, + 0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xd7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xdf, + 0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef, + 0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff + }; +}; +} + +class QLatin1StringMatcher +{ +public: + Q_CORE_EXPORT QLatin1StringMatcher() noexcept; + Q_CORE_EXPORT explicit QLatin1StringMatcher( + QLatin1StringView pattern, Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; + Q_CORE_EXPORT ~QLatin1StringMatcher() noexcept; + + Q_CORE_EXPORT void setPattern(QLatin1StringView pattern) noexcept; + Q_CORE_EXPORT QLatin1StringView pattern() const noexcept; + Q_CORE_EXPORT void setCaseSensitivity(Qt::CaseSensitivity cs) noexcept; + Q_CORE_EXPORT Qt::CaseSensitivity caseSensitivity() const noexcept; + + Q_CORE_EXPORT qsizetype indexIn(QLatin1StringView haystack, qsizetype from = 0) const noexcept; + +private: + void setSearcher() noexcept; + void freeSearcher() noexcept; + + QLatin1StringView m_pattern; + Qt::CaseSensitivity m_cs; + typedef QtPrivate::q_boyer_moore_searcher_hashed_needle<const char *, + QtPrivate::QCaseSensitiveLatin1Hash> + CaseSensitiveSearcher; + typedef QtPrivate::q_boyer_moore_searcher_hashed_needle<const char *, + QtPrivate::QCaseInsensitiveLatin1Hash> + CaseInsensitiveSearcher; + union { + CaseSensitiveSearcher m_caseSensitiveSearcher; + CaseInsensitiveSearcher m_caseInsensitiveSearcher; + }; + char m_foldBuffer[256]; +}; + +QT_END_NAMESPACE + +#endif // QLATIN1MATCHER_H diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index 6997ca05a8..030e583ddc 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -38,13 +38,12 @@ #include <wchar.h> #include "qchar.cpp" +#include "qlatin1stringmatcher.h" #include "qstringmatcher.cpp" #include "qstringiterator_p.h" #include "qstringalgorithms_p.h" #include "qthreadstorage.h" -#include "qbytearraymatcher.h" // Helper for comparison of QLatin1StringView - #include <algorithm> #include <functional> @@ -1415,70 +1414,6 @@ static int latin1nicmp(const char *lhsChar, qsizetype lSize, const char *rhsChar return lencmp(lSize, rSize); } -namespace { - -template<Qt::CaseSensitivity cs> -inline uchar latin1_fold(const uchar c) -{ - return c; -} - -template<> -inline uchar latin1_fold<Qt::CaseSensitivity::CaseInsensitive>(const uchar c) -{ - return latin1Lower[c]; -} - -template<Qt::CaseSensitivity cs> -inline void bm_latin1_init_skiptable(const uchar *cc, qsizetype len, uchar *skiptable) -{ - int l = int(qMin(len, qsizetype(255))); - memset(skiptable, l, 256 * sizeof(uchar)); - cc += len - l; - while (l--) - skiptable[latin1_fold<cs>(*cc++)] = l; -} - -template<Qt::CaseSensitivity cs> -inline qsizetype bm_latin1_find(const uchar *cc, qsizetype l, qsizetype index, const uchar *puc, - qsizetype pl, const uchar *skiptable) -{ - if (pl == 0) - return index > l ? -1 : index; - const qsizetype pl_minus_one = pl - 1; - - const uchar *current = cc + index + pl_minus_one; - const uchar *end = cc + l; - - while (current < end) { - qsizetype skip = skiptable[latin1_fold<cs>(*current)]; - if (!skip) { - // possible match - while (skip < pl) { - if (latin1_fold<cs>(*(current - skip)) != latin1_fold<cs>(puc[pl_minus_one - skip])) - break; - skip++; - } - if (skip > pl_minus_one) // we have a match - return (current - cc) - skip + 1; - - // in case we don't have a match we are a bit inefficient as we only skip by one - // when we have the non matching char in the string. - if (skiptable[latin1_fold<cs>(*(current - skip))] == pl) - skip = pl - skip; - else - skip = 1; - } - if (current > end - skip) - break; - current += skip; - } - - return -1; // not found -} - -} - bool QtPrivate::equalStrings(QStringView lhs, QStringView rhs) noexcept { return ucstreq(lhs.utf16(), lhs.size(), rhs.utf16(), rhs.size()); @@ -10686,16 +10621,10 @@ qsizetype QtPrivate::count(QLatin1StringView haystack, QLatin1StringView needle, qsizetype num = 0; qsizetype i = -1; - // TODO: use Boyer-Moore searcher for case-insensitive search too - // when QTBUG-100236 is done - if (cs == Qt::CaseSensitive) { - QByteArrayMatcher matcher(needle); - while ((i = matcher.indexIn(haystack, i + 1)) != -1) - ++num; - } else { - while ((i = QtPrivate::findString(haystack, i + 1, needle, cs)) != -1) - ++num; - } + QLatin1StringMatcher matcher(needle, cs); + while ((i = matcher.indexIn(haystack, i + 1)) != -1) + ++num; + return num; } @@ -10710,19 +10639,14 @@ qsizetype QtPrivate::count(QLatin1StringView haystack, QStringView needle, Qt::C qsizetype num = 0; qsizetype i = -1; - // TODO: use Boyer-Moore searcher for case-insensitive search too - // when QTBUG-100236 is done - if (cs == Qt::CaseSensitive) { - QVarLengthArray<uchar> s(needle.size()); - qt_to_latin1_unchecked(s.data(), needle.utf16(), needle.size()); + QVarLengthArray<uchar> s(needle.size()); + qt_to_latin1_unchecked(s.data(), needle.utf16(), needle.size()); + + QLatin1StringMatcher matcher(QLatin1StringView(reinterpret_cast<char *>(s.data()), s.size()), + cs); + while ((i = matcher.indexIn(haystack, i + 1)) != -1) + ++num; - QByteArrayMatcher matcher(s); - while ((i = matcher.indexIn(haystack, i + 1)) != -1) - ++num; - } else { - while ((i = QtPrivate::findString(haystack, i + 1, needle, cs)) != -1) - ++num; - } return num; } @@ -10743,12 +10667,11 @@ qsizetype QtPrivate::count(QLatin1StringView haystack, QChar needle, Qt::CaseSen if (needle.unicode() > 0xff) return 0; - const char needleL1 = needle.toLatin1(); if (cs == Qt::CaseSensitive) { - return std::count(haystack.cbegin(), haystack.cend(), needleL1); + return std::count(haystack.cbegin(), haystack.cend(), needle.toLatin1()); } else { auto toLower = [](char ch) { return latin1Lower[uchar(ch)]; }; - const uchar ch = toLower(needleL1); + const uchar ch = toLower(needle.toLatin1()); return std::count_if(haystack.cbegin(), haystack.cend(), [&toLower, ch](const char c) { return toLower(c) == ch; }); @@ -10961,7 +10884,7 @@ qsizetype QtPrivate::findString(QLatin1StringView haystack, qsizetype from, QLat return -1; } - const QByteArrayMatcher matcher(needle); + const QLatin1StringMatcher matcher(needle, Qt::CaseSensitivity::CaseSensitive); return matcher.indexIn(haystack, from); } @@ -10995,12 +10918,9 @@ qsizetype QtPrivate::findString(QLatin1StringView haystack, qsizetype from, QLat } return -1; } - uchar skiptable[256]; - bm_latin1_init_skiptable<Qt::CaseSensitivity::CaseInsensitive>( - reinterpret_cast<const unsigned char *>(needle.begin()), needle.size(), skiptable); - return bm_latin1_find<Qt::CaseSensitivity::CaseInsensitive>( - reinterpret_cast<const unsigned char *>(haystack.begin()), haystack.size(), from, - reinterpret_cast<const unsigned char *>(needle.begin()), needle.size(), skiptable); + + QLatin1StringMatcher matcher(needle, Qt::CaseSensitivity::CaseInsensitive); + return matcher.indexIn(haystack, from); } qsizetype QtPrivate::lastIndexOf(QStringView haystack, qsizetype from, QStringView needle, Qt::CaseSensitivity cs) noexcept diff --git a/src/tools/bootstrap/CMakeLists.txt b/src/tools/bootstrap/CMakeLists.txt index 5a2299e37c..2018a29b94 100644 --- a/src/tools/bootstrap/CMakeLists.txt +++ b/src/tools/bootstrap/CMakeLists.txt @@ -68,6 +68,7 @@ qt_internal_extend_target(Bootstrap ../../corelib/text/qbytearray.cpp ../../corelib/text/qbytearraylist.cpp ../../corelib/text/qbytearraymatcher.cpp + ../../corelib/text/qlatin1stringmatcher.cpp ../../corelib/text/qlocale.cpp ../../corelib/text/qlocale_tools.cpp ../../corelib/text/qregularexpression.cpp |