From 90f3e1d3494fcab722eee457608e80ec74d0e838 Mon Sep 17 00:00:00 2001 From: Sona Kurazyan Date: Tue, 5 Jul 2022 16:18:13 +0200 Subject: Move the docs for porting to QRegularExpression to a common place Copy them from QRegExp docs in qt5compat to doc/global/includes/corelib/port-from-qregexp.qdocinc, so that the porting docs can be included from both Qt 6 porting guide and qt5compat. Task-number: QTBUG-89702 Pick-to: 6.4 6.3 6.2 Change-Id: I616e2333f60f36e4851398479939fd062016748d Reviewed-by: Edward Welbourne --- doc/global/config.qdocconf | 1 + .../includes/corelib/port-from-qregexp.qdocinc | 175 +++++++++++++++++++++ .../snippets/code/doc_src_port_from_qregexp.cpp | 65 ++++++++ 3 files changed, 241 insertions(+) create mode 100644 doc/global/includes/corelib/port-from-qregexp.qdocinc create mode 100644 doc/global/snippets/code/doc_src_port_from_qregexp.cpp diff --git a/doc/global/config.qdocconf b/doc/global/config.qdocconf index 16d084f9c8..06e3565101 100644 --- a/doc/global/config.qdocconf +++ b/doc/global/config.qdocconf @@ -23,6 +23,7 @@ ignorewords += \ ignoresince = 5.0 sourcedirs += includes $$BUILDDIR +exampledirs += snippets url = https://doc.qt.io/qt diff --git a/doc/global/includes/corelib/port-from-qregexp.qdocinc b/doc/global/includes/corelib/port-from-qregexp.qdocinc new file mode 100644 index 0000000000..11f0a3136f --- /dev/null +++ b/doc/global/includes/corelib/port-from-qregexp.qdocinc @@ -0,0 +1,175 @@ +// Copyright (C) 2022 Giuseppe D'Angelo . +// Copyright (C) 2022 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo +// Copyright (C) 2022 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GFDL-1.3-no-invariants-only + +//! [porting-to-qregularexpression] + + The QRegularExpression class introduced in Qt 5 implements Perl-compatible + regular expressions and is a big improvement upon QRegExp in terms of APIs + offered, supported pattern syntax, and speed of execution. 
The biggest + difference is that QRegularExpression simply holds a regular expression, + and it's \e{not} modified when a match is requested. Instead, a + QRegularExpressionMatch object is returned, to check the result of a match + and extract the captured substring. The same applies to global matching and + QRegularExpressionMatchIterator. + + Other differences are outlined below. + + \note QRegularExpression does not support all the features available in + Perl-compatible regular expressions. The most notable one is the fact that + duplicated names for capturing groups are not supported, and using them can + lead to undefined behavior. This may change in a future version of Qt. + + \section3 Different pattern syntax + + Porting a regular expression from QRegExp to QRegularExpression may require + changes to the pattern itself. + + In specific scenarios, QRegExp was too lenient and accepted patterns that + are simply invalid when using QRegularExpression. These are easy to detect, + because the QRegularExpression objects built with these patterns are not + valid (see QRegularExpression::isValid()). + + In other cases, a pattern ported from QRegExp to QRegularExpression may + silently change semantics. Therefore, it is necessary to review the + patterns used. The most notable cases of silent incompatibility are: + + \list + + \li Curly braces are needed to use a hexadecimal escape like \c{\xHHHH} + with more than 2 digits. A pattern like \c{\x2022} needs to be ported + to \c{\x{2022}}, or it will match a space (\c{0x20}) followed by the + string \c{"22"}. In general, it is highly recommended to always use + curly braces with the \c{\x} escape, no matter the number of digits + specified. + + \li A 0-to-n quantification like \c{{,n}} needs to be ported to \c{{0,n}} + to preserve semantics. Otherwise, a pattern such as \c{\d{,3}} would + match a digit followed by the exact string \c{"{,3}"}. 
+ + \li QRegExp by default does Unicode-aware matching, while + QRegularExpression requires a separate option; see below for more + details. + + \li \c{.} in QRegExp does by default match all characters, including the + newline character. QRegularExpression excludes the newline character + by default. To include the newline character, set the + QRegularExpression::DotMatchesEverythingOption pattern option. + + \endlist + + For an overview of the regular expression syntax supported by + QRegularExpression, please refer to the + \l{https://pcre.org/original/doc/html/pcrepattern.html}{pcrepattern(3)} + man page, describing the pattern syntax supported by PCRE (the reference + implementation of Perl-compatible regular expressions). + + \section3 Porting from QRegExp::exactMatch() + + QRegExp::exactMatch() served two purposes: it exactly matched a regular + expression against a subject string, and it implemented partial matching. + + \section4 Porting from QRegExp's Exact Matching + + Exact matching indicates whether the regular expression matches the entire + subject string. For example, the two classes yield the following results on the subject string \c{"abc123"}: + + \table + \header \li \li QRegExp::exactMatch() \li QRegularExpressionMatch::hasMatch() + \row \li \c{"\\d+"} \li \b false \li \b true + \row \li \c{"[a-z]+\\d+"} \li \b true \li \b true + \endtable + + Exact matching is not reflected in QRegularExpression. If you want + to be sure that the subject string matches the regular expression + exactly, you can wrap the pattern using the QRegularExpression::anchoredPattern() + function: + + \snippet code/doc_src_port_from_qregexp.cpp 0 + + \section4 Porting from QRegExp's Partial Matching + + When using QRegExp::exactMatch(), if an exact match was not found, one + could still find out how much of the subject string was matched by the + regular expression by calling QRegExp::matchedLength(). 
If the returned length + was equal to the subject string's length, then one could conclude that a partial + match was found. + + QRegularExpression supports partial matching explicitly by means of the + appropriate QRegularExpression::MatchType. + + \section3 Global matching + + Due to limitations of the QRegExp API, it was impossible to implement global + matching correctly (that is, like Perl does). In particular, patterns that + can match 0 characters (like \c{"a*"}) are problematic. + + QRegularExpression::globalMatch() implements Perl global match correctly, and + the returned iterator can be used to examine each result. + + For example, if you have code like: + + \snippet code/doc_src_port_from_qregexp.cpp 1 + + You can rewrite it as: + + \snippet code/doc_src_port_from_qregexp.cpp 2 + + \section3 Unicode properties support + + When using QRegExp, character classes such as \c{\w}, \c{\d}, etc. match + characters with the corresponding Unicode property: for instance, \c{\d} + matches any character with the Unicode \c{Nd} (decimal digit) property. + + Those character classes only match ASCII characters by default when using + QRegularExpression: for instance, \c{\d} matches exactly a character in the + \c{0-9} ASCII range. It is possible to change this behavior by using the + QRegularExpression::UseUnicodePropertiesOption pattern option. + + \section3 Wildcard matching + + There is no direct way to do wildcard matching in QRegularExpression. + However, the QRegularExpression::wildcardToRegularExpression() method + is provided to translate glob patterns into a Perl-compatible regular + expression that can be used for that purpose. + + For example, if you have code like: + + \snippet code/doc_src_port_from_qregexp.cpp 3 + + You can rewrite it as: + + \snippet code/doc_src_port_from_qregexp.cpp 4 + + Please note though that some shell-like wildcard patterns might not be + translated to what you expect. 
The following example code will silently + break if simply converted using the above-mentioned function: + + \snippet code/doc_src_port_from_qregexp.cpp 5 + + This is because, by default, the regular expression returned by + QRegularExpression::wildcardToRegularExpression() is fully anchored. + To get a regular expression that is not anchored, pass + QRegularExpression::UnanchoredWildcardConversion as the conversion + options: + + \snippet code/doc_src_port_from_qregexp.cpp 6 + + \section3 Minimal matching + + QRegExp::setMinimal() implemented minimal matching by simply reversing the + greediness of the quantifiers (QRegExp did not support lazy quantifiers, + like \c{*?}, \c{+?}, etc.). QRegularExpression instead does support greedy, + lazy, and possessive quantifiers. The QRegularExpression::InvertedGreedinessOption + pattern option can be useful to emulate the effects of QRegExp::setMinimal(): + if enabled, it inverts the greediness of quantifiers (greedy ones become + lazy and vice versa). + + \section3 Caret modes + + The QRegularExpression::AnchorAtOffsetMatchOption match option can be used to + emulate the QRegExp::CaretAtOffset behavior. There is no equivalent for the + other QRegExp::CaretMode modes. + +//! [porting-to-qregularexpression] diff --git a/doc/global/snippets/code/doc_src_port_from_qregexp.cpp b/doc/global/snippets/code/doc_src_port_from_qregexp.cpp new file mode 100644 index 0000000000..58e79e5ac1 --- /dev/null +++ b/doc/global/snippets/code/doc_src_port_from_qregexp.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2022 Giuseppe D'Angelo . +// Copyright (C) 2022 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo +// Copyright (C) 2022 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GFDL-1.3-no-invariants-only + +//! [0] +QString p("a .*|pattern"); + +// re matches exactly the pattern string p +QRegularExpression re(QRegularExpression::anchoredPattern(p)); +//! [0] + +//! 
[1] +QString subject("the quick fox"); + +int offset = 0; +QRegExp re("(\\w+)"); +while ((offset = re.indexIn(subject, offset)) != -1) { + offset += re.matchedLength(); + // ... +} +//! [1] + +//! [2] +QString subject("the quick fox"); + +QRegularExpression re("(\\w+)"); +QRegularExpressionMatchIterator i = re.globalMatch(subject); +while (i.hasNext()) { + QRegularExpressionMatch match = i.next(); + // ... +} +//! [2] + +//! [3] +QRegExp wildcard("*.txt"); +wildcard.setPatternSyntax(QRegExp::Wildcard); +//! [3] + +//! [4] +auto wildcard = QRegularExpression(QRegularExpression::wildcardToRegularExpression("*.txt")); +//! [4] + +//! [5] +const QString fp1("C:/Users/dummy/files/content.txt"); +const QString fp2("/home/dummy/files/content.txt"); + +QRegExp re1("*/files/*"); +re1.setPatternSyntax(QRegExp::Wildcard); +re1.exactMatch(fp1); // returns true +re1.exactMatch(fp2); // returns true + +// but converted with QRegularExpression::wildcardToRegularExpression() + +QRegularExpression re2(QRegularExpression::wildcardToRegularExpression("*/files/*")); +re2.match(fp1).hasMatch(); // returns false +re2.match(fp2).hasMatch(); // returns false +//! [5] + +//! [6] +QRegularExpression re3(QRegularExpression::wildcardToRegularExpression( + "*/files/*", QRegularExpression::UnanchoredWildcardConversion)); +re3.match(fp1).hasMatch(); // returns true +re3.match(fp2).hasMatch(); // returns true +//! [6] -- cgit v1.2.3