From 25edeaa3d64163c43b71f8106ef04d6b494c2ed2 Mon Sep 17 00:00:00 2001 From: Karsten Heimrich Date: Mon, 19 Oct 2020 13:00:44 +0200 Subject: Doc: Add porting Guide for QRegExp Fixes: QTBUG-87101 Change-Id: I370c79e295489f4eaf8418bbd53b326f0a8e5123 Reviewed-by: Christian Stenger --- src/corelib/doc/src/qt6-changes.qdoc | 251 +++++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) (limited to 'src/corelib/doc/src/qt6-changes.qdoc') diff --git a/src/corelib/doc/src/qt6-changes.qdoc b/src/corelib/doc/src/qt6-changes.qdoc index 753eb17302..d04c74efe8 100644 --- a/src/corelib/doc/src/qt6-changes.qdoc +++ b/src/corelib/doc/src/qt6-changes.qdoc @@ -361,4 +361,255 @@ QProcess::pid() and the Q_PID type have been removed; use QProcess::processId() instead to get the native process identifier. Code using native Win32 APIs to access the data in the Q_PID as a Win32 \c{PROCESS_INFORMATION} struct is no longer supported. + + \section1 Regular expression classes + + \section2 QRegularExpression + + In Qt6, all methods taking a \c QRegExp were removed from the Qt code base. + Therefore it is very likely that you will have to port your application or + library to \l QRegularExpression. + + \l QRegularExpression implements Perl-compatible regular expressions. It + fully supports Unicode. For an overview of the regular expression syntax + supported by \l QRegularExpression, please refer to the + pcrepattern(3) man page. A regular expression is made up of two things: a + pattern string and a set of pattern options that change the meaning of the + pattern string. + + There are some subtle differences between \l QRegularExpression and \c + QRegExp that will be explained by this document to ease the porting effort. + + \l QRegularExpression is more strict when it comes to the syntax of the + regular expression. Therefore it is always good to check the expression + for \l {QRegularExpression::isValid}{validity}. 
+ + \l QRegularExpression can almost always be declared const (except when the + pattern changes), while \c QRegExp almost never could be. + + There is no replacement for the \l {QRegExp::CaretMode}{CaretMode} + enumeration. The \l {QRegularExpression::AnchoredMatchOption} match option + can be used to emulate the QRegExp::CaretAtOffset behavior. There is no + equivalent for the other QRegExp::CaretMode modes. + + \l QRegularExpression supports only Perl-compatible regular expressions. + Still, it does not support all the features available in Perl-compatible + regular expressions. The most notable one is the fact that duplicated names + for capturing groups are not supported, and using them can lead to + undefined behavior. This may change in a future version of Qt. + + \section3 Wildcard matching + + There is no direct way to do wildcard matching in \l QRegularExpression. + However, the \l {QRegularExpression::wildcardToRegularExpression} method + is provided to translate glob patterns into a Perl-compatible regular + expression that can be used for that purpose. + + \oldcode + QRegExp wildcard("*.txt"); + wildcard.setPatternSyntax(QRegExp::Wildcard); + \newcode + auto wildcard = QRegularExpression(QRegularExpression::wildcardToRegularExpression("*.txt")); + \endcode + + Please note, though, that not every shell-like wildcard pattern is + translated the way you might expect. The following example code will + silently break if simply converted using the above-mentioned function: + + \code + const QString fp1("C:/Users/dummy/files/content.txt"); + const QString fp2("/home/dummy/files/content.txt"); + + QRegExp re1("*/files/*"); + re1.setPatternSyntax(QRegExp::Wildcard); + ... = re1.exactMatch(fp1); // returns true + ... = re1.exactMatch(fp2); // returns true + + // but converted with QRegularExpression::wildcardToRegularExpression() + + QRegularExpression re2(QRegularExpression::wildcardToRegularExpression("*/files/*")); + ... 
= re2.match(fp1).hasMatch(); // returns false + ... = re2.match(fp2).hasMatch(); // returns false + \endcode + + \section3 Searching forward + + Forward searching inside a string was usually implemented with a loop using + \c {QRegExp::indexIn} and a growing offset, but can now be easily implemented + with \l QRegularExpressionMatchIterator or \l {QString::indexOf}. + + \oldcode + QString subject("the quick fox"); + + int offset = 0; + QRegExp re("(\\w+)"); + while ((offset = re.indexIn(subject, offset)) != -1) { + offset += re.matchedLength(); + // ... + } + \newcode + QRegularExpression re("(\\w+)"); + QString subject("the quick fox"); + + QRegularExpressionMatchIterator i = re.globalMatch(subject); + while (i.hasNext()) { + QRegularExpressionMatch match = i.next(); + // ... + } + + // or alternatively using QString::indexOf + + qsizetype from = 0; + QRegularExpressionMatch match; + while ((from = subject.indexOf(re, from, &match)) != -1) { + from += match.capturedLength(); + // ... + } + \endcode + + \section3 Searching backwards + + Backwards searching inside a string was often implemented as a loop + over \c {QRegExp::lastIndexIn}, but can now be easily implemented using + \l {QString::lastIndexOf} and \l {QRegularExpressionMatch}. + + \note \l QRegularExpressionMatchIterator is not capable of performing a + backwards search. + + \oldcode + int offset = -1; + QString subject("Lorem ipsum dolor sit amet, consetetur sadipscing."); + + QRegExp re("\\s+([ids]\\w+)"); + while ((offset = re.lastIndexIn(subject, offset)) != -1) { + --offset; + // ... + } + \newcode + qsizetype from = -1; + QString subject("Lorem ipsum dolor sit amet, consetetur sadipscing."); + + QRegularExpressionMatch match; + QRegularExpression re("\\s+([ids]\\w+)"); + while ((from = subject.lastIndexOf(re, from, &match)) != -1) { + --from; + // ... + } + \endcode + + \section3 exactMatch vs. 
match.hasMatch + + \c {QRegExp::exactMatch} served two purposes: it exactly matched a regular + expression against a subject string, and it implemented partial matching. + Exact matching indicates whether the regular expression matches the entire + subject string. For example: + + \code + QString source("abc123"); + + QRegExp("\\d+").exactMatch(source); // returns false + QRegExp("[a-z]+\\d+").exactMatch(source); // returns true + + QRegularExpression("\\d+").match(source).hasMatch(); // returns true + QRegularExpression("[a-z]+\\d+").match(source).hasMatch(); // returns true + \endcode + + Exact matching is not reflected in \l QRegularExpression. If you want to be + sure that the subject string matches the regular expression exactly, you + can wrap the pattern using the \l {QRegularExpression::anchoredPattern} + function: + + \code + QString source("abc123"); + + QString pattern("\\d+"); + QRegularExpression(pattern).match(source).hasMatch(); // returns true + + pattern = QRegularExpression::anchoredPattern(pattern); + QRegularExpression(pattern).match(source).hasMatch(); // returns false + \endcode + + \section3 Minimal matching + + \c QRegExp::setMinimal() implemented minimal matching by simply reversing + the greediness of the quantifiers (\c QRegExp did not support lazy + quantifiers, like *?, +?, etc.). QRegularExpression instead does support + greedy, lazy and possessive quantifiers. The \l + {QRegularExpression::InvertedGreedinessOption} pattern option can be useful + to emulate the effects of \c QRegExp::setMinimal(): if enabled, it inverts + the greediness of quantifiers (greedy ones become lazy and vice versa). + + \section3 Different pattern syntax + + Porting a regular expression from \c QRegExp to \l QRegularExpression may + require changes to the pattern itself. Therefore it is recommended to check + the pattern used with the \l {QRegularExpression::isValid} method. 
This is + especially important for user-provided patterns or patterns not controlled by + the developer. + + In other cases, a pattern ported from \c QRegExp to \l QRegularExpression may + silently change semantics. Therefore, it is necessary to review the patterns + used. The most notable cases of silent incompatibility are: + + \list + \li Curly braces are needed in order to use a hexadecimal escape like \c + {\xHHHH} with more than 2 digits. A pattern like \c {\x2022} needs + to be ported to \c {\x{2022}}, or it will match a space \c {(0x20)} + followed by the string \c {"22"}. In general, it is highly recommended + to always use curly braces with the \c {\x} escape, no matter the + number of digits specified. + + \li A \c{0-to-n} quantification like \c {{,n}} needs to be ported to + \c {{0,n}} to preserve semantics. Otherwise, a pattern such as + \c {\d{,3}} would actually match a digit followed by the exact + string \c {"{,3}"}. + \endlist + + \section3 Partial Matching + + When using \c QRegExp::exactMatch(), if an exact match was not found, one + could still find out how much of the subject string was matched by the + regular expression by calling \c QRegExp::matchedLength(). If the returned + length was equal to the subject string's length, then one could conclude + that a partial match was found. + \l QRegularExpression supports partial matching explicitly by means of the + appropriate \l {QRegularExpression::MatchType}. + + \section3 Global matching + + Due to limitations of the \c QRegExp API, it was impossible to implement + global matching correctly (that is, like Perl does). In particular, patterns + that can match zero characters (like "a*") are problematic. \l + {QRegularExpression::globalMatch} implements Perl global + match correctly, and the returned iterator can be used to examine each + result. + + \section3 Unicode properties support + + When using \c QRegExp, character classes such as {\w}, {\d}, etc. 
match + characters with the corresponding Unicode property: for instance, {\d} + matches any character with the Unicode Nd (decimal digit) property. Those + character classes only match ASCII characters by default when using \l + QRegularExpression: for instance, {\d} matches exactly a character in the + 0-9 ASCII range. It is possible to change this behavior by using the \l + {QRegularExpression::UseUnicodePropertiesOption} + pattern option. + + \section2 QRegExp + + In Qt6 \l QRegExp got removed from Qt Core. If your application cannot be + ported right now, \c QRegExp still exists in Qt5Compat to keep these + code-bases working. If you want to use \c QRegExp further, you need to link + against the new Qt5Compat module and add this line to your \l qmake \c .pro + file: + \code + QT += core5compat + \endcode + + In case you already ported your application or library to the \l cmake + build system, add the following to your \c CMakeLists.txt: + \code + PUBLIC_LIBRARIES + Qt::Core5Compat + \endcode */ -- cgit v1.2.3