diff options
Diffstat (limited to 'src/corelib/io/qurl.cpp')
-rw-r--r-- | src/corelib/io/qurl.cpp | 209 |
1 files changed, 150 insertions, 59 deletions
diff --git a/src/corelib/io/qurl.cpp b/src/corelib/io/qurl.cpp index fbc8d761c2..aed1be2ffb 100644 --- a/src/corelib/io/qurl.cpp +++ b/src/corelib/io/qurl.cpp @@ -64,7 +64,7 @@ unencoded representation is suitable for showing to users, but the encoded representation is typically what you would send to a web server. For example, the unencoded URL - "http://b\uuml\c{}hler.example.com" would be sent to the server as + "http://b\\uuml\c{}hler.example.com" would be sent to the server as "http://xn--bhler-kva.example.com/List%20of%20applicants.xml". A URL can also be constructed piece by piece by calling @@ -126,36 +126,51 @@ The parsing mode controls the way QUrl parses strings. \value TolerantMode QUrl will try to correct some common errors in URLs. - This mode is useful when processing URLs entered by - users. + This mode is useful for parsing URLs coming from sources + not known to be strictly standards-conforming. \value StrictMode Only valid URLs are accepted. This mode is useful for general URL validation. - In TolerantMode, the parser corrects the following invalid input: + In TolerantMode, the parser has the following behaviour: \list - \li Spaces and "%20": If an encoded URL contains a space, this will be - replaced with "%20". If a decoded URL contains "%20", this will be - replaced with a single space before the URL is parsed. + \li Spaces and "%20": unencoded space characters will be accepted and will + be treated as equivalent to "%20". \li Single "%" characters: Any occurrences of a percent character "%" not followed by exactly two hexadecimal characters (e.g., "13% coverage.html") - will be replaced by "%25". + will be replaced by "%25". Note that one lone "%" character will trigger + the correction mode for all percent characters. \li Reserved and unreserved characters: An encoded URL should only contain a few characters as literals; all other characters should be percent-encoded. 
In TolerantMode, these characters will be automatically percent-encoded where they are not allowed: - space / double-quote / "<" / ">" / "[" / "\" / - "]" / "^" / "`" / "{" / "|" / "}" + space / double-quote / "<" / ">" / "\" / + "^" / "`" / "{" / "|" / "}" + Those same characters can be decoded again by passing QUrl::DecodeReserved + to toString() or toEncoded(). \endlist + + When in StrictMode, if a parsing error is found, isValid() will return \c + false and errorString() will return a simple message describing the error. + If more than one error is detected, it is undefined which error gets + reported. + + Note that TolerantMode is not usually enough for parsing user input, which + often contains more errors and expectations than the parser can deal with. + When dealing with data coming directly from the user -- as opposed to data + coming from data-transfer sources, such as other programs -- it is + recommended to use fromUserInput(). + + \sa fromUserInput(), setUrl(), toString(), toEncoded(), QUrl::FormattingOptions */ /*! - \enum QUrl::FormattingOption + \enum QUrl::FormattingOptions The formatting options define how the URL is formatted when written out as text. @@ -178,6 +193,62 @@ Note that the case folding rules in \l{RFC 3491}{Nameprep}, which QUrl conforms to, require host names to always be converted to lower case, regardless of the QUrl::FormattingOptions used. + + The options from QUrl::ComponentFormattingOptions are also possible. + + \sa QUrl::ComponentFormattingOptions +*/ + +/*! + \enum QUrl::ComponentFormattingOptions + \since 5.0 + + The component formatting options define how the components of a URL will + be formatted when written out as text. They can be combined with the + options from QUrl::FormattingOptions when used in toString() and + toEncoded(). + + \value PrettyDecoded The component is returned in a "pretty form", with + most percent-encoded characters decoded. 
The exact + behavior of PrettyDecoded varies from component to + component and may also change from Qt release to Qt + release. This is the default. + + \value EncodeSpaces Leave space characters in their encoded form ("%20"). + + \value EncodeUnicode Leave non-US-ASCII characters encoded in their UTF-8 + percent-encoded form (e.g., "%C3%A9" for the U+00E9 + codepoint, LATIN SMALL LETTER E WITH ACUTE). + + \value EncodeDelimiters Leave certain delimiters in their encoded form, as + would appear in the URL when the full URL is + represented as text. The delimiters affected + by this option change from component to component. + + \value EncodeReserved Leave the US-ASCII reserved characters in their encoded + forms. + + \value DecodeReserved Decode the US-ASCII reserved characters. + + \value FullyEncoded Leave all characters in their properly-encoded form, + as this component would appear as part of a URL. When + used with toString(), this produces a fully-compliant + URL in QString form, exactly equal to the result of + toEncoded(). + + \value MostDecoded Attempt to decode as much as possible. For individual + components of the URL, this decodes every percent + encoding sequence, control characters (U+0000 to U+001F) + and non-US-ASCII sequences that aren't valid UTF-8 + sequences. + + The values of EncodeReserved and DecodeReserved should not be used together + in one call. The behaviour is undefined if that happens. They are provided + as separate values because the behaviour of the "pretty mode" with regards + to reserved characters is different on certain components and specially on + the full URL. + + \sa QUrl::FormattingOptions */ #include "qurl.h" @@ -208,6 +279,12 @@ static inline char toHex(quint8 c) return c > 9 ? c - 10 + 'A' : c + '0'; } +static inline quint8 fromHex(quint8 c) +{ + c |= 0x20; + return c >= 'a' ? 
c - 'a' + 10 : c - '0'; +} + static inline QString ftpScheme() { return QStringLiteral("ftp"); @@ -246,24 +323,6 @@ QUrlPrivate::QUrlPrivate(const QUrlPrivate &copy) { } -void QUrlPrivate::clear() -{ - scheme.clear(); - userName.clear(); - password.clear(); - host.clear(); - port = -1; - path.clear(); - query.clear(); - fragment.clear(); - - errorCode = NoError; - errorSupplement = 0; - sectionIsPresent = 0; - sectionHasError = 0; -} - - // From RFC 3986, Appendix A Collected ABNF for URI // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] //[...] @@ -1391,20 +1450,25 @@ const QByteArray &QUrlPrivate::normalized() const /*! - Constructs a URL by parsing \a url. \a url is assumed to be in human - readable representation, with no percent encoding. QUrl will automatically - percent encode all characters that are not allowed in a URL. - The default parsing mode is TolerantMode. - - Parses the \a url using the parser mode \a parsingMode. + Constructs a URL by parsing \a url. QUrl will automatically percent encode + all characters that are not allowed in a URL and decode the percent-encoded + sequences that represent a character that is allowed in a URL. + + Parses the \a url using the parser mode \a parsingMode. In TolerantMode + (the default), QUrl will correct certain mistakes, notably the presence of + a percent character ('%') not followed by two hexadecimal digits, and it + will accept any character in any position. In StrictMode, encoding mistakes + will not be tolerated and QUrl will also check that certain forbidden + characters are not present in unencoded form. If an error is detected in + StrictMode, isValid() will return false. 
Example: - \snippet doc/src/snippets/code/src_corelib_io_qurl.cpp 0 + \snippet code/src_corelib_io_qurl.cpp 0 To construct a URL from an encoded string, call fromEncoded(): - \snippet doc/src/snippets/code/src_corelib_io_qurl.cpp 1 + \snippet code/src_corelib_io_qurl.cpp 1 \sa setUrl(), setEncodedUrl(), fromEncoded(), TolerantMode */ @@ -1445,7 +1509,7 @@ QUrl::~QUrl() must conform to the standard encoding rules of the URI standard for the URL to be reported as valid. - \snippet doc/src/snippets/code/src_corelib_io_qurl.cpp 2 + \snippet code/src_corelib_io_qurl.cpp 2 */ bool QUrl::isValid() const { @@ -1475,12 +1539,18 @@ void QUrl::clear() } /*! - Parses \a url using the parsing mode \a parsingMode. - - \a url is assumed to be in unicode format, with no percent - encoding. - - Calling isValid() will tell whether or not a valid URL was constructed. + Parses \a url and sets this object to that value. QUrl will automatically + percent encode all characters that are not allowed in a URL and decode the + percent-encoded sequences that represent a character that is allowed in a + URL. + + Parses the \a url using the parser mode \a parsingMode. In TolerantMode + (the default), QUrl will correct certain mistakes, notably the presence of + a percent character ('%') not followed by two hexadecimal digits, and it + will accept any character in any position. In StrictMode, encoding mistakes + will not be tolerated and QUrl will also check that certain forbidden + characters are not present in unencoded form. If an error is detected in + StrictMode, isValid() will return false. \sa setEncodedUrl() */ @@ -1501,7 +1571,7 @@ void QUrl::setUrl(const QString &url, ParsingMode parsingMode) and is followed by a ':'. The following example shows a URL where the scheme is "ftp": - \img qurl-authority2.png + \image qurl-authority2.png The scheme can also be empty, in which case the URL is interpreted as relative. 
@@ -1548,7 +1618,7 @@ QString QUrl::scheme() const The following example shows a valid authority string: - \img qurl-authority.png + \image qurl-authority.png */ void QUrl::setAuthority(const QString &authority) { @@ -1585,7 +1655,7 @@ QString QUrl::authority(ComponentFormattingOptions options) const separated by a ':'. If the password is empty, the colon must be omitted. The following example shows a valid user info string: - \img qurl-authority3.png + \image qurl-authority3.png \sa userInfo(), setUserName(), setPassword(), setAuthority() */ @@ -1746,7 +1816,7 @@ void QUrl::setPort(int port) Example: - \snippet doc/src/snippets/code/src_corelib_io_qurl.cpp 3 + \snippet code/src_corelib_io_qurl.cpp 3 */ int QUrl::port(int defaultPort) const { @@ -1758,12 +1828,12 @@ int QUrl::port(int defaultPort) const Sets the path of the URL to \a path. The path is the part of the URL that comes after the authority but before the query string. - \img qurl-ftppath.png + \image qurl-ftppath.png For non-hierarchical schemes, the path will be everything following the scheme declaration, as in the following example: - \img qurl-mailtopath.png + \image qurl-mailtopath.png \sa path() */ @@ -1862,7 +1932,7 @@ QString QUrl::query(ComponentFormattingOptions options) const characters. It is typically used in HTTP for referring to a certain link or point on a page: - \img qurl-fragment.png + \image qurl-fragment.png The fragment is sometimes also referred to as the URL "reference". @@ -1938,7 +2008,7 @@ QString QUrl::topLevelDomain(ComponentFormattingOptions options) const the base URL, but with the merged path, as in the following example: - \snippet doc/src/snippets/code/src_corelib_io_qurl.cpp 5 + \snippet code/src_corelib_io_qurl.cpp 5 Calling resolved() with ".." returns a QUrl whose directory is one level higher than the original. Similarly, calling resolved() @@ -2012,14 +2082,16 @@ QUrl QUrl::resolved(const QUrl &relative) const } /*! 
- Returns true if the URL is relative; otherwise returns false. A - URL is relative if its scheme is undefined; this function is - therefore equivalent to calling scheme().isEmpty(). + Returns true if the URL is relative; otherwise returns false. A URL is a + relative reference if its scheme is undefined; this function is therefore + equivalent to calling scheme().isEmpty(). + + Relative references are defined in RFC 3986 section 4.2. */ bool QUrl::isRelative() const { if (!d) return true; - return !d->hasScheme() && !d->path.startsWith(QLatin1Char('/')); + return !d->hasScheme(); } /*! @@ -2173,7 +2245,7 @@ QString QUrl::fromPercentEncoding(const QByteArray &input) Unreserved is defined as: ALPHA / DIGIT / "-" / "." / "_" / "~" - \snippet doc/src/snippets/code/src_corelib_io_qurl.cpp 6 + \snippet code/src_corelib_io_qurl.cpp 6 */ QByteArray QUrl::toPercentEncoding(const QString &input, const QByteArray &exclude, const QByteArray &include) { @@ -2429,6 +2501,9 @@ QUrl QUrl::fromLocalFile(const QString &localFile) returned value in the form found on SMB networks (for example, "//servername/path/to/file.txt"). + Note: if the path component of this URL contains a non-UTF-8 binary + sequence (such as %80), the behaviour of this function is undefined. + \sa fromLocalFile(), isLocalFile() */ QString QUrl::toLocalFile() const @@ -2438,19 +2513,35 @@ QString QUrl::toLocalFile() const return QString(); QString tmp; - QString ourPath = path(); + QString ourPath = path(QUrl::MostDecoded); // magic for shared drive on windows if (!d->host.isEmpty()) { - tmp = QStringLiteral("//") + d->host + (ourPath.length() > 0 && ourPath.at(0) != QLatin1Char('/') + tmp = QStringLiteral("//") + host() + (ourPath.length() > 0 && ourPath.at(0) != QLatin1Char('/') ? 
QLatin1Char('/') + ourPath : ourPath); } else { tmp = ourPath; +#ifdef Q_OS_WIN // magic for drives on windows if (ourPath.length() > 2 && ourPath.at(0) == QLatin1Char('/') && ourPath.at(2) == QLatin1Char(':')) tmp.remove(0, 1); +#endif } + // check if we need to do one more decoding pass + int pct = tmp.indexOf(QLatin1Char('%')); + while (pct != -1) { + Q_ASSERT(tmp.size() >= pct + 2); + ushort char1 = tmp.at(pct + 1).unicode(); + ushort char2 = tmp.at(pct + 2).unicode(); + + Q_ASSERT(isHex(char1) && char1 < 0x80u); + Q_ASSERT(isHex(char2) && char2 < 0x80u); + tmp.replace(pct, 3, QChar(fromHex(char1) << 4 | fromHex(char2))); + + // next iteration + pct = tmp.indexOf(QLatin1Char('%'), pct + 1); + } return tmp; } |