From 53d0624403f7f2ac8fe8364a7c5cd136717d40ed Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Mon, 21 May 2012 17:02:27 +0200 Subject: Add QUrl::ParsingMode to the component setters in QUrl This allows one to instruct QUrl to ignore the percent-encodings and interpret the data exactly as provided. This is useful in certain use-cases where the data comes from a non-URL context. The strict-mode checking of the components is not implemented yet. Currently, the behaviour is equal to that of TolerantMode. Discussed-on: http://lists.qt-project.org/pipermail/development/2012-May/003811.html Change-Id: Ia5abe045a8ce7f9b50cbce3b5a7e3735e068d03a Reviewed-by: Lars Knoll Reviewed-by: Shane Kearns --- src/corelib/io/qurl.cpp | 205 +++++++++++++++++++++++++++----- src/corelib/io/qurl.h | 16 +-- tests/auto/corelib/io/qurl/tst_qurl.cpp | 198 ++++++++++++++++++++++++++++++ 3 files changed, 384 insertions(+), 35 deletions(-) diff --git a/src/corelib/io/qurl.cpp b/src/corelib/io/qurl.cpp index bb97890d15..a7ceb7337f 100644 --- a/src/corelib/io/qurl.cpp +++ b/src/corelib/io/qurl.cpp @@ -503,6 +503,10 @@ static const ushort decodedQueryInUrlActions[] = { 0 }; +static inline void parseDecodedComponent(QString &data) +{ + data.replace(QLatin1Char('%'), QStringLiteral("%25")); +} static inline QString recodeFromUser(const QString &input, const ushort *actions, int from, int to) @@ -1560,7 +1564,7 @@ void QUrl::clear() StrictMode, isValid() will return false. The parsing mode DecodedMode is not permitted in this context and will produce a run-time warning. - \sa setEncodedUrl() + \sa url(), toString() */ void QUrl::setUrl(const QString &url, ParsingMode parsingMode) { @@ -1576,7 +1580,7 @@ void QUrl::setUrl(const QString &url, ParsingMode parsingMode) /*! Sets the scheme of the URL to \a scheme. As a scheme can only contain ASCII characters, no conversion or encoding is done on the - input. + input. It must also start with an ASCII letter. The scheme describes the type (or protocol) of the URL. It's represented by one or more ASCII characters at the start the URL, @@ -1607,6 +1611,9 @@ void QUrl::setScheme(const QString &scheme) Returns the scheme of the URL. If an empty string is returned, this means the scheme is undefined and the URL is then relative. + The scheme can only contain US-ASCII letters or digits, which means it + cannot contain any character that would otherwise require encoding. + \sa setScheme(), isRelative() */ QString QUrl::scheme() const @@ -1631,11 +1638,29 @@ QString QUrl::scheme() const The following example shows a valid authority string: \image qurl-authority.png + + The \a authority data is interpreted according to \a mode: in StrictMode, + any '%' characters must be followed by exactly two hexadecimal characters + and some characters (including space) are not allowed in undecoded form. In + TolerantMode (the default), all characters are accepted in undecoded form + and the tolerant parser will correct stray '%' not followed by two hex + characters. In DecodedMode, '%' stand for themselves and encoded characters + are not possible. Because of that, in DecodedMode, it is not possible to + use the delimiter characters as non-delimiters (e.g., a password containing + a '@'). + + \sa setUserInfo, setHost, setPort */ -void QUrl::setAuthority(const QString &authority) +void QUrl::setAuthority(const QString &authority, ParsingMode mode) { detach(); - d->setAuthority(authority, 0, authority.length()); + QString data = authority; + if (mode == DecodedMode) { + parseDecodedComponent(data); + mode = TolerantMode; + } + + d->setAuthority(data, 0, data.length()); if (authority.isNull()) { // QUrlPrivate::setAuthority cleared almost everything // but it leaves the Host bit set @@ -1669,12 +1694,26 @@ QString QUrl::authority(ComponentFormattingOptions options) const \image qurl-authority3.png + The \a userInfo data is interpreted according to \a mode: in StrictMode, + any '%' characters must be followed by exactly two hexadecimal characters + and some characters (including space) are not allowed in undecoded form. In + TolerantMode (the default), all characters are accepted in undecoded form + and the tolerant parser will correct stray '%' not followed by two hex + characters. In DecodedMode, '%' stand for themselves and encoded characters + are not possible. Because of that, in DecodedMode, it is not possible to + use the ':' delimiter characters as non-delimiter in the user name. + \sa userInfo(), setUserName(), setPassword(), setAuthority() */ -void QUrl::setUserInfo(const QString &userInfo) +void QUrl::setUserInfo(const QString &userInfo, ParsingMode mode) { detach(); QString trimmed = userInfo.trimmed(); + if (mode == DecodedMode) { + parseDecodedComponent(trimmed); + mode = TolerantMode; + } + d->setUserInfo(trimmed, 0, trimmed.length()); if (userInfo.isNull()) { // QUrlPrivate::setUserInfo cleared almost everything @@ -1701,12 +1740,33 @@ QString QUrl::userInfo(ComponentFormattingOptions options) const of the user info element in the authority of the URL, as described in setUserInfo(). - \sa setEncodedUserName(), userName(), setUserInfo() + The \a userName data is interpreted according to \a mode: in StrictMode, + any '%' characters must be followed by exactly two hexadecimal characters + and some characters (including space) are not allowed in undecoded form. In + TolerantMode (the default), all characters are accepted in undecoded form + and the tolerant parser will correct stray '%' not followed by two hex + characters. In DecodedMode, '%' stand for themselves and encoded characters + are not possible. + + QUrl::DecodedMode should be used when setting the user name from a data + source which is not a URL, such as a password dialog shown to the user or + with a user name obtained by calling userName() with the QUrl::FullyEncoded + formatting option. + + \sa userName(), setUserInfo() */ -void QUrl::setUserName(const QString &userName) +void QUrl::setUserName(const QString &userName, ParsingMode mode) { detach(); - d->setUserName(userName, 0, userName.length()); + + QString data = userName; + if (mode == DecodedMode) { + parseDecodedComponent(data); + mode = TolerantMode; + } + + + d->setUserName(data, 0, data.length()); if (userName.isNull()) d->sectionIsPresent &= ~QUrlPrivate::UserName; } @@ -1731,12 +1791,32 @@ QString QUrl::userName(ComponentFormattingOptions options) const the user info element in the authority of the URL, as described in setUserInfo(). + The \a password data is interpreted according to \a mode: in StrictMode, + any '%' characters must be followed by exactly two hexadecimal characters + and some characters (including space) are not allowed in undecoded form. In + TolerantMode, all characters are accepted in undecoded form and the + tolerant parser will correct stray '%' not followed by two hex characters. + In DecodedMode, '%' stand for themselves and encoded characters are not + possible. + + QUrl::DecodedMode should be used when setting the password from a data + source which is not a URL, such as a password dialog shown to the user or + with a password obtained by calling password() with the QUrl::FullyEncoded + formatting option. + \sa password(), setUserInfo() */ -void QUrl::setPassword(const QString &password) +void QUrl::setPassword(const QString &password, ParsingMode mode) { detach(); - d->setPassword(password, 0, password.length()); + + QString data = password; + if (mode == DecodedMode) { + parseDecodedComponent(data); + mode = TolerantMode; + } + + d->setPassword(data, 0, data.length()); if (password.isNull()) d->sectionIsPresent &= ~QUrlPrivate::Password; } @@ -1760,21 +1840,43 @@ QString QUrl::password(ComponentFormattingOptions options) const Sets the host of the URL to \a host. The host is part of the authority. + The \a host data is interpreted according to \a mode: in StrictMode, + any '%' characters must be followed by exactly two hexadecimal characters + and some characters (including space) are not allowed in undecoded form. In + TolerantMode, all characters are accepted in undecoded form and the + tolerant parser will correct stray '%' not followed by two hex characters. + In DecodedMode, '%' stand for themselves and encoded characters are not + possible. + + Note that, in all cases, the result of the parsing must be a valid hostname + according to STD 3 rules, as modified by the Internationalized Resource + Identifiers specification (RFC 3987). Invalid hostnames are not permitted + and will cause isValid() to become false. + \sa host(), setAuthority() */ -void QUrl::setHost(const QString &host) +void QUrl::setHost(const QString &host, ParsingMode mode) { detach(); - if (d->setHost(host, 0, host.length())) { + + QString data = host; + if (mode == DecodedMode) { + parseDecodedComponent(data); + mode = TolerantMode; + } + + if (d->setHost(data, 0, data.length())) { if (host.isNull()) d->sectionIsPresent &= ~QUrlPrivate::Host; - } else if (!host.startsWith(QLatin1Char('['))) { + } else if (!data.startsWith(QLatin1Char('['))) { // setHost failed, it might be IPv6 or IPvFuture in need of bracketing ushort oldCode = d->errorCode; ushort oldSupplement = d->errorSupplement; - if (!d->setHost(QLatin1Char('[') + host + QLatin1Char(']'), 0, host.length() + 2)) { + data.prepend(QLatin1Char('[')); + data.append(QLatin1Char(']')); + if (!d->setHost(data, 0, data.length())) { // failed again: choose if this was an IPv6 error or not - if (!host.contains(QLatin1Char(':'))) { + if (!data.contains(QLatin1Char(':'))) { d->errorCode = oldCode; d->errorSupplement = oldSupplement; } @@ -1847,12 +1949,31 @@ int QUrl::port(int defaultPort) const \image qurl-mailtopath.png + The \a path data is interpreted according to \a mode: in StrictMode, + any '%' characters must be followed by exactly two hexadecimal characters + and some characters (including space) are not allowed in undecoded form. In + TolerantMode (the default), all characters are accepted in undecoded form and the + tolerant parser will correct stray '%' not followed by two hex characters. + In DecodedMode, '%' stand for themselves and encoded characters are not + possible. + + QUrl::DecodedMode should be used when setting the path from a data source + which is not a URL, such as a dialog shown to the user or with a path + obtained by calling path() with the QUrl::FullyEncoded formatting option. + \sa path() */ -void QUrl::setPath(const QString &path) +void QUrl::setPath(const QString &path, ParsingMode mode) { detach(); - d->setPath(path, 0, path.length()); + + QString data = path; + if (mode == DecodedMode) { + parseDecodedComponent(data); + mode = TolerantMode; + } + + d->setPath(data, 0, data.length()); // optimized out, since there is no path delimiter // if (path.isNull()) @@ -1887,27 +2008,39 @@ bool QUrl::hasQuery() const } /*! - Sets the query string of the URL to \a query. The string is - inserted as-is, and no further encoding is performed when calling - toEncoded(). + Sets the query string of the URL to \a query. This function is useful if you need to pass a query string that does not fit into the key-value pattern, or that uses a different scheme for encoding special characters than what is suggested by QUrl. - Passing a value of QByteArray() to \a query (a null QByteArray) unsets - the query completely. However, passing a value of QByteArray("") + Passing a value of QString() to \a query (a null QString) unsets + the query completely. However, passing a value of QString("") will set the query to an empty value, as if the original URL had a lone "?". + The \a query data is interpreted according to \a mode: in StrictMode, + any '%' characters must be followed by exactly two hexadecimal characters + and some characters (including space) are not allowed in undecoded form. In + TolerantMode, all characters are accepted in undecoded form and the + tolerant parser will correct stray '%' not followed by two hex characters. + In DecodedMode, '%' stand for themselves and encoded characters are not + possible. + \sa encodedQuery(), hasQuery() */ -void QUrl::setQuery(const QString &query) +void QUrl::setQuery(const QString &query, ParsingMode mode) { detach(); - d->setQuery(query, 0, query.length()); + QString data = query; + if (mode == DecodedMode) { + parseDecodedComponent(data); + mode = TolerantMode; + } + + d->setQuery(data, 0, data.length()); if (query.isNull()) d->sectionIsPresent &= ~QUrlPrivate::Query; } @@ -1953,13 +2086,31 @@ QString QUrl::query(ComponentFormattingOptions options) const will set the fragment to an empty string (as if the original URL had a lone "#"). + The \a fragment data is interpreted according to \a mode: in StrictMode, + any '%' characters must be followed by exactly two hexadecimal characters + and some characters (including space) are not allowed in undecoded form. In + TolerantMode, all characters are accepted in undecoded form and the + tolerant parser will correct stray '%' not followed by two hex characters. + In DecodedMode, '%' stand for themselves and encoded characters are not + possible. + + QUrl::DecodedMode should be used when setting the fragment from a data + source which is not a URL or with a fragment obtained by calling + fragment() with the QUrl::FullyEncoded formatting option. + \sa fragment(), hasFragment() */ -void QUrl::setFragment(const QString &fragment) +void QUrl::setFragment(const QString &fragment, ParsingMode mode) { detach(); - d->setFragment(fragment, 0, fragment.length()); + QString data = fragment; + if (mode == DecodedMode) { + parseDecodedComponent(data); + mode = TolerantMode; + } + + d->setFragment(data, 0, data.length()); if (fragment.isNull()) d->sectionIsPresent &= ~QUrlPrivate::Fragment; } @@ -2514,7 +2665,7 @@ QUrl QUrl::fromLocalFile(const QString &localFile) deslashified.clear(); } - url.setPath(deslashified.replace(QLatin1Char('%'), QStringLiteral("%25"))); + url.setPath(deslashified, DecodedMode); return url; } diff --git a/src/corelib/io/qurl.h b/src/corelib/io/qurl.h index 533489ae2b..41e6c17fd5 100644 --- a/src/corelib/io/qurl.h +++ b/src/corelib/io/qurl.h @@ -194,36 +194,36 @@ public: void setScheme(const QString &scheme); QString scheme() const; - void setAuthority(const QString &authority); + void setAuthority(const QString &authority, ParsingMode mode = TolerantMode); QString authority(ComponentFormattingOptions options = PrettyDecoded) const; - void setUserInfo(const QString &userInfo); + void setUserInfo(const QString &userInfo, ParsingMode mode = TolerantMode); QString userInfo(ComponentFormattingOptions options = PrettyDecoded) const; - void setUserName(const QString &userName); + void setUserName(const QString &userName, ParsingMode mode = TolerantMode); QString userName(ComponentFormattingOptions options = PrettyDecoded) const; - void setPassword(const QString &password); + void setPassword(const QString &password, ParsingMode mode = TolerantMode); QString password(ComponentFormattingOptions = PrettyDecoded) const; - void setHost(const QString &host); + void setHost(const QString &host, ParsingMode mode = TolerantMode); QString host(ComponentFormattingOptions = PrettyDecoded) const; QString topLevelDomain(ComponentFormattingOptions options = PrettyDecoded) const; void setPort(int port); int port(int defaultPort = -1) const; - void setPath(const QString &path); + void setPath(const QString &path, ParsingMode mode = TolerantMode); QString path(ComponentFormattingOptions options = PrettyDecoded) const; bool hasQuery() const; - void setQuery(const QString &query); + void setQuery(const QString &query, ParsingMode mode = TolerantMode); void setQuery(const QUrlQuery &query); QString query(ComponentFormattingOptions = PrettyDecoded) const; bool hasFragment() const; QString fragment(ComponentFormattingOptions options = PrettyDecoded) const; - void setFragment(const QString &fragment); + void setFragment(const QString &fragment, ParsingMode mode = TolerantMode); QUrl resolved(const QUrl &relative) const; diff --git a/tests/auto/corelib/io/qurl/tst_qurl.cpp b/tests/auto/corelib/io/qurl/tst_qurl.cpp index 45f24710ff..76e5c580a8 100644 --- a/tests/auto/corelib/io/qurl/tst_qurl.cpp +++ b/tests/auto/corelib/io/qurl/tst_qurl.cpp @@ -163,6 +163,8 @@ private slots: void lowercasesScheme(); void componentEncodings_data(); void componentEncodings(); + void setComponents_data(); + void setComponents(); }; // Testing get/set functions @@ -2928,5 +2930,201 @@ void tst_QUrl::componentEncodings() QCOMPARE(url2, url); } +enum Component { + Scheme = 0x01, + UserName = 0x02, + Password = 0x04, + UserInfo = UserName | Password, + Host = 0x08, + Port = 0x10, + Authority = UserInfo | Host | Port, + Path = 0x20, + Hierarchy = Authority | Path, + Query = 0x40, + Fragment = 0x80, + FullUrl = 0xff +}; + +void tst_QUrl::setComponents_data() +{ + QTest::addColumn("original"); + QTest::addColumn("component"); + QTest::addColumn("newValue"); + QTest::addColumn("parsingMode"); + QTest::addColumn("isValid"); + QTest::addColumn("encoding"); + QTest::addColumn("output"); + QTest::addColumn("toString"); + + const int Tolerant = QUrl::TolerantMode; + const int Strict = QUrl::StrictMode; + const int Decoded = QUrl::DecodedMode; + const int PrettyDecoded = QUrl::PrettyDecoded; + const int FullyDecoded = QUrl::FullyDecoded; + + // -- test empty vs null -- + // there's no empty-but-present scheme or path + // a URL with an empty scheme is a "URI reference" + // and the path is always non-empty if it's present + QTest::newRow("scheme-null") << QUrl("http://example.com") + << int(Scheme) << QString() << Tolerant << true + << PrettyDecoded << QString() << "//example.com"; + QTest::newRow("scheme-empty") << QUrl("http://example.com") + << int(Scheme) << "" << Tolerant << true + << PrettyDecoded << "" << "//example.com"; + QTest::newRow("path-null") << QUrl("http://example.com/path") + << int(Path) << QString() << Tolerant << true + << PrettyDecoded << QString() << "http://example.com"; + QTest::newRow("path-empty") << QUrl("http://example.com/path") + << int(Path) << "" << Tolerant << true + << PrettyDecoded << "" << "http://example.com"; + + // the other fields can be present and be empty + // that is, their delimiters would be present, but there would be nothing to one side + QTest::newRow("userinfo-null") << QUrl("http://user:pass@example.com") + << int(UserInfo) << QString() << Tolerant << true + << PrettyDecoded << QString() << "http://example.com"; + QTest::newRow("userinfo-empty") << QUrl("http://user:pass@example.com") + << int(UserInfo) << "" << Tolerant << true + << PrettyDecoded << "" << "http://@example.com"; + QTest::newRow("username-null") << QUrl("http://user@example.com") + << int(UserName) << QString() << Tolerant << true + << PrettyDecoded << QString() << "http://example.com"; + QTest::newRow("username-empty") << QUrl("http://user@example.com") + << int(UserName) << "" << Tolerant << true + << PrettyDecoded << "" << "http://@example.com"; + QTest::newRow("username-empty-path-nonempty") << QUrl("http://user:pass@example.com") + << int(UserName) << "" << Tolerant << true + << PrettyDecoded << "" << "http://:pass@example.com"; + QTest::newRow("password-null") << QUrl("http://user:pass@example.com") + << int(Password) << QString() << Tolerant << true + << PrettyDecoded << QString() << "http://user@example.com"; + QTest::newRow("password-empty") << QUrl("http://user:pass@example.com") + << int(Password) << "" << Tolerant << true + << PrettyDecoded << "" << "http://user:@example.com"; + QTest::newRow("host-null") << QUrl("foo://example.com/path") + << int(Host) << QString() << Tolerant << true + << PrettyDecoded << QString() << "foo:/path"; + QTest::newRow("host-empty") << QUrl("foo://example.com/path") + << int(Host) << "" << Tolerant << true + << PrettyDecoded << QString() << "foo:///path"; + QTest::newRow("query-null") << QUrl("http://example.com/?q=foo") + << int(Query) << QString() << Tolerant << true + << PrettyDecoded << QString() << "http://example.com/"; + QTest::newRow("query-empty") << QUrl("http://example.com/?q=foo") + << int(Query) << "" << Tolerant << true + << PrettyDecoded << QString() << "http://example.com/?"; + QTest::newRow("fragment-null") << QUrl("http://example.com/#bar") + << int(Fragment) << QString() << Tolerant << true + << PrettyDecoded << QString() << "http://example.com/"; + QTest::newRow("fragment-empty") << QUrl("http://example.com/#bar") + << int(Fragment) << "" << Tolerant << true + << PrettyDecoded << "" << "http://example.com/#"; + + // -- test some non-valid components -- + QTest::newRow("invalid-scheme-1") << QUrl("http://example.com") + << int(Scheme) << "1http" << Tolerant << false + << PrettyDecoded << "" << ""; + QTest::newRow("invalid-scheme-2") << QUrl("http://example.com") + << int(Scheme) << "http%40" << Tolerant << false + << PrettyDecoded << "" << ""; + + QTest::newRow("invalid-host-1") << QUrl("http://example.com") + << int(Host) << "-not-valid-" << Tolerant << false + << PrettyDecoded << "" << ""; + QTest::newRow("invalid-path-1") << QUrl("/relative") + << int(Path) << "c:/" << Strict << false + << PrettyDecoded << "" << ""; + + // -- test decoded behaviour -- + QTest::newRow("userinfo-encode") << QUrl("http://example.com") + << int(UserInfo) << "h%61llo:world@" << Decoded << true + << PrettyDecoded << "h%2561llo:world@" << "http://h%2561llo:world%40@example.com"; + QTest::newRow("username-encode") << QUrl("http://example.com") + << int(UserName) << "h%61llo:world" << Decoded << true + << PrettyDecoded << "h%2561llo:world" << "http://h%2561llo%3Aworld@example.com"; + QTest::newRow("password-encode") << QUrl("http://example.com") + << int(Password) << "h%61llo:world@" << Decoded << true + << PrettyDecoded << "h%2561llo:world@" << "http://:h%2561llo:world%40@example.com"; + // '%' characters are not permitted in the hostname, this tests that it fails to set anything + QTest::newRow("invalid-host-encode") << QUrl("http://example.com") + << int(Host) << "ex%61mple.com" << Decoded << false + << PrettyDecoded << "" << ""; + QTest::newRow("path-encode") << QUrl("http://example.com/foo") + << int(Path) << "bar%23" << Decoded << true + << PrettyDecoded << "bar%2523" << "http://example.com/bar%2523"; + QTest::newRow("query-encode") << QUrl("http://example.com/foo?q") + << int(Query) << "bar%23" << Decoded << true + << PrettyDecoded << "bar%2523" << "http://example.com/foo?bar%2523"; + QTest::newRow("fragment-encode") << QUrl("http://example.com/foo#z") + << int(Fragment) << "bar%23" << Decoded << true + << PrettyDecoded << "bar%2523" << "http://example.com/foo#bar%2523"; +} + +void tst_QUrl::setComponents() +{ + QFETCH(QUrl, original); + QUrl copy(original); + + QFETCH(int, component); + QFETCH(int, parsingMode); + QFETCH(QString, newValue); + QFETCH(int, encoding); + QFETCH(QString, output); + + switch (component) { + case Scheme: + copy.setScheme(newValue); + QCOMPARE(copy.scheme(), output); + break; + + case Path: + copy.setPath(newValue, QUrl::ParsingMode(parsingMode)); + QEXPECT_FAIL("invalid-path-1", "QUrl does not forbid paths with a colon before the first slash yet", Abort); + QCOMPARE(copy.path(QUrl::ComponentFormattingOptions(encoding)), output); + break; + + case UserInfo: + copy.setUserInfo(newValue, QUrl::ParsingMode(parsingMode)); + QCOMPARE(copy.userInfo(QUrl::ComponentFormattingOptions(encoding)), output); + break; + + case UserName: + copy.setUserName(newValue, QUrl::ParsingMode(parsingMode)); + QCOMPARE(copy.userName(QUrl::ComponentFormattingOptions(encoding)), output); + break; + + case Password: + copy.setPassword(newValue, QUrl::ParsingMode(parsingMode)); + QCOMPARE(copy.password(QUrl::ComponentFormattingOptions(encoding)), output); + break; + + case Host: + copy.setHost(newValue, QUrl::ParsingMode(parsingMode)); + QCOMPARE(copy.host(QUrl::ComponentFormattingOptions(encoding)), output); + break; + + case Query: + copy.setQuery(newValue, QUrl::ParsingMode(parsingMode)); + QCOMPARE(copy.hasQuery(), !newValue.isNull()); + QCOMPARE(copy.query(QUrl::ComponentFormattingOptions(encoding)), output); + break; + + case Fragment: + copy.setFragment(newValue, QUrl::ParsingMode(parsingMode)); + QCOMPARE(copy.hasFragment(), !newValue.isNull()); + QCOMPARE(copy.fragment(QUrl::ComponentFormattingOptions(encoding)), output); + break; + } + + QFETCH(bool, isValid); + QCOMPARE(copy.isValid(), isValid); + + if (isValid) { + QFETCH(QString, toString); + QCOMPARE(copy.toString(), toString); + } +} + QTEST_MAIN(tst_QUrl) #include "tst_qurl.moc" -- cgit v1.2.3