summaryrefslogtreecommitdiffstats
path: root/src/corelib/io/qurl.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/io/qurl.cpp')
-rw-r--r--src/corelib/io/qurl.cpp209
1 files changed, 150 insertions, 59 deletions
diff --git a/src/corelib/io/qurl.cpp b/src/corelib/io/qurl.cpp
index fbc8d761c2..aed1be2ffb 100644
--- a/src/corelib/io/qurl.cpp
+++ b/src/corelib/io/qurl.cpp
@@ -64,7 +64,7 @@
unencoded representation is suitable for showing to users, but
the encoded representation is typically what you would send to
a web server. For example, the unencoded URL
- "http://b\uuml\c{}hler.example.com" would be sent to the server as
+ "http://b\\uuml\c{}hler.example.com" would be sent to the server as
"http://xn--bhler-kva.example.com/List%20of%20applicants.xml".
A URL can also be constructed piece by piece by calling
@@ -126,36 +126,51 @@
The parsing mode controls the way QUrl parses strings.
\value TolerantMode QUrl will try to correct some common errors in URLs.
- This mode is useful when processing URLs entered by
- users.
+ This mode is useful for parsing URLs coming from sources
+ not known to be strictly standards-conforming.
\value StrictMode Only valid URLs are accepted. This mode is useful for
general URL validation.
- In TolerantMode, the parser corrects the following invalid input:
+ In TolerantMode, the parser has the following behaviour:
\list
- \li Spaces and "%20": If an encoded URL contains a space, this will be
- replaced with "%20". If a decoded URL contains "%20", this will be
- replaced with a single space before the URL is parsed.
+ \li Spaces and "%20": unencoded space characters will be accepted and will
+ be treated as equivalent to "%20".
\li Single "%" characters: Any occurrences of a percent character "%" not
followed by exactly two hexadecimal characters (e.g., "13% coverage.html")
- will be replaced by "%25".
+ will be replaced by "%25". Note that one lone "%" character will trigger
+ the correction mode for all percent characters.
\li Reserved and unreserved characters: An encoded URL should only
contain a few characters as literals; all other characters should
be percent-encoded. In TolerantMode, these characters will be
automatically percent-encoded where they are not allowed:
- space / double-quote / "<" / ">" / "[" / "\" /
- "]" / "^" / "`" / "{" / "|" / "}"
+ space / double-quote / "<" / ">" / "\" /
+ "^" / "`" / "{" / "|" / "}"
+ Those same characters can be decoded again by passing QUrl::DecodeReserved
+ to toString() or toEncoded().
\endlist
+
+ When in StrictMode, if a parsing error is found, isValid() will return \c
+ false and errorString() will return a simple message describing the error.
+ If more than one error is detected, it is undefined which error gets
+ reported.
+
+ Note that TolerantMode is not usually enough for parsing user input, which
+ often contains more errors and expectations than the parser can deal with.
+ When dealing with data coming directly from the user -- as opposed to data
+ coming from data-transfer sources, such as other programs -- it is
+ recommended to use fromUserInput().
+
+ \sa fromUserInput(), setUrl(), toString(), toEncoded(), QUrl::FormattingOptions
*/
/*!
- \enum QUrl::FormattingOption
+ \enum QUrl::FormattingOptions
The formatting options define how the URL is formatted when written out
as text.
@@ -178,6 +193,62 @@
Note that the case folding rules in \l{RFC 3491}{Nameprep}, which QUrl
conforms to, require host names to always be converted to lower case,
regardless of the QUrl::FormattingOptions used.
+
+ The options from QUrl::ComponentFormattingOptions are also possible.
+
+ \sa QUrl::ComponentFormattingOptions
+*/
+
+/*!
+ \enum QUrl::ComponentFormattingOptions
+ \since 5.0
+
+ The component formatting options define how the components of a URL will
+ be formatted when written out as text. They can be combined with the
+ options from QUrl::FormattingOptions when used in toString() and
+ toEncoded().
+
+ \value PrettyDecoded The component is returned in a "pretty form", with
+ most percent-encoded characters decoded. The exact
+ behavior of PrettyDecoded varies from component to
+ component and may also change from Qt release to Qt
+ release. This is the default.
+
+ \value EncodeSpaces Leave space characters in their encoded form ("%20").
+
+ \value EncodeUnicode Leave non-US-ASCII characters encoded in their UTF-8
+ percent-encoded form (e.g., "%C3%A9" for the U+00E9
+ codepoint, LATIN SMALL LETTER E WITH ACUTE).
+
+ \value EncodeDelimiters Leave certain delimiters in their encoded form, as
+ would appear in the URL when the full URL is
+ represented as text. The delimiters affected
+ by this option change from component to component.
+
+ \value EncodeReserved Leave the US-ASCII reserved characters in their encoded
+ forms.
+
+ \value DecodeReserved Decode the US-ASCII reserved characters.
+
+ \value FullyEncoded Leave all characters in their properly-encoded form,
+ as this component would appear as part of a URL. When
+ used with toString(), this produces a fully-compliant
+ URL in QString form, exactly equal to the result of
+ toEncoded().
+
+ \value MostDecoded Attempt to decode as much as possible. For individual
+ components of the URL, this decodes every percent
+ encoding sequence, control characters (U+0000 to U+001F)
+ and non-US-ASCII sequences that aren't valid UTF-8
+ sequences.
+
+ The values of EncodeReserved and DecodeReserved should not be used together
+ in one call. The behaviour is undefined if that happens. They are provided
+ as separate values because the behaviour of the "pretty mode" with regard
+ to reserved characters is different on certain components and especially on
+ the full URL.
+
+ \sa QUrl::FormattingOptions
*/
#include "qurl.h"
@@ -208,6 +279,12 @@ static inline char toHex(quint8 c)
return c > 9 ? c - 10 + 'A' : c + '0';
}
+static inline quint8 fromHex(quint8 c) // converts one ASCII hex digit ('0'-'9', 'a'-'f', 'A'-'F') to its value 0-15; no validation is done here
+{
+ c |= 0x20; // folds 'A'-'F' onto 'a'-'f'; '0'-'9' already have bit 0x20 set and are unchanged
+ return c >= 'a' ? c - 'a' + 10 : c - '0'; // letters yield 10-15, digits yield 0-9
+}
+
static inline QString ftpScheme()
{
return QStringLiteral("ftp");
@@ -246,24 +323,6 @@ QUrlPrivate::QUrlPrivate(const QUrlPrivate &copy)
{
}
-void QUrlPrivate::clear()
-{
- scheme.clear();
- userName.clear();
- password.clear();
- host.clear();
- port = -1;
- path.clear();
- query.clear();
- fragment.clear();
-
- errorCode = NoError;
- errorSupplement = 0;
- sectionIsPresent = 0;
- sectionHasError = 0;
-}
-
-
// From RFC 3986, Appendix A Collected ABNF for URI
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
//[...]
@@ -1391,20 +1450,25 @@ const QByteArray &QUrlPrivate::normalized() const
/*!
- Constructs a URL by parsing \a url. \a url is assumed to be in human
- readable representation, with no percent encoding. QUrl will automatically
- percent encode all characters that are not allowed in a URL.
- The default parsing mode is TolerantMode.
-
- Parses the \a url using the parser mode \a parsingMode.
+ Constructs a URL by parsing \a url. QUrl will automatically percent encode
+ all characters that are not allowed in a URL and decode the percent-encoded
+ sequences that represent a character that is allowed in a URL.
+
+ Parses the \a url using the parser mode \a parsingMode. In TolerantMode
+ (the default), QUrl will correct certain mistakes, notably the presence of
+ a percent character ('%') not followed by two hexadecimal digits, and it
+ will accept any character in any position. In StrictMode, encoding mistakes
+ will not be tolerated and QUrl will also check that certain forbidden
+ characters are not present in unencoded form. If an error is detected in
+ StrictMode, isValid() will return false.
Example:
- \snippet doc/src/snippets/code/src_corelib_io_qurl.cpp 0
+ \snippet code/src_corelib_io_qurl.cpp 0
To construct a URL from an encoded string, call fromEncoded():
- \snippet doc/src/snippets/code/src_corelib_io_qurl.cpp 1
+ \snippet code/src_corelib_io_qurl.cpp 1
\sa setUrl(), setEncodedUrl(), fromEncoded(), TolerantMode
*/
@@ -1445,7 +1509,7 @@ QUrl::~QUrl()
must conform to the standard encoding rules of the URI standard
for the URL to be reported as valid.
- \snippet doc/src/snippets/code/src_corelib_io_qurl.cpp 2
+ \snippet code/src_corelib_io_qurl.cpp 2
*/
bool QUrl::isValid() const
{
@@ -1475,12 +1539,18 @@ void QUrl::clear()
}
/*!
- Parses \a url using the parsing mode \a parsingMode.
-
- \a url is assumed to be in unicode format, with no percent
- encoding.
-
- Calling isValid() will tell whether or not a valid URL was constructed.
+ Parses \a url and sets this object to that value. QUrl will automatically
+ percent encode all characters that are not allowed in a URL and decode the
+ percent-encoded sequences that represent a character that is allowed in a
+ URL.
+
+ Parses the \a url using the parser mode \a parsingMode. In TolerantMode
+ (the default), QUrl will correct certain mistakes, notably the presence of
+ a percent character ('%') not followed by two hexadecimal digits, and it
+ will accept any character in any position. In StrictMode, encoding mistakes
+ will not be tolerated and QUrl will also check that certain forbidden
+ characters are not present in unencoded form. If an error is detected in
+ StrictMode, isValid() will return false.
\sa setEncodedUrl()
*/
@@ -1501,7 +1571,7 @@ void QUrl::setUrl(const QString &url, ParsingMode parsingMode)
and is followed by a ':'. The following example shows a URL where
the scheme is "ftp":
- \img qurl-authority2.png
+ \image qurl-authority2.png
The scheme can also be empty, in which case the URL is interpreted
as relative.
@@ -1548,7 +1618,7 @@ QString QUrl::scheme() const
The following example shows a valid authority string:
- \img qurl-authority.png
+ \image qurl-authority.png
*/
void QUrl::setAuthority(const QString &authority)
{
@@ -1585,7 +1655,7 @@ QString QUrl::authority(ComponentFormattingOptions options) const
separated by a ':'. If the password is empty, the colon must be
omitted. The following example shows a valid user info string:
- \img qurl-authority3.png
+ \image qurl-authority3.png
\sa userInfo(), setUserName(), setPassword(), setAuthority()
*/
@@ -1746,7 +1816,7 @@ void QUrl::setPort(int port)
Example:
- \snippet doc/src/snippets/code/src_corelib_io_qurl.cpp 3
+ \snippet code/src_corelib_io_qurl.cpp 3
*/
int QUrl::port(int defaultPort) const
{
@@ -1758,12 +1828,12 @@ int QUrl::port(int defaultPort) const
Sets the path of the URL to \a path. The path is the part of the
URL that comes after the authority but before the query string.
- \img qurl-ftppath.png
+ \image qurl-ftppath.png
For non-hierarchical schemes, the path will be everything
following the scheme declaration, as in the following example:
- \img qurl-mailtopath.png
+ \image qurl-mailtopath.png
\sa path()
*/
@@ -1862,7 +1932,7 @@ QString QUrl::query(ComponentFormattingOptions options) const
characters. It is typically used in HTTP for referring to a
certain link or point on a page:
- \img qurl-fragment.png
+ \image qurl-fragment.png
The fragment is sometimes also referred to as the URL "reference".
@@ -1938,7 +2008,7 @@ QString QUrl::topLevelDomain(ComponentFormattingOptions options) const
the base URL, but with the merged path, as in the following
example:
- \snippet doc/src/snippets/code/src_corelib_io_qurl.cpp 5
+ \snippet code/src_corelib_io_qurl.cpp 5
Calling resolved() with ".." returns a QUrl whose directory is
one level higher than the original. Similarly, calling resolved()
@@ -2012,14 +2082,16 @@ QUrl QUrl::resolved(const QUrl &relative) const
}
/*!
- Returns true if the URL is relative; otherwise returns false. A
- URL is relative if its scheme is undefined; this function is
- therefore equivalent to calling scheme().isEmpty().
+ Returns true if the URL is relative; otherwise returns false. A URL is
+ a relative reference if its scheme is undefined; this function is therefore
+ equivalent to calling scheme().isEmpty().
+
+ Relative references are defined in RFC 3986 section 4.2.
*/
bool QUrl::isRelative() const
{
if (!d) return true; // no private data means no scheme is set, so the URL is reported as relative
- return !d->hasScheme() && !d->path.startsWith(QLatin1Char('/')); // old rule: no scheme AND path not starting with '/'
+ return !d->hasScheme(); // new rule: relative exactly when no scheme is present, whatever the path looks like
}
/*!
@@ -2173,7 +2245,7 @@ QString QUrl::fromPercentEncoding(const QByteArray &input)
Unreserved is defined as:
ALPHA / DIGIT / "-" / "." / "_" / "~"
- \snippet doc/src/snippets/code/src_corelib_io_qurl.cpp 6
+ \snippet code/src_corelib_io_qurl.cpp 6
*/
QByteArray QUrl::toPercentEncoding(const QString &input, const QByteArray &exclude, const QByteArray &include)
{
@@ -2429,6 +2501,9 @@ QUrl QUrl::fromLocalFile(const QString &localFile)
returned value in the form found on SMB networks (for example,
"//servername/path/to/file.txt").
+ Note: if the path component of this URL contains a non-UTF-8 binary
+ sequence (such as %80), the behaviour of this function is undefined.
+
\sa fromLocalFile(), isLocalFile()
*/
QString QUrl::toLocalFile() const
@@ -2438,19 +2513,35 @@ QString QUrl::toLocalFile() const
return QString();
QString tmp;
- QString ourPath = path();
+ QString ourPath = path(QUrl::MostDecoded);
// magic for shared drive on windows
if (!d->host.isEmpty()) {
- tmp = QStringLiteral("//") + d->host + (ourPath.length() > 0 && ourPath.at(0) != QLatin1Char('/')
+ tmp = QStringLiteral("//") + host() + (ourPath.length() > 0 && ourPath.at(0) != QLatin1Char('/')
? QLatin1Char('/') + ourPath : ourPath);
} else {
tmp = ourPath;
+#ifdef Q_OS_WIN
// magic for drives on windows
if (ourPath.length() > 2 && ourPath.at(0) == QLatin1Char('/') && ourPath.at(2) == QLatin1Char(':'))
tmp.remove(0, 1);
+#endif
}
+ // check if we need to do one more decoding pass: replace every remaining "%HH" in tmp with the character it encodes
+ int pct = tmp.indexOf(QLatin1Char('%'));
+ while (pct != -1) {
+ Q_ASSERT(tmp.size() >= pct + 3); // both hex digits, at pct + 1 and pct + 2, must exist before at() reads them
+ ushort char1 = tmp.at(pct + 1).unicode();
+ ushort char2 = tmp.at(pct + 2).unicode();
+
+ Q_ASSERT(isHex(char1) && char1 < 0x80u);
+ Q_ASSERT(isHex(char2) && char2 < 0x80u);
+ tmp.replace(pct, 3, QChar(fromHex(char1) << 4 | fromHex(char2))); // the 3-character "%HH" becomes a single QChar
+
+ // next iteration: resume after the decoded character so a decoded '%' is not decoded a second time
+ pct = tmp.indexOf(QLatin1Char('%'), pct + 1);
+ }
return tmp;
}