summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSona Kurazyan <sona.kurazyan@qt.io>2022-06-17 14:43:17 +0200
committerQt Cherry-pick Bot <cherrypick_bot@qt-project.org>2022-06-22 17:21:42 +0000
commit1892763c8c3872a41679f9e6540504cb1ef85b00 (patch)
tree78b418b702ac20b1f8856ec4254156b1c74e519e
parentd0123f8866478689125be98d2118036a54cdebbc (diff)
QDom: Stop treating non-BMP characters as invalid
According to https://www.w3.org/TR/REC-xml/#NT-Char unicode characters within the range of [#x10000-#x10FFFF] are considered to be valid, so fix the check for valid characters accordingly. This requires changing the loop over the input QString to iterate over code points (instead of code units). Fixes: QTBUG-104362 Change-Id: I7dcf5cad05265a54882807a50522d28b647e06ee Reviewed-by: Marc Mutz <marc.mutz@qt.io> (cherry picked from commit da0d7f61c851431d14430684c62345bc23dbf001) Reviewed-by: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>
-rw-r--r--src/corelib/serialization/qxmlutils.cpp15
-rw-r--r--src/corelib/serialization/qxmlutils_p.h2
-rw-r--r--src/xml/dom/qdom.cpp9
-rw-r--r--tests/auto/xml/dom/qdom/tst_qdom.cpp22
4 files changed, 37 insertions, 11 deletions
diff --git a/src/corelib/serialization/qxmlutils.cpp b/src/corelib/serialization/qxmlutils.cpp
index 74a0cf0c23..778e8de72d 100644
--- a/src/corelib/serialization/qxmlutils.cpp
+++ b/src/corelib/serialization/qxmlutils.cpp
@@ -235,13 +235,16 @@ bool QXmlUtils::isLetter(const QChar c)
\sa {http://www.w3.org/TR/REC-xml/#NT-Char},
{Extensible Markup Language (XML) 1.0 (Fourth Edition), [2] Char}
*/
-bool QXmlUtils::isChar(const QChar c)
+bool QXmlUtils::isChar(const char32_t c)
{
- return (c.unicode() >= 0x0020 && c.unicode() <= 0xD7FF)
- || c.unicode() == 0x0009
- || c.unicode() == 0x000A
- || c.unicode() == 0x000D
- || (c.unicode() >= 0xE000 && c.unicode() <= 0xFFFD);
+ // The valid range is defined by https://www.w3.org/TR/REC-xml/#NT-Char as following:
+ // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+ return (c >= 0x0020 && c <= 0xD7FF)
+ || c == 0x0009
+ || c == 0x000A
+ || c == 0x000D
+ || (c >= 0xE000 && c <= 0xFFFD)
+ || (c >= 0x10000 && c <= 0x10FFFF);
}
/*!
diff --git a/src/corelib/serialization/qxmlutils_p.h b/src/corelib/serialization/qxmlutils_p.h
index 2e709e8323..0ad1758979 100644
--- a/src/corelib/serialization/qxmlutils_p.h
+++ b/src/corelib/serialization/qxmlutils_p.h
@@ -33,7 +33,7 @@ class Q_CORE_EXPORT QXmlUtils
{
public:
static bool isEncName(QStringView encName);
- static bool isChar(const QChar c);
+ static bool isChar(const char32_t c);
static bool isNameChar(const QChar c);
static bool isLetter(const QChar c);
static bool isNCName(QStringView ncName);
diff --git a/src/xml/dom/qdom.cpp b/src/xml/dom/qdom.cpp
index 486894019c..e0e9a465ef 100644
--- a/src/xml/dom/qdom.cpp
+++ b/src/xml/dom/qdom.cpp
@@ -22,7 +22,7 @@
#include <qdebug.h>
#include <qxmlstream.h>
#include <private/qduplicatetracker_p.h>
-
+#include <private/qstringiterator_p.h>
#include <stdio.h>
#include <limits>
@@ -156,10 +156,11 @@ static QString fixedCharData(const QString &data, bool *ok)
}
QString result;
- for (int i = 0; i < data.size(); ++i) {
- QChar c = data.at(i);
+ QStringIterator it(data);
+ while (it.hasNext()) {
+ const char32_t c = it.next(QChar::Null);
if (QXmlUtils::isChar(c)) {
- result.append(c);
+ result.append(QChar::fromUcs4(c));
} else if (QDomImplementationPrivate::invalidDataPolicy == QDomImplementation::ReturnNullNode) {
*ok = false;
return QString();
diff --git a/tests/auto/xml/dom/qdom/tst_qdom.cpp b/tests/auto/xml/dom/qdom/tst_qdom.cpp
index e1c2b12ab5..f05020f61c 100644
--- a/tests/auto/xml/dom/qdom/tst_qdom.cpp
+++ b/tests/auto/xml/dom/qdom/tst_qdom.cpp
@@ -9,6 +9,7 @@
#include <QFile>
#include <QList>
#include <QRegularExpression>
+#include <QScopeGuard>
#include <QTextStream>
#include <QTest>
#include <QtXml>
@@ -62,6 +63,7 @@ private slots:
void invalidQualifiedName();
void invalidCharData_data();
void invalidCharData();
+ void nonBMPCharacters();
void roundTripAttributes() const;
void roundTripCDATA() const;
@@ -1342,6 +1344,10 @@ void tst_QDom::invalidCharData_data()
QTest::newRow( "f<o&o" ) << QString("f<o&o") << true << true << true << QString("f<o&o");
QTest::newRow( "empty" ) << QString() << true << true << true << QString();
QTest::newRow("f\\x07o\\x02")<< QString("f\x07o\x02")<< true << true << false << QString("fo");
+
+ const QChar pair[2] = { QChar(0xdc00), QChar(0xe000) };
+ QString invalid(pair, 2);
+ QTest::newRow("\\xdc00\\xe000") << invalid << true << true << false << invalid.last(1);
}
void tst_QDom::invalidCharData()
@@ -1385,6 +1391,22 @@ void tst_QDom::invalidCharData()
}
}
+void tst_QDom::nonBMPCharacters()
+{
+ const auto invalidDataPolicy = QDomImplementation::invalidDataPolicy();
+ auto resetInvalidDataPolicy = qScopeGuard(
+ [invalidDataPolicy] { QDomImplementation::setInvalidDataPolicy(invalidDataPolicy); });
+ QDomImplementation::setInvalidDataPolicy(QDomImplementation::DropInvalidChars);
+
+ const QString input = u"<text>Supplementary Plane: 𝄞 😂 🀄 🀶 🃪 🃋</text>"_qs;
+
+ QString errorMsg;
+ QDomDocument doc;
+ doc.setContent(input, &errorMsg);
+ QVERIFY(errorMsg.isEmpty());
+ QCOMPARE(doc.toString(-1), input);
+}
+
void tst_QDom::roundTripAttributes() const
{
/* Create an attribute via the QDom API with weird whitespace content. */