diff options
author | Giuseppe D'Angelo <giuseppe.dangelo@kdab.com> | 2014-02-03 16:54:49 +0100 |
---|---|---|
committer | The Qt Project <gerrit-noreply@qt-project.org> | 2014-02-07 15:00:36 +0100 |
commit | bcd1b7fe8ee0ab83f7838172c287557c94711602 (patch) | |
tree | 64d25bb31bd489cde934568bac4d44a3566458b2 | |
parent | cc14f85730813b9f0bb752627bab3c44803091a6 (diff) |
Fix QString::toUcs4 returning invalid data when encountering stray surrogates
Code units 0xD800 .. 0xDFFF are not UCS-4, so we can't happily return them.
Instead, if we encounter a stray surrogate, replace it with 0xFFFD, which
is what Unicode recommends anyhow.
References:
§3.9 Unicode Encoding Forms
D76: Unicode scalar value: Any Unicode code point except high-surrogate
and low surrogate code points.
As a result of this definition, the set of Unicode scalar values consists
of the ranges 0 to D7FF_16 and E000_16 to 10FFFF_16, inclusive.
[...]
UTF-32 encoding form: The Unicode encoding form that assigns each Unicode
scalar value to a single unsigned 32-bit code unit with the same numeric
value as the Unicode scalar value.
§ C.2 Encoding Forms in ISO/IEC 10646
UCS-4. UCS-4 stands for “Universal Character Set coded in 4 octets.” It is
now treated simply as a synonym for UTF-32, and is considered the canonical
form for representation of characters in 10646.
§ 3.9 Unicode Encoding Forms (Best Practices for Using U+FFFD)
and
§ 5.22 Best Practice for U+FFFD Substitution
Whenever an unconvertible offset is reached during conversion of a code
unit sequence:
1. The maximal subpart at that offset should be replaced by a single
U+FFFD.
2. The conversion should proceed at the offset immediately after the
maximal subpart.
[...]
Whenever an unconvertible offset is reached during conversion of a code
unit sequence to Unicode:
1. Find the longest code unit sequence that is the initial subsequence of
some sequence that could be converted. If there is such a sequence, replace
it with a single U+FFFD; otherwise replace a single code unit with a single
U+FFFD.
2. The conversion should proceed at the offset immediately after the
subsequence which has been replaced.
[ChangeLog][QtCore][QString] QString::toUcs4 now does not return invalid
UCS-4 code units belonging to the surrogate range (U+D800 to U+DFFF)
when the QString contains malformed UTF-16 data. Instead, U+FFFD
is returned in place of the malformed subsequence.
Change-Id: I19d7af03e749fea680fd5d9635439bc9d56558a9
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
Reviewed-by: Konstantin Ritt <ritt.ks@gmail.com>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
-rw-r--r-- | src/corelib/tools/qstring.cpp | 39 | ||||
-rw-r--r-- | tests/auto/corelib/tools/qstring/tst_qstring.cpp | 73 |
2 files changed, 88 insertions, 24 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 390a65aa23..2c6c5b7057 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -76,6 +76,7 @@ #include "qchar.cpp" #include "qstringmatcher.cpp" +#include "qstringiterator_p.h" #ifdef Q_OS_WIN # include <qt_windows.h> @@ -1325,21 +1326,13 @@ const QString::Null QString::null = { }; int QString::toUcs4_helper(const ushort *uc, int length, uint *out) { - int i = 0; - const ushort *const e = uc + length; - while (uc < e) { - uint u = *uc; - if (QChar::isHighSurrogate(u) && uc + 1 < e) { - ushort low = uc[1]; - if (QChar::isLowSurrogate(low)) { - ++uc; - u = QChar::surrogateToUcs4(u, low); - } - } - out[i++] = u; - ++uc; - } - return i; + int count = 0; + + QStringIterator i(reinterpret_cast<const QChar *>(uc), reinterpret_cast<const QChar *>(uc + length)); + while (i.hasNext()) + out[count++] = i.next(); + + return count; } /*! \fn int QString::toWCharArray(wchar_t *array) const @@ -4315,8 +4308,12 @@ QByteArray QString::toUtf8_helper(const QString &str) Returns a UCS-4/UTF-32 representation of the string as a QVector<uint>. - UCS-4 is a Unicode codec and is lossless. All characters from this string - can be encoded in UCS-4. The vector is not null terminated. + UCS-4 is a Unicode codec and therefore it is lossless. All characters from + this string will be encoded in UCS-4. Any invalid sequence of code units in + this string is replaced by the Unicode's replacement character + (QChar::ReplacementCharacter, which corresponds to \c{U+FFFD}). + + The returned vector is not NUL terminated. \sa fromUtf8(), toUtf8(), toLatin1(), toLocal8Bit(), QTextCodec, fromUcs4(), toWCharArray() */ @@ -9529,8 +9526,12 @@ QByteArray QStringRef::toUtf8() const Returns a UCS-4/UTF-32 representation of the string as a QVector<uint>. - UCS-4 is a Unicode codec and is lossless. All characters from this string - can be encoded in UCS-4. + UCS-4 is a Unicode codec and therefore it is lossless. All characters from + this string will be encoded in UCS-4. Any invalid sequence of code units in + this string is replaced by the Unicode's replacement character + (QChar::ReplacementCharacter, which corresponds to \c{U+FFFD}). + + The returned vector is not NUL terminated. \sa toUtf8(), toLatin1(), toLocal8Bit(), QTextCodec */ diff --git a/tests/auto/corelib/tools/qstring/tst_qstring.cpp b/tests/auto/corelib/tools/qstring/tst_qstring.cpp index 4ac2fb9fa0..0ee1595ecc 100644 --- a/tests/auto/corelib/tools/qstring/tst_qstring.cpp +++ b/tests/auto/corelib/tools/qstring/tst_qstring.cpp @@ -4019,15 +4019,78 @@ void tst_QString::fromUcs4() void tst_QString::toUcs4() { QString s; + QVector<uint> ucs4; QCOMPARE( s.toUcs4().size(), 0 ); - QChar bmp = QLatin1Char('a'); + static const QChar bmp = QLatin1Char('a'); s = QString(&bmp, 1); - QCOMPARE( s.toUcs4().size(), 1 ); + ucs4 = s.toUcs4(); + QCOMPARE( ucs4.size(), 1 ); + QCOMPARE( ucs4.at(0), 0x0061u ); + +#define QSTRING_FROM_QCHARARRAY(x) (QString((x), sizeof(x)/sizeof((x)[0]))) + + static const QChar smp[] = { QChar::highSurrogate(0x10000), QChar::lowSurrogate(0x10000) }; + s = QSTRING_FROM_QCHARARRAY(smp); + ucs4 = s.toUcs4(); + QCOMPARE( ucs4.size(), 1 ); + QCOMPARE( ucs4.at(0), 0x10000u ); + + static const QChar smp2[] = { QChar::highSurrogate(0x10000), QChar::lowSurrogate(0x10000), QChar::highSurrogate(0x10000), QChar::lowSurrogate(0x10000) }; + s = QSTRING_FROM_QCHARARRAY(smp2); + ucs4 = s.toUcs4(); + QCOMPARE( ucs4.size(), 2 ); + QCOMPARE( ucs4.at(0), 0x10000u ); + QCOMPARE( ucs4.at(1), 0x10000u ); + + static const QChar invalid_01[] = { QChar(0xd800) }; + s = QSTRING_FROM_QCHARARRAY(invalid_01); + ucs4 = s.toUcs4(); + QCOMPARE( ucs4.size(), 1 ); + QCOMPARE( ucs4.at(0), 0xFFFDu ); + + static const QChar invalid_02[] = { QChar(0xdc00) }; + s = QSTRING_FROM_QCHARARRAY(invalid_02); + ucs4 = s.toUcs4(); + QCOMPARE( ucs4.size(), 1 ); + QCOMPARE( ucs4.at(0), 0xFFFDu ); + + static const QChar invalid_03[] = { QLatin1Char('a'), QChar(0xd800), QLatin1Char('b') }; + s = QSTRING_FROM_QCHARARRAY(invalid_03); + ucs4 = s.toUcs4(); + QCOMPARE( ucs4.size(), 3 ); + QCOMPARE( ucs4.at(0), 0x0061u ); + QCOMPARE( ucs4.at(1), 0xFFFDu ); + QCOMPARE( ucs4.at(2), 0x0062u ); + + static const QChar invalid_04[] = { QLatin1Char('a'), QChar(0xdc00), QLatin1Char('b') }; + s = QSTRING_FROM_QCHARARRAY(invalid_04); + ucs4 = s.toUcs4(); + QCOMPARE( ucs4.size(), 3 ); + QCOMPARE( ucs4.at(0), 0x0061u ); + QCOMPARE( ucs4.at(1), 0xFFFDu ); + QCOMPARE( ucs4.at(2), 0x0062u ); + + static const QChar invalid_05[] = { QLatin1Char('a'), QChar(0xd800), QChar(0xd800), QLatin1Char('b') }; + s = QSTRING_FROM_QCHARARRAY(invalid_05); + ucs4 = s.toUcs4(); + QCOMPARE( ucs4.size(), 4 ); + QCOMPARE( ucs4.at(0), 0x0061u ); + QCOMPARE( ucs4.at(1), 0xFFFDu ); + QCOMPARE( ucs4.at(2), 0xFFFDu ); + QCOMPARE( ucs4.at(3), 0x0062u ); + + static const QChar invalid_06[] = { QLatin1Char('a'), QChar(0xdc00), QChar(0xdc00), QLatin1Char('b') }; + s = QSTRING_FROM_QCHARARRAY(invalid_06); + ucs4 = s.toUcs4(); + QCOMPARE( ucs4.size(), 4 ); + QCOMPARE( ucs4.at(0), 0x0061u ); + QCOMPARE( ucs4.at(1), 0xFFFDu ); + QCOMPARE( ucs4.at(2), 0xFFFDu ); + QCOMPARE( ucs4.at(3), 0x0062u ); + +#undef QSTRING_FROM_QCHARARRAY - QChar smp[] = { QChar::highSurrogate(0x10000), QChar::lowSurrogate(0x10000) }; - s = QString(smp, 2); - QCOMPARE( s.toUcs4().size(), 1 ); } void tst_QString::arg() |