diff options
author | Kurt Pattyn <pattyn.kurt@gmail.com> | 2013-10-06 11:40:47 +0200 |
---|---|---|
committer | The Qt Project <gerrit-noreply@qt-project.org> | 2013-10-17 09:50:58 +0200 |
commit | add2bf739ae96603cb919b908cbb53c00d0628cc (patch) | |
tree | 9702a95d145fc9f429aa6f2ec104cfab75cae753 /tests/auto/corelib/codecs | |
parent | e8853506bf82e569009e68a23437d6a134176f63 (diff) |
Allow non-character codes in utf8 strings
Changed the processing of non-character code handling in the UTF8 codec.
Non-character codes are now accepted in QStrings, QUrls and QJson strings.
Unit tests were adapted accordingly.
For more info about non-character codes,
see: http://www.unicode.org/versions/corrigendum9.html
[ChangeLog][QtCore][QUtf8]
UTF-8 now accepts non-character unicode points; these are not replaced
by the replacement character anymore
[ChangeLog][QtCore][QUrl]
QUrl now fully accepts non-character unicode points; they are encoded as
percent characters; they can also be pretty decoded
[ChangeLog][QtCore][QJson]
The Writer and the Parser now fully accept non-character unicode points.
Change-Id: I77cf4f0e6210741eac8082912a0b6118eced4f77
Task-number: QTBUG-33229
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'tests/auto/corelib/codecs')
-rw-r--r-- | tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp | 54 | ||||
-rw-r--r-- | tests/auto/corelib/codecs/utf8/tst_utf8.cpp | 20 | ||||
-rw-r--r-- | tests/auto/corelib/codecs/utf8/utf8data.cpp | 4 |
3 files changed, 39 insertions, 39 deletions
diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp index dd557b8d21..8e1b3cf3b2 100644 --- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp +++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp @@ -66,9 +66,9 @@ private slots: void codecForLocale(); void asciiToIscii() const; - void flagCodepointFFFF() const; + void nonFlaggedCodepointFFFF() const; void flagF7808080() const; - void flagEFBFBF() const; + void nonFlaggedEFBFBF() const; void decode0D() const; void aliasForUTF16() const; void mibForTSCII() const; @@ -409,9 +409,9 @@ void tst_QTextCodec::asciiToIscii() const } } -void tst_QTextCodec::flagCodepointFFFF() const +void tst_QTextCodec::nonFlaggedCodepointFFFF() const { - // This is an invalid Unicode codepoint. + //Check that the code point 0xFFFF (=non-character code 0xEFBFBF) is not flagged const QChar ch(0xFFFF); QString input(ch); @@ -419,12 +419,11 @@ void tst_QTextCodec::flagCodepointFFFF() const QVERIFY(codec); const QByteArray asDecoded(codec->fromUnicode(input)); - QCOMPARE(asDecoded, QByteArray("?")); + QCOMPARE(asDecoded, QByteArray("\357\277\277")); QByteArray ffff("\357\277\277"); QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull); - QVERIFY(codec->toUnicode(ffff.constData(), ffff.length(), &state) == QChar(0)); - QVERIFY(codec->toUnicode(ffff) == QChar(0xfffd)); + QVERIFY(codec->toUnicode(ffff.constData(), ffff.length(), &state) == QByteArray::fromHex("EFBFBF")); } void tst_QTextCodec::flagF7808080() const @@ -460,13 +459,16 @@ void tst_QTextCodec::flagF7808080() const QVERIFY(codec->toUnicode(input.constData(), input.length(), &state) == QChar(0)); } -void tst_QTextCodec::flagEFBFBF() const +void tst_QTextCodec::nonFlaggedEFBFBF() const { - QByteArray invalidInput; - invalidInput.resize(3); - invalidInput[0] = char(0xEF); - invalidInput[1] = char(0xBF); - invalidInput[2] = char(0xBF); + /* Check that the codec does NOT flag EFBFBF. + * This is a regression test; see QTBUG-33229 + */ + QByteArray validInput; + validInput.resize(3); + validInput[0] = char(0xEF); + validInput[1] = char(0xBF); + validInput[2] = char(0xBF); const QTextCodec *const codec = QTextCodec::codecForMib(106); // UTF-8 QVERIFY(codec); @@ -474,21 +476,20 @@ void tst_QTextCodec::flagEFBFBF() const { //QVERIFY(!codec->canEncode(QChar(0xFFFF))); QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull); - QVERIFY(codec->toUnicode(invalidInput.constData(), invalidInput.length(), &state) == QChar(0)); + QVERIFY(codec->toUnicode(validInput.constData(), validInput.length(), &state) == QByteArray::fromHex("EFBFBF")); QByteArray start("<?pi "); - start.append(invalidInput); + start.append(validInput); start.append("?>"); } - /* When 0xEFBFBF is preceded by what seems to be an arbitrary character, - * QTextCodec fails to flag it. */ + // Check that 0xEFBFBF is correctly decoded when preceded by an arbitrary character { QByteArray start("B"); - start.append(invalidInput); + start.append(validInput); QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull); - QVERIFY(codec->toUnicode(start.constData(), start.length(), &state) == QString::fromLatin1("B\0", 2)); + QVERIFY(codec->toUnicode(start.constData(), start.length(), &state) == QByteArray("B").append(QByteArray::fromHex("EFBFBF"))); } } @@ -674,13 +675,12 @@ void tst_QTextCodec::utf8Codec_data() str = QChar(0x7ff); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.2") << utf8 << str << -1; - // 2.2.3 U+000FFFF + // 2.2.3 U+000FFFF - non-character code utf8.clear(); utf8 += char(0xef); utf8 += char(0xbf); utf8 += char(0xbf); - str.clear(); - str += QChar::ReplacementCharacter; + str = QString::fromUtf8(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.3") << utf8 << str << -1; // 2.2.4 U+001FFFFF @@ -1535,20 +1535,22 @@ void tst_QTextCodec::utf8Codec_data() str += QChar(QChar::ReplacementCharacter); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.8") << utf8 << str << -1; - // 5.3.1 + // 5.3.1 - non-character code utf8.clear(); utf8 += char(0xef); utf8 += char(0xbf); utf8 += char(0xbe); - str = QChar(QChar::ReplacementCharacter); + //str = QChar(QChar::ReplacementCharacter); + str = QString::fromUtf8(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.1") << utf8 << str << -1; - // 5.3.2 + // 5.3.2 - non-character code utf8.clear(); utf8 += char(0xef); utf8 += char(0xbf); utf8 += char(0xbf); - str = QChar(QChar::ReplacementCharacter); + //str = QChar(QChar::ReplacementCharacter); + str = QString::fromUtf8(utf8); QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.2") << utf8 << str << -1; } diff --git a/tests/auto/corelib/codecs/utf8/tst_utf8.cpp b/tests/auto/corelib/codecs/utf8/tst_utf8.cpp index 99147f3aff..e18f6f73b9 100644 --- a/tests/auto/corelib/codecs/utf8/tst_utf8.cpp +++ b/tests/auto/corelib/codecs/utf8/tst_utf8.cpp @@ -233,8 +233,9 @@ void tst_Utf8::nonCharacters_data() QTest::addColumn<QByteArray>("utf8"); QTest::addColumn<QString>("utf16"); - // Unicode has a couple of "non-characters" that one can use internally, - // but are not allowed to be used for text interchange. + // Unicode has a couple of "non-characters" that one can use internally + // These characters may be used for interchange; + // see: http://www.unicode.org/versions/corrigendum9.html // // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF, // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and @@ -279,20 +280,17 @@ void tst_Utf8::nonCharacters() decoder->toUnicode(utf8); // Only enforce correctness on our UTF-8 decoder - // The system's UTF-8 codec is sometimes buggy - // GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8 - // OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF if (!useLocale) - QVERIFY(decoder->hasFailure()); - else if (!decoder->hasFailure()) - qWarning("System codec does not report failure when it should. Should report bug upstream."); + QVERIFY(!decoder->hasFailure()); + else if (decoder->hasFailure()) + qWarning("System codec reports failure when it shouldn't. Should report bug upstream."); QSharedPointer<QTextEncoder> encoder(codec->makeEncoder()); encoder->fromUnicode(utf16); if (!useLocale) - QVERIFY(encoder->hasFailure()); - else if (!encoder->hasFailure()) - qWarning("System codec does not report failure when it should. Should report bug upstream."); + QVERIFY(!encoder->hasFailure()); + else if (encoder->hasFailure()) + qWarning("System codec reports failure when it shouldn't. Should report bug upstream."); } QTEST_MAIN(tst_Utf8) diff --git a/tests/auto/corelib/codecs/utf8/utf8data.cpp b/tests/auto/corelib/codecs/utf8/utf8data.cpp index 2516cc9734..a41b0772e6 100644 --- a/tests/auto/corelib/codecs/utf8/utf8data.cpp +++ b/tests/auto/corelib/codecs/utf8/utf8data.cpp @@ -129,8 +129,8 @@ void loadInvalidUtf8Rows() void loadNonCharactersRows() { - // Unicode has a couple of "non-characters" that one can use internally, - // but are not allowed to be used for text interchange. + // Unicode has a couple of "non-characters" that one can use internally + // These characters are allowed for text-interchange (see http://www.unicode.org/versions/corrigendum9.html) // // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF, // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and |