Allow non-character codes in utf8 strings

Changed how the UTF-8 codec handles non-character codes: they are now accepted in QStrings, QUrls and QJson strings. Unit tests were adapted accordingly. For more information about non-character codes, see: http://www.unicode.org/versions/corrigendum9.html [ChangeLog][QtCore][QUtf8] UTF-8 now accepts non-character Unicode code points; these are no longer replaced by the replacement character. [ChangeLog][QtCore][QUrl] QUrl now fully accepts non-character Unicode code points; they are percent-encoded and can also be pretty-decoded. [ChangeLog][QtCore][QJson] The Writer and the Parser now fully accept non-character Unicode code points. Change-Id: I77cf4f0e6210741eac8082912a0b6118eced4f77 Task-number: QTBUG-33229 Reviewed-by: Lars Knoll <lars.knoll@digia.com> Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Kurt Pattyn <pattyn.kurt@gmail.com> 2013-10-06 11:40:47 +0200
committer: The Qt Project <gerrit-noreply@qt-project.org> 2013-10-17 09:50:58 +0200
commit: add2bf739ae96603cb919b908cbb53c00d0628cc (patch)
tree: 9702a95d145fc9f429aa6f2ec104cfab75cae753 /tests/auto/corelib/codecs
parent: e8853506bf82e569009e68a23437d6a134176f63 (diff)
3 files changed, 39 insertions, 39 deletions
diff --git a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
index dd557b8d21..8e1b3cf3b2 100644
--- a/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
+++ b/tests/auto/corelib/codecs/qtextcodec/tst_qtextcodec.cpp
@@ -66,9 +66,9 @@ private slots:
     void codecForLocale();
 
     void asciiToIscii() const;
-    void flagCodepointFFFF() const;
+    void nonFlaggedCodepointFFFF() const;
     void flagF7808080() const;
-    void flagEFBFBF() const;
+    void nonFlaggedEFBFBF() const;
     void decode0D() const;
     void aliasForUTF16() const;
     void mibForTSCII() const;
@@ -409,9 +409,9 @@ void tst_QTextCodec::asciiToIscii() const
     }
 }
 
-void tst_QTextCodec::flagCodepointFFFF() const
+void tst_QTextCodec::nonFlaggedCodepointFFFF() const
 {
-    // This is an invalid Unicode codepoint.
+    // Check that the non-character code point U+FFFF (UTF-8 encoded as 0xEF 0xBF 0xBF) is not flagged
     const QChar ch(0xFFFF);
     QString input(ch);
 
@@ -419,12 +419,11 @@ void tst_QTextCodec::flagCodepointFFFF() const
     QVERIFY(codec);
 
     const QByteArray asDecoded(codec->fromUnicode(input));
-    QCOMPARE(asDecoded, QByteArray("?"));
+    QCOMPARE(asDecoded, QByteArray("\357\277\277"));
 
     QByteArray ffff("\357\277\277");
     QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
-    QVERIFY(codec->toUnicode(ffff.constData(), ffff.length(), &state) == QChar(0));
-    QVERIFY(codec->toUnicode(ffff) == QChar(0xfffd));
+    QVERIFY(codec->toUnicode(ffff.constData(), ffff.length(), &state) == QByteArray::fromHex("EFBFBF"));
 }
 
 void tst_QTextCodec::flagF7808080() const
@@ -460,13 +459,16 @@ void tst_QTextCodec::flagF7808080() const
     QVERIFY(codec->toUnicode(input.constData(), input.length(), &state) == QChar(0));
 }
 
-void tst_QTextCodec::flagEFBFBF() const
+void tst_QTextCodec::nonFlaggedEFBFBF() const
 {
-    QByteArray invalidInput;
-    invalidInput.resize(3);
-    invalidInput[0] = char(0xEF);
-    invalidInput[1] = char(0xBF);
-    invalidInput[2] = char(0xBF);
+    /* Check that the codec does NOT flag EFBFBF.
+     * This is a regression test; see QTBUG-33229
+     */
+    QByteArray validInput;
+    validInput.resize(3);
+    validInput[0] = char(0xEF);
+    validInput[1] = char(0xBF);
+    validInput[2] = char(0xBF);
 
     const QTextCodec *const codec = QTextCodec::codecForMib(106); // UTF-8
     QVERIFY(codec);
@@ -474,21 +476,20 @@ void tst_QTextCodec::flagEFBFBF() const
     {
         //QVERIFY(!codec->canEncode(QChar(0xFFFF)));
         QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
-        QVERIFY(codec->toUnicode(invalidInput.constData(), invalidInput.length(), &state) == QChar(0));
+        QVERIFY(codec->toUnicode(validInput.constData(), validInput.length(), &state) == QByteArray::fromHex("EFBFBF"));
 
         QByteArray start("<?pi ");
-        start.append(invalidInput);
+        start.append(validInput);
         start.append("?>");
     }
 
-    /* When 0xEFBFBF is preceded by what seems to be an arbitrary character,
-     * QTextCodec fails to flag it. */
+    // Check that 0xEFBFBF is correctly decoded when preceded by an arbitrary character
     {
         QByteArray start("B");
-        start.append(invalidInput);
+        start.append(validInput);
 
         QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
-        QVERIFY(codec->toUnicode(start.constData(), start.length(), &state) == QString::fromLatin1("B\0", 2));
+        QVERIFY(codec->toUnicode(start.constData(), start.length(), &state) == QByteArray("B").append(QByteArray::fromHex("EFBFBF")));
     }
 }
 
@@ -674,13 +675,12 @@ void tst_QTextCodec::utf8Codec_data()
     str = QChar(0x7ff);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.2") << utf8 << str << -1;
 
-    // 2.2.3 U+000FFFF
+    // 2.2.3 U+000FFFF - non-character code
     utf8.clear();
     utf8 += char(0xef);
     utf8 += char(0xbf);
     utf8 += char(0xbf);
-    str.clear();
-    str += QChar::ReplacementCharacter;
+    str = QString::fromUtf8(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.3") << utf8 << str << -1;
 
     // 2.2.4 U+001FFFFF
@@ -1535,20 +1535,22 @@ void tst_QTextCodec::utf8Codec_data()
     str += QChar(QChar::ReplacementCharacter);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.8") << utf8 << str << -1;
 
-    // 5.3.1
+    // 5.3.1 - non-character code
     utf8.clear();
     utf8 += char(0xef);
     utf8 += char(0xbf);
     utf8 += char(0xbe);
-    str = QChar(QChar::ReplacementCharacter);
+    //str = QChar(QChar::ReplacementCharacter);
+    str = QString::fromUtf8(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.1") << utf8 << str << -1;
 
-    // 5.3.2
+    // 5.3.2 - non-character code
     utf8.clear();
     utf8 += char(0xef);
     utf8 += char(0xbf);
     utf8 += char(0xbf);
-    str = QChar(QChar::ReplacementCharacter);
+    //str = QChar(QChar::ReplacementCharacter);
+    str = QString::fromUtf8(utf8);
     QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.2") << utf8 << str << -1;
 }
 
diff --git a/tests/auto/corelib/codecs/utf8/tst_utf8.cpp b/tests/auto/corelib/codecs/utf8/tst_utf8.cpp
index 99147f3aff..e18f6f73b9 100644
--- a/tests/auto/corelib/codecs/utf8/tst_utf8.cpp
+++ b/tests/auto/corelib/codecs/utf8/tst_utf8.cpp
@@ -233,8 +233,9 @@ void tst_Utf8::nonCharacters_data()
     QTest::addColumn<QByteArray>("utf8");
     QTest::addColumn<QString>("utf16");
 
-    // Unicode has a couple of "non-characters" that one can use internally,
-    // but are not allowed to be used for text interchange.
+    // Unicode has a couple of "non-characters" that one can use internally.
+    // These characters may also be used for text interchange;
+    // see: http://www.unicode.org/versions/corrigendum9.html
     //
     // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF,
     // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
@@ -279,20 +280,17 @@ void tst_Utf8::nonCharacters()
     decoder->toUnicode(utf8);
 
     // Only enforce correctness on our UTF-8 decoder
-    // The system's UTF-8 codec is sometimes buggy
-    //  GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8
-    //  OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF
     if (!useLocale)
-        QVERIFY(decoder->hasFailure());
-    else if (!decoder->hasFailure())
-        qWarning("System codec does not report failure when it should. Should report bug upstream.");
+        QVERIFY(!decoder->hasFailure());
+    else if (decoder->hasFailure())
+        qWarning("System codec reports failure when it shouldn't. Should report bug upstream.");
 
     QSharedPointer<QTextEncoder> encoder(codec->makeEncoder());
     encoder->fromUnicode(utf16);
     if (!useLocale)
-        QVERIFY(encoder->hasFailure());
-    else if (!encoder->hasFailure())
-        qWarning("System codec does not report failure when it should. Should report bug upstream.");
+        QVERIFY(!encoder->hasFailure());
+    else if (encoder->hasFailure())
+        qWarning("System codec reports failure when it shouldn't. Should report bug upstream.");
 }
 
 QTEST_MAIN(tst_Utf8)
diff --git a/tests/auto/corelib/codecs/utf8/utf8data.cpp b/tests/auto/corelib/codecs/utf8/utf8data.cpp
index 2516cc9734..a41b0772e6 100644
--- a/tests/auto/corelib/codecs/utf8/utf8data.cpp
+++ b/tests/auto/corelib/codecs/utf8/utf8data.cpp
@@ -129,8 +129,8 @@ void loadInvalidUtf8Rows()
 
 void loadNonCharactersRows()
 {
-    // Unicode has a couple of "non-characters" that one can use internally,
-    // but are not allowed to be used for text interchange.
+    // Unicode has a couple of "non-characters" that one can use internally.
+    // These characters are also allowed for text interchange (see http://www.unicode.org/versions/corrigendum9.html).
     //
     // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF,
     // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
author	Kurt Pattyn <pattyn.kurt@gmail.com>	2013-10-06 11:40:47 +0200
committer	The Qt Project <gerrit-noreply@qt-project.org>	2013-10-17 09:50:58 +0200
commit	add2bf739ae96603cb919b908cbb53c00d0628cc (patch)
tree	9702a95d145fc9f429aa6f2ec104cfab75cae753 /tests/auto/corelib/codecs
parent	e8853506bf82e569009e68a23437d6a134176f63 (diff)