From e20c4730192f312881591fb50e571af0a88fe421 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Mon, 5 Sep 2011 20:34:38 +0200 Subject: Move the UTF-8 data into a separate .cpp so I can use later This allows me to keep the UTF-8 invalid data in one safe place. I won't need to copy & paste it. Change-Id: Icb909d08b7f8d0e1ffbc28e01a0ba0c1fa9dccf0 Reviewed-by: David Faure --- tests/auto/corelib/codecs/utf8/tst_utf8.cpp | 117 +------------------- tests/auto/corelib/codecs/utf8/utf8.pro | 2 +- tests/auto/corelib/codecs/utf8/utf8data.cpp | 163 ++++++++++++++++++++++++++++ 3 files changed, 168 insertions(+), 114 deletions(-) create mode 100644 tests/auto/corelib/codecs/utf8/utf8data.cpp diff --git a/tests/auto/corelib/codecs/utf8/tst_utf8.cpp b/tests/auto/corelib/codecs/utf8/tst_utf8.cpp index dd6774e101..025bb349d6 100644 --- a/tests/auto/corelib/codecs/utf8/tst_utf8.cpp +++ b/tests/auto/corelib/codecs/utf8/tst_utf8.cpp @@ -206,88 +206,8 @@ void tst_Utf8::invalidUtf8_data() { QTest::addColumn("utf8"); - QTest::newRow("1char") << QByteArray("\x80"); - QTest::newRow("2chars-1") << QByteArray("\xC2\xC0"); - QTest::newRow("2chars-2") << QByteArray("\xC3\xDF"); - QTest::newRow("2chars-3") << QByteArray("\xC7\xF0"); - QTest::newRow("3chars-1") << QByteArray("\xE0\xA0\xC0"); - QTest::newRow("3chars-2") << QByteArray("\xE0\xC0\xA0"); - QTest::newRow("4chars-1") << QByteArray("\xF0\x90\x80\xC0"); - QTest::newRow("4chars-2") << QByteArray("\xF0\x90\xC0\x80"); - QTest::newRow("4chars-3") << QByteArray("\xF0\xC0\x80\x80"); - - // Surrogate pairs must now be present either - // U+D800: 1101 10 0000 00 0000 - // encoding: xxxz:1101 xz10:0000 xz00:0000 - QTest::newRow("hi-surrogate") << QByteArray("\xED\xA0\x80"); - // U+DC00: 1101 11 0000 00 0000 - // encoding: xxxz:1101 xz11:0000 xz00:0000 - QTest::newRow("lo-surrogate") << QByteArray("\xED\xB0\x80"); - - // not even in pair: - QTest::newRow("surrogate-pair") << QByteArray("\xED\xA0\x80\xED\xB0\x80"); - - // Characters outside the Unicode range: - // 0x110000: 00 0100 01 0000 00 0000 00 0000 - // encoding: xxxx:z100 xz01:0000 xz00:0000 xz00:0000 - QTest::newRow("non-unicode-1") << QByteArray("\xF4\x90\x80\x80"); - // 0x200000: 00 1000 00 0000 00 0000 00 0000 - // encoding: xxxx:xz00 xz00:1000 xz00:0000 xz00:0000 xz00:0000 - QTest::newRow("non-unicode-2") << QByteArray("\xF8\x88\x80\x80\x80"); - // 0x04000000: 0100 00 0000 00 0000 00 0000 00 0000 - // encoding: xxxx:xxz0 xz00:0100 xz00:0000 xz00:0000 xz00:0001 xz00:0001 - QTest::newRow("non-unicode-3") << QByteArray("\xFC\x84\x80\x80\x80\x80"); - // 0x7fffffff: 1 11 1111 11 1111 11 1111 11 1111 11 1111 - // encoding: xxxx:xxz0 xz00:0100 xz00:0000 xz00:0000 xz00:0001 xz00:0001 - QTest::newRow("non-unicode-4") << QByteArray("\xFD\xBF\xBF\xBF\xBF\xBF"); - - // As seen above, 0xFE and 0xFF never appear: - QTest::newRow("fe") << QByteArray("\xFE"); - QTest::newRow("fe-bis") << QByteArray("\xFE\xBF\xBF\xBF\xBF\xBF\xBF"); - QTest::newRow("ff") << QByteArray("\xFF"); - QTest::newRow("ff-bis") << QByteArray("\xFF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"); - - // some combinations in UTF-8 are invalid even though they have the proper bits set - // these are known as overlong sequences - - // "A": U+0041: 01 00 0001 - // overlong 2: xxz0:0001 xz00:0001 - QTest::newRow("overlong-1-2") << QByteArray("\xC1\x81"); - // overlong 3: xxxz:0000 xz00:0001 xz00:0001 - QTest::newRow("overlong-1-3") << QByteArray("\xE0\x81\x81"); - // overlong 4: xxxx:z000 xz00:0000 xz00:0001 xz00:0001 - QTest::newRow("overlong-1-4") << QByteArray("\xF0\x80\x81\x81"); - // overlong 5: xxxx:xz00 xz00:0000 xz00:0000 xz00:0001 xz00:0001 - QTest::newRow("overlong-1-5") << QByteArray("\xF8\x80\x80\x81\x81"); - // overlong 6: xxxx:xxz0 xz00:0000 xz00:0000 xz00:0000 xz00:0001 xz00:0001 - QTest::newRow("overlong-1-6") << QByteArray("\xFC\x80\x80\x80\x81\x81"); - - // NBSP: U+00A0: 10 00 0000 - // proper encoding: xxz0:0010 xz00:0000 - // overlong 3: xxxz:0000 xz00:0010 xz00:0000 - QTest::newRow("overlong-2-3") << QByteArray("\xC0\x82\x80"); - // overlong 4: xxxx:z000 xz00:0000 xz00:0010 xz00:0000 - QTest::newRow("overlong-2-4") << QByteArray("\xF0\x80\x82\x80"); - // overlong 5: xxxx:xz00 xz00:0000 xz00:0000 xz00:0010 xz00:0000 - QTest::newRow("overlong-2-5") << QByteArray("\xF8\x80\x80\x82\x80"); - // overlong 6: xxxx:xxz0 xz00:0000 xz00:0000 xz00:0000 xz00:0010 xz00:0000 - QTest::newRow("overlong-2-6") << QByteArray("\xFC\x80\x80\x80\x82\x80"); - - // U+0800: 10 0000 00 0000 - // proper encoding: xxxz:0000 xz10:0000 xz00:0000 - // overlong 4: xxxx:z000 xz00:0000 xz10:0000 xz00:0000 - QTest::newRow("overlong-3-4") << QByteArray("\xF0\x80\xA0\x80"); - // overlong 5: xxxx:xz00 xz00:0000 xz00:0000 xz10:0000 xz00:0000 - QTest::newRow("overlong-3-5") << QByteArray("\xF8\x80\x80\xA0\x80"); - // overlong 6: xxxx:xxz0 xz00:0000 xz00:0000 xz00:0000 xz10:0000 xz00:0000 - QTest::newRow("overlong-3-6") << QByteArray("\xFC\x80\x80\x80\xA0\x80"); - - // U+010000: 00 0100 00 0000 00 0000 - // proper encoding: xxxx:z000 xz00:0100 xz00:0000 xz00:0000 - // overlong 5: xxxx:xz00 xz00:0000 xz00:0100 xz00:0000 xz00:0000 - QTest::newRow("overlong-4-5") << QByteArray("\xF8\x80\x84\x80\x80"); - // overlong 6: xxxx:xxz0 xz00:0000 xz00:0000 xz00:0100 xz00:0000 xz00:0000 - QTest::newRow("overlong-4-6") << QByteArray("\xFC\x80\x80\x84\x80\x80"); + extern void loadInvalidUtf8Rows(); + loadInvalidUtf8Rows(); } void tst_Utf8::invalidUtf8() @@ -313,37 +233,8 @@ void tst_Utf8::nonCharacters_data() QTest::addColumn("utf8"); QTest::addColumn("utf16"); - // Unicode has a couple of "non-characters" that one can use internally, - // but are not allowed to be used for text interchange. - // - // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF, - // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and - // U+FDEF (inclusive) - - // U+FDD0 through U+FDEF - for (int i = 0; i < 16; ++i) { - char utf8[] = { char(0357), char(0267), char(0220 + i), 0 }; - QString utf16 = QChar(0xfdd0 + i); - QTest::newRow(qPrintable(QString::number(0xfdd0 + i, 16))) << QByteArray(utf8) << utf16; - } - - // the last two in Planes 1 through 16 - for (uint plane = 1; plane <= 16; ++plane) { - for (uint lower = 0xfffe; lower < 0x10000; ++lower) { - uint ucs4 = (plane << 16) | lower; - char utf8[] = { char(0xf0 | uchar(ucs4 >> 18)), - char(0x80 | (uchar(ucs4 >> 12) & 0x3f)), - char(0x80 | (uchar(ucs4 >> 6) & 0x3f)), - char(0x80 | (uchar(ucs4) & 0x3f)), - 0 }; - ushort utf16[] = { QChar::highSurrogate(ucs4), QChar::lowSurrogate(ucs4), 0 }; - - QTest::newRow(qPrintable(QString::number(ucs4, 16))) << QByteArray(utf8) << QString::fromUtf16(utf16); - } - } - - QTest::newRow("fffe") << QByteArray("\xEF\xBF\xBE") << QString(QChar(0xfffe)); - QTest::newRow("ffff") << QByteArray("\xEF\xBF\xBF") << QString(QChar(0xffff)); + extern void loadNonCharactersRows(); + loadNonCharactersRows(); } void tst_Utf8::nonCharacters() diff --git a/tests/auto/corelib/codecs/utf8/utf8.pro b/tests/auto/corelib/codecs/utf8/utf8.pro index 2d200e82a4..1df6ac7e50 100644 --- a/tests/auto/corelib/codecs/utf8/utf8.pro +++ b/tests/auto/corelib/codecs/utf8/utf8.pro @@ -1,5 +1,5 @@ CONFIG += testcase TARGET = tst_utf8 QT = core testlib -SOURCES += tst_utf8.cpp +SOURCES += tst_utf8.cpp utf8data.cpp CONFIG += parallel_test diff --git a/tests/auto/corelib/codecs/utf8/utf8data.cpp b/tests/auto/corelib/codecs/utf8/utf8data.cpp new file mode 100644 index 0000000000..2c0bae3e5d --- /dev/null +++ b/tests/auto/corelib/codecs/utf8/utf8data.cpp @@ -0,0 +1,163 @@ +/**************************************************************************** +** +** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies). +** Contact: http://www.qt-project.org/ +** +** This file is part of the test suite of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** GNU Lesser General Public License Usage +** This file may be used under the terms of the GNU Lesser General Public +** License version 2.1 as published by the Free Software Foundation and +** appearing in the file LICENSE.LGPL included in the packaging of this +** file. Please review the following information to ensure the GNU Lesser +** General Public License version 2.1 requirements will be met: +** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU General +** Public License version 3.0 as published by the Free Software Foundation +** and appearing in the file LICENSE.GPL included in the packaging of this +** file. Please review the following information to ensure the GNU General +** Public License version 3.0 requirements will be met: +** http://www.gnu.org/copyleft/gpl.html. +** +** Other Usage +** Alternatively, this file may be used in accordance with the terms and +** conditions contained in a signed written agreement between you and Nokia. +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ +#include + +void loadInvalidUtf8Rows() +{ + QTest::newRow("1char") << QByteArray("\x80"); + QTest::newRow("2chars-1") << QByteArray("\xC2\xC0"); + QTest::newRow("2chars-2") << QByteArray("\xC3\xDF"); + QTest::newRow("2chars-3") << QByteArray("\xC7\xF0"); + QTest::newRow("3chars-1") << QByteArray("\xE0\xA0\xC0"); + QTest::newRow("3chars-2") << QByteArray("\xE0\xC0\xA0"); + QTest::newRow("4chars-1") << QByteArray("\xF0\x90\x80\xC0"); + QTest::newRow("4chars-2") << QByteArray("\xF0\x90\xC0\x80"); + QTest::newRow("4chars-3") << QByteArray("\xF0\xC0\x80\x80"); + + // Surrogate pairs must now be present either + // U+D800: 1101 10 0000 00 0000 + // encoding: xxxz:1101 xz10:0000 xz00:0000 + QTest::newRow("hi-surrogate") << QByteArray("\xED\xA0\x80"); + // U+DC00: 1101 11 0000 00 0000 + // encoding: xxxz:1101 xz11:0000 xz00:0000 + QTest::newRow("lo-surrogate") << QByteArray("\xED\xB0\x80"); + + // not even in pair: + QTest::newRow("surrogate-pair") << QByteArray("\xED\xA0\x80\xED\xB0\x80"); + + // Characters outside the Unicode range: + // 0x110000: 00 0100 01 0000 00 0000 00 0000 + // encoding: xxxx:z100 xz01:0000 xz00:0000 xz00:0000 + QTest::newRow("non-unicode-1") << QByteArray("\xF4\x90\x80\x80"); + // 0x200000: 00 1000 00 0000 00 0000 00 0000 + // encoding: xxxx:xz00 xz00:1000 xz00:0000 xz00:0000 xz00:0000 + QTest::newRow("non-unicode-2") << QByteArray("\xF8\x88\x80\x80\x80"); + // 0x04000000: 0100 00 0000 00 0000 00 0000 00 0000 + // encoding: xxxx:xxz0 xz00:0100 xz00:0000 xz00:0000 xz00:0001 xz00:0001 + QTest::newRow("non-unicode-3") << QByteArray("\xFC\x84\x80\x80\x80\x80"); + // 0x7fffffff: 1 11 1111 11 1111 11 1111 11 1111 11 1111 + // encoding: xxxx:xxz0 xz00:0100 xz00:0000 xz00:0000 xz00:0001 xz00:0001 + QTest::newRow("non-unicode-4") << QByteArray("\xFD\xBF\xBF\xBF\xBF\xBF"); + + // As seen above, 0xFE and 0xFF never appear: + QTest::newRow("fe") << QByteArray("\xFE"); + QTest::newRow("fe-bis") << QByteArray("\xFE\xBF\xBF\xBF\xBF\xBF\xBF"); + QTest::newRow("ff") << QByteArray("\xFF"); + QTest::newRow("ff-bis") << QByteArray("\xFF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"); + + // some combinations in UTF-8 are invalid even though they have the proper bits set + // these are known as overlong sequences + + // "A": U+0041: 01 00 0001 + // overlong 2: xxz0:0001 xz00:0001 + QTest::newRow("overlong-1-2") << QByteArray("\xC1\x81"); + // overlong 3: xxxz:0000 xz00:0001 xz00:0001 + QTest::newRow("overlong-1-3") << QByteArray("\xE0\x81\x81"); + // overlong 4: xxxx:z000 xz00:0000 xz00:0001 xz00:0001 + QTest::newRow("overlong-1-4") << QByteArray("\xF0\x80\x81\x81"); + // overlong 5: xxxx:xz00 xz00:0000 xz00:0000 xz00:0001 xz00:0001 + QTest::newRow("overlong-1-5") << QByteArray("\xF8\x80\x80\x81\x81"); + // overlong 6: xxxx:xxz0 xz00:0000 xz00:0000 xz00:0000 xz00:0001 xz00:0001 + QTest::newRow("overlong-1-6") << QByteArray("\xFC\x80\x80\x80\x81\x81"); + + // NBSP: U+00A0: 10 00 0000 + // proper encoding: xxz0:0010 xz00:0000 + // overlong 3: xxxz:0000 xz00:0010 xz00:0000 + QTest::newRow("overlong-2-3") << QByteArray("\xC0\x82\x80"); + // overlong 4: xxxx:z000 xz00:0000 xz00:0010 xz00:0000 + QTest::newRow("overlong-2-4") << QByteArray("\xF0\x80\x82\x80"); + // overlong 5: xxxx:xz00 xz00:0000 xz00:0000 xz00:0010 xz00:0000 + QTest::newRow("overlong-2-5") << QByteArray("\xF8\x80\x80\x82\x80"); + // overlong 6: xxxx:xxz0 xz00:0000 xz00:0000 xz00:0000 xz00:0010 xz00:0000 + QTest::newRow("overlong-2-6") << QByteArray("\xFC\x80\x80\x80\x82\x80"); + + // U+0800: 10 0000 00 0000 + // proper encoding: xxxz:0000 xz10:0000 xz00:0000 + // overlong 4: xxxx:z000 xz00:0000 xz10:0000 xz00:0000 + QTest::newRow("overlong-3-4") << QByteArray("\xF0\x80\xA0\x80"); + // overlong 5: xxxx:xz00 xz00:0000 xz00:0000 xz10:0000 xz00:0000 + QTest::newRow("overlong-3-5") << QByteArray("\xF8\x80\x80\xA0\x80"); + // overlong 6: xxxx:xxz0 xz00:0000 xz00:0000 xz00:0000 xz10:0000 xz00:0000 + QTest::newRow("overlong-3-6") << QByteArray("\xFC\x80\x80\x80\xA0\x80"); + + // U+010000: 00 0100 00 0000 00 0000 + // proper encoding: xxxx:z000 xz00:0100 xz00:0000 xz00:0000 + // overlong 5: xxxx:xz00 xz00:0000 xz00:0100 xz00:0000 xz00:0000 + QTest::newRow("overlong-4-5") << QByteArray("\xF8\x80\x84\x80\x80"); + // overlong 6: xxxx:xxz0 xz00:0000 xz00:0000 xz00:0100 xz00:0000 xz00:0000 + QTest::newRow("overlong-4-6") << QByteArray("\xFC\x80\x80\x84\x80\x80"); + +} + +void loadNonCharactersRows() +{ + // Unicode has a couple of "non-characters" that one can use internally, + // but are not allowed to be used for text interchange. + // + // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF, + // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and + // U+FDEF (inclusive) + + // U+FDD0 through U+FDEF + for (int i = 0; i < 16; ++i) { + char utf8[] = { char(0357), char(0267), char(0220 + i), 0 }; + QString utf16 = QChar(0xfdd0 + i); + QTest::newRow(qPrintable(QString::number(0xfdd0 + i, 16))) << QByteArray(utf8) << utf16; + } + + // the last two in Planes 1 through 16 + for (uint plane = 1; plane <= 16; ++plane) { + for (uint lower = 0xfffe; lower < 0x10000; ++lower) { + uint ucs4 = (plane << 16) | lower; + char utf8[] = { char(0xf0 | uchar(ucs4 >> 18)), + char(0x80 | (uchar(ucs4 >> 12) & 0x3f)), + char(0x80 | (uchar(ucs4 >> 6) & 0x3f)), + char(0x80 | (uchar(ucs4) & 0x3f)), + 0 }; + ushort utf16[] = { QChar::highSurrogate(ucs4), QChar::lowSurrogate(ucs4), 0 }; + + QTest::newRow(qPrintable(QString::number(ucs4, 16))) << QByteArray(utf8) << QString::fromUtf16(utf16); + } + } + + QTest::newRow("fffe") << QByteArray("\xEF\xBF\xBE") << QString(QChar(0xfffe)); + QTest::newRow("ffff") << QByteArray("\xEF\xBF\xBF") << QString(QChar(0xffff)); +} -- cgit v1.2.3