From 4f02ca5c9458299185d447da87058d6e8ca1b260 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Fri, 5 Mar 2010 13:12:43 +0100 Subject: Finish reverting the UTF-8 hack added in 80ea01c6 (P4 106704, Qt 3.2). This code was removed from QString in 539cd1e5 (P4 259474, Qt 4.3), but apparently lingered on the UTF-8 codec code. Reviewed-by: Denis Dzyubenko --- src/corelib/codecs/qutfcodec.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'src/corelib/codecs') diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index 7655c510ae..742b2e78b2 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -107,15 +107,8 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve *cursor++ = 0xc0 | ((uchar) (u >> 6)); } else { if (u > 0xffff) { - // see QString::fromUtf8() and QString::utf8() for explanations - if (u > 0x10fe00 && u < 0x10ff00) { - *cursor++ = (u - 0x10fe00); - ++ch; - continue; - } else { - *cursor++ = 0xf0 | ((uchar) (u >> 18)); - *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f); - } + *cursor++ = 0xf0 | ((uchar) (u >> 18)); + *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f); } else { *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f); } -- cgit v1.2.3 From 8b30b72948d5f16d59f8799f3889f563e2c6e24d Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Fri, 5 Mar 2010 14:10:57 +0100 Subject: Make the UTF-8 encoder/decoder not accept Unicode non-characters Reviewed-By: Denis Dzyubenko --- src/corelib/codecs/qutfcodec.cpp | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'src/corelib/codecs') diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index 742b2e78b2..233bd8fd5e 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -48,6 +48,19 @@ QT_BEGIN_NAMESPACE enum { Endian = 0, Data = 1 }; +static inline bool isUnicodeNonCharacter(uint ucs4) +{ + // Unicode has a couple of "non-characters" that one can use internally, + // but are not allowed to be used for text interchange. + // + // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF, + // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and + // U+FDEF (inclusive) + + return (ucs4 & 0xfffe) == 0xfffe + || (ucs4 - 0xfdd0U) < 16; +} + QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state) { uchar replacement = '?'; @@ -106,6 +119,14 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve if (u < 0x0800) { *cursor++ = 0xc0 | ((uchar) (u >> 6)); } else { + // is it one of the Unicode non-characters? + if (isUnicodeNonCharacter(u)) { + *cursor = replacement; + ++ch; + ++invalid; + continue; + } + if (u > 0xffff) { *cursor++ = 0xf0 | ((uchar) (u >> 18)); *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f); @@ -172,15 +193,16 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte --need; if (!need) { // utf-8 bom composes into 0xfeff code point + bool nonCharacter; if (!headerdone && uc == 0xfeff) { // dont do anything, just skip the BOM - } else if (uc > 0xffff && uc < 0x110000) { + } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && uc > 0xffff && uc < 0x110000) { // surrogate pair Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); *qch++ = QChar::highSurrogate(uc); *qch++ = QChar::lowSurrogate(uc); - } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || (uc >= 0xfffe)) { - // error: overlong sequence, UTF16 surrogate or BOM + } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) { + // error: overlong sequence, UTF16 surrogate or non-character *qch++ = replacement; ++invalid; } else { -- cgit v1.2.3 From 2e8ee73bd681cfe5898b8c054b0cb4df51663c91 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Mon, 8 Mar 2010 14:59:56 +0100 Subject: Autotest: Fix failing QTextCodec tests --- src/corelib/codecs/qutfcodec.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/corelib/codecs') diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index 233bd8fd5e..f747bf754f 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -121,7 +121,7 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve } else { // is it one of the Unicode non-characters? if (isUnicodeNonCharacter(u)) { - *cursor = replacement; + *cursor++ = replacement; ++ch; ++invalid; continue; -- cgit v1.2.3