From 704c4d0e107969cbfde7ba35a1a3f332a2268773 Mon Sep 17 00:00:00 2001 From: Giuseppe D'Angelo Date: Thu, 6 Feb 2014 00:44:03 +0100 Subject: QUtfCodec: don't encode invalid UCS-4 codepoints The code didn't check for malformed surrogate pairs. That means that - high surrogates followed by *anything* were decoded as if they formed a valid surrogate pair; - stray low surrogates were returned as-is. We can't return surrogate values in UCS-4, so properly detect these cases and return U+FFFD instead. [ChangeLog][QtCore][QTextCodec] Encoding a QString in UTF-32 will now replace malformed UTF-16 subsequences in the string with the Unicode replacement character (U+FFFD). Change-Id: I5cd771d6aa21ffeff4dd9d9e5a7961cf692dc457 Reviewed-by: Thiago Macieira Reviewed-by: Konstantin Ritt --- src/corelib/codecs/qutfcodec.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index 20bacb1584..a5d16b0b54 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -46,6 +46,7 @@ #include "qchar.h" #include "private/qsimd_p.h" +#include "private/qstringiterator_p.h" QT_BEGIN_NAMESPACE @@ -503,21 +504,21 @@ QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conv } data += 4; } + + QStringIterator i(uc, uc + len); if (endian == BigEndianness) { - for (int i = 0; i < len; ++i) { - uint cp = uc[i].unicode(); - if (uc[i].isHighSurrogate() && i < len - 1) - cp = QChar::surrogateToUcs4(cp, uc[++i].unicode()); + while (i.hasNext()) { + uint cp = i.next(); + *(data++) = cp >> 24; *(data++) = (cp >> 16) & 0xff; *(data++) = (cp >> 8) & 0xff; *(data++) = cp & 0xff; } } else { - for (int i = 0; i < len; ++i) { - uint cp = uc[i].unicode(); - if (uc[i].isHighSurrogate() && i < len - 1) - cp = QChar::surrogateToUcs4(cp, uc[++i].unicode()); + while (i.hasNext()) { + uint cp = i.next(); + *(data++) = cp & 0xff; *(data++) = (cp >> 8) & 0xff; *(data++) = 
(cp >> 16) & 0xff; -- cgit v1.2.3