From 1823c8f2ddd0a5c1b4301e7af7109796090a3c9a Mon Sep 17 00:00:00 2001 From: Erik Verbruggen Date: Wed, 16 Dec 2015 14:04:27 +0100 Subject: Fix utf8->utf16 BOM/ZWNBSP decoding. When the byte sequence for a BOM occurs in the middle of a utf8 stream, it is a ZWNBSP. When a ZWNBSP occurs in the middle of a utf8 character sequence, and the SIMD conversion does some work (meaning: the length is at least 16 characters long), it would not recognize the fact some charactes were already decoded. So the conversion would then strip the ZWNBSP out, thinking it's a BOM. The non-SIMD conversion did not have this problem: the very first character conversion would already set the headerdone flag. Change-Id: I39aacf607e2e068107106254021a8042d164f628 Reviewed-by: Thiago Macieira --- src/corelib/codecs/qutfcodec.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index 98d4b2e4e3..a33c1bc9ce 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -364,6 +364,7 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte // main body, stateless decoding res = 0; const uchar *nextAscii = src; + const uchar *start = src; while (res >= 0 && src < end) { if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end)) break; @@ -372,9 +373,11 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte res = QUtf8Functions::fromUtf8(ch, dst, src, end); if (!headerdone && res >= 0) { headerdone = true; - // eat the UTF-8 BOM - if (dst[-1] == 0xfeff) - --dst; + if (src == start + 3) { // 3 == sizeof(utf8-bom) + // eat the UTF-8 BOM (it can only appear at the beginning of the string). + if (dst[-1] == 0xfeff) + --dst; + } } if (res == QUtf8BaseTraits::Error) { res = 0; -- cgit v1.2.3