summaryrefslogtreecommitdiffstats
path: root/src/corelib/codecs/qutfcodec.cpp
diff options
context:
space:
mode:
authorErik Verbruggen <erik.verbruggen@theqtcompany.com>2015-12-16 14:04:27 +0100
committerErik Verbruggen <erik.verbruggen@theqtcompany.com>2015-12-21 09:56:49 +0000
commit1823c8f2ddd0a5c1b4301e7af7109796090a3c9a (patch)
tree7179725508e8ad7a71b5e73ad2e74744b15450d9 /src/corelib/codecs/qutfcodec.cpp
parent0629e879422eae8751973b62ac29a61ca546f78c (diff)
Fix utf8->utf16 BOM/ZWNBSP decoding.
When the byte sequence for a BOM occurs in the middle of a utf8 stream, it is a ZWNBSP. When a ZWNBSP occurs in the middle of a utf8 character sequence, and the SIMD conversion does some work (meaning: the input is at least 16 characters long), it would not recognize the fact that some characters were already decoded. So the conversion would then strip the ZWNBSP out, thinking it's a BOM. The non-SIMD conversion did not have this problem: the very first character conversion would already set the headerdone flag. Change-Id: I39aacf607e2e068107106254021a8042d164f628 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src/corelib/codecs/qutfcodec.cpp')
-rw-r--r--src/corelib/codecs/qutfcodec.cpp9
1 files changed, 6 insertions, 3 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 98d4b2e4e3..a33c1bc9ce 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -364,6 +364,7 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
// main body, stateless decoding
res = 0;
const uchar *nextAscii = src;
+ const uchar *start = src;
while (res >= 0 && src < end) {
if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
break;
@@ -372,9 +373,11 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
if (!headerdone && res >= 0) {
headerdone = true;
- // eat the UTF-8 BOM
- if (dst[-1] == 0xfeff)
- --dst;
+ if (src == start + 3) { // 3 == sizeof(utf8-bom)
+ // eat the UTF-8 BOM (it can only appear at the beginning of the string).
+ if (dst[-1] == 0xfeff)
+ --dst;
+ }
}
if (res == QUtf8BaseTraits::Error) {
res = 0;