From f56ef579ba5b1d3adda060fa9c0707e37f9f1baa Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Wed, 23 Apr 2014 11:18:17 -0700 Subject: Restore handling of BOMs in QString::fromUtf8 8dd47e34b9b96ac27a99cdcf10b8aec506882fc2 removed the handling of the BOMs but did not document it. This brings the behavior back and adds a unit test so we don't break it again. Discussed-on: http://lists.qt-project.org/pipermail/development/2014-April/016532.html Change-Id: Ifb7a9a6e5a494622f46b8ab435e1d168b862d952 Reviewed-by: Olivier Goffart Reviewed-by: Lars Knoll --- src/corelib/codecs/qutfcodec.cpp | 44 ++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'src') diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index 54312601e4..072cda63aa 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -52,6 +52,8 @@ QT_BEGIN_NAMESPACE enum { Endian = 0, Data = 1 }; +static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf }; + #if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2) static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end) { @@ -187,9 +189,9 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve int invalid = 0; if (state && !(state->flags & QTextCodec::IgnoreHeader)) { // append UTF-8 BOM - *cursor++ = 0xef; - *cursor++ = 0xbb; - *cursor++ = 0xbf; + *cursor++ = utf8bom[0]; + *cursor++ = utf8bom[1]; + *cursor++ = utf8bom[2]; } const ushort *nextAscii = src; @@ -240,19 +242,31 @@ QString QUtf8::convertToUnicode(const char *chars, int len) const uchar *src = reinterpret_cast(chars); const uchar *end = src + len; - while (src < end) { - const uchar *nextAscii = end; - if (simdDecodeAscii(dst, nextAscii, src, end)) - break; + // attempt to do a full decoding in SIMD + const uchar *nextAscii = end; + if (!simdDecodeAscii(dst, nextAscii, src, end)) { + // at least one non-ASCII entry + // check if we failed to decode the UTF-8 BOM; if so, skip it + if (Q_UNLIKELY(src == reinterpret_cast(chars)) + && end - src >= 3 + && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) { + src += 3; + } - do { - uchar b = *src++; - int res = QUtf8Functions::fromUtf8(b, dst, src, end); - if (res < 0) { - // decoding error - *dst++ = QChar::ReplacementCharacter; - } - } while (src < nextAscii); + while (src < end) { + nextAscii = end; + if (simdDecodeAscii(dst, nextAscii, src, end)) + break; + + do { + uchar b = *src++; + int res = QUtf8Functions::fromUtf8(b, dst, src, end); + if (res < 0) { + // decoding error + *dst++ = QChar::ReplacementCharacter; + } + } while (src < nextAscii); + } } result.truncate(dst - reinterpret_cast(result.constData())); -- cgit v1.2.3