From f56ef579ba5b1d3adda060fa9c0707e37f9f1baa Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Wed, 23 Apr 2014 11:18:17 -0700 Subject: Restore handling of BOMs in QString::fromUtf8 8dd47e34b9b96ac27a99cdcf10b8aec506882fc2 removed the handling of the BOMs but did not document it. This brings the behavior back and adds a unit test so we don't break it again. Discussed-on: http://lists.qt-project.org/pipermail/development/2014-April/016532.html Change-Id: Ifb7a9a6e5a494622f46b8ab435e1d168b862d952 Reviewed-by: Olivier Goffart Reviewed-by: Lars Knoll --- src/corelib/codecs/qutfcodec.cpp | 44 ++++++++++++++++-------- tests/auto/corelib/tools/qstring/tst_qstring.cpp | 8 +++++ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index 54312601e4..072cda63aa 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -52,6 +52,8 @@ QT_BEGIN_NAMESPACE enum { Endian = 0, Data = 1 }; +static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf }; + #if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2) static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end) { @@ -187,9 +189,9 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve int invalid = 0; if (state && !(state->flags & QTextCodec::IgnoreHeader)) { // append UTF-8 BOM - *cursor++ = 0xef; - *cursor++ = 0xbb; - *cursor++ = 0xbf; + *cursor++ = utf8bom[0]; + *cursor++ = utf8bom[1]; + *cursor++ = utf8bom[2]; } const ushort *nextAscii = src; @@ -240,19 +242,31 @@ QString QUtf8::convertToUnicode(const char *chars, int len) const uchar *src = reinterpret_cast(chars); const uchar *end = src + len; - while (src < end) { - const uchar *nextAscii = end; - if (simdDecodeAscii(dst, nextAscii, src, end)) - break; + // attempt to do a full decoding in SIMD + const uchar *nextAscii = end; + if (!simdDecodeAscii(dst, nextAscii, src, end)) { + // at least one non-ASCII entry + // check if we failed to decode the UTF-8 BOM; if so, skip it + if (Q_UNLIKELY(src == reinterpret_cast(chars)) + && end - src >= 3 + && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) { + src += 3; + } - do { - uchar b = *src++; - int res = QUtf8Functions::fromUtf8(b, dst, src, end); - if (res < 0) { - // decoding error - *dst++ = QChar::ReplacementCharacter; - } - } while (src < nextAscii); + while (src < end) { + nextAscii = end; + if (simdDecodeAscii(dst, nextAscii, src, end)) + break; + + do { + uchar b = *src++; + int res = QUtf8Functions::fromUtf8(b, dst, src, end); + if (res < 0) { + // decoding error + *dst++ = QChar::ReplacementCharacter; + } + } while (src < nextAscii); + } } result.truncate(dst - reinterpret_cast(result.constData())); diff --git a/tests/auto/corelib/tools/qstring/tst_qstring.cpp b/tests/auto/corelib/tools/qstring/tst_qstring.cpp index d9d6b985b7..95d377b176 100644 --- a/tests/auto/corelib/tools/qstring/tst_qstring.cpp +++ b/tests/auto/corelib/tools/qstring/tst_qstring.cpp @@ -3619,6 +3619,14 @@ void tst_QString::fromUtf8_data() str += " some "; QTest::newRow("str3-len") << QByteArray("\342\202\254 some text") << str << 9; + // test that QString::fromUtf8 suppresses an initial BOM, but not a ZWNBSP + str = "hello"; + QByteArray bom("\357\273\277"); + QTest::newRow("bom0") << bom << QString() << 3; + QTest::newRow("bom1") << bom + "hello" << str << -1; + QTest::newRow("bom+zwnbsp0") << bom + bom << QString(QChar(0xfeff)) << -1; + QTest::newRow("bom+zwnbsp1") << bom + "hello" + bom << str + QChar(0xfeff) << -1; + str = "hello"; str += QChar::ReplacementCharacter; str += QChar(0x68); -- cgit v1.2.3