From f56ef579ba5b1d3adda060fa9c0707e37f9f1baa Mon Sep 17 00:00:00 2001
From: Thiago Macieira <thiago.macieira@intel.com>
Date: Wed, 23 Apr 2014 11:18:17 -0700
Subject: Restore handling of BOMs in QString::fromUtf8

Commit 8dd47e34b9b96ac27a99cdcf10b8aec506882fc2 removed the handling of
UTF-8 BOMs in QString::fromUtf8 but did not document the change. This
restores the behavior and adds a unit test so we don't break it again.

Discussed-on: http://lists.qt-project.org/pipermail/development/2014-April/016532.html
Change-Id: Ifb7a9a6e5a494622f46b8ab435e1d168b862d952
Reviewed-by: Olivier Goffart <ogoffart@woboq.com>
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
---
 src/corelib/codecs/qutfcodec.cpp | 44 ++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 15 deletions(-)

(limited to 'src')
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 54312601e4..072cda63aa 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -52,6 +52,8 @@ QT_BEGIN_NAMESPACE
 
 enum { Endian = 0, Data = 1 };
 
+static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
+
 #if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
 static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
 {
@@ -187,9 +189,9 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
     int invalid = 0;
     if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
         // append UTF-8 BOM
-        *cursor++ = 0xef;
-        *cursor++ = 0xbb;
-        *cursor++ = 0xbf;
+        *cursor++ = utf8bom[0];
+        *cursor++ = utf8bom[1];
+        *cursor++ = utf8bom[2];
     }
 
     const ushort *nextAscii = src;
@@ -240,19 +242,31 @@ QString QUtf8::convertToUnicode(const char *chars, int len)
     const uchar *src = reinterpret_cast<const uchar *>(chars);
     const uchar *end = src + len;
 
-    while (src < end) {
-        const uchar *nextAscii = end;
-        if (simdDecodeAscii(dst, nextAscii, src, end))
-            break;
+    // attempt to do a full decoding in SIMD
+    const uchar *nextAscii = end;
+    if (!simdDecodeAscii(dst, nextAscii, src, end)) {
+        // at least one non-ASCII entry
+        // check if we failed to decode the UTF-8 BOM; if so, skip it
+        if (Q_UNLIKELY(src == reinterpret_cast<const uchar *>(chars))
+                && end - src >= 3
+                && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
+            src += 3;
+        }
 
-        do {
-            uchar b = *src++;
-            int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
-            if (res < 0) {
-                // decoding error
-                *dst++ = QChar::ReplacementCharacter;
-            }
-        } while (src < nextAscii);
+        while (src < end) {
+            nextAscii = end;
+            if (simdDecodeAscii(dst, nextAscii, src, end))
+                break;
+
+            do {
+                uchar b = *src++;
+                int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
+                if (res < 0) {
+                    // decoding error
+                    *dst++ = QChar::ReplacementCharacter;
+                }
+            } while (src < nextAscii);
+        }
     }
 
     result.truncate(dst - reinterpret_cast<const ushort *>(result.constData()));
-- 
cgit v1.2.3