Add a new UTF-8 decoder, similar to the encoder we've just added

Like before, this is taken from the existing QUrl code and is optimized for ASCII handling (for the same reasons). As with the encoder, make QString::fromUtf8 use a stateless version of the codec, which is faster.

There is a small change in decoding behavior: a U+FFFD is now inserted for each byte that cannot be decoded properly. Previously, the decoder would "eat" all bad high-bit bytes and replace them all with one single U+FFFD. Either behavior is allowed by the UTF-8 specifications, even though the new behavior will cause misalignment in the Bradley Kuhn sample UTF-8 text.

Change-Id: Ib1b1f0b4291293bab345acaf376e00204ed87565
Reviewed-by: Olivier Goffart <ogoffart@woboq.com>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Thiago Macieira <thiago.macieira@intel.com> 2013-10-20 17:43:46 +0100
committer: The Qt Project <gerrit-noreply@qt-project.org> 2014-01-09 22:34:54 +0100
commit: 8dd47e34b9b96ac27a99cdcf10b8aec506882fc2 (patch)
tree: be92b77f4006e2b96683e5bfd4810db09a5b15ab /src/corelib/codecs
parent: d51130cc3a00df8147e2eb0799e06865c901c6e0 (diff)
2 files changed, 194 insertions, 86 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index d1fc5b851a..b0e0b3f010 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -128,114 +128,117 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
     return rstr;
 }
 
+QString QUtf8::convertToUnicode(const char *chars, int len) // stateless variant: decodes len bytes at chars in one go, no ConverterState involved
+{
+    QString result(len + 1, Qt::Uninitialized); // worst case
+    ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); // raw write cursor into result's buffer
+    const uchar *src = reinterpret_cast<const uchar *>(chars);
+    const uchar *end = src + len;
+
+    while (src < end) {
+        uchar b = *src++; // lead byte is handed to fromUtf8() separately; src now points at the byte after it
+        int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
+        if (res < 0) {
+            // decoding error (Error or EndOfString): emit one replacement character for this byte
+            *dst++ = QChar::ReplacementCharacter;
+        }
+    }
+
+    result.truncate(dst - reinterpret_cast<const ushort *>(result.constData())); // keep only the code units actually written
+    return result;
+}
+
 QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
 {
     bool headerdone = false;
     ushort replacement = QChar::ReplacementCharacter;
     int need = 0;
-    int error = -1;
-    uint uc = 0;
-    uint min_uc = 0;
+    int invalid = 0; // malformed sequences found by this call; added to state->invalidChars at the end
+    int res; // last result of QUtf8Functions::fromUtf8(): bytes consumed, or a negative Error/EndOfString
+    uchar ch = 0;
+    // Stateful decoder: a sequence cut off at the end of chars is saved in *state and resumed on the next call.
+    QString result(need + len + 1, Qt::Uninitialized); // worst case
+    ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
+    const uchar *src = reinterpret_cast<const uchar *>(chars);
+    const uchar *end = src + len;
+
     if (state) {
         if (state->flags & QTextCodec::IgnoreHeader)
             headerdone = true;
         if (state->flags & QTextCodec::ConvertInvalidToNull)
             replacement = QChar::Null;
-        need = state->remainingChars;
-        if (need) {
-            uc = state->state_data[0];
-            min_uc = state->state_data[1];
-        }
-    }
-    if (!headerdone && len > 3
-        && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
-        // starts with a byte order mark
-        chars += 3;
-        len -= 3;
-        headerdone = true;
-    }
-
-    QString result(need + len + 1, Qt::Uninitialized); // worst case
-    ushort *qch = (ushort *)result.unicode();
-    uchar ch;
-    int invalid = 0;
-
-    for (int i = 0; i < len; ++i) {
-        ch = chars[i];
-        if (need) {
-            if ((ch&0xc0) == 0x80) {
-                uc = (uc << 6) | (ch & 0x3f);
-                --need;
-                if (!need) {
-                    // utf-8 bom composes into 0xfeff code point
-                    if (!headerdone && uc == 0xfeff) {
-                        // don't do anything, just skip the BOM
-                    } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
-                        // surrogate pair
-                        Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
-                        *qch++ = QChar::highSurrogate(uc);
-                        *qch++ = QChar::lowSurrogate(uc);
-                    } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
-                        // error: overlong sequence, UTF16 surrogate or non-character
-                        *qch++ = replacement;
-                        ++invalid;
-                    } else {
-                        *qch++ = uc;
-                    }
-                    headerdone = true;
-                }
-            } else {
-                // error
-                i = error;
-                *qch++ = replacement;
-                ++invalid;
-                need = 0;
-                headerdone = true;
-            }
-        } else {
-            if (ch < 128) {
-                *qch++ = ushort(ch);
-                headerdone = true;
-            } else if ((ch & 0xe0) == 0xc0) {
-                uc = ch & 0x1f;
-                need = 1;
-                error = i;
-                min_uc = 0x80;
-                headerdone = true;
-            } else if ((ch & 0xf0) == 0xe0) {
-                uc = ch & 0x0f;
-                need = 2;
-                error = i;
-                min_uc = 0x800;
-            } else if ((ch&0xf8) == 0xf0) {
-                uc = ch & 0x07;
-                need = 3;
-                error = i;
-                min_uc = 0x10000;
-                headerdone = true;
-            } else {
-                // error
-                *qch++ = replacement;
+        if (state->remainingChars) {
+            // handle incoming state first
+            uchar remainingCharsData[4]; // longest UTF-8 sequence possible
+            int remainingCharsCount = state->remainingChars;
+            int newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src); // top up the scratch buffer, but never take more than the input holds
+
+            memset(remainingCharsData, 0, sizeof(remainingCharsData));
+            memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
+            memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
+
+            const uchar *begin = &remainingCharsData[1]; // the lead byte is passed to fromUtf8() separately
+            res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
+                    static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
+            if (res == QUtf8BaseTraits::EndOfString) {
+                // if we got EndOfString again, then there were too few bytes in src;
+                // copy to our state and return
+                state->remainingChars = remainingCharsCount + newCharsToCopy;
+                memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
+                return QString();
+            } else if (res == QUtf8BaseTraits::Error) {
                 ++invalid;
+                *dst++ = replacement;
+            } else if (!headerdone && res >= 0) {
+                // eat the UTF-8 BOM
                 headerdone = true;
+                if (dst[-1] == 0xfeff)
+                    --dst;
             }
+
+            // skip the new bytes the resumed sequence consumed; on Error res is negative, so src must stay put
+            if (res >= 0) // success is expected to give res > remainingCharsCount, since the saved bytes alone were incomplete
+                src += res - remainingCharsCount;
         }
     }
-    if (!state && need > 0) {
-        // unterminated UTF sequence
-        for (int i = error; i < len; ++i) {
-            *qch++ = replacement;
+
+    // main body, stateless decoding
+    res = 0;
+    while (res >= 0 && src < end) { // leaves early only on EndOfString; Error is reset to 0 below
+        ch = *src++;
+        res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
+        if (!headerdone && res >= 0) {
+            headerdone = true;
+            // eat the UTF-8 BOM
+            if (dst[-1] == 0xfeff)
+                --dst;
+        }
+        if (res == QUtf8BaseTraits::Error) {
+            res = 0; // keep decoding after the bad byte
             ++invalid;
+            *dst++ = replacement;
         }
     }
-    result.truncate(qch - (ushort *)result.unicode());
+
+    if (!state && res == QUtf8BaseTraits::EndOfString) {
+        // unterminated UTF sequence
+        *dst++ = QChar::ReplacementCharacter;
+        while (src++ < end)
+            *dst++ = QChar::ReplacementCharacter;
+    }
+
+    result.truncate(dst - (ushort *)result.unicode());
     if (state) {
         state->invalidChars += invalid;
-        state->remainingChars = need;
         if (headerdone)
             state->flags |= QTextCodec::IgnoreHeader;
-        state->state_data[0] = need ? uc : 0;
-        state->state_data[1] = need ? min_uc : 0;
+        if (res == QUtf8BaseTraits::EndOfString) {
+            --src; // unread the byte in ch
+            state->remainingChars = end - src;
+            memcpy(&state->state_data[0], src, end - src);
+        } else {
+            state->remainingChars = 0;
+        }
     }
     return result;
 }
diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h
index 4f0e2394fe..c94a7a12e4 100644
--- a/src/corelib/codecs/qutfcodec_p.h
+++ b/src/corelib/codecs/qutfcodec_p.h
@@ -169,6 +169,110 @@ namespace QUtf8Functions
         Traits::appendByte(dst, 0x80 | (u & 0x3f));
         return 0;
     }
+
+    inline bool isContinuationByte(uchar b)
+    {
+        return (b & 0xc0) == 0x80; // true when the top two bits are 10, i.e. b is in 0x80..0xBF
+    }
+
+    /// Decodes one UTF-8 sequence whose lead byte is \a b, peeking the continuation bytes through Traits from \a src (bounded by \a end), and appends the result to \a dst; returns the number of bytes consumed (including \a b) in case of success, after calling Traits::advanceByte(src, charsNeeded - 1);
+    /// returns negative in case of error, without calling Traits::advanceByte: Traits::Error (malformed or disallowed sequence) or Traits::EndOfString (input ends before the sequence is complete)
+    template <typename Traits, typename OutputPtr, typename InputPtr> inline
+    int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
+    {
+        int charsNeeded; // total length of the sequence in bytes, lead byte included
+        uint min_uc; // smallest value this sequence length may encode; anything lower is rejected below
+        uint uc; // code point being assembled
+
+        if (!Traits::skipAsciiHandling && b < 0x80) {
+            // US-ASCII
+            Traits::appendUtf16(dst, b);
+            return 1;
+        }
+
+        if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
+            // a UTF-8 first byte must be at least 0xC0 (0x80..0xBF are continuation bytes)
+            // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
+            return Traits::Error;
+        } else if (b < 0xe0) {
+            charsNeeded = 2;
+            min_uc = 0x80;
+            uc = b & 0x1f; // low 5 bits of the lead byte
+        } else if (b < 0xf0) {
+            charsNeeded = 3;
+            min_uc = 0x800;
+            uc = b & 0x0f; // low 4 bits of the lead byte
+        } else if (b < 0xf5) {
+            charsNeeded = 4;
+            min_uc = 0x10000;
+            uc = b & 0x07; // low 3 bits of the lead byte
+        } else {
+            // the last Unicode character is U+10FFFF
+            // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
+            // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
+            return Traits::Error;
+        }
+
+        int bytesAvailable = Traits::availableBytes(src, end); // presumably the number of bytes in [src, end) - defined by Traits, confirm there
+        if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
+            // it's possible that we have an error instead of just unfinished bytes
+            if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
+                return Traits::Error;
+            if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
+                return Traits::Error;
+            if (bytesAvailable > 2 && !isContinuationByte(Traits::peekByte(src, 2)))
+                return Traits::Error;
+            return Traits::EndOfString; // every byte that is present is a continuation byte; the input is merely too short
+        }
+
+        // first continuation character
+        b = Traits::peekByte(src, 0);
+        if (!isContinuationByte(b))
+            return Traits::Error;
+        uc <<= 6;
+        uc |= b & 0x3f; // each continuation byte contributes its low 6 bits
+
+        if (charsNeeded > 2) {
+            // second continuation character
+            b = Traits::peekByte(src, 1);
+            if (!isContinuationByte(b))
+                return Traits::Error;
+            uc <<= 6;
+            uc |= b & 0x3f;
+
+            if (charsNeeded > 3) {
+                // third continuation character
+                b = Traits::peekByte(src, 2);
+                if (!isContinuationByte(b))
+                    return Traits::Error;
+                uc <<= 6;
+                uc |= b & 0x3f;
+            }
+        }
+
+        // we've decoded something; safety-check it
+        if (!Traits::isTrusted) {
+            if (uc < min_uc) // overlong: the value would have fit in a shorter sequence
+                return Traits::Error;
+            if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
+                return Traits::Error;
+            if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc))
+                return Traits::Error;
+        }
+
+        // write the UTF-16 sequence
+        if (!QChar::requiresSurrogates(uc)) {
+            // UTF-8 decoded and no surrogates are required
+            // detach if necessary
+            Traits::appendUtf16(dst, ushort(uc));
+        } else {
+            // UTF-8 decoded to something that requires a surrogate pair
+            Traits::appendUcs4(dst, uc);
+        }
+
+        Traits::advanceByte(src, charsNeeded - 1); // minus one because the lead byte b is passed in separately and not advanced over here
+        return charsNeeded;
+    }
 }
 
 enum DataEndianness
@@ -180,6 +284,7 @@ enum DataEndianness
 
 struct QUtf8
 {
+    static QString convertToUnicode(const char *, int);
     static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
     static QByteArray convertFromUnicode(const QChar *, int);
     static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
author	Thiago Macieira <thiago.macieira@intel.com>	2013-10-20 17:43:46 +0100
committer	The Qt Project <gerrit-noreply@qt-project.org>	2014-01-09 22:34:54 +0100
commit	8dd47e34b9b96ac27a99cdcf10b8aec506882fc2 (patch)
tree	be92b77f4006e2b96683e5bfd4810db09a5b15ab /src/corelib/codecs
parent	d51130cc3a00df8147e2eb0799e06865c901c6e0 (diff)