Add a new UTF-8 encoder and use it from QString

This is a new and faster UTF-8 encoder, based on the code from QUrl. This code specializes for ASCII, which is the most common case anyway, especially since QString's "ascii" mode is actually UTF-8 now. In addition, make QString::toUtf8 use a stateless encoder. Stateless means that the function doesn't handle state between calls in the form of QTextCodec::ConverterState. This allows it to be faster than otherwise. The new code is in the form of a template so that it can be used from QJsonDocument and QUrl, which have small modifications to how the encoding is handled. Change-Id: I305ee0fd8523cc4ec74c2678cb9ea88b75bac7ac Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Thiago Macieira <thiago.macieira@intel.com> 2013-10-19 18:54:55 -0400
committer: The Qt Project <gerrit-noreply@qt-project.org> 2014-01-09 22:34:54 +0100
commit: d51130cc3a00df8147e2eb0799e06865c901c6e0 (patch)
tree: 16dd61dc71a36772baec6b9d34e66445dac554c1 /src/corelib/codecs/qutfcodec_p.h
parent: 86fa8b4fb800620ef065f5e151fa6896931cfc99 (diff)
1 files changed, 114 insertions, 0 deletions
diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h
index e1214d50bc..4f0e2394fe 100644
--- a/src/corelib/codecs/qutfcodec_p.h
+++ b/src/corelib/codecs/qutfcodec_p.h
@@ -1,6 +1,7 @@
 /****************************************************************************
 **
 ** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
+** Copyright (C) 2013 Intel Corporation
 ** Contact: http://www.qt-project.org/legal
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@@ -58,6 +59,118 @@
 
 QT_BEGIN_NAMESPACE
 
+struct QUtf8BaseTraits
+{
+    static const bool isTrusted = false;
+    static const bool allowNonCharacters = true;
+    static const bool skipAsciiHandling = false;
+    static const int Error = -1;
+    static const int EndOfString = -2;
+
+    static bool isValidCharacter(uint u)
+    { return int(u) >= 0; }
+
+    static void appendByte(uchar *&ptr, uchar b)
+    { *ptr++ = b; }
+
+    static uchar peekByte(const uchar *ptr, int n = 0)
+    { return ptr[n]; }
+
+    static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
+    { return end - ptr; }
+
+    static void advanceByte(const uchar *&ptr, int n = 1)
+    { ptr += n; }
+
+    static void appendUtf16(ushort *&ptr, ushort uc)
+    { *ptr++ = uc; }
+
+    static void appendUcs4(ushort *&ptr, uint uc)
+    {
+        appendUtf16(ptr, QChar::highSurrogate(uc));
+        appendUtf16(ptr, QChar::lowSurrogate(uc));
+    }
+
+    static ushort peekUtf16(const ushort *ptr, int n = 0)
+    { return ptr[n]; }
+
+    static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
+    { return end - ptr; }
+
+    static void advanceUtf16(const ushort *&ptr, int n = 1)
+    { ptr += n; }
+
+    // it's possible to output to UCS-4 too
+    static void appendUtf16(uint *&ptr, ushort uc)
+    { *ptr++ = uc; }
+
+    static void appendUcs4(uint *&ptr, uint uc)
+    { *ptr++ = uc; }
+};
+
+namespace QUtf8Functions
+{
+    /// returns 0 on success; errors can only happen if \a u is a surrogate:
+    /// Error if \a u is a low surrogate;
+    /// if \a u is a high surrogate, Error if the next isn't a low one,
+    /// EndOfString if we run into the end of the string.
+    template <typename Traits, typename OutputPtr, typename InputPtr> inline
+    int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
+    {
+        if (!Traits::skipAsciiHandling && u < 0x80) {
+            // U+0000 to U+007F (US-ASCII) - one byte
+            Traits::appendByte(dst, uchar(u));
+            return 0;
+        } else if (u < 0x0800) {
+            // U+0080 to U+07FF - two bytes
+            // first of two bytes
+            Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
+        } else {
+            if (!QChar::isSurrogate(u)) {
+                // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
+                if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
+                    return Traits::Error;
+
+                // first of three bytes
+                Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
+            } else {
+                // U+10000 to U+10FFFF - four bytes
+                // need to get one extra codepoint
+                if (Traits::availableUtf16(src, end) == 0)
+                    return Traits::EndOfString;
+
+                ushort low = Traits::peekUtf16(src);
+                if (!QChar::isHighSurrogate(u))
+                    return Traits::Error;
+                if (!QChar::isLowSurrogate(low))
+                    return Traits::Error;
+
+                Traits::advanceUtf16(src);
+                uint ucs4 = QChar::surrogateToUcs4(u, low);
+
+                if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
+                    return Traits::Error;
+
+                // first byte
+                Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
+
+                // second of four bytes
+                Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
+
+                // for the rest of the bytes
+                u = ushort(ucs4);
+            }
+
+            // second to last byte
+            Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
+        }
+
+        // last byte
+        Traits::appendByte(dst, 0x80 | (u & 0x3f));
+        return 0;
+    }
+}
+
 enum DataEndianness
 {
     DetectEndianness,
@@ -68,6 +181,7 @@ enum DataEndianness
 struct QUtf8
 {
     static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
+    static QByteArray convertFromUnicode(const QChar *, int);
     static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
 };
author	Thiago Macieira <thiago.macieira@intel.com>	2013-10-19 18:54:55 -0400
committer	The Qt Project <gerrit-noreply@qt-project.org>	2014-01-09 22:34:54 +0100
commit	d51130cc3a00df8147e2eb0799e06865c901c6e0 (patch)
tree	16dd61dc71a36772baec6b9d34e66445dac554c1 /src/corelib/codecs/qutfcodec_p.h
parent	86fa8b4fb800620ef065f5e151fa6896931cfc99 (diff)