From d51130cc3a00df8147e2eb0799e06865c901c6e0 Mon Sep 17 00:00:00 2001
From: Thiago Macieira
Date: Sat, 19 Oct 2013 18:54:55 -0400
Subject: Add a new UTF-8 encoder and use it from QString

This is a new and faster UTF-8 encoder, based on the code from QUrl.
This code specializes for ASCII, which is the most common case anyway,
especially since QString's "ascii" mode is actually UTF-8 now.

In addition, make QString::toUtf8 use a stateless encoder. Stateless
means that the function doesn't handle state between calls in the form
of QTextCodec::ConverterState. This allows it to be faster than
otherwise.

The new code is in the form of a template so that it can be used from
QJsonDocument and QUrl, which have small modifications to how the
encoding is handled.

Change-Id: I305ee0fd8523cc4ec74c2678cb9ea88b75bac7ac
Reviewed-by: Thiago Macieira
---
 src/corelib/codecs/qutfcodec.cpp |  86 ++++++++++++++---------------
 src/corelib/codecs/qutfcodec_p.h | 114 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 155 insertions(+), 45 deletions(-)

(limited to 'src/corelib/codecs')

diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index e425f8634c..d1fc5b851a 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -1,6 +1,7 @@
 /****************************************************************************
 **
 ** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
+** Copyright (C) 2013 Intel Corporation
 ** Contact: http://www.qt-project.org/legal
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@@ -48,6 +49,27 @@ QT_BEGIN_NAMESPACE
 
 enum { Endian = 0, Data = 1 };
 
+QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
+{
+    // worst case: three UTF-8 bytes per UTF-16 code unit (a surrogate pair needs only four)
+    QByteArray result(len * 3, Qt::Uninitialized);
+    uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
+    const ushort *src = reinterpret_cast<const ushort *>(uc);
+    const ushort *const end = src + len;
+
+    while (src != end) {
+        ushort uc = *src++;
+        int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end);
+        if (res < 0) {
+            // encoding error (Error or EndOfString; no state to carry over) - append '?'
+            *dst++ = '?';
+        }
+    }
+
+    result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
+    return result;
+}
+
 QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
 {
     uchar replacement = '?';
@@ -62,61 +84,35 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
             surrogate_high = state->state_data[0];
     }
 
-    QByteArray rstr;
-    rstr.resize(rlen);
-    uchar* cursor = (uchar*)rstr.data();
-    const QChar *ch = uc;
+
+    QByteArray rstr(rlen, Qt::Uninitialized);
+    uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
+    const ushort *src = reinterpret_cast<const ushort *>(uc);
+    const ushort *const end = src + len;
+
     int invalid = 0;
     if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
+        // append UTF-8 BOM
         *cursor++ = 0xef;
         *cursor++ = 0xbb;
         *cursor++ = 0xbf;
     }
 
-    const QChar *end = ch + len;
-    while (ch < end) {
-        uint u = ch->unicode();
-        if (surrogate_high >= 0) {
-            if (ch->isLowSurrogate()) {
-                u = QChar::surrogateToUcs4(surrogate_high, u);
-                surrogate_high = -1;
-            } else {
-                // high surrogate without low
-                *cursor = replacement;
-                ++ch;
-                ++invalid;
-                surrogate_high = -1;
-                continue;
-            }
-        } else if (ch->isLowSurrogate()) {
-            // low surrogate without high
-            *cursor = replacement;
-            ++ch;
-            ++invalid;
-            continue;
-        } else if (ch->isHighSurrogate()) {
-            surrogate_high = u;
-            ++ch;
+    while (src != end) {
+        ushort uc = surrogate_high == -1 ? *src++ : surrogate_high;
+        surrogate_high = -1;
+        int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+        if (Q_LIKELY(res >= 0))
             continue;
-        }
 
-        if (u < 0x80) {
-            *cursor++ = (uchar)u;
-        } else {
-            if (u < 0x0800) {
-                *cursor++ = 0xc0 | ((uchar) (u >> 6));
-            } else {
-                if (QChar::requiresSurrogates(u)) {
-                    *cursor++ = 0xf0 | ((uchar) (u >> 18));
-                    *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
-                } else {
-                    *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
-                }
-                *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
-            }
-            *cursor++ = 0x80 | ((uchar) (u&0x3f));
+        if (res == QUtf8BaseTraits::Error) {
+            // encoding error
+            ++invalid;
+            *cursor++ = replacement;
+        } else if (res == QUtf8BaseTraits::EndOfString) {
+            surrogate_high = uc;
+            break;
         }
-        ++ch;
     }
 
     rstr.resize(cursor - (const uchar*)rstr.constData());
diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h
index e1214d50bc..4f0e2394fe 100644
--- a/src/corelib/codecs/qutfcodec_p.h
+++ b/src/corelib/codecs/qutfcodec_p.h
@@ -1,6 +1,7 @@
 /****************************************************************************
 **
 ** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
+** Copyright (C) 2013 Intel Corporation
 ** Contact: http://www.qt-project.org/legal
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@@ -58,6 +59,118 @@
 
 QT_BEGIN_NAMESPACE
 
+struct QUtf8BaseTraits
+{
+    static const bool isTrusted = false;
+    static const bool allowNonCharacters = true;
+    static const bool skipAsciiHandling = false;
+    static const int Error = -1;
+    static const int EndOfString = -2;
+
+    static bool isValidCharacter(uint u)
+    { return int(u) >= 0; }
+
+    static void appendByte(uchar *&ptr, uchar b)
+    { *ptr++ = b; }
+
+    static uchar peekByte(const uchar *ptr, int n = 0)
+    { return ptr[n]; }
+
+    static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
+    { return end - ptr; }
+
+    static void advanceByte(const uchar *&ptr, int n = 1)
+    { ptr += n; }
+
+    static void appendUtf16(ushort *&ptr, ushort uc)
+    { *ptr++ = uc; }
+
+    static void appendUcs4(ushort *&ptr, uint uc)
+    {
+        appendUtf16(ptr, QChar::highSurrogate(uc));
+        appendUtf16(ptr, QChar::lowSurrogate(uc));
+    }
+
+    static ushort peekUtf16(const ushort *ptr, int n = 0)
+    { return ptr[n]; }
+
+    static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
+    { return end - ptr; }
+
+    static void advanceUtf16(const ushort *&ptr, int n = 1)
+    { ptr += n; }
+
+    // overloads for a UCS-4 (uint) output buffer: one element per code point
+    static void appendUtf16(uint *&ptr, ushort uc)
+    { *ptr++ = uc; }
+
+    static void appendUcs4(uint *&ptr, uint uc)
+    { *ptr++ = uc; }
+};
+
+namespace QUtf8Functions
+{
+    /// Encodes \a u (consuming one more unit from \a src for a surrogate pair) as
+    /// UTF-8 into \a dst; returns 0 on success. Traits::Error: \a u is a low
+    /// surrogate, a high surrogate not followed by a low one, or a disallowed
+    /// non-character. Traits::EndOfString: \a u is a high surrogate and \a src == \a end.
+    template <typename Traits, typename OutputPtr, typename InputPtr> inline
+    int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
+    {
+        if (!Traits::skipAsciiHandling && u < 0x80) {
+            // U+0000 to U+007F (US-ASCII) - one byte
+            Traits::appendByte(dst, uchar(u));
+            return 0;
+        } else if (u < 0x0800) {
+            // U+0080 to U+07FF - two bytes
+            // first of two bytes
+            Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
+        } else {
+            if (!QChar::isSurrogate(u)) {
+                // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
+                if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
+                    return Traits::Error;
+
+                // first of three bytes
+                Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
+            } else {
+                // U+10000 to U+10FFFF - four bytes
+                // a low surrogate can never start a pair, so reject it before looking ahead
+                if (!QChar::isHighSurrogate(u))
+                    return Traits::Error;
+                if (Traits::availableUtf16(src, end) == 0)
+                    return Traits::EndOfString;
+
+                ushort low = Traits::peekUtf16(src);
+                if (!QChar::isLowSurrogate(low))
+                    return Traits::Error;
+
+                Traits::advanceUtf16(src);
+                uint ucs4 = QChar::surrogateToUcs4(u, low);
+
+                if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
+                    return Traits::Error;
+
+                // first byte
+                Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
+
+                // second of four bytes
+                Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
+
+                // the low 12 bits of ucs4 are emitted by the shared code below
+                u = ushort(ucs4);
+            }
+
+            // second to last byte
+            Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
+        }
+
+        // last byte
+        Traits::appendByte(dst, 0x80 | (u & 0x3f));
+        return 0;
+    }
+}
+
 enum DataEndianness
 {
     DetectEndianness,
@@ -68,6 +181,7 @@ enum DataEndianness
 struct QUtf8
 {
     static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
+    static QByteArray convertFromUnicode(const QChar *, int);
     static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
 };
 
-- 
cgit v1.2.3