diff options
Diffstat (limited to 'src/corelib/codecs/qutfcodec.cpp')
-rw-r--r-- | src/corelib/codecs/qutfcodec.cpp | 261 |
1 files changed, 130 insertions, 131 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index e425f8634c..b0e0b3f010 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -1,6 +1,7 @@ /**************************************************************************** ** ** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies). +** Copyright (C) 2013 Intel Corporation ** Contact: http://www.qt-project.org/legal ** ** This file is part of the QtCore module of the Qt Toolkit. @@ -48,6 +49,27 @@ QT_BEGIN_NAMESPACE enum { Endian = 0, Data = 1 }; +QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len) +{ + // create a QByteArray with the worst case scenario size + QByteArray result(len * 3, Qt::Uninitialized); + uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData())); + const ushort *src = reinterpret_cast<const ushort *>(uc); + const ushort *const end = src + len; + + while (src != end) { + ushort uc = *src++; + int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end); + if (res < 0) { + // encoding error - append '?' + *dst++ = '?'; + } + } + + result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData()))); + return result; +} + QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state) { uchar replacement = '?'; @@ -62,61 +84,35 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve surrogate_high = state->state_data[0]; } - QByteArray rstr; - rstr.resize(rlen); - uchar* cursor = (uchar*)rstr.data(); - const QChar *ch = uc; + + QByteArray rstr(rlen, Qt::Uninitialized); + uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData())); + const ushort *src = reinterpret_cast<const ushort *>(uc); + const ushort *const end = src + len; + int invalid = 0; if (state && !(state->flags & QTextCodec::IgnoreHeader)) { + // append UTF-8 BOM *cursor++ = 0xef; *cursor++ = 0xbb; *cursor++ = 0xbf; } - const QChar *end = ch + len; - while (ch < end) { - uint u = ch->unicode(); - if (surrogate_high >= 0) { - if (ch->isLowSurrogate()) { - u = QChar::surrogateToUcs4(surrogate_high, u); - surrogate_high = -1; - } else { - // high surrogate without low - *cursor = replacement; - ++ch; - ++invalid; - surrogate_high = -1; - continue; - } - } else if (ch->isLowSurrogate()) { - // low surrogate without high - *cursor = replacement; - ++ch; - ++invalid; + while (src != end) { + ushort uc = surrogate_high == -1 ? *src++ : surrogate_high; + surrogate_high = -1; + int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end); + if (Q_LIKELY(res >= 0)) continue; - } else if (ch->isHighSurrogate()) { - surrogate_high = u; - ++ch; - continue; - } - if (u < 0x80) { - *cursor++ = (uchar)u; - } else { - if (u < 0x0800) { - *cursor++ = 0xc0 | ((uchar) (u >> 6)); - } else { - if (QChar::requiresSurrogates(u)) { - *cursor++ = 0xf0 | ((uchar) (u >> 18)); - *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f); - } else { - *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f); - } - *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f); - } - *cursor++ = 0x80 | ((uchar) (u&0x3f)); + if (res == QUtf8BaseTraits::Error) { + // encoding error + ++invalid; + *cursor++ = replacement; + } else if (res == QUtf8BaseTraits::EndOfString) { + surrogate_high = uc; + break; } - ++ch; } rstr.resize(cursor - (const uchar*)rstr.constData()); @@ -132,114 +128,117 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve return rstr; } +QString QUtf8::convertToUnicode(const char *chars, int len) +{ + QString result(len + 1, Qt::Uninitialized); // worst case + ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); + const uchar *src = reinterpret_cast<const uchar *>(chars); + const uchar *end = src + len; + + while (src < end) { + uchar b = *src++; + int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end); + if (res < 0) { + // decoding error + *dst++ = QChar::ReplacementCharacter; + } + } + + result.truncate(dst - reinterpret_cast<const ushort *>(result.constData())); + return result; +} + QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state) { bool headerdone = false; ushort replacement = QChar::ReplacementCharacter; int need = 0; - int error = -1; - uint uc = 0; - uint min_uc = 0; + int invalid = 0; + int res; + uchar ch = 0; + + QString result(need + len + 1, Qt::Uninitialized); // worst case + ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); + const uchar *src = reinterpret_cast<const uchar *>(chars); + const uchar *end = src + len; + if (state) { if (state->flags & QTextCodec::IgnoreHeader) headerdone = true; if (state->flags & QTextCodec::ConvertInvalidToNull) replacement = QChar::Null; - need = state->remainingChars; - if (need) { - uc = state->state_data[0]; - min_uc = state->state_data[1]; - } - } - if (!headerdone && len > 3 - && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) { - // starts with a byte order mark - chars += 3; - len -= 3; - headerdone = true; - } - - QString result(need + len + 1, Qt::Uninitialized); // worst case - ushort *qch = (ushort *)result.unicode(); - uchar ch; - int invalid = 0; - - for (int i = 0; i < len; ++i) { - ch = chars[i]; - if (need) { - if ((ch&0xc0) == 0x80) { - uc = (uc << 6) | (ch & 0x3f); - --need; - if (!need) { - // utf-8 bom composes into 0xfeff code point - if (!headerdone && uc == 0xfeff) { - // don't do anything, just skip the BOM - } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { - // surrogate pair - Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); - *qch++ = QChar::highSurrogate(uc); - *qch++ = QChar::lowSurrogate(uc); - } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) { - // error: overlong sequence, UTF16 surrogate or non-character - *qch++ = replacement; - ++invalid; - } else { - *qch++ = uc; - } - headerdone = true; - } - } else { - // error - i = error; - *qch++ = replacement; - ++invalid; - need = 0; - headerdone = true; - } - } else { - if (ch < 128) { - *qch++ = ushort(ch); - headerdone = true; - } else if ((ch & 0xe0) == 0xc0) { - uc = ch & 0x1f; - need = 1; - error = i; - min_uc = 0x80; - headerdone = true; - } else if ((ch & 0xf0) == 0xe0) { - uc = ch & 0x0f; - need = 2; - error = i; - min_uc = 0x800; - } else if ((ch&0xf8) == 0xf0) { - uc = ch & 0x07; - need = 3; - error = i; - min_uc = 0x10000; - headerdone = true; - } else { - // error - *qch++ = replacement; + if (state->remainingChars) { + // handle incoming state first + uchar remainingCharsData[4]; // longest UTF-8 sequence possible + int remainingCharsCount = state->remainingChars; + int newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src); + + memset(remainingCharsData, 0, sizeof(remainingCharsData)); + memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount); + memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy); + + const uchar *begin = &remainingCharsData[1]; + res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin, + static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy); + if (res == QUtf8BaseTraits::EndOfString) { + // if we got EndOfString again, then there were too few bytes in src; + // copy to our state and return + state->remainingChars = remainingCharsCount + newCharsToCopy; + memcpy(&state->state_data[0], remainingCharsData, state->remainingChars); + return QString(); + } else if (res == QUtf8BaseTraits::Error) { ++invalid; + *dst++ = replacement; + } else if (!headerdone && res >= 0) { + // eat the UTF-8 BOM headerdone = true; + if (dst[-1] == 0xfeff) + --dst; } + + // adjust src now that we have maybe consumed a few chars + //Q_ASSERT(res > remainingCharsCount) + src += res - remainingCharsCount; } } - if (!state && need > 0) { - // unterminated UTF sequence - for (int i = error; i < len; ++i) { - *qch++ = replacement; + + // main body, stateless decoding + res = 0; + while (res >= 0 && src < end) { + ch = *src++; + res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end); + if (!headerdone && res >= 0) { + headerdone = true; + // eat the UTF-8 BOM + if (dst[-1] == 0xfeff) + --dst; + } + if (res == QUtf8BaseTraits::Error) { + res = 0; ++invalid; + *dst++ = replacement; } } - result.truncate(qch - (ushort *)result.unicode()); + + if (!state && res == QUtf8BaseTraits::EndOfString) { + // unterminated UTF sequence + *dst++ = QChar::ReplacementCharacter; + while (src++ < end) + *dst++ = QChar::ReplacementCharacter; + } + + result.truncate(dst - (ushort *)result.unicode()); if (state) { state->invalidChars += invalid; - state->remainingChars = need; if (headerdone) state->flags |= QTextCodec::IgnoreHeader; - state->state_data[0] = need ? uc : 0; - state->state_data[1] = need ? min_uc : 0; + if (res == QUtf8BaseTraits::EndOfString) { + --src; // unread the byte in ch + state->remainingChars = end - src; + memcpy(&state->state_data[0], src, end - src); + } else { + state->remainingChars = 0; + } } return result; } |