diff options
Diffstat (limited to 'src/corelib/codecs')
-rw-r--r-- | src/corelib/codecs/qutfcodec.cpp | 175 | ||||
-rw-r--r-- | src/corelib/codecs/qutfcodec_p.h | 105 |
2 files changed, 194 insertions, 86 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index d1fc5b851a..b0e0b3f010 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -128,114 +128,117 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve return rstr; } +QString QUtf8::convertToUnicode(const char *chars, int len) +{ + QString result(len + 1, Qt::Uninitialized); // worst case + ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); + const uchar *src = reinterpret_cast<const uchar *>(chars); + const uchar *end = src + len; + + while (src < end) { + uchar b = *src++; + int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end); + if (res < 0) { + // decoding error + *dst++ = QChar::ReplacementCharacter; + } + } + + result.truncate(dst - reinterpret_cast<const ushort *>(result.constData())); + return result; +} + QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state) { bool headerdone = false; ushort replacement = QChar::ReplacementCharacter; int need = 0; - int error = -1; - uint uc = 0; - uint min_uc = 0; + int invalid = 0; + int res; + uchar ch = 0; + + QString result(need + len + 1, Qt::Uninitialized); // worst case + ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData())); + const uchar *src = reinterpret_cast<const uchar *>(chars); + const uchar *end = src + len; + if (state) { if (state->flags & QTextCodec::IgnoreHeader) headerdone = true; if (state->flags & QTextCodec::ConvertInvalidToNull) replacement = QChar::Null; - need = state->remainingChars; - if (need) { - uc = state->state_data[0]; - min_uc = state->state_data[1]; - } - } - if (!headerdone && len > 3 - && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) { - // starts with a byte order mark - chars += 3; - len -= 3; - headerdone = true; - } - - QString result(need + len + 1, Qt::Uninitialized); // worst case - ushort *qch = (ushort *)result.unicode(); - uchar ch; - int invalid = 0; - - for (int i = 0; i < len; ++i) { - ch = chars[i]; - if (need) { - if ((ch&0xc0) == 0x80) { - uc = (uc << 6) | (ch & 0x3f); - --need; - if (!need) { - // utf-8 bom composes into 0xfeff code point - if (!headerdone && uc == 0xfeff) { - // don't do anything, just skip the BOM - } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { - // surrogate pair - Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); - *qch++ = QChar::highSurrogate(uc); - *qch++ = QChar::lowSurrogate(uc); - } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) { - // error: overlong sequence, UTF16 surrogate or non-character - *qch++ = replacement; - ++invalid; - } else { - *qch++ = uc; - } - headerdone = true; - } - } else { - // error - i = error; - *qch++ = replacement; - ++invalid; - need = 0; - headerdone = true; - } - } else { - if (ch < 128) { - *qch++ = ushort(ch); - headerdone = true; - } else if ((ch & 0xe0) == 0xc0) { - uc = ch & 0x1f; - need = 1; - error = i; - min_uc = 0x80; - headerdone = true; - } else if ((ch & 0xf0) == 0xe0) { - uc = ch & 0x0f; - need = 2; - error = i; - min_uc = 0x800; - } else if ((ch&0xf8) == 0xf0) { - uc = ch & 0x07; - need = 3; - error = i; - min_uc = 0x10000; - headerdone = true; - } else { - // error - *qch++ = replacement; + if (state->remainingChars) { + // handle incoming state first + uchar remainingCharsData[4]; // longest UTF-8 sequence possible + int remainingCharsCount = state->remainingChars; + int newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src); + + memset(remainingCharsData, 0, sizeof(remainingCharsData)); + memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount); + memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy); + + const uchar *begin = &remainingCharsData[1]; + res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin, + static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy); + if (res == QUtf8BaseTraits::EndOfString) { + // if we got EndOfString again, then there were too few bytes in src; + // copy to our state and return + state->remainingChars = remainingCharsCount + newCharsToCopy; + memcpy(&state->state_data[0], remainingCharsData, state->remainingChars); + return QString(); + } else if (res == QUtf8BaseTraits::Error) { ++invalid; + *dst++ = replacement; + } else if (!headerdone && res >= 0) { + // eat the UTF-8 BOM headerdone = true; + if (dst[-1] == 0xfeff) + --dst; } + + // adjust src now that we have maybe consumed a few chars + //Q_ASSERT(res > remainingCharsCount) + src += res - remainingCharsCount; } } - if (!state && need > 0) { - // unterminated UTF sequence - for (int i = error; i < len; ++i) { - *qch++ = replacement; + + // main body, stateless decoding + res = 0; + while (res >= 0 && src < end) { + ch = *src++; + res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end); + if (!headerdone && res >= 0) { + headerdone = true; + // eat the UTF-8 BOM + if (dst[-1] == 0xfeff) + --dst; + } + if (res == QUtf8BaseTraits::Error) { + res = 0; ++invalid; + *dst++ = replacement; } } - result.truncate(qch - (ushort *)result.unicode()); + + if (!state && res == QUtf8BaseTraits::EndOfString) { + // unterminated UTF sequence + *dst++ = QChar::ReplacementCharacter; + while (src++ < end) + *dst++ = QChar::ReplacementCharacter; + } + + result.truncate(dst - (ushort *)result.unicode()); if (state) { state->invalidChars += invalid; - state->remainingChars = need; if (headerdone) state->flags |= QTextCodec::IgnoreHeader; - state->state_data[0] = need ? uc : 0; - state->state_data[1] = need ? min_uc : 0; + if (res == QUtf8BaseTraits::EndOfString) { + --src; // unread the byte in ch + state->remainingChars = end - src; + memcpy(&state->state_data[0], src, end - src); + } else { + state->remainingChars = 0; + } } return result; } diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h index 4f0e2394fe..c94a7a12e4 100644 --- a/src/corelib/codecs/qutfcodec_p.h +++ b/src/corelib/codecs/qutfcodec_p.h @@ -169,6 +169,110 @@ namespace QUtf8Functions Traits::appendByte(dst, 0x80 | (u & 0x3f)); return 0; } + + inline bool isContinuationByte(uchar b) + { + return (b & 0xc0) == 0x80; + } + + /// returns the number of characters consumed (including \a b) in case of success; + /// returns negative in case of error: Traits::Error or Traits::EndOfString + template <typename Traits, typename OutputPtr, typename InputPtr> inline + int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end) + { + int charsNeeded; + uint min_uc; + uint uc; + + if (!Traits::skipAsciiHandling && b < 0x80) { + // US-ASCII + Traits::appendUtf16(dst, b); + return 1; + } + + if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) { + // an UTF-8 first character must be at least 0xC0 + // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences + return Traits::Error; + } else if (b < 0xe0) { + charsNeeded = 2; + min_uc = 0x80; + uc = b & 0x1f; + } else if (b < 0xf0) { + charsNeeded = 3; + min_uc = 0x800; + uc = b & 0x0f; + } else if (b < 0xf5) { + charsNeeded = 4; + min_uc = 0x10000; + uc = b & 0x07; + } else { + // the last Unicode character is U+10FFFF + // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF" + // therefore, a byte higher than 0xF4 is not the UTF-8 first byte + return Traits::Error; + } + + int bytesAvailable = Traits::availableBytes(src, end); + if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) { + // it's possible that we have an error instead of just unfinished bytes + if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0))) + return Traits::Error; + if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1))) + return Traits::Error; + if (bytesAvailable > 2 && !isContinuationByte(Traits::peekByte(src, 2))) + return Traits::Error; + return Traits::EndOfString; + } + + // first continuation character + b = Traits::peekByte(src, 0); + if (!isContinuationByte(b)) + return Traits::Error; + uc <<= 6; + uc |= b & 0x3f; + + if (charsNeeded > 2) { + // second continuation character + b = Traits::peekByte(src, 1); + if (!isContinuationByte(b)) + return Traits::Error; + uc <<= 6; + uc |= b & 0x3f; + + if (charsNeeded > 3) { + // third continuation character + b = Traits::peekByte(src, 2); + if (!isContinuationByte(b)) + return Traits::Error; + uc <<= 6; + uc |= b & 0x3f; + } + } + + // we've decoded something; safety-check it + if (!Traits::isTrusted) { + if (uc < min_uc) + return Traits::Error; + if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) + return Traits::Error; + if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc)) + return Traits::Error; + } + + // write the UTF-16 sequence + if (!QChar::requiresSurrogates(uc)) { + // UTF-8 decoded and no surrogates are required + // detach if necessary + Traits::appendUtf16(dst, ushort(uc)); + } else { + // UTF-8 decoded to something that requires a surrogate pair + Traits::appendUcs4(dst, uc); + } + + Traits::advanceByte(src, charsNeeded - 1); + return charsNeeded; + } } enum DataEndianness @@ -180,6 +284,7 @@ enum DataEndianness struct QUtf8 { + static QString convertToUnicode(const char *, int); static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *); static QByteArray convertFromUnicode(const QChar *, int); static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *); |