diff options
author | Thiago Macieira <thiago.macieira@intel.com> | 2013-10-20 17:43:46 +0100 |
---|---|---|
committer | The Qt Project <gerrit-noreply@qt-project.org> | 2014-01-09 22:34:54 +0100 |
commit | 8dd47e34b9b96ac27a99cdcf10b8aec506882fc2 (patch) | |
tree | be92b77f4006e2b96683e5bfd4810db09a5b15ab /src/corelib/codecs/qutfcodec_p.h | |
parent | d51130cc3a00df8147e2eb0799e06865c901c6e0 (diff) |
Add a new UTF-8 decoder, similar to the encoder we've just added
Like before, this is taken from the existing QUrl code and is optimized for
ASCII handling (for the same reasons). And like previously, make
QString::fromUtf8 use a stateless version of the codec, which is faster.
There's a small change in behavior in the decoding: we insert a U+FFFD for
each byte that cannot be decoded properly. Previously, it would "eat" all bad
high-bit bytes and replace them all with one single U+FFFD. Either behavior is
allowed by the UTF-8 specifications, even though this new behavior will cause
misalignment in the Bradley Kuhn sample UTF-8 text.
Change-Id: Ib1b1f0b4291293bab345acaf376e00204ed87565
Reviewed-by: Olivier Goffart <ogoffart@woboq.com>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src/corelib/codecs/qutfcodec_p.h')
-rw-r--r-- | src/corelib/codecs/qutfcodec_p.h | 105 |
1 files changed, 105 insertions, 0 deletions
diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h
index 4f0e2394fe..c94a7a12e4 100644
--- a/src/corelib/codecs/qutfcodec_p.h
+++ b/src/corelib/codecs/qutfcodec_p.h
@@ -169,6 +169,110 @@ namespace QUtf8Functions
         Traits::appendByte(dst, 0x80 | (u & 0x3f));
         return 0;
     }
+
+    inline bool isContinuationByte(uchar b)
+    {
+        return (b & 0xc0) == 0x80;
+    }
+
+    /// returns the number of characters consumed (including \a b) in case of success;
+    /// returns negative in case of error: Traits::Error or Traits::EndOfString
+    template <typename Traits, typename OutputPtr, typename InputPtr> inline
+    int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
+    {
+        int charsNeeded;
+        uint min_uc;
+        uint uc;
+
+        if (!Traits::skipAsciiHandling && b < 0x80) {
+            // US-ASCII
+            Traits::appendUtf16(dst, b);
+            return 1;
+        }
+
+        if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
+            // an UTF-8 first character must be at least 0xC0
+            // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
+            return Traits::Error;
+        } else if (b < 0xe0) {
+            charsNeeded = 2;
+            min_uc = 0x80;
+            uc = b & 0x1f;
+        } else if (b < 0xf0) {
+            charsNeeded = 3;
+            min_uc = 0x800;
+            uc = b & 0x0f;
+        } else if (b < 0xf5) {
+            charsNeeded = 4;
+            min_uc = 0x10000;
+            uc = b & 0x07;
+        } else {
+            // the last Unicode character is U+10FFFF
+            // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
+            // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
+            return Traits::Error;
+        }
+
+        int bytesAvailable = Traits::availableBytes(src, end);
+        if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
+            // it's possible that we have an error instead of just unfinished bytes
+            if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
+                return Traits::Error;
+            if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
+                return Traits::Error;
+            if (bytesAvailable > 2 && !isContinuationByte(Traits::peekByte(src, 2)))
+                return Traits::Error;
+            return Traits::EndOfString;
+        }
+
+        // first continuation character
+        b = Traits::peekByte(src, 0);
+        if (!isContinuationByte(b))
+            return Traits::Error;
+        uc <<= 6;
+        uc |= b & 0x3f;
+
+        if (charsNeeded > 2) {
+            // second continuation character
+            b = Traits::peekByte(src, 1);
+            if (!isContinuationByte(b))
+                return Traits::Error;
+            uc <<= 6;
+            uc |= b & 0x3f;
+
+            if (charsNeeded > 3) {
+                // third continuation character
+                b = Traits::peekByte(src, 2);
+                if (!isContinuationByte(b))
+                    return Traits::Error;
+                uc <<= 6;
+                uc |= b & 0x3f;
+            }
+        }
+
+        // we've decoded something; safety-check it
+        if (!Traits::isTrusted) {
+            if (uc < min_uc)
+                return Traits::Error;
+            if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
+                return Traits::Error;
+            if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc))
+                return Traits::Error;
+        }
+
+        // write the UTF-16 sequence
+        if (!QChar::requiresSurrogates(uc)) {
+            // UTF-8 decoded and no surrogates are required
+            // detach if necessary
+            Traits::appendUtf16(dst, ushort(uc));
+        } else {
+            // UTF-8 decoded to something that requires a surrogate pair
+            Traits::appendUcs4(dst, uc);
+        }
+
+        Traits::advanceByte(src, charsNeeded - 1);
+        return charsNeeded;
+    }
 }
 
 enum DataEndianness
@@ -180,6 +284,7 @@ enum DataEndianness
 
 struct QUtf8
 {
+    static QString convertToUnicode(const char *, int);
     static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
     static QByteArray convertFromUnicode(const QChar *, int);
     static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);