diff options
Diffstat (limited to 'src/corelib/codecs/qutfcodec_p.h')
-rw-r--r-- | src/corelib/codecs/qutfcodec_p.h | 224 |
1 files changed, 224 insertions, 0 deletions
diff --git a/src/corelib/codecs/qutfcodec_p.h b/src/corelib/codecs/qutfcodec_p.h index e1214d50bc..c252edede7 100644 --- a/src/corelib/codecs/qutfcodec_p.h +++ b/src/corelib/codecs/qutfcodec_p.h @@ -1,6 +1,7 @@ /**************************************************************************** ** ** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies). +** Copyright (C) 2013 Intel Corporation ** Contact: http://www.qt-project.org/legal ** ** This file is part of the QtCore module of the Qt Toolkit. @@ -58,6 +59,227 @@ QT_BEGIN_NAMESPACE +struct QUtf8BaseTraits +{ + static const bool isTrusted = false; + static const bool allowNonCharacters = true; + static const bool skipAsciiHandling = false; + static const int Error = -1; + static const int EndOfString = -2; + + static bool isValidCharacter(uint u) + { return int(u) >= 0; } + + static void appendByte(uchar *&ptr, uchar b) + { *ptr++ = b; } + + static uchar peekByte(const uchar *ptr, int n = 0) + { return ptr[n]; } + + static qptrdiff availableBytes(const uchar *ptr, const uchar *end) + { return end - ptr; } + + static void advanceByte(const uchar *&ptr, int n = 1) + { ptr += n; } + + static void appendUtf16(ushort *&ptr, ushort uc) + { *ptr++ = uc; } + + static void appendUcs4(ushort *&ptr, uint uc) + { + appendUtf16(ptr, QChar::highSurrogate(uc)); + appendUtf16(ptr, QChar::lowSurrogate(uc)); + } + + static ushort peekUtf16(const ushort *ptr, int n = 0) + { return ptr[n]; } + + static qptrdiff availableUtf16(const ushort *ptr, const ushort *end) + { return end - ptr; } + + static void advanceUtf16(const ushort *&ptr, int n = 1) + { ptr += n; } + + // it's possible to output to UCS-4 too + static void appendUtf16(uint *&ptr, ushort uc) + { *ptr++ = uc; } + + static void appendUcs4(uint *&ptr, uint uc) + { *ptr++ = uc; } +}; + +struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits +{ + static const bool skipAsciiHandling = true; +}; + +namespace QUtf8Functions +{ + /// returns 0 on success; errors can only happen if \a u is a surrogate: + /// Error if \a u is a low surrogate; + /// if \a u is a high surrogate, Error if the next isn't a low one, + /// EndOfString if we run into the end of the string. + template <typename Traits, typename OutputPtr, typename InputPtr> inline + int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end) + { + if (!Traits::skipAsciiHandling && u < 0x80) { + // U+0000 to U+007F (US-ASCII) - one byte + Traits::appendByte(dst, uchar(u)); + return 0; + } else if (u < 0x0800) { + // U+0080 to U+07FF - two bytes + // first of two bytes + Traits::appendByte(dst, 0xc0 | uchar(u >> 6)); + } else { + if (!QChar::isSurrogate(u)) { + // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes + if (!Traits::allowNonCharacters && QChar::isNonCharacter(u)) + return Traits::Error; + + // first of three bytes + Traits::appendByte(dst, 0xe0 | uchar(u >> 12)); + } else { + // U+10000 to U+10FFFF - four bytes + // need to get one extra codepoint + if (Traits::availableUtf16(src, end) == 0) + return Traits::EndOfString; + + ushort low = Traits::peekUtf16(src); + if (!QChar::isHighSurrogate(u)) + return Traits::Error; + if (!QChar::isLowSurrogate(low)) + return Traits::Error; + + Traits::advanceUtf16(src); + uint ucs4 = QChar::surrogateToUcs4(u, low); + + if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4)) + return Traits::Error; + + // first byte + Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf)); + + // second of four bytes + Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f)); + + // for the rest of the bytes + u = ushort(ucs4); + } + + // second to last byte + Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f)); + } + + // last byte + Traits::appendByte(dst, 0x80 | (u & 0x3f)); + return 0; + } + + inline bool isContinuationByte(uchar b) + { + return (b & 0xc0) == 0x80; + } + + /// returns the number of characters consumed (including \a b) in case of success; + /// returns negative in case of error: Traits::Error or Traits::EndOfString + template <typename Traits, typename OutputPtr, typename InputPtr> inline + int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end) + { + int charsNeeded; + uint min_uc; + uint uc; + + if (!Traits::skipAsciiHandling && b < 0x80) { + // US-ASCII + Traits::appendUtf16(dst, b); + return 1; + } + + if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) { + // an UTF-8 first character must be at least 0xC0 + // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences + return Traits::Error; + } else if (b < 0xe0) { + charsNeeded = 2; + min_uc = 0x80; + uc = b & 0x1f; + } else if (b < 0xf0) { + charsNeeded = 3; + min_uc = 0x800; + uc = b & 0x0f; + } else if (b < 0xf5) { + charsNeeded = 4; + min_uc = 0x10000; + uc = b & 0x07; + } else { + // the last Unicode character is U+10FFFF + // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF" + // therefore, a byte higher than 0xF4 is not the UTF-8 first byte + return Traits::Error; + } + + int bytesAvailable = Traits::availableBytes(src, end); + if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) { + // it's possible that we have an error instead of just unfinished bytes + if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0))) + return Traits::Error; + if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1))) + return Traits::Error; + if (bytesAvailable > 2 && !isContinuationByte(Traits::peekByte(src, 2))) + return Traits::Error; + return Traits::EndOfString; + } + + // first continuation character + b = Traits::peekByte(src, 0); + if (!isContinuationByte(b)) + return Traits::Error; + uc <<= 6; + uc |= b & 0x3f; + + if (charsNeeded > 2) { + // second continuation character + b = Traits::peekByte(src, 1); + if (!isContinuationByte(b)) + return Traits::Error; + uc <<= 6; + uc |= b & 0x3f; + + if (charsNeeded > 3) { + // third continuation character + b = Traits::peekByte(src, 2); + if (!isContinuationByte(b)) + return Traits::Error; + uc <<= 6; + uc |= b & 0x3f; + } + } + + // we've decoded something; safety-check it + if (!Traits::isTrusted) { + if (uc < min_uc) + return Traits::Error; + if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) + return Traits::Error; + if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc)) + return Traits::Error; + } + + // write the UTF-16 sequence + if (!QChar::requiresSurrogates(uc)) { + // UTF-8 decoded and no surrogates are required + // detach if necessary + Traits::appendUtf16(dst, ushort(uc)); + } else { + // UTF-8 decoded to something that requires a surrogate pair + Traits::appendUcs4(dst, uc); + } + + Traits::advanceByte(src, charsNeeded - 1); + return charsNeeded; + } +} + enum DataEndianness { DetectEndianness, @@ -67,7 +289,9 @@ enum DataEndianness struct QUtf8 { + static QString convertToUnicode(const char *, int); static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *); + static QByteArray convertFromUnicode(const QChar *, int); static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *); }; |