/**************************************************************************** ** ** Copyright (C) 2016 The Qt Company Ltd. ** Copyright (C) 2016 Intel Corporation. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of the QtCore module of the Qt Toolkit. ** ** $QT_BEGIN_LICENSE:LGPL$ ** Commercial License Usage ** Licensees holding valid commercial Qt licenses may use this file in ** accordance with the commercial license agreement provided with the ** Software or, alternatively, in accordance with the terms contained in ** a written agreement between you and The Qt Company. For licensing terms ** and conditions see https://www.qt.io/terms-conditions. For further ** information use the contact form at https://www.qt.io/contact-us. ** ** GNU Lesser General Public License Usage ** Alternatively, this file may be used under the terms of the GNU Lesser ** General Public License version 3 as published by the Free Software ** Foundation and appearing in the file LICENSE.LGPL3 included in the ** packaging of this file. Please review the following information to ** ensure the GNU Lesser General Public License version 3 requirements ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. ** ** GNU General Public License Usage ** Alternatively, this file may be used under the terms of the GNU ** General Public License version 2.0 or (at your option) the GNU General ** Public license version 3 or any later version approved by the KDE Free ** Qt Foundation. The licenses are as published by the Free Software ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 ** included in the packaging of this file. Please review the following ** information to ensure the GNU General Public License requirements will ** be met: https://www.gnu.org/licenses/gpl-2.0.html and ** https://www.gnu.org/licenses/gpl-3.0.html. ** ** $QT_END_LICENSE$ ** ****************************************************************************/ #ifndef QUTFCODEC_P_H #define QUTFCODEC_P_H // // W A R N I N G // ------------- // // This file is not part of the Qt API. It exists purely as an // implementation detail. This header file may change from version to // version without notice, or even be removed. // // We mean it. // #include "QtCore/qtextcodec.h" #include "private/qtextcodec_p.h" QT_BEGIN_NAMESPACE struct QUtf8BaseTraits { static const bool isTrusted = false; static const bool allowNonCharacters = true; static const bool skipAsciiHandling = false; static const int Error = -1; static const int EndOfString = -2; static bool isValidCharacter(uint u) { return int(u) >= 0; } static void appendByte(uchar *&ptr, uchar b) { *ptr++ = b; } static uchar peekByte(const uchar *ptr, int n = 0) { return ptr[n]; } static qptrdiff availableBytes(const uchar *ptr, const uchar *end) { return end - ptr; } static void advanceByte(const uchar *&ptr, int n = 1) { ptr += n; } static void appendUtf16(ushort *&ptr, ushort uc) { *ptr++ = uc; } static void appendUcs4(ushort *&ptr, uint uc) { appendUtf16(ptr, QChar::highSurrogate(uc)); appendUtf16(ptr, QChar::lowSurrogate(uc)); } static ushort peekUtf16(const ushort *ptr, int n = 0) { return ptr[n]; } static qptrdiff availableUtf16(const ushort *ptr, const ushort *end) { return end - ptr; } static void advanceUtf16(const ushort *&ptr, int n = 1) { ptr += n; } // it's possible to output to UCS-4 too static void appendUtf16(uint *&ptr, ushort uc) { *ptr++ = uc; } static void appendUcs4(uint *&ptr, uint uc) { *ptr++ = uc; } }; struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits { static const bool skipAsciiHandling = true; }; namespace QUtf8Functions { /// returns 0 on success; errors can only happen if \a u is a surrogate: /// Error if \a u is a low surrogate; /// if \a u is a high surrogate, Error if the next isn't a low one, /// EndOfString if we run into the end of the string. template inline int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end) { if (!Traits::skipAsciiHandling && u < 0x80) { // U+0000 to U+007F (US-ASCII) - one byte Traits::appendByte(dst, uchar(u)); return 0; } else if (u < 0x0800) { // U+0080 to U+07FF - two bytes // first of two bytes Traits::appendByte(dst, 0xc0 | uchar(u >> 6)); } else { if (!QChar::isSurrogate(u)) { // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes if (!Traits::allowNonCharacters && QChar::isNonCharacter(u)) return Traits::Error; // first of three bytes Traits::appendByte(dst, 0xe0 | uchar(u >> 12)); } else { // U+10000 to U+10FFFF - four bytes // need to get one extra codepoint if (Traits::availableUtf16(src, end) == 0) return Traits::EndOfString; ushort low = Traits::peekUtf16(src); if (!QChar::isHighSurrogate(u)) return Traits::Error; if (!QChar::isLowSurrogate(low)) return Traits::Error; Traits::advanceUtf16(src); uint ucs4 = QChar::surrogateToUcs4(u, low); if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4)) return Traits::Error; // first byte Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf)); // second of four bytes Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f)); // for the rest of the bytes u = ushort(ucs4); } // second to last byte Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f)); } // last byte Traits::appendByte(dst, 0x80 | (u & 0x3f)); return 0; } inline bool isContinuationByte(uchar b) { return (b & 0xc0) == 0x80; } /// returns the number of characters consumed (including \a b) in case of success; /// returns negative in case of error: Traits::Error or Traits::EndOfString template inline int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end) { int charsNeeded; uint min_uc; uint uc; if (!Traits::skipAsciiHandling && b < 0x80) { // US-ASCII Traits::appendUtf16(dst, b); return 1; } if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) { // an UTF-8 first character must be at least 0xC0 // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences return Traits::Error; } else if (b < 0xe0) { charsNeeded = 2; min_uc = 0x80; uc = b & 0x1f; } else if (b < 0xf0) { charsNeeded = 3; min_uc = 0x800; uc = b & 0x0f; } else if (b < 0xf5) { charsNeeded = 4; min_uc = 0x10000; uc = b & 0x07; } else { // the last Unicode character is U+10FFFF // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF" // therefore, a byte higher than 0xF4 is not the UTF-8 first byte return Traits::Error; } int bytesAvailable = Traits::availableBytes(src, end); if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) { // it's possible that we have an error instead of just unfinished bytes if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0))) return Traits::Error; if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1))) return Traits::Error; return Traits::EndOfString; } // first continuation character b = Traits::peekByte(src, 0); if (!isContinuationByte(b)) return Traits::Error; uc <<= 6; uc |= b & 0x3f; if (charsNeeded > 2) { // second continuation character b = Traits::peekByte(src, 1); if (!isContinuationByte(b)) return Traits::Error; uc <<= 6; uc |= b & 0x3f; if (charsNeeded > 3) { // third continuation character b = Traits::peekByte(src, 2); if (!isContinuationByte(b)) return Traits::Error; uc <<= 6; uc |= b & 0x3f; } } // we've decoded something; safety-check it if (!Traits::isTrusted) { if (uc < min_uc) return Traits::Error; if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) return Traits::Error; if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc)) return Traits::Error; } // write the UTF-16 sequence if (!QChar::requiresSurrogates(uc)) { // UTF-8 decoded and no surrogates are required // detach if necessary Traits::appendUtf16(dst, ushort(uc)); } else { // UTF-8 decoded to something that requires a surrogate pair Traits::appendUcs4(dst, uc); } Traits::advanceByte(src, charsNeeded - 1); return charsNeeded; } } enum DataEndianness { DetectEndianness, BigEndianness, LittleEndianness }; struct QUtf8 { static QChar *convertToUnicode(QChar *, const char *, int) Q_DECL_NOTHROW; static QString convertToUnicode(const char *, int); static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *); static QByteArray convertFromUnicode(const QChar *, int); static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *); }; struct QUtf16 { static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); }; struct QUtf32 { static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness); }; #ifndef QT_NO_TEXTCODEC class QUtf8Codec : public QTextCodec { public: ~QUtf8Codec(); QByteArray name() const override; int mibEnum() const override; QString convertToUnicode(const char *, int, ConverterState *) const override; QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override; void convertToUnicode(QString *target, const char *, int, ConverterState *) const; }; class QUtf16Codec : public QTextCodec { protected: public: QUtf16Codec() { e = DetectEndianness; } ~QUtf16Codec(); QByteArray name() const override; QList aliases() const override; int mibEnum() const override; QString convertToUnicode(const char *, int, ConverterState *) const override; QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override; protected: DataEndianness e; }; class QUtf16BECodec : public QUtf16Codec { public: QUtf16BECodec() : QUtf16Codec() { e = BigEndianness; } QByteArray name() const override; QList aliases() const override; int mibEnum() const override; }; class QUtf16LECodec : public QUtf16Codec { public: QUtf16LECodec() : QUtf16Codec() { e = LittleEndianness; } QByteArray name() const override; QList aliases() const override; int mibEnum() const override; }; class QUtf32Codec : public QTextCodec { public: QUtf32Codec() { e = DetectEndianness; } ~QUtf32Codec(); QByteArray name() const override; QList aliases() const override; int mibEnum() const override; QString convertToUnicode(const char *, int, ConverterState *) const override; QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override; protected: DataEndianness e; }; class QUtf32BECodec : public QUtf32Codec { public: QUtf32BECodec() : QUtf32Codec() { e = BigEndianness; } QByteArray name() const override; QList aliases() const override; int mibEnum() const override; }; class QUtf32LECodec : public QUtf32Codec { public: QUtf32LECodec() : QUtf32Codec() { e = LittleEndianness; } QByteArray name() const override; QList aliases() const override; int mibEnum() const override; }; #endif // QT_NO_TEXTCODEC QT_END_NAMESPACE #endif // QUTFCODEC_P_H