/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Copyright (C) 2018 Intel Corporation.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtCore module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/

#include "qutfcodec_p.h"
#include "qlist.h"
#include "qendian.h"
#include "qchar.h"

#include "private/qsimd_p.h"
#include "private/qstringiterator_p.h"

QT_BEGIN_NAMESPACE

// Slots of QTextCodec::ConverterState::state_data used by the UTF-16 and
// UTF-32 decoders in this file: the endianness in use and the start of the
// buffered, not yet complete, code unit.
enum { Endian = 0, Data = 1 };

// The UTF-8 encoding of U+FEFF, used as the UTF-8 signature ("BOM").
static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };

#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
    || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64))
// Returns the index (0 = least significant bit) of the highest bit set in v.
static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) Q_DECL_NOTHROW
{
    uint result = qCountLeadingZeroBits(v);
    // qCountLeadingZeroBits counts from the most significant bit downwards.
    // XOR-ing with (bit width - 1) turns that count into the index of the
    // highest set bit counted from the least significant bit, which is what
    // a "bit scan reverse" is expected to return.
    result ^= sizeof(unsigned) * 8 - 1;
    return result;
}
#endif

#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
// Narrows UTF-16 code units from [src, end) to bytes at dst, 16 at a time, for
// as long as they are non-NUL US-ASCII.
//
// Returns true when the whole input was consumed. Returns false when a
// non-ASCII (or NUL) unit was met or fewer than 16 units are left; in the
// former case src/dst are advanced to that unit and nextAscii is set to just
// past the last such unit seen in the 16-unit window.
static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
{
    // do sixteen characters at a time
    for ( ; end - src >= 16; src += 16, dst += 16) {
        __m128i data1 = _mm_loadu_si128((const __m128i*)src);
        __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);

        // check if everything is ASCII
        // the highest ASCII value is U+007F
        // Do the packing directly:
        // The PACKUSWB instruction packs a signed 16-bit integer to an unsigned 8-bit
        // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
        // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
        // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
        // "non-ASCII", but it's an acceptable compromise.
        __m128i packed = _mm_packus_epi16(data1, data2);
        __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());

        // store, even if there are non-ASCII characters here
        _mm_storeu_si128((__m128i*)dst, packed);

        // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
        ushort n = ~_mm_movemask_epi8(nonAscii);
        if (n) {
            // find the next probable ASCII character
            // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
            // characters still coming
            nextAscii = src + qBitScanReverse(n) + 1;

            // skip the leading units that were stored successfully
            n = qCountTrailingZeroBits(n);
            dst += n;
            src += n;
            return false;
        }
    }
    return src == end;
}

// Widens bytes from [src, end) to UTF-16 code units at dst, 16 at a time, for
// as long as no byte has its high bit set.
//
// Returns true when the whole input was consumed. Returns false when a byte
// with the high bit set was met or fewer than 16 bytes are left; in the former
// case the leading ASCII bytes of the window are copied, src/dst point at the
// first non-ASCII byte and nextAscii is set to just past the last non-ASCII
// byte seen in the window.
static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
{
    // do sixteen characters at a time
    for ( ; end - src >= 16; src += 16, dst += 16) {
        __m128i data = _mm_loadu_si128((const __m128i*)src);

#ifdef __AVX2__
        // after zero-extension every input byte occupies two mask bits
        const int BitSpacing = 2;
        // load and zero extend to an YMM register
        const __m256i extended = _mm256_cvtepu8_epi16(data);

        uint n = _mm256_movemask_epi8(extended);
        if (!n) {
            // store
            _mm256_storeu_si256((__m256i*)dst, extended);
            continue;
        }
#else
        const int BitSpacing = 1;

        // check if everything is ASCII
        // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
        uint n = _mm_movemask_epi8(data);
        if (!n) {
            // unpack
            _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
            _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
            continue;
        }
#endif

        // copy the front part that is still ASCII
        while (!(n & 1)) {
            *dst++ = *src++;
            n >>= BitSpacing;
        }

        // find the next probable ASCII character
        // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
        // characters still coming
        n = qBitScanReverse(n);
        nextAscii = src + (n / BitSpacing) + 1;
        return false;
    }
    return src == end;
}

// Scans [src, end) for the first byte with the high bit set.
//
// Returns a pointer to that byte and sets nextAscii to the position from which
// ASCII scanning may resume. If the vectorized loops find nothing, returns the
// position where they stopped (fewer than four bytes before end) and sets
// nextAscii to end.
static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
{
#ifdef __AVX2__
    // do 32 characters at a time
    // (this is similar to simdTestMask in qstring.cpp)
    const __m256i mask = _mm256_set1_epi8(0x80);
    for ( ; end - src >= 32; src += 32) {
        __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
        if (_mm256_testz_si256(mask, data))
            continue;

        uint n = _mm256_movemask_epi8(data);
        Q_ASSUME(n);

        // find the next probable ASCII character
        // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
        // characters still coming
        nextAscii = src + qBitScanReverse(n) + 1;

        // return the non-ASCII character
        return src + qCountTrailingZeroBits(n);
    }
#endif

    // do sixteen characters at a time
    for ( ; end - src >= 16; src += 16) {
        __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));

        // check if everything is ASCII
        // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
        uint n = _mm_movemask_epi8(data);
        if (!n)
            continue;

        // find the next probable ASCII character
        // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
        // characters still coming
        nextAscii = src + qBitScanReverse(n) + 1;

        // return the non-ASCII character
        return src + qCountTrailingZeroBits(n);
    }

    // do four characters at a time
    for ( ; end - src >= 4; src += 4) {
        quint32 data = qFromUnaligned<quint32>(src);
        data &= 0x80808080U;
        if (!data)
            continue;

        // We don't try to guess which of the three bytes is ASCII and which
        // one isn't. The chance that at least two of them are non-ASCII is
        // better than 75%.
        nextAscii = src;
        return src;
    }
    nextAscii = end;
    return src;
}
#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
// NEON counterpart of the SSE2 simdEncodeAscii: narrows 16 UTF-16 code units
// per iteration while they are all <= 0x7f. Same return value and
// src/dst/nextAscii contract, except that NUL is treated as ASCII here.
static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
{
    uint16x8_t maxAscii = vdupq_n_u16(0x7f);
    uint16x8_t mask1 = { 1,      1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
    uint16x8_t mask2 = vshlq_n_u16(mask1, 1);

    // do sixteen characters at a time
    for ( ; end - src >= 16; src += 16, dst += 16) {
        // load 2 lanes (or: "load interleaved")
        uint16x8x2_t in = vld2q_u16(src);

        // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
        // add those together into a scalar, and merge the scalars.
        uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
                          | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));

        // merge the two lanes by shifting the values of the second by 8 and inserting them
        uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);

        // store, even if there are non-ASCII characters here
        vst1q_u8(dst, vreinterpretq_u8_u16(out));

        if (nonAscii) {
            // find the next probable ASCII character
            // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
            // characters still coming
            nextAscii = src + qBitScanReverse(nonAscii) + 1;

            // skip the leading units that were stored successfully
            nonAscii = qCountTrailingZeroBits(nonAscii);
            dst += nonAscii;
            src += nonAscii;
            return false;
        }
    }
    return src == end;
}

// NEON counterpart of the SSE2 simdDecodeAscii: widens 8 bytes per iteration
// while none of them has the high bit set. Same return value and
// src/dst/nextAscii contract.
static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
{
    // do eight characters at a time
    uint8x8_t msb_mask = vdup_n_u8(0x80);
    uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
    for ( ; end - src >= 8; src += 8, dst += 8) {
        uint8x8_t c = vld1_u8(src);
        // one bit per input byte, set when that byte is >= 0x80
        uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
        if (!n) {
            // store
            vst1q_u16(dst, vmovl_u8(c));
            continue;
        }

        // copy the front part that is still ASCII
        while (!(n & 1)) {
            *dst++ = *src++;
            n >>= 1;
        }

        // find the next probable ASCII character
        // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
        // characters still coming
        n = qBitScanReverse(n);
        nextAscii = src + n + 1;
        return false;
    }
    return src == end;
}

// NEON counterpart of simdFindNonAscii. The vectorized body is currently
// disabled by the early return below, so this behaves like the generic
// fallback.
static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
{
    // The SIMD code below is untested, so just force an early return until
    // we've had the time to verify it works.
    nextAscii = end;
    return src;

    // do eight characters at a time
    uint8x8_t msb_mask = vdup_n_u8(0x80);
    uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
    for ( ; end - src >= 8; src += 8) {
        uint8x8_t c = vld1_u8(src);
        uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
        if (!n)
            continue;

        // find the next probable ASCII character
        // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
        // characters still coming
        nextAscii = src + qBitScanReverse(n) + 1;

        // return the non-ASCII character
        return src + qCountTrailingZeroBits(n);
    }
    nextAscii = end;
    return src;
}
#else
// Generic fallbacks: no vectorized fast path is available, so the callers
// always take their scalar loops.
static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
{
    return false;
}

static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *)
{
    return false;
}

static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
{
    nextAscii = end;
    return src;
}
#endif

// Stateless UTF-16 to UTF-8 conversion of the len code units starting at uc.
// Code units that cannot be encoded are replaced by '?'. No BOM is written.
QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
{
    // create a QByteArray with the worst case scenario size
    QByteArray result(len * 3, Qt::Uninitialized);
    uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
    const ushort *src = reinterpret_cast<const ushort *>(uc);
    const ushort *const end = src + len;

    while (src != end) {
        const ushort *nextAscii = end;
        if (simdEncodeAscii(dst, nextAscii, src, end))
            break;

        // scalar loop up to the point where the SIMD code may be retried
        do {
            ushort uc = *src++;
            int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end);
            if (res < 0) {
                // encoding error - append '?'
                *dst++ = '?';
            }
        } while (src < nextAscii);
    }

    result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
    return result;
}

// Stateful UTF-16 to UTF-8 conversion.
//
// If state is given: a UTF-8 BOM is written unless IgnoreHeader is set,
// unencodable input is replaced by NUL instead of '?' when
// ConvertInvalidToNull is set, and a high surrogate at the very end of the
// input is kept in state->state_data[0] for the next call. On return
// invalidChars, flags and remainingChars are updated.
QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
{
    uchar replacement = '?';
    int rlen = 3*len;
    int surrogate_high = -1;
    if (state) {
        if (state->flags & QTextCodec::ConvertInvalidToNull)
            replacement = 0;
        if (!(state->flags & QTextCodec::IgnoreHeader))
            rlen += 3;
        if (state->remainingChars) {
            surrogate_high = state->state_data[0];
            // The carried-over surrogate is not part of len. Completing the
            // pair yields 4 bytes for a single input unit, and rejecting it
            // yields a replacement byte in addition to up to 3 bytes per
            // input unit; both need one byte more than 3*len.
            rlen += 1;
        }
    }


    QByteArray rstr(rlen, Qt::Uninitialized);
    uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
    const ushort *src = reinterpret_cast<const ushort *>(uc);
    const ushort *const end = src + len;

    int invalid = 0;
    if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
        // append UTF-8 BOM
        *cursor++ = utf8bom[0];
        *cursor++ = utf8bom[1];
        *cursor++ = utf8bom[2];
    }

    const ushort *nextAscii = src;
    while (src != end) {
        int res;
        ushort uc;
        if (surrogate_high != -1) {
            // resume with the high surrogate left over from the previous call
            uc = surrogate_high;
            surrogate_high = -1;
            res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
        } else {
            if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end))
                break;

            uc = *src++;
            res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
        }
        if (Q_LIKELY(res >= 0))
            continue;

        if (res == QUtf8BaseTraits::Error) {
            // encoding error
            ++invalid;
            *cursor++ = replacement;
        } else if (res == QUtf8BaseTraits::EndOfString) {
            // input ended after a high surrogate: remember it for the next call
            surrogate_high = uc;
            break;
        }
    }

    rstr.resize(cursor - (const uchar*)rstr.constData());
    if (state) {
        state->invalidChars += invalid;
        state->flags |= QTextCodec::IgnoreHeader;
        state->remainingChars = 0;
        if (surrogate_high >= 0) {
            state->remainingChars = 1;
            state->state_data[0] = surrogate_high;
        }
    }
    return rstr;
}

// Stateless UTF-8 to UTF-16 conversion of len bytes starting at chars.
QString QUtf8::convertToUnicode(const char *chars, int len)
{
    // UTF-8 to UTF-16 always needs the exact same number of words or less:
    //    UTF-8     UTF-16
    //   1 byte     1 word
    //   2 bytes    1 word
    //   3 bytes    1 word
    //   4 bytes    2 words (one surrogate pair)
    // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
    // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
    // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
    //
    // The table holds for invalid sequences too: we'll insert one replacement char
    // per invalid byte.
    QString result(len, Qt::Uninitialized);
    QChar *data = const_cast<QChar *>(result.constData()); // we know we're not shared
    const QChar *end = convertToUnicode(data, chars, len);
    result.truncate(end - data);
    return result;
}

/*!
    \since 5.7
    \overload

    Converts the UTF-8 sequence of \a len octets beginning at \a chars to
    a sequence of QChar starting at \a buffer. The buffer is expected to be
    large enough to hold the result. An upper bound for the size of the
    buffer is \a len QChars.

    If, during decoding, an error occurs, a QChar::ReplacementCharacter is
    written.

    Returns a pointer to one past the last QChar written.

    This function never throws.
*/

QChar *QUtf8::convertToUnicode(QChar *buffer, const char *chars, int len) Q_DECL_NOTHROW
{
    ushort *dst = reinterpret_cast<ushort *>(buffer);
    const uchar *src = reinterpret_cast<const uchar *>(chars);
    const uchar *end = src + len;

    // attempt to do a full decoding in SIMD
    const uchar *nextAscii = end;
    if (!simdDecodeAscii(dst, nextAscii, src, end)) {
        // at least one non-ASCII entry
        // check if we failed to decode the UTF-8 BOM; if so, skip it
        if (Q_UNLIKELY(src == reinterpret_cast<const uchar *>(chars))
                && end - src >= 3
                && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
            src += 3;
        }

        while (src < end) {
            nextAscii = end;
            if (simdDecodeAscii(dst, nextAscii, src, end))
                break;

            // scalar loop up to the point where the SIMD code may be retried
            do {
                uchar b = *src++;
                int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
                if (res < 0) {
                    // decoding error
                    *dst++ = QChar::ReplacementCharacter;
                }
            } while (src < nextAscii);
        }
    }

    return reinterpret_cast<QChar *>(dst);
}

// Stateful UTF-8 to UTF-16 conversion.
//
// If state is given: bytes of an incomplete trailing sequence are kept in
// state->state_data and prepended to the next call's input, a leading BOM is
// dropped unless IgnoreHeader is set, and invalid input is replaced by
// QChar::Null instead of QChar::ReplacementCharacter when ConvertInvalidToNull
// is set. Without state, an incomplete trailing sequence is turned into
// replacement characters.
QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
{
    bool headerdone = false;
    ushort replacement = QChar::ReplacementCharacter;
    int invalid = 0;
    int res;
    uchar ch = 0;

    // See above for buffer requirements for stateless decoding. However, that
    // fails if the state is not empty. The following situations can add to the
    // requirements:
    //  state contains      chars starts with           requirement
    //   1 of 2 bytes       valid continuation          0
    //   2 of 3 bytes       same                        0
    //   3 bytes of 4       same                        +1 (need to insert surrogate pair)
    //   1 of 2 bytes       invalid continuation        +1 (need to insert replacement and restart)
    //   2 of 3 bytes       same                        +1 (same)
    //   3 of 4 bytes       same                        +1 (same)
    QString result(len + 1, Qt::Uninitialized);

    ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
    const uchar *src = reinterpret_cast<const uchar *>(chars);
    const uchar *end = src + len;

    if (state) {
        if (state->flags & QTextCodec::IgnoreHeader)
            headerdone = true;
        if (state->flags & QTextCodec::ConvertInvalidToNull)
            replacement = QChar::Null;
        if (state->remainingChars) {
            // handle incoming state first
            uchar remainingCharsData[4]; // longest UTF-8 sequence possible
            int remainingCharsCount = state->remainingChars;
            int newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src);

            // rebuild the interrupted sequence: saved bytes followed by new input
            memset(remainingCharsData, 0, sizeof(remainingCharsData));
            memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
            memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);

            const uchar *begin = &remainingCharsData[1];
            res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
                    static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
            if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
                // special case for len == 0:
                // if we were supplied an empty string, terminate the previous, unfinished sequence with error
                ++invalid;
                *dst++ = replacement;
            } else if (res == QUtf8BaseTraits::EndOfString) {
                // if we got EndOfString again, then there were too few bytes in src;
                // copy to our state and return
                state->remainingChars = remainingCharsCount + newCharsToCopy;
                memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
                return QString();
            } else if (!headerdone && res >= 0) {
                // eat the UTF-8 BOM
                headerdone = true;
                if (dst[-1] == 0xfeff)
                    --dst;
            }

            // adjust src now that we have maybe consumed a few chars
            if (res >= 0) {
                Q_ASSERT(res > remainingCharsCount);
                src += res - remainingCharsCount;
            }
        }
    }

    // main body, stateless decoding
    res = 0;
    const uchar *nextAscii = src;
    const uchar *start = src;
    while (res >= 0 && src < end) {
        if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
            break;

        ch = *src++;
        res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
        if (!headerdone && res >= 0) {
            headerdone = true;
            if (src == start + 3) { // 3 == sizeof(utf8-bom)
                // eat the UTF-8 BOM (it can only appear at the beginning of the string).
                if (dst[-1] == 0xfeff)
                    --dst;
            }
        }
        if (res == QUtf8BaseTraits::Error) {
            res = 0;
            ++invalid;
            *dst++ = replacement;
        }
    }

    if (!state && res == QUtf8BaseTraits::EndOfString) {
        // unterminated UTF sequence
        *dst++ = QChar::ReplacementCharacter;
        while (src++ < end)
            *dst++ = QChar::ReplacementCharacter;
    }

    result.truncate(dst - (const ushort *)result.unicode());
    if (state) {
        state->invalidChars += invalid;
        if (headerdone)
            state->flags |= QTextCodec::IgnoreHeader;
        if (res == QUtf8BaseTraits::EndOfString) {
            --src; // unread the byte in ch
            state->remainingChars = end - src;
            memcpy(&state->state_data[0], src, end - src);
        } else {
            state->remainingChars = 0;
        }
    }
    return result;
}

// Traits for QUtf8Functions::fromUtf8 that discard the decoded output; used
// when only the validity of the input matters.
struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
{
    struct NoOutput {};
    static void appendUtf16(const NoOutput &, ushort) {}
    static void appendUcs4(const NoOutput &, uint) {}
};

// Checks whether the len bytes at chars decode as UTF-8.
// Returns { false, false } on the first decoding error; otherwise
// { true, isValidAscii }, where isValidAscii is true only if no byte with the
// high bit set was processed.
QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len)
{
    const uchar *src = reinterpret_cast<const uchar *>(chars);
    const uchar *end = src + len;
    const uchar *nextAscii = src;
    bool isValidAscii = true;

    while (src < end) {
        if (src >= nextAscii)
            src = simdFindNonAscii(src, end, nextAscii);
        if (src == end)
            break;

        do {
            uchar b = *src++;
            if ((b & 0x80) == 0)
                continue;

            isValidAscii = false;
            QUtf8NoOutputTraits::NoOutput output;
            int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
            if (res < 0) {
                // decoding error
                return { false, false };
            }
        } while (src < nextAscii);
    }

    return { true, isValidAscii };
}

// Compares UTF-8 data with UTF-16 data, code point by code point. Undecodable
// UTF-8 input compares as QChar::ReplacementCharacter. Returns the difference
// of the first mismatching code points; if one input is a prefix of the other
// the shorter one sorts first.
int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, int u16len)
{
    uint uc1, uc2;
    auto src1 = reinterpret_cast<const uchar *>(utf8);
    auto end1 = src1 + u8len;
    QStringIterator src2(utf16, utf16 + u16len);

    while (src1 < end1 && src2.hasNext()) {
        uchar b = *src1++;
        uint *output = &uc1;
        int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
        if (res < 0) {
            // decoding error
            uc1 = QChar::ReplacementCharacter;
        }

        uc2 = src2.next();
        if (uc1 != uc2)
            return int(uc1) - int(uc2);
    }

    // the shorter string sorts first
    return (end1 > src1) - int(src2.hasNext());
}

// Same as above, comparing against Latin-1 data: every byte of s is taken as
// one code point.
int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s)
{
    uint uc1;
    auto src1 = reinterpret_cast<const uchar *>(utf8);
    auto end1 = src1 + u8len;
    auto src2 = reinterpret_cast<const uchar *>(s.latin1());
    auto end2 = src2 + s.size();

    while (src1 < end1 && src2 < end2) {
        uchar b = *src1++;
        uint *output = &uc1;
        int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
        if (res < 0) {
            // decoding error
            uc1 = QChar::ReplacementCharacter;
        }

        uint uc2 = *src2++;
        if (uc1 != uc2)
            return int(uc1) - int(uc2);
    }

    // the shorter string sorts first
    return (end1 > src1) - (end2 > src2);
}

// Serializes len UTF-16 code units as bytes in the requested byte order
// (DetectEndianness selects the host byte order). A BOM is written first
// unless state has IgnoreHeader set; a given state gets IgnoreHeader set and
// remainingChars cleared.
QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
{
    DataEndianness endian = e;
    int length = 2*len;
    if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
        length += 2;
    }
    if (e == DetectEndianness) {
        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
    }

    QByteArray d;
    d.resize(length);
    char *data = d.data();
    if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
        QChar bom(QChar::ByteOrderMark);
        if (endian == BigEndianness) {
            data[0] = bom.row();
            data[1] = bom.cell();
        } else {
            data[0] = bom.cell();
            data[1] = bom.row();
        }
        data += 2;
    }
    if (endian == BigEndianness) {
        for (int i = 0; i < len; ++i) {
            *(data++) = uc[i].row();
            *(data++) = uc[i].cell();
        }
    } else {
        for (int i = 0; i < len; ++i) {
            *(data++) = uc[i].cell();
            *(data++) = uc[i].row();
        }
    }

    if (state) {
        state->remainingChars = 0;
        state->flags |= QTextCodec::IgnoreHeader;
    }
    return d;
}

// Decodes len bytes of UTF-16 data. With DetectEndianness the first code unit
// selects the byte order if it is a BOM, otherwise the host byte order is
// used. A BOM in first position is dropped. If state is given, a trailing odd
// byte and the byte order in use are stored in it for the next call.
QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
{
    DataEndianness endian = e;
    bool half = false;
    uchar buf = 0;
    bool headerdone = false;
    if (state) {
        headerdone = state->flags & QTextCodec::IgnoreHeader;
        if (endian == DetectEndianness)
            endian = (DataEndianness)state->state_data[Endian];
        if (state->remainingChars) {
            half = true;
            buf = state->state_data[Data];
        }
    }
    if (headerdone && endian == DetectEndianness)
        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;

    QString result(len, Qt::Uninitialized); // worst case
    QChar *qch = (QChar *)result.data();
    while (len--) {
        if (half) {
            // second byte of a code unit: combine with the buffered first byte
            QChar ch;
            if (endian == LittleEndianness) {
                ch.setRow(*chars++);
                ch.setCell(buf);
            } else {
                ch.setRow(buf);
                ch.setCell(*chars++);
            }
            if (!headerdone) {
                headerdone = true;
                if (endian == DetectEndianness) {
                    if (ch == QChar::ByteOrderSwapped) {
                        endian = LittleEndianness;
                    } else if (ch == QChar::ByteOrderMark) {
                        endian = BigEndianness;
                    } else {
                        if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
                            endian = BigEndianness;
                        } else {
                            // ch was assembled as big endian above; swap its bytes
                            endian = LittleEndianness;
                            ch = QChar((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
                        }
                        *qch++ = ch;
                    }
                } else if (ch != QChar::ByteOrderMark) {
                    *qch++ = ch;
                }
            } else {
                *qch++ = ch;
            }
            half = false;
        } else {
            buf = *chars++;
            half = true;
        }
    }
    result.truncate(qch - result.unicode());

    if (state) {
        if (headerdone)
            state->flags |= QTextCodec::IgnoreHeader;
        state->state_data[Endian] = endian;
        if (half) {
            state->remainingChars = 1;
            state->state_data[Data] = buf;
        } else {
            state->remainingChars = 0;
            state->state_data[Data] = 0;
        }
    }
    return result;
}

// Serializes len UTF-16 code units as UTF-32 in the requested byte order
// (DetectEndianness selects the host byte order), combining surrogate pairs
// through QStringIterator. A BOM is written first unless state has
// IgnoreHeader set; a given state gets IgnoreHeader set and remainingChars
// cleared.
QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
{
    DataEndianness endian = e;
    int length =  4*len;
    if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
        length += 4;
    }
    if (e == DetectEndianness) {
        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
    }

    QByteArray d(length, Qt::Uninitialized);
    char *data = d.data();
    if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
        if (endian == BigEndianness) {
            data[0] = 0;
            data[1] = 0;
            data[2] = (char)0xfe;
            data[3] = (char)0xff;
        } else {
            data[0] = (char)0xff;
            data[1] = (char)0xfe;
            data[2] = 0;
            data[3] = 0;
        }
        data += 4;
    }

    QStringIterator i(uc, uc + len);
    if (endian == BigEndianness) {
        while (i.hasNext()) {
            uint cp = i.next();

            *(data++) = cp >> 24;
            *(data++) = (cp >> 16) & 0xff;
            *(data++) = (cp >> 8) & 0xff;
            *(data++) = cp & 0xff;
        }
    } else {
        while (i.hasNext()) {
            uint cp = i.next();

            *(data++) = cp & 0xff;
            *(data++) = (cp >> 8) & 0xff;
            *(data++) = (cp >> 16) & 0xff;
            *(data++) = cp >> 24;
        }
    }

    if (state) {
        state->remainingChars = 0;
        state->flags |= QTextCodec::IgnoreHeader;
    }
    return d;
}

// Decodes len bytes of UTF-32 data. With DetectEndianness the first code unit
// selects the byte order if it is a BOM, otherwise the host byte order is
// used. A BOM in first position is dropped. If state is given, the bytes of an
// incomplete trailing code unit and the byte order in use are stored in it for
// the next call.
QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
{
    DataEndianness endian = e;
    uchar tuple[4];
    int num = 0;
    bool headerdone = false;
    if (state) {
        headerdone = state->flags & QTextCodec::IgnoreHeader;
        if (endian == DetectEndianness) {
            endian = (DataEndianness)state->state_data[Endian];
        }
        num = state->remainingChars;
        memcpy(tuple, &state->state_data[Data], 4);
    }
    if (headerdone && endian == DetectEndianness)
        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;

    QString result;
    result.resize((num + len) >> 2 << 1); // worst case
    QChar *qch = (QChar *)result.data();

    const char *end = chars + len;
    while (chars < end) {
        tuple[num++] = *chars++;
        if (num == 4) {
            if (!headerdone) {
                // Only the first complete code unit can be the header. Mark it
                // as handled so that later U+FEFF code points are kept in the
                // output and IgnoreHeader gets recorded in the state, as the
                // UTF-16 decoder does.
                headerdone = true;
                if (endian == DetectEndianness) {
                    if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) {
                        endian = LittleEndianness;
                        num = 0;
                        continue;
                    } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) {
                        endian = BigEndianness;
                        num = 0;
                        continue;
                    } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
                        endian = BigEndianness;
                    } else {
                        endian = LittleEndianness;
                    }
                } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
                    num = 0;
                    continue;
                }
            }
            uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
            if (QChar::requiresSurrogates(code)) {
                *qch++ = QChar::highSurrogate(code);
                *qch++ = QChar::lowSurrogate(code);
            } else {
                *qch++ = code;
            }
            num = 0;
        }
    }
    result.truncate(qch - result.unicode());

    if (state) {
        if (headerdone)
            state->flags |= QTextCodec::IgnoreHeader;
        state->state_data[Endian] = endian;
        state->remainingChars = num;
        memcpy(&state->state_data[Data], tuple, 4);
    }
    return result;
}


#ifndef QT_NO_TEXTCODEC

QUtf8Codec::~QUtf8Codec()
{
}

// The QUtf8Codec members below forward to the QUtf8 converters above.
QByteArray QUtf8Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
{
    return QUtf8::convertFromUnicode(uc, len, state);
}

void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, ConverterState *state) const
{
    *target += QUtf8::convertToUnicode(chars, len, state);
}

QString QUtf8Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
{
    return QUtf8::convertToUnicode(chars, len, state);
}

QByteArray QUtf8Codec::name() const
{
    return "UTF-8";
}

int QUtf8Codec::mibEnum() const
{
    return 106;
}

QUtf16Codec::~QUtf16Codec()
{
}

QByteArray QUtf16Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const { return QUtf16::convertFromUnicode(uc, len, state, e); } QString QUtf16Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const { return QUtf16::convertToUnicode(chars, len, state, e); } int QUtf16Codec::mibEnum() const { return 1015; } QByteArray QUtf16Codec::name() const { return "UTF-16"; } QList QUtf16Codec::aliases() const { return QList(); } int QUtf16BECodec::mibEnum() const { return 1013; } QByteArray QUtf16BECodec::name() const { return "UTF-16BE"; } QList QUtf16BECodec::aliases() const { QList list; return list; } int QUtf16LECodec::mibEnum() const { return 1014; } QByteArray QUtf16LECodec::name() const { return "UTF-16LE"; } QList QUtf16LECodec::aliases() const { QList list; return list; } QUtf32Codec::~QUtf32Codec() { } QByteArray QUtf32Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const { return QUtf32::convertFromUnicode(uc, len, state, e); } QString QUtf32Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const { return QUtf32::convertToUnicode(chars, len, state, e); } int QUtf32Codec::mibEnum() const { return 1017; } QByteArray QUtf32Codec::name() const { return "UTF-32"; } QList QUtf32Codec::aliases() const { QList list; return list; } int QUtf32BECodec::mibEnum() const { return 1018; } QByteArray QUtf32BECodec::name() const { return "UTF-32BE"; } QList QUtf32BECodec::aliases() const { QList list; return list; } int QUtf32LECodec::mibEnum() const { return 1019; } QByteArray QUtf32LECodec::name() const { return "UTF-32LE"; } QList QUtf32LECodec::aliases() const { QList list; return list; } #endif //QT_NO_TEXTCODEC QT_END_NAMESPACE